133 lines
3.9 KiB
Python
133 lines
3.9 KiB
Python
from typing import Dict, List, Optional, Any
|
|
from dataclasses import dataclass, fields
|
|
from dataclasses_json import dataclass_json
|
|
|
|
|
|
@dataclass_json
@dataclass
class DiResult:
    """Data class bundling the "DI" output produced for a single file.

    NOTE(review): "DI" presumably stands for Document Intelligence; the
    producer of this record is not visible in this file -- confirm.

    Attributes:
        figures: The ``FigureFlat`` records associated with the content.
        di_content: The content, as a string.
        filepath: Path of the file this result belongs to.
        language: Language of the content (format not visible here).
    """

    figures: List["FigureFlat"]
    di_content: str
    filepath: str
    language: str
|
|
|
|
|
|
@dataclass_json
@dataclass
class FigureFlat:
    """Flat record describing one figure.

    NOTE(review): the field meanings below are inferred from their names;
    the code that fills them is not visible in this file -- confirm.

    Attributes:
        offset: Integer offset of the figure (presumably into the content).
        length: Integer length of the figure's span.
        url: URL string for the figure.
        content: Content string for the figure.
        image: Image string (encoding not visible here).
        understand_flag: Boolean flag; semantics are set by the producer.
        caption: Caption string.
    """

    offset: int
    length: int
    url: str
    content: str
    image: str
    understand_flag: bool
    caption: str
|
|
|
|
|
|
def dict_to_str(v):
    """Return ``v`` untouched if it is already a string, else ``str(v)``.

    Despite the name, any non-string value is accepted; the result for a
    dict or list is its Python ``str()`` form (not JSON).
    """
    if isinstance(v, str):
        return v
    return str(v)
|
|
|
|
@dataclass
class Document:
    """A data class for storing documents.

    Besides the declared fields, an instance accepts arbitrary extra
    ("dynamic") attributes, set either with normal attribute assignment or
    with item syntax (``doc["key"] = value``). Both assignment paths apply
    the same ``doc_metadata`` coercion described below.

    Attributes:
        content (Optional[str]): The content of the document.
        id (Optional[str]): The id of the document.
        title (Optional[str]): The title of the document.
        filepath (Optional[str]): The filepath of the document.
        url (Optional[str]): The url of the document.
        metadata (Optional[Dict]): The metadata of the document.
        image_mapping (Optional[Dict]): Optional mapping; its schema is not
            visible in this file.
        doc_metadata (Optional[str]): Always stored as a string or None; a
            list or dict assigned to it is converted with ``str()``.
        document_schema, main_title, sub_title, publisher, document_code,
        document_category, main_title_sec_language, sub_title_sec_language,
        primary_language, secondary_language (Optional[str]): Optional
            descriptive strings; their exact semantics are defined by the
            code that fills them (not visible here).
        full_headers, h1 .. h6 (Optional[str]): Optional strings, presumably
            heading text per level -- confirm with the producer.
        contentVector, full_metadata_vector (Optional[List[float]]):
            Optional float lists, presumably embeddings -- confirm.
    """

    content: Optional[str] = None
    id: Optional[str] = None
    title: Optional[str] = None
    filepath: Optional[str] = None

    url: Optional[str] = None
    metadata: Optional[Dict] = None
    image_mapping: Optional[Dict] = None

    doc_metadata: Optional[str] = None
    document_schema: Optional[str] = None
    main_title: Optional[str] = None
    sub_title: Optional[str] = None
    publisher: Optional[str] = None
    document_code: Optional[str] = None
    document_category: Optional[str] = None
    main_title_sec_language: Optional[str] = None
    sub_title_sec_language: Optional[str] = None
    primary_language: Optional[str] = None
    secondary_language: Optional[str] = None

    full_headers: Optional[str] = None
    h1: Optional[str] = None
    h2: Optional[str] = None
    h3: Optional[str] = None
    h4: Optional[str] = None
    h5: Optional[str] = None
    h6: Optional[str] = None

    contentVector: Optional[List[float]] = None
    full_metadata_vector: Optional[List[float]] = None

    @staticmethod
    def _coerce_field_value(key, value):
        """Return the value that should actually be stored under ``key``.

        Only ``doc_metadata`` is special: a list or dict is replaced by its
        ``str()`` form. Everything else (including None and strings) is
        returned unchanged.
        """
        if key == "doc_metadata" and isinstance(value, (list, dict)):
            return str(value)
        return value

    def __setattr__(self, key, value) -> None:
        """Assign an attribute, stringifying list/dict ``doc_metadata``."""
        # Delegate to object.__setattr__ so this method is not re-entered.
        object.__setattr__(self, key, self._coerce_field_value(key, value))

    def __setitem__(self, key, value) -> None:
        """Store ``value`` under ``key`` in the instance's ``__dict__``.

        The value goes through the same coercion as attribute assignment, so
        ``doc["doc_metadata"] = {...}`` cannot leave a raw dict in a field
        declared as a string. Writing to ``__dict__`` directly keeps support
        for any hashable key, not only attribute-name strings.
        """
        self.__dict__[key] = self._coerce_field_value(key, value)

    def __getitem__(self, key) -> Any:
        """Return the value stored under ``key`` in the instance's ``__dict__``.

        Raises:
            KeyError: If ``key`` has not been set on this instance.
        """
        return self.__dict__[key]

    def copy_dynamic_attrs(self, source) -> None:
        """Copy dynamic attributes from the source object to the current object.

        An attribute counts as dynamic when it is reported by ``dir(source)``,
        is not one of ``source``'s declared dataclass fields, does not start
        with a double underscore, and its value is not callable.

        Args:
            source: A dataclass instance; ``dataclasses.fields`` raises
                ``TypeError`` for anything else.
        """
        predefined = {f.name for f in fields(source)}
        for attr in dir(source):
            if attr in predefined or attr.startswith("__"):
                continue
            # Read the attribute once and reuse it for both the callable
            # check and the copy.
            value = getattr(source, attr)
            if not callable(value):
                setattr(self, attr, value)
|
|
|
|
|
|
|
|
|
|
@dataclass
class ChunkingResult:
    """Data model for chunking result.

    Attributes:
        chunks (List[Document]): List of chunks.
        total_files (int): Total number of files.
        num_unsupported_format_files (int): Number of files with unsupported format.
        num_files_with_errors (int): Number of files with errors.
        skipped_chunks (int): Number of chunks skipped due to too few tokens.
        failed_files (Optional[List[Any]]): Defaults to None. Presumably the
            files that failed; the element type is not visible in this
            file -- confirm with the code that populates it.
    """

    chunks: List[Document]
    total_files: int
    num_unsupported_format_files: int = 0
    num_files_with_errors: int = 0
    # some chunks might be skipped due to too few tokens
    skipped_chunks: int = 0
    # Annotated on purpose: @dataclass only turns annotated names into fields.
    # Without the annotation this was a plain class attribute, absent from
    # __init__, repr, equality and asdict().
    failed_files: Optional[List[Any]] = None
|
|
|
|
|
|
|
|
class UnsupportedFormatError(Exception):
    """Signals that a parser was given a format it does not support."""
|
|
|