from dataclasses import dataclass, fields
from typing import Any, Dict, List, Optional

from dataclasses_json import dataclass_json


@dataclass_json
@dataclass
class DiResult:
    """Data class for storing a result made of content text and its figures.

    NOTE(review): the field meanings below are inferred from the field
    names; confirm against the code that builds these objects.

    Attributes:
        figures (List[FigureFlat]): The figures that belong to the content.
        di_content (str): The content text of the result.
        filepath (str): The filepath associated with the result.
        language (str): The language associated with the result.
    """

    figures: List['FigureFlat']
    di_content: str
    filepath: str
    language: str


@dataclass_json
@dataclass
class FigureFlat:
    """Data class for storing a single figure as a flat record.

    NOTE(review): the field meanings below are inferred from the field
    names; confirm against the code that builds these objects.

    Attributes:
        offset (int): Presumably the position where the figure starts.
        length (int): Presumably the length of the figure's span.
        url (str): The url of the figure.
        content (str): The content of the figure.
        image (str): The image of the figure, stored as a string.
        understand_flag (bool): A flag related to figure understanding.
        caption (str): The caption of the figure.
    """

    offset: int
    length: int
    url: str
    content: str
    image: str
    understand_flag: bool
    caption: str


def dict_to_str(v):
    """Return ``v`` unchanged if it is already a string, else ``str(v)``.

    Despite the name, any non-string value is accepted. The conversion uses
    ``str()``, so a dict becomes its Python literal form (single-quoted
    keys), which is not JSON.
    """
    if isinstance(v, str):
        return v
    return str(v)


def _coerce_for_storage(key, value):
    """Return ``value`` in the form ``Document`` stores it under ``key``.

    Only ``doc_metadata`` is special: a list or dict assigned to it is
    converted to a string. Every other key/value pair is returned unchanged.
    """
    if key == "doc_metadata" and isinstance(value, (list, dict)):
        return dict_to_str(value)
    return value


@dataclass
class Document:
    """A data class for storing documents

    Instances support both attribute access (``doc.title``) and item access
    (``doc["title"]``). Attributes that are not declared below can also be
    attached at runtime; see ``copy_dynamic_attrs``.

    Attributes:
        content (str): The content of the document.
        id (Optional[str]): The id of the document.
        title (Optional[str]): The title of the document.
        filepath (Optional[str]): The filepath of the document.
        url (Optional[str]): The url of the document.
        metadata (Optional[Dict]): The metadata of the document.
        image_mapping (Optional[Dict]): Optional mapping related to images.
        doc_metadata (Optional[str]): Document metadata kept as a string; a
            list or dict assigned here is converted to a string.
        contentVector (Optional[List[float]]): Optional vector of floats
            (presumably an embedding of ``content`` -- confirm).
        full_metadata_vector (Optional[List[float]]): Optional vector of
            floats (presumably an embedding of the metadata -- confirm).

    The remaining fields are optional strings that default to ``None``:
    ``document_schema``, ``main_title``, ``sub_title``, ``publisher``,
    ``document_code``, ``document_category``, ``main_title_sec_language``,
    ``sub_title_sec_language``, ``primary_language``, ``secondary_language``,
    ``full_headers`` and ``h1`` through ``h6``.
    """

    content: Optional[str] = None
    id: Optional[str] = None
    title: Optional[str] = None
    filepath: Optional[str] = None
    url: Optional[str] = None
    metadata: Optional[Dict] = None
    image_mapping: Optional[Dict] = None
    doc_metadata: Optional[str] = None
    document_schema: Optional[str] = None
    main_title: Optional[str] = None
    sub_title: Optional[str] = None
    publisher: Optional[str] = None
    document_code: Optional[str] = None
    document_category: Optional[str] = None
    main_title_sec_language: Optional[str] = None
    sub_title_sec_language: Optional[str] = None
    primary_language: Optional[str] = None
    secondary_language: Optional[str] = None
    full_headers: Optional[str] = None
    h1: Optional[str] = None
    h2: Optional[str] = None
    h3: Optional[str] = None
    h4: Optional[str] = None
    h5: Optional[str] = None
    h6: Optional[str] = None
    contentVector: Optional[List[float]] = None
    full_metadata_vector: Optional[List[float]] = None

    def __setattr__(self, key, value) -> None:
        """Set an attribute, converting list/dict ``doc_metadata`` to a string.

        The dataclass-generated ``__init__`` assigns fields through attribute
        assignment, so constructor arguments get the same conversion.
        """
        # Delegate to object.__setattr__ to avoid infinite recursion of
        # __setattr__ calls.
        object.__setattr__(self, key, _coerce_for_storage(key, value))

    def __setitem__(self, key, value) -> None:
        """Store ``value`` under ``key`` in the instance's ``__dict__``.

        The value gets the same ``doc_metadata`` conversion as attribute
        assignment, so ``doc["doc_metadata"] = {...}`` and
        ``doc.doc_metadata = {...}`` store the same thing.
        """
        self.__dict__[key] = _coerce_for_storage(key, value)

    def __getitem__(self, key) -> Any:
        """Return the value stored under ``key`` in the instance's ``__dict__``.

        Raises:
            KeyError: If ``key`` is not present in the instance's ``__dict__``.
        """
        return self.__dict__[key]

    def copy_dynamic_attrs(self, source) -> None:
        """Copy dynamic attributes from the source object to the current object

        An attribute is copied when it is not a declared dataclass field of
        ``source``, its name does not start with a double underscore, and
        its value is not callable.

        Args:
            source: A dataclass instance to copy from.

        Raises:
            TypeError: If ``source`` is not a dataclass (raised by
                ``dataclasses.fields``).
        """
        predefined = {f.name for f in fields(source)}
        for attr in dir(source):
            # Skip declared fields and dunder names before touching the value.
            if attr in predefined or attr.startswith('__'):
                continue
            value = getattr(source, attr)
            # Methods and other callables are behavior, not data.
            if callable(value):
                continue
            setattr(self, attr, value)


@dataclass
class ChunkingResult:
    """Data model for chunking result

    Attributes:
        chunks (List[Document]): List of chunks.
        total_files (int): Total number of files.
        num_unsupported_format_files (int): Number of files with unsupported format.
        num_files_with_errors (int): Number of files with errors.
        skipped_chunks (int): Number of chunks skipped due to too few tokens.
    """

    chunks: List[Document]
    total_files: int
    num_unsupported_format_files: int = 0
    num_files_with_errors: int = 0
    # some chunks might be skipped due to too few tokens
    skipped_chunks: int = 0
    # NOTE(review): no type annotation, so this is a plain class attribute and
    # NOT a dataclass field: it is not accepted by __init__ and is absent from
    # repr/eq/asdict. Confirm whether it was meant to be a field.
    failed_files = None


class UnsupportedFormatError(Exception):
    """Exception raised when a format is not supported by a parser."""