# catonline_ai/vw-document-ai-indexer/entity_models.py

from typing import Dict, List, Optional, Any
from dataclasses import dataclass, fields
from dataclasses_json import dataclass_json


@dataclass_json
@dataclass
class DiResult:
    """Data class for storing a DI result.

    NOTE(review): "DI" presumably stands for Document Intelligence -- confirm
    against the code that produces this object.

    Attributes:
        figures (List[FigureFlat]): Figures belonging to the result.
        di_content (str): The DI content as a string.
        filepath (str): The file path associated with the result.
        language (str): The language associated with the result.
    """

    figures: List["FigureFlat"]
    di_content: str
    filepath: str
    language: str


@dataclass_json
@dataclass
class FigureFlat:
    """Flat record describing a single figure.

    NOTE(review): ``offset``/``length`` presumably locate the figure inside
    the DI content, and ``image`` presumably holds encoded image data --
    confirm against the producer of these records.

    Attributes:
        offset (int): Offset of the figure.
        length (int): Length of the figure span.
        url (str): URL of the figure.
        content (str): Content of the figure.
        image (str): Image of the figure, as a string.
        understand_flag (bool): Flag related to figure understanding.
        caption (str): Caption of the figure.
    """

    offset: int
    length: int
    url: str
    content: str
    image: str
    understand_flag: bool
    caption: str


def dict_to_str(v):
    """Return ``v`` as a string.

    A value that is already a ``str`` is returned unchanged; any other value
    (not only dicts, despite the name) is rendered with the built-in ``str``.
    """
    if isinstance(v, str):
        return v
    return str(v)

@dataclass
class Document:
    """A data class for storing documents.

    Besides the declared fields, an instance accepts arbitrary extra
    ("dynamic") attributes and supports ``doc[key]`` / ``doc[key] = value``
    access backed by the instance ``__dict__``.

    Attributes:
        content (Optional[str]): The content of the document.
        id (Optional[str]): The id of the document.
        title (Optional[str]): The title of the document.
        filepath (Optional[str]): The filepath of the document.
        url (Optional[str]): The url of the document.
        metadata (Optional[Dict]): The metadata of the document.
        image_mapping (Optional[Dict]): Image mapping of the document.
        doc_metadata (Optional[str]): Document metadata kept as a string; a
            list or dict assigned to it is converted with ``dict_to_str``.
        document_schema (Optional[str]): The schema of the document.
        main_title (Optional[str]): The main title.
        sub_title (Optional[str]): The sub title.
        publisher (Optional[str]): The publisher.
        document_code (Optional[str]): The document code.
        document_category (Optional[str]): The document category.
        main_title_sec_language (Optional[str]): Main title in the
            secondary language.
        sub_title_sec_language (Optional[str]): Sub title in the secondary
            language.
        primary_language (Optional[str]): The primary language.
        secondary_language (Optional[str]): The secondary language.
        full_headers (Optional[str]): The full headers.
        h1 .. h6 (Optional[str]): Per-level header values (presumably the
            heading text at levels 1-6 -- confirm with the chunker).
        contentVector (Optional[List[float]]): Vector for the content
            (presumably an embedding -- confirm).
        full_metadata_vector (Optional[List[float]]): Vector for the full
            metadata (presumably an embedding -- confirm).
    """

    content: Optional[str] = None
    id: Optional[str] = None
    title: Optional[str] = None
    filepath: Optional[str] = None

    url: Optional[str] = None
    metadata: Optional[Dict] = None
    image_mapping: Optional[Dict] = None

    doc_metadata: Optional[str] = None
    document_schema: Optional[str] = None
    main_title: Optional[str] = None
    sub_title: Optional[str] = None
    publisher: Optional[str] = None
    document_code: Optional[str] = None
    document_category: Optional[str] = None
    main_title_sec_language: Optional[str] = None
    sub_title_sec_language: Optional[str] = None
    primary_language: Optional[str] = None
    secondary_language: Optional[str] = None

    full_headers: Optional[str] = None
    h1: Optional[str] = None
    h2: Optional[str] = None
    h3: Optional[str] = None
    h4: Optional[str] = None
    h5: Optional[str] = None
    h6: Optional[str] = None

    contentVector: Optional[List[float]] = None
    full_metadata_vector: Optional[List[float]] = None

    def __setattr__(self, key, value) -> None:
        """Set an attribute, storing list/dict ``doc_metadata`` as a string."""
        # isinstance() is already False for None, so no separate None check
        # is needed.
        if key == "doc_metadata" and isinstance(value, (list, dict)):
            value = dict_to_str(value)
        # Delegate to object.__setattr__ to avoid infinite recursion.
        object.__setattr__(self, key, value)

    def __setitem__(self, key, value) -> None:
        """Set ``self[key] = value`` with the same rules as attribute assignment.

        String keys go through ``__setattr__`` so that ``doc["doc_metadata"]``
        gets the same list/dict-to-string conversion as ``doc.doc_metadata``.
        """
        if isinstance(key, str):
            setattr(self, key, value)
        else:
            # setattr() rejects non-string names; keep accepting such keys
            # by writing to the instance __dict__ directly, as before.
            self.__dict__[key] = value

    def __getitem__(self, key) -> Any:
        """Return the value stored under ``key`` in the instance ``__dict__``.

        Raises:
            KeyError: If ``key`` is not present on the instance.
        """
        return self.__dict__[key]

    def copy_dynamic_attrs(self, source) -> None:
        """Copy dynamic attributes from the source object to the current object.

        An attribute of ``source`` is considered dynamic when it is not one
        of its declared dataclass fields, its name does not start with a
        double underscore, and its value is not callable.

        Args:
            source: A dataclass instance to copy the extra attributes from.

        Raises:
            TypeError: If ``source`` is not a dataclass (raised by
                ``dataclasses.fields``).
        """
        predefined = {f.name for f in fields(source)}
        for attr in dir(source):
            if attr in predefined or attr.startswith('__'):
                continue
            # Read the value once and reuse it for both the check and the copy.
            value = getattr(source, attr)
            if not callable(value):
                setattr(self, attr, value)


@dataclass
class ChunkingResult:
    """Data model for chunking result

    Attributes:
        chunks (List[Document]): List of chunks.
        total_files (int): Total number of files.
        num_unsupported_format_files (int): Number of files with unsupported format.
        num_files_with_errors (int): Number of files with errors.
        skipped_chunks (int): Number of chunks skipped due to too few tokens.
        failed_files (Optional[Any]): Information about failed files;
            defaults to None.
    """
    chunks: List[Document]
    total_files: int
    num_unsupported_format_files: int = 0
    num_files_with_errors: int = 0
    # some chunks might be skipped due to too few tokens
    skipped_chunks: int = 0
    # Annotated so @dataclass treats it as a real field: without an
    # annotation it is only a class attribute, cannot be passed to the
    # constructor and is left out of repr/eq/asdict.
    # NOTE(review): the shape of this value is not visible in this file --
    # confirm with the callers and tighten the annotation.
    failed_files: Optional[Any] = None


class UnsupportedFormatError(Exception):
    """Signal that a parser was given a format it does not support.

    Adds no behaviour of its own on top of ``Exception``.
    """