init

2025-09-26 17:15:54 +08:00
commit db0e5965ec
211 changed files with 40437 additions and 0 deletions
--- a/vw-document-ai-indexer/entity_models.py
+++ b/vw-document-ai-indexer/entity_models.py
@@ -0,0 +1,132 @@
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, fields
+from dataclasses_json import dataclass_json
+
+
+@dataclass_json
+@dataclass
+class DiResult:
+    """Data class bundling a list of figures with content and file details.
+
+    Attributes:
+        figures (List[FigureFlat]): The figure records of this result.
+        di_content (str): The content string of this result (presumably the
+            text produced by the document-intelligence step -- TODO confirm).
+        filepath (str): The file path string of this result.
+        language (str): The language string of this result.
+    """
+
+    figures: List['FigureFlat']
+    di_content: str
+    filepath: str
+    language: str
+
+
+@dataclass_json
+@dataclass
+class FigureFlat:
+    """Flat data class holding the values recorded for one figure.
+
+    Attributes:
+        offset (int): Integer offset of the figure (presumably its position
+            inside the surrounding content -- TODO confirm).
+        length (int): Integer length of the figure (presumably measured in
+            the same unit as ``offset`` -- TODO confirm).
+        url (str): URL string of the figure.
+        content (str): Content string of the figure.
+        image (str): Image string of the figure (encoding/format is not
+            visible here -- TODO confirm).
+        understand_flag (bool): Boolean flag of the figure (presumably set
+            once the figure has been interpreted -- TODO confirm).
+        caption (str): Caption string of the figure.
+    """
+
+    offset: int
+    length: int
+    url: str
+    content: str
+    image: str
+    understand_flag: bool
+    caption: str
+
+
+def dict_to_str(v):
+    """Return ``v`` itself if it is a string, otherwise ``str(v)``.
+
+    Non-string values (dicts and lists included) are rendered with Python's
+    built-in ``str()``, so the result is not guaranteed to be valid JSON.
+    """
+    if isinstance(v, str):
+        return v
+    return str(v)
+
+@dataclass
+class Document:
+    """A data class for storing documents.
+
+    Besides the declared fields, an instance accepts arbitrary extra
+    ("dynamic") attributes. Every value stored on the instance can be read
+    and written either as an attribute or with item syntax (``doc["key"]``).
+
+    Attributes:
+        content (Optional[str]): The content of the document.
+        id (Optional[str]): The id of the document.
+        title (Optional[str]): The title of the document.
+        filepath (Optional[str]): The filepath of the document.
+        url (Optional[str]): The url of the document.
+        metadata (Optional[Dict]): The metadata of the document.
+        image_mapping (Optional[Dict]): A dict attached to the document
+            (presumably maps image references to their data -- TODO confirm).
+        doc_metadata (Optional[str]): Kept as a string; a list or dict
+            assigned to it (by attribute or by item) is converted with
+            ``dict_to_str``.
+        contentVector (Optional[List[float]]): A list of floats (presumably
+            the embedding of ``content`` -- TODO confirm).
+        full_metadata_vector (Optional[List[float]]): A list of floats
+            (presumably the embedding of the metadata -- TODO confirm).
+
+    All remaining declared fields (``document_schema``, the title, language
+    and publisher fields, ``full_headers`` and ``h1`` .. ``h6``) are optional
+    strings defaulting to ``None``.
+    """
+
+    content: Optional[str] = None
+    id: Optional[str] = None
+    title: Optional[str] = None
+    filepath: Optional[str] = None
+
+    url: Optional[str] = None
+    metadata: Optional[Dict] = None
+    image_mapping: Optional[Dict] = None
+
+    doc_metadata: Optional[str] = None
+    document_schema: Optional[str] = None
+    main_title: Optional[str] = None
+    sub_title: Optional[str] = None
+    publisher: Optional[str] = None
+    document_code: Optional[str] = None
+    document_category: Optional[str] = None
+    main_title_sec_language: Optional[str] = None
+    sub_title_sec_language: Optional[str] = None
+    primary_language: Optional[str] = None
+    secondary_language: Optional[str] = None
+
+    full_headers: Optional[str] = None
+    h1: Optional[str] = None
+    h2: Optional[str] = None
+    h3: Optional[str] = None
+    h4: Optional[str] = None
+    h5: Optional[str] = None
+    h6: Optional[str] = None
+
+    contentVector: Optional[List[float]] = None
+    full_metadata_vector: Optional[List[float]] = None
+
+    @staticmethod
+    def _normalize_field(key, value) -> Any:
+        """Return ``value`` in the form in which it is stored under ``key``.
+
+        Only ``doc_metadata`` is special: a list or dict is converted to a
+        string with ``dict_to_str``. Everything else is returned unchanged.
+        """
+        if key == "doc_metadata" and isinstance(value, (list, dict)):
+            return dict_to_str(value)
+        return value
+
+    def __setattr__(self, key, value) -> None:
+        """Set attribute ``key``, stringifying list/dict ``doc_metadata``."""
+        # Delegate to object.__setattr__ to avoid recursing into this method.
+        object.__setattr__(self, key, self._normalize_field(key, value))
+
+    def __setitem__(self, key, value) -> None:
+        """Store ``value`` under ``key`` in the instance's ``__dict__``.
+
+        The same normalization as attribute assignment is applied, so that
+        ``doc["doc_metadata"] = {...}`` and ``doc.doc_metadata = {...}``
+        leave the instance in the same state.
+        """
+        self.__dict__[key] = self._normalize_field(key, value)
+
+    def __getitem__(self, key) -> Any:
+        """Return the value stored under ``key`` in the instance's ``__dict__``.
+
+        Raises:
+            KeyError: If nothing is stored under ``key``.
+        """
+        return self.__dict__[key]
+
+    def copy_dynamic_attrs(self, source) -> None:
+        """Copy dynamic attributes from the source object to the current object.
+
+        A dynamic attribute is any name listed by ``dir(source)`` that is not
+        a declared dataclass field of ``source``, does not start with a
+        double underscore, and whose value is not callable.
+
+        Args:
+            source: A dataclass instance to copy from.
+
+        Raises:
+            TypeError: If ``source`` is not a dataclass (raised by
+                ``dataclasses.fields``).
+        """
+        predefined = {f.name for f in fields(source)}
+        for attr in dir(source):
+            if attr in predefined or attr.startswith('__'):
+                continue
+            # Look the value up once and reuse it for the test and the copy.
+            value = getattr(source, attr)
+            if callable(value):
+                continue
+            setattr(self, attr, value)
+
+
+
+
+@dataclass
+class ChunkingResult:
+    """Data model for chunking result.
+
+    Attributes:
+        chunks (List[Document]): List of chunks.
+        total_files (int): Total number of files.
+        num_unsupported_format_files (int): Number of files with unsupported format.
+        num_files_with_errors (int): Number of files with errors.
+        skipped_chunks (int): Number of chunks skipped due to too few tokens.
+        failed_files (Any): Record of failed files, ``None`` by default
+            (the expected container type is not visible here -- TODO confirm).
+    """
+    chunks: List[Document]
+    total_files: int
+    num_unsupported_format_files: int = 0
+    num_files_with_errors: int = 0
+    # some chunks might be skipped due to too few tokens
+    skipped_chunks: int = 0
+    # Annotated on purpose: without an annotation @dataclass treats the name
+    # as a plain class attribute and leaves it out of __init__/repr/eq.
+    failed_files: Any = None
+    
+
+
+class UnsupportedFormatError(Exception):
+    """Error signalling that a parser does not support the given format."""
+