# Files
# catonline_ai/vw-document-ai-indexer/entity_models.py
# 2025-09-26 17:15:54 +08:00
#
# 133 lines
# 3.9 KiB
# Python

from typing import Dict, List, Optional, Any
from dataclasses import dataclass, fields
from dataclasses_json import dataclass_json
@dataclass_json
@dataclass
class DiResult:
    """Bundle of the figures and text obtained for a single file.

    NOTE(review): "Di" presumably stands for a document-intelligence
    analysis step; the producer is not visible in this module - confirm.

    Attributes:
        figures: The ``FigureFlat`` records that belong to this result.
        di_content: Text content of the result, held as one string.
        filepath: Path of the file this result refers to (presumably the
            analysed input file - confirm against the caller).
        language: Language associated with the result; the exact format
            (code vs. name) is not established here.
    """

    figures: List["FigureFlat"]
    di_content: str
    filepath: str
    language: str
@dataclass_json
@dataclass
class FigureFlat:
    """Flat record describing one figure.

    All fields are required and are taken positionally in the order
    declared below. Field meanings are inferred from their names only -
    TODO confirm against the code that builds these records.

    Attributes:
        offset: Integer offset (presumably of the figure within the
            surrounding content).
        length: Integer length (presumably of the span starting at
            ``offset``).
        url: URL string associated with the figure.
        content: Text content associated with the figure.
        image: Image payload or reference, stored as a string.
        understand_flag: Boolean flag; presumably marks whether the figure
            has been (or should be) processed by an understanding step.
        caption: Caption text of the figure.
    """

    offset: int
    length: int
    url: str
    content: str
    image: str
    understand_flag: bool
    caption: str
def dict_to_str(v):
    """Return ``v`` as a string.

    Strings are passed through untouched; any other value is converted with
    the built-in ``str()`` (so a dict or list yields its Python repr-style
    text, not JSON).
    """
    if isinstance(v, str):
        return v
    return str(v)
@dataclass
class Document:
    """Record holding one document together with the data stored alongside it.

    Every declared field is optional and defaults to ``None``. Besides normal
    attribute access, instances support ``doc[key]`` / ``doc[key] = value``
    and may carry extra attributes that are not declared as fields (see
    ``copy_dynamic_attrs``).

    Attributes:
        content: The content of the document.
        id: The id of the document.
        title: The title of the document.
        filepath: The filepath of the document.
        url: The url of the document.
        metadata: The metadata of the document.
        doc_metadata: Kept as a string; a list or dict assigned through
            attribute access is converted with ``dict_to_str``.

    The remaining fields (``image_mapping``, ``document_schema``, the title /
    publisher / language fields, ``full_headers``, ``h1``-``h6``,
    ``contentVector`` and ``full_metadata_vector``) are not described in this
    module; their meaning is presumably what their names suggest - confirm
    against the code that fills them.
    """

    content: Optional[str] = None
    id: Optional[str] = None
    title: Optional[str] = None
    filepath: Optional[str] = None
    url: Optional[str] = None
    metadata: Optional[Dict] = None
    image_mapping: Optional[Dict] = None
    doc_metadata: Optional[str] = None
    document_schema: Optional[str] = None
    main_title: Optional[str] = None
    sub_title: Optional[str] = None
    publisher: Optional[str] = None
    document_code: Optional[str] = None
    document_category: Optional[str] = None
    main_title_sec_language: Optional[str] = None
    sub_title_sec_language: Optional[str] = None
    primary_language: Optional[str] = None
    secondary_language: Optional[str] = None
    full_headers: Optional[str] = None
    h1: Optional[str] = None
    h2: Optional[str] = None
    h3: Optional[str] = None
    h4: Optional[str] = None
    h5: Optional[str] = None
    h6: Optional[str] = None
    contentVector: Optional[List[float]] = None
    full_metadata_vector: Optional[List[float]] = None

    def __setattr__(self, key, value) -> None:
        """Assign ``value`` to ``key``, stringifying list/dict ``doc_metadata``.

        Only the ``doc_metadata`` attribute is affected; all other
        attributes are stored exactly as given.
        """
        # isinstance() is already False for None, so no separate None check.
        if key == "doc_metadata" and isinstance(value, (list, dict)):
            value = dict_to_str(value)
        # Delegate to the base implementation; calling setattr() here would
        # re-enter this method and recurse forever.
        object.__setattr__(self, key, value)

    def __setitem__(self, key, value) -> None:
        """Store ``value`` under ``key`` directly in the instance ``__dict__``.

        Because this writes to ``__dict__`` without going through
        ``__setattr__``, the ``doc_metadata`` conversion is not applied.
        """
        vars(self)[key] = value

    def __getitem__(self, key) -> Any:
        """Return the value stored under ``key`` in the instance ``__dict__``.

        Raises:
            KeyError: If ``key`` is not present on the instance.
        """
        return vars(self)[key]

    def copy_dynamic_attrs(self, source) -> None:
        """Copy the non-field data attributes of ``source`` onto this object.

        An attribute is copied when it is not a declared dataclass field of
        ``source``, its name does not start with a double underscore, and its
        value is not callable. ``source`` must be a dataclass (instance or
        class), since ``dataclasses.fields`` is used to find declared fields.
        """
        declared = {spec.name for spec in fields(source)}
        for name in dir(source):
            # Skip declared fields and dunder names.
            if name in declared or name.startswith('__'):
                continue
            candidate = getattr(source, name)
            # Methods and other callables are not data; leave them out.
            if callable(candidate):
                continue
            setattr(self, name, candidate)
@dataclass
class ChunkingResult:
    """Data model for the outcome of a chunking run.

    Attributes:
        chunks: List of chunks, each a ``Document``.
        total_files: Total number of files.
        num_unsupported_format_files: Number of files with unsupported format.
        num_files_with_errors: Number of files with errors.
        skipped_chunks: Number of chunks skipped due to too few tokens.
        failed_files: Defaults to ``None``. NOTE(review): presumably the
            files that failed during chunking; the concrete type is not
            visible in this module - confirm and tighten the annotation.
    """

    # Quoted forward reference, the same form DiResult.figures uses.
    chunks: List["Document"]
    total_files: int
    num_unsupported_format_files: int = 0
    num_files_with_errors: int = 0
    # Some chunks might be skipped due to too few tokens.
    skipped_chunks: int = 0
    # Annotated so @dataclass registers it as a real field. Without the
    # annotation it was only a class attribute: it could not be passed to the
    # constructor and was missing from fields(), repr() and ==. The default
    # (and the class-level value) is still None.
    failed_files: Any = None
class UnsupportedFormatError(Exception):
    """Signals that a parser was given a format it does not support."""