init
This commit is contained in:
132
vw-document-ai-indexer/entity_models.py
Normal file
132
vw-document-ai-indexer/entity_models.py
Normal file
@@ -0,0 +1,132 @@
|
||||
from typing import Dict, List, Optional, Any
|
||||
from dataclasses import dataclass, fields
|
||||
from dataclasses_json import dataclass_json
|
||||
|
||||
|
||||
@dataclass_json
@dataclass
class DiResult:
    """Data class for storing a "DI" result: a content string plus its figures.

    NOTE(review): "DI" presumably stands for Document Intelligence -- confirm
    against the code that produces this object.

    All four fields are required (no defaults), so they must be supplied in
    declaration order or by keyword.
    """

    # Figures belonging to this result, as flat ``FigureFlat`` records
    # (forward reference: ``FigureFlat`` is declared later in this module).
    figures: List['FigureFlat']
    # Content string of the result (presumably the extracted document
    # text -- TODO confirm format).
    di_content: str
    # File path associated with the result (presumably the source
    # document -- confirm whether local path or blob name).
    filepath: str
    # Language associated with the result (representation, e.g. ISO code
    # vs. display name, is not visible here).
    language: str
|
||||
|
||||
|
||||
@dataclass_json
@dataclass
class FigureFlat:
    """Flat record describing a single figure.

    All fields are required (no defaults). The semantic notes below are
    inferred from the field names only and should be confirmed against the
    code that builds these records.
    """

    # presumably the figure's start position within the parent content -- TODO confirm unit
    offset: int
    # presumably the span length of the figure within the parent content -- TODO confirm unit
    length: int
    # URL associated with the figure (presumably where the image is stored -- confirm)
    url: str
    # Textual content associated with the figure
    content: str
    # Image payload or reference as a string (encoding not visible here -- confirm)
    image: str
    # Boolean flag; presumably marks whether figure understanding was applied -- confirm
    understand_flag: bool
    # Caption text of the figure
    caption: str
|
||||
|
||||
|
||||
def dict_to_str(v):
    """Return ``v`` as a string.

    Strings (including ``str`` subclasses) are handed back untouched; any
    other value is rendered through the built-in ``str()``.
    """
    if isinstance(v, str):
        return v
    return str(v)
|
||||
|
||||
@dataclass
class Document:
    """A data class for storing a document and its associated index fields.

    Every field is optional and defaults to ``None``. Besides normal
    attribute access, instances support ``doc[key]`` / ``doc[key] = value``
    item access backed by the instance ``__dict__``, and may carry extra
    (non-field) attributes that are set dynamically.

    ``doc_metadata`` is always stored as a string: a ``list`` or ``dict``
    assigned to it (through the constructor, attribute assignment or item
    assignment) is converted with ``str()``.

    Attributes:
        content (Optional[str]): The content of the document.
        id (Optional[str]): The id of the document.
        title (Optional[str]): The title of the document.
        filepath (Optional[str]): The filepath of the document.
        url (Optional[str]): The url of the document.
        metadata (Optional[Dict]): The metadata of the document.
        image_mapping (Optional[Dict]): Image mapping of the document
            (key/value layout is not visible in this module).
        doc_metadata (Optional[str]): Document metadata kept as a string.
        document_schema (Optional[str]): Schema of the document.
        main_title (Optional[str]): Main title.
        sub_title (Optional[str]): Sub title.
        publisher (Optional[str]): Publisher.
        document_code (Optional[str]): Document code.
        document_category (Optional[str]): Document category.
        main_title_sec_language (Optional[str]): Main title in the
            secondary language.
        sub_title_sec_language (Optional[str]): Sub title in the secondary
            language.
        primary_language (Optional[str]): Primary language.
        secondary_language (Optional[str]): Secondary language.
        full_headers (Optional[str]): Full headers string (presumably the
            combined heading path -- confirm).
        h1 .. h6 (Optional[str]): Header values for levels 1 to 6.
        contentVector (Optional[List[float]]): Vector for ``content``
            (presumably an embedding -- confirm).
        full_metadata_vector (Optional[List[float]]): Vector for the full
            metadata (presumably an embedding -- confirm).
    """

    content: Optional[str] = None
    id: Optional[str] = None
    title: Optional[str] = None
    filepath: Optional[str] = None

    url: Optional[str] = None
    metadata: Optional[Dict] = None
    image_mapping: Optional[Dict] = None

    doc_metadata: Optional[str] = None
    document_schema: Optional[str] = None
    main_title: Optional[str] = None
    sub_title: Optional[str] = None
    publisher: Optional[str] = None
    document_code: Optional[str] = None
    document_category: Optional[str] = None
    main_title_sec_language: Optional[str] = None
    sub_title_sec_language: Optional[str] = None
    primary_language: Optional[str] = None
    secondary_language: Optional[str] = None

    full_headers: Optional[str] = None
    h1: Optional[str] = None
    h2: Optional[str] = None
    h3: Optional[str] = None
    h4: Optional[str] = None
    h5: Optional[str] = None
    h6: Optional[str] = None

    contentVector: Optional[List[float]] = None
    full_metadata_vector: Optional[List[float]] = None

    @staticmethod
    def _normalize(key, value) -> Any:
        """Return the value to store for ``key``.

        A ``list`` or ``dict`` destined for ``doc_metadata`` is converted to
        its ``str()`` form; every other key/value pair is returned unchanged.
        """
        if key == "doc_metadata" and isinstance(value, (list, dict)):
            return str(value)
        return value

    def __setattr__(self, key, value) -> None:
        """Set attribute ``key``, storing list/dict ``doc_metadata`` as a string."""
        # Delegate to object.__setattr__ to avoid infinite recursion into
        # this override.
        object.__setattr__(self, key, self._normalize(key, value))

    def __setitem__(self, key, value) -> None:
        """Store ``value`` under ``key`` directly in the instance ``__dict__``.

        The same ``doc_metadata`` normalisation as attribute assignment is
        applied, so both write paths store the same representation.
        """
        self.__dict__[key] = self._normalize(key, value)

    def __getitem__(self, key) -> Any:
        """Return the value stored under ``key`` in the instance ``__dict__``.

        Raises:
            KeyError: If ``key`` is not present on the instance.
        """
        return self.__dict__[key]

    def copy_dynamic_attrs(self, source) -> None:
        """Copy dynamic attributes from the source object to the current object.

        An attribute is considered dynamic when it is not a dataclass field
        of ``source``, its name does not start with ``__`` and its value is
        not callable.

        Args:
            source: A dataclass instance (or class) to copy attributes from.

        Raises:
            TypeError: If ``source`` is not a dataclass (raised by
                ``dataclasses.fields``).
        """
        predefined = {f.name for f in fields(source)}
        for attr in dir(source):
            if attr in predefined or attr.startswith('__'):
                continue
            # Resolve the attribute once and reuse it for both the callable
            # check and the copy.
            value = getattr(source, attr)
            if callable(value):
                continue
            setattr(self, attr, value)
|
||||
|
||||
|
||||
|
||||
|
||||
@dataclass
class ChunkingResult:
    """Data model for chunking result

    Attributes:
        chunks (List[Document]): List of chunks.
        total_files (int): Total number of files.
        num_unsupported_format_files (int): Number of files with unsupported format.
        num_files_with_errors (int): Number of files with errors.
        skipped_chunks (int): Number of chunks skipped due to too few tokens.
        failed_files (Optional[Any]): Failed files, ``None`` by default.
            NOTE(review): the concrete container/element type is not visible
            in this module -- tighten the annotation once confirmed.
    """

    # Forward reference (same style as ``DiResult.figures``) so the
    # annotation is resolved lazily.
    chunks: List["Document"]
    total_files: int
    num_unsupported_format_files: int = 0
    num_files_with_errors: int = 0
    # some chunks might be skipped due to too few tokens
    skipped_chunks: int = 0
    # Annotated so that it is a real dataclass field: without an annotation
    # it was only a class attribute, could not be passed to the constructor
    # and was ignored by repr/eq/fields()/asdict().
    failed_files: Optional[Any] = None
|
||||
|
||||
|
||||
|
||||
class UnsupportedFormatError(Exception):
    """Raised when a parser is given a format it does not support."""
|
||||
|
||||
Reference in New Issue
Block a user