78 lines
1.9 KiB
Python
78 lines
1.9 KiB
Python
"""Define domain models for documents."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import UTC, datetime
|
|
from enum import Enum
|
|
from typing import Any
|
|
# Keep module behavior explicit so the backend flow stays easy to audit.
|
|
|
|
|
|
def utcnow() -> datetime:
|
|
return datetime.now(UTC)
|
|
|
|
|
|
|
|
class DocumentStatus(str, Enum):
    """Closed set of status values a document record can carry.

    The ``str`` mixin makes every member an actual string, so a member
    compares equal to its raw value (``DocumentStatus.PENDING == "pending"``).
    """

    # NOTE(review): the declaration order presumably mirrors the processing
    # pipeline (pending -> stored -> parsed -> indexed); confirm with callers.
    PENDING = "pending"
    STORED = "stored"
    PARSED = "parsed"
    INDEXED = "indexed"

    # Status for a document whose processing did not succeed.
    FAILED = "failed"
|
|
|
|
|
|
@dataclass
class Document:
    """Record describing one document and its processing state.

    The first six fields are required; every other field has a default, so a
    record can be created from identity and file details alone. ``created_at``
    and ``updated_at`` are each filled by their own ``utcnow()`` call when the
    instance is built, and nothing in this class refreshes them afterwards.
    """

    # --- Required: identity and file details (no defaults) ---
    doc_id: str
    doc_name: str
    file_name: str
    # NOTE(review): presumably the key of the stored object; confirm against
    # the storage layer.
    object_name: str
    content_type: str
    size_bytes: int

    # --- Processing state ---
    status: DocumentStatus = DocumentStatus.PENDING

    # --- Descriptive attributes; empty string means "not set" by default ---
    regulation_type: str = ""
    version: str = ""
    summary: str = ""
    summary_latency_ms: int = 0

    # --- Processing results; zero/empty until populated by the caller ---
    chunk_count: int = 0
    parser_name: str = ""
    index_name: str = ""
    error_message: str = ""

    # --- Timestamps: timezone-aware UTC, taken at construction time ---
    created_at: datetime = field(default_factory=utcnow)
    updated_at: datetime = field(default_factory=utcnow)

    # Free-form extras; the factory gives each instance its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class ParsedDocument:
    """Container for the output of parsing a single document.

    Holds three lists of plain dictionaries plus the name of the parser that
    produced them. ``raw_text`` and ``metadata`` are optional and default to
    an empty string and a fresh empty dict respectively.
    """

    # --- Required: which document this output belongs to ---
    doc_id: str
    doc_name: str

    # --- Required: parser output, each a list of string-keyed dicts ---
    # NOTE(review): the dict schemas are not defined in this module; confirm
    # the expected keys against the parser implementations.
    structure_nodes: list[dict[str, Any]]
    semantic_blocks: list[dict[str, Any]]
    vector_chunks: list[dict[str, Any]]

    # --- Required: name of the parser that generated the output ---
    parser_name: str

    # --- Optional extras ---
    raw_text: str = ""
    # The factory gives each instance its own dict instead of a shared one.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class Chunk:
    """One chunk of a document, carrying its text and positional context.

    The first five fields are required. The remaining fields default to empty
    or zero values, and the mutable ones (``section_path``, ``metadata``) get
    a fresh container per instance.
    """

    # --- Required: identity ---
    chunk_id: str
    doc_id: str
    doc_name: str

    # --- Required: text payloads ---
    content: str
    # NOTE(review): presumably the text handed to the embedding model, which
    # may differ from ``content``; confirm with the indexing code.
    embedding_text: str

    # --- Optional: location within the document ---
    section_title: str = ""
    section_path: list[str] = field(default_factory=list)
    # NOTE(review): 0 looks like an "unknown page" default; confirm whether
    # real page numbers start at 1.
    page_number: int = 0

    # --- Optional: classification attributes ---
    regulation_type: str = ""
    version: str = ""
    semantic_id: str = ""
    block_type: str = ""

    # Free-form extras; never shared between instances.
    metadata: dict[str, Any] = field(default_factory=dict)
|