"""Define domain models for documents.""" from __future__ import annotations from dataclasses import dataclass, field from datetime import UTC, datetime from enum import Enum from typing import Any # Keep module behavior explicit so the backend flow stays easy to audit. def utcnow() -> datetime: return datetime.now(UTC) class DocumentStatus(str, Enum): """Define the Document Status enumeration.""" PENDING = "pending" STORED = "stored" PARSED = "parsed" INDEXED = "indexed" FAILED = "failed" @dataclass class Document: """Represent the Document type.""" doc_id: str doc_name: str file_name: str object_name: str content_type: str size_bytes: int status: DocumentStatus = DocumentStatus.PENDING regulation_type: str = "" version: str = "" summary: str = "" summary_latency_ms: int = 0 chunk_count: int = 0 parser_name: str = "" index_name: str = "" error_message: str = "" created_at: datetime = field(default_factory=utcnow) updated_at: datetime = field(default_factory=utcnow) metadata: dict[str, Any] = field(default_factory=dict) @dataclass class ParsedDocument: """Represent the Parsed Document type.""" doc_id: str doc_name: str structure_nodes: list[dict[str, Any]] semantic_blocks: list[dict[str, Any]] vector_chunks: list[dict[str, Any]] parser_name: str raw_text: str = "" raw_layouts: list[dict[str, Any]] = field(default_factory=list) metadata: dict[str, Any] = field(default_factory=dict) @dataclass(init=False) class Chunk: """Represent one retrieval chunk with backward-compatible aliases.""" chunk_id: str doc_id: str doc_title: str text: str embedding_text: str chunk_type: str = "" chunk_index: int = 0 piece_index: int = 0 page_start: int = 0 page_end: int = 0 section_title: str = "" section_path: list[str] = field(default_factory=list) section_level: int = 0 source_ids: list[str] = field(default_factory=list) regulation_type: str = "" version: str = "" semantic_id: str = "" metadata: dict[str, Any] = field(default_factory=dict) def __init__( self, *, chunk_id: str, doc_id: str, doc_title: str | None = None, text: str | None = None, embedding_text: str = "", chunk_type: str = "", chunk_index: int = 0, piece_index: int = 0, page_start: int = 0, page_end: int = 0, section_title: str = "", section_path: list[str] | None = None, section_level: int = 0, source_ids: list[str] | None = None, regulation_type: str = "", version: str = "", semantic_id: str = "", metadata: dict[str, Any] | None = None, doc_name: str | None = None, content: str | None = None, page_number: int | None = None, block_type: str | None = None, **_: Any, ) -> None: """Initialize the chunk while accepting legacy field names.""" self.chunk_id = chunk_id self.doc_id = doc_id self.doc_title = doc_title if doc_title is not None else (doc_name or "") self.text = text if text is not None else (content or "") self.embedding_text = embedding_text or self.text self.chunk_type = chunk_type or (block_type or "") self.chunk_index = int(chunk_index or 0) self.piece_index = int(piece_index or 0) self.page_start = int(page_start or page_number or 0) self.page_end = int(page_end or self.page_start) self.section_title = section_title self.section_path = list(section_path or []) self.section_level = int(section_level or 0) self.source_ids = list(source_ids or []) self.regulation_type = regulation_type self.version = version self.semantic_id = semantic_id self.metadata = dict(metadata or {}) @property def doc_name(self) -> str: """Return the legacy document name alias.""" return self.doc_title @doc_name.setter def doc_name(self, value: str) -> None: """Update the legacy document name alias.""" self.doc_title = value @property def content(self) -> str: """Return the legacy content alias.""" return self.text @content.setter def content(self, value: str) -> None: """Update the legacy content alias.""" self.text = value @property def page_number(self) -> int: """Return the legacy page number alias.""" return self.page_start @page_number.setter def page_number(self, value: int) -> None: """Update the legacy page number alias.""" self.page_start = value self.page_end = max(self.page_end, value) @property def block_type(self) -> str: """Return the legacy block type alias.""" return self.chunk_type @block_type.setter def block_type(self, value: str) -> None: """Update the legacy block type alias.""" self.chunk_type = value @dataclass class DocumentProcessingRun: """Represent one processing attempt for a document.""" run_id: str doc_id: str trigger_type: str run_status: str parser_backend: str = "" chunk_backend: str = "" embedding_model: str = "" index_name: str = "" started_at: datetime = field(default_factory=utcnow) stored_at: datetime | None = None parsed_at: datetime | None = None indexed_at: datetime | None = None finished_at: datetime | None = None layout_count: int = 0 structure_node_count: int = 0 semantic_block_count: int = 0 vector_chunk_count: int = 0 chunk_count: int = 0 failure_stage: str = "" error_message: str = "" metadata: dict[str, Any] = field(default_factory=dict) @dataclass class DocumentStatusEvent: """Represent a document lifecycle event emitted during processing.""" event_id: str doc_id: str run_id: str from_status: str to_status: str stage: str message: str = "" metadata: dict[str, Any] = field(default_factory=dict) occurred_at: datetime = field(default_factory=utcnow) @dataclass class DocumentArtifact: """Represent a persisted artifact reference for one processing run.""" artifact_id: str doc_id: str run_id: str artifact_type: str object_name: str content_type: str byte_size: int = 0 checksum: str = "" metadata: dict[str, Any] = field(default_factory=dict) created_at: datetime = field(default_factory=utcnow)