"""Define domain models for documents.""" from __future__ import annotations from dataclasses import dataclass, field from datetime import UTC, datetime from enum import Enum from typing import Any # Keep module behavior explicit so the backend flow stays easy to audit. def utcnow() -> datetime: return datetime.now(UTC) class DocumentStatus(str, Enum): """Define the Document Status enumeration.""" PENDING = "pending" STORED = "stored" PARSED = "parsed" INDEXED = "indexed" FAILED = "failed" @dataclass class Document: """Represent the Document type.""" doc_id: str doc_name: str file_name: str object_name: str content_type: str size_bytes: int status: DocumentStatus = DocumentStatus.PENDING regulation_type: str = "" version: str = "" summary: str = "" summary_latency_ms: int = 0 chunk_count: int = 0 parser_name: str = "" index_name: str = "" error_message: str = "" created_at: datetime = field(default_factory=utcnow) updated_at: datetime = field(default_factory=utcnow) metadata: dict[str, Any] = field(default_factory=dict) @dataclass class ParsedDocument: """Represent the Parsed Document type.""" doc_id: str doc_name: str structure_nodes: list[dict[str, Any]] semantic_blocks: list[dict[str, Any]] vector_chunks: list[dict[str, Any]] parser_name: str raw_text: str = "" raw_layouts: list[dict[str, Any]] = field(default_factory=list) metadata: dict[str, Any] = field(default_factory=dict) @dataclass class Chunk: """Represent the Chunk type.""" chunk_id: str doc_id: str doc_name: str content: str embedding_text: str section_title: str = "" section_path: list[str] = field(default_factory=list) page_number: int = 0 regulation_type: str = "" version: str = "" semantic_id: str = "" block_type: str = "" metadata: dict[str, Any] = field(default_factory=dict) @dataclass class DocumentProcessingRun: """Represent one processing attempt for a document.""" run_id: str doc_id: str trigger_type: str run_status: str parser_backend: str = "" chunk_backend: str = "" embedding_model: str = "" index_name: str = "" started_at: datetime = field(default_factory=utcnow) stored_at: datetime | None = None parsed_at: datetime | None = None indexed_at: datetime | None = None finished_at: datetime | None = None layout_count: int = 0 structure_node_count: int = 0 semantic_block_count: int = 0 vector_chunk_count: int = 0 chunk_count: int = 0 failure_stage: str = "" error_message: str = "" metadata: dict[str, Any] = field(default_factory=dict) @dataclass class DocumentStatusEvent: """Represent a document lifecycle event emitted during processing.""" event_id: str doc_id: str run_id: str from_status: str to_status: str stage: str message: str = "" metadata: dict[str, Any] = field(default_factory=dict) occurred_at: datetime = field(default_factory=utcnow) @dataclass class DocumentArtifact: """Represent a persisted artifact reference for one processing run.""" artifact_id: str doc_id: str run_id: str artifact_type: str object_name: str content_type: str byte_size: int = 0 checksum: str = "" metadata: dict[str, Any] = field(default_factory=dict) created_at: datetime = field(default_factory=utcnow)