"""Define domain models for documents.""" from __future__ import annotations from dataclasses import dataclass, field from datetime import UTC, datetime from enum import Enum from typing import Any # Keep module behavior explicit so the backend flow stays easy to audit. def utcnow() -> datetime: return datetime.now(UTC) class DocumentStatus(str, Enum): """Define the Document Status enumeration.""" PENDING = "pending" STORED = "stored" PARSED = "parsed" INDEXED = "indexed" FAILED = "failed" @dataclass class Document: """Represent the Document type.""" doc_id: str doc_name: str file_name: str object_name: str content_type: str size_bytes: int status: DocumentStatus = DocumentStatus.PENDING regulation_type: str = "" version: str = "" summary: str = "" summary_latency_ms: int = 0 chunk_count: int = 0 parser_name: str = "" index_name: str = "" error_message: str = "" created_at: datetime = field(default_factory=utcnow) updated_at: datetime = field(default_factory=utcnow) metadata: dict[str, Any] = field(default_factory=dict) @dataclass class ParsedDocument: """Represent the Parsed Document type.""" doc_id: str doc_name: str structure_nodes: list[dict[str, Any]] semantic_blocks: list[dict[str, Any]] vector_chunks: list[dict[str, Any]] parser_name: str raw_text: str = "" raw_layouts: list[dict[str, Any]] = field(default_factory=list) metadata: dict[str, Any] = field(default_factory=dict) @dataclass class Chunk: """Represent the Chunk type.""" chunk_id: str doc_id: str doc_name: str content: str embedding_text: str section_title: str = "" section_path: list[str] = field(default_factory=list) page_number: int = 0 regulation_type: str = "" version: str = "" semantic_id: str = "" block_type: str = "" metadata: dict[str, Any] = field(default_factory=dict)