137 lines
3.4 KiB
Python
137 lines
3.4 KiB
Python
"""Define domain models for documents."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import UTC, datetime
|
|
from enum import Enum
|
|
from typing import Any
|
|
# Keep module behavior explicit so the backend flow stays easy to audit.
|
|
|
|
|
|
def utcnow() -> datetime:
|
|
return datetime.now(UTC)
|
|
|
|
|
|
|
|
class DocumentStatus(str, Enum):
    """Enumerate the status values a document can carry.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value (``DocumentStatus.PENDING == "pending"``) and can
    be looked up from that value (``DocumentStatus("pending")``).
    """

    # NOTE(review): the declaration order reads like a
    # pending -> stored -> parsed -> indexed progression with FAILED as the
    # error outcome, but nothing in this module enforces transitions.
    PENDING = "pending"  # default ``status`` of a newly built ``Document``
    STORED = "stored"
    PARSED = "parsed"
    INDEXED = "indexed"
    FAILED = "failed"
|
|
|
|
|
|
@dataclass
class Document:
    """Describe one document together with its current status.

    The first six fields are required; every remaining field has a default,
    so an instance can be built from the required values alone. The
    dataclass is not frozen, so fields can be reassigned after construction.
    """

    # Required fields: no defaults, so they must be supplied at construction.
    doc_id: str
    doc_name: str
    file_name: str
    object_name: str
    content_type: str
    size_bytes: int

    # A new instance is PENDING unless another status is passed in.
    status: DocumentStatus = DocumentStatus.PENDING

    # Optional fields that default to an empty string or zero.
    regulation_type: str = ""
    version: str = ""
    summary: str = ""
    summary_latency_ms: int = 0
    chunk_count: int = 0
    parser_name: str = ""
    index_name: str = ""
    error_message: str = ""

    # Each timestamp comes from its own utcnow() call at construction time,
    # so both are timezone-aware and the two values may differ slightly.
    created_at: datetime = field(default_factory=utcnow)
    updated_at: datetime = field(default_factory=utcnow)

    # default_factory gives every instance its own dict, never a shared one.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class ParsedDocument:
    """Carry the parse result recorded for one document.

    The result consists of three required lists of dict records, the name of
    the parser, and optional raw text, raw layouts and metadata.
    """

    # Required fields: identify the document the result belongs to.
    doc_id: str
    doc_name: str

    # Required lists of string-keyed dicts; the schema of the records is not
    # defined in this module.
    structure_nodes: list[dict[str, Any]]
    semantic_blocks: list[dict[str, Any]]
    vector_chunks: list[dict[str, Any]]

    parser_name: str

    # Optional fields: default to an empty string, a fresh list and a fresh
    # dict (default_factory avoids sharing one container across instances).
    raw_text: str = ""
    raw_layouts: list[dict[str, Any]] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class Chunk:
    """Describe one chunk belonging to a document.

    Five fields are required (the chunk and document identifiers, the
    document name and two text fields); the rest default to empty values.
    """

    # Required fields: no defaults.
    chunk_id: str
    doc_id: str
    doc_name: str
    content: str
    # NOTE(review): presumably the text handed to the embedding step, which
    # may differ from ``content`` - confirm against the producer.
    embedding_text: str

    # Optional section information; section_path gets a fresh list per instance.
    section_title: str = ""
    section_path: list[str] = field(default_factory=list)
    page_number: int = 0

    # Optional string fields that default to an empty string.
    regulation_type: str = ""
    version: str = ""
    semantic_id: str = ""
    block_type: str = ""

    # Fresh dict per instance, never shared between chunks.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class DocumentProcessingRun:
    """Record a single processing attempt (run) made for a document.

    Only the run and document identifiers, the trigger type and the run
    status are required; all other fields have defaults.
    """

    # Required fields: no defaults. ``run_status`` is annotated as a plain
    # string rather than as a ``DocumentStatus`` member.
    run_id: str
    doc_id: str
    trigger_type: str
    run_status: str

    # Optional configuration names; default to an empty string.
    parser_backend: str = ""
    chunk_backend: str = ""
    embedding_model: str = ""
    index_name: str = ""

    # ``started_at`` is filled from utcnow() when the instance is built; the
    # remaining timestamps default to None.
    started_at: datetime = field(default_factory=utcnow)
    stored_at: datetime | None = None
    parsed_at: datetime | None = None
    indexed_at: datetime | None = None
    finished_at: datetime | None = None

    # Counters; all default to zero.
    layout_count: int = 0
    structure_node_count: int = 0
    semantic_block_count: int = 0
    vector_chunk_count: int = 0
    chunk_count: int = 0

    # Failure details; default to an empty string.
    failure_stage: str = ""
    error_message: str = ""

    # Fresh dict per instance, never shared between runs.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class DocumentStatusEvent:
    """Describe one document status change recorded during a processing run.

    The event links a document and a run to a ``from_status`` /
    ``to_status`` pair and the stage at which the change was recorded.
    """

    # Required fields: no defaults.
    event_id: str
    doc_id: str
    run_id: str
    # Both statuses are annotated as plain strings, not ``DocumentStatus``.
    from_status: str
    to_status: str
    stage: str

    # Optional message; defaults to an empty string.
    message: str = ""

    # Fresh dict per instance, never shared between events.
    metadata: dict[str, Any] = field(default_factory=dict)

    # Filled from utcnow() when the instance is built unless passed in.
    occurred_at: datetime = field(default_factory=utcnow)
|
|
|
|
|
|
@dataclass
class DocumentArtifact:
    """Reference an artifact persisted for one processing run of a document.

    The instance holds identifiers and descriptive fields for the artifact
    (object name, content type, size, checksum), not the artifact content.
    """

    # Required fields: no defaults.
    artifact_id: str
    doc_id: str
    run_id: str
    artifact_type: str
    object_name: str
    content_type: str

    # Optional size and checksum; default to zero and an empty string.
    byte_size: int = 0
    checksum: str = ""

    # Fresh dict per instance, never shared between artifacts.
    metadata: dict[str, Any] = field(default_factory=dict)

    # Filled from utcnow() when the instance is built unless passed in.
    created_at: datetime = field(default_factory=utcnow)
|