Files

231 lines
6.5 KiB
Python
Raw Permalink Normal View History

"""Define domain models for documents."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import UTC, datetime
from enum import Enum
from typing import Any
# Keep module behavior explicit so the backend flow stays easy to audit.
def utcnow() -> datetime:
return datetime.now(UTC)
class DocumentStatus(str, Enum):
    """Enumerate the status values a document can carry.

    The ``str`` mixin makes every member compare equal to its lowercase
    string value (for example ``DocumentStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    STORED = "stored"
    PARSED = "parsed"
    INDEXED = "indexed"
    FAILED = "failed"
@dataclass
class Document:
    """Hold the record kept for one document.

    The first six fields have no default and must be supplied by the
    caller. Every other field has a default: ``status`` starts as
    ``DocumentStatus.PENDING``, text fields start empty, counters start at
    zero, and ``metadata`` starts as a fresh empty dict per instance.
    """

    # Required fields (no defaults).
    doc_id: str
    doc_name: str
    file_name: str
    object_name: str
    content_type: str
    size_bytes: int

    # Optional fields with empty / zero defaults.
    status: DocumentStatus = DocumentStatus.PENDING
    regulation_type: str = ""
    version: str = ""
    summary: str = ""
    summary_latency_ms: int = 0
    chunk_count: int = 0
    parser_name: str = ""
    index_name: str = ""
    error_message: str = ""

    # Each timestamp comes from its own utcnow() call at construction time.
    created_at: datetime = field(default_factory=utcnow)
    updated_at: datetime = field(default_factory=utcnow)

    # default_factory gives every instance its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
@dataclass
class ParsedDocument:
    """Bundle the parse output recorded for one document.

    The three list-of-dict collections and ``parser_name`` are required.
    ``raw_text`` defaults to an empty string, while ``raw_layouts`` and
    ``metadata`` default to a fresh empty container per instance.
    """

    # Required fields (no defaults).
    doc_id: str
    doc_name: str
    structure_nodes: list[dict[str, Any]]
    semantic_blocks: list[dict[str, Any]]
    vector_chunks: list[dict[str, Any]]
    parser_name: str

    # Optional fields; the factories avoid sharing containers across instances.
    raw_text: str = ""
    raw_layouts: list[dict[str, Any]] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
@dataclass(init=False)
class Chunk:
"""Represent one retrieval chunk with backward-compatible aliases."""
chunk_id: str
doc_id: str
doc_title: str
text: str
embedding_text: str
chunk_type: str = ""
chunk_index: int = 0
piece_index: int = 0
page_start: int = 0
page_end: int = 0
section_title: str = ""
section_path: list[str] = field(default_factory=list)
section_level: int = 0
source_ids: list[str] = field(default_factory=list)
regulation_type: str = ""
version: str = ""
semantic_id: str = ""
metadata: dict[str, Any] = field(default_factory=dict)
2026-05-26 12:34:12 +08:00
def __init__(
self,
*,
chunk_id: str,
doc_id: str,
doc_title: str | None = None,
text: str | None = None,
embedding_text: str = "",
chunk_type: str = "",
chunk_index: int = 0,
piece_index: int = 0,
page_start: int = 0,
page_end: int = 0,
section_title: str = "",
section_path: list[str] | None = None,
section_level: int = 0,
source_ids: list[str] | None = None,
regulation_type: str = "",
version: str = "",
semantic_id: str = "",
metadata: dict[str, Any] | None = None,
doc_name: str | None = None,
content: str | None = None,
page_number: int | None = None,
block_type: str | None = None,
**_: Any,
) -> None:
"""Initialize the chunk while accepting legacy field names."""
self.chunk_id = chunk_id
self.doc_id = doc_id
self.doc_title = doc_title if doc_title is not None else (doc_name or "")
self.text = text if text is not None else (content or "")
self.embedding_text = embedding_text or self.text
self.chunk_type = chunk_type or (block_type or "")
self.chunk_index = int(chunk_index or 0)
self.piece_index = int(piece_index or 0)
self.page_start = int(page_start or page_number or 0)
self.page_end = int(page_end or self.page_start)
self.section_title = section_title
self.section_path = list(section_path or [])
self.section_level = int(section_level or 0)
self.source_ids = list(source_ids or [])
self.regulation_type = regulation_type
self.version = version
self.semantic_id = semantic_id
self.metadata = dict(metadata or {})
@property
def doc_name(self) -> str:
"""Return the legacy document name alias."""
return self.doc_title
@doc_name.setter
def doc_name(self, value: str) -> None:
"""Update the legacy document name alias."""
self.doc_title = value
@property
def content(self) -> str:
"""Return the legacy content alias."""
return self.text
@content.setter
def content(self, value: str) -> None:
"""Update the legacy content alias."""
self.text = value
@property
def page_number(self) -> int:
"""Return the legacy page number alias."""
return self.page_start
@page_number.setter
def page_number(self, value: int) -> None:
"""Update the legacy page number alias."""
self.page_start = value
self.page_end = max(self.page_end, value)
@property
def block_type(self) -> str:
"""Return the legacy block type alias."""
return self.chunk_type
@block_type.setter
def block_type(self, value: str) -> None:
"""Update the legacy block type alias."""
self.chunk_type = value
2026-05-26 12:34:12 +08:00
@dataclass
class DocumentProcessingRun:
    """Record one processing attempt for a document.

    ``run_id``, ``doc_id``, ``trigger_type`` and ``run_status`` are
    required. ``started_at`` defaults to the current UTC time, the other
    timestamps default to ``None``, counters default to zero and text
    fields default to the empty string.
    """

    # Required fields (no defaults).
    run_id: str
    doc_id: str
    trigger_type: str
    run_status: str

    # Text fields naming backends, model and index; empty by default.
    parser_backend: str = ""
    chunk_backend: str = ""
    embedding_model: str = ""
    index_name: str = ""

    # Only started_at is populated automatically; the rest start unset.
    started_at: datetime = field(default_factory=utcnow)
    stored_at: datetime | None = None
    parsed_at: datetime | None = None
    indexed_at: datetime | None = None
    finished_at: datetime | None = None

    # Counters, all starting at zero.
    layout_count: int = 0
    structure_node_count: int = 0
    semantic_block_count: int = 0
    vector_chunk_count: int = 0
    chunk_count: int = 0

    # Failure details; empty by default.
    failure_stage: str = ""
    error_message: str = ""

    # default_factory gives every instance its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
@dataclass
class DocumentStatusEvent:
    """Describe one document lifecycle event emitted during processing.

    The identifiers, both status strings and ``stage`` are required.
    ``message`` defaults to "", ``metadata`` to a fresh empty dict, and
    ``occurred_at`` to the current UTC time at construction.
    """

    # Required fields (no defaults).
    event_id: str
    doc_id: str
    run_id: str
    from_status: str
    to_status: str
    stage: str

    # Optional fields.
    message: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)
    occurred_at: datetime = field(default_factory=utcnow)
@dataclass
class DocumentArtifact:
    """Reference one persisted artifact belonging to a processing run.

    The identifiers, ``artifact_type``, ``object_name`` and
    ``content_type`` are required. ``byte_size`` defaults to 0,
    ``checksum`` to "", ``metadata`` to a fresh empty dict, and
    ``created_at`` to the current UTC time at construction.
    """

    # Required fields (no defaults).
    artifact_id: str
    doc_id: str
    run_id: str
    artifact_type: str
    object_name: str
    content_type: str

    # Optional fields.
    byte_size: int = 0
    checksum: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)
    created_at: datetime = field(default_factory=utcnow)