- Removed multiple failed document entries from `documents.json`. - Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`. - Updated architecture documentation to reflect changes in the Milvus collection name. - Adjusted requirements by removing the sqlalchemy dependency. - Modified test cases to align with new document structure and naming conventions. - Introduced a new test file for Milvus vector index runtime recovery and error handling. - Updated assertions in various test files to ensure compatibility with the new schema.
231 lines
6.5 KiB
Python
231 lines
6.5 KiB
Python
"""Define domain models for documents."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import UTC, datetime
|
|
from enum import Enum
|
|
from typing import Any
|
|
# Keep module behavior explicit so the backend flow stays easy to audit.
|
|
|
|
|
|
def utcnow() -> datetime:
|
|
return datetime.now(UTC)
|
|
|
|
|
|
|
|
class DocumentStatus(str, Enum):
    """Enumerate the status values a document can carry.

    Members also subclass ``str``, so each member compares equal to its
    lowercase string value.
    """

    PENDING = "pending"
    STORED = "stored"
    PARSED = "parsed"
    INDEXED = "indexed"
    FAILED = "failed"
|
|
|
|
|
|
@dataclass
class Document:
    """Hold the attributes of one document record.

    The first six fields are required. Every other field has a default:
    an empty string, zero, the ``PENDING`` status, a fresh ``dict``, or the
    value returned by ``utcnow`` when the instance is created.
    """

    # Required fields; their positional order is part of the constructor.
    doc_id: str
    doc_name: str
    file_name: str
    object_name: str
    content_type: str
    size_bytes: int

    # A newly constructed record starts out as PENDING.
    status: DocumentStatus = DocumentStatus.PENDING

    # Optional fields that stay empty/zero until a caller fills them in.
    regulation_type: str = ""
    version: str = ""
    summary: str = ""
    summary_latency_ms: int = 0
    chunk_count: int = 0
    parser_name: str = ""
    index_name: str = ""
    error_message: str = ""

    # Each timestamp calls utcnow() on its own at construction time.
    created_at: datetime = field(default_factory=utcnow)
    updated_at: datetime = field(default_factory=utcnow)

    # default_factory gives every instance its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class ParsedDocument:
    """Bundle the outputs produced by parsing one document.

    The document identity, the three lists of dict records and the parser
    name are required. ``raw_text``, ``raw_layouts`` and ``metadata`` are
    optional and default to empty values.
    """

    # Required fields; their positional order is part of the constructor.
    doc_id: str
    doc_name: str
    structure_nodes: list[dict[str, Any]]
    semantic_blocks: list[dict[str, Any]]
    vector_chunks: list[dict[str, Any]]
    parser_name: str

    # Optional fields; the containers are created fresh for each instance.
    raw_text: str = ""
    raw_layouts: list[dict[str, Any]] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass(init=False)
class Chunk:
    """Model a single retrieval chunk while keeping legacy names usable.

    The hand-written constructor is keyword-only. Alongside the current
    field names it accepts the older ``doc_name``, ``content``,
    ``page_number`` and ``block_type`` keywords, and it discards any other
    keyword it does not recognise. The same four legacy names are also
    available as read/write properties that forward to the current fields.
    """

    chunk_id: str
    doc_id: str
    doc_title: str
    text: str
    embedding_text: str
    chunk_type: str = ""
    chunk_index: int = 0
    piece_index: int = 0
    page_start: int = 0
    page_end: int = 0
    section_title: str = ""
    section_path: list[str] = field(default_factory=list)
    section_level: int = 0
    source_ids: list[str] = field(default_factory=list)
    regulation_type: str = ""
    version: str = ""
    semantic_id: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)

    def __init__(
        self,
        *,
        chunk_id: str,
        doc_id: str,
        doc_title: str | None = None,
        text: str | None = None,
        embedding_text: str = "",
        chunk_type: str = "",
        chunk_index: int = 0,
        piece_index: int = 0,
        page_start: int = 0,
        page_end: int = 0,
        section_title: str = "",
        section_path: list[str] | None = None,
        section_level: int = 0,
        source_ids: list[str] | None = None,
        regulation_type: str = "",
        version: str = "",
        semantic_id: str = "",
        metadata: dict[str, Any] | None = None,
        doc_name: str | None = None,
        content: str | None = None,
        page_number: int | None = None,
        block_type: str | None = None,
        **_: Any,
    ) -> None:
        """Build a chunk from current and/or legacy keyword arguments.

        Fallback rules:
            * ``doc_title`` / ``text``: the legacy ``doc_name`` / ``content``
              is consulted only when the current argument is ``None``; an
              explicit empty string is kept as given.
            * ``chunk_type`` / ``page_start``: the legacy ``block_type`` /
              ``page_number`` is used whenever the current argument is falsy.
            * ``embedding_text`` falls back to the resolved ``text`` and
              ``page_end`` falls back to the resolved ``page_start``.

        Numeric arguments are passed through ``int`` (falsy values become
        ``0``); ``section_path``, ``source_ids`` and ``metadata`` are copied
        into new containers. Unrecognised keywords are ignored.
        """
        # Resolve the two aliases that distinguish None from "".
        if doc_title is None:
            doc_title = doc_name or ""
        if text is None:
            text = content or ""

        self.chunk_id = chunk_id
        self.doc_id = doc_id
        self.doc_title = doc_title
        self.text = text
        self.embedding_text = embedding_text if embedding_text else text
        self.chunk_type = chunk_type or block_type or ""
        self.chunk_index = int(chunk_index or 0)
        self.piece_index = int(piece_index or 0)

        # page_end is derived from the already-resolved start page.
        first_page = int(page_start or page_number or 0)
        self.page_start = first_page
        self.page_end = int(page_end or first_page)

        self.section_title = section_title
        self.section_path = list(section_path) if section_path else []
        self.section_level = int(section_level or 0)
        self.source_ids = list(source_ids) if source_ids else []
        self.regulation_type = regulation_type
        self.version = version
        self.semantic_id = semantic_id
        self.metadata = dict(metadata) if metadata else {}

    @property
    def doc_name(self) -> str:
        """Read ``doc_title`` through its legacy name."""
        return self.doc_title

    @doc_name.setter
    def doc_name(self, value: str) -> None:
        """Write ``doc_title`` through its legacy name."""
        self.doc_title = value

    @property
    def content(self) -> str:
        """Read ``text`` through its legacy name."""
        return self.text

    @content.setter
    def content(self, value: str) -> None:
        """Write ``text`` through its legacy name."""
        self.text = value

    @property
    def page_number(self) -> int:
        """Read ``page_start`` through its legacy name."""
        return self.page_start

    @page_number.setter
    def page_number(self, value: int) -> None:
        """Set ``page_start`` and raise ``page_end`` if it would fall behind."""
        self.page_start = value
        # page_end is only ever pushed forward here, never pulled back.
        if value > self.page_end:
            self.page_end = value

    @property
    def block_type(self) -> str:
        """Read ``chunk_type`` through its legacy name."""
        return self.chunk_type

    @block_type.setter
    def block_type(self, value: str) -> None:
        """Write ``chunk_type`` through its legacy name."""
        self.chunk_type = value
|
|
|
|
|
|
@dataclass
class DocumentProcessingRun:
    """Record a single processing attempt made for a document.

    ``run_id``, ``doc_id``, ``trigger_type`` and ``run_status`` are required.
    The remaining fields default to empty strings, zero counters, ``None``
    timestamps, a fresh ``dict``, or (for ``started_at``) the value returned
    by ``utcnow`` when the instance is created.
    """

    # Required fields; their positional order is part of the constructor.
    run_id: str
    doc_id: str
    trigger_type: str
    run_status: str

    # Names of the components/targets used by the run; empty until set.
    parser_backend: str = ""
    chunk_backend: str = ""
    embedding_model: str = ""
    index_name: str = ""

    # started_at is stamped at construction; the others stay None until set.
    started_at: datetime = field(default_factory=utcnow)
    stored_at: datetime | None = None
    parsed_at: datetime | None = None
    indexed_at: datetime | None = None
    finished_at: datetime | None = None

    # Counters, all starting at zero.
    layout_count: int = 0
    structure_node_count: int = 0
    semantic_block_count: int = 0
    vector_chunk_count: int = 0
    chunk_count: int = 0

    # Failure details; empty strings by default.
    failure_stage: str = ""
    error_message: str = ""

    # default_factory gives every instance its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class DocumentStatusEvent:
    """Describe one document lifecycle event raised during processing.

    The identifiers, the ``from_status`` / ``to_status`` pair and ``stage``
    are required. ``message`` defaults to an empty string, ``metadata`` to a
    fresh ``dict`` and ``occurred_at`` to the value returned by ``utcnow``
    when the instance is created.
    """

    # Required fields; their positional order is part of the constructor.
    event_id: str
    doc_id: str
    run_id: str
    from_status: str
    to_status: str
    stage: str

    # Optional fields.
    message: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)
    occurred_at: datetime = field(default_factory=utcnow)
|
|
occurred_at: datetime = field(default_factory=utcnow)
|
|
|
|
|
|
@dataclass
class DocumentArtifact:
    """Reference an artifact persisted for a single processing run.

    The identifiers, ``artifact_type``, ``object_name`` and ``content_type``
    are required. ``byte_size`` defaults to ``0``, ``checksum`` to an empty
    string, ``metadata`` to a fresh ``dict`` and ``created_at`` to the value
    returned by ``utcnow`` when the instance is created.
    """

    # Required fields; their positional order is part of the constructor.
    artifact_id: str
    doc_id: str
    run_id: str
    artifact_type: str
    object_name: str
    content_type: str

    # Optional fields.
    byte_size: int = 0
    checksum: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)
    created_at: datetime = field(default_factory=utcnow)
|