Fix centered content layout widths
This commit is contained in:
216
tests/test_document_processing_store.py
Normal file
216
tests/test_document_processing_store.py
Normal file
@@ -0,0 +1,216 @@
|
||||
"""Test PostgreSQL-backed document processing history storage."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import UTC, datetime
|
||||
|
||||
import psycopg2
|
||||
import pytest
|
||||
|
||||
from app.domain.documents import Document, DocumentArtifact, DocumentProcessingRun, DocumentStatus, DocumentStatusEvent
|
||||
from app.infrastructure.storage.postgres_document_processing_store import PostgresDocumentProcessingStore
|
||||
from app.infrastructure.storage.postgres_document_repository import PostgresDocumentRepository
|
||||
# Keep these tests focused on relational storage behavior only.
|
||||
|
||||
|
||||
def _build_document(doc_id: str) -> Document:
    """Build the minimal pending PDF document that history rows can reference.

    Args:
        doc_id: Identifier of the document; also used as the object-name prefix.

    Returns:
        A ``Document`` in ``DocumentStatus.PENDING`` with fixed test metadata.
    """
    document_fields = {
        "doc_id": doc_id,
        "doc_name": "Processing Test",
        "file_name": "processing-test.pdf",
        "object_name": f"{doc_id}/processing-test.pdf",
        "content_type": "application/pdf",
        "size_bytes": 128,
        "status": DocumentStatus.PENDING,
    }
    return Document(**document_fields)
|
||||
|
||||
|
||||
def _connectivity_ready() -> bool:
    """Probe the configured PostgreSQL instance used by these integration tests.

    Returns:
        ``True`` when a repository can be constructed and a one-row listing
        succeeds; ``False`` when either step raises ``psycopg2.Error``.
    """
    try:
        # Construction and the cheap listing share one handler because both
        # failures mean the same thing: the backend is not usable.
        PostgresDocumentRepository().list(limit=1)
    except psycopg2.Error:
        return False
    return True
|
||||
|
||||
|
||||
# Skip every test in this module when the PostgreSQL backend is unreachable.
# The probe is a module-level call, so it runs once when this file is imported.
pytestmark = pytest.mark.skipif(not _connectivity_ready(), reason="PostgreSQL test backend is not reachable")
|
||||
|
||||
|
||||
def test_postgres_document_processing_store_supports_full_run_lifecycle():
    """Walk one run through stored, parsed, and indexed, then read each row type back."""
    documents = PostgresDocumentRepository()
    history = PostgresDocumentProcessingStore()
    doc_id = f"proc-{uuid.uuid4().hex[:10]}"
    run_id = f"run-{uuid.uuid4().hex[:10]}"
    base_time = datetime.now(UTC)

    # The document row must exist first; history rows reference it.
    documents.create(_build_document(doc_id))
    try:
        initial_run = DocumentProcessingRun(
            run_id=run_id,
            doc_id=doc_id,
            trigger_type="upload",
            run_status="running",
            parser_backend="aliyun",
            chunk_backend="aliyun",
            embedding_model="text-embedding-v3",
            started_at=base_time,
            metadata={"origin": "test"},
        )
        created = history.create_run(initial_run)

        stored = history.mark_run_stored(run_id, stored_at=base_time, metadata={"stored": True})
        parsed = history.mark_run_parsed(
            run_id,
            parser_backend="fake_parser",
            layout_count=2,
            structure_node_count=3,
            semantic_block_count=4,
            vector_chunk_count=5,
            parsed_at=base_time,
            metadata={"parse_task_id": "task-1"},
        )
        indexed = history.mark_run_indexed(
            run_id,
            chunk_count=6,
            index_name="regulations_dense_1024_v1",
            indexed_at=base_time,
            finished_at=base_time,
            metadata={"collection": "regulations_dense_1024_v1"},
        )

        indexed_event = DocumentStatusEvent(
            event_id=f"evt-{uuid.uuid4().hex[:10]}",
            doc_id=doc_id,
            run_id=run_id,
            from_status="parsed",
            to_status="indexed",
            stage="index",
            message="Indexed successfully",
            metadata={"chunk_count": 6},
            occurred_at=base_time,
        )
        event = history.append_status_event(indexed_event)

        artifact_specs = (
            ("layouts", f"artifacts/{doc_id}/layouts.json"),
            ("vector_chunks", f"artifacts/{doc_id}/vector_chunks.json"),
        )
        artifacts = history.replace_artifacts_for_run(
            run_id,
            [
                DocumentArtifact(
                    artifact_id=f"art-{uuid.uuid4().hex[:10]}",
                    doc_id=doc_id,
                    run_id=run_id,
                    artifact_type=artifact_type,
                    object_name=object_name,
                    content_type="application/json",
                    created_at=base_time,
                )
                for artifact_type, object_name in artifact_specs
            ],
        )

        fetched = history.get_run(run_id)
        run_rows = history.list_runs_by_document(doc_id)
        event_rows = history.list_status_events_by_document(doc_id)
        artifact_rows = history.list_artifacts_by_run(run_id)

        assert created.run_id == run_id
        assert stored is not None
        assert stored.stored_at is not None
        assert parsed is not None
        assert parsed.parser_backend == "fake_parser"
        assert indexed is not None
        assert indexed.run_status == "succeeded"
        assert fetched is not None
        assert fetched.chunk_count == 6
        assert isinstance(run_rows[0], DocumentProcessingRun)
        assert isinstance(event_rows[0], DocumentStatusEvent)
        assert isinstance(artifact_rows[0], DocumentArtifact)
        assert event_rows[0].event_id == event.event_id

        written_types = {artifact.artifact_type for artifact in artifacts}
        read_types = {artifact.artifact_type for artifact in artifact_rows}
        assert written_types == read_types
    finally:
        # History rows go first, then the document row they belong to.
        history.delete_by_document(doc_id)
        documents.delete(doc_id)
|
||||
|
||||
|
||||
def test_postgres_document_processing_store_replaces_artifacts_and_deletes_document_data():
    """Replace artifact rows idempotently and remove all history rows for one document.

    The run is created, its artifacts are replaced twice, a failure event and a
    failed run state are recorded, and finally every history row for the
    document is deleted and verified to be gone.
    """
    repository = PostgresDocumentRepository()
    store = PostgresDocumentProcessingStore()
    doc_id = f"proc-{uuid.uuid4().hex[:10]}"
    run_id = f"run-{uuid.uuid4().hex[:10]}"

    repository.create(_build_document(doc_id))
    try:
        store.create_run(
            DocumentProcessingRun(
                run_id=run_id,
                doc_id=doc_id,
                trigger_type="retry",
                run_status="running",
            )
        )

        first = store.replace_artifacts_for_run(
            run_id,
            [
                DocumentArtifact(
                    artifact_id=f"art-{uuid.uuid4().hex[:10]}",
                    doc_id=doc_id,
                    run_id=run_id,
                    artifact_type="layouts",
                    object_name=f"artifacts/{doc_id}/layouts-v1.json",
                    content_type="application/json",
                )
            ],
        )
        second = store.replace_artifacts_for_run(
            run_id,
            [
                DocumentArtifact(
                    artifact_id=f"art-{uuid.uuid4().hex[:10]}",
                    doc_id=doc_id,
                    run_id=run_id,
                    artifact_type="layouts",
                    object_name=f"artifacts/{doc_id}/layouts-v2.json",
                    content_type="application/json",
                )
            ],
        )
        store.append_status_event(
            DocumentStatusEvent(
                event_id=f"evt-{uuid.uuid4().hex[:10]}",
                doc_id=doc_id,
                run_id=run_id,
                from_status="pending",
                to_status="failed",
                stage="parse",
                message="failed",
            )
        )
        failed = store.mark_run_failed(run_id, failure_stage="parse", error_message="boom")

        artifact_rows = store.list_artifacts_by_run(run_id)
        assert len(first) == 1
        assert len(second) == 1
        # Only the second batch may survive a replacement.
        assert len(artifact_rows) == 1
        assert artifact_rows[0].object_name.endswith("layouts-v2.json")
        assert failed is not None and failed.run_status == "failed"

        store.delete_by_document(doc_id)

        assert store.list_runs_by_document(doc_id) == []
        assert store.list_status_events_by_document(doc_id) == []
        assert store.list_artifacts_by_document(doc_id) == []
    finally:
        # Mirror the lifecycle test's cleanup: clear history rows before the
        # document row, so a failure ahead of the explicit delete above does
        # not leave rows behind that still reference the document.
        # NOTE(review): assumes delete_by_document tolerates a document whose
        # history is already gone (the success path) — confirm.
        store.delete_by_document(doc_id)
        repository.delete(doc_id)
|
||||
@@ -3,13 +3,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from app.application.documents.services import DocumentCommandService
|
||||
from app.domain.documents import Chunk, Document, DocumentStatus, ParsedDocument
|
||||
from app.domain.documents import Chunk, Document, DocumentArtifact, DocumentProcessingRun, DocumentStatus, DocumentStatusEvent, ParsedDocument
|
||||
from app.infrastructure.storage.json_document_processing_store import JsonDocumentProcessingStore
|
||||
from app.infrastructure.storage.json_document_repository import JsonDocumentRepository
|
||||
from app.shared import bootstrap
|
||||
|
||||
|
||||
class FakeRepository:
|
||||
"""Store document rows in memory for application service tests."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.documents: dict[str, Document] = {}
|
||||
|
||||
@@ -25,9 +30,14 @@ class FakeRepository:
|
||||
return self.documents.get(doc_id)
|
||||
|
||||
def list(self, limit: int | None = None) -> list[Document]:
|
||||
"""Return stored documents in insertion order."""
|
||||
values = list(self.documents.values())
|
||||
return values[:limit] if limit is not None else values
|
||||
|
||||
def delete(self, doc_id: str) -> bool:
|
||||
"""Delete one document from the in-memory repository."""
|
||||
return self.documents.pop(doc_id, None) is not None
|
||||
|
||||
def update_status(
|
||||
self,
|
||||
doc_id: str,
|
||||
@@ -62,6 +72,8 @@ class FakeRepository:
|
||||
|
||||
|
||||
class FakeBinaryStore:
|
||||
"""Store binary payloads in memory for upload and retry tests."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.saved: dict[str, bytes] = {}
|
||||
|
||||
@@ -76,6 +88,8 @@ class FakeBinaryStore:
|
||||
|
||||
|
||||
class FakeParser:
|
||||
"""Return a stable parsed document for deterministic service tests."""
|
||||
|
||||
def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
|
||||
return ParsedDocument(
|
||||
doc_id=doc_id,
|
||||
@@ -101,6 +115,8 @@ class FakeParser:
|
||||
|
||||
|
||||
class FakeChunkBuilder:
|
||||
"""Build one deterministic chunk from the fake parsed document."""
|
||||
|
||||
def build(self, *, parsed_document: ParsedDocument, regulation_type: str, version: str) -> list[Chunk]:
|
||||
return [
|
||||
Chunk(
|
||||
@@ -122,6 +138,8 @@ class FakeChunkBuilder:
|
||||
|
||||
|
||||
class FakeEmbeddingProvider:
|
||||
"""Capture embedding calls and return fixed-length vectors."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.calls: list[list[str]] = []
|
||||
|
||||
@@ -134,6 +152,8 @@ class FakeEmbeddingProvider:
|
||||
|
||||
|
||||
class FakeVectorIndex:
|
||||
"""Capture vector upserts for service assertions."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.upserts: list[tuple[list[Chunk], list[list[float]]]] = []
|
||||
|
||||
@@ -151,11 +171,159 @@ class FakeVectorIndex:
|
||||
return {"collection_name": "regulations_dense_1024_v1"}
|
||||
|
||||
|
||||
@dataclass
class FakeProcessingStore:
    """Record processing-history method calls in memory for orchestration assertions.

    Every collection is created per instance through ``default_factory``, so
    fakes never share state and the field annotations match the values they
    actually hold. Collections passed to the constructor are kept as given.
    """

    # Runs handed to ``create_run``, in call order.
    runs: list[DocumentProcessingRun] = field(default_factory=list)
    # Events handed to ``append_status_event``, in call order.
    status_events: list[DocumentStatusEvent] = field(default_factory=list)
    # One entry per ``replace_artifacts_for_run`` call.
    artifact_batches: list[list[DocumentArtifact]] = field(default_factory=list)
    # Document ids passed to ``delete_by_document``.
    deleted_doc_ids: list[str] = field(default_factory=list)
    # Run ids passed to ``mark_run_stored``.
    stored_run_ids: list[str] = field(default_factory=list)
    # Keyword payloads captured by the ``mark_run_*`` methods.
    parsed_calls: list[dict] = field(default_factory=list)
    indexed_calls: list[dict] = field(default_factory=list)
    failed_calls: list[dict] = field(default_factory=list)

    def _find_run(self, run_id: str) -> DocumentProcessingRun | None:
        """Return the first recorded run whose ``run_id`` matches, or ``None``."""
        return next((run for run in self.runs if run.run_id == run_id), None)

    def create_run(self, run: DocumentProcessingRun) -> DocumentProcessingRun:
        """Store the created run and return it unchanged."""
        self.runs.append(run)
        return run

    def mark_run_stored(self, run_id: str, *, stored_at=None, metadata: dict | None = None) -> DocumentProcessingRun | None:
        """Record that one run reached the stored stage.

        ``stored_at`` and ``metadata`` are accepted for signature parity but
        are not recorded; the run itself is returned unmodified.
        """
        self.stored_run_ids.append(run_id)
        return self._find_run(run_id)

    def mark_run_parsed(
        self,
        run_id: str,
        *,
        parser_backend: str,
        layout_count: int,
        structure_node_count: int,
        semantic_block_count: int,
        vector_chunk_count: int,
        parsed_at=None,
        metadata: dict | None = None,
    ) -> DocumentProcessingRun | None:
        """Record parse metrics for one run and return the matching run, if any."""
        self.parsed_calls.append(
            {
                "run_id": run_id,
                "parser_backend": parser_backend,
                "layout_count": layout_count,
                "structure_node_count": structure_node_count,
                "semantic_block_count": semantic_block_count,
                "vector_chunk_count": vector_chunk_count,
                "metadata": metadata or {},
            }
        )
        return self._find_run(run_id)

    def mark_run_indexed(
        self,
        run_id: str,
        *,
        chunk_count: int,
        index_name: str,
        indexed_at=None,
        finished_at=None,
        metadata: dict | None = None,
    ) -> DocumentProcessingRun | None:
        """Record index completion for one run and return the matching run, if any."""
        self.indexed_calls.append(
            {
                "run_id": run_id,
                "chunk_count": chunk_count,
                "index_name": index_name,
                "metadata": metadata or {},
            }
        )
        return self._find_run(run_id)

    def mark_run_failed(
        self,
        run_id: str,
        *,
        failure_stage: str,
        error_message: str,
        finished_at=None,
        metadata: dict | None = None,
    ) -> DocumentProcessingRun | None:
        """Record terminal failure details for one run and return the matching run, if any."""
        self.failed_calls.append(
            {
                "run_id": run_id,
                "failure_stage": failure_stage,
                "error_message": error_message,
                "metadata": metadata or {},
            }
        )
        return self._find_run(run_id)

    def append_status_event(self, event: DocumentStatusEvent) -> DocumentStatusEvent:
        """Store one status event and return it unchanged."""
        self.status_events.append(event)
        return event

    def replace_artifacts_for_run(self, run_id: str, artifacts: list[DocumentArtifact]) -> list[DocumentArtifact]:
        """Store one artifact replacement batch.

        Earlier batches are kept so tests can inspect every call; nothing is
        actually replaced in this fake.
        """
        self.artifact_batches.append(artifacts)
        return artifacts

    def delete_by_document(self, doc_id: str) -> None:
        """Record an explicit document-history delete request without removing anything."""
        self.deleted_doc_ids.append(doc_id)

    def list_runs_by_document(self, doc_id: str) -> list[DocumentProcessingRun]:
        """Return the recorded runs that belong to *doc_id*."""
        return [run for run in self.runs if run.doc_id == doc_id]

    def get_run(self, run_id: str) -> DocumentProcessingRun | None:
        """Return the recorded run with *run_id*, or ``None``."""
        return self._find_run(run_id)

    def list_status_events_by_document(self, doc_id: str) -> list[DocumentStatusEvent]:
        """Return the recorded status events that belong to *doc_id*."""
        return [event for event in self.status_events if event.doc_id == doc_id]

    def list_status_events_by_run(self, run_id: str) -> list[DocumentStatusEvent]:
        """Return the recorded status events that belong to *run_id*."""
        return [event for event in self.status_events if event.run_id == run_id]

    def list_artifacts_by_document(self, doc_id: str) -> list[DocumentArtifact]:
        """Return artifacts from every recorded batch that belong to *doc_id*."""
        return [artifact for batch in self.artifact_batches for artifact in batch if artifact.doc_id == doc_id]

    def list_artifacts_by_run(self, run_id: str) -> list[DocumentArtifact]:
        """Return artifacts from every recorded batch that belong to *run_id*."""
        return [artifact for batch in self.artifact_batches for artifact in batch if artifact.run_id == run_id]
|
||||
|
||||
|
||||
class FailingParser:
    """Parser stand-in whose ``parse`` always fails with the same ``RuntimeError``."""

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Ignore every argument and raise the fixed parser failure."""
        failure = RuntimeError("parser exploded")
        raise failure
|
||||
|
||||
|
||||
def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
|
||||
repository = FakeRepository()
|
||||
binary_store = FakeBinaryStore()
|
||||
embedding_provider = FakeEmbeddingProvider()
|
||||
vector_index = FakeVectorIndex()
|
||||
processing_store = FakeProcessingStore()
|
||||
service = DocumentCommandService(
|
||||
document_repository=repository,
|
||||
binary_store=binary_store,
|
||||
@@ -163,6 +331,7 @@ def test_document_command_service_uses_1024_dense_embedding_and_updates_status()
|
||||
chunk_builder=FakeChunkBuilder(),
|
||||
embedding_provider=embedding_provider,
|
||||
vector_index=vector_index,
|
||||
document_processing_store=processing_store,
|
||||
)
|
||||
|
||||
result = service.upload_and_process(
|
||||
@@ -188,6 +357,264 @@ def test_document_command_service_uses_1024_dense_embedding_and_updates_status()
|
||||
assert stored.index_name == "regulations_dense_1024_v1"
|
||||
assert stored.metadata["parse_task_id"] == "task-123"
|
||||
assert stored.metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")
|
||||
assert len(processing_store.runs) == 1
|
||||
assert processing_store.runs[0].trigger_type == "upload"
|
||||
assert processing_store.stored_run_ids == [processing_store.runs[0].run_id]
|
||||
assert processing_store.parsed_calls[0]["vector_chunk_count"] == 1
|
||||
assert processing_store.indexed_calls[0]["index_name"] == "regulations_dense_1024_v1"
|
||||
assert [event.to_status for event in processing_store.status_events] == ["pending", "stored", "parsed", "indexed"]
|
||||
assert {artifact.artifact_type for artifact in processing_store.artifact_batches[0]} == {
|
||||
"layouts",
|
||||
"structure_nodes",
|
||||
"semantic_blocks",
|
||||
"vector_chunks",
|
||||
}
|
||||
|
||||
|
||||
def test_document_command_service_retry_marks_processing_run_as_retry():
    """Retrying an existing document must record its processing run with trigger type ``retry``."""
    repository = FakeRepository()
    binary_store = FakeBinaryStore()
    embedding_provider = FakeEmbeddingProvider()
    vector_index = FakeVectorIndex()
    processing_store = FakeProcessingStore()

    # Seed both the document row and its stored payload so retry has input.
    object_name = "doc-retry/retry.pdf"
    existing = Document(
        doc_id="doc-retry",
        doc_name="Retry Doc",
        file_name="retry.pdf",
        object_name=object_name,
        content_type="application/pdf",
        size_bytes=4,
        regulation_type="车辆安全",
        version="2026",
        metadata={"generate_summary": False},
    )
    repository.create(existing)
    binary_store.save(
        object_name=object_name,
        data=b"data",
        content_type="application/pdf",
        metadata={"doc_id": "doc-retry"},
    )

    service = DocumentCommandService(
        document_repository=repository,
        binary_store=binary_store,
        parser=FakeParser(),
        chunk_builder=FakeChunkBuilder(),
        embedding_provider=embedding_provider,
        vector_index=vector_index,
        document_processing_store=processing_store,
    )

    outcome = service.retry("doc-retry")

    assert outcome.status == "indexed"
    first_run = processing_store.runs[0]
    assert first_run.trigger_type == "retry"
|
||||
|
||||
|
||||
def test_document_command_service_records_failed_processing_stage():
    """A raising parser must mark the run, the last status event, and the document as a parse failure."""
    repository = FakeRepository()
    binary_store = FakeBinaryStore()
    embedding_provider = FakeEmbeddingProvider()
    vector_index = FakeVectorIndex()
    processing_store = FakeProcessingStore()
    service = DocumentCommandService(
        document_repository=repository,
        binary_store=binary_store,
        parser=FailingParser(),
        chunk_builder=FakeChunkBuilder(),
        embedding_provider=embedding_provider,
        vector_index=vector_index,
        document_processing_store=processing_store,
    )

    upload_request = {
        "doc_id": "doc-fail",
        "file_name": "test.pdf",
        "content": b"dummy pdf bytes",
        "content_type": "application/pdf",
        "doc_name": "测试法规",
        "regulation_type": "车辆安全",
        "version": "2026",
        "generate_summary": False,
    }
    outcome = service.upload_and_process(**upload_request)

    assert outcome.status == "failed"
    first_failure = processing_store.failed_calls[0]
    assert first_failure["failure_stage"] == "parse"
    last_event = processing_store.status_events[-1]
    assert last_event.to_status == "failed"
    assert repository.get("doc-fail").metadata["failure_stage"] == "parse"
|
||||
|
||||
|
||||
def test_document_command_service_delete_cleans_processing_history_when_present():
    """Deleting a document must also request deletion of its processing history."""
    repository = FakeRepository()
    binary_store = FakeBinaryStore()
    vector_index = FakeVectorIndex()
    processing_store = FakeProcessingStore()

    doomed = Document(
        doc_id="doc-delete",
        doc_name="Delete Doc",
        file_name="delete.pdf",
        object_name="doc-delete/delete.pdf",
        content_type="application/pdf",
        size_bytes=4,
    )
    repository.create(doomed)

    service = DocumentCommandService(
        document_repository=repository,
        binary_store=binary_store,
        parser=FakeParser(),
        chunk_builder=FakeChunkBuilder(),
        embedding_provider=FakeEmbeddingProvider(),
        vector_index=vector_index,
        document_processing_store=processing_store,
    )

    was_deleted = service.delete("doc-delete")

    assert was_deleted is True
    assert processing_store.deleted_doc_ids == ["doc-delete"]
|
||||
|
||||
|
||||
def test_document_command_service_persists_processing_history_with_json_store(tmp_path: Path):
    """A successful upload must leave one run, four status events, and four artifact types in the JSON store."""
    repository = JsonDocumentRepository(str(tmp_path / "documents.json"))
    processing_store = JsonDocumentProcessingStore(str(tmp_path / "document_processing.json"))
    binary_store = FakeBinaryStore()
    embedding_provider = FakeEmbeddingProvider()
    vector_index = FakeVectorIndex()
    service = DocumentCommandService(
        document_repository=repository,
        binary_store=binary_store,
        parser=FakeParser(),
        chunk_builder=FakeChunkBuilder(),
        embedding_provider=embedding_provider,
        vector_index=vector_index,
        document_processing_store=processing_store,
    )

    upload_request = {
        "doc_id": "doc-json-flow",
        "file_name": "test.pdf",
        "content": b"dummy pdf bytes",
        "content_type": "application/pdf",
        "doc_name": "测试法规",
        "regulation_type": "车辆安全",
        "version": "2026",
        "generate_summary": False,
    }
    outcome = service.upload_and_process(**upload_request)

    stored = repository.get("doc-json-flow")
    runs = processing_store.list_runs_by_document("doc-json-flow")
    events = processing_store.list_status_events_by_document("doc-json-flow")
    artifacts = processing_store.list_artifacts_by_document("doc-json-flow")

    assert outcome.status == "indexed"
    assert stored is not None
    assert stored.status == DocumentStatus.INDEXED
    assert len(runs) == 1
    only_run = runs[0]
    assert only_run.trigger_type == "upload"
    assert only_run.run_status == "succeeded"

    observed_statuses = [event.to_status for event in events]
    assert observed_statuses == ["pending", "stored", "parsed", "indexed"]

    observed_types = {artifact.artifact_type for artifact in artifacts}
    assert observed_types == {
        "layouts",
        "structure_nodes",
        "semantic_blocks",
        "vector_chunks",
    }
|
||||
|
||||
|
||||
def test_document_command_service_retry_creates_second_json_processing_run(tmp_path: Path):
    """Two retries of one document must produce two ``retry`` runs in the JSON store."""
    repository = JsonDocumentRepository(str(tmp_path / "documents.json"))
    processing_store = JsonDocumentProcessingStore(str(tmp_path / "document_processing.json"))
    binary_store = FakeBinaryStore()

    # Seed both the document row and its stored payload so retry has input.
    object_name = "doc-json-retry/retry.pdf"
    existing = Document(
        doc_id="doc-json-retry",
        doc_name="Retry Doc",
        file_name="retry.pdf",
        object_name=object_name,
        content_type="application/pdf",
        size_bytes=4,
        regulation_type="车辆安全",
        version="2026",
        metadata={"generate_summary": False},
    )
    repository.create(existing)
    binary_store.save(
        object_name=object_name,
        data=b"data",
        content_type="application/pdf",
        metadata={"doc_id": "doc-json-retry"},
    )

    service = DocumentCommandService(
        document_repository=repository,
        binary_store=binary_store,
        parser=FakeParser(),
        chunk_builder=FakeChunkBuilder(),
        embedding_provider=FakeEmbeddingProvider(),
        vector_index=FakeVectorIndex(),
        document_processing_store=processing_store,
    )

    first = service.retry("doc-json-retry")
    second = service.retry("doc-json-retry")
    runs = processing_store.list_runs_by_document("doc-json-retry")

    assert first.status == "indexed"
    assert second.status == "indexed"
    assert len(runs) == 2
    trigger_types = {run.trigger_type for run in runs}
    assert trigger_types == {"retry"}
|
||||
|
||||
|
||||
def test_document_command_service_delete_removes_json_processing_history(tmp_path: Path):
    """Deleting an uploaded document must leave no runs, events, or artifacts in the JSON store."""
    repository = JsonDocumentRepository(str(tmp_path / "documents.json"))
    processing_store = JsonDocumentProcessingStore(str(tmp_path / "document_processing.json"))
    binary_store = FakeBinaryStore()
    service = DocumentCommandService(
        document_repository=repository,
        binary_store=binary_store,
        parser=FakeParser(),
        chunk_builder=FakeChunkBuilder(),
        embedding_provider=FakeEmbeddingProvider(),
        vector_index=FakeVectorIndex(),
        document_processing_store=processing_store,
    )

    # Run a full upload first so there is history to remove.
    upload_request = {
        "doc_id": "doc-json-delete",
        "file_name": "delete.pdf",
        "content": b"delete me",
        "content_type": "application/pdf",
        "doc_name": "Delete Doc",
        "regulation_type": "车辆安全",
        "version": "2026",
        "generate_summary": False,
    }
    service.upload_and_process(**upload_request)

    was_deleted = service.delete("doc-json-delete")

    assert was_deleted is True
    assert processing_store.list_runs_by_document("doc-json-delete") == []
    assert processing_store.list_status_events_by_document("doc-json-delete") == []
    assert processing_store.list_artifacts_by_document("doc-json-delete") == []
|
||||
|
||||
|
||||
def test_bootstrap_returns_json_processing_store_for_json_backend(tmp_path: Path):
    """With the ``json`` repository backend configured, bootstrap must build the JSON processing store."""
    settings = bootstrap.settings
    saved_backend = settings.document_repository_backend
    saved_path = settings.document_processing_metadata_path
    # Drop any cached store so the factory re-reads the patched settings.
    bootstrap.get_document_processing_store.cache_clear()
    try:
        settings.document_repository_backend = "json"
        settings.document_processing_metadata_path = str(tmp_path / "document_processing.json")

        store = bootstrap.get_document_processing_store()

        store_class_name = store.__class__.__name__
        assert store_class_name == "JsonDocumentProcessingStore"
    finally:
        # Restore the settings and clear the cache so later tests see a clean factory.
        settings.document_repository_backend = saved_backend
        settings.document_processing_metadata_path = saved_path
        bootstrap.get_document_processing_store.cache_clear()
|
||||
|
||||
def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
|
||||
bootstrap.get_parser.cache_clear()
|
||||
|
||||
184
tests/test_json_document_processing_store.py
Normal file
184
tests/test_json_document_processing_store.py
Normal file
@@ -0,0 +1,184 @@
|
||||
"""Test JSON-backed document processing history storage."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
|
||||
from app.domain.documents import DocumentArtifact, DocumentProcessingRun, DocumentStatusEvent
|
||||
from app.infrastructure.storage.json_document_processing_store import JsonDocumentProcessingStore
|
||||
# Keep JSON processing-store tests focused on local file persistence behavior.
|
||||
|
||||
|
||||
def test_json_document_processing_store_initializes_missing_file(tmp_path: Path):
    """Constructing the store on a missing path must write the empty canonical payload."""
    file_path = tmp_path / "document_processing.json"

    store = JsonDocumentProcessingStore(str(file_path))

    with file_path.open(encoding="utf-8") as handle:
        payload = json.load(handle)
    expected_payload = {"runs": {}, "status_events": {}, "artifacts": {}}
    assert payload == expected_payload
    assert store.list_runs_by_document("missing") == []
|
||||
|
||||
|
||||
def test_json_document_processing_store_supports_full_run_lifecycle(tmp_path: Path):
    """Walk one run through stored, parsed, and indexed, then read each row type back."""
    file_path = tmp_path / "document_processing.json"
    store = JsonDocumentProcessingStore(str(file_path))
    doc_id = "doc-json"
    run_id = "run-json"
    event_id = "evt-json"
    base_time = datetime.now(UTC)

    initial_run = DocumentProcessingRun(
        run_id=run_id,
        doc_id=doc_id,
        trigger_type="upload",
        run_status="running",
        parser_backend="aliyun",
        chunk_backend="aliyun",
        embedding_model="text-embedding-v3",
        started_at=base_time,
        metadata={"origin": "json-test"},
    )
    created = store.create_run(initial_run)
    stored = store.mark_run_stored(run_id, stored_at=base_time, metadata={"stored": True})
    parsed = store.mark_run_parsed(
        run_id,
        parser_backend="fake_parser",
        layout_count=1,
        structure_node_count=2,
        semantic_block_count=3,
        vector_chunk_count=4,
        parsed_at=base_time,
        metadata={"parse_task_id": "task-json"},
    )
    indexed = store.mark_run_indexed(
        run_id,
        chunk_count=5,
        index_name="regulations_dense_1024_v1",
        indexed_at=base_time,
        finished_at=base_time,
        metadata={"collection": "regulations_dense_1024_v1"},
    )

    indexed_event = DocumentStatusEvent(
        event_id=event_id,
        doc_id=doc_id,
        run_id=run_id,
        from_status="parsed",
        to_status="indexed",
        stage="index",
        message="Indexed",
        metadata={"chunk_count": 5},
        occurred_at=base_time,
    )
    event = store.append_status_event(indexed_event)

    artifact_specs = (
        ("art-layouts", "layouts", "artifacts/doc-json/layouts.json"),
        ("art-vectors", "vector_chunks", "artifacts/doc-json/vector_chunks.json"),
    )
    artifacts = store.replace_artifacts_for_run(
        run_id,
        [
            DocumentArtifact(
                artifact_id=artifact_id,
                doc_id=doc_id,
                run_id=run_id,
                artifact_type=artifact_type,
                object_name=object_name,
                content_type="application/json",
                created_at=base_time,
            )
            for artifact_id, artifact_type, object_name in artifact_specs
        ],
    )

    fetched = store.get_run(run_id)
    run_rows = store.list_runs_by_document(doc_id)
    event_rows = store.list_status_events_by_run(run_id)
    artifact_rows = store.list_artifacts_by_document(doc_id)

    assert created.run_id == run_id
    assert stored is not None
    assert stored.metadata["stored"] is True
    assert parsed is not None
    assert parsed.structure_node_count == 2
    assert indexed is not None
    assert indexed.run_status == "succeeded"
    assert fetched is not None
    assert fetched.chunk_count == 5
    # The timestamp must survive the JSON round trip unchanged.
    assert run_rows[0].started_at == base_time
    assert event_rows[0].event_id == event.event_id
    assert artifact_rows[0].doc_id == doc_id

    written_types = {artifact.artifact_type for artifact in artifacts}
    read_types = {artifact.artifact_type for artifact in artifact_rows}
    assert written_types == read_types
|
||||
|
||||
|
||||
def test_json_document_processing_store_replaces_artifacts_and_deletes_by_document(tmp_path: Path):
    """A second artifact batch must supersede the first, and a document delete must clear all history."""
    file_path = tmp_path / "document_processing.json"
    store = JsonDocumentProcessingStore(str(file_path))
    doc_id = "doc-delete"
    run_id = "run-delete"

    retry_run = DocumentProcessingRun(
        run_id=run_id,
        doc_id=doc_id,
        trigger_type="retry",
        run_status="running",
    )
    store.create_run(retry_run)

    stored_event = DocumentStatusEvent(
        event_id="evt-delete",
        doc_id=doc_id,
        run_id=run_id,
        from_status="pending",
        to_status="stored",
        stage="store",
        occurred_at=datetime.now(UTC),
    )
    store.append_status_event(stored_event)

    layouts_v1 = DocumentArtifact(
        artifact_id="art-first",
        doc_id=doc_id,
        run_id=run_id,
        artifact_type="layouts",
        object_name="artifacts/doc-delete/layouts-v1.json",
        content_type="application/json",
    )
    first = store.replace_artifacts_for_run(run_id, [layouts_v1])

    layouts_v2 = DocumentArtifact(
        artifact_id="art-second",
        doc_id=doc_id,
        run_id=run_id,
        artifact_type="layouts",
        object_name="artifacts/doc-delete/layouts-v2.json",
        content_type="application/json",
    )
    second = store.replace_artifacts_for_run(run_id, [layouts_v2])

    failed = store.mark_run_failed(run_id, failure_stage="parse", error_message="boom")

    artifact_rows = store.list_artifacts_by_run(run_id)
    assert len(first) == 1
    assert len(second) == 1
    # Only the second batch may survive a replacement.
    assert len(artifact_rows) == 1
    assert artifact_rows[0].object_name.endswith("layouts-v2.json")
    assert failed is not None
    assert failed.run_status == "failed"

    store.delete_by_document(doc_id)

    assert store.list_runs_by_document(doc_id) == []
    assert store.list_status_events_by_document(doc_id) == []
    assert store.list_artifacts_by_document(doc_id) == []
|
||||
Reference in New Issue
Block a user