2026-05-18 22:30:28 +08:00
|
|
|
"""Document orchestration and embedding boundary tests for the migrated backend."""
|
2026-05-18 16:32:42 +08:00
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
|
|
|
|
from app.application.documents.services import DocumentCommandService
|
|
|
|
|
from app.domain.documents import Chunk, Document, DocumentStatus, ParsedDocument
|
|
|
|
|
from app.shared import bootstrap
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FakeRepository:
    """In-memory document repository used as a test double.

    Documents are kept in ``self.documents``, keyed by their ``doc_id``.
    """

    def __init__(self) -> None:
        self.documents: dict[str, Document] = {}

    def create(self, document: Document) -> Document:
        """Store ``document`` under its ``doc_id`` and hand it back."""
        self.documents[document.doc_id] = document
        return document

    def update(self, document: Document) -> Document:
        """Overwrite (or insert) the entry for ``document.doc_id``."""
        self.documents[document.doc_id] = document
        return document

    def get(self, doc_id: str) -> Document | None:
        """Return the stored document, or ``None`` when ``doc_id`` is unknown."""
        return self.documents.get(doc_id)

    def list(self, limit: int | None = None) -> list[Document]:
        """Return stored documents in insertion order, capped at ``limit`` if given."""
        snapshot = [*self.documents.values()]
        if limit is None:
            return snapshot
        return snapshot[:limit]

    def update_status(
        self,
        doc_id: str,
        status: DocumentStatus,
        *,
        error_message: str = "",
        chunk_count: int | None = None,
        summary: str | None = None,
        summary_latency_ms: int | None = None,
        parser_name: str | None = None,
        index_name: str | None = None,
        metadata: dict | None = None,
    ) -> Document | None:
        """Mutate the stored document in place and return it.

        ``status`` and ``error_message`` are always written. The remaining
        keyword fields are written only when they are not ``None``, and a
        non-empty ``metadata`` mapping is merged into the document's existing
        metadata. Returns ``None`` when no document is stored for ``doc_id``.
        """
        record = self.documents.get(doc_id)
        if not record:
            return None

        record.status = status
        record.error_message = error_message

        # Optional fields, applied in declaration order; ``None`` means "leave as is".
        optional_fields = {
            "chunk_count": chunk_count,
            "summary": summary,
            "summary_latency_ms": summary_latency_ms,
            "parser_name": parser_name,
            "index_name": index_name,
        }
        for attribute, value in optional_fields.items():
            if value is not None:
                setattr(record, attribute, value)

        if metadata:
            record.metadata.update(metadata)
        return record
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FakeBinaryStore:
    """Dictionary-backed binary store used as a test double.

    Payloads live in ``self.saved``, keyed by object name.
    """

    def __init__(self) -> None:
        self.saved: dict[str, bytes] = {}

    def save(
        self,
        *,
        object_name: str,
        data: bytes,
        content_type: str,
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Remember ``data`` under ``object_name``.

        ``content_type`` and ``metadata`` are accepted but not stored.
        """
        self.saved[object_name] = data

    def read(self, object_name: str) -> bytes:
        """Return the bytes saved for ``object_name``; raises ``KeyError`` if absent."""
        return self.saved[object_name]

    def delete(self, object_name: str) -> None:
        """Forget ``object_name``; deleting an unknown name is a no-op."""
        if object_name in self.saved:
            del self.saved[object_name]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FakeParser:
    """Parser test double that returns one canned ``ParsedDocument`` per call."""

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Return a fixed single-chunk parse result for ``doc_id`` / ``doc_name``.

        ``file_path`` is accepted but never read.
        """
        layouts = [{"uniqueId": "layout-1", "type": "text"}]
        nodes = [{"title": "第一章"}]
        blocks = [{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}]
        # The only vector chunk; its id is derived from the document id.
        only_chunk = {
            "chunk_id": f"{doc_id}-chunk-1",
            "semantic_id": "semantic-1",
            "chunk_type": "section_text",
            "section_title": "第一章",
            "section_path": ["第一章"],
            "page_start": 1,
            "text": "法规正文",
            "embedding_text": "标准:测试\n章节:第一章\n\n法规正文",
        }
        parse_metadata = {"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1}
        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            raw_layouts=layouts,
            structure_nodes=nodes,
            semantic_blocks=blocks,
            vector_chunks=[only_chunk],
            parser_name="fake_parser",
            metadata=parse_metadata,
        )
|
|
|
|
|
|
|
|
|
|
|
2026-05-18 16:32:42 +08:00
|
|
|
class FakeChunkBuilder:
    """Chunk-builder test double that emits exactly one canned ``Chunk``."""

    def build(self, *, parsed_document: ParsedDocument, regulation_type: str, version: str) -> list[Chunk]:
        """Return a one-element list holding a fixed chunk for ``parsed_document``.

        Only ``doc_id`` and ``doc_name`` are read from ``parsed_document``;
        ``regulation_type`` and ``version`` are copied onto the chunk as given.
        """
        source_id = parsed_document.doc_id
        single_chunk = Chunk(
            chunk_id=f"{source_id}-chunk-1",
            doc_id=source_id,
            doc_name=parsed_document.doc_name,
            content="法规正文",
            embedding_text="标准:测试\n章节:第一章\n\n法规正文",
            section_title="第一章",
            section_path=["第一章"],
            page_number=1,
            regulation_type=regulation_type,
            version=version,
            semantic_id="semantic-1",
            block_type="section_text",
            metadata={"source": "aliyun_vector_chunk"},
        )
        return [single_chunk]
|
|
|
|
|
|
|
|
|
|
|
2026-05-18 16:32:42 +08:00
|
|
|
class FakeEmbeddingProvider:
    """Embedding-provider test double that records calls and returns constant vectors.

    Args:
        dimension: Length of every vector returned. Defaults to 1024, the
            width this file's tests were written against.

    Attributes are plain instance fields:
        ``calls`` holds one entry per ``embed_texts`` call (a copy of the texts),
        ``dimension`` holds the configured vector length.
    """

    def __init__(self, *, dimension: int = 1024) -> None:
        self.dimension = dimension
        self.calls: list[list[str]] = []

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Record ``texts`` and return one ``[0.1, ...]`` vector per input text."""
        # Store a snapshot, not the caller's list, so that later mutation by
        # the caller cannot rewrite the recorded call history.
        self.calls.append(list(texts))
        return [[0.1] * self.dimension for _ in texts]

    def embed_query(self, text: str) -> list[float]:
        """Return a constant ``[0.2, ...]`` vector; ``text`` is not inspected or recorded."""
        return [0.2] * self.dimension
|
2026-05-18 16:32:42 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class FakeVectorIndex:
    """Vector-index test double that records upserts and returns canned answers."""

    def __init__(self) -> None:
        self.upserts: list[tuple[list[Chunk], list[list[float]]]] = []

    def upsert(self, chunks: list[Chunk], vectors: list[list[float]]) -> int:
        """Record the ``(chunks, vectors)`` pair and report how many chunks were given."""
        recorded = (chunks, vectors)
        self.upserts.append(recorded)
        return len(chunks)

    def delete_by_document(self, doc_id: str) -> int:
        """Always report zero deletions; nothing is removed."""
        return 0

    def search(self, query_vector: list[float], top_k: int, filters: str | None = None):
        """Always return an empty result list, whatever the arguments."""
        return []

    def health(self) -> dict:
        """Return a fixed payload naming the collection."""
        payload = {"collection_name": "regulations_dense_1024_v1"}
        return payload
|
2026-05-18 16:32:42 +08:00
|
|
|
|
|
|
|
|
|
2026-05-18 22:30:28 +08:00
|
|
|
def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
    """Run upload-and-process against fakes and check embedding input and stored status."""
    # Collaborators whose recorded state is inspected after the call.
    repo = FakeRepository()
    blobs = FakeBinaryStore()
    embedder = FakeEmbeddingProvider()
    index = FakeVectorIndex()

    service = DocumentCommandService(
        document_repository=repo,
        binary_store=blobs,
        parser=FakeParser(),
        chunk_builder=FakeChunkBuilder(),
        embedding_provider=embedder,
        vector_index=index,
    )

    upload_request = {
        "doc_id": "doc12345",
        "file_name": "test.pdf",
        "content": b"dummy pdf bytes",
        "content_type": "application/pdf",
        "doc_name": "测试法规",
        "regulation_type": "车辆安全",
        "version": "2026",
        "generate_summary": False,
    }
    result = service.upload_and_process(**upload_request)

    # Returned result.
    assert result.status == "indexed"
    assert result.num_chunks == 1

    # The chunk's embedding text, not its raw content, is what gets embedded.
    assert embedder.calls == [["标准:测试\n章节:第一章\n\n法规正文"]]
    assert len(index.upserts) == 1

    # Persisted document state.
    stored = repo.get("doc12345")
    assert stored is not None
    assert stored.status == DocumentStatus.INDEXED
    assert stored.chunk_count == 1
    assert stored.parser_name == "fake_parser"
    assert stored.index_name == "regulations_dense_1024_v1"
    assert stored.metadata["parse_task_id"] == "task-123"
    assert stored.metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")
|
2026-05-18 16:32:42 +08:00
|
|
|
|
2026-05-18 22:30:28 +08:00
|
|
|
def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
    """Check the class names of the parser and chunk builder that bootstrap returns."""
    # Clear both factory caches first so the instances below are freshly built.
    for factory in (bootstrap.get_parser, bootstrap.get_chunk_builder):
        factory.cache_clear()

    built_parser = bootstrap.get_parser()
    built_chunk_builder = bootstrap.get_chunk_builder()

    assert built_parser.__class__.__name__ == "AliyunDocumentParser"
    assert built_chunk_builder.__class__.__name__ == "AliyunVectorChunkBuilder"
|