"""新架构下的文档编排与 embedding 边界测试。""" from __future__ import annotations from dataclasses import dataclass from app.application.documents.services import DocumentCommandService from app.domain.documents import Chunk, Document, DocumentStatus, ParsedDocument from app.shared import bootstrap class FakeRepository: def __init__(self) -> None: self.documents: dict[str, Document] = {} def create(self, document: Document) -> Document: self.documents[document.doc_id] = document return document def update(self, document: Document) -> Document: self.documents[document.doc_id] = document return document def get(self, doc_id: str) -> Document | None: return self.documents.get(doc_id) def list(self, limit: int | None = None) -> list[Document]: values = list(self.documents.values()) return values[:limit] if limit is not None else values def update_status( self, doc_id: str, status: DocumentStatus, *, error_message: str = "", chunk_count: int | None = None, summary: str | None = None, summary_latency_ms: int | None = None, parser_name: str | None = None, index_name: str | None = None, metadata: dict | None = None, ) -> Document | None: document = self.documents.get(doc_id) if not document: return None document.status = status document.error_message = error_message if chunk_count is not None: document.chunk_count = chunk_count if summary is not None: document.summary = summary if summary_latency_ms is not None: document.summary_latency_ms = summary_latency_ms if parser_name is not None: document.parser_name = parser_name if index_name is not None: document.index_name = index_name if metadata: document.metadata.update(metadata) return document class FakeBinaryStore: def __init__(self) -> None: self.saved: dict[str, bytes] = {} def save(self, *, object_name: str, data: bytes, content_type: str, metadata: dict[str, str] | None = None) -> None: self.saved[object_name] = data def read(self, object_name: str) -> bytes: return self.saved[object_name] def delete(self, object_name: str) -> None: self.saved.pop(object_name, None) class FakeParser: def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument: return ParsedDocument( doc_id=doc_id, doc_name=doc_name, structure_nodes=[{"title": "第一章"}], semantic_blocks=[{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}], vector_chunks=[ { "chunk_id": f"{doc_id}-chunk-1", "semantic_id": "semantic-1", "chunk_type": "section_text", "section_title": "第一章", "section_path": ["第一章"], "page_start": 1, "text": "法规正文", "embedding_text": "标准:测试\n章节:第一章\n\n法规正文", } ], parser_name="fake_parser", ) class FakeChunkBuilder: def build(self, *, parsed_document: ParsedDocument, regulation_type: str, version: str) -> list[Chunk]: return [ Chunk( chunk_id=f"{parsed_document.doc_id}-chunk-1", doc_id=parsed_document.doc_id, doc_name=parsed_document.doc_name, content="法规正文", embedding_text="标准:测试\n章节:第一章\n\n法规正文", section_title="第一章", section_path=["第一章"], page_number=1, regulation_type=regulation_type, version=version, semantic_id="semantic-1", block_type="section_text", metadata={"source": "aliyun_vector_chunk"}, ) ] class FakeEmbeddingProvider: def __init__(self) -> None: self.calls: list[list[str]] = [] def embed_texts(self, texts: list[str]) -> list[list[float]]: self.calls.append(texts) return [[0.1] * 1536 for _ in texts] def embed_query(self, text: str) -> list[float]: return [0.2] * 1536 class FakeVectorIndex: def __init__(self) -> None: self.upserts: list[tuple[list[Chunk], list[list[float]]]] = [] def upsert(self, chunks: list[Chunk], vectors: list[list[float]]) -> int: self.upserts.append((chunks, vectors)) return len(chunks) def delete_by_document(self, doc_id: str) -> int: return 0 def search(self, query_vector: list[float], top_k: int, filters: str | None = None): return [] def health(self) -> dict: return {"collection_name": "regulations_dense_1536"} def test_document_command_service_uses_1536_dense_embedding_and_updates_status(): repository = FakeRepository() binary_store = FakeBinaryStore() embedding_provider = FakeEmbeddingProvider() vector_index = FakeVectorIndex() service = DocumentCommandService( document_repository=repository, binary_store=binary_store, parser=FakeParser(), chunk_builder=FakeChunkBuilder(), embedding_provider=embedding_provider, vector_index=vector_index, ) result = service.upload_and_process( doc_id="doc12345", file_name="test.pdf", content=b"dummy pdf bytes", content_type="application/pdf", doc_name="测试法规", regulation_type="车辆安全", version="2026", generate_summary=False, ) assert result.status == "indexed" assert result.num_chunks == 1 assert embedding_provider.calls == [["标准:测试\n章节:第一章\n\n法规正文"]] assert len(vector_index.upserts) == 1 stored = repository.get("doc12345") assert stored is not None assert stored.status == DocumentStatus.INDEXED assert stored.chunk_count == 1 assert stored.parser_name == "fake_parser" assert stored.index_name == "regulations_dense_1536" def test_bootstrap_defaults_to_local_parser_and_chunk_builder(): bootstrap.get_parser.cache_clear() bootstrap.get_chunk_builder.cache_clear() parser = bootstrap.get_parser() chunk_builder = bootstrap.get_chunk_builder() assert parser.__class__.__name__ == "LocalDocumentParser" assert chunk_builder.__class__.__name__ == "LocalRegulationChunkBuilder"