# AIRegulation-DocAnalysis/tests/test_embedding.py

"""Document orchestration and embedding boundary tests for the migrated backend."""

from __future__ import annotations

from dataclasses import dataclass

from app.application.documents.services import DocumentCommandService
from app.domain.documents import Chunk, Document, DocumentStatus, ParsedDocument
from app.shared import bootstrap


class FakeRepository:
    """In-memory document repository double, keyed by ``doc_id``."""

    def __init__(self) -> None:
        # Maps doc_id -> the stored document object (kept by reference).
        self.documents: dict[str, Document] = {}

    def create(self, document: Document) -> Document:
        """Store *document* under its ``doc_id`` and hand it back."""
        key = document.doc_id
        self.documents[key] = document
        return document

    def update(self, document: Document) -> Document:
        """Overwrite the entry for ``document.doc_id`` and hand *document* back."""
        key = document.doc_id
        self.documents[key] = document
        return document

    def get(self, doc_id: str) -> Document | None:
        """Return the stored document for *doc_id*, or ``None`` when absent."""
        return self.documents.get(doc_id)

    def list(self, limit: int | None = None) -> list[Document]:
        """Return stored documents in insertion order, truncated to *limit* if given."""
        stored = [*self.documents.values()]
        if limit is None:
            return stored
        return stored[:limit]

    def update_status(
        self,
        doc_id: str,
        status: DocumentStatus,
        *,
        error_message: str = "",
        chunk_count: int | None = None,
        summary: str | None = None,
        summary_latency_ms: int | None = None,
        parser_name: str | None = None,
        index_name: str | None = None,
        metadata: dict | None = None,
    ) -> Document | None:
        """Mutate the stored document's status fields in place.

        ``status`` and ``error_message`` are always written. The remaining
        keyword arguments are written only when they are not ``None``, and a
        non-empty *metadata* mapping is merged into the document's metadata.

        Returns the mutated document, or ``None`` when *doc_id* is unknown.
        """
        record = self.documents.get(doc_id)
        if not record:
            return None

        record.status = status
        record.error_message = error_message

        # Optional fields: ``None`` means "leave the current value alone".
        optional_fields = {
            "chunk_count": chunk_count,
            "summary": summary,
            "summary_latency_ms": summary_latency_ms,
            "parser_name": parser_name,
            "index_name": index_name,
        }
        for attribute, value in optional_fields.items():
            if value is not None:
                setattr(record, attribute, value)

        if metadata:
            record.metadata.update(metadata)
        return record


class FakeBinaryStore:
    """In-memory binary store double mapping object names to raw bytes."""

    def __init__(self) -> None:
        # object_name -> payload bytes
        self.saved: dict[str, bytes] = {}

    def save(
        self,
        *,
        object_name: str,
        data: bytes,
        content_type: str,
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Keep *data* under *object_name*; *content_type* and *metadata* are not stored."""
        self.saved.update({object_name: data})

    def read(self, object_name: str) -> bytes:
        """Return the bytes saved under *object_name*; raises ``KeyError`` if absent."""
        payload = self.saved[object_name]
        return payload

    def delete(self, object_name: str) -> None:
        """Drop *object_name* if present; a missing name is silently ignored."""
        if object_name in self.saved:
            del self.saved[object_name]


class FakeParser:
    """Parser double that returns one canned ``ParsedDocument`` without reading any file."""

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Build a fixed parse result carrying *doc_id* and *doc_name*.

        *file_path* is accepted for interface compatibility but never opened.
        The result holds one layout, one structure node, one semantic block
        and one vector chunk whose id is derived from *doc_id*.
        """
        layouts = [{"uniqueId": "layout-1", "type": "text"}]
        outline = [{"title": "第一章"}]
        blocks = [{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}]
        only_chunk = {
            "chunk_id": f"{doc_id}-chunk-1",
            "semantic_id": "semantic-1",
            "chunk_type": "section_text",
            "section_title": "第一章",
            "section_path": ["第一章"],
            "page_start": 1,
            "text": "法规正文",
            "embedding_text": "标准：测试\n章节：第一章\n\n法规正文",
        }
        parse_metadata = {"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1}

        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            raw_layouts=layouts,
            structure_nodes=outline,
            semantic_blocks=blocks,
            vector_chunks=[only_chunk],
            parser_name="fake_parser",
            metadata=parse_metadata,
        )


class FakeChunkBuilder:
    """Chunk-builder double that emits exactly one fixed ``Chunk`` per call."""

    def build(self, *, parsed_document: ParsedDocument, regulation_type: str, version: str) -> list[Chunk]:
        """Return a single-element list holding a canned chunk.

        Only ``doc_id`` and ``doc_name`` are read from *parsed_document*;
        *regulation_type* and *version* are copied onto the chunk unchanged.
        """
        source_id = parsed_document.doc_id
        chunk_fields = {
            "chunk_id": f"{source_id}-chunk-1",
            "doc_id": source_id,
            "doc_name": parsed_document.doc_name,
            "content": "法规正文",
            "embedding_text": "标准：测试\n章节：第一章\n\n法规正文",
            "section_title": "第一章",
            "section_path": ["第一章"],
            "page_number": 1,
            "regulation_type": regulation_type,
            "version": version,
            "semantic_id": "semantic-1",
            "block_type": "section_text",
            "metadata": {"source": "aliyun_vector_chunk"},
        }
        return [Chunk(**chunk_fields)]


class FakeEmbeddingProvider:
    """Embedding double that records batch requests and returns constant vectors.

    Args:
        dimension: Length of every vector produced. Defaults to 1024, the
            size this fake has always returned.
    """

    def __init__(self, *, dimension: int = 1024) -> None:
        self.dimension = dimension
        # One entry per ``embed_texts`` call, in call order.
        self.calls: list[list[str]] = []

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Record the batch and return one ``0.1``-filled vector per text.

        The batch is copied before it is recorded, so a caller that later
        mutates or reuses its list cannot rewrite the recorded call history.
        """
        batch = list(texts)
        self.calls.append(batch)
        return [[0.1] * self.dimension for _ in batch]

    def embed_query(self, text: str) -> list[float]:
        """Return a ``0.2``-filled vector; query calls are not recorded."""
        return [0.2] * self.dimension


class FakeVectorIndex:
    """Vector index double that records upserts and answers with fixed values."""

    def __init__(self) -> None:
        # One ``(chunks, vectors)`` pair per ``upsert`` call, in call order.
        self.upserts: list[tuple[list[Chunk], list[list[float]]]] = []

    def upsert(self, chunks: list[Chunk], vectors: list[list[float]]) -> int:
        """Record the batch and return the number of chunks received.

        Both sequences are copied (each vector individually) before they are
        recorded, so later mutation by the caller cannot alter what this fake
        reports it was given.
        """
        recorded_chunks = list(chunks)
        recorded_vectors = [list(vector) for vector in vectors]
        self.upserts.append((recorded_chunks, recorded_vectors))
        return len(recorded_chunks)

    def delete_by_document(self, doc_id: str) -> int:
        """Pretend nothing was stored for *doc_id*: always reports 0 deletions."""
        return 0

    def search(self, query_vector: list[float], top_k: int, filters: str | None = None) -> list:
        """Return no hits regardless of the query."""
        return []

    def health(self) -> dict:
        """Return a fixed health payload naming the collection."""
        return {"collection_name": "regulations_dense_1024_v1"}

def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
    """Run one upload through the service with fakes and check what was recorded.

    Verifies the returned result, the text sent to the embedding provider,
    the number of index upserts, and the fields persisted on the document.
    """
    repo = FakeRepository()
    embedder = FakeEmbeddingProvider()
    index = FakeVectorIndex()
    command_service = DocumentCommandService(
        document_repository=repo,
        binary_store=FakeBinaryStore(),
        parser=FakeParser(),
        chunk_builder=FakeChunkBuilder(),
        embedding_provider=embedder,
        vector_index=index,
    )
    upload_request = {
        "doc_id": "doc12345",
        "file_name": "test.pdf",
        "content": b"dummy pdf bytes",
        "content_type": "application/pdf",
        "doc_name": "测试法规",
        "regulation_type": "车辆安全",
        "version": "2026",
        "generate_summary": False,
    }

    outcome = command_service.upload_and_process(**upload_request)

    # Returned processing result.
    assert outcome.status == "indexed"
    assert outcome.num_chunks == 1

    # Exactly one embedding batch, carrying the chunk's embedding text, and one upsert.
    expected_embedding_input = "标准：测试\n章节：第一章\n\n法规正文"
    assert embedder.calls == [[expected_embedding_input]]
    assert len(index.upserts) == 1

    # State persisted through the repository.
    persisted = repo.get("doc12345")
    assert persisted is not None
    assert persisted.status == DocumentStatus.INDEXED
    assert persisted.chunk_count == 1
    assert persisted.parser_name == "fake_parser"
    assert persisted.index_name == "regulations_dense_1024_v1"
    assert persisted.metadata["parse_task_id"] == "task-123"
    artifact_keys = persisted.metadata["artifact_keys"]
    assert artifact_keys["vector_chunks"].endswith("/vector_chunks.json")

def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
    """Check the class names of the parser and chunk builder that bootstrap returns."""
    # Drop any cached instances so both factories build fresh objects.
    for factory in (bootstrap.get_parser, bootstrap.get_chunk_builder):
        factory.cache_clear()

    default_parser = bootstrap.get_parser()
    default_chunk_builder = bootstrap.get_chunk_builder()

    parser_class_name = default_parser.__class__.__name__
    builder_class_name = default_chunk_builder.__class__.__name__
    assert parser_class_name == "AliyunDocumentParser"
    assert builder_class_name == "AliyunVectorChunkBuilder"