AIRegulation-DocAnalysis/tests/test_embedding.py

"""Document orchestration and embedding boundary tests for the migrated backend."""

from __future__ import annotations

from dataclasses import dataclass

from app.application.documents.services import DocumentCommandService
from app.domain.documents import Chunk, Document, DocumentStatus, ParsedDocument
from app.shared import bootstrap


class FakeRepository:
    """In-memory document repository double, keyed by ``doc_id``."""

    def __init__(self) -> None:
        # doc_id -> Document; nothing is persisted outside this dict.
        self.documents: dict[str, Document] = {}

    def create(self, document: Document) -> Document:
        """Store ``document`` under its ``doc_id`` (overwriting) and return it."""
        self.documents.update({document.doc_id: document})
        return document

    def update(self, document: Document) -> Document:
        """Replace whatever is stored under ``document.doc_id`` and return it."""
        key = document.doc_id
        self.documents[key] = document
        return document

    def get(self, doc_id: str) -> Document | None:
        """Return the stored document for ``doc_id``, or ``None`` if absent."""
        try:
            return self.documents[doc_id]
        except KeyError:
            return None

    def list(self, limit: int | None = None) -> list[Document]:
        """Return stored documents in dict order, truncated to ``limit`` if given."""
        snapshot = [*self.documents.values()]
        if limit is None:
            return snapshot
        return snapshot[:limit]

    def update_status(
        self,
        doc_id: str,
        status: DocumentStatus,
        *,
        error_message: str = "",
        chunk_count: int | None = None,
        summary: str | None = None,
        summary_latency_ms: int | None = None,
        parser_name: str | None = None,
        index_name: str | None = None,
        metadata: dict | None = None,
    ) -> Document | None:
        """Mutate the stored document's status fields in place.

        ``status`` and ``error_message`` are always overwritten. The other
        keyword fields are applied only when they are not ``None``, and a
        non-empty ``metadata`` dict is merged into the document's metadata.

        Returns:
            The mutated document, or ``None`` when no (truthy) document is
            stored under ``doc_id``.
        """
        record = self.documents.get(doc_id)
        if not record:
            return None

        record.status = status
        record.error_message = error_message

        # Optional fields: ``None`` means "leave the current value alone".
        optional_fields = {
            "chunk_count": chunk_count,
            "summary": summary,
            "summary_latency_ms": summary_latency_ms,
            "parser_name": parser_name,
            "index_name": index_name,
        }
        for attribute, value in optional_fields.items():
            if value is not None:
                setattr(record, attribute, value)

        if metadata:
            record.metadata.update(metadata)
        return record


class FakeBinaryStore:
    """Dictionary-backed object store double that keeps only the raw bytes."""

    def __init__(self) -> None:
        # object_name -> payload bytes.
        self.saved: dict[str, bytes] = {}

    def save(self, *, object_name: str, data: bytes, content_type: str, metadata: dict[str, str] | None = None) -> None:
        """Record ``data`` under ``object_name``, replacing any earlier payload.

        ``content_type`` and ``metadata`` are accepted but not stored.
        """
        self.saved.update({object_name: data})

    def read(self, object_name: str) -> bytes:
        """Return the bytes saved under ``object_name``.

        Raises:
            KeyError: If nothing was saved under that name.
        """
        blobs = self.saved
        return blobs[object_name]

    def delete(self, object_name: str) -> None:
        """Forget ``object_name``; deleting an unknown name is a no-op."""
        if object_name in self.saved:
            del self.saved[object_name]


class FakeParser:
    """Parser double that returns one fixed, single-chunk ``ParsedDocument``."""

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Build the canned parse result for ``doc_id`` / ``doc_name``.

        ``file_path`` is accepted but never read; only the chunk id varies
        with the inputs (it is derived from ``doc_id``).
        """
        layout = {"uniqueId": "layout-1", "type": "text"}
        structure_node = {"title": "第一章"}
        semantic_block = {"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}
        vector_chunk = {
            "chunk_id": f"{doc_id}-chunk-1",
            "semantic_id": "semantic-1",
            "chunk_type": "section_text",
            "section_title": "第一章",
            "section_path": ["第一章"],
            "page_start": 1,
            "text": "法规正文",
            "embedding_text": "标准：测试\n章节：第一章\n\n法规正文",
        }
        parse_metadata = {"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1}

        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            raw_layouts=[layout],
            structure_nodes=[structure_node],
            semantic_blocks=[semantic_block],
            vector_chunks=[vector_chunk],
            parser_name="fake_parser",
            metadata=parse_metadata,
        )


class FakeChunkBuilder:
    """Chunk builder double that always emits exactly one fixed ``Chunk``."""

    def build(self, *, parsed_document: ParsedDocument, regulation_type: str, version: str) -> list[Chunk]:
        """Return a one-element list holding the canned chunk.

        Only ``doc_id`` and ``doc_name`` are taken from ``parsed_document``;
        ``regulation_type`` and ``version`` are copied through unchanged.
        """
        source_id = parsed_document.doc_id
        only_chunk = Chunk(
            chunk_id=f"{source_id}-chunk-1",
            doc_id=source_id,
            doc_name=parsed_document.doc_name,
            content="法规正文",
            embedding_text="标准：测试\n章节：第一章\n\n法规正文",
            section_title="第一章",
            section_path=["第一章"],
            page_number=1,
            regulation_type=regulation_type,
            version=version,
            semantic_id="semantic-1",
            block_type="section_text",
            metadata={"source": "aliyun_vector_chunk"},
        )
        return [only_chunk]


class FakeEmbeddingProvider:
    """Embedding double that records requested batches and returns constant vectors.

    Args:
        dimension: Length of every vector returned. Defaults to 1024, the
            value this fake previously hard-coded.
    """

    def __init__(self, dimension: int = 1024) -> None:
        self.dimension = dimension
        # One entry per ``embed_texts`` call, in call order.
        self.calls: list[list[str]] = []

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Record the batch and return one ``0.1``-filled vector per text."""
        # Store a snapshot rather than the caller's list: if the caller later
        # mutates or reuses its list, the recorded call must not change.
        batch = list(texts)
        self.calls.append(batch)
        return [[0.1] * self.dimension for _ in batch]

    def embed_query(self, text: str) -> list[float]:
        """Return a ``0.2``-filled vector; the query text itself is ignored."""
        return [0.2] * self.dimension


class FakeVectorIndex:
    """Vector index double that records upserts and answers with fixed values."""

    def __init__(self) -> None:
        # One ``(chunks, vectors)`` pair per ``upsert`` call, in call order.
        self.upserts: list[tuple[list[Chunk], list[list[float]]]] = []

    def upsert(self, chunks: list[Chunk], vectors: list[list[float]]) -> int:
        """Record the batch and return how many chunks it contained."""
        # Shallow-copy both sequences so the recorded call is not rewritten
        # if the caller appends to or clears its own lists afterwards.
        recorded_chunks = list(chunks)
        recorded_vectors = list(vectors)
        self.upserts.append((recorded_chunks, recorded_vectors))
        return len(recorded_chunks)

    def delete_by_document(self, doc_id: str) -> int:
        """Pretend to delete; always reports zero removed entries."""
        return 0

    def search(self, query_vector: list[float], top_k: int, filters: str | None = None):
        """Always return an empty result list, whatever the arguments."""
        return []

    def health(self) -> dict:
        """Return a fixed health payload naming the collection."""
        return {"collection_name": "regulations_dense_1024_v1"}


def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
    """Push one upload through the service with fakes and check what was recorded."""
    repository = FakeRepository()
    embedding_provider = FakeEmbeddingProvider()
    vector_index = FakeVectorIndex()
    collaborators = {
        "document_repository": repository,
        "binary_store": FakeBinaryStore(),
        "parser": FakeParser(),
        "chunk_builder": FakeChunkBuilder(),
        "embedding_provider": embedding_provider,
        "vector_index": vector_index,
    }
    service = DocumentCommandService(**collaborators)

    upload_request = {
        "doc_id": "doc12345",
        "file_name": "test.pdf",
        "content": b"dummy pdf bytes",
        "content_type": "application/pdf",
        "doc_name": "测试法规",
        "regulation_type": "车辆安全",
        "version": "2026",
        "generate_summary": False,
    }
    result = service.upload_and_process(**upload_request)

    # What the service reported back to the caller.
    assert result.status == "indexed"
    assert result.num_chunks == 1

    # What reached the embedding and indexing boundaries.
    expected_embedding_text = "标准：测试\n章节：第一章\n\n法规正文"
    assert embedding_provider.calls == [[expected_embedding_text]]
    assert len(vector_index.upserts) == 1

    # What ended up persisted on the document record.
    stored = repository.get("doc12345")
    assert stored is not None
    assert stored.status == DocumentStatus.INDEXED
    assert stored.chunk_count == 1
    assert stored.parser_name == "fake_parser"
    assert stored.index_name == "regulations_dense_1024_v1"
    stored_metadata = stored.metadata
    assert stored_metadata["parse_task_id"] == "task-123"
    assert stored_metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")

def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
    """Check which parser and chunk-builder classes bootstrap resolves to."""
    # Clear both factories' caches first so each one is resolved afresh here.
    for factory in (bootstrap.get_parser, bootstrap.get_chunk_builder):
        factory.cache_clear()

    resolved_parser = bootstrap.get_parser()
    resolved_builder = bootstrap.get_chunk_builder()

    # Compared by class name, so the concrete classes need not be imported.
    parser_class_name = resolved_parser.__class__.__name__
    builder_class_name = resolved_builder.__class__.__name__
    assert parser_class_name == "AliyunDocumentParser"
    assert builder_class_name == "AliyunVectorChunkBuilder"
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`"""Document orchestration and embedding boundary tests for the migrated backend."""`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00
			`from __future__ import annotations`

			`from dataclasses import dataclass`

			`from app.application.documents.services import DocumentCommandService`
			`from app.domain.documents import Chunk, Document, DocumentStatus, ParsedDocument`
			`from app.shared import bootstrap`


			`class FakeRepository:`
			`def __init__(self) -> None:`
			`self.documents: dict[str, Document] = {}`

			`def create(self, document: Document) -> Document:`
			`self.documents[document.doc_id] = document`
			`return document`

			`def update(self, document: Document) -> Document:`
			`self.documents[document.doc_id] = document`
			`return document`

			`def get(self, doc_id: str) -> Document \| None:`
			`return self.documents.get(doc_id)`

			`def list(self, limit: int \| None = None) -> list[Document]:`
			`values = list(self.documents.values())`
			`return values[:limit] if limit is not None else values`

			`def update_status(`
			`self,`
			`doc_id: str,`
			`status: DocumentStatus,`
			`*,`
			`error_message: str = "",`
			`chunk_count: int \| None = None,`
			`summary: str \| None = None,`
			`summary_latency_ms: int \| None = None,`
			`parser_name: str \| None = None,`
			`index_name: str \| None = None,`
			`metadata: dict \| None = None,`
			`) -> Document \| None:`
			`document = self.documents.get(doc_id)`
			`if not document:`
			`return None`
			`document.status = status`
			`document.error_message = error_message`
			`if chunk_count is not None:`
			`document.chunk_count = chunk_count`
			`if summary is not None:`
			`document.summary = summary`
			`if summary_latency_ms is not None:`
			`document.summary_latency_ms = summary_latency_ms`
			`if parser_name is not None:`
			`document.parser_name = parser_name`
			`if index_name is not None:`
			`document.index_name = index_name`
			`if metadata:`
			`document.metadata.update(metadata)`
			`return document`


			`class FakeBinaryStore:`
			`def __init__(self) -> None:`
			`self.saved: dict[str, bytes] = {}`

			`def save(self, *, object_name: str, data: bytes, content_type: str, metadata: dict[str, str] \| None = None) -> None:`
			`self.saved[object_name] = data`

			`def read(self, object_name: str) -> bytes:`
			`return self.saved[object_name]`

			`def delete(self, object_name: str) -> None:`
			`self.saved.pop(object_name, None)`


			`class FakeParser:`
			`def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:`
			`return ParsedDocument(`
			`doc_id=doc_id,`
			`doc_name=doc_name,`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`raw_layouts=[{"uniqueId": "layout-1", "type": "text"}],`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`structure_nodes=[{"title": "第一章"}],`
			`semantic_blocks=[{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}],`
			`vector_chunks=[`
			`{`
			`"chunk_id": f"{doc_id}-chunk-1",`
			`"semantic_id": "semantic-1",`
			`"chunk_type": "section_text",`
			`"section_title": "第一章",`
			`"section_path": ["第一章"],`
			`"page_start": 1,`
			`"text": "法规正文",`
			`"embedding_text": "标准：测试\n章节：第一章\n\n法规正文",`
			`}`
			`],`
			`parser_name="fake_parser",`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`metadata={"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1},`
first commit 2026-04-28 11:29:33 +08:00			`)`


Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`class FakeChunkBuilder:`
			`def build(self, *, parsed_document: ParsedDocument, regulation_type: str, version: str) -> list[Chunk]:`
			`return [`
			`Chunk(`
			`chunk_id=f"{parsed_document.doc_id}-chunk-1",`
			`doc_id=parsed_document.doc_id,`
			`doc_name=parsed_document.doc_name,`
			`content="法规正文",`
			`embedding_text="标准：测试\n章节：第一章\n\n法规正文",`
			`section_title="第一章",`
			`section_path=["第一章"],`
			`page_number=1,`
			`regulation_type=regulation_type,`
			`version=version,`
			`semantic_id="semantic-1",`
			`block_type="section_text",`
			`metadata={"source": "aliyun_vector_chunk"},`
			`)`
first commit 2026-04-28 11:29:33 +08:00			`]`


Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`class FakeEmbeddingProvider:`
			`def __init__(self) -> None:`
			`self.calls: list[list[str]] = []`

			`def embed_texts(self, texts: list[str]) -> list[list[float]]:`
			`self.calls.append(texts)`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`return [[0.1] * 1024 for _ in texts]`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00
			`def embed_query(self, text: str) -> list[float]:`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`return [0.2] * 1024`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00

			`class FakeVectorIndex:`
			`def __init__(self) -> None:`
			`self.upserts: list[tuple[list[Chunk], list[list[float]]]] = []`

			`def upsert(self, chunks: list[Chunk], vectors: list[list[float]]) -> int:`
			`self.upserts.append((chunks, vectors))`
			`return len(chunks)`

			`def delete_by_document(self, doc_id: str) -> int:`
			`return 0`

			`def search(self, query_vector: list[float], top_k: int, filters: str \| None = None):`
			`return []`

			`def health(self) -> dict:`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`return {"collection_name": "regulations_dense_1024_v1"}`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00

feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`def test_document_command_service_uses_1024_dense_embedding_and_updates_status():`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`repository = FakeRepository()`
			`binary_store = FakeBinaryStore()`
			`embedding_provider = FakeEmbeddingProvider()`
			`vector_index = FakeVectorIndex()`
			`service = DocumentCommandService(`
			`document_repository=repository,`
			`binary_store=binary_store,`
			`parser=FakeParser(),`
			`chunk_builder=FakeChunkBuilder(),`
			`embedding_provider=embedding_provider,`
			`vector_index=vector_index,`
			`)`

			`result = service.upload_and_process(`
			`doc_id="doc12345",`
			`file_name="test.pdf",`
			`content=b"dummy pdf bytes",`
			`content_type="application/pdf",`
			`doc_name="测试法规",`
			`regulation_type="车辆安全",`
			`version="2026",`
			`generate_summary=False,`
			`)`

			`assert result.status == "indexed"`
			`assert result.num_chunks == 1`
			`assert embedding_provider.calls == [["标准：测试\n章节：第一章\n\n法规正文"]]`
			`assert len(vector_index.upserts) == 1`
			`stored = repository.get("doc12345")`
			`assert stored is not None`
			`assert stored.status == DocumentStatus.INDEXED`
			`assert stored.chunk_count == 1`
			`assert stored.parser_name == "fake_parser"`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`assert stored.index_name == "regulations_dense_1024_v1"`
			`assert stored.metadata["parse_task_id"] == "task-123"`
			`assert stored.metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`bootstrap.get_parser.cache_clear()`
			`bootstrap.get_chunk_builder.cache_clear()`

			`parser = bootstrap.get_parser()`
			`chunk_builder = bootstrap.get_chunk_builder()`

feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`assert parser.__class__.__name__ == "AliyunDocumentParser"`
			`assert chunk_builder.__class__.__name__ == "AliyunVectorChunkBuilder"`