feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for the embedding model and backend.
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions
--- a/tests/test_embedding.py
+++ b/tests/test_embedding.py
@@ -1,4 +1,4 @@
-"""新架构下的文档编排与 embedding 边界测试。"""
+"""Document orchestration and embedding boundary tests for the migrated backend."""

 from __future__ import annotations

@@ -80,6 +80,7 @@ class FakeParser:
        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
+            raw_layouts=[{"uniqueId": "layout-1", "type": "text"}],
            structure_nodes=[{"title": "第一章"}],
            semantic_blocks=[{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}],
            vector_chunks=[
@@ -95,6 +96,7 @@ class FakeParser:
                }
            ],
            parser_name="fake_parser",
+            metadata={"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1},
        )


@@ -125,10 +127,10 @@ class FakeEmbeddingProvider:

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        self.calls.append(texts)
-        return [[0.1] * 1536 for _ in texts]
+        return [[0.1] * 1024 for _ in texts]

    def embed_query(self, text: str) -> list[float]:
-        return [0.2] * 1536
+        return [0.2] * 1024


 class FakeVectorIndex:
@@ -146,10 +148,10 @@ class FakeVectorIndex:
        return []

    def health(self) -> dict:
-        return {"collection_name": "regulations_dense_1536"}
+        return {"collection_name": "regulations_dense_1024_v1"}


-def test_document_command_service_uses_1536_dense_embedding_and_updates_status():
+def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
    repository = FakeRepository()
    binary_store = FakeBinaryStore()
    embedding_provider = FakeEmbeddingProvider()
@@ -183,15 +185,16 @@ def test_document_command_service_uses_1536_dense_embedding_and_updates_status()
    assert stored.status == DocumentStatus.INDEXED
    assert stored.chunk_count == 1
    assert stored.parser_name == "fake_parser"
-    assert stored.index_name == "regulations_dense_1536"
+    assert stored.index_name == "regulations_dense_1024_v1"
+    assert stored.metadata["parse_task_id"] == "task-123"
+    assert stored.metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")

-
-def test_bootstrap_defaults_to_local_parser_and_chunk_builder():
+def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
    bootstrap.get_parser.cache_clear()
    bootstrap.get_chunk_builder.cache_clear()

    parser = bootstrap.get_parser()
    chunk_builder = bootstrap.get_chunk_builder()

-    assert parser.__class__.__name__ == "LocalDocumentParser"
-    assert chunk_builder.__class__.__name__ == "LocalRegulationChunkBuilder"
+    assert parser.__class__.__name__ == "AliyunDocumentParser"
+    assert chunk_builder.__class__.__name__ == "AliyunVectorChunkBuilder"