feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated the RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for the embedding model and backend.
This commit is contained in:
ash66
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions

View File

@@ -1,4 +1,4 @@
"""新架构下的文档编排与 embedding 边界测试。"""
"""Document orchestration and embedding boundary tests for the migrated backend."""
from __future__ import annotations
@@ -80,6 +80,7 @@ class FakeParser:
return ParsedDocument(
doc_id=doc_id,
doc_name=doc_name,
raw_layouts=[{"uniqueId": "layout-1", "type": "text"}],
structure_nodes=[{"title": "第一章"}],
semantic_blocks=[{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}],
vector_chunks=[
@@ -95,6 +96,7 @@ class FakeParser:
}
],
parser_name="fake_parser",
metadata={"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1},
)
@@ -125,10 +127,10 @@ class FakeEmbeddingProvider:
def embed_texts(self, texts: list[str]) -> list[list[float]]:
self.calls.append(texts)
return [[0.1] * 1536 for _ in texts]
return [[0.1] * 1024 for _ in texts]
def embed_query(self, text: str) -> list[float]:
return [0.2] * 1536
return [0.2] * 1024
class FakeVectorIndex:
@@ -146,10 +148,10 @@ class FakeVectorIndex:
return []
def health(self) -> dict:
return {"collection_name": "regulations_dense_1536"}
return {"collection_name": "regulations_dense_1024_v1"}
def test_document_command_service_uses_1536_dense_embedding_and_updates_status():
def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
repository = FakeRepository()
binary_store = FakeBinaryStore()
embedding_provider = FakeEmbeddingProvider()
@@ -183,15 +185,16 @@ def test_document_command_service_uses_1536_dense_embedding_and_updates_status()
assert stored.status == DocumentStatus.INDEXED
assert stored.chunk_count == 1
assert stored.parser_name == "fake_parser"
assert stored.index_name == "regulations_dense_1536"
assert stored.index_name == "regulations_dense_1024_v1"
assert stored.metadata["parse_task_id"] == "task-123"
assert stored.metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")
def test_bootstrap_defaults_to_local_parser_and_chunk_builder():
def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
bootstrap.get_parser.cache_clear()
bootstrap.get_chunk_builder.cache_clear()
parser = bootstrap.get_parser()
chunk_builder = bootstrap.get_chunk_builder()
assert parser.__class__.__name__ == "LocalDocumentParser"
assert chunk_builder.__class__.__name__ == "LocalRegulationChunkBuilder"
assert parser.__class__.__name__ == "AliyunDocumentParser"
assert chunk_builder.__class__.__name__ == "AliyunVectorChunkBuilder"