feat: Migrate document parsing to Aliyun and update embedding configurations
- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated the RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for the embedding model and backend.
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
"""新架构下的文档编排与 embedding 边界测试。"""
|
||||
"""Document orchestration and embedding boundary tests for the migrated backend."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -80,6 +80,7 @@ class FakeParser:
|
||||
return ParsedDocument(
|
||||
doc_id=doc_id,
|
||||
doc_name=doc_name,
|
||||
raw_layouts=[{"uniqueId": "layout-1", "type": "text"}],
|
||||
structure_nodes=[{"title": "第一章"}],
|
||||
semantic_blocks=[{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}],
|
||||
vector_chunks=[
|
||||
@@ -95,6 +96,7 @@ class FakeParser:
|
||||
}
|
||||
],
|
||||
parser_name="fake_parser",
|
||||
metadata={"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1},
|
||||
)
|
||||
|
||||
|
||||
@@ -125,10 +127,10 @@ class FakeEmbeddingProvider:
|
||||
|
||||
def embed_texts(self, texts: list[str]) -> list[list[float]]:
|
||||
self.calls.append(texts)
|
||||
return [[0.1] * 1536 for _ in texts]
|
||||
return [[0.1] * 1024 for _ in texts]
|
||||
|
||||
def embed_query(self, text: str) -> list[float]:
|
||||
return [0.2] * 1536
|
||||
return [0.2] * 1024
|
||||
|
||||
|
||||
class FakeVectorIndex:
|
||||
@@ -146,10 +148,10 @@ class FakeVectorIndex:
|
||||
return []
|
||||
|
||||
def health(self) -> dict:
|
||||
return {"collection_name": "regulations_dense_1536"}
|
||||
return {"collection_name": "regulations_dense_1024_v1"}
|
||||
|
||||
|
||||
def test_document_command_service_uses_1536_dense_embedding_and_updates_status():
|
||||
def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
|
||||
repository = FakeRepository()
|
||||
binary_store = FakeBinaryStore()
|
||||
embedding_provider = FakeEmbeddingProvider()
|
||||
@@ -183,15 +185,16 @@ def test_document_command_service_uses_1536_dense_embedding_and_updates_status()
|
||||
assert stored.status == DocumentStatus.INDEXED
|
||||
assert stored.chunk_count == 1
|
||||
assert stored.parser_name == "fake_parser"
|
||||
assert stored.index_name == "regulations_dense_1536"
|
||||
assert stored.index_name == "regulations_dense_1024_v1"
|
||||
assert stored.metadata["parse_task_id"] == "task-123"
|
||||
assert stored.metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")
|
||||
|
||||
|
||||
def test_bootstrap_defaults_to_local_parser_and_chunk_builder():
|
||||
def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
|
||||
bootstrap.get_parser.cache_clear()
|
||||
bootstrap.get_chunk_builder.cache_clear()
|
||||
|
||||
parser = bootstrap.get_parser()
|
||||
chunk_builder = bootstrap.get_chunk_builder()
|
||||
|
||||
assert parser.__class__.__name__ == "LocalDocumentParser"
|
||||
assert chunk_builder.__class__.__name__ == "LocalRegulationChunkBuilder"
|
||||
assert parser.__class__.__name__ == "AliyunDocumentParser"
|
||||
assert chunk_builder.__class__.__name__ == "AliyunVectorChunkBuilder"
|
||||
|
||||
Reference in New Issue
Block a user