- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated the RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated the parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for the embedding model and backend.
201 lines
7.0 KiB
Python
201 lines
7.0 KiB
Python
"""Document orchestration and embedding boundary tests for the migrated backend."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from app.application.documents.services import DocumentCommandService
|
|
from app.domain.documents import Chunk, Document, DocumentStatus, ParsedDocument
|
|
from app.shared import bootstrap
|
|
|
|
|
|
class FakeRepository:
    """In-memory document repository test double, keyed by ``doc_id``."""

    def __init__(self) -> None:
        # Backing store; tests can inspect it directly.
        self.documents: dict[str, Document] = {}

    def _store(self, document: Document) -> Document:
        """Write ``document`` under its ``doc_id`` and hand the same object back."""
        self.documents.update({document.doc_id: document})
        return document

    def create(self, document: Document) -> Document:
        """Register ``document`` and return it."""
        return self._store(document)

    def update(self, document: Document) -> Document:
        """Overwrite the entry for ``document.doc_id`` and return ``document``."""
        return self._store(document)

    def get(self, doc_id: str) -> Document | None:
        """Return the document stored under ``doc_id``, or ``None`` when absent."""
        return self.documents.get(doc_id)

    def list(self, limit: int | None = None) -> list[Document]:
        """Return stored documents in insertion order, truncated to ``limit`` if given."""
        snapshot = [*self.documents.values()]
        if limit is None:
            return snapshot
        return snapshot[:limit]

    def update_status(
        self,
        doc_id: str,
        status: DocumentStatus,
        *,
        error_message: str = "",
        chunk_count: int | None = None,
        summary: str | None = None,
        summary_latency_ms: int | None = None,
        parser_name: str | None = None,
        index_name: str | None = None,
        metadata: dict | None = None,
    ) -> Document | None:
        """Mutate the stored document in place and return it.

        ``status`` and ``error_message`` are always overwritten. The remaining
        keyword arguments are applied only when they are not ``None``, and
        ``metadata`` is merged into the existing metadata only when non-empty.
        Returns ``None`` when no document is stored under ``doc_id``.
        """
        record = self.documents.get(doc_id)
        if not record:
            return None

        record.status = status
        record.error_message = error_message

        # Optional fields: ``None`` means "leave the current value untouched".
        optional_fields = {
            "chunk_count": chunk_count,
            "summary": summary,
            "summary_latency_ms": summary_latency_ms,
            "parser_name": parser_name,
            "index_name": index_name,
        }
        for attribute, value in optional_fields.items():
            if value is not None:
                setattr(record, attribute, value)

        if metadata:
            record.metadata.update(metadata)
        return record
|
|
|
|
|
|
class FakeBinaryStore:
    """In-memory binary store test double mapping object names to raw bytes."""

    def __init__(self) -> None:
        # object_name -> payload; content type and metadata are not retained.
        self.saved: dict[str, bytes] = {}

    def save(self, *, object_name: str, data: bytes, content_type: str, metadata: dict[str, str] | None = None) -> None:
        """Store ``data`` under ``object_name``, replacing any earlier payload."""
        self.saved.update({object_name: data})

    def read(self, object_name: str) -> bytes:
        """Return the payload saved under ``object_name``; ``KeyError`` if missing."""
        payload = self.saved[object_name]
        return payload

    def delete(self, object_name: str) -> None:
        """Remove ``object_name`` if present; unknown names are ignored."""
        if object_name in self.saved:
            del self.saved[object_name]
|
|
|
|
|
|
class FakeParser:
    """Parser test double that returns one fixed, single-chunk parse result."""

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Build a canned ``ParsedDocument`` for ``doc_id`` / ``doc_name``.

        ``file_path`` is accepted but never read; nothing is parsed from disk.
        """
        # The only vector chunk; its id is derived from the document id.
        vector_chunk = {
            "chunk_id": f"{doc_id}-chunk-1",
            "semantic_id": "semantic-1",
            "chunk_type": "section_text",
            "section_title": "第一章",
            "section_path": ["第一章"],
            "page_start": 1,
            "text": "法规正文",
            "embedding_text": "标准:测试\n章节:第一章\n\n法规正文",
        }
        semantic_block = {"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}
        layout = {"uniqueId": "layout-1", "type": "text"}
        parse_metadata = {"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1}

        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            raw_layouts=[layout],
            structure_nodes=[{"title": "第一章"}],
            semantic_blocks=[semantic_block],
            vector_chunks=[vector_chunk],
            parser_name="fake_parser",
            metadata=parse_metadata,
        )
|
|
|
|
|
|
class FakeChunkBuilder:
    """Chunk builder test double that emits exactly one canned ``Chunk``."""

    def build(self, *, parsed_document: ParsedDocument, regulation_type: str, version: str) -> list[Chunk]:
        """Return a one-element list holding a fixed chunk for ``parsed_document``.

        Only ``doc_id`` and ``doc_name`` are read from ``parsed_document``;
        ``regulation_type`` and ``version`` are copied onto the chunk unchanged.
        """
        only_chunk = Chunk(
            chunk_id=f"{parsed_document.doc_id}-chunk-1",
            doc_id=parsed_document.doc_id,
            doc_name=parsed_document.doc_name,
            content="法规正文",
            embedding_text="标准:测试\n章节:第一章\n\n法规正文",
            section_title="第一章",
            section_path=["第一章"],
            page_number=1,
            regulation_type=regulation_type,
            version=version,
            semantic_id="semantic-1",
            block_type="section_text",
            metadata={"source": "aliyun_vector_chunk"},
        )
        return [only_chunk]
|
|
|
|
|
|
class FakeEmbeddingProvider:
    """Embedding provider test double that returns constant vectors.

    Every batch passed to ``embed_texts`` is recorded in ``calls`` so tests can
    assert on exactly what was sent for embedding.
    """

    def __init__(self, dimension: int = 1024) -> None:
        """Create the fake.

        Args:
            dimension: Length of every vector produced. Defaults to 1024, the
                size this fake previously hard-coded.
        """
        self.dimension = dimension
        self.calls: list[list[str]] = []

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Record ``texts`` and return one constant vector per text.

        Returns:
            A list with one independent ``[0.1] * dimension`` vector per input.
        """
        # Record a snapshot rather than the caller's list, so later mutation
        # by the caller cannot rewrite the recorded call history.
        batch = list(texts)
        self.calls.append(batch)
        return [[0.1] * self.dimension for _ in batch]

    def embed_query(self, text: str) -> list[float]:
        """Return a constant ``[0.2] * dimension`` vector; ``text`` is ignored."""
        return [0.2] * self.dimension
|
|
|
|
|
|
class FakeVectorIndex:
    """Vector index test double that records upserts and returns canned answers."""

    def __init__(self) -> None:
        # One (chunks, vectors) pair per upsert call, in call order.
        self.upserts: list[tuple[list[Chunk], list[list[float]]]] = []

    def upsert(self, chunks: list[Chunk], vectors: list[list[float]]) -> int:
        """Record the call and report the number of chunks received."""
        recorded_call = (chunks, vectors)
        self.upserts.append(recorded_call)
        return len(chunks)

    def delete_by_document(self, doc_id: str) -> int:
        """Do nothing and report zero deletions."""
        deleted = 0
        return deleted

    def search(self, query_vector: list[float], top_k: int, filters: str | None = None):
        """Always return an empty result list; all arguments are ignored."""
        no_hits = []
        return no_hits

    def health(self) -> dict:
        """Return a fixed health payload naming the collection."""
        payload = {"collection_name": "regulations_dense_1024_v1"}
        return payload
|
|
|
|
|
|
def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
    """Run ``upload_and_process`` against fakes and check the indexing outcome.

    Verifies the returned result, the exact text sent for embedding, the
    shape of what reached the vector index (one chunk, one 1024-dimensional
    vector), and the status and metadata persisted on the stored document.
    """
    repository = FakeRepository()
    binary_store = FakeBinaryStore()
    embedding_provider = FakeEmbeddingProvider()
    vector_index = FakeVectorIndex()
    service = DocumentCommandService(
        document_repository=repository,
        binary_store=binary_store,
        parser=FakeParser(),
        chunk_builder=FakeChunkBuilder(),
        embedding_provider=embedding_provider,
        vector_index=vector_index,
    )

    result = service.upload_and_process(
        doc_id="doc12345",
        file_name="test.pdf",
        content=b"dummy pdf bytes",
        content_type="application/pdf",
        doc_name="测试法规",
        regulation_type="车辆安全",
        version="2026",
        generate_summary=False,
    )

    assert result.status == "indexed"
    assert result.num_chunks == 1

    # The embedding boundary must receive the chunk's embedding_text verbatim.
    assert embedding_provider.calls == [["标准:测试\n章节:第一章\n\n法规正文"]]

    # The test name promises a 1024-dimensional dense embedding, so check the
    # vectors that actually reached the index, not just the upsert count.
    assert len(vector_index.upserts) == 1
    upserted_chunks, upserted_vectors = vector_index.upserts[0]
    assert len(upserted_chunks) == 1
    assert len(upserted_vectors) == 1
    assert len(upserted_vectors[0]) == 1024

    stored = repository.get("doc12345")
    assert stored is not None
    assert stored.status == DocumentStatus.INDEXED
    assert stored.chunk_count == 1
    assert stored.parser_name == "fake_parser"
    assert stored.index_name == "regulations_dense_1024_v1"
    assert stored.metadata["parse_task_id"] == "task-123"
    assert stored.metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")
|
|
|
|
def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
    """Check that the bootstrap factories yield the Aliyun parser and chunk builder."""
    # Reset both factories through their cache_clear hooks before calling them,
    # so previously cached instances are not reused.
    for factory in (bootstrap.get_parser, bootstrap.get_chunk_builder):
        factory.cache_clear()

    built_parser = bootstrap.get_parser()
    built_chunk_builder = bootstrap.get_chunk_builder()

    # Compare by class name, as the concrete classes are not imported here.
    parser_class_name = built_parser.__class__.__name__
    assert parser_class_name == "AliyunDocumentParser"
    chunk_builder_class_name = built_chunk_builder.__class__.__name__
    assert chunk_builder_class_name == "AliyunVectorChunkBuilder"
|