Refactor document handling and update Milvus collection settings

- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated the architecture documentation to reflect the new Milvus collection name.
- Removed the `sqlalchemy` dependency from the requirements.
- Modified test cases to align with the new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
2026-05-26 20:21:31 +08:00
parent fec22a3a2c
commit 30c7bda389
42 changed files with 7482 additions and 569 deletions
--- a/backend/app/infrastructure/parser/local_chunk_builder.py
+++ b/backend/app/infrastructure/parser/local_chunk_builder.py
@@ -10,6 +10,7 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
    """Adapt the existing markdown chunker to the new chunk builder port."""

    def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
+        """Initialize the local markdown chunk builder."""
        self.chunker = RegulationChunker(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
@@ -22,6 +23,7 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
+        """Build migrated chunk objects from the legacy markdown chunker output."""
        markdown_text = parsed_document.raw_text.strip()
        if not markdown_text:
            return []
@@ -50,16 +52,18 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
                Chunk(
                    chunk_id=item.metadata.chunk_id,
                    doc_id=parsed_document.doc_id,
-                    doc_name=parsed_document.doc_name,
-                    content=item.content,
+                    doc_title=parsed_document.doc_name,
+                    text=item.content,
                    embedding_text=item.content,
+                    chunk_type="local_markdown_chunk",
                    section_title=item.metadata.section_title or item.metadata.section_number,
                    section_path=section_path,
-                    page_number=item.metadata.page_number,
+                    page_start=item.metadata.page_number,
+                    page_end=item.metadata.page_number,
+                    section_level=len(section_path),
                    regulation_type=regulation_type,
                    version=version,
                    semantic_id=item.metadata.clause_number,
-                    block_type="local_markdown_chunk",
                    metadata=metadata,
                )
            )
--- a/backend/app/infrastructure/parser/vector_chunk_builder.py
+++ b/backend/app/infrastructure/parser/vector_chunk_builder.py
@@ -19,29 +19,35 @@ class AliyunVectorChunkBuilder(ChunkBuilder):
        """Handle build for the Aliyun Vector Chunk Builder instance."""
        chunks: list[Chunk] = []
        for index, item in enumerate(parsed_document.vector_chunks):
-            content = item.get("content") or item.get("text") or ""
-            embedding_text = item.get("embedding_text") or content
+            text = item.get("text") or ""
+            embedding_text = item.get("embedding_text") or text
            if not embedding_text.strip():
                continue
            section_path = item.get("section_path") or []
            section_title = item.get("section_title") or (section_path[-1] if section_path else "")
-            page_number = item.get("page_start") or item.get("page") or 0
            chunk_id = item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"
-            metadata = {k: v for k, v in item.items() if k not in {"content", "embedding_text"}}
+            metadata = dict(item)
+            metadata["regulation_type"] = regulation_type
+            metadata["version"] = version
            chunks.append(
                Chunk(
                    chunk_id=str(chunk_id),
                    doc_id=parsed_document.doc_id,
-                    doc_name=parsed_document.doc_name,
-                    content=content,
+                    doc_title=str(item.get("doc_title") or parsed_document.doc_name),
+                    text=text,
                    embedding_text=embedding_text,
+                    chunk_type=str(item.get("chunk_type", item.get("block_type", ""))),
+                    chunk_index=int(item.get("chunk_index") or 0),
+                    piece_index=int(item.get("piece_index") or 0),
+                    page_start=int(item.get("page_start") or 0),
+                    page_end=int(item.get("page_end") or 0),
                    section_title=section_title,
                    section_path=section_path,
-                    page_number=int(page_number or 0),
+                    section_level=int(item.get("section_level") or len(section_path)),
+                    source_ids=[str(v) for v in item.get("source_ids", [])],
                    regulation_type=regulation_type,
                    version=version,
                    semantic_id=item.get("semantic_id", ""),
-                    block_type=item.get("block_type", ""),
                    metadata=metadata,
                )
            )