Refactor document handling and update Milvus collection settings
- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the sqlalchemy dependency.
- Modified test cases to align with new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
This commit is contained in:
@@ -19,29 +19,35 @@ class AliyunVectorChunkBuilder(ChunkBuilder):
|
||||
"""Handle build for the Aliyun Vector Chunk Builder instance."""
|
||||
chunks: list[Chunk] = []
|
||||
for index, item in enumerate(parsed_document.vector_chunks):
|
||||
content = item.get("content") or item.get("text") or ""
|
||||
embedding_text = item.get("embedding_text") or content
|
||||
text = item.get("text") or ""
|
||||
embedding_text = item.get("embedding_text") or text
|
||||
if not embedding_text.strip():
|
||||
continue
|
||||
section_path = item.get("section_path") or []
|
||||
section_title = item.get("section_title") or (section_path[-1] if section_path else "")
|
||||
page_number = item.get("page_start") or item.get("page") or 0
|
||||
chunk_id = item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"
|
||||
metadata = {k: v for k, v in item.items() if k not in {"content", "embedding_text"}}
|
||||
metadata = dict(item)
|
||||
metadata["regulation_type"] = regulation_type
|
||||
metadata["version"] = version
|
||||
chunks.append(
|
||||
Chunk(
|
||||
chunk_id=str(chunk_id),
|
||||
doc_id=parsed_document.doc_id,
|
||||
doc_name=parsed_document.doc_name,
|
||||
content=content,
|
||||
doc_title=str(item.get("doc_title") or parsed_document.doc_name),
|
||||
text=text,
|
||||
embedding_text=embedding_text,
|
||||
chunk_type=str(item.get("chunk_type", item.get("block_type", ""))),
|
||||
chunk_index=int(item.get("chunk_index") or 0),
|
||||
piece_index=int(item.get("piece_index") or 0),
|
||||
page_start=int(item.get("page_start") or 0),
|
||||
page_end=int(item.get("page_end") or 0),
|
||||
section_title=section_title,
|
||||
section_path=section_path,
|
||||
page_number=int(page_number or 0),
|
||||
section_level=int(item.get("section_level") or len(section_path)),
|
||||
source_ids=[str(v) for v in item.get("source_ids", [])],
|
||||
regulation_type=regulation_type,
|
||||
version=version,
|
||||
semantic_id=item.get("semantic_id", ""),
|
||||
block_type=item.get("block_type", ""),
|
||||
metadata=metadata,
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user