Refactor document handling and update Milvus collection settings
- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the sqlalchemy dependency.
- Modified test cases to align with new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
This commit is contained in:
@@ -508,7 +508,7 @@ class DocumentQueryService:
|
||||
"""Return documents with real-time state from Milvus as the authoritative source.
|
||||
|
||||
Algorithm:
|
||||
1. Query Milvus for all doc metadata (doc_id, doc_name, chunk_count, …).
|
||||
1. Query Milvus for all doc metadata (doc_id, doc_title, chunk_count, …).
|
||||
2. Load JSON/PG metadata records and index them by doc_id.
|
||||
3. Merge: Milvus-present docs get status=INDEXED and live chunk_count;
|
||||
metadata-only docs with status=INDEXED are demoted to FAILED.
|
||||
@@ -536,8 +536,8 @@ class DocumentQueryService:
|
||||
doc.chunk_count = row["chunk_count"]
|
||||
doc.status = DocumentStatus.INDEXED
|
||||
# Backfill fields that may be missing from older JSON records.
|
||||
if not doc.doc_name and row.get("doc_name"):
|
||||
doc.doc_name = row["doc_name"]
|
||||
if not doc.doc_name and row.get("doc_title"):
|
||||
doc.doc_name = row["doc_title"]
|
||||
if not doc.regulation_type and row.get("regulation_type"):
|
||||
doc.regulation_type = row["regulation_type"]
|
||||
if not doc.version and row.get("version"):
|
||||
@@ -553,8 +553,8 @@ class DocumentQueryService:
|
||||
if doc_id not in meta_by_id:
|
||||
synthetic = Document(
|
||||
doc_id=doc_id,
|
||||
doc_name=row.get("doc_name", doc_id),
|
||||
file_name=row.get("doc_name", doc_id),
|
||||
doc_name=row.get("doc_title", doc_id),
|
||||
file_name=row.get("doc_title", doc_id),
|
||||
object_name="",
|
||||
content_type="",
|
||||
size_bytes=0,
|
||||
|
||||
@@ -29,11 +29,16 @@ def _reciprocal_rank_fusion(
|
||||
RetrievedChunk(
|
||||
chunk_id=chunk_map[ck].chunk_id,
|
||||
doc_id=chunk_map[ck].doc_id,
|
||||
doc_name=chunk_map[ck].doc_name,
|
||||
content=chunk_map[ck].content,
|
||||
doc_title=chunk_map[ck].doc_title,
|
||||
text=chunk_map[ck].text,
|
||||
score=scores[ck],
|
||||
chunk_type=chunk_map[ck].chunk_type,
|
||||
section_title=chunk_map[ck].section_title,
|
||||
page_number=chunk_map[ck].page_number,
|
||||
page_start=chunk_map[ck].page_start,
|
||||
page_end=chunk_map[ck].page_end,
|
||||
section_level=chunk_map[ck].section_level,
|
||||
chunk_index=chunk_map[ck].chunk_index,
|
||||
piece_index=chunk_map[ck].piece_index,
|
||||
metadata=chunk_map[ck].metadata,
|
||||
)
|
||||
for ck in sorted_keys
|
||||
|
||||
@@ -71,9 +71,9 @@ class PerceptionService:
|
||||
affected_docs.append(
|
||||
{
|
||||
"doc_id": chunk.doc_id,
|
||||
"doc_name": chunk.doc_name,
|
||||
"doc_title": chunk.doc_title,
|
||||
"score": round(float(chunk.score), 4),
|
||||
"snippet": (chunk.content or "")[:180],
|
||||
"snippet": (chunk.text or "")[:180],
|
||||
"clause": getattr(chunk, "section_title", "") or "",
|
||||
}
|
||||
)
|
||||
@@ -84,7 +84,7 @@ class PerceptionService:
|
||||
|
||||
# --- 2. Build context from retrieved chunks ---
|
||||
context_parts = [
|
||||
f"[文档{i}: {c.doc_name}]\n{(c.content or '')[:400]}"
|
||||
f"[文档{i}: {c.doc_title}]\n{(c.text or '')[:400]}"
|
||||
for i, c in enumerate(chunks[:5], 1)
|
||||
]
|
||||
context = "\n\n".join(context_parts) if context_parts else "(知识库中暂无相关文档)"
|
||||
|
||||
Reference in New Issue
Block a user