Refactor document handling and update Milvus collection settings

- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the sqlalchemy dependency.
- Modified test cases to align with new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
This commit is contained in:
ash66
2026-05-26 20:21:31 +08:00
parent fec22a3a2c
commit 30c7bda389
42 changed files with 7482 additions and 569 deletions

View File

@@ -45,10 +45,10 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
context_tokens = 0
for idx, chunk in enumerate(retrieved_chunks, start=1):
block = (
f"[{idx}] 文档: {chunk.doc_name}\n"
f"[{idx}] 文档: {chunk.doc_title}\n"
f"章节: {chunk.section_title or '未标注'}\n"
f"页码: {chunk.page_number}\n"
f"内容: {chunk.content}"
f"页码: {chunk.page_start}" + (f"-{chunk.page_end}" if chunk.page_end and chunk.page_end != chunk.page_start else "") + "\n"
f"内容: {chunk.text}"
)
block_tokens = self._estimate_tokens(block)
if context_tokens + block_tokens > settings.rag_max_context_tokens:
@@ -73,10 +73,10 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
return False
estimated_total_tokens = sum(
self._estimate_tokens(
f"[{idx}] 文档: {chunk.doc_name}\n"
f"[{idx}] 文档: {chunk.doc_title}\n"
f"章节: {chunk.section_title or '未标注'}\n"
f"页码: {chunk.page_number}\n"
f"内容: {chunk.content}"
f"页码: {chunk.page_start}" + (f"-{chunk.page_end}" if chunk.page_end and chunk.page_end != chunk.page_start else "") + "\n"
f"内容: {chunk.text}"
)
for idx, chunk in enumerate(retrieved_chunks, start=1)
)
@@ -87,12 +87,17 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
return [
AnswerSource(
doc_id=chunk.doc_id,
doc_name=chunk.doc_name,
doc_title=chunk.doc_title,
chunk_id=chunk.chunk_id,
chunk_type=chunk.chunk_type,
section_title=chunk.section_title,
page_number=chunk.page_number,
page_start=chunk.page_start,
page_end=chunk.page_end,
section_level=chunk.section_level,
chunk_index=chunk.chunk_index,
piece_index=chunk.piece_index,
score=chunk.score,
content=chunk.content,
text=chunk.text,
metadata=chunk.metadata,
)
for chunk in chunks