Refactor document handling and update Milvus collection settings

- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the `sqlalchemy` dependency.
- Modified test cases to align with new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
2026-05-26 20:21:31 +08:00
parent fec22a3a2c
commit 30c7bda389
42 changed files with 7482 additions and 569 deletions
--- a/backend/app/infrastructure/llm/openai_compatible_answer_generator.py
+++ b/backend/app/infrastructure/llm/openai_compatible_answer_generator.py
@@ -45,10 +45,10 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
        context_tokens = 0
        for idx, chunk in enumerate(retrieved_chunks, start=1):
            block = (
-                f"[{idx}] 文档: {chunk.doc_name}\n"
+                f"[{idx}] 文档: {chunk.doc_title}\n"
                f"章节: {chunk.section_title or '未标注'}\n"
-                f"页码: {chunk.page_number}\n"
-                f"内容: {chunk.content}"
+                f"页码: {chunk.page_start}" + (f"-{chunk.page_end}" if chunk.page_end and chunk.page_end != chunk.page_start else "") + "\n"
+                f"内容: {chunk.text}"
            )
            block_tokens = self._estimate_tokens(block)
            if context_tokens + block_tokens > settings.rag_max_context_tokens:
@@ -73,10 +73,10 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
            return False
        estimated_total_tokens = sum(
            self._estimate_tokens(
-                f"[{idx}] 文档: {chunk.doc_name}\n"
+                f"[{idx}] 文档: {chunk.doc_title}\n"
                f"章节: {chunk.section_title or '未标注'}\n"
-                f"页码: {chunk.page_number}\n"
-                f"内容: {chunk.content}"
+                f"页码: {chunk.page_start}" + (f"-{chunk.page_end}" if chunk.page_end and chunk.page_end != chunk.page_start else "") + "\n"
+                f"内容: {chunk.text}"
            )
            for idx, chunk in enumerate(retrieved_chunks, start=1)
        )
@@ -87,12 +87,17 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
        return [
            AnswerSource(
                doc_id=chunk.doc_id,
-                doc_name=chunk.doc_name,
+                doc_title=chunk.doc_title,
                chunk_id=chunk.chunk_id,
+                chunk_type=chunk.chunk_type,
                section_title=chunk.section_title,
-                page_number=chunk.page_number,
+                page_start=chunk.page_start,
+                page_end=chunk.page_end,
+                section_level=chunk.section_level,
+                chunk_index=chunk.chunk_index,
+                piece_index=chunk.piece_index,
                score=chunk.score,
-                content=chunk.content,
+                text=chunk.text,
                metadata=chunk.metadata,
            )
            for chunk in chunks