Refactor document handling and update Milvus collection settings

- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the sqlalchemy dependency.
- Modified test cases to align with new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
This commit is contained in:
ash66
2026-05-26 20:21:31 +08:00
parent fec22a3a2c
commit 30c7bda389
42 changed files with 7482 additions and 569 deletions

2
.env
View File

@@ -9,7 +9,7 @@ DEBUG=false
# ===== Milvus向量数据库配置已有=====
MILVUS_HOST=6.86.80.8
MILVUS_PORT=19530
MILVUS_COLLECTION=regulations_dense_1024_v1
MILVUS_COLLECTION=regulations_dense_1024_v2
MILVUS_DB_NAME=default
MILVUS_INDEX_TYPE=IVF_FLAT
MILVUS_NLIST=128

View File

@@ -4,7 +4,7 @@
# ===== Milvus向量数据库配置已有=====
MILVUS_HOST=6.86.80.8
MILVUS_PORT=19530
MILVUS_COLLECTION=regulations_dense_1024_v1
MILVUS_COLLECTION=regulations_dense_1024_v2
MILVUS_DB_NAME=default
MILVUS_INDEX_TYPE=IVF_FLAT
MILVUS_NLIST=128

View File

@@ -9,7 +9,7 @@ DEBUG=false
# ===== Milvus向量数据库配置 =====
MILVUS_HOST=6.86.80.8
MILVUS_PORT=19530
MILVUS_COLLECTION=regulations_dense_1024_v1
MILVUS_COLLECTION=regulations_dense_1024_v2
MILVUS_DB_NAME=default
MILVUS_INDEX_TYPE=IVF_FLAT
MILVUS_NLIST=128

View File

@@ -105,7 +105,7 @@ ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
EMBEDDING_API_KEY=your_embedding_api_key_here
EMBEDDING_MODEL=text-embedding-v3
EMBEDDING_DIM=1536
EMBEDDING_DIM=1024
PARSER_BACKEND=aliyun
CHUNK_BACKEND=aliyun
PARSER_FAILURE_MODE=fail

View File

@@ -8,7 +8,7 @@
- ✅ PDF/DOC/DOCX 文档解析(阿里云文档智能)
- ✅ 基于阿里云 `vector_chunks` 的统一切片
- ✅ OpenAI 兼容 embedding`text-embedding-v3`1536维)
- ✅ OpenAI 兼容 embedding`text-embedding-v3`1024维)
- ✅ Milvus 向量数据库存储与 dense-only 检索
- ✅ FastAPI接口封装
@@ -97,7 +97,7 @@ curl -X POST http://localhost:8000/api/v1/knowledge/search \
|------|------|
| 文档解析 | 阿里云文档智能 + python-docx |
| 分块策略 | 阿里云 `vector_chunks` |
| 嵌入模型 | `text-embedding-v3`1536维 Dense |
| 嵌入模型 | `text-embedding-v3`1024维 Dense |
| 向量数据库 | Milvus 2.4本地Docker部署 |
| 检索方式 | Dense-only 检索 |
| API框架 | FastAPI |
@@ -119,7 +119,7 @@ CHUNK_BACKEND=aliyun
# embedding 配置
EMBEDDING_MODEL=text-embedding-v3
EMBEDDING_DIM=1536
EMBEDDING_DIM=1024
EMBEDDING_API_KEY=your_embedding_api_key_here
# 分块配置
@@ -142,7 +142,7 @@ CHUNK_SIZE=512
- `artifacts/{doc_id}/semantic_blocks.json`
- `artifacts/{doc_id}/vector_chunks.json`
当前默认 Milvus collection 为 `regulations_dense_1536_v2`
当前默认 Milvus collection 为 `regulations_dense_1024_v2`
## 许可证

View File

@@ -0,0 +1,8 @@
{
"permissions": {
"allow": [
"Bash(python3 *)",
"Bash(PGPASSWORD=postgresql123456 psql *)"
]
}
}

View File

@@ -0,0 +1,475 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
阿里云文档智能 API 解析 PDF输出三层结构 chunks
- structure_nodes: 目录树结构
- semantic_blocks: 语义块(章节文本、表格、图片)
- vector_chunks: 检索块(带 overlap 切分)
"""
import argparse
import json
import re
import time
from pathlib import Path
from typing import Dict, List
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_docmind_api20220711 import models as docmind_models
from alibabacloud_tea_util import models as util_models
# ===================== Aliyun configuration =====================
# Credentials are read from the environment so that no secret lives in the
# source tree.  NOTE(review): the key pair that used to be hard-coded here has
# been committed to version control and should be rotated.
import os

ALIBABA_ACCESS_KEY_ID = os.environ.get("ALIBABA_ACCESS_KEY_ID", "")
ALIBABA_ACCESS_KEY_SECRET = os.environ.get("ALIBABA_ACCESS_KEY_SECRET", "")
ALIBABA_ENDPOINT = "docmind-api.cn-hangzhou.aliyuncs.com"

# ===================== Chunking parameters =====================
MAX_CHARS = 600  # default maximum number of characters in one retrieval chunk
OVERLAP_CHARS = 80  # default number of characters shared by adjacent chunks

# ===================== Layout type constants =====================
TOC_TITLES = {"目次", "目录"}  # exact texts that mark a table-of-contents heading
TITLE_SUBTYPES = {"doc_title", "para_title"}
TEXT_SUBTYPES = {"para", "none"}
FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
# ===================== 阿里云 API 客户端 =====================
def init_client() -> DocmindClient:
    """Build a Document Mind client from the module-level Aliyun settings."""
    client_config = open_api_models.Config(
        access_key_secret=ALIBABA_ACCESS_KEY_SECRET,
        access_key_id=ALIBABA_ACCESS_KEY_ID,
    )
    # The endpoint is set as an attribute rather than passed to the constructor.
    client_config.endpoint = ALIBABA_ENDPOINT
    return DocmindClient(client_config)
def submit_job(client: DocmindClient, file_path: str) -> str:
    """Submit a document-parsing job and return its task id.

    The file is opened in binary mode and handed to the SDK as a stream.  The
    handle is closed as soon as the submit call returns or raises, instead of
    staying open for the rest of the process.

    Args:
        client: Document Mind client used to submit the job.
        file_path: Path of the document to upload.

    Returns:
        The ``id`` found in the response data.
    """
    path = Path(file_path)
    runtime = util_models.RuntimeOptions()
    with path.open("rb") as stream:
        request = docmind_models.SubmitDocParserJobAdvanceRequest(
            file_url_object=stream,
            file_name=path.name,
            file_name_extension=path.suffix.lstrip("."),
            llm_enhancement=True,
            enhancement_mode="VLM",
        )
        # Keep the stream open across the call; the SDK is presumably reading
        # it while submitting (NOTE(review): confirm against the SDK).
        response = client.submit_doc_parser_job_advance(request, runtime)
    return response.body.data.id
def query_status(client: DocmindClient, task_id: str) -> Dict:
    """Query a parsing job and return its status payload as a dict.

    Returns ``None`` when the response carries no data object.
    """
    status_request = docmind_models.QueryDocParserStatusRequest(id=task_id)
    payload = client.query_doc_parser_status(status_request).body.data
    if not payload:
        return None
    return payload.to_map()
def wait_for_completion(
    client: DocmindClient,
    task_id: str,
    poll_interval: int = 5,
    timeout: "float | None" = None,
) -> bool:
    """Poll the job status until the job finishes.

    Args:
        client: Document Mind client used for the status queries.
        task_id: Id returned when the job was submitted.
        poll_interval: Seconds to sleep between two status queries.
        timeout: Optional upper bound, in seconds, on the total wait.  ``None``
            (the default) waits indefinitely, as before.

    Returns:
        ``True`` when the job reports success; ``False`` when the status query
        returns no data, the job reports a failure, or ``timeout`` elapses.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        status_data = query_status(client, task_id)
        if not status_data:
            return False
        # ``Status`` may be missing or null; treat both as "not finished yet"
        # instead of crashing on ``None.lower()``.
        status = str(status_data.get("Status") or "").lower()
        if status == "success":
            return True
        # Accept both spellings so a service that reports "Fail" rather than
        # "Failed" cannot keep this loop polling forever.
        if status in ("fail", "failed"):
            print(f"任务失败: {status_data}")
            return False
        if deadline is not None and time.monotonic() >= deadline:
            print(f"任务超时: {status_data}")
            return False
        print(f"任务状态: {status}, 等待中...")
        time.sleep(poll_interval)
def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_step_size: int = 50) -> Dict:
    """Fetch one page of parsing results.

    ``layout_num`` and ``layout_step_size`` are forwarded to the request
    unchanged.  Returns the response data, or ``None`` when it is empty.
    """
    page_request = docmind_models.GetDocParserResultRequest(
        id=task_id,
        layout_num=layout_num,
        layout_step_size=layout_step_size,
    )
    data = client.get_doc_parser_result(page_request).body.data
    return data or None
def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: int = 50) -> List[Dict]:
    """Page through the parsing results and return every layout in order.

    Paging stops on an empty response, an empty ``layouts`` list, or a page
    shorter than ``layout_step_size``.
    """
    collected: List[Dict] = []
    offset = 0
    finished = False
    while not finished:
        page = get_result(client, task_id, offset, layout_step_size)
        batch = page.get("layouts", []) if page else []
        if batch:
            collected.extend(batch)
            # The next request starts after the layouts received so far.
            offset += len(batch)
        finished = not batch or len(batch) < layout_step_size
    return collected
# ===================== 文本处理 =====================
def normalize_text(text: str) -> str:
    """Normalise whitespace in a piece of extracted text.

    Carriage returns become newlines, non-breaking and ideographic spaces
    become plain spaces, runs of newlines collapse to one newline, runs of
    spaces/tabs collapse to one space, and the result is stripped.

    Args:
        text: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The normalised text, or ``""`` for ``None``/empty input.
    """
    if not text:
        return ""
    text = text.replace("\r", "\n")
    # Spelled as escapes so the characters being replaced stay visible in the
    # source; the ``[ \t]+`` pattern below would not match them otherwise.
    for odd_space in ("\u00a0", "\u3000"):
        text = text.replace(odd_space, " ")
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()
def get_page(layout: Dict) -> int:
    """Return the layout's page number.

    ``pageNum`` wins when that key is present; otherwise ``pageNumber`` is
    used, and ``0`` when neither key exists.
    """
    if "pageNum" in layout:
        return layout["pageNum"]
    return layout.get("pageNumber", 0)
def get_text(layout: Dict) -> str:
    """Return the layout's normalised text.

    The ``text`` field is preferred; ``markdownContent`` is the fallback when
    ``text`` normalises to an empty string.
    """
    plain = normalize_text(layout.get("text", ""))
    return plain or normalize_text(layout.get("markdownContent", ""))
# ===================== 布局类型判断 =====================
def is_title(layout: Dict) -> bool:
    """Tell whether the layout is a heading, by ``type`` or by ``subType``."""
    if layout.get("type") == "title":
        return True
    return layout.get("subType") in TITLE_SUBTYPES
def is_text(layout: Dict) -> bool:
    """Tell whether the layout is body text.

    The ``type`` must be ``"text"`` and the ``subType`` (``"none"`` when the
    key is absent) must be one of ``TEXT_SUBTYPES``.
    """
    if layout.get("type") != "text":
        return False
    return layout.get("subType", "none") in TEXT_SUBTYPES
def is_figure(layout: Dict) -> bool:
    """Tell whether the layout belongs to a figure, by ``type`` or ``subType``."""
    if layout.get("type") in FIGURE_TYPES:
        return True
    return layout.get("subType") in FIGURE_SUBTYPES
def is_table(layout: Dict) -> bool:
    """Tell whether the layout's ``type`` is ``"table"``."""
    layout_type = layout.get("type")
    return layout_type == "table"
def is_toc_layout(layout: Dict) -> bool:
    """Tell whether the layout is part of a table of contents.

    A layout qualifies when its text is exactly a table-of-contents heading,
    or when it sits on page 1 and looks like a numbered entry followed by
    leader dots and a page number.
    """
    content = get_text(layout)
    if content in TOC_TITLES:
        return True
    on_first_page = get_page(layout) == 1
    looks_like_entry = re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", content) is not None
    return on_first_page and looks_like_entry
def extract_table_text(layout: Dict) -> str:
    """Flatten a table layout into text, one line per non-empty cell.

    Within a cell, the normalised texts of its inner layouts are joined with
    single spaces; cells with no text are skipped.
    """
    lines = []
    for cell in layout.get("cells", []):
        fragments = [normalize_text(inner.get("text", "")) for inner in cell.get("layouts", [])]
        joined = " ".join(fragment for fragment in fragments if fragment)
        if joined:
            lines.append(joined)
    return "\n".join(lines).strip()
# ===================== 结构层:目录树 =====================
def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
    """Build the flat list of heading nodes (the structure layer).

    Only title layouts with non-empty text are kept, and table-of-contents
    headings are dropped.
    """

    def to_node(layout: Dict, title: str) -> Dict:
        # Key order is kept stable because the dict is written to JSON as-is.
        return {
            "unique_id": layout.get("uniqueId"),
            "page": get_page(layout),
            "index": layout.get("index", 0),
            "level": layout.get("level", 0),
            "title": title,
            "type": layout.get("type"),
            "sub_type": layout.get("subType"),
        }

    nodes = []
    for layout in layouts:
        title = get_text(layout) if is_title(layout) else ""
        if title and title not in TOC_TITLES:
            nodes.append(to_node(layout, title))
    return nodes
# ===================== 语义层:章节内容 =====================
def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
    """Push a heading onto the section stack, popping same-or-deeper levels first.

    The stack is modified in place and also returned.
    """
    entry = {
        "level": layout.get("level", 0),
        "title": get_text(layout),
        "page": get_page(layout),
        "unique_id": layout.get("uniqueId"),
    }
    # Drop every heading that is not a strict ancestor of the new one.
    while section_stack:
        if section_stack[-1]["level"] < entry["level"]:
            break
        section_stack.pop()
    section_stack.append(entry)
    return section_stack
def section_path_titles(section_stack: List[Dict]) -> List[str]:
    """Return the titles of the stacked headings, outermost first."""
    titles = []
    for entry in section_stack:
        titles.append(entry["title"])
    return titles
def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id: int) -> int:
    """Merge pending text items into one ``section_text`` block.

    Args:
        blocks: Pending text items collected since the last flush.
        semantic_blocks: Output list; the merged block is appended to it.
        block_id: Number to use for the new block's ``semantic_id``.

    Returns:
        ``block_id + 1`` when a block was appended, otherwise ``block_id``
        unchanged (nothing pending, or only empty text).
    """
    merged_text = "\n".join(item["text"] for item in blocks if item["text"]).strip()
    if not merged_text:
        return block_id

    first = blocks[0]  # section context is taken from the first pending item
    pages = [item["page"] for item in blocks]
    source_ids = [item["unique_id"] for item in blocks if item.get("unique_id")]
    semantic_blocks.append(
        {
            "semantic_id": f"semantic-{block_id}",
            "block_type": "section_text",
            "page_start": min(pages),
            "page_end": max(pages),
            "section_path": first["section_path"],
            "section_level": first["section_level"],
            "section_title": first["section_title"],
            "source_ids": source_ids,
            "text": merged_text,
        }
    )
    return block_id + 1
def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
    """Group layouts into semantic blocks (section text, tables, figures).

    Table-of-contents layouts are skipped, along with everything else on
    page 1 that follows one.  Titles update the current section path.  Tables
    and figures become standalone blocks.  Consecutive body-text layouts are
    buffered and merged into one ``section_text`` block whenever a title,
    table, figure or the end of the input is reached.
    """
    semantic_blocks: List[Dict] = []
    section_stack: List[Dict] = []
    pending: List[Dict] = []
    next_id = 1
    in_toc = False

    for layout in layouts:
        text = get_text(layout)
        page = get_page(layout)

        if is_toc_layout(layout):
            in_toc = True
            continue
        if in_toc:
            if page == 1:
                continue
            # First layout past page 1 ends the table-of-contents skip.
            in_toc = False

        if is_title(layout):
            next_id = flush_text_block(pending, semantic_blocks, next_id)
            pending = []
            section_stack = update_section_path(section_stack, layout)
            continue

        path = section_path_titles(section_stack)
        # Insertion order matters: these keys are spliced into output dicts.
        context = {
            "section_path": path,
            "section_level": len(path),
            "section_title": path[-1] if path else "未分类",
        }

        if is_table(layout):
            standalone_kind = "table"
        elif is_figure(layout):
            standalone_kind = "figure"
        else:
            standalone_kind = None

        if standalone_kind is not None:
            # A table or figure ends the running text block.
            next_id = flush_text_block(pending, semantic_blocks, next_id)
            pending = []
            body = extract_table_text(layout) if standalone_kind == "table" else text
            if body:
                semantic_blocks.append(
                    {
                        "semantic_id": f"semantic-{next_id}",
                        "block_type": standalone_kind,
                        "page_start": page,
                        "page_end": page,
                        **context,
                        "source_ids": [layout.get("uniqueId")],
                        "text": body,
                    }
                )
                next_id += 1
            continue

        if is_text(layout) and text:
            pending.append(
                {
                    "page": page,
                    "text": text,
                    "unique_id": layout.get("uniqueId"),
                    **context,
                }
            )

    # Emit whatever text is still buffered at the end of the document.
    flush_text_block(pending, semantic_blocks, next_id)
    return semantic_blocks
# ===================== 检索层:向量 chunks =====================
def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> List[str]:
    """Split text into windows of at most ``max_chars`` characters.

    Consecutive windows share ``overlap_chars`` characters.  Each window is
    stripped and empty windows are dropped.

    Args:
        text: Text to split; surrounding whitespace is ignored.
        max_chars: Maximum window length; must be positive for non-empty text.
        overlap_chars: Characters shared by adjacent windows.  Values outside
            ``0 .. max_chars - 1`` are clamped into that range so that every
            window starts strictly after the previous one.

    Returns:
        The list of windows; ``[]`` for empty text.

    Raises:
        ValueError: If ``max_chars`` is not positive and the text is non-empty.
    """
    text = text.strip()
    if not text:
        return []
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if len(text) <= max_chars:
        return [text]

    # Without the clamp, an overlap >= max_chars would never advance the
    # window start and the loop below would not terminate.
    overlap = min(max(overlap_chars, 0), max_chars - 1)
    step = max_chars - overlap

    parts = []
    start = 0
    while True:
        end = min(len(text), start + max_chars)
        piece = text[start:end].strip()
        if piece:
            parts.append(piece)
        if end >= len(text):
            break
        start += step
    return parts
def build_vector_chunks(
    semantic_blocks: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> List[Dict]:
    """Cut semantic blocks into retrieval chunks (the vector layer).

    Each block's text is split with overlap.  Every piece becomes one chunk
    that carries the block's metadata plus an ``embedding_text`` made of a
    header (document title and, when present, section path) and the piece.
    Chunk numbering starts at 1 and runs across all blocks.
    """
    vector_chunks: List[Dict] = []
    for block in semantic_blocks:
        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
        for piece_index, piece in enumerate(pieces, start=1):
            header_lines = [f"标准:{doc_title}"]
            if block["section_path"]:
                header_lines.append(f"章节:{' > '.join(block['section_path'])}")
            header = "\n".join(header_lines) + "\n\n"

            # NOTE(review): ids restart at 1 on every call; check uniqueness
            # if chunks from several calls share one store.
            chunk_index = len(vector_chunks) + 1
            vector_chunks.append(
                {
                    "doc_id": doc_id,
                    "doc_title": doc_title,
                    "chunk_id": f"chunk-{chunk_index}",
                    "chunk_index": chunk_index,
                    "semantic_id": block["semantic_id"],
                    "chunk_type": block["block_type"],
                    "piece_index": piece_index,
                    "page_start": block["page_start"],
                    "page_end": block["page_end"],
                    "section_path": block["section_path"],
                    "section_level": block["section_level"],
                    "section_title": block["section_title"],
                    "source_ids": block["source_ids"],
                    "text": piece,
                    "embedding_text": header + piece,
                }
            )
    return vector_chunks
# ===================== 主转换函数 =====================
def convert_layouts(
    layouts: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> Dict:
    """Convert raw layouts into the three-layer document structure.

    Returns a dict with the document id and title plus ``structure_nodes``,
    ``semantic_blocks`` and ``vector_chunks``.
    """
    result: Dict = {"doc_id": doc_id, "doc_title": doc_title}
    result["structure_nodes"] = build_structure_nodes(layouts)
    result["semantic_blocks"] = build_semantic_blocks(layouts)
    # The vector layer is derived from the semantic layer, not from raw layouts.
    result["vector_chunks"] = build_vector_chunks(
        result["semantic_blocks"],
        doc_id=doc_id,
        doc_title=doc_title,
        max_chars=max_chars,
        overlap_chars=overlap_chars,
    )
    return result
# ===================== CLI 入口 =====================
def main() -> None:
    """Command-line entry point: parse a PDF with Aliyun and write the chunk JSON.

    Steps: submit the job, wait for it, download all layouts, optionally dump
    the raw layouts, convert them to the three-layer structure, and write the
    result.  Returns early, after printing a message, when the job does not
    succeed.

    Raises:
        FileNotFoundError: If the given PDF path does not exist.
    """
    cli = argparse.ArgumentParser(description="阿里云文档智能解析 PDF输出三层结构 chunks")
    cli.add_argument("pdf_path", help="PDF 文件路径")
    cli.add_argument("--out", default="vector_chunks.json", help="输出 JSON 文件路径")
    cli.add_argument("--layouts-out", dest="layouts_output", help="输出原始 layouts JSON")
    cli.add_argument("--doc-id", default="GB14747-2006", help="文档 ID")
    cli.add_argument("--doc-title", default="GB 14747—2006 儿童三轮车安全要求", help="文档标题")
    cli.add_argument("--max-chars", type=int, default=MAX_CHARS, help="单个检索 chunk 最大字符数")
    cli.add_argument("--overlap-chars", type=int, default=OVERLAP_CHARS, help="相邻检索 chunk 重叠字符数")
    cli.add_argument("--poll-interval", type=int, default=5, help="轮询间隔(秒)")
    options = cli.parse_args()

    source_pdf = Path(options.pdf_path).expanduser().resolve()
    if not source_pdf.exists():
        raise FileNotFoundError(f"PDF 文件不存在: {source_pdf}")

    # 1. Submit the Aliyun job.
    client = init_client()
    print(f"提交任务: {source_pdf}")
    task_id = submit_job(client, str(source_pdf))
    print(f"任务 ID: {task_id}")

    # 2. Wait for the job to finish.
    print("等待任务完成...")
    succeeded = wait_for_completion(client, task_id, options.poll_interval)
    if not succeeded:
        print("任务失败,退出")
        return

    # 3. Download every layout.
    print("获取解析结果...")
    layouts = collect_all_results(client, task_id)
    print(f"获取到 {len(layouts)} 个布局块")

    # 4. Optionally dump the raw layouts.
    if options.layouts_output:
        raw_target = Path(options.layouts_output).expanduser().resolve()
        raw_json = json.dumps(layouts, ensure_ascii=False, indent=2)
        raw_target.write_text(raw_json, encoding="utf-8")
        print(f"原始 layouts 已写入: {raw_target}")

    # 5. Convert to the three-layer structure.
    print("转换为三层结构...")
    data = convert_layouts(
        layouts,
        doc_id=options.doc_id,
        doc_title=options.doc_title,
        max_chars=options.max_chars,
        overlap_chars=options.overlap_chars,
    )

    # 6. Write the result and print a summary.
    target = Path(options.out).expanduser().resolve()
    target.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"结构层节点数: {len(data['structure_nodes'])}")
    print(f"语义层块数: {len(data['semantic_blocks'])}")
    print(f"检索层块数: {len(data['vector_chunks'])}")
    print(f"输出文件: {target}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,115 @@
"""Rebuild the migrated Milvus collection from saved vector chunks."""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, utility
# Default value of the --collection option.
DEFAULT_COLLECTION = "regulations_dense_1024_v2"
# Default value of the --dim option (dimension of the embedding vector field).
DEFAULT_DIM = 1024
def build_collection(name: str, dim: int) -> Collection:
    """Drop any existing collection called ``name`` and create it afresh.

    The new collection has a fixed schema (dynamic fields disabled) and an
    IVF_FLAT / COSINE index on the ``embedding`` field.
    """
    if utility.has_collection(name):
        utility.drop_collection(name)

    def varchar(field_name: str, max_length: int, **extra) -> FieldSchema:
        return FieldSchema(name=field_name, dtype=DataType.VARCHAR, max_length=max_length, **extra)

    def int64(field_name: str) -> FieldSchema:
        return FieldSchema(name=field_name, dtype=DataType.INT64)

    # Field order is significant: rows are inserted as positional columns.
    fields = [
        varchar("id", 128, is_primary=True, auto_id=False),
        varchar("doc_id", 64),
        varchar("doc_title", 256),
        varchar("chunk_id", 128),
        int64("chunk_index"),
        int64("piece_index"),
        varchar("text", 65535),
        varchar("embedding_text", 65535),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
        varchar("semantic_id", 128),
        varchar("chunk_type", 64),
        int64("page_start"),
        int64("page_end"),
        int64("section_level"),
        varchar("source_ids", 4096),
        varchar("section_path", 4096),
        varchar("section_title", 512),
        varchar("metadata_json", 65535),
        int64("created_at"),
    ]
    schema = CollectionSchema(
        fields=fields,
        description="Dense-only regulations index",
        enable_dynamic_field=False,
    )
    collection = Collection(name=name, schema=schema)

    index_params = {
        "metric_type": "COSINE",
        "index_type": "IVF_FLAT",
        "params": {"nlist": 128},
    }
    collection.create_index(field_name="embedding", index_params=index_params)
    return collection
def load_chunks(payload_path: Path) -> list[dict]:
    """Read vector chunks from a JSON file.

    The file may hold either a bare list of chunks or an object with a
    ``vector_chunks`` key; a missing key yields an empty list.

    Raises:
        ValueError: If the resolved chunk container is not a list.
    """
    raw = payload_path.read_text(encoding="utf-8")
    payload = json.loads(raw)
    chunks = payload.get("vector_chunks", []) if isinstance(payload, dict) else payload
    if isinstance(chunks, list):
        return chunks
    raise ValueError("vector chunk payload must be a list or a dict containing vector_chunks")
def main() -> None:
    """Rebuild the target collection from a vector chunk payload.

    The payload is read and validated before the existing collection is
    dropped, so an unreadable file or chunks without usable embeddings cannot
    destroy a healthy collection.  An empty payload still recreates the
    collection and leaves it empty.

    Raises:
        ValueError: If a chunk is not an object, lacks a required field, or
            carries an embedding whose length differs from ``--dim``.
    """
    parser = argparse.ArgumentParser(description="Rebuild the migrated Milvus collection.")
    parser.add_argument("--host", default="127.0.0.1", help="Milvus host")
    parser.add_argument("--port", default="19530", help="Milvus port")
    parser.add_argument("--collection", default=DEFAULT_COLLECTION, help="Milvus collection name")
    parser.add_argument("--dim", type=int, default=DEFAULT_DIM, help="Embedding dimension")
    parser.add_argument("--payload", required=True, help="Path to vector_chunks.json or a compatible JSON file")
    args = parser.parse_args()

    # Validate first: build_collection() drops the existing collection, so
    # every check that can fail has to run before it.
    chunks = load_chunks(Path(args.payload))
    required_fields = ("chunk_id", "doc_id", "doc_title", "embedding")
    for position, chunk in enumerate(chunks):
        if not isinstance(chunk, dict):
            raise ValueError(f"vector chunk #{position} is not a JSON object")
        missing = [field for field in required_fields if field not in chunk]
        if missing:
            raise ValueError(f"vector chunk #{position} is missing required field(s): {', '.join(missing)}")
        embedding = chunk["embedding"]
        if not isinstance(embedding, list) or len(embedding) != args.dim:
            raise ValueError(f"vector chunk #{position} must carry an embedding list of length {args.dim}")

    connections.connect("default", host=args.host, port=args.port)
    collection = build_collection(args.collection, args.dim)
    if not chunks:
        print("No vector chunks found; collection was created but remains empty.")
        return

    # One list per schema field, in the order the fields were declared.
    data = [
        [chunk["chunk_id"] for chunk in chunks],
        [chunk["doc_id"] for chunk in chunks],
        [chunk["doc_title"] for chunk in chunks],
        [chunk["chunk_id"] for chunk in chunks],
        [int(chunk.get("chunk_index", 0) or 0) for chunk in chunks],
        [int(chunk.get("piece_index", 0) or 0) for chunk in chunks],
        [str(chunk.get("text", ""))[:65535] for chunk in chunks],
        [str(chunk.get("embedding_text", chunk.get("text", "")))[:65535] for chunk in chunks],
        [chunk["embedding"] for chunk in chunks],
        [str(chunk.get("semantic_id", "")) for chunk in chunks],
        [str(chunk.get("chunk_type", "")) for chunk in chunks],
        [int(chunk.get("page_start", 0) or 0) for chunk in chunks],
        [int(chunk.get("page_end", 0) or 0) for chunk in chunks],
        [int(chunk.get("section_level", 0) or 0) for chunk in chunks],
        [json.dumps(chunk.get("source_ids", []), ensure_ascii=False) for chunk in chunks],
        [json.dumps(chunk.get("section_path", []), ensure_ascii=False) for chunk in chunks],
        [str(chunk.get("section_title", "")) for chunk in chunks],
        [json.dumps(chunk, ensure_ascii=False) for chunk in chunks],
        [int(chunk.get("created_at", 0) or 0) for chunk in chunks],
    ]
    collection.insert(data)
    collection.flush()
    collection.load()
    print(f"Rebuilt collection {args.collection} with {len(chunks)} chunks.")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,122 @@
-- 法规文档向量检索系统数据库表结构
-- PostgreSQL
-- ==================== Documents table ====================
-- One row of metadata per document; doc_id is the key referenced by the other tables.
CREATE TABLE documents (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) UNIQUE NOT NULL, -- unique document identifier, e.g. "GB14747-2006"
title VARCHAR(512) NOT NULL, -- document title
doc_type VARCHAR(32), -- document type: standard / regulation / specification
standard_number VARCHAR(64), -- standard number, e.g. "GB 14747-2006"
publish_date DATE, -- publication date
implement_date DATE, -- implementation date
status VARCHAR(32), -- status: in force / repealed / revised
source_url VARCHAR(512), -- source URL
file_path VARCHAR(512), -- local PDF file path
file_size INT, -- file size in bytes
upload_time TIMESTAMP DEFAULT NOW(), -- upload time
created_at TIMESTAMP DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW()
);
COMMENT ON TABLE documents IS '文档元数据表';
COMMENT ON COLUMN documents.doc_id IS '文档唯一标识,用于关联 Milvus 和其他表';
COMMENT ON COLUMN documents.standard_number IS '标准编号,如 GB 14747-2006';
-- ==================== Section structure table ====================
-- Heading tree of a document; parent_id points back into this table.
CREATE TABLE sections (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
unique_id VARCHAR(64) NOT NULL, -- unique identifier returned by Aliyun
level INT NOT NULL, -- nesting level: 1, 2, 3...
title VARCHAR(512) NOT NULL, -- section title
page INT, -- page the section is on
index INT, -- order within the page
parent_id INT, -- parent section id (tree structure)
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_sections_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_sections_parent_id FOREIGN KEY (parent_id) REFERENCES sections(id),
CONSTRAINT uq_sections_doc_unique UNIQUE (doc_id, unique_id)
);
COMMENT ON TABLE sections IS '章节结构表,用于目录导航';
COMMENT ON COLUMN sections.parent_id IS '父章节 ID构建树形结构';
COMMENT ON COLUMN sections.level IS '层级深度1 为最顶层';
-- ==================== Semantic blocks table ====================
-- Full, unsplit content blocks; (doc_id, semantic_id) is the key that vector_chunks references.
CREATE TABLE semantic_blocks (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
semantic_id VARCHAR(64) NOT NULL, -- semantic block identifier (unique per document)
block_type VARCHAR(32) NOT NULL, -- type: section_text / table / figure
page_start INT NOT NULL, -- first page
page_end INT NOT NULL, -- last page
section_id INT, -- owning section
section_title VARCHAR(512), -- section title (denormalised for easier querying)
section_level INT, -- section level
source_ids JSONB, -- original layout ids (JSON array)
text TEXT NOT NULL, -- full content (not split)
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_semantic_blocks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_semantic_blocks_section_id FOREIGN KEY (section_id) REFERENCES sections(id),
CONSTRAINT uq_semantic_blocks_doc_semantic UNIQUE (doc_id, semantic_id)
);
COMMENT ON TABLE semantic_blocks IS '语义块表,用于邻域扩展,恢复完整内容';
COMMENT ON COLUMN semantic_blocks.block_type IS '类型section_text正文、table表格、figure图示';
COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组';
COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容,未被切分';
-- ==================== Vector chunk metadata table ====================
-- Relational metadata for each chunk stored in Milvus.
CREATE TABLE vector_chunks (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
chunk_id VARCHAR(64) NOT NULL, -- Milvus primary key (unique here only together with doc_id)
semantic_id VARCHAR(64) NOT NULL, -- semantic block this chunk was cut from
chunk_index INT NOT NULL, -- chunk sequence number (global)
piece_index INT, -- sequence number within the semantic block
page_start INT,
page_end INT,
section_title VARCHAR(512),
text VARCHAR(2048), -- chunk text (optional, shortened version for display)
source_ids JSONB, -- original layout ids (JSON array)
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_vector_chunks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_vector_chunks_semantic_id FOREIGN KEY (doc_id, semantic_id)
REFERENCES semantic_blocks(doc_id, semantic_id),
CONSTRAINT uq_vector_chunks_doc_chunk UNIQUE (doc_id, chunk_id)
);
COMMENT ON TABLE vector_chunks IS '向量块元数据表,用于快速关联查询';
COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键';
COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号,用于按序拼接';
-- ==================== Indexes ====================
-- Secondary indexes on the lookup columns of sections.
CREATE INDEX idx_sections_doc_id ON sections(doc_id);
CREATE INDEX idx_sections_parent_id ON sections(parent_id);
CREATE INDEX idx_sections_level ON sections(level);
-- Secondary indexes on the lookup columns of semantic_blocks.
CREATE INDEX idx_semantic_blocks_doc_id ON semantic_blocks(doc_id);
CREATE INDEX idx_semantic_blocks_section_id ON semantic_blocks(section_id);
CREATE INDEX idx_semantic_blocks_block_type ON semantic_blocks(block_type);
CREATE INDEX idx_semantic_blocks_semantic_id ON semantic_blocks(semantic_id);
-- Secondary indexes on the lookup columns of vector_chunks.
CREATE INDEX idx_vector_chunks_doc_id ON vector_chunks(doc_id);
CREATE INDEX idx_vector_chunks_semantic_id ON vector_chunks(semantic_id);
CREATE INDEX idx_vector_chunks_chunk_id ON vector_chunks(chunk_id);
-- ==================== Trigger: keep updated_at current ====================
-- Trigger function that stamps the row being written with the current time.
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = NOW();
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Run the function before every row update on documents.
CREATE TRIGGER tr_documents_updated_at
BEFORE UPDATE ON documents
FOR EACH ROW EXECUTE FUNCTION update_updated_at();

View File

@@ -0,0 +1,327 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
将 vector_chunks.json 向量化并上传到 Milvus 和 PostgreSQL
使用中转站的 OpenAI 兼容 API
"""
import argparse
import json
import time
from pathlib import Path
from typing import List, Dict
import psycopg2
from psycopg2.extras import execute_values
from pymilvus import (
connections,
Collection,
FieldSchema,
CollectionSchema,
DataType,
utility,
)
from openai import OpenAI
# ===================== Configuration =====================
# Secrets are read from the environment so that none is stored in the source
# tree; they can still be overridden with --api-key / --pg-password.
# NOTE(review): the API key and database password that used to be hard-coded
# here have been committed to version control and should be rotated.
import os

# Relay (OpenAI-compatible gateway) settings
RELAY_BASE_URL = "http://6.86.80.4:30080/v1"
RELAY_API_KEY = os.environ.get("EMBEDDING_API_KEY", "")
EMBEDDING_MODEL = "text-embedding-v3"  # embedding model served by the relay

# Milvus settings
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"
COLLECTION_NAME = "regulation_chunks"

# PostgreSQL settings
PG_HOST = "6.86.80.10"
PG_PORT = 5432
PG_USER = "postgresql"
PG_PASSWORD = os.environ.get("PGPASSWORD", "")  # same variable name libpq tools such as psql read
PG_DATABASE = "postgres"
# ===================== Embedding =====================
def get_openai_client(api_key: str, base_url: str) -> OpenAI:
    """Create an OpenAI SDK client that talks to the given base URL."""
    relay_client = OpenAI(base_url=base_url, api_key=api_key)
    return relay_client
def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10) -> List[List[float]]:
    """Embed texts in batches and return one vector per text, in input order.

    Args:
        client: OpenAI-compatible client used for the embedding requests.
        texts: Texts to embed.
        batch_size: Number of texts sent per request; must be at least 1.

    Returns:
        The embedding vectors, aligned with ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If a response holds a different number of vectors than
            the batch that was sent.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    total_batches = (len(texts) + batch_size - 1) // batch_size
    all_embeddings: List[List[float]] = []
    for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
        batch = texts[start:start + batch_size]
        print(f"Embedding batch {batch_number}/{total_batches}...")
        response = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=batch,
        )
        items = list(response.data)
        # A short or long response would silently shift every later vector
        # onto the wrong chunk, so fail loudly instead.
        if len(items) != len(batch):
            raise RuntimeError(
                f"embedding batch {batch_number} returned {len(items)} vectors for {len(batch)} inputs"
            )
        # Restore input order when each item carries an integer ``index``;
        # otherwise keep the order the service returned.
        if all(isinstance(getattr(item, "index", None), int) for item in items):
            items.sort(key=lambda item: item.index)
        all_embeddings.extend(item.embedding for item in items)
    return all_embeddings
# ===================== Milvus =====================
def init_milvus(host: str, port: str):
    """Open the "default" Milvus connection and print the endpoint."""
    endpoint = f"{host}:{port}"
    connections.connect("default", host=host, port=port)
    print(f"已连接 Milvus: {endpoint}")
def create_collection(name: str, dim: int) -> Collection:
    """Drop any existing collection called ``name`` and create it afresh.

    The new collection uses ``chunk_id`` as primary key and gets an
    IVF_FLAT / COSINE index on the ``embedding`` field.
    """
    if utility.has_collection(name):
        print(f"Collection '{name}' 已存在,删除重建")
        utility.drop_collection(name)

    def varchar(field_name: str, max_length: int, **extra) -> FieldSchema:
        return FieldSchema(name=field_name, dtype=DataType.VARCHAR, max_length=max_length, **extra)

    def int64(field_name: str) -> FieldSchema:
        return FieldSchema(name=field_name, dtype=DataType.INT64)

    # Field order is significant: rows are inserted as positional columns.
    fields = [
        varchar("chunk_id", 64, is_primary=True),
        varchar("doc_id", 128),
        varchar("doc_title", 512),
        int64("chunk_index"),
        varchar("semantic_id", 64),
        varchar("chunk_type", 32),
        int64("page_start"),
        int64("page_end"),
        varchar("section_title", 512),
        varchar("text", 2048),
        varchar("source_ids", 4096),  # JSON-encoded list
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
    ]
    schema = CollectionSchema(fields, description="法规文档检索 chunks")
    collection = Collection(name, schema)

    # Vector index on the embedding field.
    collection.create_index(
        "embedding",
        {
            "metric_type": "COSINE",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 128},
        },
    )
    print(f"Collection '{name}' 创建完成,索引已建立")
    return collection
def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[List[float]]):
    """Insert chunks and their vectors into the collection, then flush.

    Data is passed column-wise: one list per schema field, in schema order.
    """
    plain_columns = (
        "chunk_id",
        "doc_id",
        "doc_title",
        "chunk_index",
        "semantic_id",
        "chunk_type",
        "page_start",
        "page_end",
        "section_title",
        "text",
    )
    data = [[chunk[column] for chunk in chunks] for column in plain_columns]
    # source_ids is stored as a JSON string; the vector column goes last.
    data.append([json.dumps(chunk.get("source_ids", [])) for chunk in chunks])
    data.append(embeddings)

    collection.insert(data)
    collection.flush()
    print(f"已插入 {len(chunks)} 个 chunks")
def load_collection(collection: Collection):
    """Load the collection and print a confirmation."""
    collection.load()
    message = "Collection 已加载到内存"
    print(message)
# ===================== PostgreSQL =====================
def get_pg_connection(host: str, port: int, user: str, password: str, database: str):
    """Open a PostgreSQL connection with the given settings and return it."""
    settings = {
        "host": host,
        "port": port,
        "user": user,
        "password": password,
        "database": database,
    }
    connection = psycopg2.connect(**settings)
    print(f"已连接 PostgreSQL: {host}:{port}/{database}")
    return connection
def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
    """Upsert the document, its semantic blocks and chunk metadata.

    All statements run in one transaction: it is committed at the end and
    rolled back if anything raises.  The cursor is always closed.
    """
    cursor = conn.cursor()
    try:
        doc_id = doc_data["doc_id"]

        # 1. Document row (title is refreshed on conflict).
        cursor.execute(
            """
            INSERT INTO documents (doc_id, title, standard_number, upload_time)
            VALUES (%s, %s, %s, NOW())
            ON CONFLICT (doc_id) DO UPDATE SET title = EXCLUDED.title, updated_at = NOW()
            """,
            (doc_id, doc_data["doc_title"], doc_data.get("standard_number")),
        )

        # 2. Semantic blocks (inserted before chunks, which reference them).
        semantic_blocks = doc_data.get("semantic_blocks", [])
        if semantic_blocks:
            block_rows = []
            for block in semantic_blocks:
                block_rows.append(
                    (
                        doc_id,
                        block["semantic_id"],
                        block["block_type"],
                        block["page_start"],
                        block["page_end"],
                        block.get("section_title"),
                        block.get("section_level"),
                        json.dumps(block.get("source_ids", [])),
                        block["text"],
                    )
                )
            execute_values(
                cursor,
                """
                INSERT INTO semantic_blocks
                (doc_id, semantic_id, block_type, page_start, page_end, section_title, section_level, source_ids, text)
                VALUES %s
                ON CONFLICT (doc_id, semantic_id) DO UPDATE SET text = EXCLUDED.text
                """,
                block_rows,
            )
            print(f"已插入 {len(semantic_blocks)} 个语义块")

        # 3. Vector chunk metadata.
        chunk_rows = []
        for chunk in chunks:
            chunk_rows.append(
                (
                    doc_id,
                    chunk["chunk_id"],
                    chunk["semantic_id"],
                    chunk["chunk_index"],
                    chunk.get("piece_index"),
                    chunk["page_start"],
                    chunk["page_end"],
                    chunk.get("section_title"),
                    chunk["text"],
                    json.dumps(chunk.get("source_ids", [])),
                )
            )
        execute_values(
            cursor,
            """
            INSERT INTO vector_chunks
            (doc_id, chunk_id, semantic_id, chunk_index, piece_index, page_start, page_end, section_title, text, source_ids)
            VALUES %s
            ON CONFLICT (doc_id, chunk_id) DO UPDATE SET text = EXCLUDED.text
            """,
            chunk_rows,
        )
        print(f"已插入 {len(chunks)} 个向量块元数据")

        conn.commit()
        print("PostgreSQL 数据插入完成")
    except Exception:
        # Undo the partial transaction, then let the caller see the error.
        conn.rollback()
        raise
    finally:
        cursor.close()
# ===================== 主流程 =====================
def load_data(file_path: Path) -> Dict:
    """Read the UTF-8 JSON file at ``file_path`` and return the parsed content."""
    raw = file_path.read_text(encoding="utf-8")
    return json.loads(raw)
def upload_to_milvus_and_pg(
    chunks_file: str,
    api_key: str,
    base_url: str,
    milvus_host: str,
    milvus_port: str,
    collection_name: str,
    batch_size: int,
    pg_host: str,
    pg_port: int,
    pg_user: str,
    pg_password: str,
    pg_database: str,
):
    """Embed the chunks of a JSON file and store them in Milvus and PostgreSQL.

    The PostgreSQL connection is closed even when a later step fails, and the
    number of vectors is checked against the number of chunks before the
    Milvus collection is dropped and rebuilt.

    Raises:
        FileNotFoundError: If ``chunks_file`` does not exist.
        ValueError: If the file holds no ``vector_chunks``, or the embedding
            step returns a different number of vectors than chunks.
    """
    # 1. Load the whole document.
    chunks_path = Path(chunks_file).expanduser().resolve()
    if not chunks_path.exists():
        raise FileNotFoundError(f"文件不存在: {chunks_path}")
    data = load_data(chunks_path)
    chunks = data.get("vector_chunks", [])
    if not chunks:
        raise ValueError("vector_chunks 为空")
    print(f"加载 {len(chunks)} 个 chunks")

    # 2. Open the connections.
    client = get_openai_client(api_key, base_url)
    init_milvus(milvus_host, milvus_port)
    pg_conn = get_pg_connection(pg_host, pg_port, pg_user, pg_password, pg_database)
    try:
        # 3. Compute the embeddings.
        texts = [c["embedding_text"] for c in chunks]
        embeddings = get_embeddings_batch(client, texts, batch_size)
        print(f"生成 {len(embeddings)} 个向量")
        if len(embeddings) != len(chunks):
            raise ValueError(f"expected {len(chunks)} embeddings, got {len(embeddings)}")

        # 4. The collection dimension follows the vectors actually returned.
        embedding_dim = len(embeddings[0])
        print(f"Embedding 维度: {embedding_dim}")

        # 5. Rebuild the collection and insert into Milvus.
        collection = create_collection(collection_name, embedding_dim)
        insert_chunks(collection, chunks, embeddings)
        load_collection(collection)

        # 6. Insert into PostgreSQL.
        insert_chunks_to_pg(pg_conn, chunks, data)
    finally:
        # 7. Always release the database connection.
        pg_conn.close()
    print("上传完成!")
# ===================== CLI =====================
def main():
    """Parse the command line and run the Milvus + PostgreSQL upload.

    ``chunks_file`` is the only positional argument. Every option falls back
    to a module-level constant, except ``--batch-size`` which defaults to 10.
    """
    parser = argparse.ArgumentParser(description="将 vector_chunks 向量化并上传到 Milvus 和 PostgreSQL")
    parser.add_argument("chunks_file", help="vector_chunks.json 文件路径")

    # Options declared as data, in the order they are registered with argparse.
    option_table = (
        ("--api-key", {"default": RELAY_API_KEY, "help": "中转站 API Key"}),
        ("--base-url", {"default": RELAY_BASE_URL, "help": "中转站 Base URL"}),
        ("--milvus-host", {"default": MILVUS_HOST, "help": "Milvus host"}),
        ("--milvus-port", {"default": MILVUS_PORT, "help": "Milvus port"}),
        ("--collection", {"default": COLLECTION_NAME, "help": "Milvus collection 名称"}),
        ("--batch-size", {"type": int, "default": 10, "help": "Embedding 批量大小中转站限制最大10"}),
        ("--pg-host", {"default": PG_HOST, "help": "PostgreSQL host"}),
        ("--pg-port", {"type": int, "default": PG_PORT, "help": "PostgreSQL port"}),
        ("--pg-user", {"default": PG_USER, "help": "PostgreSQL user"}),
        ("--pg-password", {"default": PG_PASSWORD, "help": "PostgreSQL password"}),
        ("--pg-database", {"default": PG_DATABASE, "help": "PostgreSQL database"}),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    options = parser.parse_args()
    upload_to_milvus_and_pg(
        chunks_file=options.chunks_file,
        api_key=options.api_key,
        base_url=options.base_url,
        milvus_host=options.milvus_host,
        milvus_port=options.milvus_port,
        collection_name=options.collection,
        batch_size=options.batch_size,
        pg_host=options.pg_host,
        pg_port=options.pg_port,
        pg_user=options.pg_user,
        pg_password=options.pg_password,
        pg_database=options.pg_database,
    )
if __name__ == "__main__":
main()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,263 @@
# 文档解析与向量检索说明
## 相关文件
- `aliyun_doc_parser.py`:调用阿里云文档智能解析 PDF,生成原始 `layouts.json`
- `layouts_to_vector_chunks.py`:把 `layouts.json` 转成适合向量数据库入库的三层结构
- `layouts.json`:阿里云返回的原始布局结果
- `vector_chunks.json`:转换后的结构化输出
## 一、`layouts.json` 的结构
`layouts.json` 顶层是一个数组,每个元素代表一个布局块(layout)。常见字段如下:
- `type`:主类型,例如 `title`、`text`、`table`、`figure`
- `subType`:更细的语义类型,例如 `doc_title`、`para_title`、`para`、`picture`、`pic_title`、`pic_caption`
- `text`:当前布局块的纯文本
- `markdownContent`:带 markdown 标记的文本
- `pageNum`:页码
- `index`:页内顺序
- `level`:标题层级
- `uniqueId`:布局块唯一标识
- `blocks`:更细粒度的文本与样式信息
- `cells`:表格单元格,仅 `table` 类型存在
这个结构不是简单 OCR 文本流,而是已经带有版面理解和语义分类的结构化数据。
## 二、推荐的三层转换结构
### 1. 结构层 `structure_nodes`
结构层用于恢复文档标题树,不直接作为最终向量检索单元。
示例:
- `1 范围`
- `2 规范性引用文件`
- `3 术语和定义`
- `3.1 儿童三轮车`
- `3.2 轮距`
结构层主要用于给下游 chunk 绑定 `section_path`
### 2. 语义层 `semantic_blocks`
语义层是按文档意义聚合后的内容块,主要分为三类:
- `section_text`:同一章节下连续正文聚合而成
- `table`:表格内容单独成块
- `figure`:图、图名、图注等单独成块
这一层比单 layout 更适合做语义理解,也适合后续做上下文扩展。
### 3. 检索层 `vector_chunks`
检索层是最终写进向量数据库的 chunk。
处理方式:
-`semantic_blocks` 中较短的块直接入库
- 对较长的块按 `max_chars` 再切分
- 相邻切片保留 `overlap_chars` 重叠
- 每个 chunk 都带完整 metadata,便于后续过滤、重排和邻域扩展
## 三、当前转换脚本做了什么
`layouts_to_vector_chunks.py` 当前已经实现:
1. 过滤目录页噪声(如 `目次`)
2. 根据标题层级维护章节路径
3. 将正文聚合成 `section_text`
4. 将表格单独转成 `table`
5. 将图相关内容单独转成 `figure`
6. 对长文本继续切分为最终 `vector_chunks`
7. 为每个检索 chunk 生成 `embedding_text`
## 四、为什么不要直接按 layout 入库
如果把 `layouts.json` 的每条 layout 直接做向量:
- 颗粒度太碎
- 标题和正文容易分离
- 表格会丢失结构上下文
- 图示信息无法完整表达
- 检索命中结果噪声较大
对于标准文档,最合适的单位通常不是“句子”,而是“条款语义块”。
## 五、建议的入库字段
建议向量数据库每条记录至少保存:
- `embedding_text`:用于生成向量
- `text`:原始 chunk 文本
- `chunk_id`
- `semantic_id`
- `chunk_type`:`section_text` / `table` / `figure`
- `section_path`
- `section_title`
- `section_level`
- `page_start`
- `page_end`
- `doc_id`
- `doc_title`
- `source_ids`
其中:
- 向量化字段:`embedding_text`
- 展示字段:`text`
- 检索增强字段:其余 metadata
## 六、推荐的检索方式
不要只做最简单的 top-k 向量搜索,建议采用:
**向量召回 + metadata 重排 + 邻域扩展**
### 1. 向量召回
使用 `vector_chunks[*].embedding_text` 做 embedding,并在向量数据库中检索 top 10 ~ 15 条。
查询时可以对用户问题做轻微改写,例如:
原问题:
`儿童三轮车的定义是什么?`
可改写为:
`请检索 GB 14747—2006 儿童三轮车安全要求 中关于“儿童三轮车定义”的条款、术语、表格或图示说明。`
这样更适合标准文档检索。
### 2. metadata 重排
向量召回后,根据 metadata 做轻量规则重排。
常见规则:
- `chunk_type == section_text`:对定义类、要求类问题优先级更高
- `section_path` 命中查询关键词:例如查询“定义”时,`术语和定义` 章节优先
- `chunk_type == table`:对“尺寸 / 参数 / 数值 / 对照 / 要求”类问题加权
- `chunk_type == figure`:对“图 / 结构 / 状态 / 示意”类问题加权
### 3. 邻域扩展
检索命中的是最终切片,但回答往往需要更完整上下文。
建议命中某个 `vector_chunk` 后:
1. 优先回捞同一个 `semantic_id` 下的所有 chunk
2. 如果还不够,再补充同 `section_path`、相邻页码或相邻 `chunk_index` 的内容
这样可以恢复完整条款,而不是只给模型一小段碎片。
## 七、不同问题的检索重点
### 1. 定义类问题
例如:
- `儿童三轮车的定义是什么?`
- `轮距是什么意思?`
优先检索:
- `section_text`
- `section_path` 中包含 `术语和定义` 的内容
### 2. 要求类问题
例如:
- `外露突出物有什么要求?`
- `辅助推杆有哪些安全要求?`
优先检索:
- `section_text`
- `table`
### 3. 数值 / 尺寸 / 对照类问题
例如:
- `鞍座到脚蹬距离要求是什么?`
- `哪些项目需要满足规定尺寸?`
优先检索:
- `table`
- `section_text`
### 4. 图示说明类问题
例如:
- `正常乘骑状态是什么意思?`
- `图1表示什么?`
优先检索:
- `figure`
- 同章节相邻 `section_text`
## 八、推荐的最终检索流程
建议采用以下固定流程:
1.`vector_chunks.embedding_text` 做 embedding 检索
2. 取 top 10 ~ 15 条候选
3.`chunk_type + section_path` 做规则重排
4.`semantic_id` 为中心回捞完整语义块
5. 选 3 ~ 5 组上下文提供给大模型回答
## 九、给大模型的上下文组织方式
最终不要直接把原始 JSON 扔给模型,建议整理成如下格式:
```text
[命中片段 1]
章节:3 术语和定义 > 3.1 儿童三轮车
页码:1-2
类型:section_text
内容:
......
[命中片段 2]
章节:4 要求 > 4.3 外露突出物
页码:5
类型:section_text
内容:
......
[命中片段 3]
章节:5 试验方法
页码:8
类型:table
内容:
......
```
这种格式更利于模型稳定回答并引用出处。
## 十、转换命令
生成三层结构:
```bash
python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
--layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
--out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json
```
自定义切片大小:
```bash
python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
--layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
--out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json \
--max-chars 500 \
--overlap-chars 80
```

View File

@@ -3,6 +3,7 @@
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request
from fastapi.encoders import jsonable_encoder
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from loguru import logger
@@ -12,6 +13,7 @@ from app.api.routes import api_router
from app.config.logging import setup_logging
from app.config.settings import settings
from app.shared.bootstrap import cleanup_runtime_dependencies, preload_runtime_dependencies
from app.shared.errors import VectorStoreSchemaError
# Keep module behavior explicit so the backend flow stays easy to audit.
@@ -55,16 +57,33 @@ app.add_middleware(
app.include_router(api_router, prefix="/api/v1")
@app.exception_handler(VectorStoreSchemaError)
async def vector_store_schema_exception_handler(request: Request, exc: VectorStoreSchemaError):
    """Log a VectorStoreSchemaError and answer with an HTTP 500 ``ErrorResponse`` JSON body."""
    logger.error(f"向量库 schema 异常: {exc}")
    payload = ErrorResponse(
        error="VectorStoreSchemaError",
        message=str(exc),
    )
    return JSONResponse(status_code=500, content=jsonable_encoder(payload))
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
"""Global exception handler."""
logger.error(f"未处理的异常: {exc}")
return JSONResponse(
status_code=500,
content=ErrorResponse(
error="InternalServerError",
message=str(exc),
).model_dump(),
content=jsonable_encoder(
ErrorResponse(
error="InternalServerError",
message=str(exc),
)
),
)

View File

@@ -7,6 +7,7 @@ from .knowledge import router as knowledge_router
from .agent import router as agent_router
from .status import router as status_router
from .perception import router as perception_router
from .rag import router as rag_router
# Keep package boundaries explicit so backend imports stay predictable.
@@ -20,6 +21,7 @@ api_router.include_router(agent_router)
api_router.include_router(compliance_router)
api_router.include_router(status_router)
api_router.include_router(perception_router)
api_router.include_router(rag_router)
__all__ = [
"api_router",
@@ -29,4 +31,5 @@ __all__ = [
"compliance_router",
"status_router",
"perception_router",
"rag_router",
]

View File

@@ -29,14 +29,19 @@ async def search_knowledge(request: SearchRequest):
results=[
SearchResultItem(
id=index + 1,
content=item.content,
content=item.text,
score=item.score,
metadata={
"doc_id": item.doc_id,
"doc_name": item.doc_name,
"doc_title": item.doc_title,
"chunk_id": item.chunk_id,
"chunk_type": item.chunk_type,
"section_title": item.section_title,
"page_number": item.page_number,
"page_start": item.page_start,
"page_end": item.page_end,
"section_level": item.section_level,
"chunk_index": item.chunk_index,
"piece_index": item.piece_index,
**item.metadata,
},
)

View File

@@ -50,8 +50,8 @@ async def rag_chat(request: RagChatRequest):
{
"id": str(s.get("chunk_id") or s.get("doc_id") or idx + 1),
"score": s.get("score", 0),
"preview": s.get("content", "")[:200],
"doc_name": s.get("doc_name", ""),
"preview": s.get("text", s.get("content", ""))[:200],
"doc_name": s.get("doc_title", s.get("doc_name", "")),
"clause": s.get("section_title", "法规片段"),
"doc_id": s.get("doc_id"),
"download_url": (

View File

@@ -508,7 +508,7 @@ class DocumentQueryService:
"""Return documents with real-time state from Milvus as the authoritative source.
Algorithm:
1. Query Milvus for all doc metadata (doc_id, doc_name, chunk_count, …).
1. Query Milvus for all doc metadata (doc_id, doc_title, chunk_count, …).
2. Load JSON/PG metadata records and index them by doc_id.
3. Merge: Milvus-present docs get status=INDEXED and live chunk_count;
metadata-only docs with status=INDEXED are demoted to FAILED.
@@ -536,8 +536,8 @@ class DocumentQueryService:
doc.chunk_count = row["chunk_count"]
doc.status = DocumentStatus.INDEXED
# Backfill fields that may be missing from older JSON records.
if not doc.doc_name and row.get("doc_name"):
doc.doc_name = row["doc_name"]
if not doc.doc_name and row.get("doc_title"):
doc.doc_name = row["doc_title"]
if not doc.regulation_type and row.get("regulation_type"):
doc.regulation_type = row["regulation_type"]
if not doc.version and row.get("version"):
@@ -553,8 +553,8 @@ class DocumentQueryService:
if doc_id not in meta_by_id:
synthetic = Document(
doc_id=doc_id,
doc_name=row.get("doc_name", doc_id),
file_name=row.get("doc_name", doc_id),
doc_name=row.get("doc_title", doc_id),
file_name=row.get("doc_title", doc_id),
object_name="",
content_type="",
size_bytes=0,

View File

@@ -29,11 +29,16 @@ def _reciprocal_rank_fusion(
RetrievedChunk(
chunk_id=chunk_map[ck].chunk_id,
doc_id=chunk_map[ck].doc_id,
doc_name=chunk_map[ck].doc_name,
content=chunk_map[ck].content,
doc_title=chunk_map[ck].doc_title,
text=chunk_map[ck].text,
score=scores[ck],
chunk_type=chunk_map[ck].chunk_type,
section_title=chunk_map[ck].section_title,
page_number=chunk_map[ck].page_number,
page_start=chunk_map[ck].page_start,
page_end=chunk_map[ck].page_end,
section_level=chunk_map[ck].section_level,
chunk_index=chunk_map[ck].chunk_index,
piece_index=chunk_map[ck].piece_index,
metadata=chunk_map[ck].metadata,
)
for ck in sorted_keys

View File

@@ -71,9 +71,9 @@ class PerceptionService:
affected_docs.append(
{
"doc_id": chunk.doc_id,
"doc_name": chunk.doc_name,
"doc_title": chunk.doc_title,
"score": round(float(chunk.score), 4),
"snippet": (chunk.content or "")[:180],
"snippet": (chunk.text or "")[:180],
"clause": getattr(chunk, "section_title", "") or "",
}
)
@@ -84,7 +84,7 @@ class PerceptionService:
# --- 2. Build context from retrieved chunks ---
context_parts = [
f"[文档{i}: {c.doc_name}]\n{(c.content or '')[:400]}"
f"[文档{i}: {c.doc_title}]\n{(c.text or '')[:400]}"
for i, c in enumerate(chunks[:5], 1)
]
context = "\n\n".join(context_parts) if context_parts else "(知识库中暂无相关文档)"

View File

@@ -33,7 +33,7 @@ class Settings(BaseSettings):
# Keep configuration setup explicit so runtime behavior is easy to reason about.
milvus_host: str = Field(default="6.86.80.8", description="Milvus服务地址")
milvus_port: int = Field(default=19530, description="Milvus服务端口")
milvus_collection: str = Field(default="regulations_dense_1024_v1", description="法规向量集合名称")
milvus_collection: str = Field(default="regulations_dense_1024_v2", description="法规向量集合名称")
milvus_db_name: str = Field(default="default", description="Milvus数据库名称")
# Keep configuration setup explicit so runtime behavior is easy to reason about.

View File

@@ -27,7 +27,7 @@ class Settings(BaseSettings):
# Milvus
milvus_host: str = "6.86.80.8"
milvus_port: int = 19530
milvus_collection: str = "regulations_dense_1024_v1"
milvus_collection: str = "regulations_dense_1024_v2"
# LLM / embedding defaults aligned with the migrated backend path.
llm_model: str = "qwen-max"
@@ -47,7 +47,7 @@ class Settings(BaseSettings):
api_port: int = 8000
# Legacy aliases retained for old utility modules.
regulations_collection: str = "regulations_dense_1024_v1"
regulations_collection: str = "regulations_dense_1024_v2"
compliance_collection: str = "compliance_cache"
# Preserve the legacy module API while keeping env resolution centralized at the repo root.

View File

@@ -8,18 +8,91 @@ from typing import Any
@dataclass
@dataclass(init=False)
class AnswerSource:
"""Represent answer source data."""
"""Represent answer source data with legacy aliases."""
doc_id: str
doc_name: str
doc_title: str
chunk_id: str
chunk_type: str
section_title: str
page_number: int
page_start: int
page_end: int
section_level: int
chunk_index: int
piece_index: int
score: float
content: str
text: str
metadata: dict[str, Any] = field(default_factory=dict)
def __init__(
    self,
    *,
    doc_id: str,
    doc_title: str | None = None,
    chunk_id: str,
    chunk_type: str = "",
    section_title: str = "",
    page_start: int = 0,
    page_end: int = 0,
    section_level: int = 0,
    chunk_index: int = 0,
    piece_index: int = 0,
    score: float = 0.0,
    text: str | None = None,
    metadata: dict[str, Any] | None = None,
    doc_name: str | None = None,
    content: str | None = None,
    page_number: int | None = None,
    **_: Any,
) -> None:
    """Build an answer source from canonical fields, legacy aliases, or a mix of both.

    ``doc_name``, ``content`` and ``page_number`` are the legacy spellings of
    ``doc_title``, ``text`` and ``page_start``. A legacy value is used only
    when its canonical counterpart is absent (``None`` for the two strings,
    falsy for the page). Unrecognised keyword arguments are accepted and dropped.
    """
    if doc_title is None:
        doc_title = doc_name or ""
    if text is None:
        text = content or ""
    first_page = int(page_start or page_number or 0)
    # A missing or zero end page collapses the range onto the start page.
    last_page = int(page_end or first_page)
    level = int(section_level or 0)
    chunk_pos = int(chunk_index or 0)
    piece_pos = int(piece_index or 0)
    numeric_score = float(score)

    self.doc_id = doc_id
    self.doc_title = doc_title
    self.chunk_id = chunk_id
    self.chunk_type = chunk_type
    self.section_title = section_title
    self.page_start = first_page
    self.page_end = last_page
    self.section_level = level
    self.chunk_index = chunk_pos
    self.piece_index = piece_pos
    self.score = numeric_score
    self.text = text
    # Copy so later mutation of the caller's dict does not leak into this object.
    self.metadata = dict(metadata or {})
@property
def doc_name(self) -> str:
    """Legacy read alias: returns the current value of ``doc_title``."""
    return self.doc_title

@doc_name.setter
def doc_name(self, value: str) -> None:
    """Legacy write alias: stores ``value`` in ``doc_title``."""
    self.doc_title = value
@property
def content(self) -> str:
    """Legacy read alias: returns the current value of ``text``."""
    return self.text

@content.setter
def content(self, value: str) -> None:
    """Legacy write alias: stores ``value`` in ``text``."""
    self.text = value
@property
def page_number(self) -> int:
    """Legacy read alias: the single page number is the start of the page range."""
    return self.page_start

@page_number.setter
def page_number(self, value: int) -> None:
    """Legacy write alias: move ``page_start`` to ``value`` and keep ``page_end`` at or after it."""
    self.page_start = value
    # The end page is only ever pushed forward, never pulled back.
    if value > self.page_end:
        self.page_end = value
@dataclass
class ConversationMessage:

View File

@@ -60,23 +60,117 @@ class ParsedDocument:
metadata: dict[str, Any] = field(default_factory=dict)
@dataclass
@dataclass(init=False)
class Chunk:
"""Represent the Chunk type."""
"""Represent one retrieval chunk with backward-compatible aliases."""
chunk_id: str
doc_id: str
doc_name: str
content: str
doc_title: str
text: str
embedding_text: str
chunk_type: str = ""
chunk_index: int = 0
piece_index: int = 0
page_start: int = 0
page_end: int = 0
section_title: str = ""
section_path: list[str] = field(default_factory=list)
page_number: int = 0
section_level: int = 0
source_ids: list[str] = field(default_factory=list)
regulation_type: str = ""
version: str = ""
semantic_id: str = ""
block_type: str = ""
metadata: dict[str, Any] = field(default_factory=dict)
def __init__(
    self,
    *,
    chunk_id: str,
    doc_id: str,
    doc_title: str | None = None,
    text: str | None = None,
    embedding_text: str = "",
    chunk_type: str = "",
    chunk_index: int = 0,
    piece_index: int = 0,
    page_start: int = 0,
    page_end: int = 0,
    section_title: str = "",
    section_path: list[str] | None = None,
    section_level: int = 0,
    source_ids: list[str] | None = None,
    regulation_type: str = "",
    version: str = "",
    semantic_id: str = "",
    metadata: dict[str, Any] | None = None,
    doc_name: str | None = None,
    content: str | None = None,
    page_number: int | None = None,
    block_type: str | None = None,
    **_: Any,
) -> None:
    """Build a chunk from canonical fields, legacy aliases, or a mix of both.

    ``doc_name``, ``content``, ``page_number`` and ``block_type`` are the
    legacy spellings of ``doc_title``, ``text``, ``page_start`` and
    ``chunk_type``; each is used only when its canonical counterpart is
    absent (``None`` for title/text, falsy for page/type). An empty
    ``embedding_text`` falls back to the resolved ``text``. Unrecognised
    keyword arguments are accepted and dropped.
    """
    if doc_title is None:
        doc_title = doc_name or ""
    if text is None:
        text = content or ""
    if not embedding_text:
        embedding_text = text
    if not chunk_type:
        chunk_type = block_type or ""
    chunk_pos = int(chunk_index or 0)
    piece_pos = int(piece_index or 0)
    first_page = int(page_start or page_number or 0)
    # A missing or zero end page collapses the range onto the start page.
    last_page = int(page_end or first_page)

    self.chunk_id = chunk_id
    self.doc_id = doc_id
    self.doc_title = doc_title
    self.text = text
    self.embedding_text = embedding_text
    self.chunk_type = chunk_type
    self.chunk_index = chunk_pos
    self.piece_index = piece_pos
    self.page_start = first_page
    self.page_end = last_page
    self.section_title = section_title
    # Containers are copied so the chunk never shares them with the caller.
    self.section_path = list(section_path or [])
    self.section_level = int(section_level or 0)
    self.source_ids = list(source_ids or [])
    self.regulation_type = regulation_type
    self.version = version
    self.semantic_id = semantic_id
    self.metadata = dict(metadata or {})
@property
def doc_name(self) -> str:
    """Legacy read alias: returns the current value of ``doc_title``."""
    return self.doc_title

@doc_name.setter
def doc_name(self, value: str) -> None:
    """Legacy write alias: stores ``value`` in ``doc_title``."""
    self.doc_title = value
@property
def content(self) -> str:
    """Legacy read alias: returns the current value of ``text``."""
    return self.text

@content.setter
def content(self, value: str) -> None:
    """Legacy write alias: stores ``value`` in ``text``."""
    self.text = value
@property
def page_number(self) -> int:
    """Legacy read alias: the single page number is the start of the page range."""
    return self.page_start

@page_number.setter
def page_number(self, value: int) -> None:
    """Legacy write alias: move ``page_start`` to ``value`` and keep ``page_end`` at or after it."""
    self.page_start = value
    # The end page is only ever pushed forward, never pulled back.
    if value > self.page_end:
        self.page_end = value
@property
def block_type(self) -> str:
    """Legacy read alias: returns the current value of ``chunk_type``."""
    return self.chunk_type

@block_type.setter
def block_type(self, value: str) -> None:
    """Legacy write alias: stores ``value`` in ``chunk_type``."""
    self.chunk_type = value
@dataclass
class DocumentProcessingRun:

View File

@@ -16,14 +16,88 @@ class RetrievalQuery:
filters: str | None = None
@dataclass
@dataclass(init=False)
class RetrievedChunk:
"""Represent the Retrieved Chunk type."""
"""Represent the retrieved chunk payload with legacy aliases."""
chunk_id: str
doc_id: str
doc_name: str
content: str
doc_title: str
text: str
score: float
chunk_type: str = ""
section_title: str = ""
page_number: int = 0
page_start: int = 0
page_end: int = 0
section_level: int = 0
chunk_index: int = 0
piece_index: int = 0
metadata: dict[str, Any] = field(default_factory=dict)
def __init__(
    self,
    *,
    chunk_id: str,
    doc_id: str,
    doc_title: str | None = None,
    text: str | None = None,
    score: float = 0.0,
    chunk_type: str = "",
    section_title: str = "",
    page_start: int = 0,
    page_end: int = 0,
    section_level: int = 0,
    chunk_index: int = 0,
    piece_index: int = 0,
    metadata: dict[str, Any] | None = None,
    doc_name: str | None = None,
    content: str | None = None,
    page_number: int | None = None,
    block_type: str | None = None,
    **_: Any,
) -> None:
    """Build a retrieved chunk from canonical fields, legacy aliases, or a mix of both.

    ``doc_name``, ``content``, ``page_number`` and ``block_type`` are the
    legacy spellings of ``doc_title``, ``text``, ``page_start`` and
    ``chunk_type``; each is used only when its canonical counterpart is
    absent (``None`` for title/text, falsy for page/type). Unrecognised
    keyword arguments are accepted and dropped.
    """
    if doc_title is None:
        doc_title = doc_name or ""
    if text is None:
        text = content or ""
    numeric_score = float(score)
    if not chunk_type:
        chunk_type = block_type or ""
    first_page = int(page_start or page_number or 0)
    # A missing or zero end page collapses the range onto the start page.
    last_page = int(page_end or first_page)

    self.chunk_id = chunk_id
    self.doc_id = doc_id
    self.doc_title = doc_title
    self.text = text
    self.score = numeric_score
    self.chunk_type = chunk_type
    self.section_title = section_title
    self.page_start = first_page
    self.page_end = last_page
    self.section_level = int(section_level or 0)
    self.chunk_index = int(chunk_index or 0)
    self.piece_index = int(piece_index or 0)
    # Copy so later mutation of the caller's dict does not leak into this object.
    self.metadata = dict(metadata or {})
@property
def doc_name(self) -> str:
    """Legacy read alias: returns the current value of ``doc_title``."""
    return self.doc_title

@doc_name.setter
def doc_name(self, value: str) -> None:
    """Legacy write alias: stores ``value`` in ``doc_title``."""
    self.doc_title = value
@property
def content(self) -> str:
    """Legacy read alias: returns the current value of ``text``."""
    return self.text

@content.setter
def content(self, value: str) -> None:
    """Legacy write alias: stores ``value`` in ``text``."""
    self.text = value
@property
def page_number(self) -> int:
    """Legacy read alias: the single page number is the start of the page range."""
    return self.page_start

@page_number.setter
def page_number(self, value: int) -> None:
    """Legacy write alias: move ``page_start`` to ``value`` and keep ``page_end`` at or after it."""
    self.page_start = value
    # The end page is only ever pushed forward, never pulled back.
    if value > self.page_end:
        self.page_end = value

View File

@@ -45,10 +45,10 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
context_tokens = 0
for idx, chunk in enumerate(retrieved_chunks, start=1):
block = (
f"[{idx}] 文档: {chunk.doc_name}\n"
f"[{idx}] 文档: {chunk.doc_title}\n"
f"章节: {chunk.section_title or '未标注'}\n"
f"页码: {chunk.page_number}\n"
f"内容: {chunk.content}"
f"页码: {chunk.page_start}" + (f"-{chunk.page_end}" if chunk.page_end and chunk.page_end != chunk.page_start else "") + "\n"
f"内容: {chunk.text}"
)
block_tokens = self._estimate_tokens(block)
if context_tokens + block_tokens > settings.rag_max_context_tokens:
@@ -73,10 +73,10 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
return False
estimated_total_tokens = sum(
self._estimate_tokens(
f"[{idx}] 文档: {chunk.doc_name}\n"
f"[{idx}] 文档: {chunk.doc_title}\n"
f"章节: {chunk.section_title or '未标注'}\n"
f"页码: {chunk.page_number}\n"
f"内容: {chunk.content}"
f"页码: {chunk.page_start}" + (f"-{chunk.page_end}" if chunk.page_end and chunk.page_end != chunk.page_start else "") + "\n"
f"内容: {chunk.text}"
)
for idx, chunk in enumerate(retrieved_chunks, start=1)
)
@@ -87,12 +87,17 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
return [
AnswerSource(
doc_id=chunk.doc_id,
doc_name=chunk.doc_name,
doc_title=chunk.doc_title,
chunk_id=chunk.chunk_id,
chunk_type=chunk.chunk_type,
section_title=chunk.section_title,
page_number=chunk.page_number,
page_start=chunk.page_start,
page_end=chunk.page_end,
section_level=chunk.section_level,
chunk_index=chunk.chunk_index,
piece_index=chunk.piece_index,
score=chunk.score,
content=chunk.content,
text=chunk.text,
metadata=chunk.metadata,
)
for chunk in chunks

View File

@@ -10,6 +10,7 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
"""Adapt the existing markdown chunker to the new chunk builder port."""
def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
"""Initialize the local markdown chunk builder."""
self.chunker = RegulationChunker(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
@@ -22,6 +23,7 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
regulation_type: str,
version: str,
) -> list[Chunk]:
"""Build migrated chunk objects from the legacy markdown chunker output."""
markdown_text = parsed_document.raw_text.strip()
if not markdown_text:
return []
@@ -50,16 +52,18 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
Chunk(
chunk_id=item.metadata.chunk_id,
doc_id=parsed_document.doc_id,
doc_name=parsed_document.doc_name,
content=item.content,
doc_title=parsed_document.doc_name,
text=item.content,
embedding_text=item.content,
chunk_type="local_markdown_chunk",
section_title=item.metadata.section_title or item.metadata.section_number,
section_path=section_path,
page_number=item.metadata.page_number,
page_start=item.metadata.page_number,
page_end=item.metadata.page_number,
section_level=len(section_path),
regulation_type=regulation_type,
version=version,
semantic_id=item.metadata.clause_number,
block_type="local_markdown_chunk",
metadata=metadata,
)
)

View File

@@ -19,29 +19,35 @@ class AliyunVectorChunkBuilder(ChunkBuilder):
"""Handle build for the Aliyun Vector Chunk Builder instance."""
chunks: list[Chunk] = []
for index, item in enumerate(parsed_document.vector_chunks):
content = item.get("content") or item.get("text") or ""
embedding_text = item.get("embedding_text") or content
text = item.get("text") or ""
embedding_text = item.get("embedding_text") or text
if not embedding_text.strip():
continue
section_path = item.get("section_path") or []
section_title = item.get("section_title") or (section_path[-1] if section_path else "")
page_number = item.get("page_start") or item.get("page") or 0
chunk_id = item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"
metadata = {k: v for k, v in item.items() if k not in {"content", "embedding_text"}}
metadata = dict(item)
metadata["regulation_type"] = regulation_type
metadata["version"] = version
chunks.append(
Chunk(
chunk_id=str(chunk_id),
doc_id=parsed_document.doc_id,
doc_name=parsed_document.doc_name,
content=content,
doc_title=str(item.get("doc_title") or parsed_document.doc_name),
text=text,
embedding_text=embedding_text,
chunk_type=str(item.get("chunk_type", item.get("block_type", ""))),
chunk_index=int(item.get("chunk_index") or 0),
piece_index=int(item.get("piece_index") or 0),
page_start=int(item.get("page_start") or 0),
page_end=int(item.get("page_end") or 0),
section_title=section_title,
section_path=section_path,
page_number=int(page_number or 0),
section_level=int(item.get("section_level") or len(section_path)),
source_ids=[str(v) for v in item.get("source_ids", [])],
regulation_type=regulation_type,
version=version,
semantic_id=item.get("semantic_id", ""),
block_type=item.get("block_type", ""),
metadata=metadata,
)
)

View File

@@ -56,7 +56,21 @@ class BM25Retriever:
try:
rows = self._vector_index.collection.query(
expr='doc_id != ""',
output_fields=["id", "doc_id", "doc_name", "content", "section_title", "page_number"],
output_fields=[
"id",
"chunk_id",
"doc_id",
"doc_title",
"text",
"chunk_type",
"section_title",
"page_start",
"page_end",
"section_level",
"chunk_index",
"piece_index",
"metadata_json",
],
limit=16384,
)
except Exception:
@@ -64,19 +78,33 @@ class BM25Retriever:
return []
return [
RetrievedChunk(
chunk_id=str(row.get("id", "")),
chunk_id=str(row.get("chunk_id") or row.get("id", "")),
doc_id=str(row.get("doc_id", "")),
doc_name=str(row.get("doc_name", "")),
content=str(row.get("content", "")),
doc_title=str(row.get("doc_title", "")),
text=str(row.get("text", "")),
score=0.0,
chunk_type=str(row.get("chunk_type", "")),
section_title=str(row.get("section_title", "")),
page_number=int(row.get("page_number") or 0),
metadata={},
page_start=int(row.get("page_start") or 0),
page_end=int(row.get("page_end") or 0),
section_level=int(row.get("section_level") or 0),
chunk_index=int(row.get("chunk_index") or 0),
piece_index=int(row.get("piece_index") or 0),
metadata=self._parse_metadata_json(row.get("metadata_json", "")),
)
for row in rows
if row.get("content")
if row.get("text")
]
def _parse_metadata_json(self, raw_metadata: str) -> dict:
    """Decode a stored ``metadata_json`` value into a plain dict.

    Parsing is best-effort: an empty value, malformed JSON, or JSON whose
    top-level value is not an object all yield ``{}`` instead of raising, so a
    single bad row cannot break loading of the rest.

    Args:
        raw_metadata: The ``metadata_json`` value read from a Milvus row.

    Returns:
        A new dict with the decoded metadata, or ``{}`` when nothing usable
        could be decoded.
    """
    # Imported locally so the method does not depend on the module's import list.
    import json

    if not raw_metadata:
        return {}
    try:
        parsed = json.loads(raw_metadata)
    except (TypeError, ValueError, RecursionError):
        # TypeError: not str/bytes; ValueError: malformed JSON (JSONDecodeError);
        # RecursionError: pathologically nested input.
        return {}
    # Only a JSON object is valid metadata. Feeding a list such as ["ab"] or
    # [["a", 1]] to dict() would silently fabricate keys.
    if not isinstance(parsed, dict):
        return {}
    return dict(parsed)
def _ensure_built(self) -> None:
if self._index is not None:
return
@@ -93,7 +121,7 @@ class BM25Retriever:
self._chunks = []
self._index = BM25Okapi([[]])
return
tokenized = [_tokenize(c.content) for c in chunks]
tokenized = [_tokenize(c.text) for c in chunks]
self._chunks = chunks
self._index = BM25Okapi(tokenized)
logger.info("BM25Retriever: index built with %d chunks", len(chunks))
@@ -127,20 +155,26 @@ class BM25Retriever:
for score, chunk in ranked[: top_k * 2]:
if score <= 0:
break
# Apply simple regulation_type filter if provided
if filters and chunk.metadata.get("regulation_type"):
types = [t.strip() for t in filters.split(",")]
if chunk.metadata.get("regulation_type") not in types:
continue
if filters:
normalized_filter = filters.replace("doc_name", "doc_title").strip()
if normalized_filter.startswith('doc_title == "'):
expected_title = normalized_filter[len('doc_title == "'):-1]
if chunk.doc_title != expected_title:
continue
results.append(
RetrievedChunk(
chunk_id=chunk.chunk_id,
doc_id=chunk.doc_id,
doc_name=chunk.doc_name,
content=chunk.content,
doc_title=chunk.doc_title,
text=chunk.text,
score=score,
chunk_type=chunk.chunk_type,
section_title=chunk.section_title,
page_number=chunk.page_number,
page_start=chunk.page_start,
page_end=chunk.page_end,
section_level=chunk.section_level,
chunk_index=chunk.chunk_index,
piece_index=chunk.piece_index,
metadata=chunk.metadata,
)
)

View File

@@ -31,7 +31,7 @@ class OpenAICompatibleReranker(Reranker):
if not chunks:
return []
texts = [chunk.content for chunk in chunks]
texts = [chunk.text for chunk in chunks]
start = time.time()
try:
scores = self._call_reranker(query, texts)

View File

@@ -4,57 +4,150 @@ from __future__ import annotations
import json
import time
from typing import Iterable
from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, utility
from loguru import logger
from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, MilvusException, connections, utility
from app.config.settings import settings
from app.domain.documents import Chunk
from app.domain.retrieval import RetrievedChunk, VectorIndex
from app.shared.errors import VectorStoreSchemaError
# Keep adapter behavior explicit so integration details remain easy to audit.
# Field names that must exist on the bound collection. _validate_schema reports
# any that are missing, and _raise_schema_error passes the full tuple as the
# error's expected_fields.
_REQUIRED_SCHEMA_FIELDS = (
    "doc_id",
    "doc_title",
    "chunk_id",
    "text",
    "embedding",
    "section_title",
    "metadata_json",
)
# Lower-case message fragments describing a missing field or a missing/unloaded
# collection.
# NOTE(review): presumably matched against Milvus error text to decide when to
# rebind the collection; the code that reads this tuple is outside this view, confirm.
_SCHEMA_RECOVERY_TOKENS = (
    "field doc_title not exist",
    "field text not exist",
    "field embedding not exist",
    "collection not loaded",
    "can't find collection",
    "not found[collection",
)
class MilvusVectorIndex(VectorIndex):
"""Provide the Milvus Vector Index index implementation."""
def __init__(self) -> None:
    """Read the Milvus target from settings, open the connection, and bind the collection."""
    collection_name = settings.milvus_collection
    db_name = settings.milvus_db_name
    host = settings.milvus_host
    port = settings.milvus_port

    self.collection_name = collection_name
    self.db_name = db_name
    self.host = host
    self.port = port
    # The alias is specific to this host/port/db/collection so the adapter
    # never picks up a connection registered under another alias.
    self.alias = f"vector-index::{host}:{port}/{db_name}/{collection_name}"

    self._connect()
    self.collection = self._bind_collection()
def _connect(self, *, refresh: bool = False) -> None:
"""Establish the Milvus connection for this adapter."""
if refresh:
try:
connections.disconnect(self.alias)
except Exception:
# Best-effort disconnect keeps refresh idempotent when no alias is active yet.
pass
connections.connect(
alias="default",
host=settings.milvus_host,
port=settings.milvus_port,
alias=self.alias,
host=self.host,
port=self.port,
db_name=self.db_name,
)
self.collection = self._ensure_collection()
def _schema_field_names(self, collection: Collection) -> list[str]:
    """List the field names declared in ``collection``'s schema, in schema order."""
    names: list[str] = []
    for schema_field in collection.schema.fields:
        names.append(schema_field.name)
    return names
def _raise_schema_error(self, *, message: str, actual_fields: Iterable[str]) -> None:
    """Raise ``VectorStoreSchemaError`` describing this adapter's collection.

    Args:
        message: Human-readable description of the mismatch.
        actual_fields: Field names found on the collection.

    Raises:
        VectorStoreSchemaError: Always.
    """
    error = VectorStoreSchemaError(
        message=message,
        host=self.host,
        db_name=self.db_name,
        collection_name=self.collection_name,
        expected_fields=list(_REQUIRED_SCHEMA_FIELDS),
        actual_fields=list(actual_fields),
    )
    raise error
def _validate_schema(self, collection: Collection) -> None:
    """Check that ``collection`` exposes every required field.

    Raises:
        VectorStoreSchemaError: Via ``_raise_schema_error`` when a field is missing.
    """
    present = self._schema_field_names(collection)
    absent = []
    for required in _REQUIRED_SCHEMA_FIELDS:
        if required not in present:
            absent.append(required)
    if not absent:
        return
    self._raise_schema_error(
        message=f"Milvus collection schema mismatch; missing required fields: {absent}",
        actual_fields=present,
    )
def _log_collection_binding(self, collection: Collection, *, event: str) -> None:
    """Log which collection this adapter is bound to, with its fields and entity count.

    Args:
        collection: The collection that was just bound.
        event: Label written as the first value of the log line.
    """
    try:
        entity_count = collection.num_entities
    except Exception:
        # The count is diagnostic only; failing to read it must not break binding.
        entity_count = "unknown"
    field_names = self._schema_field_names(collection)
    logger.info(
        "Milvus binding {} alias={} host={} db={} collection={} fields={} num_entities={}",
        event,
        self.alias,
        self.host,
        self.db_name,
        self.collection_name,
        field_names,
        entity_count,
    )
def _bind_collection(self, *, force_refresh: bool = False) -> Collection:
    """Obtain the configured collection, validate its schema, log the binding, and return it.

    Args:
        force_refresh: When true, reconnect first via ``_connect(refresh=True)``.

    Returns:
        The validated collection.
    """
    if force_refresh:
        self._connect(refresh=True)
    bound = self._ensure_collection()
    self._validate_schema(bound)
    event = "refreshed" if force_refresh else "initialized"
    self._log_collection_binding(bound, event=event)
    return bound
def _ensure_collection(self) -> Collection:
"""Handle ensure collection for this module for the Milvus Vector Index instance."""
if utility.has_collection(self.collection_name):
collection = Collection(self.collection_name)
if utility.has_collection(self.collection_name, using=self.alias):
collection = Collection(self.collection_name, using=self.alias)
collection.load()
return collection
schema = CollectionSchema(
fields=[
FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=128, is_primary=True, auto_id=False),
FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=64),
FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=256),
FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
FieldSchema(name="doc_title", dtype=DataType.VARCHAR, max_length=256),
FieldSchema(name="chunk_id", dtype=DataType.VARCHAR, max_length=128),
FieldSchema(name="chunk_index", dtype=DataType.INT64),
FieldSchema(name="piece_index", dtype=DataType.INT64),
FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
FieldSchema(name="embedding_text", dtype=DataType.VARCHAR, max_length=65535),
FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=settings.embedding_dim),
FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
FieldSchema(name="section_path", dtype=DataType.VARCHAR, max_length=4096),
FieldSchema(name="page_number", dtype=DataType.INT64),
FieldSchema(name="regulation_type", dtype=DataType.VARCHAR, max_length=128),
FieldSchema(name="version", dtype=DataType.VARCHAR, max_length=64),
FieldSchema(name="semantic_id", dtype=DataType.VARCHAR, max_length=128),
FieldSchema(name="block_type", dtype=DataType.VARCHAR, max_length=64),
FieldSchema(name="chunk_type", dtype=DataType.VARCHAR, max_length=64),
FieldSchema(name="page_start", dtype=DataType.INT64),
FieldSchema(name="page_end", dtype=DataType.INT64),
FieldSchema(name="section_level", dtype=DataType.INT64),
FieldSchema(name="source_ids", dtype=DataType.VARCHAR, max_length=4096),
FieldSchema(name="section_path", dtype=DataType.VARCHAR, max_length=4096),
FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
FieldSchema(name="metadata_json", dtype=DataType.VARCHAR, max_length=65535),
FieldSchema(name="created_at", dtype=DataType.INT64),
],
description="Dense-only regulations index",
enable_dynamic_field=False,
)
collection = Collection(name=self.collection_name, schema=schema)
collection = Collection(name=self.collection_name, schema=schema, using=self.alias)
collection.create_index(
field_name="embedding",
index_params={
@@ -73,21 +166,34 @@ class MilvusVectorIndex(VectorIndex):
data = []
now = int(time.time())
for chunk, vector in zip(chunks, vectors):
metadata = dict(chunk.metadata)
doc_title = str(metadata.get("doc_title", chunk.doc_title))
text = str(metadata.get("text", chunk.text))
embedding_text = str(metadata.get("embedding_text", chunk.embedding_text))
page_start = int(metadata.get("page_start", 0) or 0)
page_end = int(metadata.get("page_end", 0) or 0)
section_path = metadata.get("section_path", chunk.section_path)
source_ids = metadata.get("source_ids", [])
data.append(
{
"id": chunk.chunk_id,
"doc_id": chunk.doc_id,
"doc_name": chunk.doc_name,
"content": chunk.content[:65535],
"doc_title": doc_title[:256],
"chunk_id": chunk.chunk_id[:128],
"chunk_index": int(metadata.get("chunk_index", chunk.chunk_index) or 0),
"piece_index": int(metadata.get("piece_index", chunk.piece_index) or 0),
"text": text[:65535],
"embedding_text": embedding_text[:65535],
"embedding": vector,
"section_title": chunk.section_title[:512],
"section_path": json.dumps(chunk.section_path, ensure_ascii=False)[:4096],
"page_number": chunk.page_number,
"regulation_type": chunk.regulation_type[:128],
"version": chunk.version[:64],
"semantic_id": chunk.semantic_id[:128],
"block_type": chunk.block_type[:64],
"metadata_json": json.dumps(chunk.metadata, ensure_ascii=False)[:65535],
"semantic_id": str(metadata.get("semantic_id", chunk.semantic_id))[:128],
"chunk_type": str(metadata.get("chunk_type", chunk.chunk_type))[:64],
"page_start": page_start,
"page_end": page_end,
"section_level": int(metadata.get("section_level", chunk.section_level) or 0),
"source_ids": json.dumps(source_ids, ensure_ascii=False)[:4096],
"section_path": json.dumps(section_path, ensure_ascii=False)[:4096],
"section_title": str(metadata.get("section_title", chunk.section_title))[:512],
"metadata_json": json.dumps(metadata, ensure_ascii=False)[:65535],
"created_at": now,
}
)
@@ -107,47 +213,97 @@ class MilvusVectorIndex(VectorIndex):
filters = filters.strip()
# Normalize legacy field names so callers can keep older filter payloads.
replacements = {
"doc_name": "doc_title",
"content": "text",
"page_number": "page_start",
"block_type": "chunk_type",
}
for legacy_name, new_name in replacements.items():
filters = filters.replace(legacy_name, new_name)
# Check if already a Milvus expression (contains operators)
if any(op in filters for op in ["==", "!=", "in", "not in", ">", "<", ">=", "<=", "and", "or"]):
return filters
# Parse simple regulation_type filter
# Support: "GB" or "GB,UN-ECE" or "GB, UN-ECE"
types = [t.strip() for t in filters.split(",") if t.strip()]
# Parse simple document-title filter.
titles = [title.strip() for title in filters.split(",") if title.strip()]
if not types:
if not titles:
return None
if len(types) == 1:
# Single value: regulation_type == "GB"
return f'regulation_type == "{types[0]}"'
else:
# Multiple values: regulation_type in ["GB", "UN-ECE"]
quoted_types = [f'"{t}"' for t in types]
return f'regulation_type in [{", ".join(quoted_types)}]'
if len(titles) == 1:
return f'doc_title == "{titles[0]}"'
quoted_titles = [f'"{title}"' for title in titles]
return f'doc_title in [{", ".join(quoted_titles)}]'
def _should_refresh_after_exception(self, exc: Exception) -> bool:
    """Decide whether a failure warrants a forced reconnect and retry.

    Args:
        exc: The exception raised by a Milvus operation.

    Returns:
        True only when ``exc`` is a ``MilvusException`` whose lower-cased text
        contains at least one entry of ``_SCHEMA_RECOVERY_TOKENS``.
    """
    if isinstance(exc, MilvusException):
        lowered = str(exc).lower()
        for token in _SCHEMA_RECOVERY_TOKENS:
            if token in lowered:
                return True
    return False
def _run_with_refresh(self, operation):
    """Invoke ``operation`` and retry it once after rebinding the collection.

    Args:
        operation: Zero-argument callable performing the Milvus call.

    Returns:
        Whatever ``operation`` returns, from the first or the second attempt.

    Raises:
        VectorStoreSchemaError: Passed through untouched from either attempt, or
            produced via ``_raise_schema_error`` when the retry fails with a
            ``MilvusException``.
        Exception: The first failure when ``_should_refresh_after_exception``
            rejects it, or any non-Milvus failure from the retry.
    """
    try:
        return operation()
    except VectorStoreSchemaError:
        raise
    except Exception as first_failure:
        if not self._should_refresh_after_exception(first_failure):
            raise
        logger.warning(
            "Milvus operation failed for alias={} collection={}; forcing reconnect and retry: {}",
            self.alias,
            self.collection_name,
            first_failure,
        )
        # Rebind before the second attempt so the callable sees the fresh collection.
        self.collection = self._bind_collection(force_refresh=True)
        try:
            return operation()
        except VectorStoreSchemaError:
            raise
        except MilvusException as second_failure:
            self._raise_schema_error(
                message=f"Milvus operation failed after refresh: {second_failure}",
                actual_fields=self._schema_field_names(self.collection),
            )
            raise
def search(self, query_vector: list[float], top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
"""Handle search for the Milvus Vector Index instance."""
milvus_expr = self._parse_filters(filters)
results = self.collection.search(
data=[query_vector],
anns_field="embedding",
param={"metric_type": "COSINE", "params": {"nprobe": settings.milvus_nprobe}},
limit=top_k,
expr=milvus_expr,
output_fields=[
"doc_id",
"doc_name",
"content",
"section_title",
"page_number",
"regulation_type",
"version",
"semantic_id",
"block_type",
"metadata_json",
],
results = self._run_with_refresh(
lambda: self.collection.search(
data=[query_vector],
anns_field="embedding",
param={"metric_type": "COSINE", "params": {"nprobe": settings.milvus_nprobe}},
limit=top_k,
expr=milvus_expr,
output_fields=[
"doc_id",
"doc_title",
"chunk_id",
"chunk_index",
"piece_index",
"text",
"embedding_text",
"section_title",
"semantic_id",
"chunk_type",
"page_start",
"page_end",
"section_level",
"source_ids",
"section_path",
"metadata_json",
],
)
)
payload: list[RetrievedChunk] = []
for hits in results:
@@ -161,13 +317,18 @@ class MilvusVectorIndex(VectorIndex):
metadata = {"raw_metadata": raw_metadata}
payload.append(
RetrievedChunk(
chunk_id=str(hit.id),
chunk_id=str(hit.entity.get("chunk_id", hit.id)),
doc_id=hit.entity.get("doc_id", ""),
doc_name=hit.entity.get("doc_name", ""),
content=hit.entity.get("content", ""),
doc_title=hit.entity.get("doc_title", ""),
text=hit.entity.get("text", ""),
score=float(hit.score),
chunk_type=hit.entity.get("chunk_type", ""),
section_title=hit.entity.get("section_title", ""),
page_number=int(hit.entity.get("page_number", 0) or 0),
page_start=int(hit.entity.get("page_start", 0) or 0),
page_end=int(hit.entity.get("page_end", 0) or 0),
section_level=int(hit.entity.get("section_level", 0) or 0),
chunk_index=int(hit.entity.get("chunk_index", 0) or 0),
piece_index=int(hit.entity.get("piece_index", 0) or 0),
metadata=metadata,
)
)
@@ -176,7 +337,9 @@ class MilvusVectorIndex(VectorIndex):
def count_by_document(self) -> dict[str, int]:
"""Return doc_id -> chunk count from Milvus."""
try:
rows = self.collection.query(expr="doc_id != \"\"", output_fields=["doc_id"])
rows = self._run_with_refresh(
lambda: self.collection.query(expr="doc_id != \"\"", output_fields=["doc_id", "doc_title"])
)
except Exception:
return {}
counts: dict[str, int] = {}
@@ -189,9 +352,11 @@ class MilvusVectorIndex(VectorIndex):
def list_document_metadata(self) -> list[dict]:
"""Return one metadata row per document from Milvus (single query, no embeddings)."""
try:
rows = self.collection.query(
expr="doc_id != \"\"",
output_fields=["doc_id", "doc_name", "regulation_type", "version"],
rows = self._run_with_refresh(
lambda: self.collection.query(
expr="doc_id != \"\"",
output_fields=["doc_id", "doc_title", "metadata_json"],
)
)
except Exception:
return []
@@ -204,15 +369,26 @@ class MilvusVectorIndex(VectorIndex):
continue
counts[doc_id] = counts.get(doc_id, 0) + 1
if doc_id not in seen:
metadata: dict[str, object] = {}
raw_metadata = row.get("metadata_json", "")
if raw_metadata:
try:
metadata = json.loads(raw_metadata)
except json.JSONDecodeError:
metadata = {}
seen[doc_id] = {
"doc_id": doc_id,
"doc_name": row.get("doc_name", ""),
"regulation_type": row.get("regulation_type", ""),
"version": row.get("version", ""),
"doc_title": row.get("doc_title", ""),
"regulation_type": str(metadata.get("regulation_type", "")),
"version": str(metadata.get("version", "")),
}
return [
{**meta, "chunk_count": counts[meta["doc_id"]]}
{
**meta,
"doc_name": meta.get("doc_title", ""),
"chunk_count": counts[meta["doc_id"]],
}
for meta in seen.values()
]

View File

@@ -67,14 +67,14 @@ class DocumentProcessor:
return [
{
"id": item.chunk_id,
"content": item.content,
"content": item.text,
"score": item.score,
"metadata": {
"doc_id": item.doc_id,
"doc_name": item.doc_name,
"doc_name": item.doc_title,
"chunk_id": item.chunk_id,
"section_title": item.section_title,
"page_number": item.page_number,
"page_number": item.page_start,
**item.metadata,
},
}

View File

@@ -0,0 +1,30 @@
"""Define shared backend exception types."""
from __future__ import annotations
class VectorStoreSchemaError(RuntimeError):
    """Signal that the active vector store schema does not match backend expectations.

    The constructor arguments are kept as attributes of the same name
    (``message``, ``host``, ``db_name``, ``collection_name``,
    ``expected_fields``, ``actual_fields``). ``str(error)`` is the message
    followed by those details.
    """

    def __init__(
        self,
        *,
        message: str,
        host: str,
        db_name: str,
        collection_name: str,
        expected_fields: list[str],
        actual_fields: list[str],
    ) -> None:
        """Initialize the vector store schema error details.

        Args:
            message: Short description of the mismatch.
            host: Host of the vector store the error refers to.
            db_name: Database name the error refers to.
            collection_name: Collection name the error refers to.
            expected_fields: Field names the backend requires.
            actual_fields: Field names found on the collection.
        """
        # The raw message is retained so the instance can be rebuilt by __reduce__.
        self.message = message
        self.host = host
        self.db_name = db_name
        self.collection_name = collection_name
        self.expected_fields = expected_fields
        self.actual_fields = actual_fields
        # Keep the message self-contained so runtime logs show the full mismatch context.
        details = (
            f"{message} | host={host} db={db_name} collection={collection_name} "
            f"expected_fields={expected_fields} actual_fields={actual_fields}"
        )
        super().__init__(details)

    @classmethod
    def _rebuild(cls, fields: dict[str, object]) -> VectorStoreSchemaError:
        """Recreate an instance from the keyword arguments captured by ``__reduce__``."""
        return cls(**fields)

    def __reduce__(self) -> tuple[object, tuple[dict[str, object]]]:
        """Describe how to rebuild this error for ``copy`` and ``pickle``.

        The inherited reduction calls the class with ``self.args`` positionally,
        which fails here because every constructor argument is keyword-only.
        """
        fields = {
            "message": self.message,
            "host": self.host,
            "db_name": self.db_name,
            "collection_name": self.collection_name,
            "expected_fields": self.expected_fields,
            "actual_fields": self.actual_fields,
        }
        return (self._rebuild, (fields,))

View File

@@ -1 +0,0 @@
{}

View File

@@ -0,0 +1,131 @@
{
"runs": {
"8e722053-5009-40fe-a483-535b40ebbb16": {
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
"doc_id": "7cbdfe3c",
"trigger_type": "upload",
"run_status": "succeeded",
"parser_backend": "aliyun_docmind",
"chunk_backend": "aliyun",
"embedding_model": "text-embedding-v3",
"index_name": "regulations_dense_1024_v2",
"started_at": "2026-05-26T12:18:27.208692+00:00",
"stored_at": "2026-05-26T12:18:27.712855+00:00",
"parsed_at": "2026-05-26T12:18:42.989238+00:00",
"indexed_at": "2026-05-26T12:18:51.172418+00:00",
"finished_at": "2026-05-26T12:18:51.172418+00:00",
"layout_count": 48,
"structure_node_count": 6,
"semantic_block_count": 33,
"vector_chunk_count": 34,
"chunk_count": 34,
"failure_stage": "",
"error_message": "",
"metadata": {
"generate_summary": true,
"parse_task_id": "docmind-20260526-10b94713ccb348498b12180a5dcf32ff"
}
}
},
"status_events": {
"d0532baf-0d65-4130-b282-ec51f04132fd": {
"event_id": "d0532baf-0d65-4130-b282-ec51f04132fd",
"doc_id": "7cbdfe3c",
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
"from_status": "",
"to_status": "pending",
"stage": "document_created",
"message": "Document record created",
"metadata": {},
"occurred_at": "2026-05-26T12:18:27.235921+00:00"
},
"a5e32db5-25c3-4c73-a987-7311f0e72a31": {
"event_id": "a5e32db5-25c3-4c73-a987-7311f0e72a31",
"doc_id": "7cbdfe3c",
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
"from_status": "pending",
"to_status": "stored",
"stage": "store",
"message": "Source file stored",
"metadata": {},
"occurred_at": "2026-05-26T12:18:27.741462+00:00"
},
"18e04ce7-9d7a-4008-8600-e2590100bd85": {
"event_id": "18e04ce7-9d7a-4008-8600-e2590100bd85",
"doc_id": "7cbdfe3c",
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
"from_status": "stored",
"to_status": "parsed",
"stage": "parse",
"message": "Document parsed",
"metadata": {
"artifact_count": 4
},
"occurred_at": "2026-05-26T12:18:43.218026+00:00"
},
"d3b06025-5c91-4a42-9e5f-dce1c5312b96": {
"event_id": "d3b06025-5c91-4a42-9e5f-dce1c5312b96",
"doc_id": "7cbdfe3c",
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
"from_status": "parsed",
"to_status": "indexed",
"stage": "index",
"message": "Document indexed",
"metadata": {
"chunk_count": 34,
"index_name": "regulations_dense_1024_v2"
},
"occurred_at": "2026-05-26T12:18:51.195442+00:00"
}
},
"artifacts": {
"47fe2877-a8f5-4e1d-901b-80cd0194ba96": {
"artifact_id": "47fe2877-a8f5-4e1d-901b-80cd0194ba96",
"doc_id": "7cbdfe3c",
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
"artifact_type": "layouts",
"object_name": "artifacts/7cbdfe3c/layouts.json",
"content_type": "application/json",
"byte_size": 0,
"checksum": "",
"metadata": {},
"created_at": "2026-05-26T12:18:43.188467+00:00"
},
"44aa075b-86b2-48a7-9d14-a2453bd53863": {
"artifact_id": "44aa075b-86b2-48a7-9d14-a2453bd53863",
"doc_id": "7cbdfe3c",
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
"artifact_type": "structure_nodes",
"object_name": "artifacts/7cbdfe3c/structure_nodes.json",
"content_type": "application/json",
"byte_size": 0,
"checksum": "",
"metadata": {},
"created_at": "2026-05-26T12:18:43.188494+00:00"
},
"dedcc8fe-fa58-4de6-984d-f44332af5204": {
"artifact_id": "dedcc8fe-fa58-4de6-984d-f44332af5204",
"doc_id": "7cbdfe3c",
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
"artifact_type": "semantic_blocks",
"object_name": "artifacts/7cbdfe3c/semantic_blocks.json",
"content_type": "application/json",
"byte_size": 0,
"checksum": "",
"metadata": {},
"created_at": "2026-05-26T12:18:43.188511+00:00"
},
"9b0d8bda-e69e-4a4e-ae06-a308afe43109": {
"artifact_id": "9b0d8bda-e69e-4a4e-ae06-a308afe43109",
"doc_id": "7cbdfe3c",
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
"artifact_type": "vector_chunks",
"object_name": "artifacts/7cbdfe3c/vector_chunks.json",
"content_type": "application/json",
"byte_size": 0,
"checksum": "",
"metadata": {},
"created_at": "2026-05-26T12:18:43.188526+00:00"
}
}
}

View File

@@ -1,392 +1,9 @@
{
"69280841": {
"doc_id": "69280841",
"doc_name": "TCT算法接口.pdf",
"file_name": "TCT算法接口.pdf",
"object_name": "69280841/TCT算法接口.pdf",
"content_type": "application/pdf",
"size_bytes": 165557,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "local_markdown_parser",
"index_name": "",
"error_message": "embedding 维度不匹配,期望 1536",
"created_at": "2026-05-18T07:12:16.668306+00:00",
"updated_at": "2026-05-18T07:12:19.417142+00:00",
"metadata": {
"generate_summary": true,
"structure_nodes": 0
}
},
"44121fbb": {
"doc_id": "44121fbb",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "44121fbb/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T09:53:47.996183+00:00",
"updated_at": "2026-05-18T09:53:50.825868+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"77debb4a": {
"doc_id": "77debb4a",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "77debb4a/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:05:46.104259+00:00",
"updated_at": "2026-05-18T10:05:48.704061+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"d12bdcc8": {
"doc_id": "d12bdcc8",
"doc_name": "TCT算法接口.pdf",
"file_name": "TCT算法接口.pdf",
"object_name": "d12bdcc8/TCT算法接口.pdf",
"content_type": "application/pdf",
"size_bytes": 165557,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:07:22.199824+00:00",
"updated_at": "2026-05-18T10:07:24.653751+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"3c2e8c9c": {
"doc_id": "3c2e8c9c",
"doc_name": "20260415_Continental tire mobile app solution.pdf",
"file_name": "20260415_Continental tire mobile app solution.pdf",
"object_name": "3c2e8c9c/20260415_Continental tire mobile app solution.pdf",
"content_type": "application/pdf",
"size_bytes": 2178074,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:09:58.338274+00:00",
"updated_at": "2026-05-18T10:10:01.295502+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"d22d21a0": {
"doc_id": "d22d21a0",
"doc_name": "20260415_Continental tire mobile app solution.pdf",
"file_name": "20260415_Continental tire mobile app solution.pdf",
"object_name": "d22d21a0/20260415_Continental tire mobile app solution.pdf",
"content_type": "application/pdf",
"size_bytes": 2178074,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:12:20.078027+00:00",
"updated_at": "2026-05-18T10:12:22.999843+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"35f129d3": {
"doc_id": "35f129d3",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "35f129d3/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:13:24.706512+00:00",
"updated_at": "2026-05-18T10:13:27.180509+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"efc21515": {
"doc_id": "efc21515",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "efc21515/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
"created_at": "2026-05-18T13:47:32.076786+00:00",
"updated_at": "2026-05-18T13:47:57.998073+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-a6e84447457f43cb85f95225cfc6495b",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/efc21515/layouts.json",
"structure_nodes": "artifacts/efc21515/structure_nodes.json",
"semantic_blocks": "artifacts/efc21515/semantic_blocks.json",
"vector_chunks": "artifacts/efc21515/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400"
}
},
"0d4b08bc": {
"doc_id": "0d4b08bc",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "0d4b08bc/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
"created_at": "2026-05-18T14:03:15.134344+00:00",
"updated_at": "2026-05-18T14:03:34.843448+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-78353d85daa24147b68d8fb71895179f",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/0d4b08bc/layouts.json",
"structure_nodes": "artifacts/0d4b08bc/structure_nodes.json",
"semantic_blocks": "artifacts/0d4b08bc/semantic_blocks.json",
"vector_chunks": "artifacts/0d4b08bc/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404"
}
},
"4302f314": {
"doc_id": "4302f314",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "4302f314/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "embedding 维度不匹配,期望 1536",
"created_at": "2026-05-18T14:11:29.943973+00:00",
"updated_at": "2026-05-18T14:11:48.554500+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-23935ee455ac4b26ac4201ac4781ee52",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/4302f314/layouts.json",
"structure_nodes": "artifacts/4302f314/structure_nodes.json",
"semantic_blocks": "artifacts/4302f314/semantic_blocks.json",
"vector_chunks": "artifacts/4302f314/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "embedding 维度不匹配,期望 1536"
}
},
"765ed1ee": {
"doc_id": "765ed1ee",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "765ed1ee/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>",
"created_at": "2026-05-18T14:18:28.875138+00:00",
"updated_at": "2026-05-18T14:18:57.389110+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-f116856bc29245baa2531b245078a701",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/765ed1ee/layouts.json",
"structure_nodes": "artifacts/765ed1ee/structure_nodes.json",
"semantic_blocks": "artifacts/765ed1ee/semantic_blocks.json",
"vector_chunks": "artifacts/765ed1ee/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>"
}
},
"05cabe09": {
"doc_id": "05cabe09",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "05cabe09/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "embedding 维度不匹配,期望 1536",
"created_at": "2026-05-18T14:24:32.156500+00:00",
"updated_at": "2026-05-18T14:24:50.114138+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-897d858983df48e28e9819e563d46208",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/05cabe09/layouts.json",
"structure_nodes": "artifacts/05cabe09/structure_nodes.json",
"semantic_blocks": "artifacts/05cabe09/semantic_blocks.json",
"vector_chunks": "artifacts/05cabe09/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "embedding 维度不匹配,期望 1536"
}
},
"9acb2ba0": {
"doc_id": "9acb2ba0",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "9acb2ba0/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "indexed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 27,
"parser_name": "aliyun_docmind",
"index_name": "regulations_dense_1024_v1",
"error_message": "",
"created_at": "2026-05-18T14:29:01.368719+00:00",
"updated_at": "2026-05-18T14:29:23.699068+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-e5fd4a5419e74d569c562e389e6ae72c",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/9acb2ba0/layouts.json",
"structure_nodes": "artifacts/9acb2ba0/structure_nodes.json",
"semantic_blocks": "artifacts/9acb2ba0/semantic_blocks.json",
"vector_chunks": "artifacts/9acb2ba0/vector_chunks.json"
},
"processing_stage": "indexed",
"index_collection": "regulations_dense_1024_v1"
}
},
"52bd970f": {
"doc_id": "52bd970f",
"7cbdfe3c": {
"doc_id": "7cbdfe3c",
"doc_name": "使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
"file_name": "使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
"object_name": "52bd970f/使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
"object_name": "7cbdfe3c/使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
"content_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"size_bytes": 1199920,
"status": "indexed",
@@ -396,26 +13,26 @@
"summary_latency_ms": 0,
"chunk_count": 34,
"parser_name": "aliyun_docmind",
"index_name": "regulations_dense_1024_v1",
"index_name": "regulations_dense_1024_v2",
"error_message": "",
"created_at": "2026-05-25T07:45:12.777459+00:00",
"updated_at": "2026-05-25T07:45:37.314290+00:00",
"created_at": "2026-05-26T12:18:27.206125+00:00",
"updated_at": "2026-05-26T12:18:51.171308+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260525-6d782dc33f2748a4a1020df765b8182d",
"parse_task_id": "docmind-20260526-10b94713ccb348498b12180a5dcf32ff",
"layout_count": 48,
"structure_node_count": 6,
"semantic_block_count": 33,
"vector_chunk_count": 34,
"artifact_keys": {
"layouts": "artifacts/52bd970f/layouts.json",
"structure_nodes": "artifacts/52bd970f/structure_nodes.json",
"semantic_blocks": "artifacts/52bd970f/semantic_blocks.json",
"vector_chunks": "artifacts/52bd970f/vector_chunks.json"
"layouts": "artifacts/7cbdfe3c/layouts.json",
"structure_nodes": "artifacts/7cbdfe3c/structure_nodes.json",
"semantic_blocks": "artifacts/7cbdfe3c/semantic_blocks.json",
"vector_chunks": "artifacts/7cbdfe3c/vector_chunks.json"
},
"processing_stage": "indexed",
"index_collection": "regulations_dense_1024_v1"
"index_collection": "regulations_dense_1024_v2"
}
}
}

View File

@@ -7,7 +7,7 @@
- 上传入口保持为 `/api/v1/documents/upload`
- 默认 `PARSER_BACKEND=aliyun`
- 默认 `CHUNK_BACKEND=aliyun`
- 默认 Milvus collection 为 `regulations_dense_1536_v2`
- 默认 Milvus collection 为 `regulations_dense_1024_v2`
- 解析产物落到 MinIO `artifacts/{doc_id}/`
完整主链路如下:
@@ -19,7 +19,7 @@
5. 转换为 `structure_nodes / semantic_blocks / vector_chunks`
6. 三层结构 JSON 回写 MinIO
7. 使用 `vector_chunks[*].embedding_text` 调 embedding API
8. 写入 `regulations_dense_1536_v2`
8. 写入 `regulations_dense_1024_v2`
9. 文档状态更新为 `indexed`
运行时转换逻辑位于 `backend/app/infrastructure/parser/aliyun_layout_normalizer.py`

View File

@@ -32,7 +32,6 @@ redis>=4.5.0
minio>=7.1.0
# 数据库
sqlalchemy>=2.0.0
psycopg2-binary>=2.9.0
# mysql-connector-python>=8.0.0

View File

@@ -122,16 +122,17 @@ class FakeChunkBuilder:
Chunk(
chunk_id=f"{parsed_document.doc_id}-chunk-1",
doc_id=parsed_document.doc_id,
doc_name=parsed_document.doc_name,
content="法规正文",
doc_title=parsed_document.doc_name,
text="法规正文",
embedding_text="标准:测试\n章节:第一章\n\n法规正文",
section_title="第一章",
section_path=["第一章"],
page_number=1,
page_start=1,
page_end=1,
chunk_type="section_text",
regulation_type=regulation_type,
version=version,
semantic_id="semantic-1",
block_type="section_text",
metadata={"source": "aliyun_vector_chunk"},
)
]

View File

@@ -18,11 +18,11 @@ class FakeRetriever:
RetrievedChunk(
chunk_id="chunk-1",
doc_id="doc-1",
doc_name="测试法规",
content="法规正文",
doc_title="测试法规",
text="法规正文",
score=0.91,
section_title="第一章",
page_number=1,
page_start=1,
metadata={"section_title": "第一章"},
)
]
@@ -47,12 +47,12 @@ class FakeAnswerGenerator:
sources=[
AnswerSource(
doc_id=item.doc_id,
doc_name=item.doc_name,
doc_title=item.doc_title,
chunk_id=item.chunk_id,
section_title=item.section_title,
page_number=item.page_number,
page_start=item.page_start,
score=item.score,
content=item.content,
text=item.text,
metadata=item.metadata,
)
for item in retrieved_chunks

View File

@@ -0,0 +1,117 @@
"""Test runtime recovery and API error serialization for the Milvus vector index."""
from __future__ import annotations
from fastapi.encoders import jsonable_encoder
from pymilvus import MilvusException
from app.api.models import ErrorResponse
from app.infrastructure.vectorstore.milvus_vector_index import MilvusVectorIndex
from app.shared.errors import VectorStoreSchemaError
class FakeField:
    """Minimal stand-in for a Milvus schema field; it only carries ``name``."""

    def __init__(self, name: str) -> None:
        """Store the field name on the instance."""
        self.name = name

    def __repr__(self) -> str:
        """Return a readable form so failing assertions show the field name."""
        return f"{type(self).__name__}(name={self.name!r})"
class FakeSchema:
    """Minimal stand-in for a Milvus collection schema used by these tests."""

    def __init__(self, field_names: list[str]) -> None:
        """Wrap each entry of ``field_names`` in a ``FakeField``, keeping order."""
        self.fields = list(map(FakeField, field_names))
class FakeCollection:
    """Minimal collection double whose ``search`` replays queued responses."""

    def __init__(self, field_names: list[str], responses: list[object]) -> None:
        """Build the fake schema and queue the responses ``search`` will replay.

        Args:
            field_names: Names exposed through ``self.schema.fields``.
            responses: Values to return, or exceptions to raise, one per
                ``search`` call, consumed in order.
        """
        self.schema = FakeSchema(field_names)
        # Copy so that consuming the queue never mutates the caller's list.
        self.responses = list(responses)
        self.num_entities = 0
        self.search_calls = 0

    def search(self, **kwargs):
        """Return the next queued response, or raise it if it is an exception.

        The call counter is incremented before the queue is consulted, so a
        call that fails still counts.

        Raises:
            IndexError: If called after every queued response was consumed.
        """
        self.search_calls += 1
        if not self.responses:
            # Same exception type as list.pop(0) on an empty list, but with a
            # message that points at the test setup instead of the list.
            raise IndexError(
                "FakeCollection.search called more times than responses were queued"
            )
        response = self.responses.pop(0)
        if isinstance(response, Exception):
            raise response
        return response
def _build_index_for_test(*, collection: FakeCollection) -> MilvusVectorIndex:
    """Return a MilvusVectorIndex created via ``__new__`` so ``__init__`` never runs.

    The instance gets fixed test values for its connection attributes, and
    ``collection`` is bound directly as ``index.collection``.
    """
    fixed_attributes = {
        "collection_name": "regulations_dense_1024_v2",
        "db_name": "default",
        "host": "6.86.80.8",
        "port": 19530,
        "alias": "vector-index::test",
        "collection": collection,
    }
    index = MilvusVectorIndex.__new__(MilvusVectorIndex)
    for attribute, value in fixed_attributes.items():
        setattr(index, attribute, value)
    return index
def test_search_rebinds_and_retries_after_stale_schema_error(monkeypatch):
    """Check that one stale-schema failure leads to a forced rebind and one retry."""
    field_names = [
        "id",
        "doc_id",
        "doc_title",
        "chunk_id",
        "text",
        "embedding",
        "section_title",
        "metadata_json",
    ]
    stale_error = MilvusException(code=65535, message="field doc_title not exist")
    stale = FakeCollection(field_names, [stale_error])
    fresh = FakeCollection(field_names, [[]])
    index = _build_index_for_test(collection=stale)

    def fake_bind_collection(*, force_refresh: bool = False):
        """Hand back the refreshed collection; only a forced rebind is expected."""
        assert force_refresh is True
        return fresh

    monkeypatch.setattr(index, "_bind_collection", fake_bind_collection)

    hits = index.search([0.0] * 1024, 1)

    assert hits == []
    # Each collection must have been searched exactly once: fail, then retry.
    assert stale.search_calls == 1
    assert fresh.search_calls == 1
    assert index.collection is fresh
def test_validate_schema_raises_detailed_vector_store_schema_error():
    """Expect ``_validate_schema`` to raise a VectorStoreSchemaError with details."""
    legacy_collection = FakeCollection(
        ["id", "doc_id", "doc_name", "content", "dense_vector"],
        [[]],
    )
    index = _build_index_for_test(collection=legacy_collection)

    raised = None
    try:
        index._validate_schema(legacy_collection)
    except VectorStoreSchemaError as exc:
        raised = exc

    if raised is None:
        raise AssertionError("VectorStoreSchemaError was not raised")

    # The message must name a missing field and list the fields actually found.
    message = str(raised)
    assert "doc_title" in message
    assert "actual_fields=['id', 'doc_id', 'doc_name', 'content', 'dense_vector']" in message
def test_error_response_is_json_serializable():
    """Verify an ErrorResponse passes through ``jsonable_encoder`` with a string timestamp."""
    response = ErrorResponse(error="InternalServerError", message="boom")
    encoded = jsonable_encoder(response)

    assert encoded["error"] == "InternalServerError"
    assert encoded["message"] == "boom"
    # The timestamp must already be a string after encoding.
    assert isinstance(encoded["timestamp"], str)

View File

@@ -113,12 +113,12 @@ class FakeAgentConversationService:
sources=[
AnswerSource(
doc_id="doc-api-1",
doc_name="测试法规",
doc_title="测试法规",
chunk_id="chunk-1",
section_title="第一章",
page_number=1,
page_start=1,
score=0.92,
content="法规原文",
text="法规原文",
metadata={"section_title": "第一章"},
)
],
@@ -218,7 +218,6 @@ def test_agent_ask_and_stream_contract_preserved(monkeypatch):
store = FakeConversationStore()
monkeypatch.setattr(agent, "get_agent_conversation_service", lambda: FakeAgentConversationService())
monkeypatch.setattr(agent, "get_conversation_store", lambda: store)
client = TestClient(app)

View File

@@ -65,7 +65,7 @@ def verify_migration_config() -> bool:
try:
assert settings.embedding_model == "text-embedding-v3"
assert settings.embedding_dim == 1024
assert settings.milvus_collection == "regulations_dense_1024_v1"
assert settings.milvus_collection == "regulations_dense_1024_v2"
assert settings.parser_backend == "aliyun"
assert settings.chunk_backend == "aliyun"
logger.info(f"embedding_model={settings.embedding_model}")