Refactor document handling and update Milvus collection settings
- Removed multiple failed document entries from `documents.json`. - Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`. - Updated architecture documentation to reflect changes in the Milvus collection name. - Adjusted requirements by removing the sqlalchemy dependency. - Modified test cases to align with new document structure and naming conventions. - Introduced a new test file for Milvus vector index runtime recovery and error handling. - Updated assertions in various test files to ensure compatibility with the new schema.
This commit is contained in:
8
backend/aliyun_parser/.claude/settings.local.json
Normal file
8
backend/aliyun_parser/.claude/settings.local.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Bash(python3 *)",
|
||||
"Bash(PGPASSWORD=postgresql123456 psql *)"
|
||||
]
|
||||
}
|
||||
}
|
||||
475
backend/aliyun_parser/parse_pdf.py
Normal file
475
backend/aliyun_parser/parse_pdf.py
Normal file
@@ -0,0 +1,475 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
阿里云文档智能 API 解析 PDF,输出三层结构 chunks
|
||||
- structure_nodes: 目录树结构
|
||||
- semantic_blocks: 语义块(章节文本、表格、图片)
|
||||
- vector_chunks: 检索块(带 overlap 切分)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
|
||||
from alibabacloud_tea_openapi import models as open_api_models
|
||||
from alibabacloud_docmind_api20220711 import models as docmind_models
|
||||
from alibabacloud_tea_util import models as util_models
|
||||
|
||||
# ===================== 阿里云配置 =====================
|
||||
# Credentials are read from the environment so that secrets never live in
# source control. NOTE(review): the AccessKey pair that used to be hard-coded
# here has been committed and should be rotated in the Alibaba Cloud console.
import os

ALIBABA_ACCESS_KEY_ID = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_ID", "")
ALIBABA_ACCESS_KEY_SECRET = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_SECRET", "")
# The endpoint is not a secret; keep the previous value as the default.
ALIBABA_ENDPOINT = os.environ.get(
    "ALIBABA_DOCMIND_ENDPOINT", "docmind-api.cn-hangzhou.aliyuncs.com"
)
|
||||
|
||||
# ===================== 切分参数 =====================
|
||||
# Upper bound on the characters in one retrieval chunk, and how many
# characters neighbouring chunks of the same block share.
MAX_CHARS = 600
OVERLAP_CHARS = 80


# ===================== Layout type constants =====================
# Exact texts that are treated as a table-of-contents heading.
TOC_TITLES = {
    "目次",
    "目录",
}
# `subType` values treated as headings.
TITLE_SUBTYPES = {
    "doc_title",
    "para_title",
}
# `subType` values accepted for body text.
TEXT_SUBTYPES = {
    "para",
    "none",
}
# `type` / `subType` values treated as figure content.
FIGURE_TYPES = {
    "figure",
    "figure_name",
    "figure_note",
}
FIGURE_SUBTYPES = {
    "picture",
    "pic_title",
    "pic_caption",
}
|
||||
|
||||
|
||||
# ===================== 阿里云 API 客户端 =====================
|
||||
def init_client() -> DocmindClient:
    """Build a Docmind client from the module-level Alibaba Cloud settings."""
    client_config = open_api_models.Config(
        access_key_id=ALIBABA_ACCESS_KEY_ID,
        access_key_secret=ALIBABA_ACCESS_KEY_SECRET,
    )
    # The endpoint is assigned after construction, as in the original code.
    client_config.endpoint = ALIBABA_ENDPOINT
    docmind_client = DocmindClient(client_config)
    return docmind_client
|
||||
|
||||
|
||||
def submit_job(client: DocmindClient, file_path: str) -> str:
    """Submit a document-parsing job for a local file and return its task id.

    Args:
        client: Docmind API client.
        file_path: Path of the local file to upload.

    Returns:
        The ``id`` reported in the submit response.
    """
    source = Path(file_path)
    runtime = util_models.RuntimeOptions()
    # Hold the file handle only for the duration of the upload call; the
    # previous version passed a bare open() result and never closed it.
    with source.open("rb") as stream:
        request = docmind_models.SubmitDocParserJobAdvanceRequest(
            file_url_object=stream,
            file_name=source.name,
            file_name_extension=source.suffix.lstrip("."),
            llm_enhancement=True,
            enhancement_mode="VLM",
        )
        response = client.submit_doc_parser_job_advance(request, runtime)
    return response.body.data.id
|
||||
|
||||
|
||||
def query_status(client: DocmindClient, task_id: str) -> Dict:
    """Query the parsing task and return its status data as a plain mapping.

    Returns ``None`` when the response carries no data.
    """
    status_request = docmind_models.QueryDocParserStatusRequest(id=task_id)
    data = client.query_doc_parser_status(status_request).body.data
    if not data:
        return None
    return data.to_map()
|
||||
|
||||
|
||||
def wait_for_completion(
    client: DocmindClient,
    task_id: str,
    poll_interval: int = 5,
    timeout: "float | None" = None,
) -> bool:
    """Poll the parsing task until it reaches a terminal state.

    Args:
        client: Docmind API client.
        task_id: Id returned by ``submit_job``.
        poll_interval: Seconds to sleep between two status queries.
        timeout: Optional upper bound, in seconds, on the total wait.
            ``None`` (the default) keeps the previous unbounded behaviour.

    Returns:
        ``True`` when the task reports success; ``False`` when it fails,
        returns no status data, or the timeout elapses.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        status_data = query_status(client, task_id)
        if not status_data:
            return False
        # "Status" may be missing or null; coerce before lower-casing.
        status = str(status_data.get("Status") or "").lower()
        if status == "success":
            return True
        # Accept both spellings of the failure state. Matching only "failed"
        # would poll forever if the service reports "Fail".
        # NOTE(review): confirm the exact value against the API reference.
        if status in {"fail", "failed"}:
            print(f"任务失败: {status_data}")
            return False
        if deadline is not None and time.monotonic() >= deadline:
            print(f"任务超时: {status_data}")
            return False
        print(f"任务状态: {status}, 等待中...")
        time.sleep(poll_interval)
|
||||
|
||||
|
||||
def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_step_size: int = 50) -> Dict:
    """Fetch one page of parse results starting at ``layout_num``.

    Returns the response data, or ``None`` when the response has none.
    """
    result_request = docmind_models.GetDocParserResultRequest(
        id=task_id,
        layout_step_size=layout_step_size,
        layout_num=layout_num,
    )
    payload = client.get_doc_parser_result(result_request).body.data
    if payload:
        return payload
    return None
|
||||
|
||||
|
||||
def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: int = 50) -> List[Dict]:
    """Page through the parse results and return every layout in order.

    Paging stops on an empty response, an empty ``layouts`` list, or a page
    shorter than ``layout_step_size``.
    """
    collected = []
    offset = 0
    last_page_seen = False
    while not last_page_seen:
        page = get_result(client, task_id, offset, layout_step_size)
        batch = page.get("layouts", []) if page else []
        if not batch:
            break
        collected.extend(batch)
        offset += len(batch)
        # A short page means the server has nothing more to return.
        last_page_seen = len(batch) < layout_step_size
    return collected
|
||||
|
||||
|
||||
# ===================== 文本处理 =====================
|
||||
def normalize_text(text: str) -> str:
    """Normalise whitespace in a piece of extracted text.

    Carriage returns become newlines, non-breaking and ideographic spaces
    become plain spaces, runs of newlines collapse to one newline, runs of
    spaces/tabs collapse to one space, and the result is stripped.

    Args:
        text: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The normalised text.
    """
    if not text:
        # A layout can carry an explicit null instead of a missing key.
        return ""
    text = text.replace("\r", "\n")
    # Use explicit escapes: an invisible space literal in source cannot be
    # told apart from an ordinary space and is easily lost when reformatting.
    text = text.replace("\u00a0", " ").replace("\u3000", " ")
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()
|
||||
|
||||
|
||||
def get_page(layout: Dict) -> int:
    """Return the layout's page: ``pageNum``, else ``pageNumber``, else 0."""
    if "pageNum" in layout:
        return layout["pageNum"]
    return layout.get("pageNumber", 0)
|
||||
|
||||
|
||||
def get_text(layout: Dict) -> str:
    """Return the layout's normalised ``text``, or ``markdownContent`` if empty."""
    primary = normalize_text(layout.get("text", ""))
    return primary or normalize_text(layout.get("markdownContent", ""))
|
||||
|
||||
|
||||
# ===================== 布局类型判断 =====================
|
||||
def is_title(layout: Dict) -> bool:
    """Tell whether the layout is a heading, by ``type`` or by ``subType``."""
    if layout.get("type") == "title":
        return True
    return layout.get("subType") in TITLE_SUBTYPES
|
||||
|
||||
|
||||
def is_text(layout: Dict) -> bool:
    """Tell whether the layout is body text with an accepted ``subType``."""
    if layout.get("type") != "text":
        return False
    # A missing subType is treated as "none".
    return layout.get("subType", "none") in TEXT_SUBTYPES
|
||||
|
||||
|
||||
def is_figure(layout: Dict) -> bool:
    """Tell whether the layout is figure content, by ``type`` or ``subType``."""
    if layout.get("type") in FIGURE_TYPES:
        return True
    return layout.get("subType") in FIGURE_SUBTYPES
|
||||
|
||||
|
||||
def is_table(layout: Dict) -> bool:
    """Tell whether the layout's ``type`` is ``"table"``."""
    layout_type = layout.get("type")
    return layout_type == "table"
|
||||
|
||||
|
||||
def is_toc_layout(layout: Dict) -> bool:
    """Tell whether the layout belongs to a table of contents.

    True when the text is one of ``TOC_TITLES``, or when the layout is on
    page 1 and matches the "<number> <title>....<page>" entry pattern.
    """
    content = get_text(layout)
    if content in TOC_TITLES:
        return True
    return get_page(layout) == 1 and bool(
        re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", content)
    )
|
||||
|
||||
|
||||
def extract_table_text(layout: Dict) -> str:
    """Flatten a table layout into text, one line per non-empty cell.

    Text fragments inside a cell are joined with a space; cells are joined
    with newlines.
    """
    lines = []
    for cell in layout.get("cells", []):
        fragments = [
            normalize_text(inner.get("text", ""))
            for inner in cell.get("layouts", [])
        ]
        fragments = [fragment for fragment in fragments if fragment]
        if fragments:
            lines.append(" ".join(fragments))
    return "\n".join(lines).strip()
|
||||
|
||||
|
||||
# ===================== 结构层:目录树 =====================
|
||||
def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
    """Collect one outline node per heading layout.

    Headings with empty text or a table-of-contents title are left out.
    """
    outline = []
    for heading in filter(is_title, layouts):
        title = get_text(heading)
        if title and title not in TOC_TITLES:
            outline.append(
                {
                    "unique_id": heading.get("uniqueId"),
                    "page": get_page(heading),
                    "index": heading.get("index", 0),
                    "level": heading.get("level", 0),
                    "title": title,
                    "type": heading.get("type"),
                    "sub_type": heading.get("subType"),
                }
            )
    return outline
|
||||
|
||||
|
||||
# ===================== 语义层:章节内容 =====================
|
||||
def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
    """Push a heading onto the section stack, closing deeper or equal levels.

    The stack is modified in place and also returned.
    """
    level = layout.get("level", 0)
    entry = {
        "level": level,
        "title": get_text(layout),
        "page": get_page(layout),
        "unique_id": layout.get("uniqueId"),
    }
    # Entries at the same or a deeper level are closed by the new heading.
    while section_stack and section_stack[-1]["level"] >= level:
        section_stack.pop()
    section_stack.append(entry)
    return section_stack
|
||||
|
||||
|
||||
def section_path_titles(section_stack: List[Dict]) -> List[str]:
    """Return the titles of the stacked sections, outermost first."""
    titles = []
    for entry in section_stack:
        titles.append(entry["title"])
    return titles
|
||||
|
||||
|
||||
def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id: int) -> int:
    """Merge pending text items into one ``section_text`` semantic block.

    Appends the merged block to ``semantic_blocks`` and returns the next
    block id. When there is nothing to merge, nothing is appended and
    ``block_id`` is returned unchanged.
    """
    merged_text = "\n".join(item["text"] for item in blocks if item["text"]).strip()
    if not merged_text:
        return block_id

    # Section metadata is taken from the first pending item.
    head = blocks[0]
    pages = [item["page"] for item in blocks]
    source_ids = [item["unique_id"] for item in blocks if item.get("unique_id")]
    semantic_blocks.append(
        {
            "semantic_id": f"semantic-{block_id}",
            "block_type": "section_text",
            "page_start": min(pages),
            "page_end": max(pages),
            "section_path": head["section_path"],
            "section_level": head["section_level"],
            "section_title": head["section_title"],
            "source_ids": source_ids,
            "text": merged_text,
        }
    )
    return block_id + 1
|
||||
|
||||
|
||||
def _single_layout_block(
    block_type: str,
    block_id: int,
    page: int,
    layout: Dict,
    text: str,
    section_path: List[str],
    section_level: int,
    section_title: str,
) -> Dict:
    """Build the semantic block for one stand-alone table or figure layout."""
    return {
        "semantic_id": f"semantic-{block_id}",
        "block_type": block_type,
        "page_start": page,
        "page_end": page,
        "section_path": section_path,
        "section_level": section_level,
        "section_title": section_title,
        "source_ids": [layout.get("uniqueId")],
        "text": text,
    }


def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
    """Group layouts into semantic blocks: section text, tables and figures.

    Headings update the current section path; consecutive text layouts are
    merged into one ``section_text`` block; each table or figure becomes a
    block of its own. Table-of-contents layouts are skipped, as is the rest
    of page 1 once one has been seen.
    """
    semantic_blocks = []
    section_stack = []
    pending = []
    block_id = 1
    in_toc = False

    for layout in layouts:
        text = get_text(layout)
        page = get_page(layout)

        if is_toc_layout(layout):
            in_toc = True
            continue
        if in_toc:
            if page == 1:
                continue
            # First layout off page 1 ends the skipping.
            in_toc = False

        if is_title(layout):
            block_id = flush_text_block(pending, semantic_blocks, block_id)
            pending = []
            section_stack = update_section_path(section_stack, layout)
            continue

        section_path = section_path_titles(section_stack)
        section_title = section_path[-1] if section_path else "未分类"
        section_level = len(section_path)

        table_like = is_table(layout)
        if table_like or is_figure(layout):
            # A table or figure closes any text collected so far.
            block_id = flush_text_block(pending, semantic_blocks, block_id)
            pending = []
            body = extract_table_text(layout) if table_like else text
            if body:
                semantic_blocks.append(
                    _single_layout_block(
                        "table" if table_like else "figure",
                        block_id,
                        page,
                        layout,
                        body,
                        section_path,
                        section_level,
                        section_title,
                    )
                )
                block_id += 1
            continue

        if is_text(layout) and text:
            pending.append(
                {
                    "page": page,
                    "text": text,
                    "unique_id": layout.get("uniqueId"),
                    "section_path": section_path,
                    "section_level": section_level,
                    "section_title": section_title,
                }
            )

    flush_text_block(pending, semantic_blocks, block_id)
    return semantic_blocks
|
||||
|
||||
|
||||
# ===================== 检索层:向量 chunks =====================
|
||||
def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> List[str]:
    """Split text into windows of at most ``max_chars`` characters.

    Consecutive windows share ``overlap_chars`` characters. Each piece is
    stripped and empty pieces are dropped.

    Args:
        text: Text to split; it is stripped first.
        max_chars: Maximum window length; must be positive whenever the
            text actually needs splitting.
        overlap_chars: Characters shared by neighbouring windows. Values
            outside ``[0, max_chars - 1]`` are clamped into that range so
            that every window advances.

    Returns:
        The list of pieces (empty for empty text).

    Raises:
        ValueError: If the text is non-empty and ``max_chars`` is not positive.
    """
    text = text.strip()
    if len(text) <= max_chars:
        return [text] if text else []
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    # With overlap >= max_chars the window start would never move forward
    # and the loop would never end, so clamp the overlap.
    overlap = min(max(overlap_chars, 0), max_chars - 1)
    step = max_chars - overlap

    parts = []
    for start in range(0, len(text), step):
        piece = text[start:start + max_chars].strip()
        if piece:
            parts.append(piece)
        if start + max_chars >= len(text):
            break
    return parts
|
||||
|
||||
|
||||
def build_vector_chunks(
    semantic_blocks: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> List[Dict]:
    """Cut every semantic block into retrieval chunks.

    Chunks are numbered from 1 across the whole document. Each one copies
    its block's metadata and gets an ``embedding_text`` made of a header
    (document title and, if any, section path) followed by the piece.
    """
    vector_chunks = []

    for block in semantic_blocks:
        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
        for piece_index, piece in enumerate(pieces, start=1):
            if block["section_path"]:
                header = f"标准:{doc_title}\n章节:{' > '.join(block['section_path'])}\n\n"
            else:
                header = f"标准:{doc_title}\n\n"
            # The running number is the count of chunks emitted so far plus one.
            chunk_index = len(vector_chunks) + 1
            vector_chunks.append(
                {
                    "doc_id": doc_id,
                    "doc_title": doc_title,
                    "chunk_id": f"chunk-{chunk_index}",
                    "chunk_index": chunk_index,
                    "semantic_id": block["semantic_id"],
                    "chunk_type": block["block_type"],
                    "piece_index": piece_index,
                    "page_start": block["page_start"],
                    "page_end": block["page_end"],
                    "section_path": block["section_path"],
                    "section_level": block["section_level"],
                    "section_title": block["section_title"],
                    "source_ids": block["source_ids"],
                    "text": piece,
                    "embedding_text": header + piece,
                }
            )

    return vector_chunks
|
||||
|
||||
|
||||
# ===================== 主转换函数 =====================
|
||||
def convert_layouts(
    layouts: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> Dict:
    """Convert raw layouts into the three-layer document structure.

    Returns a dict with the document id and title plus ``structure_nodes``,
    ``semantic_blocks`` and ``vector_chunks``.
    """
    outline = build_structure_nodes(layouts)
    blocks = build_semantic_blocks(layouts)
    chunks = build_vector_chunks(blocks, doc_id, doc_title, max_chars, overlap_chars)
    return {
        "doc_id": doc_id,
        "doc_title": doc_title,
        "structure_nodes": outline,
        "semantic_blocks": blocks,
        "vector_chunks": chunks,
    }
|
||||
|
||||
|
||||
# ===================== CLI 入口 =====================
|
||||
def main() -> None:
    """CLI entry point: parse a PDF with Docmind and write the chunk JSON.

    Steps: submit the job, wait for it, download all layouts, optionally
    dump the raw layouts, convert them into the three-layer structure and
    write the result.

    Raises:
        FileNotFoundError: If the PDF path does not exist.
        SystemExit: With status 1 when the parsing task does not succeed,
            so shell scripts and CI can detect the failure.
    """
    parser = argparse.ArgumentParser(description="阿里云文档智能解析 PDF,输出三层结构 chunks")
    parser.add_argument("pdf_path", help="PDF 文件路径")
    parser.add_argument("--out", default="vector_chunks.json", help="输出 JSON 文件路径")
    parser.add_argument("--layouts-out", dest="layouts_output", help="输出原始 layouts JSON")
    parser.add_argument("--doc-id", default="GB14747-2006", help="文档 ID")
    parser.add_argument("--doc-title", default="GB 14747—2006 儿童三轮车安全要求", help="文档标题")
    parser.add_argument("--max-chars", type=int, default=MAX_CHARS, help="单个检索 chunk 最大字符数")
    parser.add_argument("--overlap-chars", type=int, default=OVERLAP_CHARS, help="相邻检索 chunk 重叠字符数")
    parser.add_argument("--poll-interval", type=int, default=5, help="轮询间隔(秒)")
    args = parser.parse_args()

    pdf_path = Path(args.pdf_path).expanduser().resolve()
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF 文件不存在: {pdf_path}")

    # 1. Submit the parsing job.
    client = init_client()
    print(f"提交任务: {pdf_path}")
    task_id = submit_job(client, str(pdf_path))
    print(f"任务 ID: {task_id}")

    # 2. Wait for the job to finish.
    print("等待任务完成...")
    if not wait_for_completion(client, task_id, args.poll_interval):
        print("任务失败,退出")
        # A plain return would exit with status 0 and hide the failure.
        raise SystemExit(1)

    # 3. Download every layout.
    print("获取解析结果...")
    layouts = collect_all_results(client, task_id)
    print(f"获取到 {len(layouts)} 个布局块")

    # 4. Optionally dump the raw layouts.
    if args.layouts_output:
        layouts_path = Path(args.layouts_output).expanduser().resolve()
        # Create missing directories so the downloaded result is not lost.
        layouts_path.parent.mkdir(parents=True, exist_ok=True)
        layouts_path.write_text(json.dumps(layouts, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"原始 layouts 已写入: {layouts_path}")

    # 5. Convert into the three-layer structure.
    print("转换为三层结构...")
    data = convert_layouts(
        layouts,
        doc_id=args.doc_id,
        doc_title=args.doc_title,
        max_chars=args.max_chars,
        overlap_chars=args.overlap_chars,
    )

    # 6. Write the result.
    output_path = Path(args.out).expanduser().resolve()
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"结构层节点数: {len(data['structure_nodes'])}")
    print(f"语义层块数: {len(data['semantic_blocks'])}")
    print(f"检索层块数: {len(data['vector_chunks'])}")
    print(f"输出文件: {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
115
backend/aliyun_parser/rebuild_milvus_collection.py
Normal file
115
backend/aliyun_parser/rebuild_milvus_collection.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""Rebuild the migrated Milvus collection from saved vector chunks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, utility
|
||||
|
||||
|
||||
DEFAULT_COLLECTION = "regulations_dense_1024_v2"
|
||||
DEFAULT_DIM = 1024
|
||||
|
||||
|
||||
def build_collection(name: str, dim: int) -> Collection:
    """Drop any existing collection called ``name`` and create it afresh.

    The new collection gets the fixed schema below and an IVF_FLAT / COSINE
    index on the ``embedding`` field.
    """
    if utility.has_collection(name):
        utility.drop_collection(name)

    def varchar(field: str, length: int, **extra) -> FieldSchema:
        return FieldSchema(name=field, dtype=DataType.VARCHAR, max_length=length, **extra)

    def int64(field: str) -> FieldSchema:
        return FieldSchema(name=field, dtype=DataType.INT64)

    # Field order matters: rows are inserted as positional columns.
    fields = [
        varchar("id", 128, is_primary=True, auto_id=False),
        varchar("doc_id", 64),
        varchar("doc_title", 256),
        varchar("chunk_id", 128),
        int64("chunk_index"),
        int64("piece_index"),
        varchar("text", 65535),
        varchar("embedding_text", 65535),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
        varchar("semantic_id", 128),
        varchar("chunk_type", 64),
        int64("page_start"),
        int64("page_end"),
        int64("section_level"),
        varchar("source_ids", 4096),
        varchar("section_path", 4096),
        varchar("section_title", 512),
        varchar("metadata_json", 65535),
        int64("created_at"),
    ]
    schema = CollectionSchema(
        fields=fields,
        description="Dense-only regulations index",
        enable_dynamic_field=False,
    )
    collection = Collection(name=name, schema=schema)

    index_params = {
        "metric_type": "COSINE",
        "index_type": "IVF_FLAT",
        "params": {"nlist": 128},
    }
    collection.create_index(field_name="embedding", index_params=index_params)
    return collection
|
||||
|
||||
|
||||
def load_chunks(payload_path: Path) -> list[dict]:
    """Read the vector chunks from a JSON payload file.

    The payload is either a list of chunks or a dict whose ``vector_chunks``
    key holds that list (a missing key yields an empty list).

    Raises:
        ValueError: If the resolved chunks value is not a list.
    """
    with payload_path.open(encoding="utf-8") as handle:
        payload = json.load(handle)
    chunks = payload.get("vector_chunks", []) if isinstance(payload, dict) else payload
    if isinstance(chunks, list):
        return chunks
    raise ValueError("vector chunk payload must be a list or a dict containing vector_chunks")
|
||||
|
||||
|
||||
def main() -> None:
    """Rebuild the target collection from a vector chunk payload.

    The payload is read and validated before Milvus is touched, because
    ``build_collection`` drops the existing collection: a missing file,
    malformed JSON or a chunk without a usable embedding must not destroy
    the current index.

    Raises:
        ValueError: If a chunk is not an object, lacks a required field, or
            its embedding length differs from ``--dim``.
    """
    parser = argparse.ArgumentParser(description="Rebuild the migrated Milvus collection.")
    parser.add_argument("--host", default="127.0.0.1", help="Milvus host")
    parser.add_argument("--port", default="19530", help="Milvus port")
    parser.add_argument("--collection", default=DEFAULT_COLLECTION, help="Milvus collection name")
    parser.add_argument("--dim", type=int, default=DEFAULT_DIM, help="Embedding dimension")
    parser.add_argument("--payload", required=True, help="Path to vector_chunks.json or a compatible JSON file")
    args = parser.parse_args()

    # Validate first; nothing below this block runs on a bad payload.
    chunks = load_chunks(Path(args.payload))
    required = ("chunk_id", "doc_id", "doc_title", "embedding")
    for position, chunk in enumerate(chunks):
        if not isinstance(chunk, dict):
            raise ValueError(f"chunk #{position} is not a JSON object")
        missing = [key for key in required if key not in chunk]
        if missing:
            raise ValueError(f"chunk #{position} is missing required field(s): {', '.join(missing)}")
        if len(chunk["embedding"]) != args.dim:
            raise ValueError(
                f"chunk #{position} has embedding length {len(chunk['embedding'])}, expected {args.dim}"
            )

    connections.connect("default", host=args.host, port=args.port)
    collection = build_collection(args.collection, args.dim)
    if not chunks:
        print("No vector chunks found; collection was created but remains empty.")
        return

    # Columns are positional and must follow the schema's field order.
    data = [
        [chunk["chunk_id"] for chunk in chunks],
        [chunk["doc_id"] for chunk in chunks],
        [chunk["doc_title"] for chunk in chunks],
        [chunk["chunk_id"] for chunk in chunks],
        [int(chunk.get("chunk_index", 0) or 0) for chunk in chunks],
        [int(chunk.get("piece_index", 0) or 0) for chunk in chunks],
        [str(chunk.get("text", ""))[:65535] for chunk in chunks],
        [str(chunk.get("embedding_text", chunk.get("text", "")))[:65535] for chunk in chunks],
        [chunk["embedding"] for chunk in chunks],
        [str(chunk.get("semantic_id", "")) for chunk in chunks],
        [str(chunk.get("chunk_type", "")) for chunk in chunks],
        [int(chunk.get("page_start", 0) or 0) for chunk in chunks],
        [int(chunk.get("page_end", 0) or 0) for chunk in chunks],
        [int(chunk.get("section_level", 0) or 0) for chunk in chunks],
        [json.dumps(chunk.get("source_ids", []), ensure_ascii=False) for chunk in chunks],
        [json.dumps(chunk.get("section_path", []), ensure_ascii=False) for chunk in chunks],
        [str(chunk.get("section_title", "")) for chunk in chunks],
        [json.dumps(chunk, ensure_ascii=False) for chunk in chunks],
        [int(chunk.get("created_at", 0) or 0) for chunk in chunks],
    ]
    collection.insert(data)
    collection.flush()
    collection.load()
    print(f"Rebuilt collection {args.collection} with {len(chunks)} chunks.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
122
backend/aliyun_parser/schema.sql
Normal file
122
backend/aliyun_parser/schema.sql
Normal file
@@ -0,0 +1,122 @@
|
||||
-- 法规文档向量检索系统数据库表结构
|
||||
-- PostgreSQL
|
||||
|
||||
-- ==================== 文档表 ====================
|
||||
-- Document metadata; doc_id is the key the other tables reference.
CREATE TABLE documents (
    id              SERIAL PRIMARY KEY,
    doc_id          VARCHAR(128) UNIQUE NOT NULL,   -- unique document key, e.g. "GB14747-2006"
    title           VARCHAR(512) NOT NULL,          -- document title
    doc_type        VARCHAR(32),                    -- document type (标准/法规/规范: standard/regulation/specification)
    standard_number VARCHAR(64),                    -- standard number, e.g. "GB 14747-2006"
    publish_date    DATE,                           -- publication date
    implement_date  DATE,                           -- date of entry into force
    status          VARCHAR(32),                    -- status (现行/废止/修订: in force/repealed/revised)
    source_url      VARCHAR(512),                   -- source URL
    file_path       VARCHAR(512),                   -- local PDF path
    file_size       BIGINT,                         -- size in bytes; BIGINT because INT overflows above 2 GiB
    upload_time     TIMESTAMP DEFAULT NOW(),        -- upload time
    created_at      TIMESTAMP DEFAULT NOW(),
    updated_at      TIMESTAMP DEFAULT NOW()
);
|
||||
|
||||
COMMENT ON TABLE documents IS '文档元数据表';
|
||||
COMMENT ON COLUMN documents.doc_id IS '文档唯一标识,用于关联 Milvus 和其他表';
|
||||
COMMENT ON COLUMN documents.standard_number IS '标准编号,如 GB 14747-2006';
|
||||
|
||||
-- ==================== 章节结构表 ====================
|
||||
-- Section outline of a document, used for table-of-contents navigation.
CREATE TABLE sections (
    id         SERIAL       PRIMARY KEY,
    doc_id     VARCHAR(128) NOT NULL,
    unique_id  VARCHAR(64)  NOT NULL,     -- unique id returned by the Aliyun parser
    level      INTEGER      NOT NULL,     -- depth: 1, 2, 3...
    title      VARCHAR(512) NOT NULL,     -- section title
    page       INTEGER,                   -- page the section is on
    index      INTEGER,                   -- order within the page
    parent_id  INTEGER,                   -- parent section id (tree structure)
    created_at TIMESTAMP    DEFAULT NOW(),

    CONSTRAINT fk_sections_doc_id
        FOREIGN KEY (doc_id) REFERENCES documents (doc_id),
    CONSTRAINT fk_sections_parent_id
        FOREIGN KEY (parent_id) REFERENCES sections (id),
    CONSTRAINT uq_sections_doc_unique
        UNIQUE (doc_id, unique_id)
);
|
||||
|
||||
COMMENT ON TABLE sections IS '章节结构表,用于目录导航';
|
||||
COMMENT ON COLUMN sections.parent_id IS '父章节 ID,构建树形结构';
|
||||
COMMENT ON COLUMN sections.level IS '层级深度,1 为最顶层';
|
||||
|
||||
-- ==================== 语义块表 ====================
|
||||
-- Semantic blocks: the full, unsplit content behind the retrieval chunks.
CREATE TABLE semantic_blocks (
    id            SERIAL       PRIMARY KEY,
    doc_id        VARCHAR(128) NOT NULL,
    semantic_id   VARCHAR(64)  NOT NULL,     -- semantic block identifier
    block_type    VARCHAR(32)  NOT NULL,     -- section_text / table / figure
    page_start    INTEGER      NOT NULL,     -- first page
    page_end      INTEGER      NOT NULL,     -- last page
    section_id    INTEGER,                   -- owning section
    section_title VARCHAR(512),              -- section title (denormalised for queries)
    section_level INTEGER,                   -- section depth
    source_ids    JSONB,                     -- original layout ids (JSON array)
    text          TEXT         NOT NULL,     -- full content, not split
    created_at    TIMESTAMP    DEFAULT NOW(),

    CONSTRAINT fk_semantic_blocks_doc_id
        FOREIGN KEY (doc_id) REFERENCES documents (doc_id),
    CONSTRAINT fk_semantic_blocks_section_id
        FOREIGN KEY (section_id) REFERENCES sections (id),
    CONSTRAINT uq_semantic_blocks_doc_semantic
        UNIQUE (doc_id, semantic_id)
);
|
||||
|
||||
COMMENT ON TABLE semantic_blocks IS '语义块表,用于邻域扩展,恢复完整内容';
|
||||
COMMENT ON COLUMN semantic_blocks.block_type IS '类型:section_text(正文)、table(表格)、figure(图示)';
|
||||
COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组';
|
||||
COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容,未被切分';
|
||||
|
||||
-- ==================== 向量块元数据表 ====================
|
||||
-- Metadata of the vector chunks stored in Milvus.
CREATE TABLE vector_chunks (
    id            SERIAL       PRIMARY KEY,
    doc_id        VARCHAR(128) NOT NULL,
    chunk_id      VARCHAR(64)  NOT NULL,     -- Milvus primary key
    semantic_id   VARCHAR(64)  NOT NULL,     -- owning semantic block
    chunk_index   INTEGER      NOT NULL,     -- chunk number (document-wide)
    piece_index   INTEGER,                   -- chunk number within the semantic block
    page_start    INTEGER,
    page_end      INTEGER,
    section_title VARCHAR(512),
    text          VARCHAR(2048),             -- chunk text (optional, shortened for display)
    source_ids    JSONB,                     -- original layout ids (JSON array)
    created_at    TIMESTAMP    DEFAULT NOW(),

    CONSTRAINT fk_vector_chunks_doc_id
        FOREIGN KEY (doc_id) REFERENCES documents (doc_id),
    CONSTRAINT fk_vector_chunks_semantic_id
        FOREIGN KEY (doc_id, semantic_id)
        REFERENCES semantic_blocks (doc_id, semantic_id),
    CONSTRAINT uq_vector_chunks_doc_chunk
        UNIQUE (doc_id, chunk_id)
);
|
||||
|
||||
COMMENT ON TABLE vector_chunks IS '向量块元数据表,用于快速关联查询';
|
||||
COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键';
|
||||
COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号,用于按序拼接';
|
||||
|
||||
-- ==================== 索引 ====================
|
||||
-- sections
CREATE INDEX idx_sections_doc_id
    ON sections (doc_id);
CREATE INDEX idx_sections_parent_id
    ON sections (parent_id);
CREATE INDEX idx_sections_level
    ON sections (level);

-- semantic_blocks
CREATE INDEX idx_semantic_blocks_doc_id
    ON semantic_blocks (doc_id);
CREATE INDEX idx_semantic_blocks_section_id
    ON semantic_blocks (section_id);
CREATE INDEX idx_semantic_blocks_block_type
    ON semantic_blocks (block_type);
CREATE INDEX idx_semantic_blocks_semantic_id
    ON semantic_blocks (semantic_id);

-- vector_chunks
CREATE INDEX idx_vector_chunks_doc_id
    ON vector_chunks (doc_id);
CREATE INDEX idx_vector_chunks_semantic_id
    ON vector_chunks (semantic_id);
CREATE INDEX idx_vector_chunks_chunk_id
    ON vector_chunks (chunk_id);
|
||||
|
||||
-- ==================== 触发器:自动更新 updated_at ====================
|
||||
-- Trigger function: stamp updated_at with the current time on every update.
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- The function above is re-runnable (CREATE OR REPLACE); drop the trigger
-- first so that re-running this part of the script does not fail on an
-- already existing trigger.
DROP TRIGGER IF EXISTS tr_documents_updated_at ON documents;

CREATE TRIGGER tr_documents_updated_at
    BEFORE UPDATE ON documents
    FOR EACH ROW EXECUTE FUNCTION update_updated_at();
|
||||
327
backend/aliyun_parser/upload_to_milvus.py
Normal file
327
backend/aliyun_parser/upload_to_milvus.py
Normal file
@@ -0,0 +1,327 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
将 vector_chunks.json 向量化并上传到 Milvus 和 PostgreSQL
|
||||
使用中转站的 OpenAI 兼容 API
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
from pymilvus import (
|
||||
connections,
|
||||
Collection,
|
||||
FieldSchema,
|
||||
CollectionSchema,
|
||||
DataType,
|
||||
utility,
|
||||
)
|
||||
from openai import OpenAI
|
||||
|
||||
# ===================== 配置 =====================
|
||||
# 中转站配置
|
||||
# Secrets are read from the environment so that they never live in source
# control. NOTE(review): the relay API key and database password that used
# to be hard-coded here have been committed and should be rotated.
import os

RELAY_BASE_URL = "http://6.86.80.4:30080/v1"
RELAY_API_KEY = os.environ.get("RELAY_API_KEY", "")
EMBEDDING_MODEL = "text-embedding-v3"  # embedding model offered by the relay

# Milvus settings
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"
COLLECTION_NAME = "regulation_chunks"

# PostgreSQL settings
PG_HOST = "6.86.80.10"
PG_PORT = 5432
PG_USER = "postgresql"
PG_PASSWORD = os.environ.get("PGPASSWORD", "")
PG_DATABASE = "postgres"
|
||||
|
||||
|
||||
# ===================== Embedding =====================
|
||||
def get_openai_client(api_key: str, base_url: str) -> OpenAI:
    """Create an OpenAI client that talks to the relay at ``base_url``."""
    relay_client = OpenAI(base_url=base_url, api_key=api_key)
    return relay_client
|
||||
|
||||
|
||||
def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10) -> List[List[float]]:
    """Embed ``texts`` in batches and return the vectors in response order.

    One request is sent per batch of ``batch_size`` texts, using the
    module-level ``EMBEDDING_MODEL``; progress is printed per batch.
    """
    vectors = []
    offsets = range(0, len(texts), batch_size)
    total_batches = (len(texts) - 1) // batch_size + 1

    for batch_no, offset in enumerate(offsets, start=1):
        print(f"Embedding batch {batch_no}/{total_batches}...")
        response = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=texts[offset:offset + batch_size],
        )
        vectors.extend(item.embedding for item in response.data)

    return vectors
|
||||
|
||||
|
||||
# ===================== Milvus =====================
|
||||
def init_milvus(host: str, port: str):
    """Open the ``default`` Milvus connection and report it."""
    connections.connect("default", host=host, port=port)
    message = f"已连接 Milvus: {host}:{port}"
    print(message)
|
||||
|
||||
|
||||
def create_collection(name: str, dim: int) -> Collection:
    """Create the chunk collection, dropping an existing one of the same name.

    The collection gets an IVF_FLAT / COSINE index on ``embedding``.
    """
    if utility.has_collection(name):
        print(f"Collection '{name}' 已存在,删除重建")
        utility.drop_collection(name)

    def varchar(field: str, length: int, **extra) -> FieldSchema:
        return FieldSchema(name=field, dtype=DataType.VARCHAR, max_length=length, **extra)

    def int64(field: str) -> FieldSchema:
        return FieldSchema(name=field, dtype=DataType.INT64)

    # Field order matters: rows are inserted as positional columns.
    fields = [
        varchar("chunk_id", 64, is_primary=True),
        varchar("doc_id", 128),
        varchar("doc_title", 512),
        int64("chunk_index"),
        varchar("semantic_id", 64),
        varchar("chunk_type", 32),
        int64("page_start"),
        int64("page_end"),
        varchar("section_title", 512),
        varchar("text", 2048),
        varchar("source_ids", 4096),  # JSON string
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
    ]
    collection = Collection(name, CollectionSchema(fields, description="法规文档检索 chunks"))

    # Vector index (IVF_FLAT; the original note says it suits small to
    # medium collections).
    collection.create_index(
        "embedding",
        {
            "metric_type": "COSINE",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 128},
        },
    )
    print(f"Collection '{name}' 创建完成,索引已建立")

    return collection
|
||||
|
||||
|
||||
def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[List[float]]):
    """Insert chunk metadata and their vectors into Milvus, then flush.

    Data is sent column-wise, in the same order as the fields of the
    collection schema; ``source_ids`` is stored as a JSON string.

    Args:
        collection: Target Milvus collection.
        chunks: Chunk dicts; every scalar schema field must be present as a key
            (``source_ids`` is optional and defaults to an empty list).
        embeddings: One vector per chunk, in the same order as ``chunks``.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length.
    """
    # Fail early with a clear message instead of sending misaligned columns.
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"chunks 与 embeddings 数量不一致: {len(chunks)} != {len(embeddings)}"
        )

    # Scalar columns, listed in schema order.
    scalar_fields = (
        "chunk_id",
        "doc_id",
        "doc_title",
        "chunk_index",
        "semantic_id",
        "chunk_type",
        "page_start",
        "page_end",
        "section_title",
        "text",
    )
    columns = [[chunk[field] for chunk in chunks] for field in scalar_fields]
    columns.append([json.dumps(chunk.get("source_ids", [])) for chunk in chunks])
    columns.append(embeddings)

    collection.insert(columns)
    collection.flush()
    print(f"已插入 {len(chunks)} 个 chunks")
|
||||
|
||||
|
||||
def load_collection(collection: Collection):
    """Load the collection into memory (required before it can be searched)."""
    collection.load()
    print("Collection 已加载到内存")
|
||||
|
||||
|
||||
# ===================== PostgreSQL =====================
|
||||
def get_pg_connection(host: str, port: int, user: str, password: str, database: str):
    """Open a PostgreSQL connection and report which server/database it targets.

    Args:
        host: PostgreSQL server host.
        port: PostgreSQL server port.
        user: Login user.
        password: Login password.
        database: Database name.

    Returns:
        The open psycopg2 connection; the caller is responsible for closing it.
    """
    connect_kwargs = {
        "host": host,
        "port": port,
        "user": user,
        "password": password,
        "database": database,
    }
    pg_conn = psycopg2.connect(**connect_kwargs)
    print(f"已连接 PostgreSQL: {host}:{port}/{database}")
    return pg_conn
|
||||
|
||||
|
||||
def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
    """Upsert the document, its semantic blocks and vector-chunk metadata.

    All statements run in a single transaction: it is committed at the end and
    rolled back if anything fails.

    Args:
        conn: Open PostgreSQL connection.
        chunks: Vector chunk dicts to store in ``vector_chunks``.
        doc_data: Full parsed document payload; ``doc_id`` and ``doc_title``
            are required, ``standard_number`` and ``semantic_blocks`` optional.

    Raises:
        Exception: Any error raised while building or executing the statements
            is re-raised unchanged after the transaction has been rolled back.
    """
    cursor = conn.cursor()

    try:
        doc_id = doc_data["doc_id"]

        # 1. Upsert the document row.
        cursor.execute("""
            INSERT INTO documents (doc_id, title, standard_number, upload_time)
            VALUES (%s, %s, %s, NOW())
            ON CONFLICT (doc_id) DO UPDATE SET title = EXCLUDED.title, updated_at = NOW()
        """, (doc_id, doc_data["doc_title"], doc_data.get("standard_number")))

        # 2. Upsert semantic blocks.
        semantic_blocks = doc_data.get("semantic_blocks", [])
        if semantic_blocks:
            block_rows = [
                (
                    doc_id,
                    block["semantic_id"],
                    block["block_type"],
                    block["page_start"],
                    block["page_end"],
                    block.get("section_title"),
                    block.get("section_level"),
                    json.dumps(block.get("source_ids", [])),
                    block["text"],
                )
                for block in semantic_blocks
            ]
            # On conflict refresh every non-key column, not just ``text``;
            # otherwise a re-import leaves stale page ranges, titles and
            # source ids next to the new text.
            execute_values(
                cursor,
                """
                INSERT INTO semantic_blocks
                (doc_id, semantic_id, block_type, page_start, page_end, section_title, section_level, source_ids, text)
                VALUES %s
                ON CONFLICT (doc_id, semantic_id) DO UPDATE SET
                    block_type = EXCLUDED.block_type,
                    page_start = EXCLUDED.page_start,
                    page_end = EXCLUDED.page_end,
                    section_title = EXCLUDED.section_title,
                    section_level = EXCLUDED.section_level,
                    source_ids = EXCLUDED.source_ids,
                    text = EXCLUDED.text
                """,
                block_rows,
            )
            print(f"已插入 {len(semantic_blocks)} 个语义块")

        # 3. Upsert vector-chunk metadata (the vectors themselves live in Milvus).
        chunk_rows = [
            (
                doc_id,
                chunk["chunk_id"],
                chunk["semantic_id"],
                chunk["chunk_index"],
                chunk.get("piece_index"),
                chunk["page_start"],
                chunk["page_end"],
                chunk.get("section_title"),
                chunk["text"],
                json.dumps(chunk.get("source_ids", [])),
            )
            for chunk in chunks
        ]
        execute_values(
            cursor,
            """
            INSERT INTO vector_chunks
            (doc_id, chunk_id, semantic_id, chunk_index, piece_index, page_start, page_end, section_title, text, source_ids)
            VALUES %s
            ON CONFLICT (doc_id, chunk_id) DO UPDATE SET
                semantic_id = EXCLUDED.semantic_id,
                chunk_index = EXCLUDED.chunk_index,
                piece_index = EXCLUDED.piece_index,
                page_start = EXCLUDED.page_start,
                page_end = EXCLUDED.page_end,
                section_title = EXCLUDED.section_title,
                text = EXCLUDED.text,
                source_ids = EXCLUDED.source_ids
            """,
            chunk_rows,
        )
        print(f"已插入 {len(chunks)} 个向量块元数据")

        conn.commit()
        print("PostgreSQL 数据插入完成")

    except Exception:
        conn.rollback()
        # Bare ``raise`` keeps the original traceback intact.
        raise
    finally:
        cursor.close()
|
||||
|
||||
|
||||
# ===================== 主流程 =====================
|
||||
def load_data(file_path: Path) -> Dict:
    """Read ``vector_chunks.json`` (UTF-8) and return the decoded payload.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON document.
    """
    with file_path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def upload_to_milvus_and_pg(
    chunks_file: str,
    api_key: str,
    base_url: str,
    milvus_host: str,
    milvus_port: str,
    collection_name: str,
    batch_size: int,
    pg_host: str,
    pg_port: int,
    pg_user: str,
    pg_password: str,
    pg_database: str,
):
    """Embed the chunks of a ``vector_chunks.json`` file and store them.

    Vectors and retrieval fields go to Milvus (the collection is dropped and
    rebuilt), while the document, semantic blocks and chunk metadata go to
    PostgreSQL.

    Args:
        chunks_file: Path to the ``vector_chunks.json`` file.
        api_key: API key for the embedding endpoint.
        base_url: Base URL of the embedding endpoint.
        milvus_host: Milvus host.
        milvus_port: Milvus port.
        collection_name: Milvus collection to (re)create.
        batch_size: Number of texts sent per embedding request.
        pg_host: PostgreSQL host.
        pg_port: PostgreSQL port.
        pg_user: PostgreSQL user.
        pg_password: PostgreSQL password.
        pg_database: PostgreSQL database name.

    Raises:
        FileNotFoundError: If ``chunks_file`` does not exist.
        ValueError: If the file has no ``vector_chunks`` or the number of
            returned embeddings differs from the number of chunks.
    """
    # 1. Load the full payload.
    chunks_path = Path(chunks_file).expanduser().resolve()
    if not chunks_path.exists():
        raise FileNotFoundError(f"文件不存在: {chunks_path}")

    data = load_data(chunks_path)
    chunks = data.get("vector_chunks", [])
    if not chunks:
        raise ValueError("vector_chunks 为空")
    print(f"加载 {len(chunks)} 个 chunks")

    # 2. Open connections.
    client = get_openai_client(api_key, base_url)
    init_milvus(milvus_host, milvus_port)
    pg_conn = get_pg_connection(pg_host, pg_port, pg_user, pg_password, pg_database)

    # From here on the PostgreSQL connection must be closed on every path.
    try:
        # 3. Compute embeddings.
        texts = [c["embedding_text"] for c in chunks]
        embeddings = get_embeddings_batch(client, texts, batch_size)
        print(f"生成 {len(embeddings)} 个向量")

        # Check before create_collection(), which drops the existing
        # collection: a short embedding result must not wipe stored data.
        if len(embeddings) != len(chunks):
            raise ValueError(
                f"embedding 数量与 chunks 数量不一致: {len(embeddings)} != {len(chunks)}"
            )

        # 4. Derive the embedding dimension from the first vector.
        embedding_dim = len(embeddings[0])
        print(f"Embedding 维度: {embedding_dim}")

        # 5. Rebuild the collection and insert into Milvus.
        collection = create_collection(collection_name, embedding_dim)
        insert_chunks(collection, chunks, embeddings)
        load_collection(collection)

        # 6. Insert into PostgreSQL.
        insert_chunks_to_pg(pg_conn, chunks, data)
    finally:
        # 7. Close the connection, even when a step above failed.
        pg_conn.close()

    print("上传完成!")
|
||||
|
||||
|
||||
# ===================== CLI =====================
|
||||
def main():
    """Parse the command-line options and run the Milvus + PostgreSQL upload."""
    parser = argparse.ArgumentParser(description="将 vector_chunks 向量化并上传到 Milvus 和 PostgreSQL")
    parser.add_argument("chunks_file", help="vector_chunks.json 文件路径")

    # Optional flags, registered in declaration order.
    option_specs = [
        ("--api-key", {"default": RELAY_API_KEY, "help": "中转站 API Key"}),
        ("--base-url", {"default": RELAY_BASE_URL, "help": "中转站 Base URL"}),
        ("--milvus-host", {"default": MILVUS_HOST, "help": "Milvus host"}),
        ("--milvus-port", {"default": MILVUS_PORT, "help": "Milvus port"}),
        ("--collection", {"default": COLLECTION_NAME, "help": "Milvus collection 名称"}),
        ("--batch-size", {"type": int, "default": 10, "help": "Embedding 批量大小(中转站限制最大10)"}),
        ("--pg-host", {"default": PG_HOST, "help": "PostgreSQL host"}),
        ("--pg-port", {"type": int, "default": PG_PORT, "help": "PostgreSQL port"}),
        ("--pg-user", {"default": PG_USER, "help": "PostgreSQL user"}),
        ("--pg-password", {"default": PG_PASSWORD, "help": "PostgreSQL password"}),
        ("--pg-database", {"default": PG_DATABASE, "help": "PostgreSQL database"}),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    # Namespace attribute names match the upload function's keyword names,
    # except ``collection``, which it calls ``collection_name``.
    options = vars(parser.parse_args())
    options["collection_name"] = options.pop("collection")

    upload_to_milvus_and_pg(**options)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
5212
backend/aliyun_parser/vector_chunks.json
Normal file
5212
backend/aliyun_parser/vector_chunks.json
Normal file
File diff suppressed because it is too large
Load Diff
263
backend/aliyun_parser/嵌入和召回.md
Normal file
263
backend/aliyun_parser/嵌入和召回.md
Normal file
@@ -0,0 +1,263 @@
|
||||
# 文档解析与向量检索说明
|
||||
|
||||
## 相关文件
|
||||
|
||||
- `aliyun_doc_parser.py`:调用阿里云文档智能解析 PDF,生成原始 `layouts.json`
|
||||
- `layouts_to_vector_chunks.py`:把 `layouts.json` 转成适合向量数据库入库的三层结构
|
||||
- `layouts.json`:阿里云返回的原始布局结果
|
||||
- `vector_chunks.json`:转换后的结构化输出
|
||||
|
||||
## 一、`layouts.json` 的结构
|
||||
|
||||
`layouts.json` 顶层是一个数组,每个元素代表一个布局块(layout)。常见字段如下:
|
||||
|
||||
- `type`:主类型,例如 `title`、`text`、`table`、`figure`
|
||||
- `subType`:更细的语义类型,例如 `doc_title`、`para_title`、`para`、`picture`、`pic_title`、`pic_caption`
|
||||
- `text`:当前布局块的纯文本
|
||||
- `markdownContent`:带 markdown 标记的文本
|
||||
- `pageNum`:页码
|
||||
- `index`:页内顺序
|
||||
- `level`:标题层级
|
||||
- `uniqueId`:布局块唯一标识
|
||||
- `blocks`:更细粒度的文本与样式信息
|
||||
- `cells`:表格单元格,仅 `table` 类型存在
|
||||
|
||||
这个结构不是简单的 OCR 文本流,而是已经带有版面理解和语义分类结果的结构化数据。
|
||||
|
||||
## 二、推荐的三层转换结构
|
||||
|
||||
### 1. 结构层 `structure_nodes`
|
||||
|
||||
结构层用于恢复文档标题树,不直接作为最终向量检索单元。
|
||||
|
||||
示例:
|
||||
|
||||
- `1 范围`
|
||||
- `2 规范性引用文件`
|
||||
- `3 术语和定义`
|
||||
- `3.1 儿童三轮车`
|
||||
- `3.2 轮距`
|
||||
|
||||
结构层主要用于给下游 chunk 绑定 `section_path`。
|
||||
|
||||
### 2. 语义层 `semantic_blocks`
|
||||
|
||||
语义层是按文档意义聚合后的内容块,主要分为三类:
|
||||
|
||||
- `section_text`:同一章节下连续正文聚合而成
|
||||
- `table`:表格内容单独成块
|
||||
- `figure`:图、图名、图注等单独成块
|
||||
|
||||
这一层比单 layout 更适合做语义理解,也适合后续做上下文扩展。
|
||||
|
||||
### 3. 检索层 `vector_chunks`
|
||||
|
||||
检索层是最终写进向量数据库的 chunk。
|
||||
|
||||
处理方式:
|
||||
|
||||
- 对 `semantic_blocks` 中较短的块直接入库
|
||||
- 对较长的块按 `max_chars` 再切分
|
||||
- 相邻切片保留 `overlap_chars` 重叠
|
||||
- 每个 chunk 都带完整 metadata,便于后续过滤、重排和邻域扩展
|
||||
|
||||
## 三、当前转换脚本做了什么
|
||||
|
||||
`layouts_to_vector_chunks.py` 当前已经实现:
|
||||
|
||||
1. 过滤目录页噪声(如 `目次`)
|
||||
2. 根据标题层级维护章节路径
|
||||
3. 将正文聚合成 `section_text`
|
||||
4. 将表格单独转成 `table`
|
||||
5. 将图相关内容单独转成 `figure`
|
||||
6. 对长文本继续切分为最终 `vector_chunks`
|
||||
7. 为每个检索 chunk 生成 `embedding_text`
|
||||
|
||||
## 四、为什么不要直接按 layout 入库
|
||||
|
||||
如果把 `layouts.json` 的每条 layout 直接做向量:
|
||||
|
||||
- 颗粒度太碎
|
||||
- 标题和正文容易分离
|
||||
- 表格会丢失结构上下文
|
||||
- 图示信息无法完整表达
|
||||
- 检索命中结果噪声较大
|
||||
|
||||
对于标准文档,最合适的单位通常不是“句子”,而是“条款语义块”。
|
||||
|
||||
## 五、建议的入库字段
|
||||
|
||||
建议向量数据库每条记录至少保存:
|
||||
|
||||
- `embedding_text`:用于生成向量
|
||||
- `text`:原始 chunk 文本
|
||||
- `chunk_id`
|
||||
- `semantic_id`
|
||||
- `chunk_type`:`section_text` / `table` / `figure`
|
||||
- `section_path`
|
||||
- `section_title`
|
||||
- `section_level`
|
||||
- `page_start`
|
||||
- `page_end`
|
||||
- `doc_id`
|
||||
- `doc_title`
|
||||
- `source_ids`
|
||||
|
||||
其中:
|
||||
|
||||
- 向量化字段:`embedding_text`
|
||||
- 展示字段:`text`
|
||||
- 检索增强字段:其余 metadata
|
||||
|
||||
## 六、推荐的检索方式
|
||||
|
||||
不要只做最简单的 top-k 向量搜索,建议采用:
|
||||
|
||||
**向量召回 + metadata 重排 + 邻域扩展**
|
||||
|
||||
### 1. 向量召回
|
||||
|
||||
使用 `vector_chunks[*].embedding_text` 做 embedding,并在向量数据库中检索 top 10 ~ 15 条。
|
||||
|
||||
查询时可以对用户问题做轻微改写,例如:
|
||||
|
||||
原问题:
|
||||
|
||||
`儿童三轮车的定义是什么?`
|
||||
|
||||
可改写为:
|
||||
|
||||
`请检索 GB 14747—2006 儿童三轮车安全要求 中关于“儿童三轮车定义”的条款、术语、表格或图示说明。`
|
||||
|
||||
这样更适合标准文档检索。
|
||||
|
||||
### 2. metadata 重排
|
||||
|
||||
向量召回后,根据 metadata 做轻量规则重排。
|
||||
|
||||
常见规则:
|
||||
|
||||
- `chunk_type == section_text`:对定义类、要求类问题优先级更高
|
||||
- `section_path` 命中查询关键词:例如查询“定义”时,`术语和定义` 章节优先
|
||||
- `chunk_type == table`:对“尺寸 / 参数 / 数值 / 对照 / 要求”类问题加权
|
||||
- `chunk_type == figure`:对“图 / 结构 / 状态 / 示意”类问题加权
|
||||
|
||||
### 3. 邻域扩展
|
||||
|
||||
检索命中的是最终切片,但回答往往需要更完整的上下文。
|
||||
|
||||
建议命中某个 `vector_chunk` 后:
|
||||
|
||||
1. 优先回捞同一个 `semantic_id` 下的所有 chunk
|
||||
2. 如果还不够,再补充同 `section_path`、相邻页码或相邻 `chunk_index` 的内容
|
||||
|
||||
这样可以恢复完整条款,而不是只给模型一小段碎片。
|
||||
|
||||
## 七、不同问题的检索重点
|
||||
|
||||
### 1. 定义类问题
|
||||
|
||||
例如:
|
||||
|
||||
- `儿童三轮车的定义是什么?`
|
||||
- `轮距是什么意思?`
|
||||
|
||||
优先检索:
|
||||
|
||||
- `section_text`
|
||||
- `section_path` 中包含 `术语和定义` 的内容
|
||||
|
||||
### 2. 要求类问题
|
||||
|
||||
例如:
|
||||
|
||||
- `外露突出物有什么要求?`
|
||||
- `辅助推杆有哪些安全要求?`
|
||||
|
||||
优先检索:
|
||||
|
||||
- `section_text`
|
||||
- `table`
|
||||
|
||||
### 3. 数值 / 尺寸 / 对照类问题
|
||||
|
||||
例如:
|
||||
|
||||
- `鞍座到脚蹬距离要求是什么?`
|
||||
- `哪些项目需要满足规定尺寸?`
|
||||
|
||||
优先检索:
|
||||
|
||||
- `table`
|
||||
- `section_text`
|
||||
|
||||
### 4. 图示说明类问题
|
||||
|
||||
例如:
|
||||
|
||||
- `正常乘骑状态是什么意思?`
|
||||
- `图1表示什么?`
|
||||
|
||||
优先检索:
|
||||
|
||||
- `figure`
|
||||
- 同章节相邻 `section_text`
|
||||
|
||||
## 八、推荐的最终检索流程
|
||||
|
||||
建议采用以下固定流程:
|
||||
|
||||
1. 用 `vector_chunks.embedding_text` 做 embedding 检索
|
||||
2. 取 top 10 ~ 15 条候选
|
||||
3. 按 `chunk_type + section_path` 做规则重排
|
||||
4. 以 `semantic_id` 为中心回捞完整语义块
|
||||
5. 选 3 ~ 5 组上下文提供给大模型回答
|
||||
|
||||
## 九、给大模型的上下文组织方式
|
||||
|
||||
最终不要直接把原始 JSON 扔给模型,建议整理成如下格式:
|
||||
|
||||
```text
|
||||
[命中片段 1]
|
||||
章节:3 术语和定义 > 3.1 儿童三轮车
|
||||
页码:1-2
|
||||
类型:section_text
|
||||
内容:
|
||||
......
|
||||
|
||||
[命中片段 2]
|
||||
章节:4 要求 > 4.3 外露突出物
|
||||
页码:5
|
||||
类型:section_text
|
||||
内容:
|
||||
......
|
||||
|
||||
[命中片段 3]
|
||||
章节:5 试验方法
|
||||
页码:8
|
||||
类型:table
|
||||
内容:
|
||||
......
|
||||
```
|
||||
|
||||
这种格式更利于模型稳定回答并引用出处。
|
||||
|
||||
## 十、转换命令
|
||||
|
||||
生成三层结构:
|
||||
|
||||
```bash
|
||||
python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
|
||||
--layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
|
||||
--out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json
|
||||
```
|
||||
|
||||
自定义切片大小:
|
||||
|
||||
```bash
|
||||
python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
|
||||
--layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
|
||||
--out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json \
|
||||
--max-chars 500 \
|
||||
--overlap-chars 80
|
||||
```
|
||||
@@ -3,6 +3,7 @@
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
from loguru import logger
|
||||
@@ -12,6 +13,7 @@ from app.api.routes import api_router
|
||||
from app.config.logging import setup_logging
|
||||
from app.config.settings import settings
|
||||
from app.shared.bootstrap import cleanup_runtime_dependencies, preload_runtime_dependencies
|
||||
from app.shared.errors import VectorStoreSchemaError
|
||||
# Keep module behavior explicit so the backend flow stays easy to audit.
|
||||
|
||||
|
||||
@@ -55,16 +57,33 @@ app.add_middleware(
|
||||
app.include_router(api_router, prefix="/api/v1")
|
||||
|
||||
|
||||
@app.exception_handler(VectorStoreSchemaError)
async def vector_store_schema_exception_handler(request: Request, exc: VectorStoreSchemaError):
    """Log a vector-store schema/runtime error and answer with a stable HTTP 500 JSON body."""
    logger.error(f"向量库 schema 异常: {exc}")
    error_payload = ErrorResponse(
        error="VectorStoreSchemaError",
        message=str(exc),
    )
    return JSONResponse(status_code=500, content=jsonable_encoder(error_payload))
|
||||
|
||||
|
||||
@app.exception_handler(Exception)
|
||||
async def global_exception_handler(request: Request, exc: Exception):
|
||||
"""Global exception handler."""
|
||||
logger.error(f"未处理的异常: {exc}")
|
||||
return JSONResponse(
|
||||
status_code=500,
|
||||
content=ErrorResponse(
|
||||
error="InternalServerError",
|
||||
message=str(exc),
|
||||
).model_dump(),
|
||||
content=jsonable_encoder(
|
||||
ErrorResponse(
|
||||
error="InternalServerError",
|
||||
message=str(exc),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ from .knowledge import router as knowledge_router
|
||||
from .agent import router as agent_router
|
||||
from .status import router as status_router
|
||||
from .perception import router as perception_router
|
||||
from .rag import router as rag_router
|
||||
# Keep package boundaries explicit so backend imports stay predictable.
|
||||
|
||||
|
||||
@@ -20,6 +21,7 @@ api_router.include_router(agent_router)
|
||||
api_router.include_router(compliance_router)
|
||||
api_router.include_router(status_router)
|
||||
api_router.include_router(perception_router)
|
||||
api_router.include_router(rag_router)
|
||||
|
||||
__all__ = [
|
||||
"api_router",
|
||||
@@ -29,4 +31,5 @@ __all__ = [
|
||||
"compliance_router",
|
||||
"status_router",
|
||||
"perception_router",
|
||||
"rag_router",
|
||||
]
|
||||
|
||||
@@ -29,14 +29,19 @@ async def search_knowledge(request: SearchRequest):
|
||||
results=[
|
||||
SearchResultItem(
|
||||
id=index + 1,
|
||||
content=item.content,
|
||||
content=item.text,
|
||||
score=item.score,
|
||||
metadata={
|
||||
"doc_id": item.doc_id,
|
||||
"doc_name": item.doc_name,
|
||||
"doc_title": item.doc_title,
|
||||
"chunk_id": item.chunk_id,
|
||||
"chunk_type": item.chunk_type,
|
||||
"section_title": item.section_title,
|
||||
"page_number": item.page_number,
|
||||
"page_start": item.page_start,
|
||||
"page_end": item.page_end,
|
||||
"section_level": item.section_level,
|
||||
"chunk_index": item.chunk_index,
|
||||
"piece_index": item.piece_index,
|
||||
**item.metadata,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -50,8 +50,8 @@ async def rag_chat(request: RagChatRequest):
|
||||
{
|
||||
"id": str(s.get("chunk_id") or s.get("doc_id") or idx + 1),
|
||||
"score": s.get("score", 0),
|
||||
"preview": s.get("content", "")[:200],
|
||||
"doc_name": s.get("doc_name", ""),
|
||||
"preview": s.get("text", s.get("content", ""))[:200],
|
||||
"doc_name": s.get("doc_title", s.get("doc_name", "")),
|
||||
"clause": s.get("section_title", "法规片段"),
|
||||
"doc_id": s.get("doc_id"),
|
||||
"download_url": (
|
||||
|
||||
@@ -508,7 +508,7 @@ class DocumentQueryService:
|
||||
"""Return documents with real-time state from Milvus as the authoritative source.
|
||||
|
||||
Algorithm:
|
||||
1. Query Milvus for all doc metadata (doc_id, doc_name, chunk_count, …).
|
||||
1. Query Milvus for all doc metadata (doc_id, doc_title, chunk_count, …).
|
||||
2. Load JSON/PG metadata records and index them by doc_id.
|
||||
3. Merge: Milvus-present docs get status=INDEXED and live chunk_count;
|
||||
metadata-only docs with status=INDEXED are demoted to FAILED.
|
||||
@@ -536,8 +536,8 @@ class DocumentQueryService:
|
||||
doc.chunk_count = row["chunk_count"]
|
||||
doc.status = DocumentStatus.INDEXED
|
||||
# Backfill fields that may be missing from older JSON records.
|
||||
if not doc.doc_name and row.get("doc_name"):
|
||||
doc.doc_name = row["doc_name"]
|
||||
if not doc.doc_name and row.get("doc_title"):
|
||||
doc.doc_name = row["doc_title"]
|
||||
if not doc.regulation_type and row.get("regulation_type"):
|
||||
doc.regulation_type = row["regulation_type"]
|
||||
if not doc.version and row.get("version"):
|
||||
@@ -553,8 +553,8 @@ class DocumentQueryService:
|
||||
if doc_id not in meta_by_id:
|
||||
synthetic = Document(
|
||||
doc_id=doc_id,
|
||||
doc_name=row.get("doc_name", doc_id),
|
||||
file_name=row.get("doc_name", doc_id),
|
||||
doc_name=row.get("doc_title", doc_id),
|
||||
file_name=row.get("doc_title", doc_id),
|
||||
object_name="",
|
||||
content_type="",
|
||||
size_bytes=0,
|
||||
|
||||
@@ -29,11 +29,16 @@ def _reciprocal_rank_fusion(
|
||||
RetrievedChunk(
|
||||
chunk_id=chunk_map[ck].chunk_id,
|
||||
doc_id=chunk_map[ck].doc_id,
|
||||
doc_name=chunk_map[ck].doc_name,
|
||||
content=chunk_map[ck].content,
|
||||
doc_title=chunk_map[ck].doc_title,
|
||||
text=chunk_map[ck].text,
|
||||
score=scores[ck],
|
||||
chunk_type=chunk_map[ck].chunk_type,
|
||||
section_title=chunk_map[ck].section_title,
|
||||
page_number=chunk_map[ck].page_number,
|
||||
page_start=chunk_map[ck].page_start,
|
||||
page_end=chunk_map[ck].page_end,
|
||||
section_level=chunk_map[ck].section_level,
|
||||
chunk_index=chunk_map[ck].chunk_index,
|
||||
piece_index=chunk_map[ck].piece_index,
|
||||
metadata=chunk_map[ck].metadata,
|
||||
)
|
||||
for ck in sorted_keys
|
||||
|
||||
@@ -71,9 +71,9 @@ class PerceptionService:
|
||||
affected_docs.append(
|
||||
{
|
||||
"doc_id": chunk.doc_id,
|
||||
"doc_name": chunk.doc_name,
|
||||
"doc_title": chunk.doc_title,
|
||||
"score": round(float(chunk.score), 4),
|
||||
"snippet": (chunk.content or "")[:180],
|
||||
"snippet": (chunk.text or "")[:180],
|
||||
"clause": getattr(chunk, "section_title", "") or "",
|
||||
}
|
||||
)
|
||||
@@ -84,7 +84,7 @@ class PerceptionService:
|
||||
|
||||
# --- 2. Build context from retrieved chunks ---
|
||||
context_parts = [
|
||||
f"[文档{i}: {c.doc_name}]\n{(c.content or '')[:400]}"
|
||||
f"[文档{i}: {c.doc_title}]\n{(c.text or '')[:400]}"
|
||||
for i, c in enumerate(chunks[:5], 1)
|
||||
]
|
||||
context = "\n\n".join(context_parts) if context_parts else "(知识库中暂无相关文档)"
|
||||
|
||||
@@ -33,7 +33,7 @@ class Settings(BaseSettings):
|
||||
# Keep configuration setup explicit so runtime behavior is easy to reason about.
|
||||
milvus_host: str = Field(default="6.86.80.8", description="Milvus服务地址")
|
||||
milvus_port: int = Field(default=19530, description="Milvus服务端口")
|
||||
milvus_collection: str = Field(default="regulations_dense_1024_v1", description="法规向量集合名称")
|
||||
milvus_collection: str = Field(default="regulations_dense_1024_v2", description="法规向量集合名称")
|
||||
milvus_db_name: str = Field(default="default", description="Milvus数据库名称")
|
||||
|
||||
# Keep configuration setup explicit so runtime behavior is easy to reason about.
|
||||
|
||||
@@ -27,7 +27,7 @@ class Settings(BaseSettings):
|
||||
# Milvus
|
||||
milvus_host: str = "6.86.80.8"
|
||||
milvus_port: int = 19530
|
||||
milvus_collection: str = "regulations_dense_1024_v1"
|
||||
milvus_collection: str = "regulations_dense_1024_v2"
|
||||
|
||||
# LLM / embedding defaults aligned with the migrated backend path.
|
||||
llm_model: str = "qwen-max"
|
||||
@@ -47,7 +47,7 @@ class Settings(BaseSettings):
|
||||
api_port: int = 8000
|
||||
|
||||
# Legacy aliases retained for old utility modules.
|
||||
regulations_collection: str = "regulations_dense_1024_v1"
|
||||
regulations_collection: str = "regulations_dense_1024_v2"
|
||||
compliance_collection: str = "compliance_cache"
|
||||
|
||||
# Preserve the legacy module API while keeping env resolution centralized at the repo root.
|
||||
|
||||
@@ -8,18 +8,91 @@ from typing import Any
|
||||
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class AnswerSource:
|
||||
"""Represent answer source data."""
|
||||
"""Represent answer source data with legacy aliases."""
|
||||
|
||||
doc_id: str
|
||||
doc_name: str
|
||||
doc_title: str
|
||||
chunk_id: str
|
||||
chunk_type: str
|
||||
section_title: str
|
||||
page_number: int
|
||||
page_start: int
|
||||
page_end: int
|
||||
section_level: int
|
||||
chunk_index: int
|
||||
piece_index: int
|
||||
score: float
|
||||
content: str
|
||||
text: str
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
doc_id: str,
|
||||
doc_title: str | None = None,
|
||||
chunk_id: str,
|
||||
chunk_type: str = "",
|
||||
section_title: str = "",
|
||||
page_start: int = 0,
|
||||
page_end: int = 0,
|
||||
section_level: int = 0,
|
||||
chunk_index: int = 0,
|
||||
piece_index: int = 0,
|
||||
score: float = 0.0,
|
||||
text: str | None = None,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
doc_name: str | None = None,
|
||||
content: str | None = None,
|
||||
page_number: int | None = None,
|
||||
**_: Any,
|
||||
) -> None:
|
||||
"""Initialize the answer source while accepting legacy field names."""
|
||||
self.doc_id = doc_id
|
||||
self.doc_title = doc_title if doc_title is not None else (doc_name or "")
|
||||
self.chunk_id = chunk_id
|
||||
self.chunk_type = chunk_type
|
||||
self.section_title = section_title
|
||||
self.page_start = int(page_start or page_number or 0)
|
||||
self.page_end = int(page_end or self.page_start)
|
||||
self.section_level = int(section_level or 0)
|
||||
self.chunk_index = int(chunk_index or 0)
|
||||
self.piece_index = int(piece_index or 0)
|
||||
self.score = float(score)
|
||||
self.text = text if text is not None else (content or "")
|
||||
self.metadata = dict(metadata or {})
|
||||
|
||||
@property
|
||||
def doc_name(self) -> str:
|
||||
"""Return the legacy document name alias."""
|
||||
return self.doc_title
|
||||
|
||||
@doc_name.setter
|
||||
def doc_name(self, value: str) -> None:
|
||||
"""Update the legacy document name alias."""
|
||||
self.doc_title = value
|
||||
|
||||
@property
|
||||
def content(self) -> str:
|
||||
"""Return the legacy content alias."""
|
||||
return self.text
|
||||
|
||||
@content.setter
|
||||
def content(self, value: str) -> None:
|
||||
"""Update the legacy content alias."""
|
||||
self.text = value
|
||||
|
||||
@property
|
||||
def page_number(self) -> int:
|
||||
"""Return the legacy page number alias."""
|
||||
return self.page_start
|
||||
|
||||
@page_number.setter
|
||||
def page_number(self, value: int) -> None:
|
||||
"""Update the legacy page number alias."""
|
||||
self.page_start = value
|
||||
self.page_end = max(self.page_end, value)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ConversationMessage:
|
||||
|
||||
@@ -60,23 +60,117 @@ class ParsedDocument:
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class Chunk:
|
||||
"""Represent the Chunk type."""
|
||||
"""Represent one retrieval chunk with backward-compatible aliases."""
|
||||
|
||||
chunk_id: str
|
||||
doc_id: str
|
||||
doc_name: str
|
||||
content: str
|
||||
doc_title: str
|
||||
text: str
|
||||
embedding_text: str
|
||||
chunk_type: str = ""
|
||||
chunk_index: int = 0
|
||||
piece_index: int = 0
|
||||
page_start: int = 0
|
||||
page_end: int = 0
|
||||
section_title: str = ""
|
||||
section_path: list[str] = field(default_factory=list)
|
||||
page_number: int = 0
|
||||
section_level: int = 0
|
||||
source_ids: list[str] = field(default_factory=list)
|
||||
regulation_type: str = ""
|
||||
version: str = ""
|
||||
semantic_id: str = ""
|
||||
block_type: str = ""
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
chunk_id: str,
|
||||
doc_id: str,
|
||||
doc_title: str | None = None,
|
||||
text: str | None = None,
|
||||
embedding_text: str = "",
|
||||
chunk_type: str = "",
|
||||
chunk_index: int = 0,
|
||||
piece_index: int = 0,
|
||||
page_start: int = 0,
|
||||
page_end: int = 0,
|
||||
section_title: str = "",
|
||||
section_path: list[str] | None = None,
|
||||
section_level: int = 0,
|
||||
source_ids: list[str] | None = None,
|
||||
regulation_type: str = "",
|
||||
version: str = "",
|
||||
semantic_id: str = "",
|
||||
metadata: dict[str, Any] | None = None,
|
||||
doc_name: str | None = None,
|
||||
content: str | None = None,
|
||||
page_number: int | None = None,
|
||||
block_type: str | None = None,
|
||||
**_: Any,
|
||||
) -> None:
|
||||
"""Initialize the chunk while accepting legacy field names."""
|
||||
self.chunk_id = chunk_id
|
||||
self.doc_id = doc_id
|
||||
self.doc_title = doc_title if doc_title is not None else (doc_name or "")
|
||||
self.text = text if text is not None else (content or "")
|
||||
self.embedding_text = embedding_text or self.text
|
||||
self.chunk_type = chunk_type or (block_type or "")
|
||||
self.chunk_index = int(chunk_index or 0)
|
||||
self.piece_index = int(piece_index or 0)
|
||||
self.page_start = int(page_start or page_number or 0)
|
||||
self.page_end = int(page_end or self.page_start)
|
||||
self.section_title = section_title
|
||||
self.section_path = list(section_path or [])
|
||||
self.section_level = int(section_level or 0)
|
||||
self.source_ids = list(source_ids or [])
|
||||
self.regulation_type = regulation_type
|
||||
self.version = version
|
||||
self.semantic_id = semantic_id
|
||||
self.metadata = dict(metadata or {})
|
||||
|
||||
@property
|
||||
def doc_name(self) -> str:
|
||||
"""Return the legacy document name alias."""
|
||||
return self.doc_title
|
||||
|
||||
@doc_name.setter
|
||||
def doc_name(self, value: str) -> None:
|
||||
"""Update the legacy document name alias."""
|
||||
self.doc_title = value
|
||||
|
||||
@property
|
||||
def content(self) -> str:
|
||||
"""Return the legacy content alias."""
|
||||
return self.text
|
||||
|
||||
@content.setter
|
||||
def content(self, value: str) -> None:
|
||||
"""Update the legacy content alias."""
|
||||
self.text = value
|
||||
|
||||
@property
|
||||
def page_number(self) -> int:
|
||||
"""Return the legacy page number alias."""
|
||||
return self.page_start
|
||||
|
||||
@page_number.setter
|
||||
def page_number(self, value: int) -> None:
|
||||
"""Update the legacy page number alias."""
|
||||
self.page_start = value
|
||||
self.page_end = max(self.page_end, value)
|
||||
|
||||
@property
|
||||
def block_type(self) -> str:
|
||||
"""Return the legacy block type alias."""
|
||||
return self.chunk_type
|
||||
|
||||
@block_type.setter
|
||||
def block_type(self, value: str) -> None:
|
||||
"""Update the legacy block type alias."""
|
||||
self.chunk_type = value
|
||||
|
||||
|
||||
@dataclass
|
||||
class DocumentProcessingRun:
|
||||
|
||||
@@ -16,14 +16,88 @@ class RetrievalQuery:
|
||||
filters: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class RetrievedChunk:
|
||||
"""Represent the Retrieved Chunk type."""
|
||||
"""Represent the retrieved chunk payload with legacy aliases."""
|
||||
|
||||
chunk_id: str
|
||||
doc_id: str
|
||||
doc_name: str
|
||||
content: str
|
||||
doc_title: str
|
||||
text: str
|
||||
score: float
|
||||
chunk_type: str = ""
|
||||
section_title: str = ""
|
||||
page_number: int = 0
|
||||
page_start: int = 0
|
||||
page_end: int = 0
|
||||
section_level: int = 0
|
||||
chunk_index: int = 0
|
||||
piece_index: int = 0
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
chunk_id: str,
|
||||
doc_id: str,
|
||||
doc_title: str | None = None,
|
||||
text: str | None = None,
|
||||
score: float = 0.0,
|
||||
chunk_type: str = "",
|
||||
section_title: str = "",
|
||||
page_start: int = 0,
|
||||
page_end: int = 0,
|
||||
section_level: int = 0,
|
||||
chunk_index: int = 0,
|
||||
piece_index: int = 0,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
doc_name: str | None = None,
|
||||
content: str | None = None,
|
||||
page_number: int | None = None,
|
||||
block_type: str | None = None,
|
||||
**_: Any,
|
||||
) -> None:
|
||||
"""Initialize the retrieved chunk while accepting legacy field names."""
|
||||
self.chunk_id = chunk_id
|
||||
self.doc_id = doc_id
|
||||
self.doc_title = doc_title if doc_title is not None else (doc_name or "")
|
||||
self.text = text if text is not None else (content or "")
|
||||
self.score = float(score)
|
||||
self.chunk_type = chunk_type or (block_type or "")
|
||||
self.section_title = section_title
|
||||
self.page_start = int(page_start or page_number or 0)
|
||||
self.page_end = int(page_end or self.page_start)
|
||||
self.section_level = int(section_level or 0)
|
||||
self.chunk_index = int(chunk_index or 0)
|
||||
self.piece_index = int(piece_index or 0)
|
||||
self.metadata = dict(metadata or {})
|
||||
|
||||
@property
|
||||
def doc_name(self) -> str:
|
||||
"""Return the legacy document name alias."""
|
||||
return self.doc_title
|
||||
|
||||
@doc_name.setter
|
||||
def doc_name(self, value: str) -> None:
|
||||
"""Update the legacy document name alias."""
|
||||
self.doc_title = value
|
||||
|
||||
@property
|
||||
def content(self) -> str:
|
||||
"""Return the legacy content alias."""
|
||||
return self.text
|
||||
|
||||
@content.setter
|
||||
def content(self, value: str) -> None:
|
||||
"""Update the legacy content alias."""
|
||||
self.text = value
|
||||
|
||||
@property
|
||||
def page_number(self) -> int:
|
||||
"""Return the legacy page number alias."""
|
||||
return self.page_start
|
||||
|
||||
@page_number.setter
|
||||
def page_number(self, value: int) -> None:
|
||||
"""Update the legacy page number alias."""
|
||||
self.page_start = value
|
||||
self.page_end = max(self.page_end, value)
|
||||
|
||||
@@ -45,10 +45,10 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
|
||||
context_tokens = 0
|
||||
for idx, chunk in enumerate(retrieved_chunks, start=1):
|
||||
block = (
|
||||
f"[{idx}] 文档: {chunk.doc_name}\n"
|
||||
f"[{idx}] 文档: {chunk.doc_title}\n"
|
||||
f"章节: {chunk.section_title or '未标注'}\n"
|
||||
f"页码: {chunk.page_number}\n"
|
||||
f"内容: {chunk.content}"
|
||||
f"页码: {chunk.page_start}" + (f"-{chunk.page_end}" if chunk.page_end and chunk.page_end != chunk.page_start else "") + "\n"
|
||||
f"内容: {chunk.text}"
|
||||
)
|
||||
block_tokens = self._estimate_tokens(block)
|
||||
if context_tokens + block_tokens > settings.rag_max_context_tokens:
|
||||
@@ -73,10 +73,10 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
|
||||
return False
|
||||
estimated_total_tokens = sum(
|
||||
self._estimate_tokens(
|
||||
f"[{idx}] 文档: {chunk.doc_name}\n"
|
||||
f"[{idx}] 文档: {chunk.doc_title}\n"
|
||||
f"章节: {chunk.section_title or '未标注'}\n"
|
||||
f"页码: {chunk.page_number}\n"
|
||||
f"内容: {chunk.content}"
|
||||
f"页码: {chunk.page_start}" + (f"-{chunk.page_end}" if chunk.page_end and chunk.page_end != chunk.page_start else "") + "\n"
|
||||
f"内容: {chunk.text}"
|
||||
)
|
||||
for idx, chunk in enumerate(retrieved_chunks, start=1)
|
||||
)
|
||||
@@ -87,12 +87,17 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
|
||||
return [
|
||||
AnswerSource(
|
||||
doc_id=chunk.doc_id,
|
||||
doc_name=chunk.doc_name,
|
||||
doc_title=chunk.doc_title,
|
||||
chunk_id=chunk.chunk_id,
|
||||
chunk_type=chunk.chunk_type,
|
||||
section_title=chunk.section_title,
|
||||
page_number=chunk.page_number,
|
||||
page_start=chunk.page_start,
|
||||
page_end=chunk.page_end,
|
||||
section_level=chunk.section_level,
|
||||
chunk_index=chunk.chunk_index,
|
||||
piece_index=chunk.piece_index,
|
||||
score=chunk.score,
|
||||
content=chunk.content,
|
||||
text=chunk.text,
|
||||
metadata=chunk.metadata,
|
||||
)
|
||||
for chunk in chunks
|
||||
|
||||
@@ -10,6 +10,7 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
|
||||
"""Adapt the existing markdown chunker to the new chunk builder port."""
|
||||
|
||||
def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
|
||||
"""Initialize the local markdown chunk builder."""
|
||||
self.chunker = RegulationChunker(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
@@ -22,6 +23,7 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
|
||||
regulation_type: str,
|
||||
version: str,
|
||||
) -> list[Chunk]:
|
||||
"""Build migrated chunk objects from the legacy markdown chunker output."""
|
||||
markdown_text = parsed_document.raw_text.strip()
|
||||
if not markdown_text:
|
||||
return []
|
||||
@@ -50,16 +52,18 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
|
||||
Chunk(
|
||||
chunk_id=item.metadata.chunk_id,
|
||||
doc_id=parsed_document.doc_id,
|
||||
doc_name=parsed_document.doc_name,
|
||||
content=item.content,
|
||||
doc_title=parsed_document.doc_name,
|
||||
text=item.content,
|
||||
embedding_text=item.content,
|
||||
chunk_type="local_markdown_chunk",
|
||||
section_title=item.metadata.section_title or item.metadata.section_number,
|
||||
section_path=section_path,
|
||||
page_number=item.metadata.page_number,
|
||||
page_start=item.metadata.page_number,
|
||||
page_end=item.metadata.page_number,
|
||||
section_level=len(section_path),
|
||||
regulation_type=regulation_type,
|
||||
version=version,
|
||||
semantic_id=item.metadata.clause_number,
|
||||
block_type="local_markdown_chunk",
|
||||
metadata=metadata,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -19,29 +19,35 @@ class AliyunVectorChunkBuilder(ChunkBuilder):
|
||||
"""Handle build for the Aliyun Vector Chunk Builder instance."""
|
||||
chunks: list[Chunk] = []
|
||||
for index, item in enumerate(parsed_document.vector_chunks):
|
||||
content = item.get("content") or item.get("text") or ""
|
||||
embedding_text = item.get("embedding_text") or content
|
||||
text = item.get("text") or ""
|
||||
embedding_text = item.get("embedding_text") or text
|
||||
if not embedding_text.strip():
|
||||
continue
|
||||
section_path = item.get("section_path") or []
|
||||
section_title = item.get("section_title") or (section_path[-1] if section_path else "")
|
||||
page_number = item.get("page_start") or item.get("page") or 0
|
||||
chunk_id = item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"
|
||||
metadata = {k: v for k, v in item.items() if k not in {"content", "embedding_text"}}
|
||||
metadata = dict(item)
|
||||
metadata["regulation_type"] = regulation_type
|
||||
metadata["version"] = version
|
||||
chunks.append(
|
||||
Chunk(
|
||||
chunk_id=str(chunk_id),
|
||||
doc_id=parsed_document.doc_id,
|
||||
doc_name=parsed_document.doc_name,
|
||||
content=content,
|
||||
doc_title=str(item.get("doc_title") or parsed_document.doc_name),
|
||||
text=text,
|
||||
embedding_text=embedding_text,
|
||||
chunk_type=str(item.get("chunk_type", item.get("block_type", ""))),
|
||||
chunk_index=int(item.get("chunk_index") or 0),
|
||||
piece_index=int(item.get("piece_index") or 0),
|
||||
page_start=int(item.get("page_start") or 0),
|
||||
page_end=int(item.get("page_end") or 0),
|
||||
section_title=section_title,
|
||||
section_path=section_path,
|
||||
page_number=int(page_number or 0),
|
||||
section_level=int(item.get("section_level") or len(section_path)),
|
||||
source_ids=[str(v) for v in item.get("source_ids", [])],
|
||||
regulation_type=regulation_type,
|
||||
version=version,
|
||||
semantic_id=item.get("semantic_id", ""),
|
||||
block_type=item.get("block_type", ""),
|
||||
metadata=metadata,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -56,7 +56,21 @@ class BM25Retriever:
|
||||
try:
|
||||
rows = self._vector_index.collection.query(
|
||||
expr='doc_id != ""',
|
||||
output_fields=["id", "doc_id", "doc_name", "content", "section_title", "page_number"],
|
||||
output_fields=[
|
||||
"id",
|
||||
"chunk_id",
|
||||
"doc_id",
|
||||
"doc_title",
|
||||
"text",
|
||||
"chunk_type",
|
||||
"section_title",
|
||||
"page_start",
|
||||
"page_end",
|
||||
"section_level",
|
||||
"chunk_index",
|
||||
"piece_index",
|
||||
"metadata_json",
|
||||
],
|
||||
limit=16384,
|
||||
)
|
||||
except Exception:
|
||||
@@ -64,19 +78,33 @@ class BM25Retriever:
|
||||
return []
|
||||
return [
|
||||
RetrievedChunk(
|
||||
chunk_id=str(row.get("id", "")),
|
||||
chunk_id=str(row.get("chunk_id") or row.get("id", "")),
|
||||
doc_id=str(row.get("doc_id", "")),
|
||||
doc_name=str(row.get("doc_name", "")),
|
||||
content=str(row.get("content", "")),
|
||||
doc_title=str(row.get("doc_title", "")),
|
||||
text=str(row.get("text", "")),
|
||||
score=0.0,
|
||||
chunk_type=str(row.get("chunk_type", "")),
|
||||
section_title=str(row.get("section_title", "")),
|
||||
page_number=int(row.get("page_number") or 0),
|
||||
metadata={},
|
||||
page_start=int(row.get("page_start") or 0),
|
||||
page_end=int(row.get("page_end") or 0),
|
||||
section_level=int(row.get("section_level") or 0),
|
||||
chunk_index=int(row.get("chunk_index") or 0),
|
||||
piece_index=int(row.get("piece_index") or 0),
|
||||
metadata=self._parse_metadata_json(row.get("metadata_json", "")),
|
||||
)
|
||||
for row in rows
|
||||
if row.get("content")
|
||||
if row.get("text")
|
||||
]
|
||||
|
||||
def _parse_metadata_json(self, raw_metadata: str) -> dict:
|
||||
"""Parse metadata_json into a dict for BM25-side filtering."""
|
||||
if not raw_metadata:
|
||||
return {}
|
||||
try:
|
||||
return dict(__import__("json").loads(raw_metadata))
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
def _ensure_built(self) -> None:
|
||||
if self._index is not None:
|
||||
return
|
||||
@@ -93,7 +121,7 @@ class BM25Retriever:
|
||||
self._chunks = []
|
||||
self._index = BM25Okapi([[]])
|
||||
return
|
||||
tokenized = [_tokenize(c.content) for c in chunks]
|
||||
tokenized = [_tokenize(c.text) for c in chunks]
|
||||
self._chunks = chunks
|
||||
self._index = BM25Okapi(tokenized)
|
||||
logger.info("BM25Retriever: index built with %d chunks", len(chunks))
|
||||
@@ -127,20 +155,26 @@ class BM25Retriever:
|
||||
for score, chunk in ranked[: top_k * 2]:
|
||||
if score <= 0:
|
||||
break
|
||||
# Apply simple regulation_type filter if provided
|
||||
if filters and chunk.metadata.get("regulation_type"):
|
||||
types = [t.strip() for t in filters.split(",")]
|
||||
if chunk.metadata.get("regulation_type") not in types:
|
||||
continue
|
||||
if filters:
|
||||
normalized_filter = filters.replace("doc_name", "doc_title").strip()
|
||||
if normalized_filter.startswith('doc_title == "'):
|
||||
expected_title = normalized_filter[len('doc_title == "'):-1]
|
||||
if chunk.doc_title != expected_title:
|
||||
continue
|
||||
results.append(
|
||||
RetrievedChunk(
|
||||
chunk_id=chunk.chunk_id,
|
||||
doc_id=chunk.doc_id,
|
||||
doc_name=chunk.doc_name,
|
||||
content=chunk.content,
|
||||
doc_title=chunk.doc_title,
|
||||
text=chunk.text,
|
||||
score=score,
|
||||
chunk_type=chunk.chunk_type,
|
||||
section_title=chunk.section_title,
|
||||
page_number=chunk.page_number,
|
||||
page_start=chunk.page_start,
|
||||
page_end=chunk.page_end,
|
||||
section_level=chunk.section_level,
|
||||
chunk_index=chunk.chunk_index,
|
||||
piece_index=chunk.piece_index,
|
||||
metadata=chunk.metadata,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -31,7 +31,7 @@ class OpenAICompatibleReranker(Reranker):
|
||||
if not chunks:
|
||||
return []
|
||||
|
||||
texts = [chunk.content for chunk in chunks]
|
||||
texts = [chunk.text for chunk in chunks]
|
||||
start = time.time()
|
||||
try:
|
||||
scores = self._call_reranker(query, texts)
|
||||
|
||||
@@ -4,57 +4,150 @@ from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
from typing import Iterable
|
||||
|
||||
from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, utility
|
||||
from loguru import logger
|
||||
from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, MilvusException, connections, utility
|
||||
|
||||
from app.config.settings import settings
|
||||
from app.domain.documents import Chunk
|
||||
from app.domain.retrieval import RetrievedChunk, VectorIndex
|
||||
from app.shared.errors import VectorStoreSchemaError
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
_REQUIRED_SCHEMA_FIELDS = (
|
||||
"doc_id",
|
||||
"doc_title",
|
||||
"chunk_id",
|
||||
"text",
|
||||
"embedding",
|
||||
"section_title",
|
||||
"metadata_json",
|
||||
)
|
||||
_SCHEMA_RECOVERY_TOKENS = (
|
||||
"field doc_title not exist",
|
||||
"field text not exist",
|
||||
"field embedding not exist",
|
||||
"collection not loaded",
|
||||
"can't find collection",
|
||||
"not found[collection",
|
||||
)
|
||||
|
||||
|
||||
|
||||
class MilvusVectorIndex(VectorIndex):
|
||||
"""Provide the Milvus Vector Index index implementation."""
|
||||
|
||||
def __init__(self) -> None:
    """Initialize the Milvus vector index from application settings.

    Copies the connection parameters from ``settings``, opens the
    connection, and binds the configured collection.
    """
    self.collection_name = settings.milvus_collection
    self.db_name = settings.milvus_db_name
    self.host = settings.milvus_host
    self.port = settings.milvus_port
    # Use an adapter-specific alias so this index never reuses unrelated global Milvus state.
    self.alias = f"vector-index::{self.host}:{self.port}/{self.db_name}/{self.collection_name}"
    self._connect()
    self.collection = self._bind_collection()
|
||||
|
||||
def _connect(self, *, refresh: bool = False) -> None:
|
||||
"""Establish the Milvus connection for this adapter."""
|
||||
if refresh:
|
||||
try:
|
||||
connections.disconnect(self.alias)
|
||||
except Exception:
|
||||
# Best-effort disconnect keeps refresh idempotent when no alias is active yet.
|
||||
pass
|
||||
connections.connect(
|
||||
alias="default",
|
||||
host=settings.milvus_host,
|
||||
port=settings.milvus_port,
|
||||
alias=self.alias,
|
||||
host=self.host,
|
||||
port=self.port,
|
||||
db_name=self.db_name,
|
||||
)
|
||||
self.collection = self._ensure_collection()
|
||||
|
||||
def _schema_field_names(self, collection: Collection) -> list[str]:
|
||||
"""Return the field names exposed by the bound Milvus collection."""
|
||||
return [field.name for field in collection.schema.fields]
|
||||
|
||||
def _raise_schema_error(self, *, message: str, actual_fields: Iterable[str]) -> None:
    """Raise a typed schema error describing the active collection.

    Args:
        message: Human-readable summary of the mismatch.
        actual_fields: Field names found on the collection; copied into a list.

    Raises:
        VectorStoreSchemaError: Always. This helper never returns normally.
    """
    raise VectorStoreSchemaError(
        message=message,
        host=self.host,
        db_name=self.db_name,
        collection_name=self.collection_name,
        expected_fields=list(_REQUIRED_SCHEMA_FIELDS),
        actual_fields=list(actual_fields),
    )
|
||||
|
||||
def _validate_schema(self, collection: Collection) -> None:
    """Ensure the collection exposes every field in ``_REQUIRED_SCHEMA_FIELDS``.

    Raises:
        VectorStoreSchemaError: Raised through ``_raise_schema_error`` when
            one or more required fields are missing.
    """
    actual_fields = self._schema_field_names(collection)
    missing_fields = [field_name for field_name in _REQUIRED_SCHEMA_FIELDS if field_name not in actual_fields]
    if missing_fields:
        self._raise_schema_error(
            message=f"Milvus collection schema mismatch; missing required fields: {missing_fields}",
            actual_fields=actual_fields,
        )
|
||||
|
||||
def _log_collection_binding(self, collection: Collection, *, event: str) -> None:
    """Log the bound collection's identity, field names and entity count.

    Args:
        collection: The collection that was just bound.
        event: Short label for the log line, e.g. what triggered the binding.
    """
    try:
        num_entities = collection.num_entities
    except Exception:
        # The count is diagnostic only; failing to read it must not break binding.
        num_entities = "unknown"
    logger.info(
        "Milvus binding {} alias={} host={} db={} collection={} fields={} num_entities={}",
        event,
        self.alias,
        self.host,
        self.db_name,
        self.collection_name,
        self._schema_field_names(collection),
        num_entities,
    )
|
||||
|
||||
def _bind_collection(self, *, force_refresh: bool = False) -> Collection:
|
||||
"""Bind and validate the configured Milvus collection."""
|
||||
if force_refresh:
|
||||
self._connect(refresh=True)
|
||||
collection = self._ensure_collection()
|
||||
self._validate_schema(collection)
|
||||
self._log_collection_binding(collection, event="refreshed" if force_refresh else "initialized")
|
||||
return collection
|
||||
|
||||
def _ensure_collection(self) -> Collection:
|
||||
"""Handle ensure collection for this module for the Milvus Vector Index instance."""
|
||||
if utility.has_collection(self.collection_name):
|
||||
collection = Collection(self.collection_name)
|
||||
if utility.has_collection(self.collection_name, using=self.alias):
|
||||
collection = Collection(self.collection_name, using=self.alias)
|
||||
collection.load()
|
||||
return collection
|
||||
schema = CollectionSchema(
|
||||
fields=[
|
||||
FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=128, is_primary=True, auto_id=False),
|
||||
FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=64),
|
||||
FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=256),
|
||||
FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
|
||||
FieldSchema(name="doc_title", dtype=DataType.VARCHAR, max_length=256),
|
||||
FieldSchema(name="chunk_id", dtype=DataType.VARCHAR, max_length=128),
|
||||
FieldSchema(name="chunk_index", dtype=DataType.INT64),
|
||||
FieldSchema(name="piece_index", dtype=DataType.INT64),
|
||||
FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
|
||||
FieldSchema(name="embedding_text", dtype=DataType.VARCHAR, max_length=65535),
|
||||
FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=settings.embedding_dim),
|
||||
FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
|
||||
FieldSchema(name="section_path", dtype=DataType.VARCHAR, max_length=4096),
|
||||
FieldSchema(name="page_number", dtype=DataType.INT64),
|
||||
FieldSchema(name="regulation_type", dtype=DataType.VARCHAR, max_length=128),
|
||||
FieldSchema(name="version", dtype=DataType.VARCHAR, max_length=64),
|
||||
FieldSchema(name="semantic_id", dtype=DataType.VARCHAR, max_length=128),
|
||||
FieldSchema(name="block_type", dtype=DataType.VARCHAR, max_length=64),
|
||||
FieldSchema(name="chunk_type", dtype=DataType.VARCHAR, max_length=64),
|
||||
FieldSchema(name="page_start", dtype=DataType.INT64),
|
||||
FieldSchema(name="page_end", dtype=DataType.INT64),
|
||||
FieldSchema(name="section_level", dtype=DataType.INT64),
|
||||
FieldSchema(name="source_ids", dtype=DataType.VARCHAR, max_length=4096),
|
||||
FieldSchema(name="section_path", dtype=DataType.VARCHAR, max_length=4096),
|
||||
FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
|
||||
FieldSchema(name="metadata_json", dtype=DataType.VARCHAR, max_length=65535),
|
||||
FieldSchema(name="created_at", dtype=DataType.INT64),
|
||||
],
|
||||
description="Dense-only regulations index",
|
||||
enable_dynamic_field=False,
|
||||
)
|
||||
collection = Collection(name=self.collection_name, schema=schema)
|
||||
collection = Collection(name=self.collection_name, schema=schema, using=self.alias)
|
||||
collection.create_index(
|
||||
field_name="embedding",
|
||||
index_params={
|
||||
@@ -73,21 +166,34 @@ class MilvusVectorIndex(VectorIndex):
|
||||
data = []
|
||||
now = int(time.time())
|
||||
for chunk, vector in zip(chunks, vectors):
|
||||
metadata = dict(chunk.metadata)
|
||||
doc_title = str(metadata.get("doc_title", chunk.doc_title))
|
||||
text = str(metadata.get("text", chunk.text))
|
||||
embedding_text = str(metadata.get("embedding_text", chunk.embedding_text))
|
||||
page_start = int(metadata.get("page_start", 0) or 0)
|
||||
page_end = int(metadata.get("page_end", 0) or 0)
|
||||
section_path = metadata.get("section_path", chunk.section_path)
|
||||
source_ids = metadata.get("source_ids", [])
|
||||
data.append(
|
||||
{
|
||||
"id": chunk.chunk_id,
|
||||
"doc_id": chunk.doc_id,
|
||||
"doc_name": chunk.doc_name,
|
||||
"content": chunk.content[:65535],
|
||||
"doc_title": doc_title[:256],
|
||||
"chunk_id": chunk.chunk_id[:128],
|
||||
"chunk_index": int(metadata.get("chunk_index", chunk.chunk_index) or 0),
|
||||
"piece_index": int(metadata.get("piece_index", chunk.piece_index) or 0),
|
||||
"text": text[:65535],
|
||||
"embedding_text": embedding_text[:65535],
|
||||
"embedding": vector,
|
||||
"section_title": chunk.section_title[:512],
|
||||
"section_path": json.dumps(chunk.section_path, ensure_ascii=False)[:4096],
|
||||
"page_number": chunk.page_number,
|
||||
"regulation_type": chunk.regulation_type[:128],
|
||||
"version": chunk.version[:64],
|
||||
"semantic_id": chunk.semantic_id[:128],
|
||||
"block_type": chunk.block_type[:64],
|
||||
"metadata_json": json.dumps(chunk.metadata, ensure_ascii=False)[:65535],
|
||||
"semantic_id": str(metadata.get("semantic_id", chunk.semantic_id))[:128],
|
||||
"chunk_type": str(metadata.get("chunk_type", chunk.chunk_type))[:64],
|
||||
"page_start": page_start,
|
||||
"page_end": page_end,
|
||||
"section_level": int(metadata.get("section_level", chunk.section_level) or 0),
|
||||
"source_ids": json.dumps(source_ids, ensure_ascii=False)[:4096],
|
||||
"section_path": json.dumps(section_path, ensure_ascii=False)[:4096],
|
||||
"section_title": str(metadata.get("section_title", chunk.section_title))[:512],
|
||||
"metadata_json": json.dumps(metadata, ensure_ascii=False)[:65535],
|
||||
"created_at": now,
|
||||
}
|
||||
)
|
||||
@@ -107,47 +213,97 @@ class MilvusVectorIndex(VectorIndex):
|
||||
|
||||
filters = filters.strip()
|
||||
|
||||
# Normalize legacy field names so callers can keep older filter payloads.
|
||||
replacements = {
|
||||
"doc_name": "doc_title",
|
||||
"content": "text",
|
||||
"page_number": "page_start",
|
||||
"block_type": "chunk_type",
|
||||
}
|
||||
for legacy_name, new_name in replacements.items():
|
||||
filters = filters.replace(legacy_name, new_name)
|
||||
|
||||
# Check if already a Milvus expression (contains operators)
|
||||
if any(op in filters for op in ["==", "!=", "in", "not in", ">", "<", ">=", "<=", "and", "or"]):
|
||||
return filters
|
||||
|
||||
# Parse simple regulation_type filter
|
||||
# Support: "GB" or "GB,UN-ECE" or "GB, UN-ECE"
|
||||
types = [t.strip() for t in filters.split(",") if t.strip()]
|
||||
# Parse simple document-title filter.
|
||||
titles = [title.strip() for title in filters.split(",") if title.strip()]
|
||||
|
||||
if not types:
|
||||
if not titles:
|
||||
return None
|
||||
|
||||
if len(types) == 1:
|
||||
# Single value: regulation_type == "GB"
|
||||
return f'regulation_type == "{types[0]}"'
|
||||
else:
|
||||
# Multiple values: regulation_type in ["GB", "UN-ECE"]
|
||||
quoted_types = [f'"{t}"' for t in types]
|
||||
return f'regulation_type in [{", ".join(quoted_types)}]'
|
||||
if len(titles) == 1:
|
||||
return f'doc_title == "{titles[0]}"'
|
||||
|
||||
quoted_titles = [f'"{title}"' for title in titles]
|
||||
return f'doc_title in [{", ".join(quoted_titles)}]'
|
||||
|
||||
def _should_refresh_after_exception(self, exc: Exception) -> bool:
    """Return whether ``exc`` warrants a forced reconnect and retry.

    Only a ``MilvusException`` whose lower-cased text contains one of
    ``_SCHEMA_RECOVERY_TOKENS`` qualifies.
    """
    if not isinstance(exc, MilvusException):
        return False
    normalized = str(exc).lower()
    return any(token in normalized for token in _SCHEMA_RECOVERY_TOKENS)
|
||||
|
||||
def _run_with_refresh(self, operation):
    """Run a Milvus operation, retrying once after a forced reconnect when appropriate.

    Args:
        operation: Zero-argument callable performing the Milvus call. It is
            invoked at most twice.

    Returns:
        Whatever ``operation`` returns.

    Raises:
        VectorStoreSchemaError: Re-raised unchanged if ``operation`` raises
            it, or raised when the retry fails with a ``MilvusException``.
        Exception: Any other error from ``operation`` propagates unchanged
            (first attempt: when no refresh is warranted; retry: when it is
            not a ``MilvusException``).
    """
    try:
        return operation()
    except VectorStoreSchemaError:
        raise
    except Exception as exc:
        if not self._should_refresh_after_exception(exc):
            raise
        logger.warning(
            "Milvus operation failed for alias={} collection={}; forcing reconnect and retry: {}",
            self.alias,
            self.collection_name,
            exc,
        )
        # Rebind before retrying so the operation sees the refreshed collection.
        self.collection = self._bind_collection(force_refresh=True)
        try:
            return operation()
        except VectorStoreSchemaError:
            raise
        except Exception as retry_exc:
            if isinstance(retry_exc, MilvusException):
                # A second Milvus failure right after a refresh is surfaced as a typed schema error.
                self._raise_schema_error(
                    message=f"Milvus operation failed after refresh: {retry_exc}",
                    actual_fields=self._schema_field_names(self.collection),
                )
            raise
|
||||
|
||||
def search(self, query_vector: list[float], top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
|
||||
"""Handle search for the Milvus Vector Index instance."""
|
||||
milvus_expr = self._parse_filters(filters)
|
||||
|
||||
results = self.collection.search(
|
||||
data=[query_vector],
|
||||
anns_field="embedding",
|
||||
param={"metric_type": "COSINE", "params": {"nprobe": settings.milvus_nprobe}},
|
||||
limit=top_k,
|
||||
expr=milvus_expr,
|
||||
output_fields=[
|
||||
"doc_id",
|
||||
"doc_name",
|
||||
"content",
|
||||
"section_title",
|
||||
"page_number",
|
||||
"regulation_type",
|
||||
"version",
|
||||
"semantic_id",
|
||||
"block_type",
|
||||
"metadata_json",
|
||||
],
|
||||
results = self._run_with_refresh(
|
||||
lambda: self.collection.search(
|
||||
data=[query_vector],
|
||||
anns_field="embedding",
|
||||
param={"metric_type": "COSINE", "params": {"nprobe": settings.milvus_nprobe}},
|
||||
limit=top_k,
|
||||
expr=milvus_expr,
|
||||
output_fields=[
|
||||
"doc_id",
|
||||
"doc_title",
|
||||
"chunk_id",
|
||||
"chunk_index",
|
||||
"piece_index",
|
||||
"text",
|
||||
"embedding_text",
|
||||
"section_title",
|
||||
"semantic_id",
|
||||
"chunk_type",
|
||||
"page_start",
|
||||
"page_end",
|
||||
"section_level",
|
||||
"source_ids",
|
||||
"section_path",
|
||||
"metadata_json",
|
||||
],
|
||||
)
|
||||
)
|
||||
payload: list[RetrievedChunk] = []
|
||||
for hits in results:
|
||||
@@ -161,13 +317,18 @@ class MilvusVectorIndex(VectorIndex):
|
||||
metadata = {"raw_metadata": raw_metadata}
|
||||
payload.append(
|
||||
RetrievedChunk(
|
||||
chunk_id=str(hit.id),
|
||||
chunk_id=str(hit.entity.get("chunk_id", hit.id)),
|
||||
doc_id=hit.entity.get("doc_id", ""),
|
||||
doc_name=hit.entity.get("doc_name", ""),
|
||||
content=hit.entity.get("content", ""),
|
||||
doc_title=hit.entity.get("doc_title", ""),
|
||||
text=hit.entity.get("text", ""),
|
||||
score=float(hit.score),
|
||||
chunk_type=hit.entity.get("chunk_type", ""),
|
||||
section_title=hit.entity.get("section_title", ""),
|
||||
page_number=int(hit.entity.get("page_number", 0) or 0),
|
||||
page_start=int(hit.entity.get("page_start", 0) or 0),
|
||||
page_end=int(hit.entity.get("page_end", 0) or 0),
|
||||
section_level=int(hit.entity.get("section_level", 0) or 0),
|
||||
chunk_index=int(hit.entity.get("chunk_index", 0) or 0),
|
||||
piece_index=int(hit.entity.get("piece_index", 0) or 0),
|
||||
metadata=metadata,
|
||||
)
|
||||
)
|
||||
@@ -176,7 +337,9 @@ class MilvusVectorIndex(VectorIndex):
|
||||
def count_by_document(self) -> dict[str, int]:
|
||||
"""Return doc_id -> chunk count from Milvus."""
|
||||
try:
|
||||
rows = self.collection.query(expr="doc_id != \"\"", output_fields=["doc_id"])
|
||||
rows = self._run_with_refresh(
|
||||
lambda: self.collection.query(expr="doc_id != \"\"", output_fields=["doc_id", "doc_title"])
|
||||
)
|
||||
except Exception:
|
||||
return {}
|
||||
counts: dict[str, int] = {}
|
||||
@@ -189,9 +352,11 @@ class MilvusVectorIndex(VectorIndex):
|
||||
def list_document_metadata(self) -> list[dict]:
|
||||
"""Return one metadata row per document from Milvus (single query, no embeddings)."""
|
||||
try:
|
||||
rows = self.collection.query(
|
||||
expr="doc_id != \"\"",
|
||||
output_fields=["doc_id", "doc_name", "regulation_type", "version"],
|
||||
rows = self._run_with_refresh(
|
||||
lambda: self.collection.query(
|
||||
expr="doc_id != \"\"",
|
||||
output_fields=["doc_id", "doc_title", "metadata_json"],
|
||||
)
|
||||
)
|
||||
except Exception:
|
||||
return []
|
||||
@@ -204,15 +369,26 @@ class MilvusVectorIndex(VectorIndex):
|
||||
continue
|
||||
counts[doc_id] = counts.get(doc_id, 0) + 1
|
||||
if doc_id not in seen:
|
||||
metadata: dict[str, object] = {}
|
||||
raw_metadata = row.get("metadata_json", "")
|
||||
if raw_metadata:
|
||||
try:
|
||||
metadata = json.loads(raw_metadata)
|
||||
except json.JSONDecodeError:
|
||||
metadata = {}
|
||||
seen[doc_id] = {
|
||||
"doc_id": doc_id,
|
||||
"doc_name": row.get("doc_name", ""),
|
||||
"regulation_type": row.get("regulation_type", ""),
|
||||
"version": row.get("version", ""),
|
||||
"doc_title": row.get("doc_title", ""),
|
||||
"regulation_type": str(metadata.get("regulation_type", "")),
|
||||
"version": str(metadata.get("version", "")),
|
||||
}
|
||||
|
||||
return [
|
||||
{**meta, "chunk_count": counts[meta["doc_id"]]}
|
||||
{
|
||||
**meta,
|
||||
"doc_name": meta.get("doc_title", ""),
|
||||
"chunk_count": counts[meta["doc_id"]],
|
||||
}
|
||||
for meta in seen.values()
|
||||
]
|
||||
|
||||
|
||||
@@ -67,14 +67,14 @@ class DocumentProcessor:
|
||||
return [
|
||||
{
|
||||
"id": item.chunk_id,
|
||||
"content": item.content,
|
||||
"content": item.text,
|
||||
"score": item.score,
|
||||
"metadata": {
|
||||
"doc_id": item.doc_id,
|
||||
"doc_name": item.doc_name,
|
||||
"doc_name": item.doc_title,
|
||||
"chunk_id": item.chunk_id,
|
||||
"section_title": item.section_title,
|
||||
"page_number": item.page_number,
|
||||
"page_number": item.page_start,
|
||||
**item.metadata,
|
||||
},
|
||||
}
|
||||
|
||||
30
backend/app/shared/errors.py
Normal file
30
backend/app/shared/errors.py
Normal file
@@ -0,0 +1,30 @@
|
||||
"""Define shared backend exception types."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class VectorStoreSchemaError(RuntimeError):
    """Signal that the active vector store schema does not match backend expectations."""

    def __init__(
        self,
        *,
        message: str,
        host: str,
        db_name: str,
        collection_name: str,
        expected_fields: list[str],
        actual_fields: list[str],
    ) -> None:
        """Initialize the vector store schema error details.

        Args:
            message: Short description of the mismatch.
            host: Host of the vector store.
            db_name: Database name.
            collection_name: Collection whose schema was checked.
            expected_fields: Field names the backend requires.
            actual_fields: Field names found on the collection.
        """
        # Kept so __reduce__ can rebuild the error from its original keyword values.
        self.message = message
        self.host = host
        self.db_name = db_name
        self.collection_name = collection_name
        self.expected_fields = expected_fields
        self.actual_fields = actual_fields
        # Keep the message self-contained so runtime logs show the full mismatch context.
        details = (
            f"{message} | host={host} db={db_name} collection={collection_name} "
            f"expected_fields={expected_fields} actual_fields={actual_fields}"
        )
        super().__init__(details)

    def __reduce__(self):
        """Support ``pickle`` and ``copy`` despite the keyword-only constructor.

        The default ``BaseException.__reduce__`` replays ``self.args``
        positionally, which this constructor rejects with ``TypeError``.
        """
        import functools

        rebuild = functools.partial(
            type(self),
            message=self.message,
            host=self.host,
            db_name=self.db_name,
            collection_name=self.collection_name,
            expected_fields=self.expected_fields,
            actual_fields=self.actual_fields,
        )
        return (rebuild, ())
|
||||
@@ -1 +0,0 @@
|
||||
{}
|
||||
131
backend/data/document_processing.json
Normal file
131
backend/data/document_processing.json
Normal file
@@ -0,0 +1,131 @@
|
||||
{
|
||||
"runs": {
|
||||
"8e722053-5009-40fe-a483-535b40ebbb16": {
|
||||
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
|
||||
"doc_id": "7cbdfe3c",
|
||||
"trigger_type": "upload",
|
||||
"run_status": "succeeded",
|
||||
"parser_backend": "aliyun_docmind",
|
||||
"chunk_backend": "aliyun",
|
||||
"embedding_model": "text-embedding-v3",
|
||||
"index_name": "regulations_dense_1024_v2",
|
||||
"started_at": "2026-05-26T12:18:27.208692+00:00",
|
||||
"stored_at": "2026-05-26T12:18:27.712855+00:00",
|
||||
"parsed_at": "2026-05-26T12:18:42.989238+00:00",
|
||||
"indexed_at": "2026-05-26T12:18:51.172418+00:00",
|
||||
"finished_at": "2026-05-26T12:18:51.172418+00:00",
|
||||
"layout_count": 48,
|
||||
"structure_node_count": 6,
|
||||
"semantic_block_count": 33,
|
||||
"vector_chunk_count": 34,
|
||||
"chunk_count": 34,
|
||||
"failure_stage": "",
|
||||
"error_message": "",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"parse_task_id": "docmind-20260526-10b94713ccb348498b12180a5dcf32ff"
|
||||
}
|
||||
}
|
||||
},
|
||||
"status_events": {
|
||||
"d0532baf-0d65-4130-b282-ec51f04132fd": {
|
||||
"event_id": "d0532baf-0d65-4130-b282-ec51f04132fd",
|
||||
"doc_id": "7cbdfe3c",
|
||||
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
|
||||
"from_status": "",
|
||||
"to_status": "pending",
|
||||
"stage": "document_created",
|
||||
"message": "Document record created",
|
||||
"metadata": {},
|
||||
"occurred_at": "2026-05-26T12:18:27.235921+00:00"
|
||||
},
|
||||
"a5e32db5-25c3-4c73-a987-7311f0e72a31": {
|
||||
"event_id": "a5e32db5-25c3-4c73-a987-7311f0e72a31",
|
||||
"doc_id": "7cbdfe3c",
|
||||
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
|
||||
"from_status": "pending",
|
||||
"to_status": "stored",
|
||||
"stage": "store",
|
||||
"message": "Source file stored",
|
||||
"metadata": {},
|
||||
"occurred_at": "2026-05-26T12:18:27.741462+00:00"
|
||||
},
|
||||
"18e04ce7-9d7a-4008-8600-e2590100bd85": {
|
||||
"event_id": "18e04ce7-9d7a-4008-8600-e2590100bd85",
|
||||
"doc_id": "7cbdfe3c",
|
||||
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
|
||||
"from_status": "stored",
|
||||
"to_status": "parsed",
|
||||
"stage": "parse",
|
||||
"message": "Document parsed",
|
||||
"metadata": {
|
||||
"artifact_count": 4
|
||||
},
|
||||
"occurred_at": "2026-05-26T12:18:43.218026+00:00"
|
||||
},
|
||||
"d3b06025-5c91-4a42-9e5f-dce1c5312b96": {
|
||||
"event_id": "d3b06025-5c91-4a42-9e5f-dce1c5312b96",
|
||||
"doc_id": "7cbdfe3c",
|
||||
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
|
||||
"from_status": "parsed",
|
||||
"to_status": "indexed",
|
||||
"stage": "index",
|
||||
"message": "Document indexed",
|
||||
"metadata": {
|
||||
"chunk_count": 34,
|
||||
"index_name": "regulations_dense_1024_v2"
|
||||
},
|
||||
"occurred_at": "2026-05-26T12:18:51.195442+00:00"
|
||||
}
|
||||
},
|
||||
"artifacts": {
|
||||
"47fe2877-a8f5-4e1d-901b-80cd0194ba96": {
|
||||
"artifact_id": "47fe2877-a8f5-4e1d-901b-80cd0194ba96",
|
||||
"doc_id": "7cbdfe3c",
|
||||
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
|
||||
"artifact_type": "layouts",
|
||||
"object_name": "artifacts/7cbdfe3c/layouts.json",
|
||||
"content_type": "application/json",
|
||||
"byte_size": 0,
|
||||
"checksum": "",
|
||||
"metadata": {},
|
||||
"created_at": "2026-05-26T12:18:43.188467+00:00"
|
||||
},
|
||||
"44aa075b-86b2-48a7-9d14-a2453bd53863": {
|
||||
"artifact_id": "44aa075b-86b2-48a7-9d14-a2453bd53863",
|
||||
"doc_id": "7cbdfe3c",
|
||||
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
|
||||
"artifact_type": "structure_nodes",
|
||||
"object_name": "artifacts/7cbdfe3c/structure_nodes.json",
|
||||
"content_type": "application/json",
|
||||
"byte_size": 0,
|
||||
"checksum": "",
|
||||
"metadata": {},
|
||||
"created_at": "2026-05-26T12:18:43.188494+00:00"
|
||||
},
|
||||
"dedcc8fe-fa58-4de6-984d-f44332af5204": {
|
||||
"artifact_id": "dedcc8fe-fa58-4de6-984d-f44332af5204",
|
||||
"doc_id": "7cbdfe3c",
|
||||
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
|
||||
"artifact_type": "semantic_blocks",
|
||||
"object_name": "artifacts/7cbdfe3c/semantic_blocks.json",
|
||||
"content_type": "application/json",
|
||||
"byte_size": 0,
|
||||
"checksum": "",
|
||||
"metadata": {},
|
||||
"created_at": "2026-05-26T12:18:43.188511+00:00"
|
||||
},
|
||||
"9b0d8bda-e69e-4a4e-ae06-a308afe43109": {
|
||||
"artifact_id": "9b0d8bda-e69e-4a4e-ae06-a308afe43109",
|
||||
"doc_id": "7cbdfe3c",
|
||||
"run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
|
||||
"artifact_type": "vector_chunks",
|
||||
"object_name": "artifacts/7cbdfe3c/vector_chunks.json",
|
||||
"content_type": "application/json",
|
||||
"byte_size": 0,
|
||||
"checksum": "",
|
||||
"metadata": {},
|
||||
"created_at": "2026-05-26T12:18:43.188526+00:00"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,392 +1,9 @@
|
||||
{
|
||||
"69280841": {
|
||||
"doc_id": "69280841",
|
||||
"doc_name": "TCT算法接口.pdf",
|
||||
"file_name": "TCT算法接口.pdf",
|
||||
"object_name": "69280841/TCT算法接口.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 165557,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "local_markdown_parser",
|
||||
"index_name": "",
|
||||
"error_message": "embedding 维度不匹配,期望 1536",
|
||||
"created_at": "2026-05-18T07:12:16.668306+00:00",
|
||||
"updated_at": "2026-05-18T07:12:19.417142+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"structure_nodes": 0
|
||||
}
|
||||
},
|
||||
"44121fbb": {
|
||||
"doc_id": "44121fbb",
|
||||
"doc_name": "大众汽车手册.pdf",
|
||||
"file_name": "大众汽车手册.pdf",
|
||||
"object_name": "44121fbb/大众汽车手册.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 766565,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "",
|
||||
"index_name": "",
|
||||
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"created_at": "2026-05-18T09:53:47.996183+00:00",
|
||||
"updated_at": "2026-05-18T09:53:50.825868+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"processing_stage": "failed"
|
||||
}
|
||||
},
|
||||
"77debb4a": {
|
||||
"doc_id": "77debb4a",
|
||||
"doc_name": "大众汽车手册.pdf",
|
||||
"file_name": "大众汽车手册.pdf",
|
||||
"object_name": "77debb4a/大众汽车手册.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 766565,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "",
|
||||
"index_name": "",
|
||||
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"created_at": "2026-05-18T10:05:46.104259+00:00",
|
||||
"updated_at": "2026-05-18T10:05:48.704061+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"processing_stage": "failed"
|
||||
}
|
||||
},
|
||||
"d12bdcc8": {
|
||||
"doc_id": "d12bdcc8",
|
||||
"doc_name": "TCT算法接口.pdf",
|
||||
"file_name": "TCT算法接口.pdf",
|
||||
"object_name": "d12bdcc8/TCT算法接口.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 165557,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "",
|
||||
"index_name": "",
|
||||
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"created_at": "2026-05-18T10:07:22.199824+00:00",
|
||||
"updated_at": "2026-05-18T10:07:24.653751+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"processing_stage": "failed"
|
||||
}
|
||||
},
|
||||
"3c2e8c9c": {
|
||||
"doc_id": "3c2e8c9c",
|
||||
"doc_name": "20260415_Continental tire mobile app solution.pdf",
|
||||
"file_name": "20260415_Continental tire mobile app solution.pdf",
|
||||
"object_name": "3c2e8c9c/20260415_Continental tire mobile app solution.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 2178074,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "",
|
||||
"index_name": "",
|
||||
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"created_at": "2026-05-18T10:09:58.338274+00:00",
|
||||
"updated_at": "2026-05-18T10:10:01.295502+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"processing_stage": "failed"
|
||||
}
|
||||
},
|
||||
"d22d21a0": {
|
||||
"doc_id": "d22d21a0",
|
||||
"doc_name": "20260415_Continental tire mobile app solution.pdf",
|
||||
"file_name": "20260415_Continental tire mobile app solution.pdf",
|
||||
"object_name": "d22d21a0/20260415_Continental tire mobile app solution.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 2178074,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "",
|
||||
"index_name": "",
|
||||
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"created_at": "2026-05-18T10:12:20.078027+00:00",
|
||||
"updated_at": "2026-05-18T10:12:22.999843+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"processing_stage": "failed"
|
||||
}
|
||||
},
|
||||
"35f129d3": {
|
||||
"doc_id": "35f129d3",
|
||||
"doc_name": "大众汽车手册.pdf",
|
||||
"file_name": "大众汽车手册.pdf",
|
||||
"object_name": "35f129d3/大众汽车手册.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 766565,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "",
|
||||
"index_name": "",
|
||||
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"created_at": "2026-05-18T10:13:24.706512+00:00",
|
||||
"updated_at": "2026-05-18T10:13:27.180509+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
|
||||
"processing_stage": "failed"
|
||||
}
|
||||
},
|
||||
"efc21515": {
|
||||
"doc_id": "efc21515",
|
||||
"doc_name": "大众汽车手册.pdf",
|
||||
"file_name": "大众汽车手册.pdf",
|
||||
"object_name": "efc21515/大众汽车手册.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 766565,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "aliyun_docmind",
|
||||
"index_name": "",
|
||||
"error_message": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
|
||||
"created_at": "2026-05-18T13:47:32.076786+00:00",
|
||||
"updated_at": "2026-05-18T13:47:57.998073+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"parser_backend": "aliyun_docmind",
|
||||
"parse_task_id": "docmind-20260518-a6e84447457f43cb85f95225cfc6495b",
|
||||
"layout_count": 87,
|
||||
"structure_node_count": 20,
|
||||
"semantic_block_count": 27,
|
||||
"vector_chunk_count": 27,
|
||||
"artifact_keys": {
|
||||
"layouts": "artifacts/efc21515/layouts.json",
|
||||
"structure_nodes": "artifacts/efc21515/structure_nodes.json",
|
||||
"semantic_blocks": "artifacts/efc21515/semantic_blocks.json",
|
||||
"vector_chunks": "artifacts/efc21515/vector_chunks.json"
|
||||
},
|
||||
"processing_stage": "failed",
|
||||
"failure_reason": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400"
|
||||
}
|
||||
},
|
||||
"0d4b08bc": {
|
||||
"doc_id": "0d4b08bc",
|
||||
"doc_name": "大众汽车手册.pdf",
|
||||
"file_name": "大众汽车手册.pdf",
|
||||
"object_name": "0d4b08bc/大众汽车手册.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 766565,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "aliyun_docmind",
|
||||
"index_name": "",
|
||||
"error_message": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
|
||||
"created_at": "2026-05-18T14:03:15.134344+00:00",
|
||||
"updated_at": "2026-05-18T14:03:34.843448+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"parser_backend": "aliyun_docmind",
|
||||
"parse_task_id": "docmind-20260518-78353d85daa24147b68d8fb71895179f",
|
||||
"layout_count": 87,
|
||||
"structure_node_count": 20,
|
||||
"semantic_block_count": 27,
|
||||
"vector_chunk_count": 27,
|
||||
"artifact_keys": {
|
||||
"layouts": "artifacts/0d4b08bc/layouts.json",
|
||||
"structure_nodes": "artifacts/0d4b08bc/structure_nodes.json",
|
||||
"semantic_blocks": "artifacts/0d4b08bc/semantic_blocks.json",
|
||||
"vector_chunks": "artifacts/0d4b08bc/vector_chunks.json"
|
||||
},
|
||||
"processing_stage": "failed",
|
||||
"failure_reason": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404"
|
||||
}
|
||||
},
|
||||
"4302f314": {
|
||||
"doc_id": "4302f314",
|
||||
"doc_name": "大众汽车手册.pdf",
|
||||
"file_name": "大众汽车手册.pdf",
|
||||
"object_name": "4302f314/大众汽车手册.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 766565,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "aliyun_docmind",
|
||||
"index_name": "",
|
||||
"error_message": "embedding 维度不匹配,期望 1536",
|
||||
"created_at": "2026-05-18T14:11:29.943973+00:00",
|
||||
"updated_at": "2026-05-18T14:11:48.554500+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"parser_backend": "aliyun_docmind",
|
||||
"parse_task_id": "docmind-20260518-23935ee455ac4b26ac4201ac4781ee52",
|
||||
"layout_count": 87,
|
||||
"structure_node_count": 20,
|
||||
"semantic_block_count": 27,
|
||||
"vector_chunk_count": 27,
|
||||
"artifact_keys": {
|
||||
"layouts": "artifacts/4302f314/layouts.json",
|
||||
"structure_nodes": "artifacts/4302f314/structure_nodes.json",
|
||||
"semantic_blocks": "artifacts/4302f314/semantic_blocks.json",
|
||||
"vector_chunks": "artifacts/4302f314/vector_chunks.json"
|
||||
},
|
||||
"processing_stage": "failed",
|
||||
"failure_reason": "embedding 维度不匹配,期望 1536"
|
||||
}
|
||||
},
|
||||
"765ed1ee": {
|
||||
"doc_id": "765ed1ee",
|
||||
"doc_name": "大众汽车手册.pdf",
|
||||
"file_name": "大众汽车手册.pdf",
|
||||
"object_name": "765ed1ee/大众汽车手册.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 766565,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "aliyun_docmind",
|
||||
"index_name": "",
|
||||
"error_message": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>",
|
||||
"created_at": "2026-05-18T14:18:28.875138+00:00",
|
||||
"updated_at": "2026-05-18T14:18:57.389110+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"parser_backend": "aliyun_docmind",
|
||||
"parse_task_id": "docmind-20260518-f116856bc29245baa2531b245078a701",
|
||||
"layout_count": 87,
|
||||
"structure_node_count": 20,
|
||||
"semantic_block_count": 27,
|
||||
"vector_chunk_count": 27,
|
||||
"artifact_keys": {
|
||||
"layouts": "artifacts/765ed1ee/layouts.json",
|
||||
"structure_nodes": "artifacts/765ed1ee/structure_nodes.json",
|
||||
"semantic_blocks": "artifacts/765ed1ee/semantic_blocks.json",
|
||||
"vector_chunks": "artifacts/765ed1ee/vector_chunks.json"
|
||||
},
|
||||
"processing_stage": "failed",
|
||||
"failure_reason": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>"
|
||||
}
|
||||
},
|
||||
"05cabe09": {
|
||||
"doc_id": "05cabe09",
|
||||
"doc_name": "大众汽车手册.pdf",
|
||||
"file_name": "大众汽车手册.pdf",
|
||||
"object_name": "05cabe09/大众汽车手册.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 766565,
|
||||
"status": "failed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 0,
|
||||
"parser_name": "aliyun_docmind",
|
||||
"index_name": "",
|
||||
"error_message": "embedding 维度不匹配,期望 1536",
|
||||
"created_at": "2026-05-18T14:24:32.156500+00:00",
|
||||
"updated_at": "2026-05-18T14:24:50.114138+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"parser_backend": "aliyun_docmind",
|
||||
"parse_task_id": "docmind-20260518-897d858983df48e28e9819e563d46208",
|
||||
"layout_count": 87,
|
||||
"structure_node_count": 20,
|
||||
"semantic_block_count": 27,
|
||||
"vector_chunk_count": 27,
|
||||
"artifact_keys": {
|
||||
"layouts": "artifacts/05cabe09/layouts.json",
|
||||
"structure_nodes": "artifacts/05cabe09/structure_nodes.json",
|
||||
"semantic_blocks": "artifacts/05cabe09/semantic_blocks.json",
|
||||
"vector_chunks": "artifacts/05cabe09/vector_chunks.json"
|
||||
},
|
||||
"processing_stage": "failed",
|
||||
"failure_reason": "embedding 维度不匹配,期望 1536"
|
||||
}
|
||||
},
|
||||
"9acb2ba0": {
|
||||
"doc_id": "9acb2ba0",
|
||||
"doc_name": "大众汽车手册.pdf",
|
||||
"file_name": "大众汽车手册.pdf",
|
||||
"object_name": "9acb2ba0/大众汽车手册.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"size_bytes": 766565,
|
||||
"status": "indexed",
|
||||
"regulation_type": "",
|
||||
"version": "",
|
||||
"summary": "",
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 27,
|
||||
"parser_name": "aliyun_docmind",
|
||||
"index_name": "regulations_dense_1024_v1",
|
||||
"error_message": "",
|
||||
"created_at": "2026-05-18T14:29:01.368719+00:00",
|
||||
"updated_at": "2026-05-18T14:29:23.699068+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"parser_backend": "aliyun_docmind",
|
||||
"parse_task_id": "docmind-20260518-e5fd4a5419e74d569c562e389e6ae72c",
|
||||
"layout_count": 87,
|
||||
"structure_node_count": 20,
|
||||
"semantic_block_count": 27,
|
||||
"vector_chunk_count": 27,
|
||||
"artifact_keys": {
|
||||
"layouts": "artifacts/9acb2ba0/layouts.json",
|
||||
"structure_nodes": "artifacts/9acb2ba0/structure_nodes.json",
|
||||
"semantic_blocks": "artifacts/9acb2ba0/semantic_blocks.json",
|
||||
"vector_chunks": "artifacts/9acb2ba0/vector_chunks.json"
|
||||
},
|
||||
"processing_stage": "indexed",
|
||||
"index_collection": "regulations_dense_1024_v1"
|
||||
}
|
||||
},
|
||||
"52bd970f": {
|
||||
"doc_id": "52bd970f",
|
||||
"7cbdfe3c": {
|
||||
"doc_id": "7cbdfe3c",
|
||||
"doc_name": "使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
|
||||
"file_name": "使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
|
||||
"object_name": "52bd970f/使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
|
||||
"object_name": "7cbdfe3c/使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
|
||||
"content_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"size_bytes": 1199920,
|
||||
"status": "indexed",
|
||||
@@ -396,26 +13,26 @@
|
||||
"summary_latency_ms": 0,
|
||||
"chunk_count": 34,
|
||||
"parser_name": "aliyun_docmind",
|
||||
"index_name": "regulations_dense_1024_v1",
|
||||
"index_name": "regulations_dense_1024_v2",
|
||||
"error_message": "",
|
||||
"created_at": "2026-05-25T07:45:12.777459+00:00",
|
||||
"updated_at": "2026-05-25T07:45:37.314290+00:00",
|
||||
"created_at": "2026-05-26T12:18:27.206125+00:00",
|
||||
"updated_at": "2026-05-26T12:18:51.171308+00:00",
|
||||
"metadata": {
|
||||
"generate_summary": true,
|
||||
"parser_backend": "aliyun_docmind",
|
||||
"parse_task_id": "docmind-20260525-6d782dc33f2748a4a1020df765b8182d",
|
||||
"parse_task_id": "docmind-20260526-10b94713ccb348498b12180a5dcf32ff",
|
||||
"layout_count": 48,
|
||||
"structure_node_count": 6,
|
||||
"semantic_block_count": 33,
|
||||
"vector_chunk_count": 34,
|
||||
"artifact_keys": {
|
||||
"layouts": "artifacts/52bd970f/layouts.json",
|
||||
"structure_nodes": "artifacts/52bd970f/structure_nodes.json",
|
||||
"semantic_blocks": "artifacts/52bd970f/semantic_blocks.json",
|
||||
"vector_chunks": "artifacts/52bd970f/vector_chunks.json"
|
||||
"layouts": "artifacts/7cbdfe3c/layouts.json",
|
||||
"structure_nodes": "artifacts/7cbdfe3c/structure_nodes.json",
|
||||
"semantic_blocks": "artifacts/7cbdfe3c/semantic_blocks.json",
|
||||
"vector_chunks": "artifacts/7cbdfe3c/vector_chunks.json"
|
||||
},
|
||||
"processing_stage": "indexed",
|
||||
"index_collection": "regulations_dense_1024_v1"
|
||||
"index_collection": "regulations_dense_1024_v2"
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user