feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
ash66
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions

18
.env
View File

@@ -9,7 +9,7 @@ DEBUG=false
# ===== Milvus向量数据库配置已有=====
MILVUS_HOST=localhost
MILVUS_PORT=19530
MILVUS_COLLECTION=regulations_dense_1536
MILVUS_COLLECTION=regulations_dense_1024_v1
MILVUS_DB_NAME=default
# ===== MinIO对象存储配置已有=====
@@ -34,7 +34,7 @@ POSTGRES_DB=compliance_db
# ===== 嵌入模型配置 =====
EMBEDDING_MODEL=text-embedding-v3
EMBEDDING_DIM=1536
EMBEDDING_DIM=1024
EMBEDDING_API_KEY=your_embedding_api_key_here
EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1
EMBEDDING_TIMEOUT_SECONDS=120
@@ -59,7 +59,7 @@ LLM_TEMPERATURE=0.7
# 获取API Key: https://dashscope.console.aliyun.com/
QWEN_API_KEY=your_qwen_api_key_here
QWEN_BASE_URL=http://6.86.80.4:30080/v1
QWEN_MODEL=qwen3.5-plus
QWEN_MODEL=qwen3.6-plus
QWEN_VL_MODEL=qwen3-vl-plus
# ===== DeepSeek API配置 =====
@@ -73,3 +73,15 @@ RAG_TOP_K=10
RAG_MAX_CONTEXT_TOKENS=4000
RAG_SUMMARY_MAX_TOKENS=1024
RAG_SKILLS_MAX_TOKENS=2048
# ===== 阿里云文档解析 =====
ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
ALIYUN_PARSE_POLL_INTERVAL_SECONDS=5
ALIYUN_PARSE_TIMEOUT_SECONDS=900
ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
ALIYUN_LLM_ENHANCEMENT=true
ALIYUN_ENHANCEMENT_MODE=VLM
DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
PARSER_FAILURE_MODE=fail

View File

@@ -4,7 +4,7 @@
# ===== Milvus向量数据库配置已有=====
MILVUS_HOST=6.86.80.8
MILVUS_PORT=19530
MILVUS_COLLECTION=regulations_dense_1536
MILVUS_COLLECTION=regulations_dense_1024_v1
MILVUS_DB_NAME=default
# ===== MinIO对象存储配置已有=====
@@ -26,4 +26,3 @@ POSTGRES_PORT=5432
POSTGRES_USER=postgresql
POSTGRES_PASSWORD=postgresql123456
POSTGRES_DB=compliance_db

View File

@@ -9,12 +9,12 @@ DEBUG=false
# ===== Milvus向量数据库配置 =====
MILVUS_HOST=localhost
MILVUS_PORT=19530
MILVUS_COLLECTION=regulations_dense_1536
MILVUS_COLLECTION=regulations_dense_1024_v1
MILVUS_DB_NAME=default
# ===== 嵌入模型配置 =====
EMBEDDING_MODEL=text-embedding-v3
EMBEDDING_DIM=1536
EMBEDDING_DIM=1024
EMBEDDING_API_KEY=your_embedding_api_key_here
EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1
EMBEDDING_TIMEOUT_SECONDS=120
@@ -44,11 +44,20 @@ CHUNK_SIZE=512
CHUNK_OVERLAP=50
MAX_FILE_SIZE_MB=100
DOCUMENT_METADATA_PATH=backend/data/documents.json
PARSER_BACKEND=aliyun
CHUNK_BACKEND=aliyun
# ===== 阿里云文档解析 =====
ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
ALIYUN_PARSE_POLL_INTERVAL_SECONDS=5
ALIYUN_PARSE_TIMEOUT_SECONDS=900
ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
ALIYUN_LLM_ENHANCEMENT=true
ALIYUN_ENHANCEMENT_MODE=VLM
DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
PARSER_FAILURE_MODE=fail
# ===== API服务配置 =====
API_HOST=0.0.0.0
@@ -73,7 +82,7 @@ DEEPSEEK_BASE_URL=http://6.86.80.4:30080/v1
# Qwen系列: qwen3.5-plus, qwen3-plus, qwen-max, qwen-turbo, qwen-long
# Qwen VL系列: qwen3-vl-plus, qwen-vl-max
# DeepSeek系列: deepseek-v4-flash, deepseek-v3.2, deepseek-v3, deepseek-chat, deepseek-coder
QWEN_MODEL=qwen3.5-plus
QWEN_MODEL=qwen3.6-plus
QWEN_VL_MODEL=qwen3-vl-plus
DEEPSEEK_MODEL=deepseek-v4-flash

View File

@@ -106,6 +106,9 @@ ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
EMBEDDING_API_KEY=your_embedding_api_key_here
EMBEDDING_MODEL=text-embedding-v3
EMBEDDING_DIM=1024
PARSER_BACKEND=aliyun
CHUNK_BACKEND=aliyun
PARSER_FAILURE_MODE=fail
```
---

View File

@@ -39,7 +39,7 @@ AIRegulation-DocAnalysis-Demo/
### 1. 安装依赖
```bash
pip install -r backend/requirements.txt
./dev.sh setup
```
### 2. 启动Milvus向量数据库
@@ -57,7 +57,7 @@ docker-compose logs -f milvus
### 3. 启动API服务
```bash
PYTHONPATH=backend uvicorn app.main:app --reload --port 8000
./dev.sh start api --foreground
```
访问API文档:http://localhost:8000/docs
@@ -104,6 +104,8 @@ MILVUS_PORT=19530
# 阿里云文档解析
ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
PARSER_BACKEND=aliyun
CHUNK_BACKEND=aliyun
# embedding 配置
EMBEDDING_MODEL=text-embedding-v3
@@ -121,6 +123,17 @@ CHUNK_SIZE=512
- 混合检索问答功能
- 法规变更监控与自动更新
## 解析产物
上传成功后,系统会把阿里云解析的中间结果持久化到 MinIO:
- `artifacts/{doc_id}/layouts.json`
- `artifacts/{doc_id}/structure_nodes.json`
- `artifacts/{doc_id}/semantic_blocks.json`
- `artifacts/{doc_id}/vector_chunks.json`
当前默认 Milvus collection 为 `regulations_dense_1024_v1`。
## 许可证
MIT License

View File

@@ -1,8 +0,0 @@
{
"permissions": {
"allow": [
"Bash(python3 *)",
"Bash(PGPASSWORD=postgresql123456 psql *)"
]
}
}

View File

@@ -1,516 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Handle Aliyun parsing support for parse pdf."""
import argparse
import json
import os
import re
import time
from pathlib import Path
from typing import Dict, List
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_docmind_api20220711 import models as docmind_models
from alibabacloud_tea_util import models as util_models
# Aliyun Document Mind credentials and endpoint, read from the environment at import time.
# The two credential values default to "" so a missing setting can be detected by init_client().
ALIBABA_ACCESS_KEY_ID = os.getenv("ALIBABA_ACCESS_KEY_ID", "")
ALIBABA_ACCESS_KEY_SECRET = os.getenv("ALIBABA_ACCESS_KEY_SECRET", "")
ALIBABA_ENDPOINT = os.getenv("ALIBABA_ENDPOINT", "docmind-api.cn-hangzhou.aliyuncs.com")
# Default chunking limits, in characters, used as the defaults for max_chars / overlap_chars.
MAX_CHARS = 600
OVERLAP_CHARS = 80
# Lookup sets used by the is_* layout classifiers below.
# TOC_TITLES: exact heading texts that mark a table-of-contents entry.
TOC_TITLES = {"目次", "目录"}
# Layout "subType" values treated as headings and as body text respectively.
TITLE_SUBTYPES = {"doc_title", "para_title"}
TEXT_SUBTYPES = {"para", "none"}
# Layout "type" / "subType" values treated as figure content.
FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def init_client() -> DocmindClient:
    """Build a Document Mind client from the module-level Aliyun settings.

    Returns:
        A ``DocmindClient`` configured with the access key pair and endpoint.

    Raises:
        ValueError: If the access key id or the access key secret is empty.
    """
    credentials_present = bool(ALIBABA_ACCESS_KEY_ID) and bool(ALIBABA_ACCESS_KEY_SECRET)
    if not credentials_present:
        raise ValueError("缺少阿里云文档解析凭据,请设置 ALIBABA_ACCESS_KEY_ID 和 ALIBABA_ACCESS_KEY_SECRET")

    client_config = open_api_models.Config(
        access_key_id=ALIBABA_ACCESS_KEY_ID,
        access_key_secret=ALIBABA_ACCESS_KEY_SECRET,
    )
    client_config.endpoint = ALIBABA_ENDPOINT
    return DocmindClient(client_config)
def submit_job(client: DocmindClient, file_path: str) -> str:
    """Upload a local file to Document Mind and start a parsing job.

    The job is submitted with LLM enhancement enabled in ``"VLM"`` mode.

    Args:
        client: Document Mind client used to submit the job.
        file_path: Path of the local file to upload.

    Returns:
        The id of the submitted job, taken from ``response.body.data.id``.
    """
    source = Path(file_path)
    runtime = util_models.RuntimeOptions()
    # Keep the handle open only for the duration of the upload call so it is
    # always closed, including when the submission raises.
    with source.open("rb") as stream:
        request = docmind_models.SubmitDocParserJobAdvanceRequest(
            file_url_object=stream,
            file_name=source.name,
            file_name_extension=source.suffix.lstrip("."),
            llm_enhancement=True,
            enhancement_mode="VLM",
        )
        response = client.submit_doc_parser_job_advance(request, runtime)
    return response.body.data.id
def query_status(client: DocmindClient, task_id: str) -> Dict:
    """Fetch the current status record of a parsing job.

    Args:
        client: Document Mind client used for the query.
        task_id: Id of the job to look up.

    Returns:
        The status payload converted with ``to_map()``, or ``None`` when the
        response carries no data.
    """
    status_request = docmind_models.QueryDocParserStatusRequest(id=task_id)
    payload = client.query_doc_parser_status(status_request).body.data
    if not payload:
        return None
    return payload.to_map()
def wait_for_completion(
    client: DocmindClient,
    task_id: str,
    poll_interval: int = 5,
    timeout: float = None,
) -> bool:
    """Poll a parsing job until it finishes, fails, or times out.

    Args:
        client: Document Mind client used for status queries.
        task_id: Id of the job to wait for.
        poll_interval: Seconds to sleep between two status queries.
        timeout: Maximum number of seconds to wait. ``None`` (the default)
            keeps the original behaviour of waiting indefinitely.

    Returns:
        ``True`` when the job reports success; ``False`` when the status query
        returns no data, the job reports failure, or the timeout elapses.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        status_data = query_status(client, task_id)
        if not status_data:
            return False
        # The "Status" value may be missing or null; treat both as "unknown".
        status = (status_data.get("Status") or "").lower()
        if status == "success":
            return True
        # NOTE(review): the service is believed to report failure as "Fail";
        # the original code only matched "failed". Accept both spellings so a
        # failed job cannot be polled forever.
        if status in ("fail", "failed"):
            print(f"任务失败: {status_data}")
            return False
        if deadline is not None and time.monotonic() >= deadline:
            print(f"任务超时: {status_data}")
            return False
        print(f"任务状态: {status}, 等待中...")
        time.sleep(poll_interval)
def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_step_size: int = 50) -> Dict:
    """Fetch one page of layout results for a finished parsing job.

    Args:
        client: Document Mind client used for the request.
        task_id: Id of the job whose result is fetched.
        layout_num: Value forwarded as the request's ``layout_num``.
        layout_step_size: Value forwarded as the request's ``layout_step_size``.

    Returns:
        ``response.body.data`` when it is truthy, otherwise ``None``.
    """
    result_request = docmind_models.GetDocParserResultRequest(
        id=task_id,
        layout_step_size=layout_step_size,
        layout_num=layout_num,
    )
    data = client.get_doc_parser_result(result_request).body.data
    if data:
        return data
    return None
def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: int = 50) -> List[Dict]:
    """Page through a job's results and gather every layout into one list.

    Paging stops when a request returns no data, returns no layouts, or
    returns fewer layouts than ``layout_step_size``.

    Args:
        client: Document Mind client used for the requests.
        task_id: Id of the finished job.
        layout_step_size: Number of layouts requested per call.

    Returns:
        All layouts returned by the service, in request order.
    """
    collected = []
    offset = 0
    while True:
        page_data = get_result(client, task_id, offset, layout_step_size)
        batch = page_data.get("layouts", []) if page_data else []
        if not batch:
            break
        collected.extend(batch)
        offset += len(batch)
        # A short page means there is nothing left to fetch.
        if len(batch) < layout_step_size:
            break
    return collected
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def normalize_text(text: str) -> str:
    """Normalize whitespace in a piece of extracted text.

    Carriage returns become newlines, non-breaking and ideographic spaces
    become plain spaces, runs of newlines collapse to one newline, runs of
    spaces/tabs collapse to one space, and the result is stripped.

    Args:
        text: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The normalized text.
    """
    if not text:
        return ""
    text = text.replace("\r", "\n")
    # Spell the special spaces as escapes: written as literal characters they
    # are indistinguishable from an ordinary space and the call becomes a no-op.
    text = text.replace("\u00a0", " ").replace("\u3000", " ")
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()
def get_page(layout: Dict) -> int:
    """Return the page number of a layout.

    Reads ``pageNum`` first, then ``pageNumber``, and falls back to 0 when
    neither key is present.
    """
    if "pageNum" in layout:
        return layout["pageNum"]
    return layout.get("pageNumber", 0)
def get_text(layout: Dict) -> str:
    """Return the normalized text of a layout.

    Uses the ``text`` field when it normalizes to a non-empty string and
    otherwise falls back to ``markdownContent``; returns ``""`` when both
    are empty.
    """
    for field in ("text", "markdownContent"):
        candidate = normalize_text(layout.get(field, ""))
        if candidate:
            return candidate
    return ""
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def is_title(layout: Dict) -> bool:
    """Tell whether a layout is a heading.

    A layout counts as a heading when its ``type`` is ``"title"`` or its
    ``subType`` is listed in ``TITLE_SUBTYPES``.
    """
    if layout.get("type") == "title":
        return True
    return layout.get("subType") in TITLE_SUBTYPES
def is_text(layout: Dict) -> bool:
    """Tell whether a layout is body text.

    Requires ``type == "text"`` and a ``subType`` in ``TEXT_SUBTYPES``; a
    missing ``subType`` is treated as ``"none"``.
    """
    if layout.get("type") != "text":
        return False
    return layout.get("subType", "none") in TEXT_SUBTYPES
def is_figure(layout: Dict) -> bool:
    """Tell whether a layout belongs to a figure.

    True when ``type`` is in ``FIGURE_TYPES`` or ``subType`` is in
    ``FIGURE_SUBTYPES``.
    """
    if layout.get("type") in FIGURE_TYPES:
        return True
    return layout.get("subType") in FIGURE_SUBTYPES
def is_table(layout: Dict) -> bool:
    """Tell whether a layout is a table, i.e. its ``type`` equals ``"table"``."""
    layout_type = layout.get("type")
    return layout_type == "table"
def is_toc_layout(layout: Dict) -> bool:
    """Tell whether a layout is part of a table of contents.

    True when the layout text is one of ``TOC_TITLES``, or when the layout
    is on page 1 and its text looks like a numbered entry followed by dot
    leaders and a trailing page number.
    """
    content = get_text(layout)
    if content in TOC_TITLES:
        return True
    # Only page 1 is inspected for dot-leader style entries.
    on_first_page = get_page(layout) == 1
    return bool(on_first_page and re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", content))
def extract_table_text(layout: Dict) -> str:
    """Flatten a table layout into plain text.

    Each entry of ``layout["cells"]`` contributes one output line made of
    the non-empty normalized texts of its nested ``layouts``, joined by a
    single space. Cells without any text are skipped.

    Returns:
        The lines joined by newlines and stripped; ``""`` for an empty table.
    """
    lines = []
    for cell in layout.get("cells", []):
        fragments = [
            normalize_text(inner.get("text", ""))
            for inner in cell.get("layouts", [])
        ]
        fragments = [fragment for fragment in fragments if fragment]
        if fragments:
            lines.append(" ".join(fragments))
    return "\n".join(lines).strip()
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
    """Extract the heading outline from a list of layouts.

    Every heading layout with non-empty text that is not a table-of-contents
    title becomes one node.

    Returns:
        One dict per heading with the keys ``unique_id``, ``page``, ``index``,
        ``level``, ``title``, ``type`` and ``sub_type``, in input order.
    """
    nodes = []
    for layout in layouts:
        if is_title(layout):
            heading = get_text(layout)
            if heading and heading not in TOC_TITLES:
                node = {
                    "unique_id": layout.get("uniqueId"),
                    "page": get_page(layout),
                    "index": layout.get("index", 0),
                    "level": layout.get("level", 0),
                    "title": heading,
                    "type": layout.get("type"),
                    "sub_type": layout.get("subType"),
                }
                nodes.append(node)
    return nodes
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
    """Push a heading onto the section stack, dropping deeper or equal levels.

    Entries whose ``level`` is greater than or equal to the new heading's
    level are popped from the end first, then the heading is appended. The
    stack is modified in place and also returned.

    Args:
        section_stack: Current chain of open sections, outermost first.
        layout: Heading layout to push; a missing ``level`` counts as 0.

    Returns:
        The same ``section_stack`` list after the update.
    """
    depth = layout.get("level", 0)
    entry = {
        "level": depth,
        "title": get_text(layout),
        "page": get_page(layout),
        "unique_id": layout.get("uniqueId"),
    }
    while section_stack:
        if section_stack[-1]["level"] < depth:
            break
        section_stack.pop()
    section_stack.append(entry)
    return section_stack
def section_path_titles(section_stack: List[Dict]) -> List[str]:
    """Return the ``title`` of every entry in the section stack, in order."""
    titles = []
    for entry in section_stack:
        titles.append(entry["title"])
    return titles
def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id: int) -> int:
    """Merge pending text fragments into one ``section_text`` semantic block.

    The non-empty fragment texts are joined with newlines. Section metadata
    is taken from the first fragment and the page range spans all fragments.
    Nothing is appended when there are no fragments or the merged text is
    empty.

    Args:
        blocks: Pending text fragments (not modified).
        semantic_blocks: Output list that receives the merged block.
        block_id: Number used to build the ``semantic-<n>`` id.

    Returns:
        ``block_id + 1`` when a block was appended, otherwise ``block_id``.
    """
    if not blocks:
        return block_id

    body = "\n".join(entry["text"] for entry in blocks if entry["text"]).strip()
    if not body:
        return block_id

    pages = [entry["page"] for entry in blocks]
    first = blocks[0]
    record = {
        "semantic_id": f"semantic-{block_id}",
        "block_type": "section_text",
        "page_start": min(pages),
        "page_end": max(pages),
        "section_path": first["section_path"],
        "section_level": first["section_level"],
        "section_title": first["section_title"],
        "source_ids": [entry["unique_id"] for entry in blocks if entry.get("unique_id")],
        "text": body,
    }
    semantic_blocks.append(record)
    return block_id + 1
def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
    """Group layouts into semantic blocks (section text, tables, figures).

    Layouts are walked in order. Headings update the current section path,
    consecutive body-text layouts are merged into one ``section_text`` block,
    and each table or figure with text becomes a block of its own. Content
    outside any heading is filed under the section title ``"未分类"``.

    Once a table-of-contents layout is seen, all following layouts on page 1
    are skipped.

    Returns:
        The semantic blocks in document order, with ids ``semantic-1``,
        ``semantic-2``, ...
    """
    semantic_blocks = []
    section_stack = []
    pending = []
    next_id = 1
    in_toc = False

    for layout in layouts:
        text = get_text(layout)
        page = get_page(layout)

        if is_toc_layout(layout):
            in_toc = True
            continue
        if in_toc:
            # Still on the page holding the table of contents: skip everything.
            if page == 1:
                continue
            in_toc = False

        if is_title(layout):
            # A new heading closes the text run of the previous section.
            next_id = flush_text_block(pending, semantic_blocks, next_id)
            pending = []
            section_stack = update_section_path(section_stack, layout)
            continue

        section_path = section_path_titles(section_stack)
        section_title = section_path[-1] if section_path else "未分类"
        section_level = len(section_path)

        table = is_table(layout)
        if table or is_figure(layout):
            # Tables and figures interrupt the running text block.
            next_id = flush_text_block(pending, semantic_blocks, next_id)
            pending = []
            if table:
                block_type, body = "table", extract_table_text(layout)
            else:
                block_type, body = "figure", text
            if body:
                semantic_blocks.append(
                    {
                        "semantic_id": f"semantic-{next_id}",
                        "block_type": block_type,
                        "page_start": page,
                        "page_end": page,
                        "section_path": section_path,
                        "section_level": section_level,
                        "section_title": section_title,
                        "source_ids": [layout.get("uniqueId")],
                        "text": body,
                    }
                )
                next_id += 1
            continue

        if is_text(layout) and text:
            pending.append(
                {
                    "page": page,
                    "text": text,
                    "unique_id": layout.get("uniqueId"),
                    "section_path": section_path,
                    "section_level": section_level,
                    "section_title": section_title,
                }
            )

    # Emit whatever text is still pending after the last layout.
    flush_text_block(pending, semantic_blocks, next_id)
    return semantic_blocks
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> List[str]:
    """Split text into windows of at most ``max_chars`` with overlap.

    Consecutive windows share ``overlap_chars`` characters. The overlap is
    clamped to the range ``[0, max_chars - 1]`` so the window always moves
    forward; without the clamp an overlap equal to or larger than the window
    size would never terminate.

    Args:
        text: Text to split; surrounding whitespace is stripped first.
        max_chars: Maximum length of one piece. Must be positive when the
            text is not empty.
        overlap_chars: Number of characters repeated between neighbours.

    Returns:
        The stripped, non-empty pieces in order; ``[]`` for empty text.

    Raises:
        ValueError: If ``max_chars`` is not positive and the text is not empty.
    """
    text = text.strip()
    if not text:
        return []
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if len(text) <= max_chars:
        return [text]

    overlap = min(max(overlap_chars, 0), max_chars - 1)
    step = max_chars - overlap  # always >= 1, guarantees forward progress

    parts = []
    for start in range(0, len(text), step):
        end = start + max_chars
        piece = text[start:end].strip()
        if piece:
            parts.append(piece)
        # The window reached the end of the text: no further pieces needed.
        if end >= len(text):
            break
    return parts
def build_vector_chunks(
    semantic_blocks: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> List[Dict]:
    """Cut semantic blocks into retrieval chunks ready for embedding.

    Each block's text is split with ``split_text_with_overlap``. Every piece
    becomes one chunk that copies the block's metadata and carries an
    ``embedding_text`` made of a header (document title, plus the section
    path when there is one) followed by the piece.

    Args:
        semantic_blocks: Blocks as produced by ``build_semantic_blocks``.
        doc_id: Document id stored on every chunk.
        doc_title: Document title stored on every chunk and used in the header.
        max_chars: Maximum characters per piece.
        overlap_chars: Characters shared between neighbouring pieces.

    Returns:
        The chunks, numbered ``chunk-1``, ``chunk-2``, ... across the document.
    """
    vector_chunks = []
    for block in semantic_blocks:
        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
        for piece_index, piece in enumerate(pieces, start=1):
            if block["section_path"]:
                header = f"标准:{doc_title}\n章节:{' > '.join(block['section_path'])}\n\n"
            else:
                header = f"标准:{doc_title}\n\n"
            # Chunk numbers are global across the document, not per block.
            number = len(vector_chunks) + 1
            chunk = {
                "doc_id": doc_id,
                "doc_title": doc_title,
                "chunk_id": f"chunk-{number}",
                "chunk_index": number,
                "semantic_id": block["semantic_id"],
                "chunk_type": block["block_type"],
                "piece_index": piece_index,
                "page_start": block["page_start"],
                "page_end": block["page_end"],
                "section_path": block["section_path"],
                "section_level": block["section_level"],
                "section_title": block["section_title"],
                "source_ids": block["source_ids"],
                "text": piece,
                "embedding_text": header + piece,
            }
            vector_chunks.append(chunk)
    return vector_chunks
def parse_pdf_to_structured_chunks(
    pdf_path: str,
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int = MAX_CHARS,
    overlap_chars: int = OVERLAP_CHARS,
    poll_interval: int = 5,
) -> Dict:
    """Parse a PDF with Document Mind and convert it to the three-layer result.

    Submits the file, waits for the job to finish, downloads all layouts and
    passes them to ``convert_layouts``.

    Args:
        pdf_path: Path of the PDF to parse.
        doc_id: Document id stored in the result.
        doc_title: Document title stored in the result.
        max_chars: Maximum characters per retrieval chunk.
        overlap_chars: Characters shared between neighbouring chunks.
        poll_interval: Seconds between two status queries.

    Returns:
        The dict produced by ``convert_layouts``.

    Raises:
        RuntimeError: If waiting for the job does not end in success.
    """
    docmind = init_client()
    job_id = submit_job(docmind, pdf_path)
    finished = wait_for_completion(docmind, job_id, poll_interval)
    if not finished:
        raise RuntimeError("阿里云文档解析任务失败")
    return convert_layouts(
        collect_all_results(docmind, job_id),
        doc_id=doc_id,
        doc_title=doc_title,
        max_chars=max_chars,
        overlap_chars=overlap_chars,
    )
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def convert_layouts(
    layouts: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> Dict:
    """Turn raw layouts into structure nodes, semantic blocks and chunks.

    Args:
        layouts: Raw layouts returned by the parser.
        doc_id: Document id stored in the result and on every chunk.
        doc_title: Document title stored in the result and on every chunk.
        max_chars: Maximum characters per retrieval chunk.
        overlap_chars: Characters shared between neighbouring chunks.

    Returns:
        A dict with the keys ``doc_id``, ``doc_title``, ``structure_nodes``,
        ``semantic_blocks`` and ``vector_chunks``.
    """
    outline = build_structure_nodes(layouts)
    blocks = build_semantic_blocks(layouts)
    chunks = build_vector_chunks(
        blocks,
        doc_id=doc_id,
        doc_title=doc_title,
        max_chars=max_chars,
        overlap_chars=overlap_chars,
    )
    result = {"doc_id": doc_id, "doc_title": doc_title}
    result["structure_nodes"] = outline
    result["semantic_blocks"] = blocks
    result["vector_chunks"] = chunks
    return result
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def main() -> None:
    """Run the command-line workflow: parse a PDF and write the chunk JSON.

    Steps: submit the PDF, wait for the job, download the layouts, optionally
    dump the raw layouts (``--layouts-out``), convert them to the three-layer
    structure and write it to ``--out``.

    Raises:
        FileNotFoundError: If the given PDF path does not exist.
        SystemExit: With status 1 when the parsing job does not succeed, so
            calling scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(description="阿里云文档智能解析 PDF输出三层结构 chunks")
    parser.add_argument("pdf_path", help="PDF 文件路径")
    parser.add_argument("--out", default="vector_chunks.json", help="输出 JSON 文件路径")
    parser.add_argument("--layouts-out", dest="layouts_output", help="输出原始 layouts JSON")
    parser.add_argument("--doc-id", default="GB14747-2006", help="文档 ID")
    parser.add_argument("--doc-title", default="GB 14747—2006 儿童三轮车安全要求", help="文档标题")
    parser.add_argument("--max-chars", type=int, default=MAX_CHARS, help="单个检索 chunk 最大字符数")
    parser.add_argument("--overlap-chars", type=int, default=OVERLAP_CHARS, help="相邻检索 chunk 重叠字符数")
    parser.add_argument("--poll-interval", type=int, default=5, help="轮询间隔(秒)")
    args = parser.parse_args()

    pdf_path = Path(args.pdf_path).expanduser().resolve()
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF 文件不存在: {pdf_path}")

    # Submit the document to the parsing service.
    client = init_client()
    print(f"提交任务: {pdf_path}")
    task_id = submit_job(client, str(pdf_path))
    print(f"任务 ID: {task_id}")

    # Block until the job finishes; exit non-zero on failure instead of
    # returning normally, which would report success to the shell.
    print("等待任务完成...")
    if not wait_for_completion(client, task_id, args.poll_interval):
        print("任务失败,退出")
        raise SystemExit(1)

    # Download every layout produced by the job.
    print("获取解析结果...")
    layouts = collect_all_results(client, task_id)
    print(f"获取到 {len(layouts)} 个布局块")

    # Optionally keep the raw layouts for inspection.
    if args.layouts_output:
        layouts_path = Path(args.layouts_output).expanduser().resolve()
        layouts_path.write_text(json.dumps(layouts, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"原始 layouts 已写入: {layouts_path}")

    # Build structure nodes, semantic blocks and retrieval chunks.
    print("转换为三层结构...")
    data = convert_layouts(
        layouts,
        doc_id=args.doc_id,
        doc_title=args.doc_title,
        max_chars=args.max_chars,
        overlap_chars=args.overlap_chars,
    )

    # Write the result and print a short summary.
    output_path = Path(args.out).expanduser().resolve()
    output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"结构层节点数: {len(data['structure_nodes'])}")
    print(f"语义层块数: {len(data['semantic_blocks'])}")
    print(f"检索层块数: {len(data['vector_chunks'])}")
    print(f"输出文件: {output_path}")


if __name__ == "__main__":
    main()

View File

@@ -1,122 +0,0 @@
-- Database schema for the regulation document vector retrieval system
-- PostgreSQL
-- ==================== Documents table ====================
-- One row per document; doc_id is the key the other tables reference.
CREATE TABLE documents (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) UNIQUE NOT NULL, -- Unique document identifier, e.g. "GB14747-2006"
title VARCHAR(512) NOT NULL, -- Document title
doc_type VARCHAR(32), -- Document type: standard / regulation / specification
standard_number VARCHAR(64), -- Standard number, e.g. "GB 14747-2006"
publish_date DATE, -- Publication date
implement_date DATE, -- Implementation (effective) date
status VARCHAR(32), -- Status: in force / repealed / revised
source_url VARCHAR(512), -- Source URL
file_path VARCHAR(512), -- Local PDF file path
file_size INT, -- File size in bytes
upload_time TIMESTAMP DEFAULT NOW(), -- Upload time
created_at TIMESTAMP DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW()
);
COMMENT ON TABLE documents IS '文档元数据表';
COMMENT ON COLUMN documents.doc_id IS '文档唯一标识,用于关联 Milvus 和其他表';
COMMENT ON COLUMN documents.standard_number IS '标准编号,如 GB 14747-2006';
-- ==================== Section structure table ====================
-- Heading outline of a document, stored as a tree through parent_id.
CREATE TABLE sections (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
unique_id VARCHAR(64) NOT NULL, -- Unique identifier returned by Aliyun
level INT NOT NULL, -- Depth level: 1, 2, 3...
title VARCHAR(512) NOT NULL, -- Section title
page INT, -- Page the section starts on
index INT, -- Order within the page
parent_id INT, -- Parent section id (tree structure)
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_sections_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_sections_parent_id FOREIGN KEY (parent_id) REFERENCES sections(id),
CONSTRAINT uq_sections_doc_unique UNIQUE (doc_id, unique_id)
);
COMMENT ON TABLE sections IS '章节结构表,用于目录导航';
COMMENT ON COLUMN sections.parent_id IS '父章节 ID构建树形结构';
COMMENT ON COLUMN sections.level IS '层级深度1 为最顶层';
-- ==================== Semantic blocks table ====================
-- Full, unsplit content units; (doc_id, semantic_id) is unique and is the
-- target of the foreign key declared on vector_chunks.
CREATE TABLE semantic_blocks (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
semantic_id VARCHAR(64) NOT NULL, -- Unique identifier of the semantic block
block_type VARCHAR(32) NOT NULL, -- Type: section_text / table / figure
page_start INT NOT NULL, -- First page
page_end INT NOT NULL, -- Last page
section_id INT, -- Owning section
section_title VARCHAR(512), -- Section title (denormalized for easier querying)
section_level INT, -- Section depth level
source_ids JSONB, -- Original layout ids (JSON array)
text TEXT NOT NULL, -- Full content (not split)
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_semantic_blocks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_semantic_blocks_section_id FOREIGN KEY (section_id) REFERENCES sections(id),
CONSTRAINT uq_semantic_blocks_doc_semantic UNIQUE (doc_id, semantic_id)
);
COMMENT ON TABLE semantic_blocks IS '语义块表,用于邻域扩展,恢复完整内容';
COMMENT ON COLUMN semantic_blocks.block_type IS '类型section_text正文、table表格、figure图示';
COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组';
COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容,未被切分';
-- ==================== Vector chunk metadata table ====================
-- Metadata of each retrieval chunk; every chunk points back to the semantic
-- block it was cut from through the composite key (doc_id, semantic_id).
CREATE TABLE vector_chunks (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
chunk_id VARCHAR(64) NOT NULL, -- Milvus primary key
semantic_id VARCHAR(64) NOT NULL, -- Related semantic block
chunk_index INT NOT NULL, -- Chunk sequence number (document-wide)
piece_index INT, -- Sequence number within the same semantic block
page_start INT,
page_end INT,
section_title VARCHAR(512),
text VARCHAR(2048), -- Chunk text (optional, shortened version for display)
source_ids JSONB, -- Original layout ids (JSON array)
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_vector_chunks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_vector_chunks_semantic_id FOREIGN KEY (doc_id, semantic_id)
REFERENCES semantic_blocks(doc_id, semantic_id),
CONSTRAINT uq_vector_chunks_doc_chunk UNIQUE (doc_id, chunk_id)
);
COMMENT ON TABLE vector_chunks IS '向量块元数据表,用于快速关联查询';
COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键';
COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号,用于按序拼接';
-- ==================== Indexes ====================
-- Secondary indexes on the foreign-key and filter columns of each table.
CREATE INDEX idx_sections_doc_id ON sections(doc_id);
CREATE INDEX idx_sections_parent_id ON sections(parent_id);
CREATE INDEX idx_sections_level ON sections(level);
CREATE INDEX idx_semantic_blocks_doc_id ON semantic_blocks(doc_id);
CREATE INDEX idx_semantic_blocks_section_id ON semantic_blocks(section_id);
CREATE INDEX idx_semantic_blocks_block_type ON semantic_blocks(block_type);
CREATE INDEX idx_semantic_blocks_semantic_id ON semantic_blocks(semantic_id);
CREATE INDEX idx_vector_chunks_doc_id ON vector_chunks(doc_id);
CREATE INDEX idx_vector_chunks_semantic_id ON vector_chunks(semantic_id);
CREATE INDEX idx_vector_chunks_chunk_id ON vector_chunks(chunk_id);
-- ==================== Trigger: automatically update updated_at ====================
-- Trigger function that stamps the row being written with the current time.
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = NOW();
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Run the function before every UPDATE on documents so updated_at reflects
-- the time of the last modification.
CREATE TRIGGER tr_documents_updated_at
BEFORE UPDATE ON documents
FOR EACH ROW EXECUTE FUNCTION update_updated_at();

View File

@@ -1,327 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Handle Aliyun parsing support for upload to milvus."""
import argparse
import json
import os
import time
from pathlib import Path
from typing import List, Dict

import psycopg2
from psycopg2.extras import execute_values
from pymilvus import (
    connections,
    Collection,
    FieldSchema,
    CollectionSchema,
    DataType,
    utility,
)
from openai import OpenAI
# Embedding relay (OpenAI-compatible endpoint).
# The API key is a secret and must not live in source control: it is read
# from the environment and defaults to "" when unset.
RELAY_BASE_URL = "http://6.86.80.4:30080/v1"
RELAY_API_KEY = os.getenv("EMBEDDING_API_KEY", "")
EMBEDDING_MODEL = "text-embedding-v3"  # Model name passed to the embeddings endpoint.
# Milvus connection defaults and target collection name.
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"
COLLECTION_NAME = "regulation_chunks"
# PostgreSQL connection defaults. The password is read from the environment
# for the same reason as the API key above.
PG_HOST = "6.86.80.10"
PG_PORT = 5432
PG_USER = "postgresql"
PG_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
PG_DATABASE = "postgres"
# ===================== Embedding =====================
def get_openai_client(api_key: str, base_url: str) -> OpenAI:
    """Create an ``OpenAI`` client bound to the given key and base URL."""
    client = OpenAI(base_url=base_url, api_key=api_key)
    return client
def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10) -> List[List[float]]:
    """Embed texts in batches and return one vector per text.

    Args:
        client: Client whose ``embeddings.create`` endpoint is called.
        texts: Texts to embed.
        batch_size: Number of texts sent per request.

    Returns:
        The embedding vectors in the same order as ``texts``.
    """
    vectors = []
    for batch_number, offset in enumerate(range(0, len(texts), batch_size), start=1):
        total_batches = (len(texts) - 1) // batch_size + 1
        print(f"Embedding batch {batch_number}/{total_batches}...")
        response = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=texts[offset:offset + batch_size],
        )
        for item in response.data:
            vectors.append(item.embedding)
    return vectors
# ===================== Milvus =====================
def init_milvus(host: str, port: str):
    """Open the ``"default"`` Milvus connection to ``host:port`` and report it."""
    endpoint = {"host": host, "port": port}
    connections.connect("default", **endpoint)
    print(f"已连接 Milvus: {host}:{port}")
def create_collection(name: str, dim: int) -> Collection:
    """Create the chunk collection from scratch and index its vector field.

    An existing collection with the same name is dropped first, so all data
    previously stored in it is lost. The ``embedding`` field gets an
    ``IVF_FLAT`` index with the ``COSINE`` metric and ``nlist=128``.

    Args:
        name: Collection name.
        dim: Dimension of the ``embedding`` vector field.

    Returns:
        The newly created collection.
    """
    if utility.has_collection(name):
        print(f"Collection '{name}' 已存在,删除重建")
        utility.drop_collection(name)

    def varchar(field_name, max_length, **extra):
        # Shorthand for a VARCHAR field of the given maximum length.
        return FieldSchema(name=field_name, dtype=DataType.VARCHAR, max_length=max_length, **extra)

    def int64(field_name):
        # Shorthand for an INT64 field.
        return FieldSchema(name=field_name, dtype=DataType.INT64)

    fields = [
        varchar("chunk_id", 64, is_primary=True),
        varchar("doc_id", 128),
        varchar("doc_title", 512),
        int64("chunk_index"),
        varchar("semantic_id", 64),
        varchar("chunk_type", 32),
        int64("page_start"),
        int64("page_end"),
        varchar("section_title", 512),
        varchar("text", 2048),
        varchar("source_ids", 4096),  # Holds the JSON-encoded list written by insert_chunks().
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
    ]
    collection = Collection(name, CollectionSchema(fields, description="法规文档检索 chunks"))

    collection.create_index(
        "embedding",
        {
            "metric_type": "COSINE",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 128},
        },
    )
    print(f"Collection '{name}' 创建完成,索引已建立")
    return collection
def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[List[float]]):
    """Insert chunks and their vectors into the collection and flush.

    The data is passed column-wise in the field order of the schema built by
    ``create_collection``; ``source_ids`` is stored as a JSON string.

    Args:
        collection: Target collection.
        chunks: Chunk dicts; ``source_ids`` is optional, the other keys are required.
        embeddings: One vector per chunk, in the same order.
    """
    scalar_fields = (
        "chunk_id",
        "doc_id",
        "doc_title",
        "chunk_index",
        "semantic_id",
        "chunk_type",
        "page_start",
        "page_end",
        "section_title",
        "text",
    )
    columns = [[chunk[field] for chunk in chunks] for field in scalar_fields]
    columns.append([json.dumps(chunk.get("source_ids", [])) for chunk in chunks])
    columns.append(embeddings)

    collection.insert(columns)
    collection.flush()
    print(f"已插入 {len(chunks)} 个 chunks")
def load_collection(collection: Collection):
    """Call ``collection.load()`` and print a confirmation message."""
    collection.load()
    print("Collection 已加载到内存")
# ===================== PostgreSQL =====================
def get_pg_connection(host: str, port: int, user: str, password: str, database: str):
    """Open a PostgreSQL connection with ``psycopg2`` and report the target.

    Returns:
        The open connection; the caller is responsible for closing it.
    """
    settings = {
        "host": host,
        "port": port,
        "user": user,
        "password": password,
        "database": database,
    }
    connection = psycopg2.connect(**settings)
    print(f"已连接 PostgreSQL: {host}:{port}/{database}")
    return connection
def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
    """Upsert a document, its semantic blocks and its chunk metadata.

    All statements run in one transaction: it is committed when every
    statement succeeds and rolled back (with the exception re-raised)
    otherwise. The cursor is always closed.

    Args:
        conn: Open database connection.
        chunks: Retrieval chunks to store in ``vector_chunks``.
        doc_data: Parsed document; must contain ``doc_id`` and ``doc_title``,
            may contain ``standard_number`` and ``semantic_blocks``.
    """
    document_sql = (
        "\n"
        "INSERT INTO documents (doc_id, title, standard_number, upload_time)\n"
        "VALUES (%s, %s, %s, NOW())\n"
        "ON CONFLICT (doc_id) DO UPDATE SET title = EXCLUDED.title, updated_at = NOW()\n"
    )
    block_sql = (
        "\n"
        "INSERT INTO semantic_blocks\n"
        "(doc_id, semantic_id, block_type, page_start, page_end, section_title, section_level, source_ids, text)\n"
        "VALUES %s\n"
        "ON CONFLICT (doc_id, semantic_id) DO UPDATE SET text = EXCLUDED.text\n"
    )
    chunk_sql = (
        "\n"
        "INSERT INTO vector_chunks\n"
        "(doc_id, chunk_id, semantic_id, chunk_index, piece_index, page_start, page_end, section_title, text, source_ids)\n"
        "VALUES %s\n"
        "ON CONFLICT (doc_id, chunk_id) DO UPDATE SET text = EXCLUDED.text\n"
    )

    cursor = conn.cursor()
    try:
        # Document row first: the other tables reference documents(doc_id).
        doc_id = doc_data["doc_id"]
        cursor.execute(
            document_sql,
            (doc_id, doc_data["doc_title"], doc_data.get("standard_number")),
        )

        # Semantic blocks come before chunks because chunks reference them.
        blocks = doc_data.get("semantic_blocks", [])
        if blocks:
            block_rows = []
            for block in blocks:
                block_rows.append(
                    (
                        doc_id,
                        block["semantic_id"],
                        block["block_type"],
                        block["page_start"],
                        block["page_end"],
                        block.get("section_title"),
                        block.get("section_level"),
                        json.dumps(block.get("source_ids", [])),
                        block["text"],
                    )
                )
            execute_values(cursor, block_sql, block_rows)
            print(f"已插入 {len(blocks)} 个语义块")

        chunk_rows = []
        for chunk in chunks:
            chunk_rows.append(
                (
                    doc_id,
                    chunk["chunk_id"],
                    chunk["semantic_id"],
                    chunk["chunk_index"],
                    chunk.get("piece_index"),
                    chunk["page_start"],
                    chunk["page_end"],
                    chunk.get("section_title"),
                    chunk["text"],
                    json.dumps(chunk.get("source_ids", [])),
                )
            )
        execute_values(cursor, chunk_sql, chunk_rows)
        print(f"已插入 {len(chunks)} 个向量块元数据")

        conn.commit()
        print("PostgreSQL 数据插入完成")
    except Exception:
        conn.rollback()
        raise
    finally:
        cursor.close()
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def load_data(file_path: Path) -> Dict:
    """Read a UTF-8 JSON file and return the decoded content."""
    raw = file_path.read_text(encoding="utf-8")
    return json.loads(raw)
def upload_to_milvus_and_pg(
    chunks_file: str,
    api_key: str,
    base_url: str,
    milvus_host: str,
    milvus_port: str,
    collection_name: str,
    batch_size: int,
    pg_host: str,
    pg_port: int,
    pg_user: str,
    pg_password: str,
    pg_database: str,
):
    """Embed the chunks in a vector_chunks JSON file and store them in Milvus and PostgreSQL.

    Args:
        chunks_file: Path to the JSON file; it must contain a non-empty
            ``vector_chunks`` list whose items carry ``embedding_text``.
        api_key: API key handed to the embedding client factory.
        base_url: Base URL handed to the embedding client factory.
        milvus_host: Milvus host.
        milvus_port: Milvus port.
        collection_name: Name of the Milvus collection to create and fill.
        batch_size: Number of texts sent per embedding request.
        pg_host: PostgreSQL host.
        pg_port: PostgreSQL port.
        pg_user: PostgreSQL user.
        pg_password: PostgreSQL password.
        pg_database: PostgreSQL database name.

    Raises:
        FileNotFoundError: If ``chunks_file`` does not exist.
        ValueError: If the file has no vector chunks, or no embeddings were produced.
    """
    chunks_path = Path(chunks_file).expanduser().resolve()
    if not chunks_path.exists():
        raise FileNotFoundError(f"文件不存在: {chunks_path}")
    data = load_data(chunks_path)
    chunks = data.get("vector_chunks", [])
    if not chunks:
        raise ValueError("vector_chunks 为空")
    print(f"加载 {len(chunks)} 个 chunks")

    client = get_openai_client(api_key, base_url)
    init_milvus(milvus_host, milvus_port)
    pg_conn = get_pg_connection(pg_host, pg_port, pg_user, pg_password, pg_database)
    # Everything after the connection is opened runs under try/finally so a
    # failure while embedding or writing never leaks the PostgreSQL connection.
    try:
        texts = [c["embedding_text"] for c in chunks]
        embeddings = get_embeddings_batch(client, texts, batch_size)
        print(f"生成 {len(embeddings)} 个向量")
        if not embeddings:
            # Fail with a clear message instead of an IndexError on embeddings[0].
            raise ValueError("未生成任何向量")

        # The collection schema is sized from the vectors actually returned.
        embedding_dim = len(embeddings[0])
        print(f"Embedding 维度: {embedding_dim}")

        collection = create_collection(collection_name, embedding_dim)
        insert_chunks(collection, chunks, embeddings)
        load_collection(collection)

        insert_chunks_to_pg(pg_conn, chunks, data)
    finally:
        pg_conn.close()
    print("上传完成!")
# ===================== CLI =====================
def main():
    """Parse the command-line options and run the Milvus + PostgreSQL upload."""
    cli = argparse.ArgumentParser(description="将 vector_chunks 向量化并上传到 Milvus 和 PostgreSQL")
    cli.add_argument("chunks_file", help="vector_chunks.json 文件路径")

    # Optional flags, registered in this order so the generated help is unchanged.
    option_specs = [
        ("--api-key", {"default": RELAY_API_KEY, "help": "中转站 API Key"}),
        ("--base-url", {"default": RELAY_BASE_URL, "help": "中转站 Base URL"}),
        ("--milvus-host", {"default": MILVUS_HOST, "help": "Milvus host"}),
        ("--milvus-port", {"default": MILVUS_PORT, "help": "Milvus port"}),
        ("--collection", {"default": COLLECTION_NAME, "help": "Milvus collection 名称"}),
        ("--batch-size", {"type": int, "default": 10, "help": "Embedding 批量大小中转站限制最大10"}),
        ("--pg-host", {"default": PG_HOST, "help": "PostgreSQL host"}),
        ("--pg-port", {"type": int, "default": PG_PORT, "help": "PostgreSQL port"}),
        ("--pg-user", {"default": PG_USER, "help": "PostgreSQL user"}),
        ("--pg-password", {"default": PG_PASSWORD, "help": "PostgreSQL password"}),
        ("--pg-database", {"default": PG_DATABASE, "help": "PostgreSQL database"}),
    ]
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()
    upload_to_milvus_and_pg(
        chunks_file=opts.chunks_file,
        api_key=opts.api_key,
        base_url=opts.base_url,
        milvus_host=opts.milvus_host,
        milvus_port=opts.milvus_port,
        collection_name=opts.collection,
        batch_size=opts.batch_size,
        pg_host=opts.pg_host,
        pg_port=opts.pg_port,
        pg_user=opts.pg_user,
        pg_password=opts.pg_password,
        pg_database=opts.pg_database,
    )


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -1,263 +0,0 @@
# 文档解析与向量检索说明
## 相关文件
- `aliyun_doc_parser.py`:调用阿里云文档智能解析 PDF,生成原始 `layouts.json`
- `layouts_to_vector_chunks.py`:把 `layouts.json` 转成适合向量数据库入库的三层结构
- `layouts.json`:阿里云返回的原始布局结果
- `vector_chunks.json`:转换后的结构化输出
## 一、`layouts.json` 的结构
`layouts.json` 顶层是一个数组,每个元素代表一个布局块(layout)。常见字段如下:
- `type`:主类型,例如 `title`、`text`、`table`、`figure`
- `subType`:更细的语义类型,例如 `doc_title`、`para_title`、`para`、`picture`、`pic_title`、`pic_caption`
- `text`:当前布局块的纯文本
- `markdownContent`:带 markdown 标记的文本
- `pageNum`:页码
- `index`:页内顺序
- `level`:标题层级
- `uniqueId`:布局块唯一标识
- `blocks`:更细粒度的文本与样式信息
- `cells`:表格单元格,仅 `table` 类型存在
这个结构不是简单 OCR 文本流,而是已经带有版面理解和语义分类的结构化数据。
## 二、推荐的三层转换结构
### 1. 结构层 `structure_nodes`
结构层用于恢复文档标题树,不直接作为最终向量检索单元。
示例:
- `1 范围`
- `2 规范性引用文件`
- `3 术语和定义`
- `3.1 儿童三轮车`
- `3.2 轮距`
结构层主要用于给下游 chunk 绑定 `section_path`
### 2. 语义层 `semantic_blocks`
语义层是按文档意义聚合后的内容块,主要分为三类:
- `section_text`:同一章节下连续正文聚合而成
- `table`:表格内容单独成块
- `figure`:图、图名、图注等单独成块
这一层比单 layout 更适合做语义理解,也适合后续做上下文扩展。
### 3. 检索层 `vector_chunks`
检索层是最终写进向量数据库的 chunk。
处理方式:
-`semantic_blocks` 中较短的块直接入库
- 对较长的块按 `max_chars` 再切分
- 相邻切片保留 `overlap_chars` 重叠
- 每个 chunk 都带完整 metadata,便于后续过滤、重排和邻域扩展
## 三、当前转换脚本做了什么
`layouts_to_vector_chunks.py` 当前已经实现:
1. 过滤目录页噪声(如 `目次`)
2. 根据标题层级维护章节路径
3. 将正文聚合成 `section_text`
4. 将表格单独转成 `table`
5. 将图相关内容单独转成 `figure`
6. 对长文本继续切分为最终 `vector_chunks`
7. 为每个检索 chunk 生成 `embedding_text`
## 四、为什么不要直接按 layout 入库
如果把 `layouts.json` 的每条 layout 直接做向量:
- 颗粒度太碎
- 标题和正文容易分离
- 表格会丢失结构上下文
- 图示信息无法完整表达
- 检索命中结果噪声较大
对于标准文档,最合适的单位通常不是“句子”,而是“条款语义块”。
## 五、建议的入库字段
建议向量数据库每条记录至少保存:
- `embedding_text`:用于生成向量
- `text`:原始 chunk 文本
- `chunk_id`
- `semantic_id`
- `chunk_type`:`section_text` / `table` / `figure`
- `section_path`
- `section_title`
- `section_level`
- `page_start`
- `page_end`
- `doc_id`
- `doc_title`
- `source_ids`
其中:
- 向量化字段:`embedding_text`
- 展示字段:`text`
- 检索增强字段:其余 metadata
## 六、推荐的检索方式
不要只做最简单的 top-k 向量搜索,建议采用:
**向量召回 + metadata 重排 + 邻域扩展**
### 1. 向量召回
使用 `vector_chunks[*].embedding_text` 做 embedding,并在向量数据库中检索 top 10 ~ 15 条。
查询时可以对用户问题做轻微改写,例如:
原问题:
`儿童三轮车的定义是什么?`
可改写为:
`请检索 GB 14747—2006 儿童三轮车安全要求 中关于“儿童三轮车定义”的条款、术语、表格或图示说明。`
这样更适合标准文档检索。
### 2. metadata 重排
向量召回后,根据 metadata 做轻量规则重排。
常见规则:
- `chunk_type == section_text`:对定义类、要求类问题优先级更高
- `section_path` 命中查询关键词:例如查询“定义”时,`术语和定义` 章节优先
- `chunk_type == table`:对“尺寸 / 参数 / 数值 / 对照 / 要求”类问题加权
- `chunk_type == figure`:对“图 / 结构 / 状态 / 示意”类问题加权
### 3. 邻域扩展
检索命中的是最终切片,但回答往往需要更完整上下文。
建议命中某个 `vector_chunk` 后:
1. 优先回捞同一个 `semantic_id` 下的所有 chunk
2. 如果还不够,再补充同 `section_path`、相邻页码或相邻 `chunk_index` 的内容
这样可以恢复完整条款,而不是只给模型一小段碎片。
## 七、不同问题的检索重点
### 1. 定义类问题
例如:
- `儿童三轮车的定义是什么?`
- `轮距是什么意思?`
优先检索:
- `section_text`
- `section_path` 中包含 `术语和定义` 的内容
### 2. 要求类问题
例如:
- `外露突出物有什么要求?`
- `辅助推杆有哪些安全要求?`
优先检索:
- `section_text`
- `table`
### 3. 数值 / 尺寸 / 对照类问题
例如:
- `鞍座到脚蹬距离要求是什么?`
- `哪些项目需要满足规定尺寸?`
优先检索:
- `table`
- `section_text`
### 4. 图示说明类问题
例如:
- `正常乘骑状态是什么意思?`
- `图1表示什么`
优先检索:
- `figure`
- 同章节相邻 `section_text`
## 八、推荐的最终检索流程
建议采用以下固定流程:
1.`vector_chunks.embedding_text` 做 embedding 检索
2. 取 top 10 ~ 15 条候选
3.`chunk_type + section_path` 做规则重排
4.`semantic_id` 为中心回捞完整语义块
5. 选 3 ~ 5 组上下文提供给大模型回答
## 九、给大模型的上下文组织方式
最终不要直接把原始 JSON 扔给模型,建议整理成如下格式:
```text
[命中片段 1]
章节:3 术语和定义 > 3.1 儿童三轮车
页码:1-2
类型:section_text
内容:
......
[命中片段 2]
章节:4 要求 > 4.3 外露突出物
页码:5
类型:section_text
内容:
......
[命中片段 3]
章节:5 试验方法
页码:8
类型:table
内容:
......
```
这种格式更利于模型稳定回答并引用出处。
## 十、转换命令
生成三层结构:
```bash
python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
--layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
--out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json
```
自定义切片大小:
```bash
python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
--layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
--out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json \
--max-chars 500 \
--overlap-chars 80
```

View File

@@ -32,6 +32,10 @@ async def get_config():
"embedding_dim": settings.embedding_dim,
"embedding_base_url": settings.embedding_base_url,
"milvus_collection": settings.milvus_collection,
"parser_backend": settings.parser_backend,
"chunk_backend": settings.chunk_backend,
"artifact_prefix": settings.document_parse_artifact_prefix,
"parser_failure_mode": settings.parser_failure_mode,
"llm_provider": settings.llm_provider,
"llm_model": settings.llm_model,
"document_metadata_path": settings.document_metadata_path,

View File

@@ -5,6 +5,7 @@ from __future__ import annotations
import os
import tempfile
import uuid
import json
from dataclasses import dataclass
from loguru import logger
@@ -16,6 +17,7 @@ from app.domain.documents import (
DocumentParser,
DocumentRepository,
DocumentStatus,
ParsedDocument,
)
from app.domain.retrieval import EmbeddingProvider, VectorIndex
# Keep orchestration logic centralized so use-case flow stays easy to trace.
@@ -54,6 +56,27 @@ class DocumentCommandService:
self.embedding_provider = embedding_provider
self.vector_index = vector_index
def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
    """Store each parse artifact as a JSON object and return the object names used.

    Args:
        doc_id: Document identifier; used in the object path and object metadata.
        parsed_document: Parser output whose layouts, structure nodes, semantic
            blocks and vector chunks are persisted.

    Returns:
        Mapping from artifact name to the object name it was saved under.
    """
    base = parsed_document.metadata.get("artifact_prefix", "artifacts").strip("/")
    prefix = f"{base}/{doc_id}"
    named_payloads = (
        ("layouts", parsed_document.raw_layouts),
        ("structure_nodes", parsed_document.structure_nodes),
        ("semantic_blocks", parsed_document.semantic_blocks),
        ("vector_chunks", parsed_document.vector_chunks),
    )
    stored: dict[str, str] = {}
    for name, payload in named_payloads:
        object_name = f"{prefix}/{name}.json"
        encoded = json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8")
        self.binary_store.save(
            object_name=object_name,
            data=encoded,
            content_type="application/json",
            metadata={"doc_id": doc_id, "artifact_type": name},
        )
        stored[name] = object_name
    return stored
def upload_and_process(
self,
*,
@@ -104,11 +127,21 @@ class DocumentCommandService:
doc_id=doc_id,
doc_name=final_doc_name,
)
artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
self.document_repository.update_status(
doc_id,
DocumentStatus.PARSED,
parser_name=parsed_document.parser_name,
metadata={"structure_nodes": len(parsed_document.structure_nodes)},
metadata={
"parser_backend": parsed_document.parser_name,
"parse_task_id": parsed_document.metadata.get("task_id", ""),
"layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
"structure_node_count": len(parsed_document.structure_nodes),
"semantic_block_count": len(parsed_document.semantic_blocks),
"vector_chunk_count": len(parsed_document.vector_chunks),
"artifact_keys": artifact_keys,
"processing_stage": "parsed",
},
)
chunks = self.chunk_builder.build(
@@ -124,13 +157,18 @@ class DocumentCommandService:
if inserted != len(chunks):
logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))
health = self.vector_index.health()
self.document_repository.update_status(
doc_id,
DocumentStatus.INDEXED,
chunk_count=len(chunks),
summary="",
summary_latency_ms=0,
index_name=self.vector_index.health().get("collection_name", ""),
index_name=health.get("collection_name", ""),
metadata={
"index_collection": health.get("collection_name", ""),
"processing_stage": "indexed",
},
)
stored = self.document_repository.get(doc_id)
return DocumentProcessResult(
@@ -148,6 +186,10 @@ class DocumentCommandService:
doc_id,
DocumentStatus.FAILED,
error_message=str(exc),
metadata={
"failure_reason": str(exc),
"processing_stage": "failed",
},
)
return DocumentProcessResult(
doc_id=doc_id,

View File

@@ -1,9 +1,9 @@
"""Configure backend settings for settings."""
"""Configure backend settings for the backend application."""
from pathlib import Path
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
from functools import lru_cache
# Keep configuration setup explicit so runtime behavior is easy to reason about.
@@ -33,18 +33,25 @@ class Settings(BaseSettings):
# Keep configuration setup explicit so runtime behavior is easy to reason about.
milvus_host: str = Field(default="localhost", description="Milvus服务地址")
milvus_port: int = Field(default=19530, description="Milvus服务端口")
milvus_collection: str = Field(default="regulations_dense_1536", description="法规向量集合名称")
milvus_collection: str = Field(default="regulations_dense_1024_v1", description="法规向量集合名称")
milvus_db_name: str = Field(default="default", description="Milvus数据库名称")
# Keep configuration setup explicit so runtime behavior is easy to reason about.
embedding_model: str = Field(default="text-embedding-v3", description="嵌入模型名称")
embedding_dim: int = Field(default=1536, description="嵌入向量维度")
embedding_dim: int = Field(default=1024, description="嵌入向量维度")
embedding_api_key: str = Field(default="", description="Embedding API密钥")
embedding_base_url: str = Field(default="http://6.86.80.4:30080/v1", description="Embedding API地址")
embedding_timeout_seconds: int = Field(default=120, description="Embedding API超时时间(秒)")
alibaba_access_key_id: str = Field(default="", description="阿里云文档解析 Access Key ID")
alibaba_access_key_secret: str = Field(default="", description="阿里云文档解析 Access Key Secret")
alibaba_endpoint: str = Field(default="docmind-api.cn-hangzhou.aliyuncs.com", description="阿里云文档解析 endpoint")
aliyun_parse_poll_interval_seconds: int = Field(default=5, description="阿里云文档解析轮询间隔(秒)")
aliyun_parse_timeout_seconds: int = Field(default=900, description="阿里云文档解析超时时间(秒)")
aliyun_parse_layout_step_size: int = Field(default=50, description="阿里云文档解析分页步长")
aliyun_llm_enhancement: bool = Field(default=True, description="是否启用阿里云解析增强")
aliyun_enhancement_mode: str = Field(default="VLM", description="阿里云解析增强模式")
document_parse_artifact_prefix: str = Field(default="artifacts", description="解析产物对象前缀")
parser_failure_mode: str = Field(default="fail", description="解析失败策略")
# Keep configuration setup explicit so runtime behavior is easy to reason about.
minio_endpoint: str = Field(default="localhost:9000", description="MinIO服务地址")
@@ -71,8 +78,8 @@ class Settings(BaseSettings):
chunk_overlap: int = Field(default=50, description="分块重叠大小")
max_file_size_mb: int = Field(default=100, description="最大文件大小(MB)")
document_metadata_path: str = Field(default="backend/data/documents.json", description="文档元数据存储路径")
parser_backend: str = Field(default="local", description="解析后端(local/aliyun)")
chunk_backend: str = Field(default="local", description="分块后端(local/aliyun)")
parser_backend: str = Field(default="aliyun", description="解析后端(local/aliyun)")
chunk_backend: str = Field(default="aliyun", description="分块后端(local/aliyun)")
# Keep configuration setup explicit so runtime behavior is easy to reason about.
api_host: str = Field(default="0.0.0.0", description="API服务地址")

View File

@@ -27,12 +27,12 @@ class Settings(BaseSettings):
# Milvus
milvus_host: str = "localhost"
milvus_port: int = 19530
milvus_collection: str = "regulations_dense_1536"
milvus_collection: str = "regulations_dense_1024_v1"
# LLM / embedding defaults aligned with the migrated backend path.
llm_model: str = "qwen-max"
embedding_model: str = "text-embedding-v3"
embedding_dim: int = 1536
embedding_dim: int = 1024
# Legacy workflow compatibility only.
vector_top_k: int = 10
@@ -47,7 +47,7 @@ class Settings(BaseSettings):
api_port: int = 8000
# Legacy aliases retained for old utility modules.
regulations_collection: str = "regulations_dense_1536"
regulations_collection: str = "regulations_dense_1024_v1"
compliance_collection: str = "compliance_cache"
# Preserve the legacy module API while keeping env resolution centralized at the repo root.

View File

@@ -56,6 +56,7 @@ class ParsedDocument:
vector_chunks: list[dict[str, Any]]
parser_name: str
raw_text: str = ""
raw_layouts: list[dict[str, Any]] = field(default_factory=list)
metadata: dict[str, Any] = field(default_factory=dict)

View File

@@ -10,6 +10,8 @@ from app.config.settings import settings
from app.domain.retrieval import EmbeddingProvider
# Keep adapter behavior explicit so integration details remain easy to audit.
EMBEDDING_BATCH_SIZE = 8
class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
@@ -27,6 +29,18 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
self.timeout = settings.embedding_timeout_seconds
self.dimension = settings.embedding_dim
def _raise_for_status(self, response: httpx.Response, *, batch_size: int) -> None:
    """Re-raise an HTTP error status with request context added to the message.

    Args:
        response: The embedding endpoint response to check.
        batch_size: Number of texts in the request; included in the error text.

    Raises:
        httpx.HTTPStatusError: When ``response.raise_for_status()`` raises; the new
            error carries the model, batch size, status, URL and the first 500
            characters of the response body, and is chained to the original error.
    """
    try:
        response.raise_for_status()
        return
    except httpx.HTTPStatusError as original_error:
        body_excerpt = response.text[:500].strip()
        message = (
            f"Embedding request failed for model={self.model}, batch_size={batch_size}, "
            f"status={response.status_code}, url={response.request.url}, response={body_excerpt}"
        )
        raise httpx.HTTPStatusError(
            message,
            request=original_error.request,
            response=original_error.response,
        ) from original_error
def _request(self, texts: list[str]) -> list[list[float]]:
"""Handle request for this module for the Open A I Compatible Embedding Provider instance."""
if not self.api_key:
@@ -40,7 +54,7 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
json={"model": self.model, "input": texts},
timeout=self.timeout,
)
response.raise_for_status()
self._raise_for_status(response, batch_size=len(texts))
data = response.json()
vectors = [item["embedding"] for item in sorted(data.get("data", []), key=lambda item: item["index"])]
if any(len(vector) != self.dimension for vector in vectors):
@@ -51,7 +65,12 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
"""Embed texts for the Open A I Compatible Embedding Provider instance."""
if not texts:
return []
return self._request(texts)
vectors: list[list[float]] = []
# Batch requests conservatively because some gateways reject larger embedding payloads.
for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
batch = texts[start:start + EMBEDDING_BATCH_SIZE]
vectors.extend(self._request(batch))
return vectors
def embed_query(self, text: str) -> list[float]:
"""Embed query for the Open A I Compatible Embedding Provider instance."""

View File

@@ -0,0 +1,142 @@
"""Aliyun Docmind gateway helpers for the document ingest pipeline."""
from __future__ import annotations
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from alibabacloud_docmind_api20220711 import models as docmind_models
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_tea_util import models as util_models
from app.config.settings import settings
# Keep provider-specific behavior isolated so the rest of the backend can stay stable.
@dataclass
class AliyunParsePayload:
    """Represent the raw Aliyun parse payload returned by the gateway."""

    # Identifier of the Docmind parse task that produced these layouts.
    task_id: str
    # Layout records gathered from every result page, in retrieval order.
    layouts: list[dict[str, Any]]
    # Number of status queries issued before the task reported success.
    poll_attempts: int
    # Elapsed time for submit + polling + result collection, in milliseconds.
    duration_ms: int
class AliyunDocmindGateway:
    """Submit, poll, and collect results from the Aliyun Docmind API."""

    # Lower-cased status values treated as a terminal failure. The Docmind status
    # reference lists "Fail" for failed jobs; "failed" is kept so previously
    # handled payloads behave the same. Without "fail", a failed job would be
    # polled until the timeout instead of failing fast.
    _FAILURE_STATUSES = frozenset({"fail", "failed"})

    def __init__(self) -> None:
        """Initialize the gateway with runtime configuration."""
        self.endpoint = settings.alibaba_endpoint
        self.poll_interval_seconds = settings.aliyun_parse_poll_interval_seconds
        self.timeout_seconds = settings.aliyun_parse_timeout_seconds
        self.layout_step_size = settings.aliyun_parse_layout_step_size
        self.llm_enhancement = settings.aliyun_llm_enhancement
        self.enhancement_mode = settings.aliyun_enhancement_mode

    def parse_document(self, *, file_path: str) -> AliyunParsePayload:
        """Parse a single document and return the collected layouts.

        Args:
            file_path: Path of the local file to submit.

        Returns:
            The task id, all layouts, the number of status polls and the total
            elapsed time in milliseconds.

        Raises:
            ValueError: If the AccessKey settings are missing.
            RuntimeError: If no task id is returned, the status payload is empty,
                the task fails, or no layouts are returned.
            TimeoutError: If the task does not finish within the configured timeout.
        """
        client = self._create_client()
        # The timeout clock starts before submission, so upload time counts too.
        started_at = time.monotonic()
        task_id = self._submit_job(client=client, file_path=file_path)
        poll_attempts = self._wait_for_completion(client=client, task_id=task_id, started_at=started_at)
        layouts = self._collect_all_results(client=client, task_id=task_id)
        duration_ms = int((time.monotonic() - started_at) * 1000)
        return AliyunParsePayload(
            task_id=task_id,
            layouts=layouts,
            poll_attempts=poll_attempts,
            duration_ms=duration_ms,
        )

    def _create_client(self) -> DocmindClient:
        """Create a Docmind client using explicit AccessKey settings only.

        Raises:
            ValueError: If either AccessKey setting is empty.
        """
        config = open_api_models.Config()
        config.endpoint = self.endpoint
        if not settings.alibaba_access_key_id or not settings.alibaba_access_key_secret:
            raise ValueError(
                "Missing Aliyun parser credentials. Set ALIBABA_ACCESS_KEY_ID and "
                "ALIBABA_ACCESS_KEY_SECRET in the project root .env."
            )
        # Keep production behavior deterministic by using only project-configured credentials.
        config.access_key_id = settings.alibaba_access_key_id
        config.access_key_secret = settings.alibaba_access_key_secret
        return DocmindClient(config)

    def _submit_job(self, *, client: DocmindClient, file_path: str) -> str:
        """Submit an asynchronous Docmind parse job and return its task id.

        Raises:
            RuntimeError: If the response carries no task id.
        """
        path = Path(file_path)
        # The stream must stay open until the submit call returns, because the
        # request object holds the file handle rather than its contents.
        with open(file_path, "rb") as file_stream:
            request = docmind_models.SubmitDocParserJobAdvanceRequest(
                file_url_object=file_stream,
                file_name=path.name,
                file_name_extension=path.suffix.lstrip("."),
                llm_enhancement=self.llm_enhancement,
                enhancement_mode=self.enhancement_mode,
            )
            runtime = util_models.RuntimeOptions()
            response = client.submit_doc_parser_job_advance(request, runtime)
        task_id = response.body.data.id if response.body and response.body.data else ""
        if not task_id:
            raise RuntimeError("Aliyun Docmind did not return a parse task id.")
        return task_id

    def _query_status(self, *, client: DocmindClient, task_id: str) -> dict[str, Any] | None:
        """Query the current parse status; return ``None`` when the response has no data."""
        request = docmind_models.QueryDocParserStatusRequest(id=task_id)
        response = client.query_doc_parser_status(request)
        return response.body.data.to_map() if response.body and response.body.data else None

    def _wait_for_completion(self, *, client: DocmindClient, task_id: str, started_at: float) -> int:
        """Poll until the parse job finishes or times out.

        Args:
            client: Docmind client used for the status queries.
            task_id: Identifier of the task to poll.
            started_at: ``time.monotonic()`` reading the timeout is measured from.

        Returns:
            The number of status queries issued, including the successful one.

        Raises:
            RuntimeError: If a status payload is empty or reports a failure status.
            TimeoutError: If the elapsed time exceeds ``timeout_seconds``.
        """
        poll_attempts = 0
        while True:
            poll_attempts += 1
            status_payload = self._query_status(client=client, task_id=task_id)
            if not status_payload:
                raise RuntimeError(f"Aliyun parse status payload is empty for task {task_id}.")
            # Compare case-insensitively; the status casing is not uniform.
            status = str(status_payload.get("Status", "")).lower()
            if status == "success":
                return poll_attempts
            if status in self._FAILURE_STATUSES:
                raise RuntimeError(f"Aliyun parse task failed: {status_payload}")
            elapsed = time.monotonic() - started_at
            if elapsed > self.timeout_seconds:
                raise TimeoutError(
                    f"Aliyun parse task timed out after {self.timeout_seconds}s: task_id={task_id}"
                )
            time.sleep(self.poll_interval_seconds)

    def _collect_all_results(self, *, client: DocmindClient, task_id: str) -> list[dict[str, Any]]:
        """Collect all paginated layout results from a completed parse task.

        Raises:
            RuntimeError: If no layouts were returned at all.
        """
        all_layouts: list[dict[str, Any]] = []
        layout_num = 0
        while True:
            request = docmind_models.GetDocParserResultRequest(
                id=task_id,
                layout_step_size=self.layout_step_size,
                layout_num=layout_num,
            )
            response = client.get_doc_parser_result(request)
            payload = response.body.data if response.body else None
            if not payload:
                break
            layouts = payload.get("layouts", [])
            if not layouts:
                break
            all_layouts.extend(layouts)
            # Advance by what was actually received, not by the requested step.
            layout_num += len(layouts)
            # A short page means the last page has been read.
            if len(layouts) < self.layout_step_size:
                break
        if not all_layouts:
            raise RuntimeError(f"Aliyun parse task returned no layouts: task_id={task_id}")
        return all_layouts

View File

@@ -1,19 +1,18 @@
"""Implement infrastructure support for aliyun document parser."""
"""Implement infrastructure support for Aliyun document parsing."""
from __future__ import annotations
from app.aliyun_parser.parse_pdf import (
from app.config.settings import settings
from app.domain.documents import DocumentParser, ParsedDocument
from app.infrastructure.parser.aliyun_docmind_gateway import AliyunDocmindGateway
from app.infrastructure.parser.aliyun_layout_normalizer import (
MAX_CHARS,
OVERLAP_CHARS,
build_semantic_blocks,
build_structure_nodes,
build_vector_chunks,
collect_all_results,
init_client,
submit_job,
wait_for_completion,
)
from app.domain.documents import DocumentParser, ParsedDocument
# Keep adapter behavior explicit so integration details remain easy to audit.
@@ -22,13 +21,14 @@ class AliyunDocumentParser(DocumentParser):
"""Provide the Aliyun Document Parser parser."""
parser_name = "aliyun_docmind"
def __init__(self) -> None:
"""Initialize the parser adapter and its gateway dependency."""
self.gateway = AliyunDocmindGateway()
def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
"""Handle parse for the Aliyun Document Parser instance."""
client = init_client()
task_id = submit_job(client, file_path)
if not wait_for_completion(client, task_id):
raise RuntimeError("阿里云文档解析任务失败")
layouts = collect_all_results(client, task_id)
payload = self.gateway.parse_document(file_path=file_path)
layouts = payload.layouts
structure_nodes = build_structure_nodes(layouts)
semantic_blocks = build_semantic_blocks(layouts)
vector_chunks = build_vector_chunks(
@@ -51,5 +51,13 @@ class AliyunDocumentParser(DocumentParser):
vector_chunks=vector_chunks,
parser_name=self.parser_name,
raw_text=raw_text,
metadata={"task_id": task_id, "layout_count": len(layouts)},
raw_layouts=layouts,
metadata={
"task_id": payload.task_id,
"layout_count": len(layouts),
"poll_attempts": payload.poll_attempts,
"duration_ms": payload.duration_ms,
"parser_backend": self.parser_name,
"artifact_prefix": settings.document_parse_artifact_prefix,
},
)

View File

@@ -0,0 +1,336 @@
"""Normalize Aliyun Docmind layouts into production document structures."""
from __future__ import annotations
import re
from typing import Any
# Keep layout normalization rules centralized so parser and demos stay aligned.
MAX_CHARS = 600
OVERLAP_CHARS = 80
TOC_TITLES = {"目次", "目录"}
TITLE_SUBTYPES = {"doc_title", "para_title"}
TEXT_SUBTYPES = {"para", "none"}
FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
def normalize_text(text: str | None) -> str:
    """Normalize raw text content emitted by the parser.

    Carriage returns become newlines, no-break and ideographic spaces become
    ordinary spaces, runs of newlines collapse to one newline, runs of spaces
    and tabs collapse to one space, and the result is stripped.

    Args:
        text: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The normalized text.
    """
    if not text:
        # Treat a missing/null text value as empty instead of raising.
        return ""
    text = text.replace("\r", "\n")
    # Spell the special spaces as escapes so the literal is not an invisible
    # character that editors or copy/paste can silently turn into a no-op.
    text = text.replace("\u00a0", " ").replace("\u3000", " ")
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()
def get_page(layout: dict[str, Any]) -> int:
    """Return the layout's ``pageNum``, else its ``pageNumber``, else 0."""
    if "pageNum" in layout:
        return layout["pageNum"]
    return layout.get("pageNumber", 0)
def get_text(layout: dict[str, Any]) -> str:
    """Return the normalized ``text`` of a layout, or its normalized ``markdownContent`` when that is empty."""
    return normalize_text(layout.get("text", "")) or normalize_text(layout.get("markdownContent", ""))
def is_title(layout: dict[str, Any]) -> bool:
    """Tell whether a layout is a title, by its ``type`` or by its ``subType``."""
    if layout.get("type") == "title":
        return True
    return layout.get("subType") in TITLE_SUBTYPES
def is_text(layout: dict[str, Any]) -> bool:
    """Tell whether a layout is plain paragraph text.

    The ``type`` must be ``"text"`` and the ``subType`` (defaulting to ``"none"``
    when absent) must be one of the accepted text subtypes.
    """
    if layout.get("type") != "text":
        return False
    return layout.get("subType", "none") in TEXT_SUBTYPES
def is_figure(layout: dict[str, Any]) -> bool:
    """Tell whether a layout is figure-related, by its ``type`` or by its ``subType``."""
    if layout.get("type") in FIGURE_TYPES:
        return True
    return layout.get("subType") in FIGURE_SUBTYPES
def is_table(layout: dict[str, Any]) -> bool:
    """Tell whether a layout's ``type`` is ``"table"``."""
    layout_type = layout.get("type")
    return layout_type == "table"
def is_toc_layout(layout: dict[str, Any]) -> bool:
    """Tell whether a layout looks like part of a table of contents.

    A layout matches when its text is a known TOC heading, or when it sits on
    page 1 and reads like a numbered entry followed by leader dots and a page
    number.
    """
    content = get_text(layout)
    if content in TOC_TITLES:
        return True
    if get_page(layout) != 1:
        return False
    return re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", content) is not None
def extract_table_text(layout: dict[str, Any]) -> str:
    """Flatten a table layout's cells into plain text, one line per non-empty cell.

    Within a cell, the normalized texts of its nested layouts are joined with
    single spaces; cells that yield no text are omitted.
    """
    lines: list[str] = []
    for cell in layout.get("cells", []):
        fragments = [normalize_text(inner.get("text", "")) for inner in cell.get("layouts", [])]
        cell_line = " ".join(fragment for fragment in fragments if fragment)
        if cell_line:
            lines.append(cell_line)
    return "\n".join(lines).strip()
def build_structure_nodes(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collect one structure node per title layout, in input order.

    Title layouts with no text, or whose text is a table-of-contents heading,
    are left out.
    """
    nodes: list[dict[str, Any]] = []
    for layout in layouts:
        if is_title(layout):
            heading = get_text(layout)
            if heading and heading not in TOC_TITLES:
                node = {
                    "unique_id": layout.get("uniqueId"),
                    "page": get_page(layout),
                    "index": layout.get("index", 0),
                    "level": layout.get("level", 0),
                    "title": heading,
                    "type": layout.get("type"),
                    "sub_type": layout.get("subType"),
                }
                nodes.append(node)
    return nodes
def update_section_path(
    section_stack: list[dict[str, Any]],
    layout: dict[str, Any],
) -> list[dict[str, Any]]:
    """Push a title layout onto the heading stack, in place, and return the stack.

    Entries at the top of the stack whose level is the same as or deeper than
    the new title's level are popped first.
    """
    level = layout.get("level", 0)
    entry = {
        "level": level,
        "title": get_text(layout),
        "page": get_page(layout),
        "unique_id": layout.get("uniqueId"),
    }
    while section_stack:
        if section_stack[-1]["level"] < level:
            break
        section_stack.pop()
    section_stack.append(entry)
    return section_stack
def section_path_titles(section_stack: list[dict[str, Any]]) -> list[str]:
    """Return the ``title`` of every heading stack entry, outermost first."""
    titles: list[str] = []
    for entry in section_stack:
        titles.append(entry["title"])
    return titles
def flush_text_block(
    blocks: list[dict[str, Any]],
    semantic_blocks: list[dict[str, Any]],
    block_id: int,
) -> int:
    """Merge buffered paragraph entries into one ``section_text`` block.

    Args:
        blocks: Buffered paragraph entries; the section fields are taken from the first.
        semantic_blocks: Output list the merged block is appended to.
        block_id: Number used for the new block's ``semantic_id``.

    Returns:
        ``block_id + 1`` when a block was appended, otherwise ``block_id`` unchanged
        (nothing buffered, or no non-empty text).
    """
    if not blocks:
        return block_id
    merged_text = "\n".join(entry["text"] for entry in blocks if entry["text"]).strip()
    if not merged_text:
        return block_id
    pages = [entry["page"] for entry in blocks]
    first = blocks[0]
    merged_block = {
        "semantic_id": f"semantic-{block_id}",
        "block_type": "section_text",
        "page_start": min(pages),
        "page_end": max(pages),
        "section_path": first["section_path"],
        "section_level": first["section_level"],
        "section_title": first["section_title"],
        "source_ids": [entry["unique_id"] for entry in blocks if entry.get("unique_id")],
        "text": merged_text,
    }
    semantic_blocks.append(merged_block)
    return block_id + 1
def build_semantic_blocks(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build semantic content blocks from raw Aliyun layouts.

    Layouts are processed in input order. Titles update the current section
    path, consecutive paragraph layouts are buffered and merged into one
    ``section_text`` block, and each table or figure layout with text becomes
    its own block. Every block is numbered ``semantic-<n>`` starting at 1.

    Args:
        layouts: Raw layout records.

    Returns:
        The semantic blocks in the order they were emitted.
    """
    semantic_blocks: list[dict[str, Any]] = []
    section_stack: list[dict[str, Any]] = []
    pending_text_blocks: list[dict[str, Any]] = []
    block_id = 1
    skip_toc_page = False
    for layout in layouts:
        text = get_text(layout)
        page = get_page(layout)
        # A table-of-contents layout is dropped and arms page-1 skipping.
        if is_toc_layout(layout):
            skip_toc_page = True
            continue
        # While armed, every remaining layout on page 1 is dropped as well.
        if skip_toc_page and page == 1:
            continue
        # The first layout on any other page disarms the skipping.
        if skip_toc_page and page != 1:
            skip_toc_page = False
        if is_title(layout):
            # A new heading closes the paragraph run of the previous section.
            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
            pending_text_blocks = []
            section_stack = update_section_path(section_stack, layout)
            continue
        section_path = section_path_titles(section_stack)
        # Content seen before any heading is filed under a placeholder title.
        section_title = section_path[-1] if section_path else "未分类"
        section_level = len(section_path)
        if is_table(layout):
            # Flush first so the table block keeps its position in reading order.
            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
            pending_text_blocks = []
            table_text = extract_table_text(layout)
            if table_text:
                semantic_blocks.append(
                    {
                        "semantic_id": f"semantic-{block_id}",
                        "block_type": "table",
                        "page_start": page,
                        "page_end": page,
                        "section_path": section_path,
                        "section_level": section_level,
                        "section_title": section_title,
                        "source_ids": [layout.get("uniqueId")],
                        "text": table_text,
                    }
                )
                block_id += 1
            continue
        if is_figure(layout):
            # Same ordering rule as tables: flush buffered paragraphs first.
            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
            pending_text_blocks = []
            if text:
                semantic_blocks.append(
                    {
                        "semantic_id": f"semantic-{block_id}",
                        "block_type": "figure",
                        "page_start": page,
                        "page_end": page,
                        "section_path": section_path,
                        "section_level": section_level,
                        "section_title": section_title,
                        "source_ids": [layout.get("uniqueId")],
                        "text": text,
                    }
                )
                block_id += 1
            continue
        # Paragraph text is buffered; layouts of any other kind are ignored.
        if is_text(layout) and text:
            pending_text_blocks.append(
                {
                    "page": page,
                    "text": text,
                    "unique_id": layout.get("uniqueId"),
                    "section_path": section_path,
                    "section_level": section_level,
                    "section_title": section_title,
                }
            )
    # Emit whatever paragraph run is still buffered at the end of the document.
    flush_text_block(pending_text_blocks, semantic_blocks, block_id)
    return semantic_blocks
def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> list[str]:
    """Split long text into overlapping windows for embedding.

    Args:
        text: Text to split; it is stripped first.
        max_chars: Maximum window length in characters.
        overlap_chars: Characters shared by consecutive windows. Values outside
            ``0 .. max_chars - 1`` are clamped into that range so every window
            starts after the previous one and no text is skipped.

    Returns:
        The stripped, non-empty windows in order. Text no longer than
        ``max_chars`` is returned as a single window; empty text yields ``[]``.

    Raises:
        ValueError: If the text needs splitting but ``max_chars`` is not positive.
    """
    text = text.strip()
    if len(text) <= max_chars:
        return [text] if text else []
    if max_chars <= 0:
        # A non-positive window can never consume the text.
        raise ValueError("max_chars must be a positive integer")

    # An overlap >= max_chars would leave the window start unchanged and loop
    # forever; a negative overlap would jump over characters between windows.
    overlap = min(max(overlap_chars, 0), max_chars - 1)
    step = max_chars - overlap

    parts: list[str] = []
    for start in range(0, len(text), step):
        end = start + max_chars
        piece = text[start:end].strip()
        if piece:
            parts.append(piece)
        if end >= len(text):
            break
    return parts
def build_vector_chunks(
    semantic_blocks: list[dict[str, Any]],
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> list[dict[str, Any]]:
    """Turn semantic blocks into retrieval chunks.

    Each block's ``text`` is split with ``split_text_with_overlap``; every
    resulting piece becomes one chunk that copies the block's location and
    section fields. Chunks are numbered from 1 across the whole document,
    while ``piece_index`` restarts at 1 inside each block.

    Args:
        semantic_blocks: Blocks providing ``text``, ``semantic_id``,
            ``block_type``, ``page_start``, ``page_end``, ``section_path``,
            ``section_level``, ``section_title`` and ``source_ids``.
        doc_id: Document identifier stored on every chunk.
        doc_title: Document title stored on every chunk and used in the
            embedding header.
        max_chars: Maximum piece length passed to the splitter.
        overlap_chars: Overlap between pieces passed to the splitter.

    Returns:
        Chunk dictionaries in document order.
    """
    chunks: list[dict[str, Any]] = []
    for block in semantic_blocks:
        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
        if not pieces:
            # No text survived splitting, so this block contributes no chunks.
            continue

        # The header only depends on the block, so build it once per block.
        section_path = block["section_path"]
        header = (
            f"标准：{doc_title}\n章节：{' > '.join(section_path)}\n\n"
            if section_path
            else f"标准：{doc_title}\n\n"
        )

        for piece_index, piece in enumerate(pieces, start=1):
            # Document-wide, 1-based position of the chunk being added.
            chunk_index = len(chunks) + 1
            chunks.append(
                {
                    "doc_id": doc_id,
                    "doc_title": doc_title,
                    "chunk_id": f"chunk-{chunk_index}",
                    "chunk_index": chunk_index,
                    "semantic_id": block["semantic_id"],
                    "chunk_type": block["block_type"],
                    "piece_index": piece_index,
                    "page_start": block["page_start"],
                    "page_end": block["page_end"],
                    "section_path": section_path,
                    "section_level": block["section_level"],
                    "section_title": block["section_title"],
                    "source_ids": block["source_ids"],
                    "text": piece,
                    # Enriched text keeps the section context for retrieval.
                    "embedding_text": header + piece,
                }
            )
    return chunks
def convert_layouts(
    layouts: list[dict[str, Any]],
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> dict[str, Any]:
    """Build the three-layer ingest payload from raw Aliyun layouts.

    Structure nodes and semantic blocks are both derived from ``layouts``;
    vector chunks are then derived from the semantic blocks.

    Args:
        layouts: Raw layout dictionaries returned by the parser.
        doc_id: Document identifier copied into the payload and its chunks.
        doc_title: Document title copied into the payload and its chunks.
        max_chars: Maximum chunk length forwarded to the chunk builder.
        overlap_chars: Chunk overlap forwarded to the chunk builder.

    Returns:
        A dictionary with ``doc_id``, ``doc_title``, ``structure_nodes``,
        ``semantic_blocks`` and ``vector_chunks``.
    """
    payload: dict[str, Any] = {"doc_id": doc_id, "doc_title": doc_title}
    payload["structure_nodes"] = build_structure_nodes(layouts)

    blocks = build_semantic_blocks(layouts)
    payload["semantic_blocks"] = blocks
    payload["vector_chunks"] = build_vector_chunks(
        blocks,
        doc_id=doc_id,
        doc_title=doc_title,
        max_chars=max_chars,
        overlap_chars=overlap_chars,
    )
    return payload

View File

@@ -4,6 +4,7 @@ from __future__ import annotations
from pathlib import Path
from app.config.settings import settings
from app.domain.documents import DocumentParser, ParsedDocument
from app.services.parser.docx_parser import parse_docx_to_markdown
from app.services.parser.pdf_parser import parse_pdf_to_markdown
@@ -34,5 +35,10 @@ class LocalDocumentParser(DocumentParser):
vector_chunks=[],
parser_name=self.parser_name,
raw_text=markdown_text,
metadata={"source": "local_parser", "file_suffix": suffix},
raw_layouts=[],
metadata={
"source": "local_parser",
"file_suffix": suffix,
"artifact_prefix": settings.document_parse_artifact_prefix,
},
)

View File

@@ -21,5 +21,365 @@
"generate_summary": true,
"structure_nodes": 0
}
},
"44121fbb": {
"doc_id": "44121fbb",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "44121fbb/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T09:53:47.996183+00:00",
"updated_at": "2026-05-18T09:53:50.825868+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"77debb4a": {
"doc_id": "77debb4a",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "77debb4a/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:05:46.104259+00:00",
"updated_at": "2026-05-18T10:05:48.704061+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"d12bdcc8": {
"doc_id": "d12bdcc8",
"doc_name": "TCT算法接口.pdf",
"file_name": "TCT算法接口.pdf",
"object_name": "d12bdcc8/TCT算法接口.pdf",
"content_type": "application/pdf",
"size_bytes": 165557,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:07:22.199824+00:00",
"updated_at": "2026-05-18T10:07:24.653751+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"3c2e8c9c": {
"doc_id": "3c2e8c9c",
"doc_name": "20260415_Continental tire mobile app solution.pdf",
"file_name": "20260415_Continental tire mobile app solution.pdf",
"object_name": "3c2e8c9c/20260415_Continental tire mobile app solution.pdf",
"content_type": "application/pdf",
"size_bytes": 2178074,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:09:58.338274+00:00",
"updated_at": "2026-05-18T10:10:01.295502+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"d22d21a0": {
"doc_id": "d22d21a0",
"doc_name": "20260415_Continental tire mobile app solution.pdf",
"file_name": "20260415_Continental tire mobile app solution.pdf",
"object_name": "d22d21a0/20260415_Continental tire mobile app solution.pdf",
"content_type": "application/pdf",
"size_bytes": 2178074,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:12:20.078027+00:00",
"updated_at": "2026-05-18T10:12:22.999843+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"35f129d3": {
"doc_id": "35f129d3",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "35f129d3/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:13:24.706512+00:00",
"updated_at": "2026-05-18T10:13:27.180509+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"efc21515": {
"doc_id": "efc21515",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "efc21515/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
"created_at": "2026-05-18T13:47:32.076786+00:00",
"updated_at": "2026-05-18T13:47:57.998073+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-a6e84447457f43cb85f95225cfc6495b",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/efc21515/layouts.json",
"structure_nodes": "artifacts/efc21515/structure_nodes.json",
"semantic_blocks": "artifacts/efc21515/semantic_blocks.json",
"vector_chunks": "artifacts/efc21515/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400"
}
},
"0d4b08bc": {
"doc_id": "0d4b08bc",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "0d4b08bc/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
"created_at": "2026-05-18T14:03:15.134344+00:00",
"updated_at": "2026-05-18T14:03:34.843448+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-78353d85daa24147b68d8fb71895179f",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/0d4b08bc/layouts.json",
"structure_nodes": "artifacts/0d4b08bc/structure_nodes.json",
"semantic_blocks": "artifacts/0d4b08bc/semantic_blocks.json",
"vector_chunks": "artifacts/0d4b08bc/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404"
}
},
"4302f314": {
"doc_id": "4302f314",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "4302f314/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "embedding 维度不匹配,期望 1536",
"created_at": "2026-05-18T14:11:29.943973+00:00",
"updated_at": "2026-05-18T14:11:48.554500+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-23935ee455ac4b26ac4201ac4781ee52",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/4302f314/layouts.json",
"structure_nodes": "artifacts/4302f314/structure_nodes.json",
"semantic_blocks": "artifacts/4302f314/semantic_blocks.json",
"vector_chunks": "artifacts/4302f314/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "embedding 维度不匹配,期望 1536"
}
},
"765ed1ee": {
"doc_id": "765ed1ee",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "765ed1ee/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>",
"created_at": "2026-05-18T14:18:28.875138+00:00",
"updated_at": "2026-05-18T14:18:57.389110+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-f116856bc29245baa2531b245078a701",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/765ed1ee/layouts.json",
"structure_nodes": "artifacts/765ed1ee/structure_nodes.json",
"semantic_blocks": "artifacts/765ed1ee/semantic_blocks.json",
"vector_chunks": "artifacts/765ed1ee/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>"
}
},
"05cabe09": {
"doc_id": "05cabe09",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "05cabe09/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "embedding 维度不匹配,期望 1536",
"created_at": "2026-05-18T14:24:32.156500+00:00",
"updated_at": "2026-05-18T14:24:50.114138+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-897d858983df48e28e9819e563d46208",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/05cabe09/layouts.json",
"structure_nodes": "artifacts/05cabe09/structure_nodes.json",
"semantic_blocks": "artifacts/05cabe09/semantic_blocks.json",
"vector_chunks": "artifacts/05cabe09/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "embedding 维度不匹配,期望 1536"
}
},
"9acb2ba0": {
"doc_id": "9acb2ba0",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "9acb2ba0/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "indexed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 27,
"parser_name": "aliyun_docmind",
"index_name": "regulations_dense_1024_v1",
"error_message": "",
"created_at": "2026-05-18T14:29:01.368719+00:00",
"updated_at": "2026-05-18T14:29:23.699068+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-e5fd4a5419e74d569c562e389e6ae72c",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/9acb2ba0/layouts.json",
"structure_nodes": "artifacts/9acb2ba0/structure_nodes.json",
"semantic_blocks": "artifacts/9acb2ba0/semantic_blocks.json",
"vector_chunks": "artifacts/9acb2ba0/vector_chunks.json"
},
"processing_stage": "indexed",
"index_collection": "regulations_dense_1024_v1"
}
}
}

View File

@@ -0,0 +1,71 @@
# 阿里云解析主链路实现说明
本文档描述当前仓库已经落地的文档 ingest 主链路实现,作为迁移设计到代码实现之间的收口说明。
## 1. 当前默认链路
- 上传入口保持为 `/api/v1/documents/upload`
- 默认 `PARSER_BACKEND=aliyun`
- 默认 `CHUNK_BACKEND=aliyun`
- 默认 Milvus collection 为 `regulations_dense_1024_v1`
- 解析产物落到 MinIO `artifacts/{doc_id}/`
完整主链路如下:
1. 原始文件上传到 MinIO
2. `AliyunDocmindGateway` 提交阿里云异步解析任务
3. 轮询任务状态直到成功或超时
4. 分页拉取 `layouts`
5. 转换为 `structure_nodes / semantic_blocks / vector_chunks`
6. 三层结构 JSON 回写 MinIO
7. 使用 `vector_chunks[*].embedding_text` 调 embedding API
8. 写入 `regulations_dense_1024_v1`
9. 文档状态更新为 `indexed`
运行时转换逻辑位于 `backend/app/infrastructure/parser/aliyun_layout_normalizer.py`
旧的 `backend/app/aliyun_parser/` 示例目录已移除,不参与生产运行时。
## 2. 解析产物持久化
每个文档会额外写入以下对象:
- `artifacts/{doc_id}/layouts.json`
- `artifacts/{doc_id}/structure_nodes.json`
- `artifacts/{doc_id}/semantic_blocks.json`
- `artifacts/{doc_id}/vector_chunks.json`
`documents.json` 仅保留对象 key、统计信息和处理阶段，不保存完整大 JSON。
## 3. 失败策略
- 当前 `PARSER_FAILURE_MODE=fail`
- 阿里云解析失败不自动回退到本地 parser
- 失败时保留原始文件与已写入的 artifacts，便于排障
## 4. 运行参数
关键环境变量如下:
- `ALIBABA_ACCESS_KEY_ID`
- `ALIBABA_ACCESS_KEY_SECRET`
- `ALIBABA_ENDPOINT`
- `ALIYUN_PARSE_POLL_INTERVAL_SECONDS`
- `ALIYUN_PARSE_TIMEOUT_SECONDS`
- `ALIYUN_PARSE_LAYOUT_STEP_SIZE`
- `ALIYUN_LLM_ENHANCEMENT`
- `ALIYUN_ENHANCEMENT_MODE`
- `DOCUMENT_PARSE_ARTIFACT_PREFIX`
- `PARSER_BACKEND`
- `CHUNK_BACKEND`
## 5. 运行态确认
可通过 `/api/v1/status/config` 确认以下字段:
- `parser_backend`
- `chunk_backend`
- `milvus_collection`
- `artifact_prefix`
- `parser_failure_mode`
这几个值用于确认服务是否实际运行在迁移后的默认链路上。

View File

@@ -29,7 +29,7 @@
已确认的目标需求如下:
- 文档解析统一改为阿里云文档智能能力
- 当前阿里云接入基础来自 `backend/app/aliyun_parser/parse_pdf.py`
- 当前阿里云接入基础已经迁移到 `backend/app/infrastructure/parser/aliyun_layout_normalizer.py`
- 解析结果以 `structure_nodes`、`semantic_blocks`、`vector_chunks` 三层结构为基础
- 分块以阿里云 `vector_chunks` 为准,不再走当前本地 `RegulationChunker`
- embedding 改为 OpenAI 兼容 API 调用,模型使用 `text-embedding-v3`
@@ -80,7 +80,7 @@
受影响的解析能力范围包括:
- 当前本地 parser 目录
- `backend/app/aliyun_parser`
- `backend/app/infrastructure/parser`
迁移后阿里云文档智能能力将成为主解析来源,本地 PDF/DOCX/MinerU 解析链路需要重新界定保留、下线或回退策略,但具体模块组织方式不在本文件内定义。
@@ -133,7 +133,7 @@
以下风险和约束在本期已经明确,需要在后续架构和实施阶段优先处理:
- 旧 Milvus collection 与新 `1536` 维 schema 不兼容,需要新 collection 和重建索引
- `backend/app/aliyun_parser` 现有脚本含硬编码密钥,后续必须全部移到环境变量
- 阿里云凭据必须继续只通过环境变量或凭据链注入,不能回到脚本内硬编码
- RAG 下游当前对 `clause_number` 有依赖,迁移后需要优先适配 `section_title` 和 Aliyun chunk metadata
- 如果阿里云返回字段与当前样例不同,需要在架构阶段补充 adapter 层

View File

@@ -1,4 +1,4 @@
"""新架构下的文档编排与 embedding 边界测试。"""
"""Document orchestration and embedding boundary tests for the migrated backend."""
from __future__ import annotations
@@ -80,6 +80,7 @@ class FakeParser:
return ParsedDocument(
doc_id=doc_id,
doc_name=doc_name,
raw_layouts=[{"uniqueId": "layout-1", "type": "text"}],
structure_nodes=[{"title": "第一章"}],
semantic_blocks=[{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}],
vector_chunks=[
@@ -95,6 +96,7 @@ class FakeParser:
}
],
parser_name="fake_parser",
metadata={"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1},
)
@@ -125,10 +127,10 @@ class FakeEmbeddingProvider:
def embed_texts(self, texts: list[str]) -> list[list[float]]:
self.calls.append(texts)
return [[0.1] * 1536 for _ in texts]
return [[0.1] * 1024 for _ in texts]
def embed_query(self, text: str) -> list[float]:
return [0.2] * 1536
return [0.2] * 1024
class FakeVectorIndex:
@@ -146,10 +148,10 @@ class FakeVectorIndex:
return []
def health(self) -> dict:
return {"collection_name": "regulations_dense_1536"}
return {"collection_name": "regulations_dense_1024_v1"}
def test_document_command_service_uses_1536_dense_embedding_and_updates_status():
def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
repository = FakeRepository()
binary_store = FakeBinaryStore()
embedding_provider = FakeEmbeddingProvider()
@@ -183,15 +185,16 @@ def test_document_command_service_uses_1536_dense_embedding_and_updates_status()
assert stored.status == DocumentStatus.INDEXED
assert stored.chunk_count == 1
assert stored.parser_name == "fake_parser"
assert stored.index_name == "regulations_dense_1536"
assert stored.index_name == "regulations_dense_1024_v1"
assert stored.metadata["parse_task_id"] == "task-123"
assert stored.metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")
def test_bootstrap_defaults_to_local_parser_and_chunk_builder():
def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
bootstrap.get_parser.cache_clear()
bootstrap.get_chunk_builder.cache_clear()
parser = bootstrap.get_parser()
chunk_builder = bootstrap.get_chunk_builder()
assert parser.__class__.__name__ == "LocalDocumentParser"
assert chunk_builder.__class__.__name__ == "LocalRegulationChunkBuilder"
assert parser.__class__.__name__ == "AliyunDocumentParser"
assert chunk_builder.__class__.__name__ == "AliyunVectorChunkBuilder"

View File

@@ -64,11 +64,16 @@ def verify_migration_config() -> bool:
try:
assert settings.embedding_model == "text-embedding-v3"
assert settings.embedding_dim == 1536
assert settings.milvus_collection == "regulations_dense_1536"
assert settings.embedding_dim == 1024
assert settings.milvus_collection == "regulations_dense_1024_v1"
assert settings.parser_backend == "aliyun"
assert settings.chunk_backend == "aliyun"
logger.info(f"embedding_model={settings.embedding_model}")
logger.info(f"embedding_base_url={settings.embedding_base_url}")
logger.info(f"embedding_dim={settings.embedding_dim}")
logger.info(f"milvus_collection={settings.milvus_collection}")
logger.info(f"parser_backend={settings.parser_backend}")
logger.info(f"chunk_backend={settings.chunk_backend}")
logger.success("migration config ok")
return True
except Exception as exc: