feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
ash66
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions

18
.env
View File

@@ -9,7 +9,7 @@ DEBUG=false
# ===== Milvus向量数据库配置已有===== # ===== Milvus向量数据库配置已有=====
MILVUS_HOST=localhost MILVUS_HOST=localhost
MILVUS_PORT=19530 MILVUS_PORT=19530
MILVUS_COLLECTION=regulations_dense_1536 MILVUS_COLLECTION=regulations_dense_1024_v1
MILVUS_DB_NAME=default MILVUS_DB_NAME=default
# ===== MinIO对象存储配置已有===== # ===== MinIO对象存储配置已有=====
@@ -34,7 +34,7 @@ POSTGRES_DB=compliance_db
# ===== 嵌入模型配置 ===== # ===== 嵌入模型配置 =====
EMBEDDING_MODEL=text-embedding-v3 EMBEDDING_MODEL=text-embedding-v3
EMBEDDING_DIM=1536 EMBEDDING_DIM=1024
EMBEDDING_API_KEY=sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8 EMBEDDING_API_KEY=sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8
EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1 EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1
EMBEDDING_TIMEOUT_SECONDS=120 EMBEDDING_TIMEOUT_SECONDS=120
@@ -59,7 +59,7 @@ LLM_TEMPERATURE=0.7
# 获取API Key: https://dashscope.console.aliyun.com/ # 获取API Key: https://dashscope.console.aliyun.com/
QWEN_API_KEY=sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8 QWEN_API_KEY=sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8
QWEN_BASE_URL=http://6.86.80.4:30080/v1 QWEN_BASE_URL=http://6.86.80.4:30080/v1
QWEN_MODEL=qwen3.5-plus QWEN_MODEL=qwen3.6-plus
QWEN_VL_MODEL=qwen3-vl-plus QWEN_VL_MODEL=qwen3-vl-plus
# ===== DeepSeek API配置 ===== # ===== DeepSeek API配置 =====
@@ -73,3 +73,15 @@ RAG_TOP_K=10
RAG_MAX_CONTEXT_TOKENS=4000 RAG_MAX_CONTEXT_TOKENS=4000
RAG_SUMMARY_MAX_TOKENS=1024 RAG_SUMMARY_MAX_TOKENS=1024
RAG_SKILLS_MAX_TOKENS=2048 RAG_SKILLS_MAX_TOKENS=2048
# ===== 阿里云文档解析 =====
ALIBABA_ACCESS_KEY_ID=LTAI5t9ZjvwSU9bKuMyiExrE
ALIBABA_ACCESS_KEY_SECRET=hNvY6XocmEO6inYlrmiBwBcx5OfidL
ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
ALIYUN_PARSE_POLL_INTERVAL_SECONDS=5
ALIYUN_PARSE_TIMEOUT_SECONDS=900
ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
ALIYUN_LLM_ENHANCEMENT=true
ALIYUN_ENHANCEMENT_MODE=VLM
DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
PARSER_FAILURE_MODE=fail

View File

@@ -4,7 +4,7 @@
# ===== Milvus向量数据库配置已有===== # ===== Milvus向量数据库配置已有=====
MILVUS_HOST=6.86.80.8 MILVUS_HOST=6.86.80.8
MILVUS_PORT=19530 MILVUS_PORT=19530
MILVUS_COLLECTION=regulations_dense_1536 MILVUS_COLLECTION=regulations_dense_1024_v1
MILVUS_DB_NAME=default MILVUS_DB_NAME=default
# ===== MinIO对象存储配置已有===== # ===== MinIO对象存储配置已有=====
@@ -26,4 +26,3 @@ POSTGRES_PORT=5432
POSTGRES_USER=postgresql POSTGRES_USER=postgresql
POSTGRES_PASSWORD=postgresql123456 POSTGRES_PASSWORD=postgresql123456
POSTGRES_DB=compliance_db POSTGRES_DB=compliance_db

View File

@@ -9,12 +9,12 @@ DEBUG=false
# ===== Milvus向量数据库配置 ===== # ===== Milvus向量数据库配置 =====
MILVUS_HOST=localhost MILVUS_HOST=localhost
MILVUS_PORT=19530 MILVUS_PORT=19530
MILVUS_COLLECTION=regulations_dense_1536 MILVUS_COLLECTION=regulations_dense_1024_v1
MILVUS_DB_NAME=default MILVUS_DB_NAME=default
# ===== 嵌入模型配置 ===== # ===== 嵌入模型配置 =====
EMBEDDING_MODEL=text-embedding-v3 EMBEDDING_MODEL=text-embedding-v3
EMBEDDING_DIM=1536 EMBEDDING_DIM=1024
EMBEDDING_API_KEY=your_embedding_api_key_here EMBEDDING_API_KEY=your_embedding_api_key_here
EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1 EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1
EMBEDDING_TIMEOUT_SECONDS=120 EMBEDDING_TIMEOUT_SECONDS=120
@@ -44,11 +44,20 @@ CHUNK_SIZE=512
CHUNK_OVERLAP=50 CHUNK_OVERLAP=50
MAX_FILE_SIZE_MB=100 MAX_FILE_SIZE_MB=100
DOCUMENT_METADATA_PATH=backend/data/documents.json DOCUMENT_METADATA_PATH=backend/data/documents.json
PARSER_BACKEND=aliyun
CHUNK_BACKEND=aliyun
# ===== 阿里云文档解析 ===== # ===== 阿里云文档解析 =====
ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
ALIYUN_PARSE_POLL_INTERVAL_SECONDS=5
ALIYUN_PARSE_TIMEOUT_SECONDS=900
ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
ALIYUN_LLM_ENHANCEMENT=true
ALIYUN_ENHANCEMENT_MODE=VLM
DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
PARSER_FAILURE_MODE=fail
# ===== API服务配置 ===== # ===== API服务配置 =====
API_HOST=0.0.0.0 API_HOST=0.0.0.0
@@ -73,7 +82,7 @@ DEEPSEEK_BASE_URL=http://6.86.80.4:30080/v1
# Qwen系列: qwen3.5-plus, qwen3-plus, qwen-max, qwen-turbo, qwen-long # Qwen系列: qwen3.5-plus, qwen3-plus, qwen-max, qwen-turbo, qwen-long
# Qwen VL系列: qwen3-vl-plus, qwen-vl-max # Qwen VL系列: qwen3-vl-plus, qwen-vl-max
# DeepSeek系列: deepseek-v4-flash, deepseek-v3.2, deepseek-v3, deepseek-chat, deepseek-coder # DeepSeek系列: deepseek-v4-flash, deepseek-v3.2, deepseek-v3, deepseek-chat, deepseek-coder
QWEN_MODEL=qwen3.5-plus QWEN_MODEL=qwen3.6-plus
QWEN_VL_MODEL=qwen3-vl-plus QWEN_VL_MODEL=qwen3-vl-plus
DEEPSEEK_MODEL=deepseek-v4-flash DEEPSEEK_MODEL=deepseek-v4-flash

View File

@@ -106,6 +106,9 @@ ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
EMBEDDING_API_KEY=your_embedding_api_key_here EMBEDDING_API_KEY=your_embedding_api_key_here
EMBEDDING_MODEL=text-embedding-v3 EMBEDDING_MODEL=text-embedding-v3
EMBEDDING_DIM=1536 EMBEDDING_DIM=1536
PARSER_BACKEND=aliyun
CHUNK_BACKEND=aliyun
PARSER_FAILURE_MODE=fail
``` ```
--- ---

View File

@@ -39,7 +39,7 @@ AIRegulation-DocAnalysis-Demo/
### 1. 安装依赖 ### 1. 安装依赖
```bash ```bash
pip install -r backend/requirements.txt ./dev.sh setup
``` ```
### 2. 启动Milvus向量数据库 ### 2. 启动Milvus向量数据库
@@ -57,7 +57,7 @@ docker-compose logs -f milvus
### 3. 启动API服务 ### 3. 启动API服务
```bash ```bash
PYTHONPATH=backend uvicorn app.main:app --reload --port 8000 ./dev.sh start api --foreground
``` ```
访问API文档http://localhost:8000/docs 访问API文档http://localhost:8000/docs
@@ -104,6 +104,8 @@ MILVUS_PORT=19530
# 阿里云文档解析 # 阿里云文档解析
ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
PARSER_BACKEND=aliyun
CHUNK_BACKEND=aliyun
# embedding 配置 # embedding 配置
EMBEDDING_MODEL=text-embedding-v3 EMBEDDING_MODEL=text-embedding-v3
@@ -121,6 +123,17 @@ CHUNK_SIZE=512
- 混合检索问答功能 - 混合检索问答功能
- 法规变更监控与自动更新 - 法规变更监控与自动更新
## 解析产物
上传成功后,系统会把阿里云解析的中间结果持久化到 MinIO
- `artifacts/{doc_id}/layouts.json`
- `artifacts/{doc_id}/structure_nodes.json`
- `artifacts/{doc_id}/semantic_blocks.json`
- `artifacts/{doc_id}/vector_chunks.json`
当前默认 Milvus collection 为 `regulations_dense_1024_v1`
## 许可证 ## 许可证
MIT License MIT License

View File

@@ -1,8 +0,0 @@
{
"permissions": {
"allow": [
"Bash(python3 *)",
"Bash(PGPASSWORD=postgresql123456 psql *)"
]
}
}

View File

@@ -1,516 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Handle Aliyun parsing support for parse pdf."""
import argparse
import json
import os
import re
import time
from pathlib import Path
from typing import Dict, List
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_docmind_api20220711 import models as docmind_models
from alibabacloud_tea_util import models as util_models
# Docmind credentials and endpoint, read from the environment once at import time.
# Empty credentials are rejected later by init_client().
ALIBABA_ACCESS_KEY_ID = os.getenv("ALIBABA_ACCESS_KEY_ID", "")
ALIBABA_ACCESS_KEY_SECRET = os.getenv("ALIBABA_ACCESS_KEY_SECRET", "")
ALIBABA_ENDPOINT = os.getenv("ALIBABA_ENDPOINT", "docmind-api.cn-hangzhou.aliyuncs.com")
# Default chunking parameters, measured in characters: maximum size of one
# retrieval chunk and the overlap carried over between neighbouring chunks.
MAX_CHARS = 600
OVERLAP_CHARS = 80
# Layout classification tables used by the is_* predicates below.
# TOC_TITLES holds the headings that mark a table-of-contents block.
TOC_TITLES = {"目次", "目录"}
TITLE_SUBTYPES = {"doc_title", "para_title"}
TEXT_SUBTYPES = {"para", "none"}
FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def init_client() -> DocmindClient:
    """Build a Docmind client from the module-level credential settings.

    Returns:
        A ``DocmindClient`` configured with the access key pair and endpoint.

    Raises:
        ValueError: If the access key ID or the access key secret is empty.
    """
    credentials_present = bool(ALIBABA_ACCESS_KEY_ID) and bool(ALIBABA_ACCESS_KEY_SECRET)
    if not credentials_present:
        raise ValueError("缺少阿里云文档解析凭据,请设置 ALIBABA_ACCESS_KEY_ID 和 ALIBABA_ACCESS_KEY_SECRET")

    client_config = open_api_models.Config(
        access_key_id=ALIBABA_ACCESS_KEY_ID,
        access_key_secret=ALIBABA_ACCESS_KEY_SECRET,
    )
    client_config.endpoint = ALIBABA_ENDPOINT
    return DocmindClient(client_config)
def submit_job(client: DocmindClient, file_path: str) -> str:
    """Upload a local document to Docmind and return the parser job ID.

    Args:
        client: Docmind client used to submit the job.
        file_path: Path of the local file to upload.

    Returns:
        The ``id`` field of the submission response.

    Raises:
        OSError: If the file cannot be opened.
    """
    source = Path(file_path)
    runtime = util_models.RuntimeOptions()
    # The stream must stay open while the SDK uploads it; the context manager
    # then closes it even when the request raises, so no handle is leaked.
    with source.open("rb") as stream:
        request = docmind_models.SubmitDocParserJobAdvanceRequest(
            file_url_object=stream,
            file_name=source.name,
            file_name_extension=source.suffix.lstrip("."),
            llm_enhancement=True,
            enhancement_mode="VLM",
        )
        response = client.submit_doc_parser_job_advance(request, runtime)
    return response.body.data.id
def query_status(client: DocmindClient, task_id: str) -> Dict:
    """Fetch the current status record of a parser job.

    Args:
        client: Docmind client used for the query.
        task_id: Job ID returned by the submission call.

    Returns:
        The response data converted with ``to_map()``, or ``None`` when the
        response carries no data.
    """
    status_request = docmind_models.QueryDocParserStatusRequest(id=task_id)
    payload = client.query_doc_parser_status(status_request).body.data
    if payload:
        return payload.to_map()
    return None
def wait_for_completion(
    client: DocmindClient,
    task_id: str,
    poll_interval: int = 5,
    timeout_seconds: "float | None" = None,
) -> bool:
    """Poll a parser job until it finishes, fails, or the timeout elapses.

    Args:
        client: Docmind client used for status queries.
        task_id: Job ID returned by the submission call.
        poll_interval: Seconds to sleep between two status queries.
        timeout_seconds: Maximum total seconds to wait. ``None`` (the default)
            waits indefinitely, matching the previous behaviour.

    Returns:
        ``True`` when the job reports success; ``False`` when the status query
        returns no data, the job reports failure, or the timeout is reached.
    """
    deadline = None if timeout_seconds is None else time.monotonic() + timeout_seconds
    while True:
        status_data = query_status(client, task_id)
        if not status_data:
            return False
        # "Status" may be missing or None; normalise before comparing.
        status = str(status_data.get("Status") or "").lower()
        if status == "success":
            return True
        # Accept both spellings so a job reported as "Fail" does not poll forever.
        if status in ("fail", "failed"):
            print(f"任务失败: {status_data}")
            return False
        if deadline is not None and time.monotonic() >= deadline:
            print(f"任务超时: 等待超过 {timeout_seconds} 秒, 最后状态: {status}")
            return False
        print(f"任务状态: {status}, 等待中...")
        time.sleep(poll_interval)
def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_step_size: int = 50) -> Dict:
    """Fetch one page of parser results.

    Args:
        client: Docmind client used for the query.
        task_id: Job ID returned by the submission call.
        layout_num: Value sent as ``layout_num`` (the caller advances it by
            the number of layouts already fetched).
        layout_step_size: Value sent as ``layout_step_size``.

    Returns:
        The response data, or ``None`` when the response carries no data.
    """
    result_request = docmind_models.GetDocParserResultRequest(
        id=task_id,
        layout_step_size=layout_step_size,
        layout_num=layout_num,
    )
    data = client.get_doc_parser_result(result_request).body.data
    return data if data else None
def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: int = 50) -> List[Dict]:
    """Page through the parser result and gather every layout.

    Args:
        client: Docmind client used for the queries.
        task_id: Job ID returned by the submission call.
        layout_step_size: Page size requested on every call.

    Returns:
        All layouts in the order they were returned.
    """
    collected: List[Dict] = []
    offset = 0
    while True:
        page = get_result(client, task_id, offset, layout_step_size)
        batch = page.get("layouts", []) if page else []
        if not batch:
            break
        collected.extend(batch)
        offset += len(batch)
        # A short page means there is nothing left to fetch.
        if len(batch) < layout_step_size:
            break
    return collected
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def normalize_text(text: str) -> str:
    """Normalise whitespace in a text fragment.

    Carriage returns become newlines, full-width and non-breaking spaces
    become plain spaces, runs of newlines collapse to one newline, runs of
    spaces/tabs collapse to one space, and the result is stripped.

    Args:
        text: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The normalised text.
    """
    if not text:
        return ""
    text = text.replace("\r", "\n")
    # Spell the exotic spaces out as escapes: the literal form is invisible in
    # source and silently degrades into a no-op when the file is re-encoded.
    text = text.replace("\u3000", " ").replace("\xa0", " ")
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()
def get_page(layout: Dict) -> int:
    """Return the page number of a layout.

    ``pageNum`` takes precedence; ``pageNumber`` is the fallback, and ``0`` is
    returned when neither key is present.
    """
    if "pageNum" in layout:
        return layout["pageNum"]
    return layout.get("pageNumber", 0)
def get_text(layout: Dict) -> str:
    """Return the normalised text of a layout.

    Uses the ``text`` field when it normalises to a non-empty string,
    otherwise falls back to the normalised ``markdownContent`` field.
    """
    primary = normalize_text(layout.get("text", ""))
    return primary or normalize_text(layout.get("markdownContent", ""))
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def is_title(layout: Dict) -> bool:
    """Tell whether a layout is a heading.

    A layout counts as a heading when its ``type`` is ``"title"`` or its
    ``subType`` is listed in ``TITLE_SUBTYPES``.
    """
    if layout.get("type") == "title":
        return True
    return layout.get("subType") in TITLE_SUBTYPES
def is_text(layout: Dict) -> bool:
    """Tell whether a layout is body text.

    Requires ``type == "text"`` and a ``subType`` (defaulting to ``"none"``)
    listed in ``TEXT_SUBTYPES``.
    """
    if layout.get("type") != "text":
        return False
    return layout.get("subType", "none") in TEXT_SUBTYPES
def is_figure(layout: Dict) -> bool:
    """Tell whether a layout belongs to a figure.

    True when ``type`` is in ``FIGURE_TYPES`` or ``subType`` is in
    ``FIGURE_SUBTYPES``.
    """
    if layout.get("type") in FIGURE_TYPES:
        return True
    return layout.get("subType") in FIGURE_SUBTYPES
def is_table(layout: Dict) -> bool:
    """Tell whether a layout's ``type`` is ``"table"``."""
    layout_type = layout.get("type")
    return layout_type == "table"
def is_toc_layout(layout: Dict) -> bool:
    """Tell whether a layout belongs to the table of contents.

    A layout matches when its text is one of ``TOC_TITLES``, or when it sits
    on page 1 and looks like a numbered entry followed by leader dots and a
    trailing page number.
    """
    content = get_text(layout)
    if content in TOC_TITLES:
        return True
    on_first_page = get_page(layout) == 1
    return on_first_page and re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", content) is not None
def extract_table_text(layout: Dict) -> str:
    """Flatten a table layout into plain text.

    Each entry of ``cells`` contributes one line made of its non-empty nested
    layout texts joined by a space; entries without text are skipped.

    Returns:
        The lines joined by newlines and stripped.
    """
    lines = []
    for cell in layout.get("cells", []):
        normalised = (normalize_text(inner.get("text", "")) for inner in cell.get("layouts", []))
        fragments = [fragment for fragment in normalised if fragment]
        if fragments:
            lines.append(" ".join(fragments))
    return "\n".join(lines).strip()
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
    """Build the structure layer: one node per heading layout.

    Headings with empty text, or whose text is a table-of-contents title,
    are left out.

    Returns:
        Node dicts holding the heading's ID, page, index, level, title, type
        and sub-type, in document order.
    """
    headings = []
    for layout in layouts:
        if is_title(layout):
            title = get_text(layout)
            if title and title not in TOC_TITLES:
                headings.append(
                    {
                        "unique_id": layout.get("uniqueId"),
                        "page": get_page(layout),
                        "index": layout.get("index", 0),
                        "level": layout.get("level", 0),
                        "title": title,
                        "type": layout.get("type"),
                        "sub_type": layout.get("subType"),
                    }
                )
    return headings
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
    """Push a heading onto the section stack, closing finished sections first.

    Entries whose level is greater than or equal to the new heading's level
    are popped before the new heading is appended. The stack is modified in
    place and also returned.
    """
    depth = layout.get("level", 0)
    heading = get_text(layout)
    while section_stack:
        if section_stack[-1]["level"] < depth:
            break
        section_stack.pop()
    entry = {
        "level": depth,
        "title": heading,
        "page": get_page(layout),
        "unique_id": layout.get("uniqueId"),
    }
    section_stack.append(entry)
    return section_stack
def section_path_titles(section_stack: List[Dict]) -> List[str]:
    """Return the titles of the stack entries, outermost section first."""
    titles = []
    for entry in section_stack:
        titles.append(entry["title"])
    return titles
def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id: int) -> int:
    """Merge pending text fragments into one ``section_text`` semantic block.

    Args:
        blocks: Pending fragments (page, text, unique_id and section fields).
        semantic_blocks: Output list; the merged block is appended to it.
        block_id: Number used for the new block's ``semantic_id``.

    Returns:
        ``block_id + 1`` when a block was appended, otherwise ``block_id``
        unchanged (no fragments, or nothing but empty text).
    """
    if not blocks:
        return block_id
    combined = "\n".join(entry["text"] for entry in blocks if entry["text"]).strip()
    if not combined:
        return block_id

    # Section metadata comes from the first fragment of the run.
    lead = blocks[0]
    pages = [entry["page"] for entry in blocks]
    semantic_blocks.append(
        {
            "semantic_id": f"semantic-{block_id}",
            "block_type": "section_text",
            "page_start": min(pages),
            "page_end": max(pages),
            "section_path": lead["section_path"],
            "section_level": lead["section_level"],
            "section_title": lead["section_title"],
            "source_ids": [entry["unique_id"] for entry in blocks if entry.get("unique_id")],
            "text": combined,
        }
    )
    return block_id + 1
def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
    """Build the semantic layer from the raw layouts.

    Consecutive body-text layouts are merged into one ``section_text`` block.
    Tables and figures each become their own block. A heading closes the
    pending text run and updates the current section path. Table-of-contents
    layouts, and the remaining page-1 layouts after one, are skipped.

    Returns:
        Semantic blocks in document order, numbered ``semantic-1``, ``semantic-2``, ...
    """
    results: List[Dict] = []
    heading_stack: List[Dict] = []
    buffered: List[Dict] = []
    next_id = 1
    in_toc = False

    for layout in layouts:
        content = get_text(layout)
        page_no = get_page(layout)

        if is_toc_layout(layout):
            in_toc = True
            continue
        if in_toc:
            # Everything still on page 1 after a TOC hit is treated as TOC.
            if page_no == 1:
                continue
            in_toc = False

        if is_title(layout):
            next_id = flush_text_block(buffered, results, next_id)
            buffered = []
            heading_stack = update_section_path(heading_stack, layout)
            continue

        path = section_path_titles(heading_stack)
        heading = path[-1] if path else "未分类"
        depth = len(path)

        # Table takes precedence over figure when a layout matches both.
        if is_table(layout):
            kind = "table"
        elif is_figure(layout):
            kind = "figure"
        else:
            kind = None

        if kind is not None:
            next_id = flush_text_block(buffered, results, next_id)
            buffered = []
            payload = extract_table_text(layout) if kind == "table" else content
            if payload:
                results.append(
                    {
                        "semantic_id": f"semantic-{next_id}",
                        "block_type": kind,
                        "page_start": page_no,
                        "page_end": page_no,
                        "section_path": path,
                        "section_level": depth,
                        "section_title": heading,
                        "source_ids": [layout.get("uniqueId")],
                        "text": payload,
                    }
                )
                next_id += 1
            continue

        if is_text(layout) and content:
            buffered.append(
                {
                    "page": page_no,
                    "text": content,
                    "unique_id": layout.get("uniqueId"),
                    "section_path": path,
                    "section_level": depth,
                    "section_title": heading,
                }
            )

    # Emit whatever text is still pending after the last layout.
    flush_text_block(buffered, results, next_id)
    return results
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> List[str]:
    """Split text into pieces of at most ``max_chars`` with overlapping edges.

    Args:
        text: Text to split; it is stripped first.
        max_chars: Maximum length of one piece, in characters.
        overlap_chars: Characters shared between two neighbouring pieces.
            Values outside ``0 .. max_chars - 1`` are clamped into that range
            so the window always advances.

    Returns:
        The non-empty stripped pieces in order; ``[]`` for blank text.

    Raises:
        ValueError: If ``max_chars`` is not positive while there is text to split.
    """
    text = text.strip()
    if not text:
        return []
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if len(text) <= max_chars:
        return [text]

    # An overlap >= max_chars would keep the window in place forever, and a
    # negative overlap would skip text, so clamp it to a safe range.
    overlap = min(max(overlap_chars, 0), max_chars - 1)
    step = max_chars - overlap

    parts = []
    for start in range(0, len(text), step):
        end = start + max_chars
        piece = text[start:end].strip()
        if piece:
            parts.append(piece)
        if end >= len(text):
            break
    return parts
def build_vector_chunks(
    semantic_blocks: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> List[Dict]:
    """Build the retrieval layer by slicing semantic blocks into chunks.

    Each block's text is split with ``split_text_with_overlap``. Every piece
    becomes one chunk carrying the block's metadata, plus an
    ``embedding_text`` that prefixes the piece with the document title and,
    when the block has one, its section path.

    Returns:
        Chunks numbered globally from 1 (``chunk-1``, ``chunk-2``, ...).
    """
    chunks: List[Dict] = []
    for block in semantic_blocks:
        fragments = split_text_with_overlap(block["text"], max_chars, overlap_chars)
        if not fragments:
            continue

        path = block["section_path"]
        if path:
            prefix = f"标准:{doc_title}\n章节:{' > '.join(path)}\n\n"
        else:
            prefix = f"标准:{doc_title}\n\n"

        for position, fragment in enumerate(fragments, start=1):
            ordinal = len(chunks) + 1
            chunks.append(
                {
                    "doc_id": doc_id,
                    "doc_title": doc_title,
                    "chunk_id": f"chunk-{ordinal}",
                    "chunk_index": ordinal,
                    "semantic_id": block["semantic_id"],
                    "chunk_type": block["block_type"],
                    "piece_index": position,
                    "page_start": block["page_start"],
                    "page_end": block["page_end"],
                    "section_path": path,
                    "section_level": block["section_level"],
                    "section_title": block["section_title"],
                    "source_ids": block["source_ids"],
                    "text": fragment,
                    "embedding_text": prefix + fragment,
                }
            )
    return chunks
def parse_pdf_to_structured_chunks(
    pdf_path: str,
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int = MAX_CHARS,
    overlap_chars: int = OVERLAP_CHARS,
    poll_interval: int = 5,
) -> Dict:
    """Parse a PDF with Docmind and convert it into the three-layer structure.

    Submits the file, waits for the job, downloads every layout and passes
    them to ``convert_layouts``.

    Raises:
        RuntimeError: If the job does not complete successfully.
    """
    docmind = init_client()
    job_id = submit_job(docmind, pdf_path)
    finished = wait_for_completion(docmind, job_id, poll_interval)
    if not finished:
        raise RuntimeError("阿里云文档解析任务失败")
    return convert_layouts(
        collect_all_results(docmind, job_id),
        doc_id=doc_id,
        doc_title=doc_title,
        max_chars=max_chars,
        overlap_chars=overlap_chars,
    )
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def convert_layouts(
    layouts: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> Dict:
    """Convert raw layouts into structure nodes, semantic blocks and chunks.

    Returns:
        A dict with ``doc_id``, ``doc_title``, ``structure_nodes``,
        ``semantic_blocks`` and ``vector_chunks``, in that key order.
    """
    result: Dict = {"doc_id": doc_id, "doc_title": doc_title}
    result["structure_nodes"] = build_structure_nodes(layouts)
    result["semantic_blocks"] = build_semantic_blocks(layouts)
    result["vector_chunks"] = build_vector_chunks(
        result["semantic_blocks"],
        doc_id=doc_id,
        doc_title=doc_title,
        max_chars=max_chars,
        overlap_chars=overlap_chars,
    )
    return result
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def main() -> None:
    """Command-line entry point: parse one PDF and write the chunk JSON.

    Submits the PDF to Docmind, waits for the job, optionally dumps the raw
    layouts, converts them to the three-layer structure and writes the result
    to the ``--out`` path.

    Raises:
        FileNotFoundError: If the PDF path does not exist.
        SystemExit: With status 1 when the parsing job does not succeed, so
            shell callers can detect the failure.
    """
    parser = argparse.ArgumentParser(description="阿里云文档智能解析 PDF输出三层结构 chunks")
    parser.add_argument("pdf_path", help="PDF 文件路径")
    parser.add_argument("--out", default="vector_chunks.json", help="输出 JSON 文件路径")
    parser.add_argument("--layouts-out", dest="layouts_output", help="输出原始 layouts JSON")
    parser.add_argument("--doc-id", default="GB14747-2006", help="文档 ID")
    parser.add_argument("--doc-title", default="GB 14747—2006 儿童三轮车安全要求", help="文档标题")
    parser.add_argument("--max-chars", type=int, default=MAX_CHARS, help="单个检索 chunk 最大字符数")
    parser.add_argument("--overlap-chars", type=int, default=OVERLAP_CHARS, help="相邻检索 chunk 重叠字符数")
    parser.add_argument("--poll-interval", type=int, default=5, help="轮询间隔(秒)")
    args = parser.parse_args()

    pdf_path = Path(args.pdf_path).expanduser().resolve()
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF 文件不存在: {pdf_path}")

    # Submit the document.
    client = init_client()
    print(f"提交任务: {pdf_path}")
    task_id = submit_job(client, str(pdf_path))
    print(f"任务 ID: {task_id}")

    # Wait for the job; a failure must surface as a non-zero exit status.
    print("等待任务完成...")
    if not wait_for_completion(client, task_id, args.poll_interval):
        print("任务失败,退出")
        raise SystemExit(1)

    # Download every layout.
    print("获取解析结果...")
    layouts = collect_all_results(client, task_id)
    print(f"获取到 {len(layouts)} 个布局块")

    # Optionally keep the raw layouts for debugging or re-conversion.
    if args.layouts_output:
        layouts_path = Path(args.layouts_output).expanduser().resolve()
        layouts_path.write_text(json.dumps(layouts, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"原始 layouts 已写入: {layouts_path}")

    # Convert to the three-layer structure.
    print("转换为三层结构...")
    data = convert_layouts(
        layouts,
        doc_id=args.doc_id,
        doc_title=args.doc_title,
        max_chars=args.max_chars,
        overlap_chars=args.overlap_chars,
    )

    # Write the result and report the size of each layer.
    output_path = Path(args.out).expanduser().resolve()
    output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"结构层节点数: {len(data['structure_nodes'])}")
    print(f"语义层块数: {len(data['semantic_blocks'])}")
    print(f"检索层块数: {len(data['vector_chunks'])}")
    print(f"输出文件: {output_path}")
if __name__ == "__main__":
main()

View File

@@ -1,122 +0,0 @@
-- 法规文档向量检索系统数据库表结构
-- PostgreSQL
-- ==================== Documents table ====================
-- One metadata row per document; doc_id is the key the other tables reference.
CREATE TABLE documents (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) UNIQUE NOT NULL, -- Unique document identifier, e.g. "GB14747-2006"
title VARCHAR(512) NOT NULL, -- Document title
doc_type VARCHAR(32), -- Document type: standard / regulation / specification
standard_number VARCHAR(64), -- Standard number, e.g. "GB 14747-2006"
publish_date DATE, -- Publication date
implement_date DATE, -- Effective date
status VARCHAR(32), -- Status: in force / repealed / revised
source_url VARCHAR(512), -- Source URL
file_path VARCHAR(512), -- Local path of the PDF file
file_size INT, -- File size in bytes
upload_time TIMESTAMP DEFAULT NOW(), -- Upload time
created_at TIMESTAMP DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW() -- Refreshed on UPDATE by the tr_documents_updated_at trigger
);
COMMENT ON TABLE documents IS '文档元数据表';
COMMENT ON COLUMN documents.doc_id IS '文档唯一标识,用于关联 Milvus 和其他表';
COMMENT ON COLUMN documents.standard_number IS '标准编号,如 GB 14747-2006';
-- ==================== Section structure table ====================
-- Heading tree of a document; parent_id points back into this table.
CREATE TABLE sections (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
unique_id VARCHAR(64) NOT NULL, -- Unique identifier returned by Aliyun
level INT NOT NULL, -- Heading level: 1, 2, 3...
title VARCHAR(512) NOT NULL, -- Section title
page INT, -- Page the heading is on
index INT, -- Order within the page
parent_id INT, -- Parent section ID (tree structure)
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_sections_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_sections_parent_id FOREIGN KEY (parent_id) REFERENCES sections(id),
CONSTRAINT uq_sections_doc_unique UNIQUE (doc_id, unique_id)
);
COMMENT ON TABLE sections IS '章节结构表,用于目录导航';
COMMENT ON COLUMN sections.parent_id IS '父章节 ID构建树形结构';
COMMENT ON COLUMN sections.level IS '层级深度1 为最顶层';
-- ==================== Semantic blocks table ====================
-- Full, unsplit content blocks; vector_chunks rows reference them by (doc_id, semantic_id).
CREATE TABLE semantic_blocks (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
semantic_id VARCHAR(64) NOT NULL, -- Semantic block identifier (unique per document)
block_type VARCHAR(32) NOT NULL, -- Type: section_text / table / figure
page_start INT NOT NULL, -- First page
page_end INT NOT NULL, -- Last page
section_id INT, -- Owning section
section_title VARCHAR(512), -- Section title (denormalised for easier querying)
section_level INT, -- Section level
source_ids JSONB, -- Original layout IDs (JSON array)
text TEXT NOT NULL, -- Full content (not split)
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_semantic_blocks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_semantic_blocks_section_id FOREIGN KEY (section_id) REFERENCES sections(id),
CONSTRAINT uq_semantic_blocks_doc_semantic UNIQUE (doc_id, semantic_id)
);
COMMENT ON TABLE semantic_blocks IS '语义块表,用于邻域扩展,恢复完整内容';
COMMENT ON COLUMN semantic_blocks.block_type IS '类型section_text正文、table表格、figure图示';
COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组';
COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容,未被切分';
-- ==================== Vector chunk metadata table ====================
-- Relational mirror of the chunks stored in Milvus, linked to their semantic block.
CREATE TABLE vector_chunks (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
chunk_id VARCHAR(64) NOT NULL, -- Milvus primary key
semantic_id VARCHAR(64) NOT NULL, -- Linked semantic block
chunk_index INT NOT NULL, -- Chunk sequence number (global)
piece_index INT, -- Sequence number within the semantic block
page_start INT,
page_end INT,
section_title VARCHAR(512),
text VARCHAR(2048), -- Chunk text (optional, shortened version for display)
source_ids JSONB, -- Original layout IDs (JSON array)
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_vector_chunks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_vector_chunks_semantic_id FOREIGN KEY (doc_id, semantic_id)
REFERENCES semantic_blocks(doc_id, semantic_id),
CONSTRAINT uq_vector_chunks_doc_chunk UNIQUE (doc_id, chunk_id)
);
COMMENT ON TABLE vector_chunks IS '向量块元数据表,用于快速关联查询';
COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键';
COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号,用于按序拼接';
-- ==================== Indexes ====================
-- Secondary indexes on the foreign-key and filter columns of each table.
CREATE INDEX idx_sections_doc_id ON sections(doc_id);
CREATE INDEX idx_sections_parent_id ON sections(parent_id);
CREATE INDEX idx_sections_level ON sections(level);
CREATE INDEX idx_semantic_blocks_doc_id ON semantic_blocks(doc_id);
CREATE INDEX idx_semantic_blocks_section_id ON semantic_blocks(section_id);
CREATE INDEX idx_semantic_blocks_block_type ON semantic_blocks(block_type);
CREATE INDEX idx_semantic_blocks_semantic_id ON semantic_blocks(semantic_id);
CREATE INDEX idx_vector_chunks_doc_id ON vector_chunks(doc_id);
CREATE INDEX idx_vector_chunks_semantic_id ON vector_chunks(semantic_id);
CREATE INDEX idx_vector_chunks_chunk_id ON vector_chunks(chunk_id);
-- ==================== Trigger: keep updated_at current ====================
-- Sets updated_at to the current time on the row being written.
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = NOW();
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Runs the function before every row update on documents.
CREATE TRIGGER tr_documents_updated_at
BEFORE UPDATE ON documents
FOR EACH ROW EXECUTE FUNCTION update_updated_at();

View File

@@ -1,327 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Handle Aliyun parsing support for upload to milvus."""
import argparse
import json
import time
from pathlib import Path
from typing import List, Dict
import psycopg2
from psycopg2.extras import execute_values
from pymilvus import (
connections,
Collection,
FieldSchema,
CollectionSchema,
DataType,
utility,
)
from openai import OpenAI
# Embedding service settings.
# NOTE(review): RELAY_API_KEY is a credential hard-coded in source control; it should be rotated and read from the environment instead.
RELAY_BASE_URL = "http://6.86.80.4:30080/v1"
RELAY_API_KEY = "sk-5HeY7gfSIlyZMacfuXOf5cphpymsNqufEu1ou4U3avbULcyY"
EMBEDDING_MODEL = "text-embedding-v3" # Model name sent with every embeddings request in get_embeddings_batch().
# Milvus connection target and collection name.
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"
COLLECTION_NAME = "regulation_chunks"
# PostgreSQL connection settings.
# NOTE(review): PG_PASSWORD is also hard-coded; same remediation as the API key above.
PG_HOST = "6.86.80.10"
PG_PORT = 5432
PG_USER = "postgresql"
PG_PASSWORD = "postgresql123456"
PG_DATABASE = "postgres"
# ===================== Embedding =====================
def get_openai_client(api_key: str, base_url: str) -> OpenAI:
    """Create an ``OpenAI`` client bound to the given API key and base URL."""
    client = OpenAI(api_key=api_key, base_url=base_url)
    return client
def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10) -> List[List[float]]:
    """Embed texts in fixed-size batches, printing progress per batch.

    Args:
        client: Client whose ``embeddings.create`` endpoint is called.
        texts: Texts to embed.
        batch_size: Number of texts sent per request.

    Returns:
        One embedding per input text, concatenated in request order.
    """
    vectors: List[List[float]] = []
    offsets = range(0, len(texts), batch_size)
    for batch_no, offset in enumerate(offsets, start=1):
        print(f"Embedding batch {batch_no}/{(len(texts) - 1) // batch_size + 1}...")
        reply = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=texts[offset:offset + batch_size],
        )
        vectors.extend(item.embedding for item in reply.data)
    return vectors
# ===================== Milvus =====================
def init_milvus(host: str, port: str):
    """Open the ``default`` Milvus connection to ``host:port`` and report it."""
    target = {"host": host, "port": port}
    connections.connect("default", **target)
    print(f"已连接 Milvus: {host}:{port}")
def create_collection(name: str, dim: int) -> Collection:
    """Create the chunk collection from scratch and index its vector field.

    An existing collection with the same name is dropped first, so all data
    previously stored in it is lost.

    Args:
        name: Collection name.
        dim: Dimension of the ``embedding`` vector field.

    Returns:
        The new collection with an IVF_FLAT / COSINE index on ``embedding``.
    """
    if utility.has_collection(name):
        print(f"Collection '{name}' 已存在,删除重建")
        utility.drop_collection(name)

    def varchar(field_name: str, max_length: int, **extra) -> FieldSchema:
        return FieldSchema(name=field_name, dtype=DataType.VARCHAR, max_length=max_length, **extra)

    def int64(field_name: str) -> FieldSchema:
        return FieldSchema(name=field_name, dtype=DataType.INT64)

    # Field order matters: insert_chunks() supplies its columns positionally.
    schema_fields = [
        varchar("chunk_id", 64, is_primary=True),
        varchar("doc_id", 128),
        varchar("doc_title", 512),
        int64("chunk_index"),
        varchar("semantic_id", 64),
        varchar("chunk_type", 32),
        int64("page_start"),
        int64("page_end"),
        varchar("section_title", 512),
        varchar("text", 2048),
        varchar("source_ids", 4096),  # JSON-encoded list of source layout IDs
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
    ]
    collection = Collection(name, CollectionSchema(schema_fields, description="法规文档检索 chunks"))

    collection.create_index(
        "embedding",
        {
            "metric_type": "COSINE",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 128},
        },
    )
    print(f"Collection '{name}' 创建完成,索引已建立")
    return collection
def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[List[float]]):
    """Insert chunks and their embeddings into the collection, then flush.

    Args:
        collection: Target collection.
        chunks: Chunk dicts; ``source_ids`` is stored JSON-encoded.
        embeddings: One vector per chunk, in the same order as ``chunks``.
    """
    # Column order follows the field order declared in create_collection().
    scalar_fields = (
        "chunk_id",
        "doc_id",
        "doc_title",
        "chunk_index",
        "semantic_id",
        "chunk_type",
        "page_start",
        "page_end",
        "section_title",
        "text",
    )
    columns = [[chunk[field] for chunk in chunks] for field in scalar_fields]
    columns.append([json.dumps(chunk.get("source_ids", [])) for chunk in chunks])
    columns.append(embeddings)

    collection.insert(columns)
    collection.flush()
    print(f"已插入 {len(chunks)} 个 chunks")
def load_collection(collection: Collection):
    """Call ``load()`` on the collection and print a confirmation."""
    collection.load()
    message = f"Collection 已加载到内存"
    print(message)
# ===================== PostgreSQL =====================
def get_pg_connection(host: str, port: int, user: str, password: str, database: str):
    """Open a PostgreSQL connection with psycopg2 and report the target.

    Returns:
        The open connection object.
    """
    settings = {
        "host": host,
        "port": port,
        "user": user,
        "password": password,
        "database": database,
    }
    connection = psycopg2.connect(**settings)
    print(f"已连接 PostgreSQL: {host}:{port}/{database}")
    return connection
def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
    """Persist a document, its semantic blocks and its vector chunks in PostgreSQL.

    Everything runs in one transaction: the work is committed only after all
    three tables were written, and rolled back if any statement fails.

    Upsert behaviour (as written in the SQL below):
      * ``documents``: on ``doc_id`` conflict only ``title`` and
        ``updated_at`` are refreshed.
      * ``semantic_blocks`` / ``vector_chunks``: on conflict only ``text`` is
        refreshed; other columns keep their stored values.

    Args:
        conn: Open database connection providing ``cursor``, ``commit`` and
            ``rollback``.
        chunks: Vector chunk dictionaries to store in ``vector_chunks``.
        doc_data: Parsed document payload. ``doc_id`` and ``doc_title`` are
            required; ``standard_number`` and ``semantic_blocks`` are optional.

    Raises:
        Exception: Any error raised while building rows or executing SQL is
            re-raised unchanged after the transaction has been rolled back.
    """
    doc_id = doc_data["doc_id"]
    cursor = conn.cursor()
    try:
        # 1) Document row.
        cursor.execute(
            """
            INSERT INTO documents (doc_id, title, standard_number, upload_time)
            VALUES (%s, %s, %s, NOW())
            ON CONFLICT (doc_id) DO UPDATE SET title = EXCLUDED.title, updated_at = NOW()
            """,
            (doc_id, doc_data["doc_title"], doc_data.get("standard_number")),
        )

        # 2) Semantic blocks (optional section of the payload).
        semantic_blocks = doc_data.get("semantic_blocks", [])
        if semantic_blocks:
            block_rows = [
                (
                    doc_id,
                    block["semantic_id"],
                    block["block_type"],
                    block["page_start"],
                    block["page_end"],
                    block.get("section_title"),
                    block.get("section_level"),
                    json.dumps(block.get("source_ids", [])),
                    block["text"],
                )
                for block in semantic_blocks
            ]
            execute_values(
                cursor,
                """
                INSERT INTO semantic_blocks
                (doc_id, semantic_id, block_type, page_start, page_end, section_title, section_level, source_ids, text)
                VALUES %s
                ON CONFLICT (doc_id, semantic_id) DO UPDATE SET text = EXCLUDED.text
                """,
                block_rows,
            )
            print(f"已插入 {len(semantic_blocks)} 个语义块")

        # 3) Vector chunk metadata (the vectors themselves live in Milvus).
        chunk_rows = [
            (
                doc_id,
                chunk["chunk_id"],
                chunk["semantic_id"],
                chunk["chunk_index"],
                chunk.get("piece_index"),
                chunk["page_start"],
                chunk["page_end"],
                chunk.get("section_title"),
                chunk["text"],
                json.dumps(chunk.get("source_ids", [])),
            )
            for chunk in chunks
        ]
        execute_values(
            cursor,
            """
            INSERT INTO vector_chunks
            (doc_id, chunk_id, semantic_id, chunk_index, piece_index, page_start, page_end, section_title, text, source_ids)
            VALUES %s
            ON CONFLICT (doc_id, chunk_id) DO UPDATE SET text = EXCLUDED.text
            """,
            chunk_rows,
        )
        print(f"已插入 {len(chunks)} 个向量块元数据")

        conn.commit()
        print("PostgreSQL 数据插入完成")
    except Exception:
        conn.rollback()
        # Bare raise keeps the original exception and traceback intact.
        raise
    finally:
        cursor.close()
# Keep parser integration steps explicit so external workflow behavior stays traceable.
def load_data(file_path: Path) -> Dict:
    """Read a UTF-8 encoded JSON file and return its decoded content.

    Args:
        file_path: Location of the JSON file.

    Returns:
        The decoded JSON document.
    """
    with file_path.open(encoding="utf-8") as handle:
        return json.load(handle)
def upload_to_milvus_and_pg(
    chunks_file: str,
    api_key: str,
    base_url: str,
    milvus_host: str,
    milvus_port: str,
    collection_name: str,
    batch_size: int,
    pg_host: str,
    pg_port: int,
    pg_user: str,
    pg_password: str,
    pg_database: str,
):
    """Embed the vector chunks of a JSON file and store them in Milvus and PostgreSQL.

    Steps: load the file, embed every chunk's ``embedding_text``, create the
    Milvus collection sized to the returned vector dimension, insert and load
    it, then write document/block/chunk metadata to PostgreSQL.

    Args:
        chunks_file: Path to the JSON file holding a ``vector_chunks`` list.
        api_key: API key for the embedding endpoint.
        base_url: Base URL of the embedding endpoint.
        milvus_host: Milvus host.
        milvus_port: Milvus port.
        collection_name: Name of the Milvus collection to create and fill.
        batch_size: Number of texts sent per embedding request.
        pg_host: PostgreSQL host.
        pg_port: PostgreSQL port.
        pg_user: PostgreSQL user.
        pg_password: PostgreSQL password.
        pg_database: PostgreSQL database name.

    Raises:
        FileNotFoundError: If ``chunks_file`` does not exist.
        ValueError: If the file contains no ``vector_chunks``.
    """
    chunks_path = Path(chunks_file).expanduser().resolve()
    if not chunks_path.exists():
        raise FileNotFoundError(f"文件不存在: {chunks_path}")

    data = load_data(chunks_path)
    chunks = data.get("vector_chunks", [])
    if not chunks:
        raise ValueError("vector_chunks 为空")
    print(f"加载 {len(chunks)} 个 chunks")

    # Set up the embedding client and both storage backends.
    client = get_openai_client(api_key, base_url)
    init_milvus(milvus_host, milvus_port)
    pg_conn = get_pg_connection(pg_host, pg_port, pg_user, pg_password, pg_database)

    # From here on the PostgreSQL connection is open; close it on every exit
    # path so a failing embedding or Milvus step does not leak it.
    try:
        texts = [c["embedding_text"] for c in chunks]
        embeddings = get_embeddings_batch(client, texts, batch_size)
        print(f"生成 {len(embeddings)} 个向量")

        # The collection dimension follows whatever the embedding endpoint returned.
        embedding_dim = len(embeddings[0])
        print(f"Embedding 维度: {embedding_dim}")

        collection = create_collection(collection_name, embedding_dim)
        insert_chunks(collection, chunks, embeddings)
        load_collection(collection)

        insert_chunks_to_pg(pg_conn, chunks, data)
    finally:
        pg_conn.close()

    print("上传完成!")
# ===================== CLI =====================
def main():
    """Parse the command line and run the Milvus/PostgreSQL upload.

    Option defaults come from the module-level configuration constants.
    """
    cli = argparse.ArgumentParser(description="将 vector_chunks 向量化并上传到 Milvus 和 PostgreSQL")
    cli.add_argument("chunks_file", help="vector_chunks.json 文件路径")

    # Declared in help-output order; each entry is (flag, add_argument kwargs).
    option_specs = (
        ("--api-key", {"default": RELAY_API_KEY, "help": "中转站 API Key"}),
        ("--base-url", {"default": RELAY_BASE_URL, "help": "中转站 Base URL"}),
        ("--milvus-host", {"default": MILVUS_HOST, "help": "Milvus host"}),
        ("--milvus-port", {"default": MILVUS_PORT, "help": "Milvus port"}),
        ("--collection", {"default": COLLECTION_NAME, "help": "Milvus collection 名称"}),
        ("--batch-size", {"type": int, "default": 10, "help": "Embedding 批量大小中转站限制最大10"}),
        ("--pg-host", {"default": PG_HOST, "help": "PostgreSQL host"}),
        ("--pg-port", {"type": int, "default": PG_PORT, "help": "PostgreSQL port"}),
        ("--pg-user", {"default": PG_USER, "help": "PostgreSQL user"}),
        ("--pg-password", {"default": PG_PASSWORD, "help": "PostgreSQL password"}),
        ("--pg-database", {"default": PG_DATABASE, "help": "PostgreSQL database"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    # Every parsed attribute maps 1:1 to a keyword of the upload function,
    # except ``collection`` which is called ``collection_name`` there.
    call_kwargs = dict(vars(cli.parse_args()))
    call_kwargs["collection_name"] = call_kwargs.pop("collection")
    upload_to_milvus_and_pg(**call_kwargs)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -1,263 +0,0 @@
# 文档解析与向量检索说明
## 相关文件
- `aliyun_doc_parser.py`:调用阿里云文档智能解析 PDF,生成原始 `layouts.json`
- `layouts_to_vector_chunks.py`:把 `layouts.json` 转成适合向量数据库入库的三层结构
- `layouts.json`:阿里云返回的原始布局结果
- `vector_chunks.json`:转换后的结构化输出
## 一、`layouts.json` 的结构
`layouts.json` 顶层是一个数组,每个元素代表一个布局块(layout)。常见字段如下:
- `type`:主类型,例如 `title`、`text`、`table`、`figure`
- `subType`:更细的语义类型,例如 `doc_title`、`para_title`、`para`、`picture`、`pic_title`、`pic_caption`
- `text`:当前布局块的纯文本
- `markdownContent`:带 markdown 标记的文本
- `pageNum`:页码
- `index`:页内顺序
- `level`:标题层级
- `uniqueId`:布局块唯一标识
- `blocks`:更细粒度的文本与样式信息
- `cells`:表格单元格,仅 `table` 类型存在
这个结构不是简单 OCR 文本流,而是已经带有版面理解和语义分类的结构化数据。
## 二、推荐的三层转换结构
### 1. 结构层 `structure_nodes`
结构层用于恢复文档标题树,不直接作为最终向量检索单元。
示例:
- `1 范围`
- `2 规范性引用文件`
- `3 术语和定义`
- `3.1 儿童三轮车`
- `3.2 轮距`
结构层主要用于给下游 chunk 绑定 `section_path`
### 2. 语义层 `semantic_blocks`
语义层是按文档意义聚合后的内容块,主要分为三类:
- `section_text`:同一章节下连续正文聚合而成
- `table`:表格内容单独成块
- `figure`:图、图名、图注等单独成块
这一层比单 layout 更适合做语义理解,也适合后续做上下文扩展。
### 3. 检索层 `vector_chunks`
检索层是最终写进向量数据库的 chunk。
处理方式:
- 对 `semantic_blocks` 中较短的块直接入库
- 对较长的块按 `max_chars` 再切分
- 相邻切片保留 `overlap_chars` 重叠
- 每个 chunk 都带完整 metadata,便于后续过滤、重排和邻域扩展
## 三、当前转换脚本做了什么
`layouts_to_vector_chunks.py` 当前已经实现:
1. 过滤目录页噪声(如 `目次`)
2. 根据标题层级维护章节路径
3. 将正文聚合成 `section_text`
4. 将表格单独转成 `table`
5. 将图相关内容单独转成 `figure`
6. 对长文本继续切分为最终 `vector_chunks`
7. 为每个检索 chunk 生成 `embedding_text`
## 四、为什么不要直接按 layout 入库
如果把 `layouts.json` 的每条 layout 直接做向量:
- 颗粒度太碎
- 标题和正文容易分离
- 表格会丢失结构上下文
- 图示信息无法完整表达
- 检索命中结果噪声较大
对于标准文档,最合适的单位通常不是“句子”,而是“条款语义块”。
## 五、建议的入库字段
建议向量数据库每条记录至少保存:
- `embedding_text`:用于生成向量
- `text`:原始 chunk 文本
- `chunk_id`
- `semantic_id`
- `chunk_type`:`section_text` / `table` / `figure`
- `section_path`
- `section_title`
- `section_level`
- `page_start`
- `page_end`
- `doc_id`
- `doc_title`
- `source_ids`
其中:
- 向量化字段:`embedding_text`
- 展示字段:`text`
- 检索增强字段:其余 metadata
## 六、推荐的检索方式
不要只做最简单的 top-k 向量搜索,建议采用:
**向量召回 + metadata 重排 + 邻域扩展**
### 1. 向量召回
使用 `vector_chunks[*].embedding_text` 做 embedding,并在向量数据库中检索 top 10 ~ 15 条。
查询时可以对用户问题做轻微改写,例如:
原问题:
`儿童三轮车的定义是什么?`
可改写为:
`请检索 GB 14747—2006 儿童三轮车安全要求 中关于“儿童三轮车定义”的条款、术语、表格或图示说明。`
这样更适合标准文档检索。
### 2. metadata 重排
向量召回后,根据 metadata 做轻量规则重排。
常见规则:
- `chunk_type == section_text`:对定义类、要求类问题优先级更高
- `section_path` 命中查询关键词:例如查询“定义”时,`术语和定义` 章节优先
- `chunk_type == table`:对“尺寸 / 参数 / 数值 / 对照 / 要求”类问题加权
- `chunk_type == figure`:对“图 / 结构 / 状态 / 示意”类问题加权
### 3. 邻域扩展
检索命中的是最终切片,但回答往往需要更完整上下文。
建议命中某个 `vector_chunk` 后:
1. 优先回捞同一个 `semantic_id` 下的所有 chunk
2. 如果还不够,再补充同 `section_path`、相邻页码或相邻 `chunk_index` 的内容
这样可以恢复完整条款,而不是只给模型一小段碎片。
## 七、不同问题的检索重点
### 1. 定义类问题
例如:
- `儿童三轮车的定义是什么?`
- `轮距是什么意思?`
优先检索:
- `section_text`
- `section_path` 中包含 `术语和定义` 的内容
### 2. 要求类问题
例如:
- `外露突出物有什么要求?`
- `辅助推杆有哪些安全要求?`
优先检索:
- `section_text`
- `table`
### 3. 数值 / 尺寸 / 对照类问题
例如:
- `鞍座到脚蹬距离要求是什么?`
- `哪些项目需要满足规定尺寸?`
优先检索:
- `table`
- `section_text`
### 4. 图示说明类问题
例如:
- `正常乘骑状态是什么意思?`
- `图1表示什么?`
优先检索:
- `figure`
- 同章节相邻 `section_text`
## 八、推荐的最终检索流程
建议采用以下固定流程:
1. 对 `vector_chunks.embedding_text` 做 embedding 检索
2. 取 top 10 ~ 15 条候选
3. 用 `chunk_type + section_path` 做规则重排
4. 以 `semantic_id` 为中心回捞完整语义块
5. 选 3 ~ 5 组上下文提供给大模型回答
## 九、给大模型的上下文组织方式
最终不要直接把原始 JSON 扔给模型,建议整理成如下格式:
```text
[命中片段 1]
章节:3 术语和定义 > 3.1 儿童三轮车
页码:1-2
类型:section_text
内容:
......
[命中片段 2]
章节:4 要求 > 4.3 外露突出物
页码:5
类型:section_text
内容:
......
[命中片段 3]
章节:5 试验方法
页码:8
类型:table
内容:
......
```
这种格式更利于模型稳定回答并引用出处。
## 十、转换命令
生成三层结构:
```bash
python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
--layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
--out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json
```
自定义切片大小:
```bash
python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
--layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
--out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json \
--max-chars 500 \
--overlap-chars 80
```

View File

@@ -32,6 +32,10 @@ async def get_config():
"embedding_dim": settings.embedding_dim, "embedding_dim": settings.embedding_dim,
"embedding_base_url": settings.embedding_base_url, "embedding_base_url": settings.embedding_base_url,
"milvus_collection": settings.milvus_collection, "milvus_collection": settings.milvus_collection,
"parser_backend": settings.parser_backend,
"chunk_backend": settings.chunk_backend,
"artifact_prefix": settings.document_parse_artifact_prefix,
"parser_failure_mode": settings.parser_failure_mode,
"llm_provider": settings.llm_provider, "llm_provider": settings.llm_provider,
"llm_model": settings.llm_model, "llm_model": settings.llm_model,
"document_metadata_path": settings.document_metadata_path, "document_metadata_path": settings.document_metadata_path,

View File

@@ -5,6 +5,7 @@ from __future__ import annotations
import os import os
import tempfile import tempfile
import uuid import uuid
import json
from dataclasses import dataclass from dataclasses import dataclass
from loguru import logger from loguru import logger
@@ -16,6 +17,7 @@ from app.domain.documents import (
DocumentParser, DocumentParser,
DocumentRepository, DocumentRepository,
DocumentStatus, DocumentStatus,
ParsedDocument,
) )
from app.domain.retrieval import EmbeddingProvider, VectorIndex from app.domain.retrieval import EmbeddingProvider, VectorIndex
# Keep orchestration logic centralized so use-case flow stays easy to trace. # Keep orchestration logic centralized so use-case flow stays easy to trace.
@@ -54,6 +56,27 @@ class DocumentCommandService:
self.embedding_provider = embedding_provider self.embedding_provider = embedding_provider
self.vector_index = vector_index self.vector_index = vector_index
def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
    """Store the parser's intermediate outputs as JSON objects in the binary store.

    Four artifacts are written per document (raw layouts, structure nodes,
    semantic blocks, vector chunks) under ``<artifact_prefix>/<doc_id>/``,
    where the prefix is read from the parsed document's metadata and
    defaults to ``artifacts``.

    Args:
        doc_id: Identifier of the document the artifacts belong to.
        parsed_document: Parser result supplying the payloads and metadata.

    Returns:
        Mapping from artifact name to the object name it was saved under.
    """
    base_prefix = parsed_document.metadata.get("artifact_prefix", "artifacts").strip("/")
    named_payloads = (
        ("layouts", parsed_document.raw_layouts),
        ("structure_nodes", parsed_document.structure_nodes),
        ("semantic_blocks", parsed_document.semantic_blocks),
        ("vector_chunks", parsed_document.vector_chunks),
    )

    saved_keys: dict[str, str] = {}
    for artifact_type, content in named_payloads:
        key = f"{base_prefix}/{doc_id}/{artifact_type}.json"
        # ensure_ascii=False keeps non-ASCII text readable in the stored JSON.
        encoded = json.dumps(content, ensure_ascii=False, indent=2).encode("utf-8")
        self.binary_store.save(
            object_name=key,
            data=encoded,
            content_type="application/json",
            metadata={"doc_id": doc_id, "artifact_type": artifact_type},
        )
        saved_keys[artifact_type] = key
    return saved_keys
def upload_and_process( def upload_and_process(
self, self,
*, *,
@@ -104,11 +127,21 @@ class DocumentCommandService:
doc_id=doc_id, doc_id=doc_id,
doc_name=final_doc_name, doc_name=final_doc_name,
) )
artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
self.document_repository.update_status( self.document_repository.update_status(
doc_id, doc_id,
DocumentStatus.PARSED, DocumentStatus.PARSED,
parser_name=parsed_document.parser_name, parser_name=parsed_document.parser_name,
metadata={"structure_nodes": len(parsed_document.structure_nodes)}, metadata={
"parser_backend": parsed_document.parser_name,
"parse_task_id": parsed_document.metadata.get("task_id", ""),
"layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
"structure_node_count": len(parsed_document.structure_nodes),
"semantic_block_count": len(parsed_document.semantic_blocks),
"vector_chunk_count": len(parsed_document.vector_chunks),
"artifact_keys": artifact_keys,
"processing_stage": "parsed",
},
) )
chunks = self.chunk_builder.build( chunks = self.chunk_builder.build(
@@ -124,13 +157,18 @@ class DocumentCommandService:
if inserted != len(chunks): if inserted != len(chunks):
logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks)) logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))
health = self.vector_index.health()
self.document_repository.update_status( self.document_repository.update_status(
doc_id, doc_id,
DocumentStatus.INDEXED, DocumentStatus.INDEXED,
chunk_count=len(chunks), chunk_count=len(chunks),
summary="", summary="",
summary_latency_ms=0, summary_latency_ms=0,
index_name=self.vector_index.health().get("collection_name", ""), index_name=health.get("collection_name", ""),
metadata={
"index_collection": health.get("collection_name", ""),
"processing_stage": "indexed",
},
) )
stored = self.document_repository.get(doc_id) stored = self.document_repository.get(doc_id)
return DocumentProcessResult( return DocumentProcessResult(
@@ -148,6 +186,10 @@ class DocumentCommandService:
doc_id, doc_id,
DocumentStatus.FAILED, DocumentStatus.FAILED,
error_message=str(exc), error_message=str(exc),
metadata={
"failure_reason": str(exc),
"processing_stage": "failed",
},
) )
return DocumentProcessResult( return DocumentProcessResult(
doc_id=doc_id, doc_id=doc_id,

View File

@@ -1,9 +1,9 @@
"""Configure backend settings for settings.""" """Configure backend settings for the backend application."""
from pathlib import Path from pathlib import Path
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import Field from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
from functools import lru_cache from functools import lru_cache
# Keep configuration setup explicit so runtime behavior is easy to reason about. # Keep configuration setup explicit so runtime behavior is easy to reason about.
@@ -33,18 +33,25 @@ class Settings(BaseSettings):
# Keep configuration setup explicit so runtime behavior is easy to reason about. # Keep configuration setup explicit so runtime behavior is easy to reason about.
milvus_host: str = Field(default="localhost", description="Milvus服务地址") milvus_host: str = Field(default="localhost", description="Milvus服务地址")
milvus_port: int = Field(default=19530, description="Milvus服务端口") milvus_port: int = Field(default=19530, description="Milvus服务端口")
milvus_collection: str = Field(default="regulations_dense_1536", description="法规向量集合名称") milvus_collection: str = Field(default="regulations_dense_1024_v1", description="法规向量集合名称")
milvus_db_name: str = Field(default="default", description="Milvus数据库名称") milvus_db_name: str = Field(default="default", description="Milvus数据库名称")
# Keep configuration setup explicit so runtime behavior is easy to reason about. # Keep configuration setup explicit so runtime behavior is easy to reason about.
embedding_model: str = Field(default="text-embedding-v3", description="嵌入模型名称") embedding_model: str = Field(default="text-embedding-v3", description="嵌入模型名称")
embedding_dim: int = Field(default=1536, description="嵌入向量维度") embedding_dim: int = Field(default=1024, description="嵌入向量维度")
embedding_api_key: str = Field(default="", description="Embedding API密钥") embedding_api_key: str = Field(default="", description="Embedding API密钥")
embedding_base_url: str = Field(default="http://6.86.80.4:30080/v1", description="Embedding API地址") embedding_base_url: str = Field(default="http://6.86.80.4:30080/v1", description="Embedding API地址")
embedding_timeout_seconds: int = Field(default=120, description="Embedding API超时时间(秒)") embedding_timeout_seconds: int = Field(default=120, description="Embedding API超时时间(秒)")
alibaba_access_key_id: str = Field(default="", description="阿里云文档解析 Access Key ID") alibaba_access_key_id: str = Field(default="", description="阿里云文档解析 Access Key ID")
alibaba_access_key_secret: str = Field(default="", description="阿里云文档解析 Access Key Secret") alibaba_access_key_secret: str = Field(default="", description="阿里云文档解析 Access Key Secret")
alibaba_endpoint: str = Field(default="docmind-api.cn-hangzhou.aliyuncs.com", description="阿里云文档解析 endpoint") alibaba_endpoint: str = Field(default="docmind-api.cn-hangzhou.aliyuncs.com", description="阿里云文档解析 endpoint")
aliyun_parse_poll_interval_seconds: int = Field(default=5, description="阿里云文档解析轮询间隔(秒)")
aliyun_parse_timeout_seconds: int = Field(default=900, description="阿里云文档解析超时时间(秒)")
aliyun_parse_layout_step_size: int = Field(default=50, description="阿里云文档解析分页步长")
aliyun_llm_enhancement: bool = Field(default=True, description="是否启用阿里云解析增强")
aliyun_enhancement_mode: str = Field(default="VLM", description="阿里云解析增强模式")
document_parse_artifact_prefix: str = Field(default="artifacts", description="解析产物对象前缀")
parser_failure_mode: str = Field(default="fail", description="解析失败策略")
# Keep configuration setup explicit so runtime behavior is easy to reason about. # Keep configuration setup explicit so runtime behavior is easy to reason about.
minio_endpoint: str = Field(default="localhost:9000", description="MinIO服务地址") minio_endpoint: str = Field(default="localhost:9000", description="MinIO服务地址")
@@ -71,8 +78,8 @@ class Settings(BaseSettings):
chunk_overlap: int = Field(default=50, description="分块重叠大小") chunk_overlap: int = Field(default=50, description="分块重叠大小")
max_file_size_mb: int = Field(default=100, description="最大文件大小(MB)") max_file_size_mb: int = Field(default=100, description="最大文件大小(MB)")
document_metadata_path: str = Field(default="backend/data/documents.json", description="文档元数据存储路径") document_metadata_path: str = Field(default="backend/data/documents.json", description="文档元数据存储路径")
parser_backend: str = Field(default="local", description="解析后端(local/aliyun)") parser_backend: str = Field(default="aliyun", description="解析后端(local/aliyun)")
chunk_backend: str = Field(default="local", description="分块后端(local/aliyun)") chunk_backend: str = Field(default="aliyun", description="分块后端(local/aliyun)")
# Keep configuration setup explicit so runtime behavior is easy to reason about. # Keep configuration setup explicit so runtime behavior is easy to reason about.
api_host: str = Field(default="0.0.0.0", description="API服务地址") api_host: str = Field(default="0.0.0.0", description="API服务地址")

View File

@@ -27,12 +27,12 @@ class Settings(BaseSettings):
# Milvus # Milvus
milvus_host: str = "localhost" milvus_host: str = "localhost"
milvus_port: int = 19530 milvus_port: int = 19530
milvus_collection: str = "regulations_dense_1536" milvus_collection: str = "regulations_dense_1024_v1"
# LLM / embedding defaults aligned with the migrated backend path. # LLM / embedding defaults aligned with the migrated backend path.
llm_model: str = "qwen-max" llm_model: str = "qwen-max"
embedding_model: str = "text-embedding-v3" embedding_model: str = "text-embedding-v3"
embedding_dim: int = 1536 embedding_dim: int = 1024
# Legacy workflow compatibility only. # Legacy workflow compatibility only.
vector_top_k: int = 10 vector_top_k: int = 10
@@ -47,7 +47,7 @@ class Settings(BaseSettings):
api_port: int = 8000 api_port: int = 8000
# Legacy aliases retained for old utility modules. # Legacy aliases retained for old utility modules.
regulations_collection: str = "regulations_dense_1536" regulations_collection: str = "regulations_dense_1024_v1"
compliance_collection: str = "compliance_cache" compliance_collection: str = "compliance_cache"
# Preserve the legacy module API while keeping env resolution centralized at the repo root. # Preserve the legacy module API while keeping env resolution centralized at the repo root.

View File

@@ -56,6 +56,7 @@ class ParsedDocument:
vector_chunks: list[dict[str, Any]] vector_chunks: list[dict[str, Any]]
parser_name: str parser_name: str
raw_text: str = "" raw_text: str = ""
raw_layouts: list[dict[str, Any]] = field(default_factory=list)
metadata: dict[str, Any] = field(default_factory=dict) metadata: dict[str, Any] = field(default_factory=dict)

View File

@@ -10,6 +10,8 @@ from app.config.settings import settings
from app.domain.retrieval import EmbeddingProvider from app.domain.retrieval import EmbeddingProvider
# Keep adapter behavior explicit so integration details remain easy to audit. # Keep adapter behavior explicit so integration details remain easy to audit.
EMBEDDING_BATCH_SIZE = 8
class OpenAICompatibleEmbeddingProvider(EmbeddingProvider): class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
@@ -27,6 +29,18 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
self.timeout = settings.embedding_timeout_seconds self.timeout = settings.embedding_timeout_seconds
self.dimension = settings.embedding_dim self.dimension = settings.embedding_dim
def _raise_for_status(self, response: httpx.Response, *, batch_size: int) -> None:
    """Re-raise HTTP error responses with request context in the message.

    Args:
        response: Response returned by the embedding endpoint.
        batch_size: Number of texts sent in the request, included for diagnosis.

    Raises:
        httpx.HTTPStatusError: If the response has an error status. The
            message carries the model, batch size, status code, URL and the
            first 500 characters of the response body; the original error is
            chained as the cause.
    """
    try:
        response.raise_for_status()
    except httpx.HTTPStatusError as exc:
        body_excerpt = response.text[:500].strip()
        context = ", ".join(
            (
                f"model={self.model}",
                f"batch_size={batch_size}",
                f"status={response.status_code}",
                f"url={response.request.url}",
                f"response={body_excerpt}",
            )
        )
        raise httpx.HTTPStatusError(
            f"Embedding request failed for {context}",
            request=exc.request,
            response=exc.response,
        ) from exc
def _request(self, texts: list[str]) -> list[list[float]]: def _request(self, texts: list[str]) -> list[list[float]]:
"""Handle request for this module for the Open A I Compatible Embedding Provider instance.""" """Handle request for this module for the Open A I Compatible Embedding Provider instance."""
if not self.api_key: if not self.api_key:
@@ -40,7 +54,7 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
json={"model": self.model, "input": texts}, json={"model": self.model, "input": texts},
timeout=self.timeout, timeout=self.timeout,
) )
response.raise_for_status() self._raise_for_status(response, batch_size=len(texts))
data = response.json() data = response.json()
vectors = [item["embedding"] for item in sorted(data.get("data", []), key=lambda item: item["index"])] vectors = [item["embedding"] for item in sorted(data.get("data", []), key=lambda item: item["index"])]
if any(len(vector) != self.dimension for vector in vectors): if any(len(vector) != self.dimension for vector in vectors):
@@ -51,7 +65,12 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
"""Embed texts for the Open A I Compatible Embedding Provider instance.""" """Embed texts for the Open A I Compatible Embedding Provider instance."""
if not texts: if not texts:
return [] return []
return self._request(texts) vectors: list[list[float]] = []
# Batch requests conservatively because some gateways reject larger embedding payloads.
for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
batch = texts[start:start + EMBEDDING_BATCH_SIZE]
vectors.extend(self._request(batch))
return vectors
def embed_query(self, text: str) -> list[float]: def embed_query(self, text: str) -> list[float]:
"""Embed query for the Open A I Compatible Embedding Provider instance.""" """Embed query for the Open A I Compatible Embedding Provider instance."""

View File

@@ -0,0 +1,142 @@
"""Aliyun Docmind gateway helpers for the document ingest pipeline."""
from __future__ import annotations
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from alibabacloud_docmind_api20220711 import models as docmind_models
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_tea_util import models as util_models
from app.config.settings import settings
# Keep provider-specific behavior isolated so the rest of the backend can stay stable.
@dataclass
class AliyunParsePayload:
    """Represent the raw Aliyun parse payload returned by the gateway.

    Built by ``AliyunDocmindGateway.parse_document`` once a parse task has
    finished and all of its layout pages have been collected.
    """

    # Identifier of the submitted Docmind parse task.
    task_id: str
    # Layout records gathered from every result page of the task.
    layouts: list[dict[str, Any]]
    # Number of status queries issued before the task reported success.
    poll_attempts: int
    # Elapsed time from just before submission until results were collected, in milliseconds.
    duration_ms: int
class AliyunDocmindGateway:
    """Submit, poll, and collect results from the Aliyun Docmind API.

    The gateway runs the asynchronous parse workflow end to end: upload the
    file as a parse job, poll its status until it finishes, then page through
    the layout results.
    """

    def __init__(self) -> None:
        """Initialize the gateway with runtime configuration from ``settings``."""
        self.endpoint = settings.alibaba_endpoint
        self.poll_interval_seconds = settings.aliyun_parse_poll_interval_seconds
        self.timeout_seconds = settings.aliyun_parse_timeout_seconds
        self.layout_step_size = settings.aliyun_parse_layout_step_size
        self.llm_enhancement = settings.aliyun_llm_enhancement
        self.enhancement_mode = settings.aliyun_enhancement_mode

    def parse_document(self, *, file_path: str) -> AliyunParsePayload:
        """Parse a single document and return the collected layouts.

        Args:
            file_path: Local path of the document to upload.

        Returns:
            Payload with the task id, all layouts, the number of status polls
            and the total duration in milliseconds.

        Raises:
            ValueError: If the Aliyun credentials are not configured.
            RuntimeError: If the job cannot be submitted, fails, or yields no layouts.
            TimeoutError: If the job does not finish within the configured timeout.
        """
        client = self._create_client()
        started_at = time.monotonic()
        task_id = self._submit_job(client=client, file_path=file_path)
        poll_attempts = self._wait_for_completion(client=client, task_id=task_id, started_at=started_at)
        layouts = self._collect_all_results(client=client, task_id=task_id)
        duration_ms = int((time.monotonic() - started_at) * 1000)
        return AliyunParsePayload(
            task_id=task_id,
            layouts=layouts,
            poll_attempts=poll_attempts,
            duration_ms=duration_ms,
        )

    def _create_client(self) -> DocmindClient:
        """Create a Docmind client using explicit AccessKey settings only.

        Raises:
            ValueError: If either the access key id or secret is empty.
        """
        config = open_api_models.Config()
        config.endpoint = self.endpoint
        if not settings.alibaba_access_key_id or not settings.alibaba_access_key_secret:
            raise ValueError(
                "Missing Aliyun parser credentials. Set ALIBABA_ACCESS_KEY_ID and "
                "ALIBABA_ACCESS_KEY_SECRET in the project root .env."
            )
        # Keep production behavior deterministic by using only project-configured credentials.
        config.access_key_id = settings.alibaba_access_key_id
        config.access_key_secret = settings.alibaba_access_key_secret
        return DocmindClient(config)

    def _submit_job(self, *, client: DocmindClient, file_path: str) -> str:
        """Submit an asynchronous Docmind parse job and return its task id.

        Raises:
            RuntimeError: If the response carries no task id.
        """
        path = Path(file_path)
        # The stream must stay open until the SDK call has uploaded the file.
        with open(file_path, "rb") as file_stream:
            request = docmind_models.SubmitDocParserJobAdvanceRequest(
                file_url_object=file_stream,
                file_name=path.name,
                file_name_extension=path.suffix.lstrip("."),
                llm_enhancement=self.llm_enhancement,
                enhancement_mode=self.enhancement_mode,
            )
            runtime = util_models.RuntimeOptions()
            response = client.submit_doc_parser_job_advance(request, runtime)
        task_id = response.body.data.id if response.body and response.body.data else ""
        if not task_id:
            raise RuntimeError("Aliyun Docmind did not return a parse task id.")
        return task_id

    def _query_status(self, *, client: DocmindClient, task_id: str) -> dict[str, Any] | None:
        """Query the current parse status; return ``None`` when the response has no data."""
        request = docmind_models.QueryDocParserStatusRequest(id=task_id)
        response = client.query_doc_parser_status(request)
        return response.body.data.to_map() if response.body and response.body.data else None

    def _wait_for_completion(self, *, client: DocmindClient, task_id: str, started_at: float) -> int:
        """Poll until the parse job finishes or times out.

        Args:
            client: Docmind client used for the status queries.
            task_id: Identifier of the submitted parse task.
            started_at: ``time.monotonic()`` reading the timeout is measured from.

        Returns:
            Number of status queries issued, including the successful one.

        Raises:
            RuntimeError: If a status payload is empty or the task reports failure.
            TimeoutError: If the task is still unfinished after ``timeout_seconds``.
        """
        poll_attempts = 0
        while True:
            poll_attempts += 1
            status_payload = self._query_status(client=client, task_id=task_id)
            if not status_payload:
                raise RuntimeError(f"Aliyun parse status payload is empty for task {task_id}.")
            # Compare case-insensitively so spelling variants of a status still match.
            status = str(status_payload.get("Status", "")).lower()
            if status == "success":
                return poll_attempts
            # The service is understood to report failure as "Fail" (verify against
            # the API reference); "failed" is kept so the previous check still holds.
            # Matching only "failed" would let a failed task spin until the timeout.
            if status in ("fail", "failed"):
                raise RuntimeError(f"Aliyun parse task failed: {status_payload}")
            elapsed = time.monotonic() - started_at
            if elapsed > self.timeout_seconds:
                raise TimeoutError(
                    f"Aliyun parse task timed out after {self.timeout_seconds}s: task_id={task_id}"
                )
            time.sleep(self.poll_interval_seconds)

    def _collect_all_results(self, *, client: DocmindClient, task_id: str) -> list[dict[str, Any]]:
        """Collect all paginated layout results from a completed parse task.

        Raises:
            RuntimeError: If no layouts were returned at all.
        """
        all_layouts: list[dict[str, Any]] = []
        layout_num = 0
        while True:
            request = docmind_models.GetDocParserResultRequest(
                id=task_id,
                layout_step_size=self.layout_step_size,
                layout_num=layout_num,
            )
            response = client.get_doc_parser_result(request)
            payload = response.body.data if response.body else None
            if not payload:
                break
            layouts = payload.get("layouts", [])
            if not layouts:
                break
            all_layouts.extend(layouts)
            # Advance the offset by what was actually received.
            layout_num += len(layouts)
            # A short page means the last page has been reached.
            if len(layouts) < self.layout_step_size:
                break
        if not all_layouts:
            raise RuntimeError(f"Aliyun parse task returned no layouts: task_id={task_id}")
        return all_layouts

View File

@@ -1,19 +1,18 @@
"""Implement infrastructure support for aliyun document parser.""" """Implement infrastructure support for Aliyun document parsing."""
from __future__ import annotations from __future__ import annotations
from app.aliyun_parser.parse_pdf import ( from app.config.settings import settings
from app.domain.documents import DocumentParser, ParsedDocument
from app.infrastructure.parser.aliyun_docmind_gateway import AliyunDocmindGateway
from app.infrastructure.parser.aliyun_layout_normalizer import (
MAX_CHARS, MAX_CHARS,
OVERLAP_CHARS, OVERLAP_CHARS,
build_semantic_blocks, build_semantic_blocks,
build_structure_nodes, build_structure_nodes,
build_vector_chunks, build_vector_chunks,
collect_all_results,
init_client,
submit_job,
wait_for_completion,
) )
from app.domain.documents import DocumentParser, ParsedDocument
# Keep adapter behavior explicit so integration details remain easy to audit. # Keep adapter behavior explicit so integration details remain easy to audit.
@@ -22,13 +21,14 @@ class AliyunDocumentParser(DocumentParser):
"""Provide the Aliyun Document Parser parser.""" """Provide the Aliyun Document Parser parser."""
parser_name = "aliyun_docmind" parser_name = "aliyun_docmind"
def __init__(self) -> None:
"""Initialize the parser adapter and its gateway dependency."""
self.gateway = AliyunDocmindGateway()
def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument: def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
"""Handle parse for the Aliyun Document Parser instance.""" """Handle parse for the Aliyun Document Parser instance."""
client = init_client() payload = self.gateway.parse_document(file_path=file_path)
task_id = submit_job(client, file_path) layouts = payload.layouts
if not wait_for_completion(client, task_id):
raise RuntimeError("阿里云文档解析任务失败")
layouts = collect_all_results(client, task_id)
structure_nodes = build_structure_nodes(layouts) structure_nodes = build_structure_nodes(layouts)
semantic_blocks = build_semantic_blocks(layouts) semantic_blocks = build_semantic_blocks(layouts)
vector_chunks = build_vector_chunks( vector_chunks = build_vector_chunks(
@@ -51,5 +51,13 @@ class AliyunDocumentParser(DocumentParser):
vector_chunks=vector_chunks, vector_chunks=vector_chunks,
parser_name=self.parser_name, parser_name=self.parser_name,
raw_text=raw_text, raw_text=raw_text,
metadata={"task_id": task_id, "layout_count": len(layouts)}, raw_layouts=layouts,
metadata={
"task_id": payload.task_id,
"layout_count": len(layouts),
"poll_attempts": payload.poll_attempts,
"duration_ms": payload.duration_ms,
"parser_backend": self.parser_name,
"artifact_prefix": settings.document_parse_artifact_prefix,
},
) )

View File

@@ -0,0 +1,336 @@
"""Normalize Aliyun Docmind layouts into production document structures."""
from __future__ import annotations
import re
from typing import Any
# Keep layout normalization rules centralized so parser and demos stay aligned.
# Chunking defaults exported for callers; presumably supplied as the
# max_chars / overlap_chars arguments of build_vector_chunks — confirm at the
# call site.
MAX_CHARS = 600
OVERLAP_CHARS = 80
# Exact texts that mark a table-of-contents heading; used by is_toc_layout and
# skipped by build_structure_nodes.
TOC_TITLES = {"目次", "目录"}
# subType values that is_title accepts as headings.
TITLE_SUBTYPES = {"doc_title", "para_title"}
# subType values that is_text accepts as plain paragraph text.
TEXT_SUBTYPES = {"para", "none"}
# type / subType values that is_figure accepts as figure-related content.
FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
def normalize_text(text: str) -> str:
    """Normalize raw text content emitted by the parser.

    Carriage returns become newlines, no-break (U+00A0) and ideographic
    (U+3000) spaces become plain spaces, runs of newlines collapse to a single
    newline, runs of spaces/tabs collapse to a single space, and the result is
    stripped of leading/trailing whitespace.

    Args:
        text: Raw text taken from a layout record.

    Returns:
        The cleaned text, or an empty string when only whitespace remains.
    """
    text = text.replace("\r", "\n")
    # Spell the special spaces as escapes: a literal invisible character is
    # indistinguishable from an ordinary space in editors and diffs, which
    # makes the replacement look like (or silently become) a no-op.
    text = text.replace("\u00a0", " ").replace("\u3000", " ")
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()
def get_page(layout: dict[str, Any]) -> int:
    """Return the page number for a layout record.

    ``pageNum`` is preferred, ``pageNumber`` is the fallback, and ``0`` is
    returned when neither carries a value.

    Args:
        layout: A single layout record.

    Returns:
        The page number, or ``0`` when the record has none.
    """
    for key in ("pageNum", "pageNumber"):
        page = layout.get(key)
        # Compare against None explicitly: page 0 is a legitimate value, while
        # a key that is present but null must fall through so callers that
        # take min()/max() over pages never receive None.
        if page is not None:
            return page
    return 0
def get_text(layout: dict[str, Any]) -> str:
    """Return the most useful text content for a layout record.

    The normalized ``text`` field wins; when it is empty the normalized
    ``markdownContent`` field is used instead.

    Args:
        layout: A single layout record.

    Returns:
        The first non-empty normalized value, or an empty string.
    """
    for key in ("text", "markdownContent"):
        # ``or ""`` covers keys that are present but null; a dict.get default
        # only applies to missing keys, and normalize_text needs a str.
        cleaned = normalize_text(layout.get(key) or "")
        if cleaned:
            return cleaned
    return ""
def is_title(layout: dict[str, Any]) -> bool:
    """Tell whether a layout record is a heading.

    A record qualifies when its ``type`` is ``"title"`` or its ``subType`` is
    listed in ``TITLE_SUBTYPES``.
    """
    if layout.get("type") == "title":
        return True
    return layout.get("subType") in TITLE_SUBTYPES
def is_text(layout: dict[str, Any]) -> bool:
    """Tell whether a layout record is plain paragraph text.

    The ``type`` must be ``"text"`` and the ``subType`` (treated as ``"none"``
    when the key is missing) must be listed in ``TEXT_SUBTYPES``.
    """
    if layout.get("type") != "text":
        return False
    return layout.get("subType", "none") in TEXT_SUBTYPES
def is_figure(layout: dict[str, Any]) -> bool:
    """Tell whether a layout record is figure-related content.

    Either the ``type`` is listed in ``FIGURE_TYPES`` or the ``subType`` is
    listed in ``FIGURE_SUBTYPES``.
    """
    if layout.get("type") in FIGURE_TYPES:
        return True
    return layout.get("subType") in FIGURE_SUBTYPES
def is_table(layout: dict[str, Any]) -> bool:
    """Tell whether a layout record is a table (its ``type`` is ``"table"``)."""
    layout_type = layout.get("type")
    return layout_type == "table"
def is_toc_layout(layout: dict[str, Any]) -> bool:
    """Tell whether a layout record looks like table-of-contents material.

    Two cases match: the record's text is exactly one of ``TOC_TITLES``, or
    the record sits on page 1 and its text has the shape of a contents entry
    (section number, title, a run of leader dots, trailing page number).
    """
    content = get_text(layout)
    if content in TOC_TITLES:
        return True
    # The entry pattern is only consulted for records reported on page 1.
    return (
        get_page(layout) == 1
        and re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", content) is not None
    )
def extract_table_text(layout: dict[str, Any]) -> str:
    """Flatten a table layout into plain text suitable for retrieval.

    Each entry of ``cells`` contributes one output line made of the
    normalized, non-empty texts of its nested ``layouts`` joined by single
    spaces. Cells without any text contribute nothing.
    """
    lines: list[str] = []
    for cell in layout.get("cells", []):
        fragments = [
            normalize_text(inner.get("text", ""))
            for inner in cell.get("layouts", [])
        ]
        joined = " ".join(fragment for fragment in fragments if fragment)
        if joined:
            lines.append(joined)
    return "\n".join(lines).strip()
def build_structure_nodes(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collect heading layouts into the title hierarchy for downstream storage.

    Non-title records, titles without text, and titles whose text is one of
    ``TOC_TITLES`` are left out. Every remaining title yields one node.
    """
    nodes: list[dict[str, Any]] = []
    for layout in layouts:
        # Non-title records get an empty title and fall into the skip below.
        title = get_text(layout) if is_title(layout) else ""
        if not title or title in TOC_TITLES:
            continue
        node = {
            "unique_id": layout.get("uniqueId"),
            "page": get_page(layout),
            "index": layout.get("index", 0),
            "level": layout.get("level", 0),
            "title": title,
            "type": layout.get("type"),
            "sub_type": layout.get("subType"),
        }
        nodes.append(node)
    return nodes
def update_section_path(
    section_stack: list[dict[str, Any]],
    layout: dict[str, Any],
) -> list[dict[str, Any]]:
    """Push a newly seen title onto the heading stack.

    Entries whose level is the same as or deeper than the new title are popped
    first. The stack is modified in place and the same list is returned.
    """
    level = layout.get("level", 0)
    entry = {
        "level": level,
        "title": get_text(layout),
        "page": get_page(layout),
        "unique_id": layout.get("uniqueId"),
    }
    # Unwind until the top of the stack is a strict ancestor of the new title.
    while section_stack and section_stack[-1]["level"] >= level:
        section_stack.pop()
    section_stack.append(entry)
    return section_stack
def section_path_titles(section_stack: list[dict[str, Any]]) -> list[str]:
    """Project the heading stack onto its titles, outermost first."""
    titles: list[str] = []
    for entry in section_stack:
        titles.append(entry["title"])
    return titles
def flush_text_block(
    blocks: list[dict[str, Any]],
    semantic_blocks: list[dict[str, Any]],
    block_id: int,
) -> int:
    """Merge buffered paragraph records into one ``section_text`` block.

    The non-empty texts are joined with newlines. Section fields come from the
    first buffered record, the page range spans all buffered records, and
    ``source_ids`` keeps only truthy ``unique_id`` values. The new block is
    appended to ``semantic_blocks``.

    Returns:
        ``block_id + 1`` when a block was appended, otherwise ``block_id``
        unchanged (nothing buffered, or no text to merge).
    """
    # An empty buffer joins to "" as well, so this guard covers both cases.
    merged_text = "\n".join(item["text"] for item in blocks if item["text"]).strip()
    if not merged_text:
        return block_id

    first = blocks[0]
    pages = [item["page"] for item in blocks]
    source_ids = [item["unique_id"] for item in blocks if item.get("unique_id")]
    semantic_blocks.append(
        {
            "semantic_id": f"semantic-{block_id}",
            "block_type": "section_text",
            "page_start": min(pages),
            "page_end": max(pages),
            "section_path": first["section_path"],
            "section_level": first["section_level"],
            "section_title": first["section_title"],
            "source_ids": source_ids,
            "text": merged_text,
        }
    )
    return block_id + 1
def build_semantic_blocks(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Turn raw Aliyun layouts into an ordered list of semantic blocks.

    Walks the layouts once and:
    - skips table-of-contents records, plus every following record on page 1
      once a table-of-contents record has been seen;
    - treats titles as section boundaries (buffered paragraphs are flushed and
      the heading stack is updated);
    - emits one block per table or figure that has text, flushing buffered
      paragraphs first;
    - buffers plain paragraphs so consecutive ones merge into one block.

    Records matching none of these categories are ignored.
    """
    result: list[dict[str, Any]] = []
    headings: list[dict[str, Any]] = []
    buffered: list[dict[str, Any]] = []
    next_id = 1
    in_toc = False

    for layout in layouts:
        text = get_text(layout)
        page = get_page(layout)

        if is_toc_layout(layout):
            in_toc = True
            continue
        if in_toc:
            if page == 1:
                continue
            # First record off page 1 ends the table-of-contents skip.
            in_toc = False

        if is_title(layout):
            next_id = flush_text_block(buffered, result, next_id)
            buffered = []
            headings = update_section_path(headings, layout)
            continue

        path = section_path_titles(headings)
        section_fields = {
            "section_path": path,
            "section_level": len(path),
            "section_title": path[-1] if path else "未分类",
        }

        # Tables take precedence over figures, as in a plain if/elif chain.
        if is_table(layout):
            standalone_kind = "table"
        elif is_figure(layout):
            standalone_kind = "figure"
        else:
            standalone_kind = None

        if standalone_kind is not None:
            next_id = flush_text_block(buffered, result, next_id)
            buffered = []
            content = extract_table_text(layout) if standalone_kind == "table" else text
            if content:
                result.append(
                    {
                        "semantic_id": f"semantic-{next_id}",
                        "block_type": standalone_kind,
                        "page_start": page,
                        "page_end": page,
                        **section_fields,
                        "source_ids": [layout.get("uniqueId")],
                        "text": content,
                    }
                )
                next_id += 1
            continue

        if is_text(layout) and text:
            buffered.append(
                {
                    "page": page,
                    "text": text,
                    "unique_id": layout.get("uniqueId"),
                    **section_fields,
                }
            )

    # Emit whatever paragraphs are still buffered after the last record.
    flush_text_block(buffered, result, next_id)
    return result
def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> list[str]:
    """Split long text into overlapping windows for embedding.

    Args:
        text: Text to split; surrounding whitespace is stripped first.
        max_chars: Maximum length of each window. Must be at least 1 whenever
            the stripped text is non-empty.
        overlap_chars: Number of trailing characters of one window repeated at
            the start of the next. Values outside ``0 .. max_chars - 1`` are
            clamped into that range.

    Returns:
        The stripped, non-empty windows in order. Text that already fits in
        one window is returned as a single-element list; empty text yields an
        empty list.

    Raises:
        ValueError: If ``max_chars`` is less than 1 and there is text to split.
    """
    text = text.strip()
    if not text:
        return []
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    if len(text) <= max_chars:
        return [text]

    # Every step must move the window forward by at least one character;
    # an overlap as large as the window would otherwise loop forever.
    overlap = min(max(overlap_chars, 0), max_chars - 1)
    step = max_chars - overlap

    parts: list[str] = []
    start = 0
    while True:
        end = min(len(text), start + max_chars)
        parts.append(text[start:end].strip())
        if end >= len(text):
            break
        start += step
    return [part for part in parts if part]
def build_vector_chunks(
    semantic_blocks: list[dict[str, Any]],
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> list[dict[str, Any]]:
    """Build retrieval chunks from semantic blocks.

    Each block's text is cut by ``split_text_with_overlap``; every piece
    becomes one chunk that copies the block's page and section fields.
    ``chunk_index`` runs from 1 across the whole document, ``piece_index``
    restarts at 1 for each block, and ``embedding_text`` is the piece prefixed
    with a header naming the document and, when present, the section path.
    """
    vector_chunks: list[dict[str, Any]] = []
    for block in semantic_blocks:
        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
        if not pieces:
            continue

        # The header depends only on the block, so build it once per block.
        if block["section_path"]:
            header = f"标准:{doc_title}\n章节:{' > '.join(block['section_path'])}\n\n"
        else:
            header = f"标准:{doc_title}\n\n"

        for piece_index, piece in enumerate(pieces, start=1):
            chunk_index = len(vector_chunks) + 1
            # Preserve enriched embedding text so retrieval keeps section context.
            vector_chunks.append(
                {
                    "doc_id": doc_id,
                    "doc_title": doc_title,
                    "chunk_id": f"chunk-{chunk_index}",
                    "chunk_index": chunk_index,
                    "semantic_id": block["semantic_id"],
                    "chunk_type": block["block_type"],
                    "piece_index": piece_index,
                    "page_start": block["page_start"],
                    "page_end": block["page_end"],
                    "section_path": block["section_path"],
                    "section_level": block["section_level"],
                    "section_title": block["section_title"],
                    "source_ids": block["source_ids"],
                    "text": piece,
                    "embedding_text": header + piece,
                }
            )
    return vector_chunks
def convert_layouts(
    layouts: list[dict[str, Any]],
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> dict[str, Any]:
    """Run the full conversion from raw Aliyun layouts to the ingest payload.

    Returns a dict carrying ``doc_id`` and ``doc_title`` alongside the three
    derived layers: ``structure_nodes``, ``semantic_blocks`` and
    ``vector_chunks``.
    """
    nodes = build_structure_nodes(layouts)
    blocks = build_semantic_blocks(layouts)
    return {
        "doc_id": doc_id,
        "doc_title": doc_title,
        "structure_nodes": nodes,
        "semantic_blocks": blocks,
        "vector_chunks": build_vector_chunks(
            blocks,
            doc_id=doc_id,
            doc_title=doc_title,
            max_chars=max_chars,
            overlap_chars=overlap_chars,
        ),
    }

View File

@@ -4,6 +4,7 @@ from __future__ import annotations
from pathlib import Path from pathlib import Path
from app.config.settings import settings
from app.domain.documents import DocumentParser, ParsedDocument from app.domain.documents import DocumentParser, ParsedDocument
from app.services.parser.docx_parser import parse_docx_to_markdown from app.services.parser.docx_parser import parse_docx_to_markdown
from app.services.parser.pdf_parser import parse_pdf_to_markdown from app.services.parser.pdf_parser import parse_pdf_to_markdown
@@ -34,5 +35,10 @@ class LocalDocumentParser(DocumentParser):
vector_chunks=[], vector_chunks=[],
parser_name=self.parser_name, parser_name=self.parser_name,
raw_text=markdown_text, raw_text=markdown_text,
metadata={"source": "local_parser", "file_suffix": suffix}, raw_layouts=[],
metadata={
"source": "local_parser",
"file_suffix": suffix,
"artifact_prefix": settings.document_parse_artifact_prefix,
},
) )

View File

@@ -21,5 +21,365 @@
"generate_summary": true, "generate_summary": true,
"structure_nodes": 0 "structure_nodes": 0
} }
},
"44121fbb": {
"doc_id": "44121fbb",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "44121fbb/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T09:53:47.996183+00:00",
"updated_at": "2026-05-18T09:53:50.825868+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"77debb4a": {
"doc_id": "77debb4a",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "77debb4a/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:05:46.104259+00:00",
"updated_at": "2026-05-18T10:05:48.704061+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"d12bdcc8": {
"doc_id": "d12bdcc8",
"doc_name": "TCT算法接口.pdf",
"file_name": "TCT算法接口.pdf",
"object_name": "d12bdcc8/TCT算法接口.pdf",
"content_type": "application/pdf",
"size_bytes": 165557,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:07:22.199824+00:00",
"updated_at": "2026-05-18T10:07:24.653751+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"3c2e8c9c": {
"doc_id": "3c2e8c9c",
"doc_name": "20260415_Continental tire mobile app solution.pdf",
"file_name": "20260415_Continental tire mobile app solution.pdf",
"object_name": "3c2e8c9c/20260415_Continental tire mobile app solution.pdf",
"content_type": "application/pdf",
"size_bytes": 2178074,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:09:58.338274+00:00",
"updated_at": "2026-05-18T10:10:01.295502+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"d22d21a0": {
"doc_id": "d22d21a0",
"doc_name": "20260415_Continental tire mobile app solution.pdf",
"file_name": "20260415_Continental tire mobile app solution.pdf",
"object_name": "d22d21a0/20260415_Continental tire mobile app solution.pdf",
"content_type": "application/pdf",
"size_bytes": 2178074,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:12:20.078027+00:00",
"updated_at": "2026-05-18T10:12:22.999843+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"35f129d3": {
"doc_id": "35f129d3",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "35f129d3/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "",
"index_name": "",
"error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"created_at": "2026-05-18T10:13:24.706512+00:00",
"updated_at": "2026-05-18T10:13:27.180509+00:00",
"metadata": {
"generate_summary": true,
"failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
"processing_stage": "failed"
}
},
"efc21515": {
"doc_id": "efc21515",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "efc21515/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
"created_at": "2026-05-18T13:47:32.076786+00:00",
"updated_at": "2026-05-18T13:47:57.998073+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-a6e84447457f43cb85f95225cfc6495b",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/efc21515/layouts.json",
"structure_nodes": "artifacts/efc21515/structure_nodes.json",
"semantic_blocks": "artifacts/efc21515/semantic_blocks.json",
"vector_chunks": "artifacts/efc21515/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400"
}
},
"0d4b08bc": {
"doc_id": "0d4b08bc",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "0d4b08bc/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
"created_at": "2026-05-18T14:03:15.134344+00:00",
"updated_at": "2026-05-18T14:03:34.843448+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-78353d85daa24147b68d8fb71895179f",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/0d4b08bc/layouts.json",
"structure_nodes": "artifacts/0d4b08bc/structure_nodes.json",
"semantic_blocks": "artifacts/0d4b08bc/semantic_blocks.json",
"vector_chunks": "artifacts/0d4b08bc/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404"
}
},
"4302f314": {
"doc_id": "4302f314",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "4302f314/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "embedding 维度不匹配,期望 1536",
"created_at": "2026-05-18T14:11:29.943973+00:00",
"updated_at": "2026-05-18T14:11:48.554500+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-23935ee455ac4b26ac4201ac4781ee52",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/4302f314/layouts.json",
"structure_nodes": "artifacts/4302f314/structure_nodes.json",
"semantic_blocks": "artifacts/4302f314/semantic_blocks.json",
"vector_chunks": "artifacts/4302f314/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "embedding 维度不匹配,期望 1536"
}
},
"765ed1ee": {
"doc_id": "765ed1ee",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "765ed1ee/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>",
"created_at": "2026-05-18T14:18:28.875138+00:00",
"updated_at": "2026-05-18T14:18:57.389110+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-f116856bc29245baa2531b245078a701",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/765ed1ee/layouts.json",
"structure_nodes": "artifacts/765ed1ee/structure_nodes.json",
"semantic_blocks": "artifacts/765ed1ee/semantic_blocks.json",
"vector_chunks": "artifacts/765ed1ee/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>"
}
},
"05cabe09": {
"doc_id": "05cabe09",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "05cabe09/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "failed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 0,
"parser_name": "aliyun_docmind",
"index_name": "",
"error_message": "embedding 维度不匹配,期望 1536",
"created_at": "2026-05-18T14:24:32.156500+00:00",
"updated_at": "2026-05-18T14:24:50.114138+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-897d858983df48e28e9819e563d46208",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/05cabe09/layouts.json",
"structure_nodes": "artifacts/05cabe09/structure_nodes.json",
"semantic_blocks": "artifacts/05cabe09/semantic_blocks.json",
"vector_chunks": "artifacts/05cabe09/vector_chunks.json"
},
"processing_stage": "failed",
"failure_reason": "embedding 维度不匹配,期望 1536"
}
},
"9acb2ba0": {
"doc_id": "9acb2ba0",
"doc_name": "大众汽车手册.pdf",
"file_name": "大众汽车手册.pdf",
"object_name": "9acb2ba0/大众汽车手册.pdf",
"content_type": "application/pdf",
"size_bytes": 766565,
"status": "indexed",
"regulation_type": "",
"version": "",
"summary": "",
"summary_latency_ms": 0,
"chunk_count": 27,
"parser_name": "aliyun_docmind",
"index_name": "regulations_dense_1024_v1",
"error_message": "",
"created_at": "2026-05-18T14:29:01.368719+00:00",
"updated_at": "2026-05-18T14:29:23.699068+00:00",
"metadata": {
"generate_summary": true,
"parser_backend": "aliyun_docmind",
"parse_task_id": "docmind-20260518-e5fd4a5419e74d569c562e389e6ae72c",
"layout_count": 87,
"structure_node_count": 20,
"semantic_block_count": 27,
"vector_chunk_count": 27,
"artifact_keys": {
"layouts": "artifacts/9acb2ba0/layouts.json",
"structure_nodes": "artifacts/9acb2ba0/structure_nodes.json",
"semantic_blocks": "artifacts/9acb2ba0/semantic_blocks.json",
"vector_chunks": "artifacts/9acb2ba0/vector_chunks.json"
},
"processing_stage": "indexed",
"index_collection": "regulations_dense_1024_v1"
}
} }
} }

View File

@@ -0,0 +1,71 @@
# 阿里云解析主链路实现说明
本文档描述当前仓库已经落地的文档 ingest 主链路实现,作为迁移设计到代码实现之间的收口说明。
## 1. 当前默认链路
- 上传入口保持为 `/api/v1/documents/upload`
- 默认 `PARSER_BACKEND=aliyun`
- 默认 `CHUNK_BACKEND=aliyun`
- 默认 Milvus collection 为 `regulations_dense_1024_v1`
- 解析产物落到 MinIO `artifacts/{doc_id}/`
完整主链路如下:
1. 原始文件上传到 MinIO
2. `AliyunDocmindGateway` 提交阿里云异步解析任务
3. 轮询任务状态直到成功或超时
4. 分页拉取 `layouts`
5. 转换为 `structure_nodes / semantic_blocks / vector_chunks`
6. 三层结构 JSON 回写 MinIO
7. 使用 `vector_chunks[*].embedding_text` 调 embedding API
8. 写入 `regulations_dense_1024_v1`
9. 文档状态更新为 `indexed`
运行时转换逻辑位于 `backend/app/infrastructure/parser/aliyun_layout_normalizer.py`。
旧的 `backend/app/aliyun_parser/` 示例目录已移除,不参与生产运行时。
## 2. 解析产物持久化
每个文档会额外写入以下对象:
- `artifacts/{doc_id}/layouts.json`
- `artifacts/{doc_id}/structure_nodes.json`
- `artifacts/{doc_id}/semantic_blocks.json`
- `artifacts/{doc_id}/vector_chunks.json`
`documents.json` 仅保留对象 key、统计信息和处理阶段不保存完整大 JSON。
## 3. 失败策略
- 当前 `PARSER_FAILURE_MODE=fail`
- 阿里云解析失败不自动回退到本地 parser
- 失败时保留原始文件与已写入的 artifacts,便于排障
## 4. 运行参数
关键环境变量如下:
- `ALIBABA_ACCESS_KEY_ID`
- `ALIBABA_ACCESS_KEY_SECRET`
- `ALIBABA_ENDPOINT`
- `ALIYUN_PARSE_POLL_INTERVAL_SECONDS`
- `ALIYUN_PARSE_TIMEOUT_SECONDS`
- `ALIYUN_PARSE_LAYOUT_STEP_SIZE`
- `ALIYUN_LLM_ENHANCEMENT`
- `ALIYUN_ENHANCEMENT_MODE`
- `DOCUMENT_PARSE_ARTIFACT_PREFIX`
- `PARSER_BACKEND`
- `CHUNK_BACKEND`
## 5. 运行态确认
可通过 `/api/v1/status/config` 确认以下字段:
- `parser_backend`
- `chunk_backend`
- `milvus_collection`
- `artifact_prefix`
- `parser_failure_mode`
这几个值用于确认服务是否实际运行在迁移后的默认链路上。

View File

@@ -29,7 +29,7 @@
已确认的目标需求如下: 已确认的目标需求如下:
- 文档解析统一改为阿里云文档智能能力 - 文档解析统一改为阿里云文档智能能力
- 当前阿里云接入基础来自 `backend/app/aliyun_parser/parse_pdf.py` - 当前阿里云接入基础已经迁移到 `backend/app/infrastructure/parser/aliyun_layout_normalizer.py`
- 解析结果以 `structure_nodes``semantic_blocks``vector_chunks` 三层结构为基础 - 解析结果以 `structure_nodes``semantic_blocks``vector_chunks` 三层结构为基础
- 分块以阿里云 `vector_chunks` 为准,不再走当前本地 `RegulationChunker` - 分块以阿里云 `vector_chunks` 为准,不再走当前本地 `RegulationChunker`
- embedding 改为 OpenAI 兼容 API 调用,模型使用 `text-embedding-v3` - embedding 改为 OpenAI 兼容 API 调用,模型使用 `text-embedding-v3`
@@ -80,7 +80,7 @@
受影响的解析能力范围包括: 受影响的解析能力范围包括:
- 当前本地 parser 目录 - 当前本地 parser 目录
- `backend/app/aliyun_parser` - `backend/app/infrastructure/parser`
迁移后阿里云文档智能能力将成为主解析来源,本地 PDF/DOCX/MinerU 解析链路需要重新界定保留、下线或回退策略,但具体模块组织方式不在本文件内定义。 迁移后阿里云文档智能能力将成为主解析来源,本地 PDF/DOCX/MinerU 解析链路需要重新界定保留、下线或回退策略,但具体模块组织方式不在本文件内定义。
@@ -133,7 +133,7 @@
以下风险和约束在本期已经明确,需要在后续架构和实施阶段优先处理: 以下风险和约束在本期已经明确,需要在后续架构和实施阶段优先处理:
- 旧 Milvus collection 与新 `1536` 维 schema 不兼容,需要新 collection 和重建索引 - 旧 Milvus collection 与新 `1536` 维 schema 不兼容,需要新 collection 和重建索引
- `backend/app/aliyun_parser` 现有脚本含硬编码密钥,后续必须全部移到环境变量 - 阿里云凭据必须继续只通过环境变量或凭据链注入,不能回到脚本内硬编码
- RAG 下游当前对 `clause_number` 有依赖,迁移后需要优先适配 `section_title` 和 Aliyun chunk metadata - RAG 下游当前对 `clause_number` 有依赖,迁移后需要优先适配 `section_title` 和 Aliyun chunk metadata
- 如果阿里云返回字段与当前样例不同,需要在架构阶段补充 adapter 层 - 如果阿里云返回字段与当前样例不同,需要在架构阶段补充 adapter 层

View File

@@ -1,4 +1,4 @@
"""新架构下的文档编排与 embedding 边界测试。""" """Document orchestration and embedding boundary tests for the migrated backend."""
from __future__ import annotations from __future__ import annotations
@@ -80,6 +80,7 @@ class FakeParser:
return ParsedDocument( return ParsedDocument(
doc_id=doc_id, doc_id=doc_id,
doc_name=doc_name, doc_name=doc_name,
raw_layouts=[{"uniqueId": "layout-1", "type": "text"}],
structure_nodes=[{"title": "第一章"}], structure_nodes=[{"title": "第一章"}],
semantic_blocks=[{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}], semantic_blocks=[{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}],
vector_chunks=[ vector_chunks=[
@@ -95,6 +96,7 @@ class FakeParser:
} }
], ],
parser_name="fake_parser", parser_name="fake_parser",
metadata={"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1},
) )
@@ -125,10 +127,10 @@ class FakeEmbeddingProvider:
def embed_texts(self, texts: list[str]) -> list[list[float]]: def embed_texts(self, texts: list[str]) -> list[list[float]]:
self.calls.append(texts) self.calls.append(texts)
return [[0.1] * 1536 for _ in texts] return [[0.1] * 1024 for _ in texts]
def embed_query(self, text: str) -> list[float]: def embed_query(self, text: str) -> list[float]:
return [0.2] * 1536 return [0.2] * 1024
class FakeVectorIndex: class FakeVectorIndex:
@@ -146,10 +148,10 @@ class FakeVectorIndex:
return [] return []
def health(self) -> dict: def health(self) -> dict:
return {"collection_name": "regulations_dense_1536"} return {"collection_name": "regulations_dense_1024_v1"}
def test_document_command_service_uses_1536_dense_embedding_and_updates_status(): def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
repository = FakeRepository() repository = FakeRepository()
binary_store = FakeBinaryStore() binary_store = FakeBinaryStore()
embedding_provider = FakeEmbeddingProvider() embedding_provider = FakeEmbeddingProvider()
@@ -183,15 +185,16 @@ def test_document_command_service_uses_1536_dense_embedding_and_updates_status()
assert stored.status == DocumentStatus.INDEXED assert stored.status == DocumentStatus.INDEXED
assert stored.chunk_count == 1 assert stored.chunk_count == 1
assert stored.parser_name == "fake_parser" assert stored.parser_name == "fake_parser"
assert stored.index_name == "regulations_dense_1536" assert stored.index_name == "regulations_dense_1024_v1"
assert stored.metadata["parse_task_id"] == "task-123"
assert stored.metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")
def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
def test_bootstrap_defaults_to_local_parser_and_chunk_builder():
bootstrap.get_parser.cache_clear() bootstrap.get_parser.cache_clear()
bootstrap.get_chunk_builder.cache_clear() bootstrap.get_chunk_builder.cache_clear()
parser = bootstrap.get_parser() parser = bootstrap.get_parser()
chunk_builder = bootstrap.get_chunk_builder() chunk_builder = bootstrap.get_chunk_builder()
assert parser.__class__.__name__ == "LocalDocumentParser" assert parser.__class__.__name__ == "AliyunDocumentParser"
assert chunk_builder.__class__.__name__ == "LocalRegulationChunkBuilder" assert chunk_builder.__class__.__name__ == "AliyunVectorChunkBuilder"

View File

@@ -64,11 +64,16 @@ def verify_migration_config() -> bool:
try: try:
assert settings.embedding_model == "text-embedding-v3" assert settings.embedding_model == "text-embedding-v3"
assert settings.embedding_dim == 1536 assert settings.embedding_dim == 1024
assert settings.milvus_collection == "regulations_dense_1536" assert settings.milvus_collection == "regulations_dense_1024_v1"
assert settings.parser_backend == "aliyun"
assert settings.chunk_backend == "aliyun"
logger.info(f"embedding_model={settings.embedding_model}") logger.info(f"embedding_model={settings.embedding_model}")
logger.info(f"embedding_base_url={settings.embedding_base_url}")
logger.info(f"embedding_dim={settings.embedding_dim}") logger.info(f"embedding_dim={settings.embedding_dim}")
logger.info(f"milvus_collection={settings.milvus_collection}") logger.info(f"milvus_collection={settings.milvus_collection}")
logger.info(f"parser_backend={settings.parser_backend}")
logger.info(f"chunk_backend={settings.chunk_backend}")
logger.success("migration config ok") logger.success("migration config ok")
return True return True
except Exception as exc: except Exception as exc: