feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions
--- a/.env
+++ b/.env
@@ -9,7 +9,7 @@ DEBUG=false
 # ===== Milvus向量数据库配置（已有）=====
 MILVUS_HOST=localhost
 MILVUS_PORT=19530
-MILVUS_COLLECTION=regulations_dense_1536
+MILVUS_COLLECTION=regulations_dense_1024_v1
 MILVUS_DB_NAME=default
 # ===== MinIO对象存储配置（已有）=====
@@ -34,7 +34,7 @@ POSTGRES_DB=compliance_db
 # ===== 嵌入模型配置 =====
 EMBEDDING_MODEL=text-embedding-v3
-EMBEDDING_DIM=1536
+EMBEDDING_DIM=1024
 EMBEDDING_API_KEY=sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8
 EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1
 EMBEDDING_TIMEOUT_SECONDS=120
@@ -59,7 +59,7 @@ LLM_TEMPERATURE=0.7
 # 获取API Key: https://dashscope.console.aliyun.com/
 QWEN_API_KEY=sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8
 QWEN_BASE_URL=http://6.86.80.4:30080/v1
-QWEN_MODEL=qwen3.5-plus
+QWEN_MODEL=qwen3.6-plus
 QWEN_VL_MODEL=qwen3-vl-plus
 # ===== DeepSeek API配置 =====
@@ -73,3 +73,15 @@ RAG_TOP_K=10
 RAG_MAX_CONTEXT_TOKENS=4000
 RAG_SUMMARY_MAX_TOKENS=1024
 RAG_SKILLS_MAX_TOKENS=2048
 # ===== 阿里云文档解析 =====
 ALIBABA_ACCESS_KEY_ID=LTAI5t9ZjvwSU9bKuMyiExrE
 ALIBABA_ACCESS_KEY_SECRET=hNvY6XocmEO6inYlrmiBwBcx5OfidL
 ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
 ALIYUN_PARSE_POLL_INTERVAL_SECONDS=5
 ALIYUN_PARSE_TIMEOUT_SECONDS=900
 ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
 ALIYUN_LLM_ENHANCEMENT=true
 ALIYUN_ENHANCEMENT_MODE=VLM
 DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
 PARSER_FAILURE_MODE=fail
--- a/.env.development
+++ b/.env.development
@@ -4,7 +4,7 @@
 # ===== Milvus向量数据库配置（已有）=====
 MILVUS_HOST=6.86.80.8
 MILVUS_PORT=19530
-MILVUS_COLLECTION=regulations_dense_1536
+MILVUS_COLLECTION=regulations_dense_1024_v1
 MILVUS_DB_NAME=default
 # ===== MinIO对象存储配置（已有）=====
@@ -26,4 +26,3 @@ POSTGRES_PORT=5432
 POSTGRES_USER=postgresql
 POSTGRES_PASSWORD=postgresql123456
 POSTGRES_DB=compliance_db
--- a/.env.example
+++ b/.env.example
@@ -9,12 +9,12 @@ DEBUG=false
 # ===== Milvus向量数据库配置 =====
 MILVUS_HOST=localhost
 MILVUS_PORT=19530
-MILVUS_COLLECTION=regulations_dense_1536
+MILVUS_COLLECTION=regulations_dense_1024_v1
 MILVUS_DB_NAME=default
 # ===== 嵌入模型配置 =====
 EMBEDDING_MODEL=text-embedding-v3
-EMBEDDING_DIM=1536
+EMBEDDING_DIM=1024
 EMBEDDING_API_KEY=your_embedding_api_key_here
 EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1
 EMBEDDING_TIMEOUT_SECONDS=120
@@ -44,11 +44,20 @@ CHUNK_SIZE=512
 CHUNK_OVERLAP=50
 MAX_FILE_SIZE_MB=100
 DOCUMENT_METADATA_PATH=backend/data/documents.json
 PARSER_BACKEND=aliyun
 CHUNK_BACKEND=aliyun
 # ===== 阿里云文档解析 =====
 ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
 ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
 ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
 ALIYUN_PARSE_POLL_INTERVAL_SECONDS=5
 ALIYUN_PARSE_TIMEOUT_SECONDS=900
 ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
 ALIYUN_LLM_ENHANCEMENT=true
 ALIYUN_ENHANCEMENT_MODE=VLM
 DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
 PARSER_FAILURE_MODE=fail
 # ===== API服务配置 =====
 API_HOST=0.0.0.0
@@ -73,7 +82,7 @@ DEEPSEEK_BASE_URL=http://6.86.80.4:30080/v1
 # Qwen系列: qwen3.6-plus, qwen3.5-plus, qwen3-plus, qwen-max, qwen-turbo, qwen-long
 # Qwen VL系列: qwen3-vl-plus, qwen-vl-max
 # DeepSeek系列: deepseek-v4-flash, deepseek-v3.2, deepseek-v3, deepseek-chat, deepseek-coder
-QWEN_MODEL=qwen3.5-plus
+QWEN_MODEL=qwen3.6-plus
 QWEN_VL_MODEL=qwen3-vl-plus
 DEEPSEEK_MODEL=deepseek-v4-flash
--- a/QUICK_DEPLOY.md
+++ b/QUICK_DEPLOY.md
@@ -106,6 +106,9 @@ ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
 EMBEDDING_API_KEY=your_embedding_api_key_here
 EMBEDDING_MODEL=text-embedding-v3
 EMBEDDING_DIM=1024
 PARSER_BACKEND=aliyun
 CHUNK_BACKEND=aliyun
 PARSER_FAILURE_MODE=fail
 ```
 ---
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ AIRegulation-DocAnalysis-Demo/
 ### 1. 安装依赖
 ```bash
-pip install -r backend/requirements.txt
+./dev.sh setup
 ```
 ### 2. 启动Milvus向量数据库
@@ -57,7 +57,7 @@ docker-compose logs -f milvus
 ### 3. 启动API服务
 ```bash
-PYTHONPATH=backend uvicorn app.main:app --reload --port 8000
+./dev.sh start api --foreground
 ```
 访问API文档：http://localhost:8000/docs
@@ -104,6 +104,8 @@ MILVUS_PORT=19530
 # 阿里云文档解析
 ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
 ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
 PARSER_BACKEND=aliyun
 CHUNK_BACKEND=aliyun
 # embedding 配置
 EMBEDDING_MODEL=text-embedding-v3
@@ -121,6 +123,17 @@ CHUNK_SIZE=512
 - 混合检索问答功能
 - 法规变更监控与自动更新
 ## 解析产物
 上传成功后，系统会把阿里云解析的中间结果持久化到 MinIO：
 - `artifacts/{doc_id}/layouts.json`
 - `artifacts/{doc_id}/structure_nodes.json`
 - `artifacts/{doc_id}/semantic_blocks.json`
 - `artifacts/{doc_id}/vector_chunks.json`
 当前默认 Milvus collection 为 `regulations_dense_1024_v1`。
 ## 许可证
 MIT License
--- a/backend/app/aliyun_parser/.claude/settings.local.json
+++ b/backend/app/aliyun_parser/.claude/settings.local.json
@@ -1,8 +0,0 @@
 {
  "permissions": {
    "allow": [
      "Bash(python3 *)",
      "Bash(PGPASSWORD=postgresql123456 psql *)"
    ]
  }
 }
--- a/backend/app/aliyun_parser/parse_pdf.py
+++ b/backend/app/aliyun_parser/parse_pdf.py
@@ -1,516 +0,0 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """Parse a PDF with Aliyun DocMind and convert its layouts into structure nodes, semantic blocks and vector chunks."""
 import argparse
 import json
 import os
 import re
 import time
 from pathlib import Path
 from typing import Dict, List
 from alibabacloud_docmind_api20220711.client import Client as DocmindClient
 from alibabacloud_tea_openapi import models as open_api_models
 from alibabacloud_docmind_api20220711 import models as docmind_models
 from alibabacloud_tea_util import models as util_models
 # Aliyun DocMind credentials and endpoint, read from the environment once at import time.
 ALIBABA_ACCESS_KEY_ID = os.getenv("ALIBABA_ACCESS_KEY_ID", "")
 ALIBABA_ACCESS_KEY_SECRET = os.getenv("ALIBABA_ACCESS_KEY_SECRET", "")
 ALIBABA_ENDPOINT = os.getenv("ALIBABA_ENDPOINT", "docmind-api.cn-hangzhou.aliyuncs.com")
 # Default chunk size and overlap, in characters, used when splitting semantic blocks into vector chunks.
 MAX_CHARS = 600
 OVERLAP_CHARS = 80
 # Vocabularies matched against a layout's text / "type" / "subType" fields to classify it.
 TOC_TITLES = {"目次", "目录"}
 TITLE_SUBTYPES = {"doc_title", "para_title"}
 TEXT_SUBTYPES = {"para", "none"}
 FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
 FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
 # ---- DocMind API calls: client setup, job submission, status polling, result paging ----
 def init_client() -> DocmindClient:
    """Build a DocMind client from the module-level Aliyun credentials.

    Returns:
        A ``DocmindClient`` configured with the access key pair and endpoint.

    Raises:
        ValueError: If the access key id or the access key secret is empty.
    """
    credentials_present = bool(ALIBABA_ACCESS_KEY_ID) and bool(ALIBABA_ACCESS_KEY_SECRET)
    if not credentials_present:
        raise ValueError("缺少阿里云文档解析凭据，请设置 ALIBABA_ACCESS_KEY_ID 和 ALIBABA_ACCESS_KEY_SECRET")
    client_config = open_api_models.Config(
        access_key_id=ALIBABA_ACCESS_KEY_ID,
        access_key_secret=ALIBABA_ACCESS_KEY_SECRET,
    )
    # The endpoint is assigned after construction, as in the original code.
    client_config.endpoint = ALIBABA_ENDPOINT
    return DocmindClient(client_config)
 def submit_job(client: DocmindClient, file_path: str) -> str:
    """Submit a local file to DocMind for parsing and return the job id.

    The file is opened in binary mode and handed to the SDK as a stream.
    Opening it in a ``with`` block guarantees the handle is closed once the
    submit call returns or raises; the previous version leaked it.

    Args:
        client: An initialised DocMind client.
        file_path: Path of the document to upload.

    Returns:
        The id of the created parsing job (``response.body.data.id``).
    """
    source = Path(file_path)
    runtime = util_models.RuntimeOptions()
    # The stream must stay open for the whole SDK call, so the request is
    # built and sent inside the context manager.
    with source.open("rb") as stream:
        request = docmind_models.SubmitDocParserJobAdvanceRequest(
            file_url_object=stream,
            file_name=source.name,
            file_name_extension=source.suffix.lstrip("."),
            llm_enhancement=True,
            enhancement_mode="VLM",
        )
        response = client.submit_doc_parser_job_advance(request, runtime)
    return response.body.data.id
 def query_status(client: DocmindClient, task_id: str) -> Dict:
    """Query the status of a parsing job.

    Args:
        client: An initialised DocMind client.
        task_id: Id returned when the job was submitted.

    Returns:
        The status payload converted with ``to_map()``, or ``None`` when the
        response carries no data.
    """
    status_request = docmind_models.QueryDocParserStatusRequest(id=task_id)
    payload = client.query_doc_parser_status(status_request).body.data
    if payload:
        return payload.to_map()
    return None
 def wait_for_completion(
    client: DocmindClient,
    task_id: str,
    poll_interval: int = 5,
    timeout_seconds: "float | None" = None,
 ) -> bool:
    """Poll a parsing job until it succeeds, fails, or the timeout elapses.

    Args:
        client: An initialised DocMind client.
        task_id: Id of the job to wait for.
        poll_interval: Seconds to sleep between status queries.
        timeout_seconds: Maximum total seconds to wait. ``None`` (the default)
            keeps the previous behaviour of waiting indefinitely.

    Returns:
        ``True`` when the job reports success; ``False`` when the status query
        returns no data, the job reports failure, or the timeout is reached.
    """
    deadline = None if timeout_seconds is None else time.monotonic() + timeout_seconds
    while True:
        status_data = query_status(client, task_id)
        if not status_data:
            return False
        # "Status" may be present but null; fall back to "" instead of
        # raising AttributeError on None.lower().
        status = (status_data.get("Status") or "").lower()
        if status == "success":
            return True
        # NOTE(review): the DocMind docs appear to spell the failure state
        # "Fail"; both spellings are accepted so a failed job cannot be
        # polled forever. Confirm against the API reference.
        if status in ("failed", "fail"):
            print(f"任务失败: {status_data}")
            return False
        if deadline is not None and time.monotonic() >= deadline:
            print(f"任务超时: {status_data}")
            return False
        print(f"任务状态: {status}, 等待中...")
        time.sleep(poll_interval)
 def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_step_size: int = 50) -> Dict:
    """Fetch one page of layouts for a finished parsing job.

    Args:
        client: An initialised DocMind client.
        task_id: Id of the parsing job.
        layout_num: Value sent as the request's ``layout_num`` (the callers in
            this file pass the number of layouts already fetched).
        layout_step_size: Value sent as the request's ``layout_step_size``.

    Returns:
        ``response.body.data`` when it is truthy, otherwise ``None``.
    """
    page_request = docmind_models.GetDocParserResultRequest(
        id=task_id,
        layout_step_size=layout_step_size,
        layout_num=layout_num,
    )
    data = client.get_doc_parser_result(page_request).body.data
    return data if data else None
 def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: int = 50) -> List[Dict]:
    """Page through a job's results and return every layout in order.

    Paging stops on the first empty response, the first response without
    layouts, or the first page shorter than ``layout_step_size``.

    Args:
        client: An initialised DocMind client.
        task_id: Id of the parsing job.
        layout_step_size: Page size requested from the service.

    Returns:
        The concatenated list of layout dicts.
    """
    collected = []
    offset = 0
    while True:
        page_data = get_result(client, task_id, offset, layout_step_size)
        batch = page_data.get("layouts", []) if page_data else []
        if not batch:
            break
        collected.extend(batch)
        offset += len(batch)
        # A short page means the service has nothing further to return.
        if len(batch) < layout_step_size:
            break
    return collected
 # ---- Layout field accessors and text normalization ----
 def normalize_text(text: str) -> str:
    """Normalize whitespace in a piece of extracted text.

    Carriage returns become newlines, non-breaking spaces become ordinary
    spaces, runs of newlines collapse to one newline, runs of spaces/tabs
    collapse to one space, and the result is stripped.

    Args:
        text: Raw text taken from a layout.

    Returns:
        The normalized text (possibly empty).
    """
    text = text.replace("\r", "\n")
    # Spelled as an escape: a literal non-breaking space is indistinguishable
    # from a normal space in source and is easily lost, which turns this line
    # into a no-op. The "[ \t]+" pattern below does not match U+00A0.
    text = text.replace("\u00a0", " ")
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()
 def get_page(layout: Dict) -> int:
    """Return the layout's page number.

    The ``"pageNum"`` key wins when present; otherwise ``"pageNumber"`` is
    used, and ``0`` is returned when neither key exists.
    """
    if "pageNum" in layout:
        return layout["pageNum"]
    return layout.get("pageNumber", 0)
 def get_text(layout: Dict) -> str:
    """Return the layout's normalized text.

    The ``"text"`` field is preferred; when it normalizes to an empty string
    the normalized ``"markdownContent"`` field is returned instead.
    """
    return normalize_text(layout.get("text", "")) or normalize_text(layout.get("markdownContent", ""))
 # ---- Layout classification predicates ----
 def is_title(layout: Dict) -> bool:
    """Return True when the layout's type is "title" or its subType is a title subtype."""
    if layout.get("type") == "title":
        return True
    return layout.get("subType") in TITLE_SUBTYPES
 def is_text(layout: Dict) -> bool:
    """Return True for body text: type "text" with a subType in TEXT_SUBTYPES.

    A missing subType is treated as ``"none"``.
    """
    if layout.get("type") != "text":
        return False
    return layout.get("subType", "none") in TEXT_SUBTYPES
 def is_figure(layout: Dict) -> bool:
    """Return True when the layout's type or subType marks it as figure content."""
    if layout.get("type") in FIGURE_TYPES:
        return True
    return layout.get("subType") in FIGURE_SUBTYPES
 def is_table(layout: Dict) -> bool:
    """Return True when the layout's type is "table"."""
    layout_type = layout.get("type")
    return layout_type == "table"
 def is_toc_layout(layout: Dict) -> bool:
    """Return True when the layout looks like table-of-contents content.

    A layout qualifies when its text is exactly one of ``TOC_TITLES``, or when
    it is on page 1 and its text matches the numbered-entry pattern (section
    number, title, a run of dot leaders, trailing page number).
    """
    content = get_text(layout)
    if content in TOC_TITLES:
        return True
    on_first_page = get_page(layout) == 1
    # The pattern is only evaluated for page-1 layouts.
    return bool(on_first_page and re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", content))
 def extract_table_text(layout: Dict) -> str:
    """Flatten a table layout into plain text, one line per non-empty cell.

    For each entry in ``layout["cells"]`` the normalized texts of its nested
    ``"layouts"`` are joined with single spaces; cells with no text are
    dropped and the remaining lines are joined with newlines.
    """
    lines = []
    for cell in layout.get("cells", []):
        fragments = [normalize_text(inner.get("text", "")) for inner in cell.get("layouts", [])]
        fragments = [fragment for fragment in fragments if fragment]
        if fragments:
            lines.append(" ".join(fragments))
    return "\n".join(lines).strip()
 # ---- Structure layer: heading outline ----
 def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
    """Build the heading outline from the title layouts.

    Title layouts with empty text, or whose text is a table-of-contents
    heading, are skipped.

    Returns:
        One dict per kept title with the keys ``unique_id``, ``page``,
        ``index``, ``level``, ``title``, ``type`` and ``sub_type``.
    """
    outline = []
    for layout in filter(is_title, layouts):
        heading = get_text(layout)
        if heading and heading not in TOC_TITLES:
            outline.append(
                {
                    "unique_id": layout.get("uniqueId"),
                    "page": get_page(layout),
                    "index": layout.get("index", 0),
                    "level": layout.get("level", 0),
                    "title": heading,
                    "type": layout.get("type"),
                    "sub_type": layout.get("subType"),
                }
            )
    return outline
 # ---- Semantic layer: section-aware text, table and figure blocks ----
 def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
    """Push a title layout onto the section stack, in place.

    Entries whose level is greater than or equal to the new title's level are
    popped first, so the stack always holds the chain of enclosing headings.

    Args:
        section_stack: Current heading chain; mutated by this call.
        layout: The title layout to push.

    Returns:
        The same ``section_stack`` list that was passed in.
    """
    depth = layout.get("level", 0)
    heading = get_text(layout)
    while section_stack:
        if not section_stack[-1]["level"] >= depth:
            break
        section_stack.pop()
    entry = {
        "level": depth,
        "title": heading,
        "page": get_page(layout),
        "unique_id": layout.get("uniqueId"),
    }
    section_stack.append(entry)
    return section_stack
 def section_path_titles(section_stack: List[Dict]) -> List[str]:
    """Return the ``"title"`` of every stack entry, outermost heading first."""
    titles = []
    for entry in section_stack:
        titles.append(entry["title"])
    return titles
 def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id: int) -> int:
    """Merge buffered text layouts into one ``section_text`` semantic block.

    Args:
        blocks: Buffered text items, each with ``page``, ``text``,
            ``unique_id``, ``section_path``, ``section_level`` and
            ``section_title``.
        semantic_blocks: Output list; the merged block is appended to it.
        block_id: Id to use for the merged block.

    Returns:
        ``block_id + 1`` when a block was appended; the unchanged ``block_id``
        when the buffer is empty or holds no text.
    """
    merged_text = "\n".join(item["text"] for item in blocks if item["text"]).strip()
    if not merged_text:
        # Covers both an empty buffer and a buffer with only blank texts.
        return block_id
    first = blocks[0]
    pages = [item["page"] for item in blocks]
    source_ids = [item["unique_id"] for item in blocks if item.get("unique_id")]
    merged_block = {
        "semantic_id": f"semantic-{block_id}",
        "block_type": "section_text",
        "page_start": min(pages),
        "page_end": max(pages),
        # Section context is taken from the first buffered item.
        "section_path": first["section_path"],
        "section_level": first["section_level"],
        "section_title": first["section_title"],
        "source_ids": source_ids,
        "text": merged_text,
    }
    semantic_blocks.append(merged_block)
    return block_id + 1
 def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
    """Group layouts into section-aware semantic blocks.

    Layouts are processed in order:

    * a table-of-contents layout is dropped, and so are the page-1 layouts
      that follow it, until a layout from another page is seen;
    * a title flushes the buffered text and updates the heading stack;
    * a table or figure flushes the buffered text and, when it has text,
      becomes its own block;
    * body text with content is buffered until the next flush.

    Returns:
        The semantic blocks in document order, with ids ``semantic-1``,
        ``semantic-2``, and so on.
    """
    collected = []
    heading_stack = []
    buffered_text = []
    next_id = 1
    inside_toc = False
    for layout in layouts:
        content = get_text(layout)
        page_no = get_page(layout)
        if is_toc_layout(layout):
            inside_toc = True
            continue
        if inside_toc:
            if page_no == 1:
                continue
            inside_toc = False
        if is_title(layout):
            next_id = flush_text_block(buffered_text, collected, next_id)
            buffered_text = []
            heading_stack = update_section_path(heading_stack, layout)
            continue
        path = section_path_titles(heading_stack)
        # Section context shared by every block produced from this layout.
        placement = {
            "section_path": path,
            "section_level": len(path),
            "section_title": path[-1] if path else "未分类",
        }
        if is_table(layout):
            kind, body = "table", extract_table_text(layout)
        elif is_figure(layout):
            kind, body = "figure", content
        else:
            if is_text(layout) and content:
                buffered_text.append(
                    {
                        "page": page_no,
                        "text": content,
                        "unique_id": layout.get("uniqueId"),
                        **placement,
                    }
                )
            continue
        # Tables and figures end the running text block even when they
        # contribute no text of their own.
        next_id = flush_text_block(buffered_text, collected, next_id)
        buffered_text = []
        if body:
            collected.append(
                {
                    "semantic_id": f"semantic-{next_id}",
                    "block_type": kind,
                    "page_start": page_no,
                    "page_end": page_no,
                    **placement,
                    "source_ids": [layout.get("uniqueId")],
                    "text": body,
                }
            )
            next_id += 1
    flush_text_block(buffered_text, collected, next_id)
    return collected
 # ---- Retrieval layer: overlapping vector chunks ----
 def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> List[str]:
    """Split text into windows of at most ``max_chars`` characters.

    Consecutive windows share ``overlap_chars`` characters. Text that already
    fits in one window is returned as a single piece, and blank text yields an
    empty list.

    Args:
        text: Text to split; it is stripped first.
        max_chars: Maximum characters per piece.
        overlap_chars: Characters repeated between neighbouring pieces.

    Returns:
        The non-empty, stripped pieces in order.

    Raises:
        ValueError: If the text needs splitting but ``max_chars`` is not
            positive or ``overlap_chars`` is not smaller than ``max_chars``.
            Those inputs previously made the loop run forever because the
            window start never advanced.
    """
    text = text.strip()
    if len(text) <= max_chars:
        return [text] if text else []
    # Validated after the fast path so every call that used to return still
    # returns the same value; only the non-terminating inputs are rejected.
    if max_chars <= 0 or overlap_chars >= max_chars:
        raise ValueError(
            f"max_chars 必须大于 0 且 overlap_chars 必须小于 max_chars"
            f"（max_chars={max_chars}, overlap_chars={overlap_chars}）"
        )
    parts = []
    start = 0
    while start < len(text):
        end = min(len(text), start + max_chars)
        parts.append(text[start:end].strip())
        if end >= len(text):
            break
        start = max(0, end - overlap_chars)
    return [part for part in parts if part]
 def build_vector_chunks(
    semantic_blocks: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
 ) -> List[Dict]:
    """Split semantic blocks into retrieval chunks.

    Each block's text is cut with ``split_text_with_overlap``. Every piece
    becomes one chunk that copies the block's section and page metadata and
    adds an ``embedding_text``: the piece prefixed with a header naming the
    document and, when available, the section path.

    Returns:
        The chunks in order; ``chunk_index`` counts from 1 across the whole
        document and ``piece_index`` counts from 1 within each block.
    """
    chunks = []
    for block in semantic_blocks:
        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
        for piece_index, piece in enumerate(pieces, start=1):
            chunk_index = len(chunks) + 1
            header = (
                f"标准：{doc_title}\n章节：{' > '.join(block['section_path'])}\n\n"
                if block["section_path"]
                else f"标准：{doc_title}\n\n"
            )
            chunk = {
                "doc_id": doc_id,
                "doc_title": doc_title,
                "chunk_id": f"chunk-{chunk_index}",
                "chunk_index": chunk_index,
                "semantic_id": block["semantic_id"],
                "chunk_type": block["block_type"],
                "piece_index": piece_index,
                "page_start": block["page_start"],
                "page_end": block["page_end"],
                "section_path": block["section_path"],
                "section_level": block["section_level"],
                "section_title": block["section_title"],
                "source_ids": block["source_ids"],
                "text": piece,
                "embedding_text": header + piece,
            }
            chunks.append(chunk)
    return chunks
 def parse_pdf_to_structured_chunks(
    pdf_path: str,
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int = MAX_CHARS,
    overlap_chars: int = OVERLAP_CHARS,
    poll_interval: int = 5,
 ) -> Dict:
    """Parse a PDF through DocMind and return the three-layer structure.

    Submits the file, waits for the job, downloads all layouts and converts
    them with ``convert_layouts``.

    Args:
        pdf_path: Path of the PDF to parse.
        doc_id: Document id stored on the result and on every chunk.
        doc_title: Document title used in chunk headers.
        max_chars: Maximum characters per vector chunk.
        overlap_chars: Overlap between neighbouring vector chunks.
        poll_interval: Seconds between status queries.

    Returns:
        The dict produced by ``convert_layouts``.

    Raises:
        RuntimeError: If waiting for the job reports failure.
    """
    client = init_client()
    job_id = submit_job(client, pdf_path)
    finished = wait_for_completion(client, job_id, poll_interval)
    if not finished:
        raise RuntimeError("阿里云文档解析任务失败")
    all_layouts = collect_all_results(client, job_id)
    return convert_layouts(
        all_layouts,
        doc_id=doc_id,
        doc_title=doc_title,
        max_chars=max_chars,
        overlap_chars=overlap_chars,
    )
 # ---- Layout-to-three-layer conversion ----
 def convert_layouts(
    layouts: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
 ) -> Dict:
    """Convert raw layouts into structure nodes, semantic blocks and vector chunks.

    Returns:
        A dict with the keys ``doc_id``, ``doc_title``, ``structure_nodes``,
        ``semantic_blocks`` and ``vector_chunks``.
    """
    outline = build_structure_nodes(layouts)
    blocks = build_semantic_blocks(layouts)
    chunks = build_vector_chunks(blocks, doc_id, doc_title, max_chars, overlap_chars)
    return {
        "doc_id": doc_id,
        "doc_title": doc_title,
        "structure_nodes": outline,
        "semantic_blocks": blocks,
        "vector_chunks": chunks,
    }
 # ---- Command-line entry point ----
 def main() -> None:
    """Run the command-line workflow: submit, wait, download, convert, write.

    Parses the command-line arguments, submits the PDF to DocMind, waits for
    the job, optionally writes the raw layouts, converts them to the
    three-layer structure and writes the result as JSON.

    Raises:
        FileNotFoundError: If the PDF path does not exist.
        SystemExit: With exit status 1 when the parsing job does not succeed,
            so shells and schedulers can detect the failure. Previously the
            function returned normally and the process exited with status 0.
    """
    parser = argparse.ArgumentParser(description="阿里云文档智能解析 PDF，输出三层结构 chunks")
    parser.add_argument("pdf_path", help="PDF 文件路径")
    parser.add_argument("--out", default="vector_chunks.json", help="输出 JSON 文件路径")
    parser.add_argument("--layouts-out", dest="layouts_output", help="输出原始 layouts JSON")
    parser.add_argument("--doc-id", default="GB14747-2006", help="文档 ID")
    parser.add_argument("--doc-title", default="GB 14747—2006 儿童三轮车安全要求", help="文档标题")
    parser.add_argument("--max-chars", type=int, default=MAX_CHARS, help="单个检索 chunk 最大字符数")
    parser.add_argument("--overlap-chars", type=int, default=OVERLAP_CHARS, help="相邻检索 chunk 重叠字符数")
    parser.add_argument("--poll-interval", type=int, default=5, help="轮询间隔（秒）")
    args = parser.parse_args()
    pdf_path = Path(args.pdf_path).expanduser().resolve()
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF 文件不存在: {pdf_path}")
    # Step 1: submit the document.
    client = init_client()
    print(f"提交任务: {pdf_path}")
    task_id = submit_job(client, str(pdf_path))
    print(f"任务 ID: {task_id}")
    # Step 2: wait for the parsing job to finish.
    print("等待任务完成...")
    if not wait_for_completion(client, task_id, args.poll_interval):
        print("任务失败，退出")
        # Exit non-zero so the failure is visible to the calling process.
        raise SystemExit(1)
    # Step 3: download every page of layouts.
    print("获取解析结果...")
    layouts = collect_all_results(client, task_id)
    print(f"获取到 {len(layouts)} 个布局块")
    # Step 4 (optional): keep the raw layouts for debugging or re-conversion.
    if args.layouts_output:
        layouts_path = Path(args.layouts_output).expanduser().resolve()
        layouts_path.write_text(json.dumps(layouts, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"原始 layouts 已写入: {layouts_path}")
    # Step 5: build the three-layer structure.
    print("转换为三层结构...")
    data = convert_layouts(
        layouts,
        doc_id=args.doc_id,
        doc_title=args.doc_title,
        max_chars=args.max_chars,
        overlap_chars=args.overlap_chars,
    )
    # Step 6: write the result and report the size of each layer.
    output_path = Path(args.out).expanduser().resolve()
    output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"结构层节点数: {len(data['structure_nodes'])}")
    print(f"语义层块数: {len(data['semantic_blocks'])}")
    print(f"检索层块数: {len(data['vector_chunks'])}")
    print(f"输出文件: {output_path}")
 if __name__ == "__main__":
    main()
--- a/backend/app/aliyun_parser/schema.sql
+++ b/backend/app/aliyun_parser/schema.sql
@@ -1,122 +0,0 @@
 -- 法规文档向量检索系统数据库表结构
 -- PostgreSQL
 -- ==================== Documents table ====================
 -- One row per document; doc_id is the key the other tables reference.
 CREATE TABLE documents (
    id SERIAL PRIMARY KEY,
    doc_id VARCHAR(128) UNIQUE NOT NULL,       -- Unique document identifier, e.g. "GB14747-2006"
    title VARCHAR(512) NOT NULL,               -- Document title
    doc_type VARCHAR(32),                      -- Document type: standard / regulation / specification
    standard_number VARCHAR(64),               -- Standard number, e.g. "GB 14747-2006"
    publish_date DATE,                         -- Publication date
    implement_date DATE,                       -- Implementation (effective) date
    status VARCHAR(32),                        -- Status: in force / repealed / revised
    source_url VARCHAR(512),                   -- Source URL
    file_path VARCHAR(512),                    -- Local PDF file path
    file_size INT,                             -- File size in bytes
    upload_time TIMESTAMP DEFAULT NOW(),       -- Upload time
    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW()
 );
 COMMENT ON TABLE documents IS '文档元数据表';
 COMMENT ON COLUMN documents.doc_id IS '文档唯一标识，用于关联 Milvus 和其他表';
 COMMENT ON COLUMN documents.standard_number IS '标准编号，如 GB 14747-2006';
 -- ==================== Section structure table ====================
 -- One row per heading; parent_id links a section to its parent to form a tree.
 CREATE TABLE sections (
    id SERIAL PRIMARY KEY,
    doc_id VARCHAR(128) NOT NULL,
    unique_id VARCHAR(64) NOT NULL,            -- Unique identifier returned by Aliyun
    level INT NOT NULL,                        -- Heading level: 1, 2, 3...
    title VARCHAR(512) NOT NULL,               -- Section title
    page INT,                                  -- Page the section appears on
    index INT,                                 -- Order within the page
    parent_id INT,                             -- Parent section id (tree structure)
    created_at TIMESTAMP DEFAULT NOW(),
    CONSTRAINT fk_sections_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
    CONSTRAINT fk_sections_parent_id FOREIGN KEY (parent_id) REFERENCES sections(id),
    CONSTRAINT uq_sections_doc_unique UNIQUE (doc_id, unique_id)
 );
 COMMENT ON TABLE sections IS '章节结构表，用于目录导航';
 COMMENT ON COLUMN sections.parent_id IS '父章节 ID，构建树形结构';
 COMMENT ON COLUMN sections.level IS '层级深度，1 为最顶层';
 -- ==================== Semantic blocks table ====================
 -- One row per semantic block, holding the complete (unsplit) text of the block.
 CREATE TABLE semantic_blocks (
    id SERIAL PRIMARY KEY,
    doc_id VARCHAR(128) NOT NULL,
    semantic_id VARCHAR(64) NOT NULL,          -- Semantic block identifier (unique per document)
    block_type VARCHAR(32) NOT NULL,           -- Type: section_text/table/figure
    page_start INT NOT NULL,                   -- First page of the block
    page_end INT NOT NULL,                     -- Last page of the block
    section_id INT,                            -- Owning section
    section_title VARCHAR(512),                -- Section title (denormalized for easier querying)
    section_level INT,                         -- Section level
    source_ids JSONB,                          -- Original layout ids (JSON array)
    text TEXT NOT NULL,                        -- Complete content (not split)
    created_at TIMESTAMP DEFAULT NOW(),
    CONSTRAINT fk_semantic_blocks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
    CONSTRAINT fk_semantic_blocks_section_id FOREIGN KEY (section_id) REFERENCES sections(id),
    CONSTRAINT uq_semantic_blocks_doc_semantic UNIQUE (doc_id, semantic_id)
 );
 COMMENT ON TABLE semantic_blocks IS '语义块表，用于邻域扩展，恢复完整内容';
 COMMENT ON COLUMN semantic_blocks.block_type IS '类型：section_text（正文）、table（表格）、figure（图示）';
 COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组';
 COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容，未被切分';
 -- ==================== Vector chunk metadata table ====================
 -- One row per vector chunk, linked to its semantic block through (doc_id, semantic_id).
 CREATE TABLE vector_chunks (
    id SERIAL PRIMARY KEY,
    doc_id VARCHAR(128) NOT NULL,
    chunk_id VARCHAR(64) NOT NULL,             -- Milvus primary key
    semantic_id VARCHAR(64) NOT NULL,          -- Associated semantic block
    chunk_index INT NOT NULL,                  -- Chunk sequence number (document-wide)
    piece_index INT,                           -- Sequence number within the semantic block
    page_start INT,
    page_end INT,
    section_title VARCHAR(512),
    text VARCHAR(2048),                        -- Chunk text (optional; shortened version for display)
    source_ids JSONB,                          -- Original layout ids (JSON array)
    created_at TIMESTAMP DEFAULT NOW(),
    CONSTRAINT fk_vector_chunks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
    CONSTRAINT fk_vector_chunks_semantic_id FOREIGN KEY (doc_id, semantic_id)
        REFERENCES semantic_blocks(doc_id, semantic_id),
    CONSTRAINT uq_vector_chunks_doc_chunk UNIQUE (doc_id, chunk_id)
 );
 COMMENT ON TABLE vector_chunks IS '向量块元数据表，用于快速关联查询';
 COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键';
 COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号，用于按序拼接';
 -- ==================== Indexes ====================
 -- Secondary indexes on the document, section, block-type and chunk columns of the three child tables.
 CREATE INDEX idx_sections_doc_id ON sections(doc_id);
 CREATE INDEX idx_sections_parent_id ON sections(parent_id);
 CREATE INDEX idx_sections_level ON sections(level);
 CREATE INDEX idx_semantic_blocks_doc_id ON semantic_blocks(doc_id);
 CREATE INDEX idx_semantic_blocks_section_id ON semantic_blocks(section_id);
 CREATE INDEX idx_semantic_blocks_block_type ON semantic_blocks(block_type);
 CREATE INDEX idx_semantic_blocks_semantic_id ON semantic_blocks(semantic_id);
 CREATE INDEX idx_vector_chunks_doc_id ON vector_chunks(doc_id);
 CREATE INDEX idx_vector_chunks_semantic_id ON vector_chunks(semantic_id);
 CREATE INDEX idx_vector_chunks_chunk_id ON vector_chunks(chunk_id);
 -- ==================== Trigger: keep updated_at current ====================
 -- Trigger function that stamps the row being written with the current time.
 CREATE OR REPLACE FUNCTION update_updated_at()
 RETURNS TRIGGER AS $$
 BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
 END;
 $$ LANGUAGE plpgsql;
 -- Run the function before every row update on documents.
 CREATE TRIGGER tr_documents_updated_at
    BEFORE UPDATE ON documents
    FOR EACH ROW EXECUTE FUNCTION update_updated_at();
--- a/backend/app/aliyun_parser/upload_to_milvus.py
+++ b/backend/app/aliyun_parser/upload_to_milvus.py
@@ -1,327 +0,0 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """Handle Aliyun parsing support for upload to milvus."""
 import argparse
 import json
 import time
 from pathlib import Path
 from typing import List, Dict
 import psycopg2
 from psycopg2.extras import execute_values
 from pymilvus import (
    connections,
    Collection,
    FieldSchema,
    CollectionSchema,
    DataType,
    utility,
 )
 from openai import OpenAI
 import os
 # Embedding relay (OpenAI-compatible endpoint) used to vectorise chunk text.
 # The API key is read from the RELAY_API_KEY environment variable rather than
 # being committed to source control; --api-key still overrides it per run.
 RELAY_BASE_URL = "http://6.86.80.4:30080/v1"
 RELAY_API_KEY = os.environ.get("RELAY_API_KEY", "")
 EMBEDDING_MODEL = "text-embedding-v3"
 # Milvus connection defaults and target collection.
 MILVUS_HOST = "localhost"
 MILVUS_PORT = "19530"
 COLLECTION_NAME = "regulation_chunks"
 # PostgreSQL connection defaults. The password comes from the PG_PASSWORD
 # environment variable (or --pg-password) so no credential lives in the repo.
 PG_HOST = "6.86.80.10"
 PG_PORT = 5432
 PG_USER = "postgresql"
 PG_PASSWORD = os.environ.get("PG_PASSWORD", "")
 PG_DATABASE = "postgres"
 # ===================== Embedding =====================
 def get_openai_client(api_key: str, base_url: str) -> OpenAI:
    """Build an OpenAI-compatible client bound to the given key and endpoint."""
    client = OpenAI(base_url=base_url, api_key=api_key)
    return client
 def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10) -> List[List[float]]:
    """Embed ``texts`` in consecutive batches and return one vector per text.
    Args:
        client: OpenAI-compatible client used for the embedding requests.
        texts: Strings to embed; the result keeps the same order.
        batch_size: Maximum number of texts sent per request (must be >= 1).
    Returns:
        The embeddings of all batches concatenated in input order.
    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If a response carries a different number of vectors than
            the batch that was sent.
    """
    # A negative step would make range() empty and silently return no vectors.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    total_batches = (len(texts) + batch_size - 1) // batch_size
    all_embeddings: List[List[float]] = []
    for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
        batch = texts[start:start + batch_size]
        print(f"Embedding batch {batch_number}/{total_batches}...")
        response = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=batch,
        )
        embeddings = [item.embedding for item in response.data]
        # Callers pair vectors with chunks by position, so a short response
        # must fail here instead of shifting every later vector.
        if len(embeddings) != len(batch):
            raise RuntimeError(
                f"Embedding response returned {len(embeddings)} vectors for a batch of {len(batch)} texts"
            )
        all_embeddings.extend(embeddings)
    return all_embeddings
 # ===================== Milvus =====================
 def init_milvus(host: str, port: str):
    """Register the default Milvus connection for the given host and port."""
    alias = "default"
    connections.connect(alias, host=host, port=port)
    print(f"已连接 Milvus: {host}:{port}")
 def create_collection(name: str, dim: int) -> Collection:
    """Recreate the chunk collection and index its ``embedding`` field.
    An existing collection with the same name is dropped first, so any vectors
    already stored under that name are discarded.
    Args:
        name: Collection name.
        dim: Dimension of the ``embedding`` vector field.
    Returns:
        The newly created collection with its index built.
    """
    if utility.has_collection(name):
        print(f"Collection '{name}' 已存在，删除重建")
        utility.drop_collection(name)
    def varchar(field_name, max_length, **extra):
        return FieldSchema(name=field_name, dtype=DataType.VARCHAR, max_length=max_length, **extra)
    def int64(field_name):
        return FieldSchema(name=field_name, dtype=DataType.INT64)
    fields = [
        varchar("chunk_id", 64, is_primary=True),
        varchar("doc_id", 128),
        varchar("doc_title", 512),
        int64("chunk_index"),
        varchar("semantic_id", 64),
        varchar("chunk_type", 32),
        int64("page_start"),
        int64("page_end"),
        varchar("section_title", 512),
        varchar("text", 2048),
        # source_ids holds a JSON-encoded list, hence a string field.
        varchar("source_ids", 4096),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
    ]
    schema = CollectionSchema(fields, description="法规文档检索 chunks")
    collection = Collection(name, schema)
    # Cosine-similarity IVF_FLAT index on the vector field.
    collection.create_index(
        "embedding",
        {
            "metric_type": "COSINE",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 128},
        },
    )
    print(f"Collection '{name}' 创建完成，索引已建立")
    return collection
 def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[List[float]]):
    """Insert chunk rows and their vectors into the collection, then flush.
    Data is sent column by column, in the same order as the fields declared in
    ``create_collection``.
    Args:
        collection: Target collection.
        chunks: Chunk dicts; every scalar field listed below must be present.
        embeddings: One vector per chunk, in the same order as ``chunks``.
    """
    scalar_fields = (
        "chunk_id",
        "doc_id",
        "doc_title",
        "chunk_index",
        "semantic_id",
        "chunk_type",
        "page_start",
        "page_end",
        "section_title",
        "text",
    )
    columns = [[chunk[key] for chunk in chunks] for key in scalar_fields]
    # source_ids is optional and is stored as a JSON-encoded string.
    columns.append([json.dumps(chunk.get("source_ids", [])) for chunk in chunks])
    columns.append(embeddings)
    collection.insert(columns)
    collection.flush()
    print(f"已插入 {len(chunks)} 个 chunks")
 def load_collection(collection: Collection):
    """Load the collection into memory and report it on stdout."""
    collection.load()
    # Plain literal: the message has no placeholders, so no f-string is needed.
    print("Collection 已加载到内存")
 # ===================== PostgreSQL =====================
 def get_pg_connection(host: str, port: int, user: str, password: str, database: str):
    """Open a psycopg2 connection and report the target once connected."""
    connect_kwargs = {
        "host": host,
        "port": port,
        "user": user,
        "password": password,
        "database": database,
    }
    connection = psycopg2.connect(**connect_kwargs)
    print(f"已连接 PostgreSQL: {host}:{port}/{database}")
    return connection
 def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
    """Upsert the document, its semantic blocks and its chunk metadata in one transaction.
    Args:
        conn: Open database connection; it is committed on success and rolled
            back on any error, but not closed here.
        chunks: Chunk dicts written to ``vector_chunks``.
        doc_data: Parsed payload providing ``doc_id``, ``doc_title``, the
            optional ``standard_number`` and the optional ``semantic_blocks`` list.
    Raises:
        Exception: Any error raised while writing is re-raised after the rollback.
    """
    cursor = conn.cursor()
    try:
        # Upsert the document row; on conflict only title and updated_at are refreshed.
        cursor.execute("""
            INSERT INTO documents (doc_id, title, standard_number, upload_time)
            VALUES (%s, %s, %s, NOW())
            ON CONFLICT (doc_id) DO UPDATE SET title = EXCLUDED.title, updated_at = NOW()
        """, (doc_data["doc_id"], doc_data["doc_title"], doc_data.get("standard_number")))
        # Upsert semantic blocks (skipped when the payload has none); on conflict only text is updated.
        semantic_blocks = doc_data.get("semantic_blocks", [])
        if semantic_blocks:
            block_rows = [
                (
                    doc_data["doc_id"],
                    block["semantic_id"],
                    block["block_type"],
                    block["page_start"],
                    block["page_end"],
                    block.get("section_title"),
                    block.get("section_level"),
                    json.dumps(block.get("source_ids", [])),
                    block["text"],
                )
                for block in semantic_blocks
            ]
            execute_values(
                cursor,
                """
                INSERT INTO semantic_blocks
                (doc_id, semantic_id, block_type, page_start, page_end, section_title, section_level, source_ids, text)
                VALUES %s
                ON CONFLICT (doc_id, semantic_id) DO UPDATE SET text = EXCLUDED.text
                """,
                block_rows,
            )
            print(f"已插入 {len(semantic_blocks)} 个语义块")
        # Upsert chunk metadata; on conflict only text is updated.
        chunk_rows = [
            (
                doc_data["doc_id"],
                chunk["chunk_id"],
                chunk["semantic_id"],
                chunk["chunk_index"],
                chunk.get("piece_index"),
                chunk["page_start"],
                chunk["page_end"],
                chunk.get("section_title"),
                chunk["text"],
                json.dumps(chunk.get("source_ids", [])),
            )
            for chunk in chunks
        ]
        execute_values(
            cursor,
            """
            INSERT INTO vector_chunks
            (doc_id, chunk_id, semantic_id, chunk_index, piece_index, page_start, page_end, section_title, text, source_ids)
            VALUES %s
            ON CONFLICT (doc_id, chunk_id) DO UPDATE SET text = EXCLUDED.text
            """,
            chunk_rows,
        )
        print(f"已插入 {len(chunks)} 个向量块元数据")
        # Single commit so the three writes succeed or fail together.
        conn.commit()
        print("PostgreSQL 数据插入完成")
    except Exception as e:
        # Undo every write of this call before propagating the error.
        conn.rollback()
        raise e
    finally:
        cursor.close()
 # Keep parser integration steps explicit so external workflow behavior stays traceable.
 def load_data(file_path: Path) -> Dict:
    """Read a UTF-8 encoded JSON file and return the decoded payload."""
    return json.loads(file_path.read_text(encoding="utf-8"))
 def upload_to_milvus_and_pg(
    chunks_file: str,
    api_key: str,
    base_url: str,
    milvus_host: str,
    milvus_port: str,
    collection_name: str,
    batch_size: int,
    pg_host: str,
    pg_port: int,
    pg_user: str,
    pg_password: str,
    pg_database: str,
 ):
    """Embed the chunks in ``chunks_file`` and store them in Milvus and PostgreSQL.
    Steps: load the JSON payload, embed every chunk's ``embedding_text``,
    recreate the Milvus collection with the observed vector dimension, insert
    and load the vectors, then write the metadata rows to PostgreSQL.
    Args:
        chunks_file: Path to the JSON file holding a ``vector_chunks`` list.
        api_key: API key for the embedding endpoint.
        base_url: Base URL of the embedding endpoint.
        milvus_host: Milvus host.
        milvus_port: Milvus port.
        collection_name: Milvus collection to (re)create.
        batch_size: Number of texts per embedding request.
        pg_host: PostgreSQL host.
        pg_port: PostgreSQL port.
        pg_user: PostgreSQL user.
        pg_password: PostgreSQL password.
        pg_database: PostgreSQL database name.
    Raises:
        FileNotFoundError: If ``chunks_file`` does not exist.
        ValueError: If the payload has no ``vector_chunks``.
    """
    chunks_path = Path(chunks_file).expanduser().resolve()
    if not chunks_path.exists():
        raise FileNotFoundError(f"文件不存在: {chunks_path}")
    data = load_data(chunks_path)
    chunks = data.get("vector_chunks", [])
    if not chunks:
        raise ValueError("vector_chunks 为空")
    print(f"加载 {len(chunks)} 个 chunks")
    client = get_openai_client(api_key, base_url)
    init_milvus(milvus_host, milvus_port)
    pg_conn = get_pg_connection(pg_host, pg_port, pg_user, pg_password, pg_database)
    try:
        texts = [c["embedding_text"] for c in chunks]
        embeddings = get_embeddings_batch(client, texts, batch_size)
        print(f"生成 {len(embeddings)} 个向量")
        # The collection dimension is taken from the vectors actually returned.
        embedding_dim = len(embeddings[0])
        print(f"Embedding 维度: {embedding_dim}")
        collection = create_collection(collection_name, embedding_dim)
        insert_chunks(collection, chunks, embeddings)
        load_collection(collection)
        insert_chunks_to_pg(pg_conn, chunks, data)
    finally:
        # Close on every path so a failed embedding or insert does not leak
        # the PostgreSQL connection.
        pg_conn.close()
    print("上传完成！")
 # ===================== CLI =====================
 def main():
    """Parse the command-line options and run the upload pipeline."""
    parser = argparse.ArgumentParser(description="将 vector_chunks 向量化并上传到 Milvus 和 PostgreSQL")
    parser.add_argument("chunks_file", help="vector_chunks.json 文件路径")
    # Optional flags in declaration order, each with its add_argument settings.
    option_specs = [
        ("--api-key", {"default": RELAY_API_KEY, "help": "中转站 API Key"}),
        ("--base-url", {"default": RELAY_BASE_URL, "help": "中转站 Base URL"}),
        ("--milvus-host", {"default": MILVUS_HOST, "help": "Milvus host"}),
        ("--milvus-port", {"default": MILVUS_PORT, "help": "Milvus port"}),
        ("--collection", {"default": COLLECTION_NAME, "help": "Milvus collection 名称"}),
        ("--batch-size", {"type": int, "default": 10, "help": "Embedding 批量大小（中转站限制最大10）"}),
        ("--pg-host", {"default": PG_HOST, "help": "PostgreSQL host"}),
        ("--pg-port", {"type": int, "default": PG_PORT, "help": "PostgreSQL port"}),
        ("--pg-user", {"default": PG_USER, "help": "PostgreSQL user"}),
        ("--pg-password", {"default": PG_PASSWORD, "help": "PostgreSQL password"}),
        ("--pg-database", {"default": PG_DATABASE, "help": "PostgreSQL database"}),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    options = dict(vars(parser.parse_args()))
    # The flag is --collection, while the pipeline parameter is collection_name.
    options["collection_name"] = options.pop("collection")
    upload_to_milvus_and_pg(**options)
 if __name__ == "__main__":
    main()
--- a/backend/app/aliyun_parser/vector_chunks.json
+++ b/backend/app/aliyun_parser/vector_chunks.json
--- a/backend/app/aliyun_parser/嵌入和召回.md
+++ b/backend/app/aliyun_parser/嵌入和召回.md
@@ -1,263 +0,0 @@
 # 文档解析与向量检索说明
 ## 相关文件
 - `aliyun_doc_parser.py`：调用阿里云文档智能解析 PDF，生成原始 `layouts.json`
 - `layouts_to_vector_chunks.py`：把 `layouts.json` 转成适合向量数据库入库的三层结构
 - `layouts.json`：阿里云返回的原始布局结果
 - `vector_chunks.json`：转换后的结构化输出
 ## 一、`layouts.json` 的结构
 `layouts.json` 顶层是一个数组，每个元素代表一个布局块（layout）。常见字段如下：
 - `type`：主类型，例如 `title`、`text`、`table`、`figure`
 - `subType`：更细的语义类型，例如 `doc_title`、`para_title`、`para`、`picture`、`pic_title`、`pic_caption`
 - `text`：当前布局块的纯文本
 - `markdownContent`：带 markdown 标记的文本
 - `pageNum`：页码
 - `index`：页内顺序
 - `level`：标题层级
 - `uniqueId`：布局块唯一标识
 - `blocks`：更细粒度的文本与样式信息
 - `cells`：表格单元格，仅 `table` 类型存在
 这个结构不是简单 OCR 文本流，而是已经带有版面理解和语义分类的结构化数据。
 ## 二、推荐的三层转换结构
 ### 1. 结构层 `structure_nodes`
 结构层用于恢复文档标题树，不直接作为最终向量检索单元。
 示例：
 - `1 范围`
 - `2 规范性引用文件`
 - `3 术语和定义`
  - `3.1 儿童三轮车`
  - `3.2 轮距`
 结构层主要用于给下游 chunk 绑定 `section_path`。
 ### 2. 语义层 `semantic_blocks`
 语义层是按文档意义聚合后的内容块，主要分为三类：
 - `section_text`：同一章节下连续正文聚合而成
 - `table`：表格内容单独成块
 - `figure`：图、图名、图注等单独成块
 这一层比单 layout 更适合做语义理解，也适合后续做上下文扩展。
 ### 3. 检索层 `vector_chunks`
 检索层是最终写进向量数据库的 chunk。
 处理方式：
 - 对 `semantic_blocks` 中较短的块直接入库
 - 对较长的块按 `max_chars` 再切分
 - 相邻切片保留 `overlap_chars` 重叠
 - 每个 chunk 都带完整 metadata，便于后续过滤、重排和邻域扩展
 ## 三、当前转换脚本做了什么
 `layouts_to_vector_chunks.py` 当前已经实现：
 1. 过滤目录页噪声（如 `目次`）
 2. 根据标题层级维护章节路径
 3. 将正文聚合成 `section_text`
 4. 将表格单独转成 `table`
 5. 将图相关内容单独转成 `figure`
 6. 对长文本继续切分为最终 `vector_chunks`
 7. 为每个检索 chunk 生成 `embedding_text`
 ## 四、为什么不要直接按 layout 入库
 如果把 `layouts.json` 的每条 layout 直接做向量：
 - 颗粒度太碎
 - 标题和正文容易分离
 - 表格会丢失结构上下文
 - 图示信息无法完整表达
 - 检索命中结果噪声较大
 对于标准文档，最合适的单位通常不是“句子”，而是“条款语义块”。
 ## 五、建议的入库字段
 建议向量数据库每条记录至少保存：
 - `embedding_text`：用于生成向量
 - `text`：原始 chunk 文本
 - `chunk_id`
 - `semantic_id`
 - `chunk_type`：`section_text` / `table` / `figure`
 - `section_path`
 - `section_title`
 - `section_level`
 - `page_start`
 - `page_end`
 - `doc_id`
 - `doc_title`
 - `source_ids`
 其中：
 - 向量化字段：`embedding_text`
 - 展示字段：`text`
 - 检索增强字段：其余 metadata
 ## 六、推荐的检索方式
 不要只做最简单的 top-k 向量搜索，建议采用：
 **向量召回 + metadata 重排 + 邻域扩展**
 ### 1. 向量召回
 使用 `vector_chunks[*].embedding_text` 做 embedding，并在向量数据库中检索 top 10 ~ 15 条。
 查询时可以对用户问题做轻微改写，例如：
 原问题：
 `儿童三轮车的定义是什么？`
 可改写为：
 `请检索 GB 14747—2006 儿童三轮车安全要求 中关于“儿童三轮车定义”的条款、术语、表格或图示说明。`
 这样更适合标准文档检索。
 ### 2. metadata 重排
 向量召回后，根据 metadata 做轻量规则重排。
 常见规则：
 - `chunk_type == section_text`：对定义类、要求类问题优先级更高
 - `section_path` 命中查询关键词：例如查询“定义”时，`术语和定义` 章节优先
 - `chunk_type == table`：对“尺寸 / 参数 / 数值 / 对照 / 要求”类问题加权
 - `chunk_type == figure`：对“图 / 结构 / 状态 / 示意”类问题加权
 ### 3. 邻域扩展
 检索命中的是最终切片，但回答往往需要更完整上下文。
 建议命中某个 `vector_chunk` 后：
 1. 优先回捞同一个 `semantic_id` 下的所有 chunk
 2. 如果还不够，再补充同 `section_path`、相邻页码或相邻 `chunk_index` 的内容
 这样可以恢复完整条款，而不是只给模型一小段碎片。
 ## 七、不同问题的检索重点
 ### 1. 定义类问题
 例如：
 - `儿童三轮车的定义是什么？`
 - `轮距是什么意思？`
 优先检索：
 - `section_text`
 - `section_path` 中包含 `术语和定义` 的内容
 ### 2. 要求类问题
 例如：
 - `外露突出物有什么要求？`
 - `辅助推杆有哪些安全要求？`
 优先检索：
 - `section_text`
 - `table`
 ### 3. 数值 / 尺寸 / 对照类问题
 例如：
 - `鞍座到脚蹬距离要求是什么？`
 - `哪些项目需要满足规定尺寸？`
 优先检索：
 - `table`
 - `section_text`
 ### 4. 图示说明类问题
 例如：
 - `正常乘骑状态是什么意思？`
 - `图1表示什么？`
 优先检索：
 - `figure`
 - 同章节相邻 `section_text`
 ## 八、推荐的最终检索流程
 建议采用以下固定流程：
 1. 用 `vector_chunks.embedding_text` 做 embedding 检索
 2. 取 top 10 ~ 15 条候选
 3. 按 `chunk_type + section_path` 做规则重排
 4. 以 `semantic_id` 为中心回捞完整语义块
 5. 选 3 ~ 5 组上下文提供给大模型回答
 ## 九、给大模型的上下文组织方式
 最终不要直接把原始 JSON 扔给模型，建议整理成如下格式：
 ```text
 [命中片段 1]
 章节：3 术语和定义 > 3.1 儿童三轮车
 页码：1-2
 类型：section_text
 内容：
 ......
 [命中片段 2]
 章节：4 要求 > 4.3 外露突出物
 页码：5
 类型：section_text
 内容：
 ......
 [命中片段 3]
 章节：5 试验方法
 页码：8
 类型：table
 内容：
 ......
 ```
 这种格式更利于模型稳定回答并引用出处。
 ## 十、转换命令
 生成三层结构：
 ```bash
 python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
  --layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
  --out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json
 ```
 自定义切片大小：
 ```bash
 python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
  --layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
  --out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json \
  --max-chars 500 \
  --overlap-chars 80
 ```
--- a/backend/app/api/routes/status.py
+++ b/backend/app/api/routes/status.py
@@ -32,6 +32,10 @@ async def get_config():
        "embedding_dim": settings.embedding_dim,
        "embedding_base_url": settings.embedding_base_url,
        "milvus_collection": settings.milvus_collection,
        "parser_backend": settings.parser_backend,
        "chunk_backend": settings.chunk_backend,
        "artifact_prefix": settings.document_parse_artifact_prefix,
        "parser_failure_mode": settings.parser_failure_mode,
        "llm_provider": settings.llm_provider,
        "llm_model": settings.llm_model,
        "document_metadata_path": settings.document_metadata_path,
--- a/backend/app/application/documents/services.py
+++ b/backend/app/application/documents/services.py
@@ -5,6 +5,7 @@ from __future__ import annotations
 import os
 import tempfile
 import uuid
 import json
 from dataclasses import dataclass
 from loguru import logger
@@ -16,6 +17,7 @@ from app.domain.documents import (
    DocumentParser,
    DocumentRepository,
    DocumentStatus,
    ParsedDocument,
 )
 from app.domain.retrieval import EmbeddingProvider, VectorIndex
 # Keep orchestration logic centralized so use-case flow stays easy to trace.
@@ -54,6 +56,27 @@ class DocumentCommandService:
        self.embedding_provider = embedding_provider
        self.vector_index = vector_index
    def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
        """Persist parse artifacts so troubleshooting does not depend on provider retention windows."""
        prefix = f"{parsed_document.metadata.get('artifact_prefix', 'artifacts').strip('/')}/{doc_id}"
        artifact_payloads = {
            "layouts": parsed_document.raw_layouts,
            "structure_nodes": parsed_document.structure_nodes,
            "semantic_blocks": parsed_document.semantic_blocks,
            "vector_chunks": parsed_document.vector_chunks,
        }
        artifact_keys: dict[str, str] = {}
        for name, payload in artifact_payloads.items():
            object_name = f"{prefix}/{name}.json"
            self.binary_store.save(
                object_name=object_name,
                data=json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8"),
                content_type="application/json",
                metadata={"doc_id": doc_id, "artifact_type": name},
            )
            artifact_keys[name] = object_name
        return artifact_keys
    def upload_and_process(
        self,
        *,
@@ -104,11 +127,21 @@ class DocumentCommandService:
                doc_id=doc_id,
                doc_name=final_doc_name,
            )
            artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
            self.document_repository.update_status(
                doc_id,
                DocumentStatus.PARSED,
                parser_name=parsed_document.parser_name,
-                metadata={"structure_nodes": len(parsed_document.structure_nodes)},
+                metadata={
                    "parser_backend": parsed_document.parser_name,
                    "parse_task_id": parsed_document.metadata.get("task_id", ""),
                    "layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
                    "structure_node_count": len(parsed_document.structure_nodes),
                    "semantic_block_count": len(parsed_document.semantic_blocks),
                    "vector_chunk_count": len(parsed_document.vector_chunks),
                    "artifact_keys": artifact_keys,
                    "processing_stage": "parsed",
                },
            )
            chunks = self.chunk_builder.build(
@@ -124,13 +157,18 @@ class DocumentCommandService:
            if inserted != len(chunks):
                logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))
            health = self.vector_index.health()
            self.document_repository.update_status(
                doc_id,
                DocumentStatus.INDEXED,
                chunk_count=len(chunks),
                summary="",
                summary_latency_ms=0,
-                index_name=self.vector_index.health().get("collection_name", ""),
+                index_name=health.get("collection_name", ""),
                metadata={
                    "index_collection": health.get("collection_name", ""),
                    "processing_stage": "indexed",
                },
            )
            stored = self.document_repository.get(doc_id)
            return DocumentProcessResult(
@@ -148,6 +186,10 @@ class DocumentCommandService:
                doc_id,
                DocumentStatus.FAILED,
                error_message=str(exc),
                metadata={
                    "failure_reason": str(exc),
                    "processing_stage": "failed",
                },
            )
            return DocumentProcessResult(
                doc_id=doc_id,
--- a/backend/app/config/settings.py
+++ b/backend/app/config/settings.py
@@ -1,9 +1,9 @@
-"""Configure backend settings for settings."""
+"""Configure backend settings for the backend application."""
 from pathlib import Path
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from functools import lru_cache
 # Keep configuration setup explicit so runtime behavior is easy to reason about.
@@ -33,18 +33,25 @@ class Settings(BaseSettings):
    # Keep configuration setup explicit so runtime behavior is easy to reason about.
    milvus_host: str = Field(default="localhost", description="Milvus服务地址")
    milvus_port: int = Field(default=19530, description="Milvus服务端口")
-    milvus_collection: str = Field(default="regulations_dense_1536", description="法规向量集合名称")
+    milvus_collection: str = Field(default="regulations_dense_1024_v1", description="法规向量集合名称")
    milvus_db_name: str = Field(default="default", description="Milvus数据库名称")
    # Keep configuration setup explicit so runtime behavior is easy to reason about.
    embedding_model: str = Field(default="text-embedding-v3", description="嵌入模型名称")
-    embedding_dim: int = Field(default=1536, description="嵌入向量维度")
+    embedding_dim: int = Field(default=1024, description="嵌入向量维度")
    embedding_api_key: str = Field(default="", description="Embedding API密钥")
    embedding_base_url: str = Field(default="http://6.86.80.4:30080/v1", description="Embedding API地址")
    embedding_timeout_seconds: int = Field(default=120, description="Embedding API超时时间(秒)")
    alibaba_access_key_id: str = Field(default="", description="阿里云文档解析 Access Key ID")
    alibaba_access_key_secret: str = Field(default="", description="阿里云文档解析 Access Key Secret")
    alibaba_endpoint: str = Field(default="docmind-api.cn-hangzhou.aliyuncs.com", description="阿里云文档解析 endpoint")
    aliyun_parse_poll_interval_seconds: int = Field(default=5, description="阿里云文档解析轮询间隔(秒)")
    aliyun_parse_timeout_seconds: int = Field(default=900, description="阿里云文档解析超时时间(秒)")
    aliyun_parse_layout_step_size: int = Field(default=50, description="阿里云文档解析分页步长")
    aliyun_llm_enhancement: bool = Field(default=True, description="是否启用阿里云解析增强")
    aliyun_enhancement_mode: str = Field(default="VLM", description="阿里云解析增强模式")
    document_parse_artifact_prefix: str = Field(default="artifacts", description="解析产物对象前缀")
    parser_failure_mode: str = Field(default="fail", description="解析失败策略")
    # Keep configuration setup explicit so runtime behavior is easy to reason about.
    minio_endpoint: str = Field(default="localhost:9000", description="MinIO服务地址")
@@ -71,8 +78,8 @@ class Settings(BaseSettings):
    chunk_overlap: int = Field(default=50, description="分块重叠大小")
    max_file_size_mb: int = Field(default=100, description="最大文件大小(MB)")
    document_metadata_path: str = Field(default="backend/data/documents.json", description="文档元数据存储路径")
-    parser_backend: str = Field(default="local", description="解析后端(local/aliyun)")
+    parser_backend: str = Field(default="aliyun", description="解析后端(local/aliyun)")
-    chunk_backend: str = Field(default="local", description="分块后端(local/aliyun)")
+    chunk_backend: str = Field(default="aliyun", description="分块后端(local/aliyun)")
    # Keep configuration setup explicit so runtime behavior is easy to reason about.
    api_host: str = Field(default="0.0.0.0", description="API服务地址")
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -27,12 +27,12 @@ class Settings(BaseSettings):
    # Milvus
    milvus_host: str = "localhost"
    milvus_port: int = 19530
-    milvus_collection: str = "regulations_dense_1536"
+    milvus_collection: str = "regulations_dense_1024_v1"
    # LLM / embedding defaults aligned with the migrated backend path.
    llm_model: str = "qwen-max"
    embedding_model: str = "text-embedding-v3"
-    embedding_dim: int = 1536
+    embedding_dim: int = 1024
    # Legacy workflow compatibility only.
    vector_top_k: int = 10
@@ -47,7 +47,7 @@ class Settings(BaseSettings):
    api_port: int = 8000
    # Legacy aliases retained for old utility modules.
-    regulations_collection: str = "regulations_dense_1536"
+    regulations_collection: str = "regulations_dense_1024_v1"
    compliance_collection: str = "compliance_cache"
 # Preserve the legacy module API while keeping env resolution centralized at the repo root.
--- a/backend/app/domain/documents/models.py
+++ b/backend/app/domain/documents/models.py
@@ -56,6 +56,7 @@ class ParsedDocument:
    vector_chunks: list[dict[str, Any]]
    parser_name: str
    raw_text: str = ""
    raw_layouts: list[dict[str, Any]] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
--- a/backend/app/infrastructure/embedding/openai_compatible_embedding_provider.py
+++ b/backend/app/infrastructure/embedding/openai_compatible_embedding_provider.py
@@ -10,6 +10,8 @@ from app.config.settings import settings
 from app.domain.retrieval import EmbeddingProvider
 # Keep adapter behavior explicit so integration details remain easy to audit.
 EMBEDDING_BATCH_SIZE = 8
 class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
@@ -27,6 +29,18 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
        self.timeout = settings.embedding_timeout_seconds
        self.dimension = settings.embedding_dim
    def _raise_for_status(self, response: httpx.Response, *, batch_size: int) -> None:
        """Raise a detailed error so upstream gateway failures are easier to diagnose.
        Args:
            response: Response returned by the embedding endpoint.
            batch_size: Number of texts in the request, included in the error message.
        Raises:
            httpx.HTTPStatusError: When ``response.raise_for_status()`` fails. The
                message carries the model, batch size, status code, request URL
                and the first 500 characters of the response body; the original
                error is chained as the cause.
        """
        try:
            response.raise_for_status()
        except httpx.HTTPStatusError as exc:
            # Cap the body excerpt so a large error page does not flood the message.
            response_preview = response.text[:500].strip()
            detail = (
                f"Embedding request failed for model={self.model}, batch_size={batch_size}, "
                f"status={response.status_code}, url={response.request.url}, response={response_preview}"
            )
            # Same exception type as before, so existing handlers keep matching.
            raise httpx.HTTPStatusError(detail, request=exc.request, response=exc.response) from exc
    def _request(self, texts: list[str]) -> list[list[float]]:
        """Handle request for this module for the Open A I Compatible Embedding Provider instance."""
        if not self.api_key:
@@ -40,7 +54,7 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
            json={"model": self.model, "input": texts},
            timeout=self.timeout,
        )
-        response.raise_for_status()
+        self._raise_for_status(response, batch_size=len(texts))
        data = response.json()
        vectors = [item["embedding"] for item in sorted(data.get("data", []), key=lambda item: item["index"])]
        if any(len(vector) != self.dimension for vector in vectors):
@@ -51,7 +65,12 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
        """Embed texts for the Open A I Compatible Embedding Provider instance."""
        if not texts:
            return []
-        return self._request(texts)
+        vectors: list[list[float]] = []
        # Batch requests conservatively because some gateways reject larger embedding payloads.
        for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
            batch = texts[start:start + EMBEDDING_BATCH_SIZE]
            vectors.extend(self._request(batch))
        return vectors
    def embed_query(self, text: str) -> list[float]:
        """Embed query for the Open A I Compatible Embedding Provider instance."""
--- a/backend/app/infrastructure/parser/aliyun_docmind_gateway.py
+++ b/backend/app/infrastructure/parser/aliyun_docmind_gateway.py
@@ -0,0 +1,142 @@
 """Aliyun Docmind gateway helpers for the document ingest pipeline."""
 from __future__ import annotations
 import time
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 from alibabacloud_docmind_api20220711 import models as docmind_models
 from alibabacloud_docmind_api20220711.client import Client as DocmindClient
 from alibabacloud_tea_openapi import models as open_api_models
 from alibabacloud_tea_util import models as util_models
 from app.config.settings import settings
 # Keep provider-specific behavior isolated so the rest of the backend can stay stable.
@dataclass
 class AliyunParsePayload:
    """Carry the raw result of one Aliyun Docmind parse job.

    Instances are produced by ``AliyunDocmindGateway.parse_document`` and hold
    the collected layouts together with simple timing and polling metrics.
    """
    # Task identifier returned when the parse job was submitted.
    task_id: str
    # Layout records accumulated across all paginated result requests.
    layouts: list[dict[str, Any]]
    # Number of status queries issued before the job reported success.
    poll_attempts: int
    # Elapsed time from just before submission until all results were collected, in milliseconds.
    duration_ms: int
 class AliyunDocmindGateway:
    """Drive one Aliyun Docmind parse job: submit, poll, then page through results.

    Tunables (endpoint, polling interval, timeout, page size, enhancement
    flags) are read from ``settings`` when the gateway is constructed;
    credentials are read from ``settings`` each time a client is created.
    """

    # Lower-cased status values that end polling with an error.
    # NOTE(review): Docmind documents the failure status as "Fail"; "failed" is
    # also accepted defensively. Confirm against the QueryDocParserStatus reference.
    _FAILURE_STATUSES = frozenset({"fail", "failed"})

    def __init__(self) -> None:
        """Snapshot the parser-related runtime configuration from ``settings``."""
        self.endpoint = settings.alibaba_endpoint
        self.poll_interval_seconds = settings.aliyun_parse_poll_interval_seconds
        self.timeout_seconds = settings.aliyun_parse_timeout_seconds
        self.layout_step_size = settings.aliyun_parse_layout_step_size
        self.llm_enhancement = settings.aliyun_llm_enhancement
        self.enhancement_mode = settings.aliyun_enhancement_mode

    def parse_document(self, *, file_path: str) -> AliyunParsePayload:
        """Parse a single document and return the collected layouts.

        Args:
            file_path: Path of the local file to upload for parsing.

        Returns:
            The task id, all layouts, the number of status polls, and the
            total wall time in milliseconds measured on a monotonic clock.

        Raises:
            ValueError: If the AccessKey settings are missing.
            RuntimeError: If no task id is returned, the status payload is
                empty, the task fails, or no layouts are returned.
            TimeoutError: If the task does not finish within the timeout.
        """
        client = self._create_client()
        started_at = time.monotonic()
        task_id = self._submit_job(client=client, file_path=file_path)
        poll_attempts = self._wait_for_completion(
            client=client,
            task_id=task_id,
            started_at=started_at,
        )
        layouts = self._collect_all_results(client=client, task_id=task_id)
        elapsed_ms = int((time.monotonic() - started_at) * 1000)
        return AliyunParsePayload(
            task_id=task_id,
            layouts=layouts,
            poll_attempts=poll_attempts,
            duration_ms=elapsed_ms,
        )

    def _create_client(self) -> DocmindClient:
        """Create a Docmind client using explicit AccessKey settings only.

        Raises:
            ValueError: If either AccessKey setting is empty.
        """
        key_id = settings.alibaba_access_key_id
        key_secret = settings.alibaba_access_key_secret
        if not key_id or not key_secret:
            raise ValueError(
                "Missing Aliyun parser credentials. Set ALIBABA_ACCESS_KEY_ID and "
                "ALIBABA_ACCESS_KEY_SECRET in the project root .env."
            )
        # Keep production behavior deterministic by using only project-configured credentials.
        config = open_api_models.Config()
        config.endpoint = self.endpoint
        config.access_key_id = key_id
        config.access_key_secret = key_secret
        return DocmindClient(config)

    def _submit_job(self, *, client: DocmindClient, file_path: str) -> str:
        """Submit an asynchronous Docmind parse job and return its task id.

        Raises:
            RuntimeError: If the response carries no task id.
        """
        source = Path(file_path)
        # The request must be sent while the stream is still open.
        with open(file_path, "rb") as file_stream:
            request = docmind_models.SubmitDocParserJobAdvanceRequest(
                file_url_object=file_stream,
                file_name=source.name,
                file_name_extension=source.suffix.lstrip("."),
                llm_enhancement=self.llm_enhancement,
                enhancement_mode=self.enhancement_mode,
            )
            runtime = util_models.RuntimeOptions()
            response = client.submit_doc_parser_job_advance(request, runtime)
        body = response.body
        task_id = body.data.id if body and body.data else ""
        if not task_id:
            raise RuntimeError("Aliyun Docmind did not return a parse task id.")
        return task_id

    def _query_status(self, *, client: DocmindClient, task_id: str) -> dict[str, Any] | None:
        """Query the current parse status; return ``None`` when the response has no data."""
        request = docmind_models.QueryDocParserStatusRequest(id=task_id)
        response = client.query_doc_parser_status(request)
        body = response.body
        if body and body.data:
            return body.data.to_map()
        return None

    def _wait_for_completion(self, *, client: DocmindClient, task_id: str, started_at: float) -> int:
        """Poll until the parse job finishes, fails, or times out.

        Args:
            client: Docmind client used for the status queries.
            task_id: Identifier of the submitted parse task.
            started_at: ``time.monotonic()`` reading the timeout is measured from.

        Returns:
            The number of status queries issued, including the successful one.

        Raises:
            RuntimeError: If the status payload is empty or reports a failure.
            TimeoutError: If the elapsed time exceeds ``timeout_seconds``.
        """
        poll_attempts = 0
        while True:
            poll_attempts += 1
            status_payload = self._query_status(client=client, task_id=task_id)
            if not status_payload:
                raise RuntimeError(f"Aliyun parse status payload is empty for task {task_id}.")
            # Compare case-insensitively so casing differences cannot hide a terminal state.
            status = str(status_payload.get("Status", "")).lower()
            if status == "success":
                return poll_attempts
            # Fail fast on either spelling instead of polling a dead task until the timeout.
            if status in self._FAILURE_STATUSES:
                raise RuntimeError(f"Aliyun parse task failed: {status_payload}")
            elapsed = time.monotonic() - started_at
            if elapsed > self.timeout_seconds:
                raise TimeoutError(
                    f"Aliyun parse task timed out after {self.timeout_seconds}s: task_id={task_id}"
                )
            time.sleep(self.poll_interval_seconds)

    def _collect_all_results(self, *, client: DocmindClient, task_id: str) -> list[dict[str, Any]]:
        """Collect all paginated layout results from a completed parse task.

        Raises:
            RuntimeError: If no layouts are returned at all.
        """
        all_layouts: list[dict[str, Any]] = []
        layout_num = 0
        while True:
            request = docmind_models.GetDocParserResultRequest(
                id=task_id,
                layout_step_size=self.layout_step_size,
                layout_num=layout_num,
            )
            response = client.get_doc_parser_result(request)
            payload = response.body.data if response.body else None
            if not payload:
                break
            layouts = payload.get("layouts", [])
            if not layouts:
                break
            all_layouts.extend(layouts)
            # Advance by what was actually received; a short page marks the end.
            layout_num += len(layouts)
            if len(layouts) < self.layout_step_size:
                break
        if not all_layouts:
            raise RuntimeError(f"Aliyun parse task returned no layouts: task_id={task_id}")
        return all_layouts
--- a/backend/app/infrastructure/parser/aliyun_document_parser.py
+++ b/backend/app/infrastructure/parser/aliyun_document_parser.py
@@ -1,19 +1,18 @@
-"""Implement infrastructure support for aliyun document parser."""
+"""Implement infrastructure support for Aliyun document parsing."""
 from __future__ import annotations
-from app.aliyun_parser.parse_pdf import (
+from app.config.settings import settings
 from app.domain.documents import DocumentParser, ParsedDocument
 from app.infrastructure.parser.aliyun_docmind_gateway import AliyunDocmindGateway
 from app.infrastructure.parser.aliyun_layout_normalizer import (
    MAX_CHARS,
    OVERLAP_CHARS,
    build_semantic_blocks,
    build_structure_nodes,
    build_vector_chunks,
    collect_all_results,
    init_client,
    submit_job,
    wait_for_completion,
 )
-from app.domain.documents import DocumentParser, ParsedDocument
+
 # Keep adapter behavior explicit so integration details remain easy to audit.
@@ -22,13 +21,14 @@ class AliyunDocumentParser(DocumentParser):
    """Provide the Aliyun Document Parser parser."""
    parser_name = "aliyun_docmind"
    def __init__(self) -> None:
        """Initialize the parser adapter and its gateway dependency."""
        self.gateway = AliyunDocmindGateway()
    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Handle parse for the Aliyun Document Parser instance."""
-        client = init_client()
+        payload = self.gateway.parse_document(file_path=file_path)
-        task_id = submit_job(client, file_path)
+        layouts = payload.layouts
        if not wait_for_completion(client, task_id):
            raise RuntimeError("阿里云文档解析任务失败")
        layouts = collect_all_results(client, task_id)
        structure_nodes = build_structure_nodes(layouts)
        semantic_blocks = build_semantic_blocks(layouts)
        vector_chunks = build_vector_chunks(
@@ -51,5 +51,13 @@ class AliyunDocumentParser(DocumentParser):
            vector_chunks=vector_chunks,
            parser_name=self.parser_name,
            raw_text=raw_text,
-            metadata={"task_id": task_id, "layout_count": len(layouts)},
+            raw_layouts=layouts,
            metadata={
                "task_id": payload.task_id,
                "layout_count": len(layouts),
                "poll_attempts": payload.poll_attempts,
                "duration_ms": payload.duration_ms,
                "parser_backend": self.parser_name,
                "artifact_prefix": settings.document_parse_artifact_prefix,
            },
        )
--- a/backend/app/infrastructure/parser/aliyun_layout_normalizer.py
+++ b/backend/app/infrastructure/parser/aliyun_layout_normalizer.py
@@ -0,0 +1,336 @@
 """Normalize Aliyun Docmind layouts into production document structures."""
 from __future__ import annotations
 import re
 from typing import Any
 # Keep layout normalization rules centralized so parser and demos stay aligned.
 # Chunk window and overlap sizes, in characters. Both are imported by the Aliyun
 # document parser adapter; presumably passed to build_vector_chunks — TODO confirm.
 MAX_CHARS = 600
 OVERLAP_CHARS = 80
 # Exact heading texts that mark a table of contents ("目次" / "目录").
 TOC_TITLES = {"目次", "目录"}
 # subType values that is_title treats as headings.
 TITLE_SUBTYPES = {"doc_title", "para_title"}
 # subType values that is_text accepts as paragraph text; a missing subType counts as "none".
 TEXT_SUBTYPES = {"para", "none"}
 # type / subType values that is_figure treats as figure-related content.
 FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
 FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
 def normalize_text(text: str) -> str:
    """Normalize raw text content emitted by the parser.

    Carriage returns become newlines, non-breaking and ideographic spaces
    become regular spaces, runs of newlines collapse to one newline, runs of
    spaces/tabs collapse to one space, and the result is stripped.

    Args:
        text: Raw text; a falsy value (empty string or ``None``) yields ``""``.

    Returns:
        The normalized text.
    """
    if not text:
        return ""
    text = text.replace("\r", "\n")
    # Spell the special spaces as escapes: a literal invisible character is
    # indistinguishable from a plain space and easily lost in transit.
    text = text.replace("\u00a0", " ").replace("\u3000", " ")
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()
 def get_page(layout: dict[str, Any]) -> int:
    """Look up a layout's page number, preferring ``pageNum`` over ``pageNumber``.

    Falls back to 0 when neither key is present.
    """
    if "pageNum" in layout:
        return layout["pageNum"]
    return layout.get("pageNumber", 0)
 def get_text(layout: dict[str, Any]) -> str:
    """Pick the layout's normalized ``text``, falling back to ``markdownContent``.

    The markdown content is only consulted when the normalized text is empty.
    """
    primary = normalize_text(layout.get("text", ""))
    return primary or normalize_text(layout.get("markdownContent", ""))
 def is_title(layout: dict[str, Any]) -> bool:
    """Tell whether a layout is a heading, judged by its type or its subType."""
    if layout.get("type") == "title":
        return True
    return layout.get("subType") in TITLE_SUBTYPES
 def is_text(layout: dict[str, Any]) -> bool:
    """Tell whether a layout is plain paragraph text.

    The type must be ``text``; a missing subType is treated as ``none``.
    """
    if layout.get("type") != "text":
        return False
    sub_type = layout.get("subType", "none")
    return sub_type in TEXT_SUBTYPES
 def is_figure(layout: dict[str, Any]) -> bool:
    """Tell whether a layout is figure-related, judged by its type or its subType."""
    if layout.get("type") in FIGURE_TYPES:
        return True
    return layout.get("subType") in FIGURE_SUBTYPES
 def is_table(layout: dict[str, Any]) -> bool:
    """Tell whether a layout's type marks it as a table."""
    layout_type = layout.get("type")
    return layout_type == "table"
 def is_toc_layout(layout: dict[str, Any]) -> bool:
    """Tell whether a layout looks like part of a table of contents.

    A layout qualifies when its text is exactly a known TOC heading, or when
    it is on page number 1 and reads like a numbered entry followed by leader
    dots and a trailing page number.
    """
    content = get_text(layout)
    if content in TOC_TITLES:
        return True
    on_page_one = get_page(layout) == 1
    return bool(on_page_one and re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", content))
 def extract_table_text(layout: dict[str, Any]) -> str:
    """Flatten a table layout into plain text, one line per non-empty cell.

    Within a cell, the normalized texts of its nested layouts are joined with
    single spaces; cells without any text are dropped.
    """
    lines: list[str] = []
    for cell in layout.get("cells", []):
        normalized = (normalize_text(inner.get("text", "")) for inner in cell.get("layouts", []))
        fragments = [fragment for fragment in normalized if fragment]
        if fragments:
            lines.append(" ".join(fragments))
    return "\n".join(lines).strip()
 def build_structure_nodes(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collect one structure node per usable title layout, in input order.

    Title layouts with empty text, or whose text is a TOC heading, are skipped.
    """
    headings: list[dict[str, Any]] = []
    for layout in layouts:
        if is_title(layout):
            title = get_text(layout)
            if title and title not in TOC_TITLES:
                node = {
                    "unique_id": layout.get("uniqueId"),
                    "page": get_page(layout),
                    "index": layout.get("index", 0),
                    "level": layout.get("level", 0),
                    "title": title,
                    "type": layout.get("type"),
                    "sub_type": layout.get("subType"),
                }
                headings.append(node)
    return headings
 def update_section_path(
    section_stack: list[dict[str, Any]],
    layout: dict[str, Any],
 ) -> list[dict[str, Any]]:
    """Push a title layout onto the heading stack, replacing same-or-deeper entries.

    The stack is modified in place and also returned. Entries whose level is
    greater than or equal to the new title's level are popped first.
    """
    incoming_level = layout.get("level", 0)
    heading = {
        "level": incoming_level,
        "title": get_text(layout),
        "page": get_page(layout),
        "unique_id": layout.get("uniqueId"),
    }
    while section_stack:
        if section_stack[-1]["level"] < incoming_level:
            break
        section_stack.pop()
    section_stack.append(heading)
    return section_stack
 def section_path_titles(section_stack: list[dict[str, Any]]) -> list[str]:
    """List the titles held in the heading stack, outermost first."""
    titles: list[str] = []
    for entry in section_stack:
        titles.append(entry["title"])
    return titles
 def flush_text_block(
    blocks: list[dict[str, Any]],
    semantic_blocks: list[dict[str, Any]],
    block_id: int,
 ) -> int:
    """Merge buffered paragraph entries into one ``section_text`` block.

    The merged block is appended to ``semantic_blocks``. Section fields come
    from the first buffered entry; the page range spans all entries.

    Returns:
        ``block_id + 1`` when a block was appended, otherwise ``block_id``
        unchanged (empty buffer or no text left after merging).
    """
    if not blocks:
        return block_id
    non_empty = [entry["text"] for entry in blocks if entry["text"]]
    combined = "\n".join(non_empty).strip()
    if not combined:
        return block_id
    pages = [entry["page"] for entry in blocks]
    first = blocks[0]
    merged_block = {
        "semantic_id": f"semantic-{block_id}",
        "block_type": "section_text",
        "page_start": min(pages),
        "page_end": max(pages),
        "section_path": first["section_path"],
        "section_level": first["section_level"],
        "section_title": first["section_title"],
        "source_ids": [entry["unique_id"] for entry in blocks if entry.get("unique_id")],
        "text": combined,
    }
    semantic_blocks.append(merged_block)
    return block_id + 1
 def _standalone_block(
    *,
    block_id: int,
    block_type: str,
    layout: dict[str, Any],
    text: str,
    page: int,
    section_path: list[str],
 ) -> dict[str, Any]:
    """Build the semantic block for a single table or figure layout."""
    unique_id = layout.get("uniqueId")
    return {
        "semantic_id": f"semantic-{block_id}",
        "block_type": block_type,
        "page_start": page,
        "page_end": page,
        "section_path": section_path,
        "section_level": len(section_path),
        "section_title": section_path[-1] if section_path else "未分类",
        # Drop a missing id instead of emitting [None], matching flush_text_block.
        "source_ids": [unique_id] if unique_id else [],
        "text": text,
    }
 def build_semantic_blocks(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build semantic content blocks from raw Aliyun layouts.

    Layouts are walked in order. Titles update the current heading stack,
    consecutive paragraph layouts are buffered and merged into one
    ``section_text`` block, and each table or figure with text becomes its
    own block. Any title, table, or figure flushes the paragraph buffer first.

    Args:
        layouts: Raw layout records in document order.

    Returns:
        Semantic blocks with ids numbered ``semantic-1``, ``semantic-2``, ...
    """
    semantic_blocks: list[dict[str, Any]] = []
    section_stack: list[dict[str, Any]] = []
    pending_text_blocks: list[dict[str, Any]] = []
    block_id = 1
    skip_toc_page = False
    for layout in layouts:
        text = get_text(layout)
        page = get_page(layout)
        if is_toc_layout(layout):
            skip_toc_page = True
            continue
        # NOTE(review): the skip is tied to page number 1, so a TOC detected on
        # any other page only drops its heading line — confirm this is intended.
        if skip_toc_page:
            if page == 1:
                continue
            skip_toc_page = False
        if is_title(layout):
            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
            pending_text_blocks = []
            section_stack = update_section_path(section_stack, layout)
            continue
        section_path = section_path_titles(section_stack)
        # Tables take precedence over figures when a layout matches both.
        if is_table(layout):
            block_type = "table"
        elif is_figure(layout):
            block_type = "figure"
        else:
            block_type = ""
        if block_type:
            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
            pending_text_blocks = []
            block_text = extract_table_text(layout) if block_type == "table" else text
            if block_text:
                semantic_blocks.append(
                    _standalone_block(
                        block_id=block_id,
                        block_type=block_type,
                        layout=layout,
                        text=block_text,
                        page=page,
                        section_path=section_path,
                    )
                )
                block_id += 1
            continue
        if is_text(layout) and text:
            pending_text_blocks.append(
                {
                    "page": page,
                    "text": text,
                    "unique_id": layout.get("uniqueId"),
                    "section_path": section_path,
                    "section_level": len(section_path),
                    "section_title": section_path[-1] if section_path else "未分类",
                }
            )
    # Emit whatever paragraphs are still buffered at the end of the document.
    flush_text_block(pending_text_blocks, semantic_blocks, block_id)
    return semantic_blocks
 def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> list[str]:
    """Split long text into overlapping windows for embedding.

    Args:
        text: Text to split; surrounding whitespace is stripped first.
        max_chars: Maximum window length in characters.
        overlap_chars: Characters shared by consecutive windows. Values
            outside ``0 .. max_chars - 1`` are clamped into that range so
            every window starts after the previous one.

    Returns:
        The stripped, non-empty windows in order. Text that already fits in
        one window is returned as a single item; empty text yields ``[]``.

    Raises:
        ValueError: If ``max_chars`` is not positive while there is text to split.
    """
    text = text.strip()
    if len(text) <= max_chars:
        return [text] if text else []
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    # An overlap as large as the window would never advance the start offset.
    overlap = min(max(overlap_chars, 0), max_chars - 1)
    stride = max_chars - overlap
    parts: list[str] = []
    for start in range(0, len(text), stride):
        end = start + max_chars
        parts.append(text[start:end].strip())
        if end >= len(text):
            break
    return [part for part in parts if part]
 def build_vector_chunks(
    semantic_blocks: list[dict[str, Any]],
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
 ) -> list[dict[str, Any]]:
    """Turn semantic blocks into retrieval chunks with context-enriched embedding text.

    Each block's text is split into overlapping pieces. Every piece becomes a
    chunk that copies the block's section and page fields and carries an
    ``embedding_text`` prefixed with the document title and, when present,
    the section path. Chunk indexes run from 1 across the whole document.
    """
    chunks: list[dict[str, Any]] = []
    next_index = 1
    for block in semantic_blocks:
        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
        if not pieces:
            continue
        # The header depends only on the block, so build it once per block.
        if block["section_path"]:
            header = f"标准：{doc_title}\n章节：{' > '.join(block['section_path'])}\n\n"
        else:
            header = f"标准：{doc_title}\n\n"
        for piece_index, piece in enumerate(pieces, start=1):
            # Preserve enriched embedding text so retrieval keeps section context.
            chunk = {
                "doc_id": doc_id,
                "doc_title": doc_title,
                "chunk_id": f"chunk-{next_index}",
                "chunk_index": next_index,
                "semantic_id": block["semantic_id"],
                "chunk_type": block["block_type"],
                "piece_index": piece_index,
                "page_start": block["page_start"],
                "page_end": block["page_end"],
                "section_path": block["section_path"],
                "section_level": block["section_level"],
                "section_title": block["section_title"],
                "source_ids": block["source_ids"],
                "text": piece,
                "embedding_text": header + piece,
            }
            chunks.append(chunk)
            next_index += 1
    return chunks
 def convert_layouts(
    layouts: list[dict[str, Any]],
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
 ) -> dict[str, Any]:
    """Run the full normalization pipeline over raw Aliyun layouts.

    Returns:
        A dict with the document id and title plus the three derived layers:
        ``structure_nodes``, ``semantic_blocks``, and ``vector_chunks``.
    """
    result: dict[str, Any] = {"doc_id": doc_id, "doc_title": doc_title}
    result["structure_nodes"] = build_structure_nodes(layouts)
    blocks = build_semantic_blocks(layouts)
    result["semantic_blocks"] = blocks
    result["vector_chunks"] = build_vector_chunks(
        blocks,
        doc_id=doc_id,
        doc_title=doc_title,
        max_chars=max_chars,
        overlap_chars=overlap_chars,
    )
    return result
--- a/backend/app/infrastructure/parser/local_document_parser.py
+++ b/backend/app/infrastructure/parser/local_document_parser.py
@@ -4,6 +4,7 @@ from __future__ import annotations
 from pathlib import Path
 from app.config.settings import settings
 from app.domain.documents import DocumentParser, ParsedDocument
 from app.services.parser.docx_parser import parse_docx_to_markdown
 from app.services.parser.pdf_parser import parse_pdf_to_markdown
@@ -34,5 +35,10 @@ class LocalDocumentParser(DocumentParser):
            vector_chunks=[],
            parser_name=self.parser_name,
            raw_text=markdown_text,
-            metadata={"source": "local_parser", "file_suffix": suffix},
+            raw_layouts=[],
            metadata={
                "source": "local_parser",
                "file_suffix": suffix,
                "artifact_prefix": settings.document_parse_artifact_prefix,
            },
        )
--- a/backend/data/documents.json
+++ b/backend/data/documents.json
@@ -21,5 +21,365 @@
      "generate_summary": true,
      "structure_nodes": 0
    }
  },
  "44121fbb": {
    "doc_id": "44121fbb",
    "doc_name": "大众汽车手册.pdf",
    "file_name": "大众汽车手册.pdf",
    "object_name": "44121fbb/大众汽车手册.pdf",
    "content_type": "application/pdf",
    "size_bytes": 766565,
    "status": "failed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 0,
    "parser_name": "",
    "index_name": "",
    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
    "created_at": "2026-05-18T09:53:47.996183+00:00",
    "updated_at": "2026-05-18T09:53:50.825868+00:00",
    "metadata": {
      "generate_summary": true,
      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
      "processing_stage": "failed"
    }
  },
  "77debb4a": {
    "doc_id": "77debb4a",
    "doc_name": "大众汽车手册.pdf",
    "file_name": "大众汽车手册.pdf",
    "object_name": "77debb4a/大众汽车手册.pdf",
    "content_type": "application/pdf",
    "size_bytes": 766565,
    "status": "failed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 0,
    "parser_name": "",
    "index_name": "",
    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
    "created_at": "2026-05-18T10:05:46.104259+00:00",
    "updated_at": "2026-05-18T10:05:48.704061+00:00",
    "metadata": {
      "generate_summary": true,
      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
      "processing_stage": "failed"
    }
  },
  "d12bdcc8": {
    "doc_id": "d12bdcc8",
    "doc_name": "TCT算法接口.pdf",
    "file_name": "TCT算法接口.pdf",
    "object_name": "d12bdcc8/TCT算法接口.pdf",
    "content_type": "application/pdf",
    "size_bytes": 165557,
    "status": "failed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 0,
    "parser_name": "",
    "index_name": "",
    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
    "created_at": "2026-05-18T10:07:22.199824+00:00",
    "updated_at": "2026-05-18T10:07:24.653751+00:00",
    "metadata": {
      "generate_summary": true,
      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
      "processing_stage": "failed"
    }
  },
  "3c2e8c9c": {
    "doc_id": "3c2e8c9c",
    "doc_name": "20260415_Continental tire mobile app solution.pdf",
    "file_name": "20260415_Continental tire mobile app solution.pdf",
    "object_name": "3c2e8c9c/20260415_Continental tire mobile app solution.pdf",
    "content_type": "application/pdf",
    "size_bytes": 2178074,
    "status": "failed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 0,
    "parser_name": "",
    "index_name": "",
    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
    "created_at": "2026-05-18T10:09:58.338274+00:00",
    "updated_at": "2026-05-18T10:10:01.295502+00:00",
    "metadata": {
      "generate_summary": true,
      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
      "processing_stage": "failed"
    }
  },
  "d22d21a0": {
    "doc_id": "d22d21a0",
    "doc_name": "20260415_Continental tire mobile app solution.pdf",
    "file_name": "20260415_Continental tire mobile app solution.pdf",
    "object_name": "d22d21a0/20260415_Continental tire mobile app solution.pdf",
    "content_type": "application/pdf",
    "size_bytes": 2178074,
    "status": "failed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 0,
    "parser_name": "",
    "index_name": "",
    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
    "created_at": "2026-05-18T10:12:20.078027+00:00",
    "updated_at": "2026-05-18T10:12:22.999843+00:00",
    "metadata": {
      "generate_summary": true,
      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
      "processing_stage": "failed"
    }
  },
  "35f129d3": {
    "doc_id": "35f129d3",
    "doc_name": "大众汽车手册.pdf",
    "file_name": "大众汽车手册.pdf",
    "object_name": "35f129d3/大众汽车手册.pdf",
    "content_type": "application/pdf",
    "size_bytes": 766565,
    "status": "failed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 0,
    "parser_name": "",
    "index_name": "",
    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
    "created_at": "2026-05-18T10:13:24.706512+00:00",
    "updated_at": "2026-05-18T10:13:27.180509+00:00",
    "metadata": {
      "generate_summary": true,
      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
      "processing_stage": "failed"
    }
  },
  "efc21515": {
    "doc_id": "efc21515",
    "doc_name": "大众汽车手册.pdf",
    "file_name": "大众汽车手册.pdf",
    "object_name": "efc21515/大众汽车手册.pdf",
    "content_type": "application/pdf",
    "size_bytes": 766565,
    "status": "failed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 0,
    "parser_name": "aliyun_docmind",
    "index_name": "",
    "error_message": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
    "created_at": "2026-05-18T13:47:32.076786+00:00",
    "updated_at": "2026-05-18T13:47:57.998073+00:00",
    "metadata": {
      "generate_summary": true,
      "parser_backend": "aliyun_docmind",
      "parse_task_id": "docmind-20260518-a6e84447457f43cb85f95225cfc6495b",
      "layout_count": 87,
      "structure_node_count": 20,
      "semantic_block_count": 27,
      "vector_chunk_count": 27,
      "artifact_keys": {
        "layouts": "artifacts/efc21515/layouts.json",
        "structure_nodes": "artifacts/efc21515/structure_nodes.json",
        "semantic_blocks": "artifacts/efc21515/semantic_blocks.json",
        "vector_chunks": "artifacts/efc21515/vector_chunks.json"
      },
      "processing_stage": "failed",
      "failure_reason": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400"
    }
  },
  "0d4b08bc": {
    "doc_id": "0d4b08bc",
    "doc_name": "大众汽车手册.pdf",
    "file_name": "大众汽车手册.pdf",
    "object_name": "0d4b08bc/大众汽车手册.pdf",
    "content_type": "application/pdf",
    "size_bytes": 766565,
    "status": "failed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 0,
    "parser_name": "aliyun_docmind",
    "index_name": "",
    "error_message": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
    "created_at": "2026-05-18T14:03:15.134344+00:00",
    "updated_at": "2026-05-18T14:03:34.843448+00:00",
    "metadata": {
      "generate_summary": true,
      "parser_backend": "aliyun_docmind",
      "parse_task_id": "docmind-20260518-78353d85daa24147b68d8fb71895179f",
      "layout_count": 87,
      "structure_node_count": 20,
      "semantic_block_count": 27,
      "vector_chunk_count": 27,
      "artifact_keys": {
        "layouts": "artifacts/0d4b08bc/layouts.json",
        "structure_nodes": "artifacts/0d4b08bc/structure_nodes.json",
        "semantic_blocks": "artifacts/0d4b08bc/semantic_blocks.json",
        "vector_chunks": "artifacts/0d4b08bc/vector_chunks.json"
      },
      "processing_stage": "failed",
      "failure_reason": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404"
    }
  },
  "4302f314": {
    "doc_id": "4302f314",
    "doc_name": "大众汽车手册.pdf",
    "file_name": "大众汽车手册.pdf",
    "object_name": "4302f314/大众汽车手册.pdf",
    "content_type": "application/pdf",
    "size_bytes": 766565,
    "status": "failed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 0,
    "parser_name": "aliyun_docmind",
    "index_name": "",
    "error_message": "embedding 维度不匹配，期望 1536",
    "created_at": "2026-05-18T14:11:29.943973+00:00",
    "updated_at": "2026-05-18T14:11:48.554500+00:00",
    "metadata": {
      "generate_summary": true,
      "parser_backend": "aliyun_docmind",
      "parse_task_id": "docmind-20260518-23935ee455ac4b26ac4201ac4781ee52",
      "layout_count": 87,
      "structure_node_count": 20,
      "semantic_block_count": 27,
      "vector_chunk_count": 27,
      "artifact_keys": {
        "layouts": "artifacts/4302f314/layouts.json",
        "structure_nodes": "artifacts/4302f314/structure_nodes.json",
        "semantic_blocks": "artifacts/4302f314/semantic_blocks.json",
        "vector_chunks": "artifacts/4302f314/vector_chunks.json"
      },
      "processing_stage": "failed",
      "failure_reason": "embedding 维度不匹配，期望 1536"
    }
  },
  "765ed1ee": {
    "doc_id": "765ed1ee",
    "doc_name": "大众汽车手册.pdf",
    "file_name": "大众汽车手册.pdf",
    "object_name": "765ed1ee/大众汽车手册.pdf",
    "content_type": "application/pdf",
    "size_bytes": 766565,
    "status": "failed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 0,
    "parser_name": "aliyun_docmind",
    "index_name": "",
    "error_message": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>",
    "created_at": "2026-05-18T14:18:28.875138+00:00",
    "updated_at": "2026-05-18T14:18:57.389110+00:00",
    "metadata": {
      "generate_summary": true,
      "parser_backend": "aliyun_docmind",
      "parse_task_id": "docmind-20260518-f116856bc29245baa2531b245078a701",
      "layout_count": 87,
      "structure_node_count": 20,
      "semantic_block_count": 27,
      "vector_chunk_count": 27,
      "artifact_keys": {
        "layouts": "artifacts/765ed1ee/layouts.json",
        "structure_nodes": "artifacts/765ed1ee/structure_nodes.json",
        "semantic_blocks": "artifacts/765ed1ee/semantic_blocks.json",
        "vector_chunks": "artifacts/765ed1ee/vector_chunks.json"
      },
      "processing_stage": "failed",
      "failure_reason": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>"
    }
  },
  "05cabe09": {
    "doc_id": "05cabe09",
    "doc_name": "大众汽车手册.pdf",
    "file_name": "大众汽车手册.pdf",
    "object_name": "05cabe09/大众汽车手册.pdf",
    "content_type": "application/pdf",
    "size_bytes": 766565,
    "status": "failed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 0,
    "parser_name": "aliyun_docmind",
    "index_name": "",
    "error_message": "embedding 维度不匹配，期望 1536",
    "created_at": "2026-05-18T14:24:32.156500+00:00",
    "updated_at": "2026-05-18T14:24:50.114138+00:00",
    "metadata": {
      "generate_summary": true,
      "parser_backend": "aliyun_docmind",
      "parse_task_id": "docmind-20260518-897d858983df48e28e9819e563d46208",
      "layout_count": 87,
      "structure_node_count": 20,
      "semantic_block_count": 27,
      "vector_chunk_count": 27,
      "artifact_keys": {
        "layouts": "artifacts/05cabe09/layouts.json",
        "structure_nodes": "artifacts/05cabe09/structure_nodes.json",
        "semantic_blocks": "artifacts/05cabe09/semantic_blocks.json",
        "vector_chunks": "artifacts/05cabe09/vector_chunks.json"
      },
      "processing_stage": "failed",
      "failure_reason": "embedding 维度不匹配，期望 1536"
    }
  },
  "9acb2ba0": {
    "doc_id": "9acb2ba0",
    "doc_name": "大众汽车手册.pdf",
    "file_name": "大众汽车手册.pdf",
    "object_name": "9acb2ba0/大众汽车手册.pdf",
    "content_type": "application/pdf",
    "size_bytes": 766565,
    "status": "indexed",
    "regulation_type": "",
    "version": "",
    "summary": "",
    "summary_latency_ms": 0,
    "chunk_count": 27,
    "parser_name": "aliyun_docmind",
    "index_name": "regulations_dense_1024_v1",
    "error_message": "",
    "created_at": "2026-05-18T14:29:01.368719+00:00",
    "updated_at": "2026-05-18T14:29:23.699068+00:00",
    "metadata": {
      "generate_summary": true,
      "parser_backend": "aliyun_docmind",
      "parse_task_id": "docmind-20260518-e5fd4a5419e74d569c562e389e6ae72c",
      "layout_count": 87,
      "structure_node_count": 20,
      "semantic_block_count": 27,
      "vector_chunk_count": 27,
      "artifact_keys": {
        "layouts": "artifacts/9acb2ba0/layouts.json",
        "structure_nodes": "artifacts/9acb2ba0/structure_nodes.json",
        "semantic_blocks": "artifacts/9acb2ba0/semantic_blocks.json",
        "vector_chunks": "artifacts/9acb2ba0/vector_chunks.json"
      },
      "processing_stage": "indexed",
      "index_collection": "regulations_dense_1024_v1"
    }
  }
 }
--- a/docs/architecture/aliyun-ingest-implementation.md
+++ b/docs/architecture/aliyun-ingest-implementation.md
@@ -0,0 +1,71 @@
 # 阿里云解析主链路实现说明
 本文档描述当前仓库已经落地的文档 ingest 主链路实现，作为迁移设计到代码实现之间的收口说明。
 ## 1. 当前默认链路
 - 上传入口保持为 `/api/v1/documents/upload`
 - 默认 `PARSER_BACKEND=aliyun`
 - 默认 `CHUNK_BACKEND=aliyun`
 - 默认 Milvus collection 为 `regulations_dense_1024_v1`
 - 解析产物落到 MinIO `artifacts/{doc_id}/`
 完整主链路如下：
 1. 原始文件上传到 MinIO
 2. `AliyunDocmindGateway` 提交阿里云异步解析任务
 3. 轮询任务状态直到成功或超时
 4. 分页拉取 `layouts`
 5. 转换为 `structure_nodes / semantic_blocks / vector_chunks`
 6. 三层结构 JSON 回写 MinIO
 7. 使用 `vector_chunks[*].embedding_text` 调 embedding API
 8. 写入 `regulations_dense_1024_v1`
 9. 文档状态更新为 `indexed`
 运行时转换逻辑位于 `backend/app/infrastructure/parser/aliyun_layout_normalizer.py`。
 旧的 `backend/app/aliyun_parser/` 示例目录已移除，不参与生产运行时。
 ## 2. 解析产物持久化
 每个文档会额外写入以下对象：
 - `artifacts/{doc_id}/layouts.json`
 - `artifacts/{doc_id}/structure_nodes.json`
 - `artifacts/{doc_id}/semantic_blocks.json`
 - `artifacts/{doc_id}/vector_chunks.json`
 `documents.json` 仅保留对象 key、统计信息和处理阶段，不保存完整大 JSON。
 ## 3. 失败策略
 - 当前 `PARSER_FAILURE_MODE=fail`
 - 阿里云解析失败不自动回退到本地 parser
 - 失败时保留原始文件与已写入的 artifacts，便于排障
 ## 4. 运行参数
 关键环境变量如下：
 - `ALIBABA_ACCESS_KEY_ID`
 - `ALIBABA_ACCESS_KEY_SECRET`
 - `ALIBABA_ENDPOINT`
 - `ALIYUN_PARSE_POLL_INTERVAL_SECONDS`
 - `ALIYUN_PARSE_TIMEOUT_SECONDS`
 - `ALIYUN_PARSE_LAYOUT_STEP_SIZE`
 - `ALIYUN_LLM_ENHANCEMENT`
 - `ALIYUN_ENHANCEMENT_MODE`
 - `DOCUMENT_PARSE_ARTIFACT_PREFIX`
 - `PARSER_BACKEND`
 - `CHUNK_BACKEND`
 ## 5. 运行态确认
 可通过 `/api/v1/status/config` 确认以下字段：
 - `parser_backend`
 - `chunk_backend`
 - `milvus_collection`
 - `artifact_prefix`
 - `parser_failure_mode`
 这几个值用于确认服务是否实际运行在迁移后的默认链路上。
--- a/docs/rfc/backend-api-parsing-embedding-migration-requirements.md
+++ b/docs/rfc/backend-api-parsing-embedding-migration-requirements.md
@@ -29,7 +29,7 @@
 已确认的目标需求如下：
 - 文档解析统一改为阿里云文档智能能力
- 当前阿里云接入基础来自 `backend/app/aliyun_parser/parse_pdf.py`
+- 当前阿里云接入基础已经迁移到 `backend/app/infrastructure/parser/aliyun_layout_normalizer.py`
 - 解析结果以 `structure_nodes`、`semantic_blocks`、`vector_chunks` 三层结构为基础
 - 分块以阿里云 `vector_chunks` 为准，不再走当前本地 `RegulationChunker`
 - embedding 改为 OpenAI 兼容 API 调用，模型使用 `text-embedding-v3`
@@ -80,7 +80,7 @@
 受影响的解析能力范围包括：
 - 当前本地 parser 目录
- `backend/app/aliyun_parser`
+- `backend/app/infrastructure/parser`
 迁移后阿里云文档智能能力将成为主解析来源，本地 PDF/DOCX/MinerU 解析链路需要重新界定保留、下线或回退策略，但具体模块组织方式不在本文件内定义。
@@ -133,7 +133,7 @@
 以下风险和约束在本期已经明确，需要在后续架构和实施阶段优先处理：
 - 旧 Milvus collection 与新 `1024` 维 schema 不兼容，需要新 collection 和重建索引
- `backend/app/aliyun_parser` 现有脚本含硬编码密钥，后续必须全部移到环境变量
+- 阿里云凭据必须继续只通过环境变量或凭据链注入，不能回到脚本内硬编码
 - RAG 下游当前对 `clause_number` 有依赖，迁移后需要优先适配 `section_title` 和 Aliyun chunk metadata
 - 如果阿里云返回字段与当前样例不同，需要在架构阶段补充 adapter 层
--- a/tests/test_embedding.py
+++ b/tests/test_embedding.py
@@ -1,4 +1,4 @@
-"""新架构下的文档编排与 embedding 边界测试。"""
+"""Document orchestration and embedding boundary tests for the migrated backend."""
 from __future__ import annotations
@@ -80,6 +80,7 @@ class FakeParser:
        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            raw_layouts=[{"uniqueId": "layout-1", "type": "text"}],
            structure_nodes=[{"title": "第一章"}],
            semantic_blocks=[{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}],
            vector_chunks=[
@@ -95,6 +96,7 @@ class FakeParser:
                }
            ],
            parser_name="fake_parser",
            metadata={"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1},
        )
@@ -125,10 +127,10 @@ class FakeEmbeddingProvider:
    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        self.calls.append(texts)
-        return [[0.1] * 1536 for _ in texts]
+        return [[0.1] * 1024 for _ in texts]
    def embed_query(self, text: str) -> list[float]:
-        return [0.2] * 1536
+        return [0.2] * 1024
 class FakeVectorIndex:
@@ -146,10 +148,10 @@ class FakeVectorIndex:
        return []
    def health(self) -> dict:
-        return {"collection_name": "regulations_dense_1536"}
+        return {"collection_name": "regulations_dense_1024_v1"}
-def test_document_command_service_uses_1536_dense_embedding_and_updates_status():
+def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
    repository = FakeRepository()
    binary_store = FakeBinaryStore()
    embedding_provider = FakeEmbeddingProvider()
@@ -183,15 +185,16 @@ def test_document_command_service_uses_1536_dense_embedding_and_updates_status()
    assert stored.status == DocumentStatus.INDEXED
    assert stored.chunk_count == 1
    assert stored.parser_name == "fake_parser"
-    assert stored.index_name == "regulations_dense_1536"
+    assert stored.index_name == "regulations_dense_1024_v1"
    assert stored.metadata["parse_task_id"] == "task-123"
    assert stored.metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")
-
+def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
 def test_bootstrap_defaults_to_local_parser_and_chunk_builder():
    bootstrap.get_parser.cache_clear()
    bootstrap.get_chunk_builder.cache_clear()
    parser = bootstrap.get_parser()
    chunk_builder = bootstrap.get_chunk_builder()
-    assert parser.__class__.__name__ == "LocalDocumentParser"
+    assert parser.__class__.__name__ == "AliyunDocumentParser"
-    assert chunk_builder.__class__.__name__ == "LocalRegulationChunkBuilder"
+    assert chunk_builder.__class__.__name__ == "AliyunVectorChunkBuilder"
--- a/tests/verify_mvp.py
+++ b/tests/verify_mvp.py
@@ -64,11 +64,16 @@ def verify_migration_config() -> bool:
    try:
        assert settings.embedding_model == "text-embedding-v3"
-        assert settings.embedding_dim == 1536
+        assert settings.embedding_dim == 1024
-        assert settings.milvus_collection == "regulations_dense_1536"
+        assert settings.milvus_collection == "regulations_dense_1024_v1"
        assert settings.parser_backend == "aliyun"
        assert settings.chunk_backend == "aliyun"
        logger.info(f"embedding_model={settings.embedding_model}")
        logger.info(f"embedding_base_url={settings.embedding_base_url}")
        logger.info(f"embedding_dim={settings.embedding_dim}")
        logger.info(f"milvus_collection={settings.milvus_collection}")
        logger.info(f"parser_backend={settings.parser_backend}")
        logger.info(f"chunk_backend={settings.chunk_backend}")
        logger.success("migration config ok")
        return True
    except Exception as exc: