feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions
--- a/.env
+++ b/.env
@@ -9,7 +9,7 @@ DEBUG=false
 # ===== Milvus向量数据库配置（已有）=====
 MILVUS_HOST=localhost
 MILVUS_PORT=19530
-MILVUS_COLLECTION=regulations_dense_1536
+MILVUS_COLLECTION=regulations_dense_1024_v1
 MILVUS_DB_NAME=default

 # ===== MinIO对象存储配置（已有）=====
@@ -34,7 +34,7 @@ POSTGRES_DB=compliance_db

 # ===== 嵌入模型配置 =====
 EMBEDDING_MODEL=text-embedding-v3
-EMBEDDING_DIM=1536
+EMBEDDING_DIM=1024
 EMBEDDING_API_KEY=sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8
 EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1
 EMBEDDING_TIMEOUT_SECONDS=120
@@ -59,7 +59,7 @@ LLM_TEMPERATURE=0.7
 # 获取API Key: https://dashscope.console.aliyun.com/
 QWEN_API_KEY=sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8
 QWEN_BASE_URL=http://6.86.80.4:30080/v1
-QWEN_MODEL=qwen3.5-plus
+QWEN_MODEL=qwen3.6-plus
 QWEN_VL_MODEL=qwen3-vl-plus

 # ===== DeepSeek API配置 =====
@@ -73,3 +73,15 @@ RAG_TOP_K=10
 RAG_MAX_CONTEXT_TOKENS=4000
 RAG_SUMMARY_MAX_TOKENS=1024
 RAG_SKILLS_MAX_TOKENS=2048
+
+# ===== 阿里云文档解析 =====
+ALIBABA_ACCESS_KEY_ID=LTAI5t9ZjvwSU9bKuMyiExrE
+ALIBABA_ACCESS_KEY_SECRET=hNvY6XocmEO6inYlrmiBwBcx5OfidL
+ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
+ALIYUN_PARSE_POLL_INTERVAL_SECONDS=5
+ALIYUN_PARSE_TIMEOUT_SECONDS=900
+ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
+ALIYUN_LLM_ENHANCEMENT=true
+ALIYUN_ENHANCEMENT_MODE=VLM
+DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
+PARSER_FAILURE_MODE=fail
--- a/.env.development
+++ b/.env.development
@@ -4,7 +4,7 @@
 # ===== Milvus向量数据库配置（已有）=====
 MILVUS_HOST=6.86.80.8
 MILVUS_PORT=19530
-MILVUS_COLLECTION=regulations_dense_1536
+MILVUS_COLLECTION=regulations_dense_1024_v1
 MILVUS_DB_NAME=default

 # ===== MinIO对象存储配置（已有）=====
@@ -26,4 +26,3 @@ POSTGRES_PORT=5432
 POSTGRES_USER=postgresql
 POSTGRES_PASSWORD=postgresql123456
 POSTGRES_DB=compliance_db
-
--- a/.env.example
+++ b/.env.example
@@ -9,12 +9,12 @@ DEBUG=false
 # ===== Milvus向量数据库配置 =====
 MILVUS_HOST=localhost
 MILVUS_PORT=19530
-MILVUS_COLLECTION=regulations_dense_1536
+MILVUS_COLLECTION=regulations_dense_1024_v1
 MILVUS_DB_NAME=default

 # ===== 嵌入模型配置 =====
 EMBEDDING_MODEL=text-embedding-v3
-EMBEDDING_DIM=1536
+EMBEDDING_DIM=1024
 EMBEDDING_API_KEY=your_embedding_api_key_here
 EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1
 EMBEDDING_TIMEOUT_SECONDS=120
@@ -44,11 +44,20 @@ CHUNK_SIZE=512
 CHUNK_OVERLAP=50
 MAX_FILE_SIZE_MB=100
 DOCUMENT_METADATA_PATH=backend/data/documents.json
+PARSER_BACKEND=aliyun
+CHUNK_BACKEND=aliyun

 # ===== 阿里云文档解析 =====
 ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
 ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
 ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
+ALIYUN_PARSE_POLL_INTERVAL_SECONDS=5
+ALIYUN_PARSE_TIMEOUT_SECONDS=900
+ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
+ALIYUN_LLM_ENHANCEMENT=true
+ALIYUN_ENHANCEMENT_MODE=VLM
+DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
+PARSER_FAILURE_MODE=fail

 # ===== API服务配置 =====
 API_HOST=0.0.0.0
@@ -73,7 +82,7 @@ DEEPSEEK_BASE_URL=http://6.86.80.4:30080/v1
 # Qwen系列: qwen3.5-plus, qwen3-plus, qwen-max, qwen-turbo, qwen-long
 # Qwen VL系列: qwen3-vl-plus, qwen-vl-max
 # DeepSeek系列: deepseek-v4-flash, deepseek-v3.2, deepseek-v3, deepseek-chat, deepseek-coder
-QWEN_MODEL=qwen3.5-plus
+QWEN_MODEL=qwen3.6-plus
 QWEN_VL_MODEL=qwen3-vl-plus
 DEEPSEEK_MODEL=deepseek-v4-flash

--- a/QUICK_DEPLOY.md
+++ b/QUICK_DEPLOY.md
@@ -106,6 +106,9 @@ ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
 EMBEDDING_API_KEY=your_embedding_api_key_here
 EMBEDDING_MODEL=text-embedding-v3
 EMBEDDING_DIM=1536
+PARSER_BACKEND=aliyun
+CHUNK_BACKEND=aliyun
+PARSER_FAILURE_MODE=fail
 ```

 ---
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ AIRegulation-DocAnalysis-Demo/
 ### 1. 安装依赖

 ```bash
-pip install -r backend/requirements.txt
+./dev.sh setup
 ```

 ### 2. 启动Milvus向量数据库
@@ -57,7 +57,7 @@ docker-compose logs -f milvus
 ### 3. 启动API服务

 ```bash
-PYTHONPATH=backend uvicorn app.main:app --reload --port 8000
+./dev.sh start api --foreground
 ```

 访问API文档：http://localhost:8000/docs
@@ -104,6 +104,8 @@ MILVUS_PORT=19530
 # 阿里云文档解析
 ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
 ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
+PARSER_BACKEND=aliyun
+CHUNK_BACKEND=aliyun

 # embedding 配置
 EMBEDDING_MODEL=text-embedding-v3
@@ -121,6 +123,17 @@ CHUNK_SIZE=512
 - 混合检索问答功能
 - 法规变更监控与自动更新

+## 解析产物
+
+上传成功后，系统会把阿里云解析的中间结果持久化到 MinIO：
+
+- `artifacts/{doc_id}/layouts.json`
+- `artifacts/{doc_id}/structure_nodes.json`
+- `artifacts/{doc_id}/semantic_blocks.json`
+- `artifacts/{doc_id}/vector_chunks.json`
+
+当前默认 Milvus collection 为 `regulations_dense_1024_v1`。
+
 ## 许可证

 MIT License
--- a/backend/app/aliyun_parser/.claude/settings.local.json
+++ b/backend/app/aliyun_parser/.claude/settings.local.json
@@ -1,8 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(python3 *)",
-      "Bash(PGPASSWORD=postgresql123456 psql *)"
-    ]
-  }
-}
--- a/backend/app/aliyun_parser/parse_pdf.py
+++ b/backend/app/aliyun_parser/parse_pdf.py
@@ -1,516 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""Handle Aliyun parsing support for parse pdf."""
-
-import argparse
-import json
-import os
-import re
-import time
-from pathlib import Path
-from typing import Dict, List
-
-from alibabacloud_docmind_api20220711.client import Client as DocmindClient
-from alibabacloud_tea_openapi import models as open_api_models
-from alibabacloud_docmind_api20220711 import models as docmind_models
-from alibabacloud_tea_util import models as util_models
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-ALIBABA_ACCESS_KEY_ID = os.getenv("ALIBABA_ACCESS_KEY_ID", "")
-ALIBABA_ACCESS_KEY_SECRET = os.getenv("ALIBABA_ACCESS_KEY_SECRET", "")
-ALIBABA_ENDPOINT = os.getenv("ALIBABA_ENDPOINT", "docmind-api.cn-hangzhou.aliyuncs.com")
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-MAX_CHARS = 600
-OVERLAP_CHARS = 80
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-TOC_TITLES = {"目次", "目录"}
-TITLE_SUBTYPES = {"doc_title", "para_title"}
-TEXT_SUBTYPES = {"para", "none"}
-FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
-FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
-
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-def init_client() -> DocmindClient:
-    """Handle init client."""
-    if not ALIBABA_ACCESS_KEY_ID or not ALIBABA_ACCESS_KEY_SECRET:
-        raise ValueError("缺少阿里云文档解析凭据，请设置 ALIBABA_ACCESS_KEY_ID 和 ALIBABA_ACCESS_KEY_SECRET")
-    config = open_api_models.Config(
-        access_key_id=ALIBABA_ACCESS_KEY_ID,
-        access_key_secret=ALIBABA_ACCESS_KEY_SECRET,
-    )
-    config.endpoint = ALIBABA_ENDPOINT
-    return DocmindClient(config)
-
-
-def submit_job(client: DocmindClient, file_path: str) -> str:
-    """Submit job."""
-    file_name = Path(file_path).name
-    request = docmind_models.SubmitDocParserJobAdvanceRequest(
-        file_url_object=open(file_path, "rb"),
-        file_name=file_name,
-        file_name_extension=Path(file_path).suffix.lstrip("."),
-        llm_enhancement=True,
-        enhancement_mode="VLM",
-    )
-    runtime = util_models.RuntimeOptions()
-    response = client.submit_doc_parser_job_advance(request, runtime)
-    return response.body.data.id
-
-
-def query_status(client: DocmindClient, task_id: str) -> Dict:
-    """Handle query status."""
-    request = docmind_models.QueryDocParserStatusRequest(id=task_id)
-    response = client.query_doc_parser_status(request)
-    return response.body.data.to_map() if response.body.data else None
-
-
-def wait_for_completion(client: DocmindClient, task_id: str, poll_interval: int = 5) -> bool:
-    """Wait for for completion."""
-    while True:
-        status_data = query_status(client, task_id)
-        if not status_data:
-            return False
-        status = status_data.get("Status", "").lower()
-        if status == "success":
-            return True
-        elif status == "failed":
-            print(f"任务失败: {status_data}")
-            return False
-        print(f"任务状态: {status}, 等待中...")
-        time.sleep(poll_interval)
-
-
-def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_step_size: int = 50) -> Dict:
-    """Return result."""
-    request = docmind_models.GetDocParserResultRequest(
-        id=task_id,
-        layout_step_size=layout_step_size,
-        layout_num=layout_num,
-    )
-    response = client.get_doc_parser_result(request)
-    return response.body.data if response.body.data else None
-
-
-def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: int = 50) -> List[Dict]:
-    """Collect all results."""
-    all_layouts = []
-    layout_num = 0
-    while True:
-        result_data = get_result(client, task_id, layout_num, layout_step_size)
-        if not result_data:
-            break
-        layouts = result_data.get("layouts", [])
-        if not layouts:
-            break
-        all_layouts.extend(layouts)
-        layout_num += len(layouts)
-        if len(layouts) < layout_step_size:
-            break
-    return all_layouts
-
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-def normalize_text(text: str) -> str:
-    """Normalize text."""
-    text = text.replace("\r", "\n")
-    text = text.replace(" ", " ")
-    text = re.sub(r"\n+", "\n", text)
-    text = re.sub(r"[ \t]+", " ", text)
-    return text.strip()
-
-
-def get_page(layout: Dict) -> int:
-    """Return page."""
-    return layout.get("pageNum", layout.get("pageNumber", 0))
-
-
-def get_text(layout: Dict) -> str:
-    """Return text."""
-    text = normalize_text(layout.get("text", ""))
-    if text:
-        return text
-    return normalize_text(layout.get("markdownContent", ""))
-
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-def is_title(layout: Dict) -> bool:
-    """Return whether title."""
-    return layout.get("type") == "title" or layout.get("subType") in TITLE_SUBTYPES
-
-
-def is_text(layout: Dict) -> bool:
-    """Return whether text."""
-    return layout.get("type") == "text" and layout.get("subType", "none") in TEXT_SUBTYPES
-
-
-def is_figure(layout: Dict) -> bool:
-    """Return whether figure."""
-    return layout.get("type") in FIGURE_TYPES or layout.get("subType") in FIGURE_SUBTYPES
-
-
-def is_table(layout: Dict) -> bool:
-    """Return whether table."""
-    return layout.get("type") == "table"
-
-
-def is_toc_layout(layout: Dict) -> bool:
-    """Return whether toc layout."""
-    text = get_text(layout)
-    if text in TOC_TITLES:
-        return True
-    if get_page(layout) == 1 and re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", text):
-        return True
-    return False
-
-
-def extract_table_text(layout: Dict) -> str:
-    """Extract table text."""
-    rows = []
-    for cell in layout.get("cells", []):
-        texts = []
-        for cell_layout in cell.get("layouts", []):
-            cell_text = normalize_text(cell_layout.get("text", ""))
-            if cell_text:
-                texts.append(cell_text)
-        if texts:
-            rows.append(" ".join(texts))
-    return "\n".join(rows).strip()
-
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
-    """Build structure nodes."""
-    nodes = []
-    for layout in layouts:
-        if not is_title(layout):
-            continue
-        text = get_text(layout)
-        if not text or text in TOC_TITLES:
-            continue
-        nodes.append(
-            {
-                "unique_id": layout.get("uniqueId"),
-                "page": get_page(layout),
-                "index": layout.get("index", 0),
-                "level": layout.get("level", 0),
-                "title": text,
-                "type": layout.get("type"),
-                "sub_type": layout.get("subType"),
-            }
-        )
-    return nodes
-
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
-    """Update section path."""
-    level = layout.get("level", 0)
-    title = get_text(layout)
-    while section_stack and section_stack[-1]["level"] >= level:
-        section_stack.pop()
-    section_stack.append(
-        {
-            "level": level,
-            "title": title,
-            "page": get_page(layout),
-            "unique_id": layout.get("uniqueId"),
-        }
-    )
-    return section_stack
-
-
-def section_path_titles(section_stack: List[Dict]) -> List[str]:
-    """Handle section path titles."""
-    return [item["title"] for item in section_stack]
-
-
-def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id: int) -> int:
-    """Handle flush text block."""
-    if not blocks:
-        return block_id
-
-    texts = [item["text"] for item in blocks if item["text"]]
-    merged_text = "\n".join(texts).strip()
-    if not merged_text:
-        return block_id
-
-    semantic_blocks.append(
-        {
-            "semantic_id": f"semantic-{block_id}",
-            "block_type": "section_text",
-            "page_start": min(item["page"] for item in blocks),
-            "page_end": max(item["page"] for item in blocks),
-            "section_path": blocks[0]["section_path"],
-            "section_level": blocks[0]["section_level"],
-            "section_title": blocks[0]["section_title"],
-            "source_ids": [item["unique_id"] for item in blocks if item.get("unique_id")],
-            "text": merged_text,
-        }
-    )
-    return block_id + 1
-
-
-def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
-    """Build semantic blocks."""
-    semantic_blocks = []
-    section_stack = []
-    pending_text_blocks = []
-    block_id = 1
-    skip_toc_page = False
-
-    for layout in layouts:
-        text = get_text(layout)
-        page = get_page(layout)
-
-        if is_toc_layout(layout):
-            skip_toc_page = True
-            continue
-        if skip_toc_page and page == 1:
-            continue
-        if skip_toc_page and page != 1:
-            skip_toc_page = False
-
-        if is_title(layout):
-            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
-            pending_text_blocks = []
-            section_stack = update_section_path(section_stack, layout)
-            continue
-
-        section_path = section_path_titles(section_stack)
-        section_title = section_path[-1] if section_path else "未分类"
-        section_level = len(section_path)
-
-        if is_table(layout):
-            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
-            pending_text_blocks = []
-            table_text = extract_table_text(layout)
-            if table_text:
-                semantic_blocks.append(
-                    {
-                        "semantic_id": f"semantic-{block_id}",
-                        "block_type": "table",
-                        "page_start": page,
-                        "page_end": page,
-                        "section_path": section_path,
-                        "section_level": section_level,
-                        "section_title": section_title,
-                        "source_ids": [layout.get("uniqueId")],
-                        "text": table_text,
-                    }
-                )
-                block_id += 1
-            continue
-
-        if is_figure(layout):
-            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
-            pending_text_blocks = []
-            if text:
-                semantic_blocks.append(
-                    {
-                        "semantic_id": f"semantic-{block_id}",
-                        "block_type": "figure",
-                        "page_start": page,
-                        "page_end": page,
-                        "section_path": section_path,
-                        "section_level": section_level,
-                        "section_title": section_title,
-                        "source_ids": [layout.get("uniqueId")],
-                        "text": text,
-                    }
-                )
-                block_id += 1
-            continue
-
-        if is_text(layout) and text:
-            pending_text_blocks.append(
-                {
-                    "page": page,
-                    "text": text,
-                    "unique_id": layout.get("uniqueId"),
-                    "section_path": section_path,
-                    "section_level": section_level,
-                    "section_title": section_title,
-                }
-            )
-
-    flush_text_block(pending_text_blocks, semantic_blocks, block_id)
-    return semantic_blocks
-
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> List[str]:
-    """Handle split text with overlap."""
-    text = text.strip()
-    if len(text) <= max_chars:
-        return [text] if text else []
-
-    parts = []
-    start = 0
-    while start < len(text):
-        end = min(len(text), start + max_chars)
-        parts.append(text[start:end].strip())
-        if end >= len(text):
-            break
-        start = max(0, end - overlap_chars)
-    return [part for part in parts if part]
-
-
-def build_vector_chunks(
-    semantic_blocks: List[Dict],
-    doc_id: str,
-    doc_title: str,
-    max_chars: int,
-    overlap_chars: int,
-) -> List[Dict]:
-    """Build vector chunks."""
-    vector_chunks = []
-    chunk_index = 1
-
-    for block in semantic_blocks:
-        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
-        for piece_index, piece in enumerate(pieces, start=1):
-            if block["section_path"]:
-                header = f"标准：{doc_title}\n章节：{' > '.join(block['section_path'])}\n\n"
-            else:
-                header = f"标准：{doc_title}\n\n"
-            vector_chunks.append(
-                {
-                    "doc_id": doc_id,
-                    "doc_title": doc_title,
-                    "chunk_id": f"chunk-{chunk_index}",
-                    "chunk_index": chunk_index,
-                    "semantic_id": block["semantic_id"],
-                    "chunk_type": block["block_type"],
-                    "piece_index": piece_index,
-                    "page_start": block["page_start"],
-                    "page_end": block["page_end"],
-                    "section_path": block["section_path"],
-                    "section_level": block["section_level"],
-                    "section_title": block["section_title"],
-                    "source_ids": block["source_ids"],
-                    "text": piece,
-                    "embedding_text": header + piece,
-                }
-            )
-            chunk_index += 1
-
-    return vector_chunks
-
-
-def parse_pdf_to_structured_chunks(
-    pdf_path: str,
-    *,
-    doc_id: str,
-    doc_title: str,
-    max_chars: int = MAX_CHARS,
-    overlap_chars: int = OVERLAP_CHARS,
-    poll_interval: int = 5,
-) -> Dict:
-    """Parse pdf to structured chunks."""
-    client = init_client()
-    task_id = submit_job(client, pdf_path)
-    if not wait_for_completion(client, task_id, poll_interval):
-        raise RuntimeError("阿里云文档解析任务失败")
-    layouts = collect_all_results(client, task_id)
-    return convert_layouts(
-        layouts,
-        doc_id=doc_id,
-        doc_title=doc_title,
-        max_chars=max_chars,
-        overlap_chars=overlap_chars,
-    )
-
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-def convert_layouts(
-    layouts: List[Dict],
-    doc_id: str,
-    doc_title: str,
-    max_chars: int,
-    overlap_chars: int,
-) -> Dict:
-    """Handle convert layouts."""
-    structure_nodes = build_structure_nodes(layouts)
-    semantic_blocks = build_semantic_blocks(layouts)
-    vector_chunks = build_vector_chunks(
-        semantic_blocks,
-        doc_id=doc_id,
-        doc_title=doc_title,
-        max_chars=max_chars,
-        overlap_chars=overlap_chars,
-    )
-    return {
-        "doc_id": doc_id,
-        "doc_title": doc_title,
-        "structure_nodes": structure_nodes,
-        "semantic_blocks": semantic_blocks,
-        "vector_chunks": vector_chunks,
-    }
-
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-def main() -> None:
-    """Run the module entrypoint."""
-    parser = argparse.ArgumentParser(description="阿里云文档智能解析 PDF，输出三层结构 chunks")
-    parser.add_argument("pdf_path", help="PDF 文件路径")
-    parser.add_argument("--out", default="vector_chunks.json", help="输出 JSON 文件路径")
-    parser.add_argument("--layouts-out", dest="layouts_output", help="输出原始 layouts JSON")
-    parser.add_argument("--doc-id", default="GB14747-2006", help="文档 ID")
-    parser.add_argument("--doc-title", default="GB 14747—2006 儿童三轮车安全要求", help="文档标题")
-    parser.add_argument("--max-chars", type=int, default=MAX_CHARS, help="单个检索 chunk 最大字符数")
-    parser.add_argument("--overlap-chars", type=int, default=OVERLAP_CHARS, help="相邻检索 chunk 重叠字符数")
-    parser.add_argument("--poll-interval", type=int, default=5, help="轮询间隔（秒）")
-    args = parser.parse_args()
-
-    pdf_path = Path(args.pdf_path).expanduser().resolve()
-    if not pdf_path.exists():
-        raise FileNotFoundError(f"PDF 文件不存在: {pdf_path}")
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    client = init_client()
-    print(f"提交任务: {pdf_path}")
-    task_id = submit_job(client, str(pdf_path))
-    print(f"任务 ID: {task_id}")
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    print("等待任务完成...")
-    if not wait_for_completion(client, task_id, args.poll_interval):
-        print("任务失败，退出")
-        return
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    print("获取解析结果...")
-    layouts = collect_all_results(client, task_id)
-    print(f"获取到 {len(layouts)} 个布局块")
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    if args.layouts_output:
-        layouts_path = Path(args.layouts_output).expanduser().resolve()
-        layouts_path.write_text(json.dumps(layouts, ensure_ascii=False, indent=2), encoding="utf-8")
-        print(f"原始 layouts 已写入: {layouts_path}")
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    print("转换为三层结构...")
-    data = convert_layouts(
-        layouts,
-        doc_id=args.doc_id,
-        doc_title=args.doc_title,
-        max_chars=args.max_chars,
-        overlap_chars=args.overlap_chars,
-    )
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    output_path = Path(args.out).expanduser().resolve()
-    output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
-
-    print(f"结构层节点数: {len(data['structure_nodes'])}")
-    print(f"语义层块数: {len(data['semantic_blocks'])}")
-    print(f"检索层块数: {len(data['vector_chunks'])}")
-    print(f"输出文件: {output_path}")
-
-
-if __name__ == "__main__":
-    main()
--- a/backend/app/aliyun_parser/schema.sql
+++ b/backend/app/aliyun_parser/schema.sql
@@ -1,122 +0,0 @@
-- 法规文档向量检索系统数据库表结构
-- PostgreSQL
-
-- ==================== 文档表 ====================
-CREATE TABLE documents (
-    id SERIAL PRIMARY KEY,
-    doc_id VARCHAR(128) UNIQUE NOT NULL,       -- 文档唯一标识，如 "GB14747-2006"
-    title VARCHAR(512) NOT NULL,               -- 文档标题
-    doc_type VARCHAR(32),                      -- 文档类型：标准/法规/规范
-    standard_number VARCHAR(64),               -- 标准编号：如 "GB 14747-2006"
-    publish_date DATE,                         -- 发布日期
-    implement_date DATE,                       -- 实施日期
-    status VARCHAR(32),                        -- 状态：现行/废止/修订
-    source_url VARCHAR(512),                   -- 来源 URL
-    file_path VARCHAR(512),                    -- 本地 PDF 文件路径
-    file_size INT,                             -- 文件大小（字节）
-    upload_time TIMESTAMP DEFAULT NOW(),       -- 上传时间
-    created_at TIMESTAMP DEFAULT NOW(),
-    updated_at TIMESTAMP DEFAULT NOW()
-);
-
-COMMENT ON TABLE documents IS '文档元数据表';
-COMMENT ON COLUMN documents.doc_id IS '文档唯一标识，用于关联 Milvus 和其他表';
-COMMENT ON COLUMN documents.standard_number IS '标准编号，如 GB 14747-2006';
-
-- ==================== 章节结构表 ====================
-CREATE TABLE sections (
-    id SERIAL PRIMARY KEY,
-    doc_id VARCHAR(128) NOT NULL,
-    unique_id VARCHAR(64) NOT NULL,            -- 阿里云返回的唯一标识
-    level INT NOT NULL,                        -- 层级：1, 2, 3...
-    title VARCHAR(512) NOT NULL,               -- 章节标题
-    page INT,                                  -- 所在页码
-    index INT,                                 -- 页内顺序
-    parent_id INT,                             -- 父章节 ID（树形结构）
-    created_at TIMESTAMP DEFAULT NOW(),
-
-    CONSTRAINT fk_sections_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
-    CONSTRAINT fk_sections_parent_id FOREIGN KEY (parent_id) REFERENCES sections(id),
-    CONSTRAINT uq_sections_doc_unique UNIQUE (doc_id, unique_id)
-);
-
-COMMENT ON TABLE sections IS '章节结构表，用于目录导航';
-COMMENT ON COLUMN sections.parent_id IS '父章节 ID，构建树形结构';
-COMMENT ON COLUMN sections.level IS '层级深度，1 为最顶层';
-
-- ==================== 语义块表 ====================
-CREATE TABLE semantic_blocks (
-    id SERIAL PRIMARY KEY,
-    doc_id VARCHAR(128) NOT NULL,
-    semantic_id VARCHAR(64) NOT NULL,          -- 语义块唯一标识
-    block_type VARCHAR(32) NOT NULL,           -- 类型：section_text/table/figure
-    page_start INT NOT NULL,                   -- 起始页码
-    page_end INT NOT NULL,                     -- 结束页码
-    section_id INT,                            -- 所属章节
-    section_title VARCHAR(512),                -- 章节标题（冗余，方便查询）
-    section_level INT,                         -- 章节层级
-    source_ids JSONB,                          -- 原始 layout IDs（JSON 数组）
-    text TEXT NOT NULL,                        -- 完整内容（未被切分）
-    created_at TIMESTAMP DEFAULT NOW(),
-
-    CONSTRAINT fk_semantic_blocks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
-    CONSTRAINT fk_semantic_blocks_section_id FOREIGN KEY (section_id) REFERENCES sections(id),
-    CONSTRAINT uq_semantic_blocks_doc_semantic UNIQUE (doc_id, semantic_id)
-);
-
-COMMENT ON TABLE semantic_blocks IS '语义块表，用于邻域扩展，恢复完整内容';
-COMMENT ON COLUMN semantic_blocks.block_type IS '类型：section_text（正文）、table（表格）、figure（图示）';
-COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组';
-COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容，未被切分';
-
-- ==================== 向量块元数据表 ====================
-CREATE TABLE vector_chunks (
-    id SERIAL PRIMARY KEY,
-    doc_id VARCHAR(128) NOT NULL,
-    chunk_id VARCHAR(64) NOT NULL,             -- Milvus 主键
-    semantic_id VARCHAR(64) NOT NULL,          -- 关联语义块
-    chunk_index INT NOT NULL,                  -- 切片序号（全局）
-    piece_index INT,                           -- 同语义块内的切片序号
-    page_start INT,
-    page_end INT,
-    section_title VARCHAR(512),
-    text VARCHAR(2048),                        -- 切片文本（可选，缩短版用于展示）
-    source_ids JSONB,                          -- 原始 layout IDs（JSON 数组）
-    created_at TIMESTAMP DEFAULT NOW(),
-
-    CONSTRAINT fk_vector_chunks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
-    CONSTRAINT fk_vector_chunks_semantic_id FOREIGN KEY (doc_id, semantic_id)
-        REFERENCES semantic_blocks(doc_id, semantic_id),
-    CONSTRAINT uq_vector_chunks_doc_chunk UNIQUE (doc_id, chunk_id)
-);
-
-COMMENT ON TABLE vector_chunks IS '向量块元数据表，用于快速关联查询';
-COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键';
-COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号，用于按序拼接';
-
-- ==================== 索引 ====================
-CREATE INDEX idx_sections_doc_id ON sections(doc_id);
-CREATE INDEX idx_sections_parent_id ON sections(parent_id);
-CREATE INDEX idx_sections_level ON sections(level);
-
-CREATE INDEX idx_semantic_blocks_doc_id ON semantic_blocks(doc_id);
-CREATE INDEX idx_semantic_blocks_section_id ON semantic_blocks(section_id);
-CREATE INDEX idx_semantic_blocks_block_type ON semantic_blocks(block_type);
-CREATE INDEX idx_semantic_blocks_semantic_id ON semantic_blocks(semantic_id);
-
-CREATE INDEX idx_vector_chunks_doc_id ON vector_chunks(doc_id);
-CREATE INDEX idx_vector_chunks_semantic_id ON vector_chunks(semantic_id);
-CREATE INDEX idx_vector_chunks_chunk_id ON vector_chunks(chunk_id);
-
-- ==================== 触发器：自动更新 updated_at ====================
-CREATE OR REPLACE FUNCTION update_updated_at()
-RETURNS TRIGGER AS $$
-BEGIN
-    NEW.updated_at = NOW();
-    RETURN NEW;
-END;
-$$ LANGUAGE plpgsql;
-
-CREATE TRIGGER tr_documents_updated_at
-    BEFORE UPDATE ON documents
-    FOR EACH ROW EXECUTE FUNCTION update_updated_at();
--- a/backend/app/aliyun_parser/upload_to_milvus.py
+++ b/backend/app/aliyun_parser/upload_to_milvus.py
@@ -1,327 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""Handle Aliyun parsing support for upload to milvus."""
-
-import argparse
-import json
-import time
-from pathlib import Path
-from typing import List, Dict
-
-import psycopg2
-from psycopg2.extras import execute_values
-from pymilvus import (
-    connections,
-    Collection,
-    FieldSchema,
-    CollectionSchema,
-    DataType,
-    utility,
-)
-from openai import OpenAI
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-RELAY_BASE_URL = "http://6.86.80.4:30080/v1"
-RELAY_API_KEY = "sk-5HeY7gfSIlyZMacfuXOf5cphpymsNqufEu1ou4U3avbULcyY"
-EMBEDDING_MODEL = "text-embedding-v3"  # Keep parser integration steps explicit so external workflow behavior stays traceable.
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-MILVUS_HOST = "localhost"
-MILVUS_PORT = "19530"
-COLLECTION_NAME = "regulation_chunks"
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-PG_HOST = "6.86.80.10"
-PG_PORT = 5432
-PG_USER = "postgresql"
-PG_PASSWORD = "postgresql123456"
-PG_DATABASE = "postgres"
-
-
-# ===================== Embedding =====================
-def get_openai_client(api_key: str, base_url: str) -> OpenAI:
-    """Return openai client."""
-    return OpenAI(api_key=api_key, base_url=base_url)
-
-
-def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10) -> List[List[float]]:
-    """Return embeddings batch."""
-    all_embeddings = []
-
-    for i in range(0, len(texts), batch_size):
-        batch = texts[i:i + batch_size]
-        print(f"Embedding batch {i // batch_size + 1}/{(len(texts) - 1) // batch_size + 1}...")
-
-        response = client.embeddings.create(
-            model=EMBEDDING_MODEL,
-            input=batch,
-        )
-
-        embeddings = [item.embedding for item in response.data]
-        all_embeddings.extend(embeddings)
-
-    return all_embeddings
-
-
-# ===================== Milvus =====================
-def init_milvus(host: str, port: str):
-    """Handle init milvus."""
-    connections.connect("default", host=host, port=port)
-    print(f"已连接 Milvus: {host}:{port}")
-
-
-def create_collection(name: str, dim: int) -> Collection:
-    """Create collection."""
-    if utility.has_collection(name):
-        print(f"Collection '{name}' 已存在，删除重建")
-        utility.drop_collection(name)
-
-    fields = [
-        FieldSchema(name="chunk_id", dtype=DataType.VARCHAR, max_length=64, is_primary=True),
-        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=128),
-        FieldSchema(name="doc_title", dtype=DataType.VARCHAR, max_length=512),
-        FieldSchema(name="chunk_index", dtype=DataType.INT64),
-        FieldSchema(name="semantic_id", dtype=DataType.VARCHAR, max_length=64),
-        FieldSchema(name="chunk_type", dtype=DataType.VARCHAR, max_length=32),
-        FieldSchema(name="page_start", dtype=DataType.INT64),
-        FieldSchema(name="page_end", dtype=DataType.INT64),
-        FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
-        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2048),
-        FieldSchema(name="source_ids", dtype=DataType.VARCHAR, max_length=4096),  # Keep parser integration steps explicit so external workflow behavior stays traceable.
-        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
-    ]
-
-    schema = CollectionSchema(fields, description="法规文档检索 chunks")
-    collection = Collection(name, schema)
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    index_params = {
-        "metric_type": "COSINE",
-        "index_type": "IVF_FLAT",
-        "params": {"nlist": 128},
-    }
-    collection.create_index("embedding", index_params)
-    print(f"Collection '{name}' 创建完成，索引已建立")
-
-    return collection
-
-
-def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[List[float]]):
-    """Handle insert chunks."""
-    data = [
-        [c["chunk_id"] for c in chunks],
-        [c["doc_id"] for c in chunks],
-        [c["doc_title"] for c in chunks],
-        [c["chunk_index"] for c in chunks],
-        [c["semantic_id"] for c in chunks],
-        [c["chunk_type"] for c in chunks],
-        [c["page_start"] for c in chunks],
-        [c["page_end"] for c in chunks],
-        [c["section_title"] for c in chunks],
-        [c["text"] for c in chunks],
-        [json.dumps(c.get("source_ids", [])) for c in chunks],  # Keep parser integration steps explicit so external workflow behavior stays traceable.
-        embeddings,
-    ]
-
-    collection.insert(data)
-    collection.flush()
-    print(f"已插入 {len(chunks)} 个 chunks")
-
-
-def load_collection(collection: Collection):
-    """Load collection."""
-    collection.load()
-    print(f"Collection 已加载到内存")
-
-
-# ===================== PostgreSQL =====================
-def get_pg_connection(host: str, port: int, user: str, password: str, database: str):
-    """Return pg connection."""
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        database=database,
-    )
-    print(f"已连接 PostgreSQL: {host}:{port}/{database}")
-    return conn
-
-
-def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
-    """Handle insert chunks to pg."""
-    cursor = conn.cursor()
-
-    try:
-        # Keep parser integration steps explicit so external workflow behavior stays traceable.
-        cursor.execute("""
-            INSERT INTO documents (doc_id, title, standard_number, upload_time)
-            VALUES (%s, %s, %s, NOW())
-            ON CONFLICT (doc_id) DO UPDATE SET title = EXCLUDED.title, updated_at = NOW()
-        """, (doc_data["doc_id"], doc_data["doc_title"], doc_data.get("standard_number")))
-
-        # Keep parser integration steps explicit so external workflow behavior stays traceable.
-        semantic_blocks = doc_data.get("semantic_blocks", [])
-        if semantic_blocks:
-            block_rows = [
-                (
-                    doc_data["doc_id"],
-                    block["semantic_id"],
-                    block["block_type"],
-                    block["page_start"],
-                    block["page_end"],
-                    block.get("section_title"),
-                    block.get("section_level"),
-                    json.dumps(block.get("source_ids", [])),
-                    block["text"],
-                )
-                for block in semantic_blocks
-            ]
-            execute_values(
-                cursor,
-                """
-                INSERT INTO semantic_blocks
-                (doc_id, semantic_id, block_type, page_start, page_end, section_title, section_level, source_ids, text)
-                VALUES %s
-                ON CONFLICT (doc_id, semantic_id) DO UPDATE SET text = EXCLUDED.text
-                """,
-                block_rows,
-            )
-            print(f"已插入 {len(semantic_blocks)} 个语义块")
-
-        # Keep parser integration steps explicit so external workflow behavior stays traceable.
-        chunk_rows = [
-            (
-                doc_data["doc_id"],
-                chunk["chunk_id"],
-                chunk["semantic_id"],
-                chunk["chunk_index"],
-                chunk.get("piece_index"),
-                chunk["page_start"],
-                chunk["page_end"],
-                chunk.get("section_title"),
-                chunk["text"],
-                json.dumps(chunk.get("source_ids", [])),
-            )
-            for chunk in chunks
-        ]
-        execute_values(
-            cursor,
-            """
-            INSERT INTO vector_chunks
-            (doc_id, chunk_id, semantic_id, chunk_index, piece_index, page_start, page_end, section_title, text, source_ids)
-            VALUES %s
-            ON CONFLICT (doc_id, chunk_id) DO UPDATE SET text = EXCLUDED.text
-            """,
-            chunk_rows,
-        )
-        print(f"已插入 {len(chunks)} 个向量块元数据")
-
-        conn.commit()
-        print("PostgreSQL 数据插入完成")
-
-    except Exception as e:
-        conn.rollback()
-        raise e
-    finally:
-        cursor.close()
-
-
-# Keep parser integration steps explicit so external workflow behavior stays traceable.
-def load_data(file_path: Path) -> Dict:
-    """Load data."""
-    data = json.loads(file_path.read_text(encoding="utf-8"))
-    return data
-
-
-def upload_to_milvus_and_pg(
-    chunks_file: str,
-    api_key: str,
-    base_url: str,
-    milvus_host: str,
-    milvus_port: str,
-    collection_name: str,
-    batch_size: int,
-    pg_host: str,
-    pg_port: int,
-    pg_user: str,
-    pg_password: str,
-    pg_database: str,
-):
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    """Handle upload to milvus and pg."""
-    chunks_path = Path(chunks_file).expanduser().resolve()
-    if not chunks_path.exists():
-        raise FileNotFoundError(f"文件不存在: {chunks_path}")
-
-    data = load_data(chunks_path)
-    chunks = data.get("vector_chunks", [])
-    if not chunks:
-        raise ValueError("vector_chunks 为空")
-    print(f"加载 {len(chunks)} 个 chunks")
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    client = get_openai_client(api_key, base_url)
-    init_milvus(milvus_host, milvus_port)
-    pg_conn = get_pg_connection(pg_host, pg_port, pg_user, pg_password, pg_database)
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    texts = [c["embedding_text"] for c in chunks]
-    embeddings = get_embeddings_batch(client, texts, batch_size)
-    print(f"生成 {len(embeddings)} 个向量")
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    embedding_dim = len(embeddings[0])
-    print(f"Embedding 维度: {embedding_dim}")
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    collection = create_collection(collection_name, embedding_dim)
-    insert_chunks(collection, chunks, embeddings)
-    load_collection(collection)
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    insert_chunks_to_pg(pg_conn, chunks, data)
-
-    # Keep parser integration steps explicit so external workflow behavior stays traceable.
-    pg_conn.close()
-
-    print("上传完成！")
-
-
-# ===================== CLI =====================
-def main():
-    """Run the module entrypoint."""
-    parser = argparse.ArgumentParser(description="将 vector_chunks 向量化并上传到 Milvus 和 PostgreSQL")
-    parser.add_argument("chunks_file", help="vector_chunks.json 文件路径")
-    parser.add_argument("--api-key", default=RELAY_API_KEY, help="中转站 API Key")
-    parser.add_argument("--base-url", default=RELAY_BASE_URL, help="中转站 Base URL")
-    parser.add_argument("--milvus-host", default=MILVUS_HOST, help="Milvus host")
-    parser.add_argument("--milvus-port", default=MILVUS_PORT, help="Milvus port")
-    parser.add_argument("--collection", default=COLLECTION_NAME, help="Milvus collection 名称")
-    parser.add_argument("--batch-size", type=int, default=10, help="Embedding 批量大小（中转站限制最大10）")
-    parser.add_argument("--pg-host", default=PG_HOST, help="PostgreSQL host")
-    parser.add_argument("--pg-port", type=int, default=PG_PORT, help="PostgreSQL port")
-    parser.add_argument("--pg-user", default=PG_USER, help="PostgreSQL user")
-    parser.add_argument("--pg-password", default=PG_PASSWORD, help="PostgreSQL password")
-    parser.add_argument("--pg-database", default=PG_DATABASE, help="PostgreSQL database")
-    args = parser.parse_args()
-
-    upload_to_milvus_and_pg(
-        chunks_file=args.chunks_file,
-        api_key=args.api_key,
-        base_url=args.base_url,
-        milvus_host=args.milvus_host,
-        milvus_port=args.milvus_port,
-        collection_name=args.collection,
-        batch_size=args.batch_size,
-        pg_host=args.pg_host,
-        pg_port=args.pg_port,
-        pg_user=args.pg_user,
-        pg_password=args.pg_password,
-        pg_database=args.pg_database,
-    )
-
-
-if __name__ == "__main__":
-    main()
--- a/backend/app/aliyun_parser/vector_chunks.json
+++ b/backend/app/aliyun_parser/vector_chunks.json
--- a/backend/app/aliyun_parser/嵌入和召回.md
+++ b/backend/app/aliyun_parser/嵌入和召回.md
@@ -1,263 +0,0 @@
-# 文档解析与向量检索说明
-
-## 相关文件
-
- `aliyun_doc_parser.py`：调用阿里云文档智能解析 PDF，生成原始 `layouts.json`
- `layouts_to_vector_chunks.py`：把 `layouts.json` 转成适合向量数据库入库的三层结构
- `layouts.json`：阿里云返回的原始布局结果
- `vector_chunks.json`：转换后的结构化输出
-
-## 一、`layouts.json` 的结构
-
-`layouts.json` 顶层是一个数组，每个元素代表一个布局块（layout）。常见字段如下：
-
- `type`：主类型，例如 `title`、`text`、`table`、`figure`
- `subType`：更细的语义类型，例如 `doc_title`、`para_title`、`para`、`picture`、`pic_title`、`pic_caption`
- `text`：当前布局块的纯文本
- `markdownContent`：带 markdown 标记的文本
- `pageNum`：页码
- `index`：页内顺序
- `level`：标题层级
- `uniqueId`：布局块唯一标识
- `blocks`：更细粒度的文本与样式信息
- `cells`：表格单元格，仅 `table` 类型存在
-
-这个结构不是简单 OCR 文本流，而是已经带有版面理解和语义分类的结构化数据。
-
-## 二、推荐的三层转换结构
-
-### 1. 结构层 `structure_nodes`
-
-结构层用于恢复文档标题树，不直接作为最终向量检索单元。
-
-示例：
-
- `1 范围`
- `2 规范性引用文件`
- `3 术语和定义`
-  - `3.1 儿童三轮车`
-  - `3.2 轮距`
-
-结构层主要用于给下游 chunk 绑定 `section_path`。
-
-### 2. 语义层 `semantic_blocks`
-
-语义层是按文档意义聚合后的内容块，主要分为三类：
-
- `section_text`：同一章节下连续正文聚合而成
- `table`：表格内容单独成块
- `figure`：图、图名、图注等单独成块
-
-这一层比单 layout 更适合做语义理解，也适合后续做上下文扩展。
-
-### 3. 检索层 `vector_chunks`
-
-检索层是最终写进向量数据库的 chunk。
-
-处理方式：
-
- 对 `semantic_blocks` 中较短的块直接入库
- 对较长的块按 `max_chars` 再切分
- 相邻切片保留 `overlap_chars` 重叠
- 每个 chunk 都带完整 metadata，便于后续过滤、重排和邻域扩展
-
-## 三、当前转换脚本做了什么
-
-`layouts_to_vector_chunks.py` 当前已经实现：
-
-1. 过滤目录页噪声（如 `目次`）
-2. 根据标题层级维护章节路径
-3. 将正文聚合成 `section_text`
-4. 将表格单独转成 `table`
-5. 将图相关内容单独转成 `figure`
-6. 对长文本继续切分为最终 `vector_chunks`
-7. 为每个检索 chunk 生成 `embedding_text`
-
-## 四、为什么不要直接按 layout 入库
-
-如果把 `layouts.json` 的每条 layout 直接做向量：
-
- 颗粒度太碎
- 标题和正文容易分离
- 表格会丢失结构上下文
- 图示信息无法完整表达
- 检索命中结果噪声较大
-
-对于标准文档，最合适的单位通常不是“句子”，而是“条款语义块”。
-
-## 五、建议的入库字段
-
-建议向量数据库每条记录至少保存：
-
- `embedding_text`：用于生成向量
- `text`：原始 chunk 文本
- `chunk_id`
- `semantic_id`
- `chunk_type`：`section_text` / `table` / `figure`
- `section_path`
- `section_title`
- `section_level`
- `page_start`
- `page_end`
- `doc_id`
- `doc_title`
- `source_ids`
-
-其中：
-
- 向量化字段：`embedding_text`
- 展示字段：`text`
- 检索增强字段：其余 metadata
-
-## 六、推荐的检索方式
-
-不要只做最简单的 top-k 向量搜索，建议采用：
-
-**向量召回 + metadata 重排 + 邻域扩展**
-
-### 1. 向量召回
-
-使用 `vector_chunks[*].embedding_text` 做 embedding，并在向量数据库中检索 top 10 ~ 15 条。
-
-查询时可以对用户问题做轻微改写，例如：
-
-原问题：
-
-`儿童三轮车的定义是什么？`
-
-可改写为：
-
-`请检索 GB 14747—2006 儿童三轮车安全要求 中关于“儿童三轮车定义”的条款、术语、表格或图示说明。`
-
-这样更适合标准文档检索。
-
-### 2. metadata 重排
-
-向量召回后，根据 metadata 做轻量规则重排。
-
-常见规则：
-
- `chunk_type == section_text`：对定义类、要求类问题优先级更高
- `section_path` 命中查询关键词：例如查询“定义”时，`术语和定义` 章节优先
- `chunk_type == table`：对“尺寸 / 参数 / 数值 / 对照 / 要求”类问题加权
- `chunk_type == figure`：对“图 / 结构 / 状态 / 示意”类问题加权
-
-### 3. 邻域扩展
-
-检索命中的是最终切片，但回答往往需要更完整上下文。
-
-建议命中某个 `vector_chunk` 后：
-
-1. 优先回捞同一个 `semantic_id` 下的所有 chunk
-2. 如果还不够，再补充同 `section_path`、相邻页码或相邻 `chunk_index` 的内容
-
-这样可以恢复完整条款，而不是只给模型一小段碎片。
-
-## 七、不同问题的检索重点
-
-### 1. 定义类问题
-
-例如：
-
- `儿童三轮车的定义是什么？`
- `轮距是什么意思？`
-
-优先检索：
-
- `section_text`
- `section_path` 中包含 `术语和定义` 的内容
-
-### 2. 要求类问题
-
-例如：
-
- `外露突出物有什么要求？`
- `辅助推杆有哪些安全要求？`
-
-优先检索：
-
- `section_text`
- `table`
-
-### 3. 数值 / 尺寸 / 对照类问题
-
-例如：
-
- `鞍座到脚蹬距离要求是什么？`
- `哪些项目需要满足规定尺寸？`
-
-优先检索：
-
- `table`
- `section_text`
-
-### 4. 图示说明类问题
-
-例如：
-
- `正常乘骑状态是什么意思？`
- `图1表示什么？`
-
-优先检索：
-
- `figure`
- 同章节相邻 `section_text`
-
-## 八、推荐的最终检索流程
-
-建议采用以下固定流程：
-
-1. 用 `vector_chunks.embedding_text` 做 embedding 检索
-2. 取 top 10 ~ 15 条候选
-3. 按 `chunk_type + section_path` 做规则重排
-4. 以 `semantic_id` 为中心回捞完整语义块
-5. 选 3 ~ 5 组上下文提供给大模型回答
-
-## 九、给大模型的上下文组织方式
-
-最终不要直接把原始 JSON 扔给模型，建议整理成如下格式：
-
-```text
-[命中片段 1]
-章节：3 术语和定义 > 3.1 儿童三轮车
-页码：1-2
-类型：section_text
-内容：
-......
-
-[命中片段 2]
-章节：4 要求 > 4.3 外露突出物
-页码：5
-类型：section_text
-内容：
-......
-
-[命中片段 3]
-章节：5 试验方法
-页码：8
-类型：table
-内容：
-......
-```
-
-这种格式更利于模型稳定回答并引用出处。
-
-## 十、转换命令
-
-生成三层结构：
-
-```bash
-python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
-  --layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
-  --out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json
-```
-
-自定义切片大小：
-
-```bash
-python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
-  --layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
-  --out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json \
-  --max-chars 500 \
-  --overlap-chars 80
-```
--- a/backend/app/api/routes/status.py
+++ b/backend/app/api/routes/status.py
@@ -32,6 +32,10 @@ async def get_config():
        "embedding_dim": settings.embedding_dim,
        "embedding_base_url": settings.embedding_base_url,
        "milvus_collection": settings.milvus_collection,
+        "parser_backend": settings.parser_backend,
+        "chunk_backend": settings.chunk_backend,
+        "artifact_prefix": settings.document_parse_artifact_prefix,
+        "parser_failure_mode": settings.parser_failure_mode,
        "llm_provider": settings.llm_provider,
        "llm_model": settings.llm_model,
        "document_metadata_path": settings.document_metadata_path,
--- a/backend/app/application/documents/services.py
+++ b/backend/app/application/documents/services.py
@@ -5,6 +5,7 @@ from __future__ import annotations
 import os
 import tempfile
 import uuid
+import json
 from dataclasses import dataclass

 from loguru import logger
@@ -16,6 +17,7 @@ from app.domain.documents import (
    DocumentParser,
    DocumentRepository,
    DocumentStatus,
+    ParsedDocument,
 )
 from app.domain.retrieval import EmbeddingProvider, VectorIndex
 # Keep orchestration logic centralized so use-case flow stays easy to trace.
@@ -54,6 +56,27 @@ class DocumentCommandService:
        self.embedding_provider = embedding_provider
        self.vector_index = vector_index

+    def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
+        """Persist parse artifacts so troubleshooting does not depend on provider retention windows."""
+        prefix = f"{parsed_document.metadata.get('artifact_prefix', 'artifacts').strip('/')}/{doc_id}"  # Key layout: <artifact_prefix>/<doc_id>; "artifacts" is used only when the metadata key is absent, and surrounding "/" are stripped.
+        artifact_payloads = {  # Artifact name -> payload; each entry is stored as its own JSON object.
+            "layouts": parsed_document.raw_layouts,
+            "structure_nodes": parsed_document.structure_nodes,
+            "semantic_blocks": parsed_document.semantic_blocks,
+            "vector_chunks": parsed_document.vector_chunks,
+        }
+        artifact_keys: dict[str, str] = {}  # Artifact name -> stored object name; this mapping is the return value.
+        for name, payload in artifact_payloads.items():
+            object_name = f"{prefix}/{name}.json"  # e.g. artifacts/<doc_id>/layouts.json with the default prefix.
+            self.binary_store.save(  # No try/except here: an exception from save() propagates and earlier artifacts are not cleaned up.
+                object_name=object_name,
+                data=json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8"),  # ensure_ascii=False keeps non-ASCII text unescaped; payload must be JSON-serializable.
+                content_type="application/json",
+                metadata={"doc_id": doc_id, "artifact_type": name},
+            )
+            artifact_keys[name] = object_name  # Recorded only after save() returned without raising.
+        return artifact_keys
+
    def upload_and_process(
        self,
        *,
@@ -104,11 +127,21 @@ class DocumentCommandService:
                doc_id=doc_id,
                doc_name=final_doc_name,
            )
+            artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
            self.document_repository.update_status(
                doc_id,
                DocumentStatus.PARSED,
                parser_name=parsed_document.parser_name,
-                metadata={"structure_nodes": len(parsed_document.structure_nodes)},
+                metadata={
+                    "parser_backend": parsed_document.parser_name,
+                    "parse_task_id": parsed_document.metadata.get("task_id", ""),
+                    "layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
+                    "structure_node_count": len(parsed_document.structure_nodes),
+                    "semantic_block_count": len(parsed_document.semantic_blocks),
+                    "vector_chunk_count": len(parsed_document.vector_chunks),
+                    "artifact_keys": artifact_keys,
+                    "processing_stage": "parsed",
+                },
            )

            chunks = self.chunk_builder.build(
@@ -124,13 +157,18 @@ class DocumentCommandService:
            if inserted != len(chunks):
                logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))

+            health = self.vector_index.health()
            self.document_repository.update_status(
                doc_id,
                DocumentStatus.INDEXED,
                chunk_count=len(chunks),
                summary="",
                summary_latency_ms=0,
-                index_name=self.vector_index.health().get("collection_name", ""),
+                index_name=health.get("collection_name", ""),
+                metadata={
+                    "index_collection": health.get("collection_name", ""),
+                    "processing_stage": "indexed",
+                },
            )
            stored = self.document_repository.get(doc_id)
            return DocumentProcessResult(
@@ -148,6 +186,10 @@ class DocumentCommandService:
                doc_id,
                DocumentStatus.FAILED,
                error_message=str(exc),
+                metadata={
+                    "failure_reason": str(exc),
+                    "processing_stage": "failed",
+                },
            )
            return DocumentProcessResult(
                doc_id=doc_id,
--- a/backend/app/config/settings.py
+++ b/backend/app/config/settings.py
@@ -1,9 +1,9 @@
-"""Configure backend settings for settings."""
+"""Configure backend settings for the backend application."""

 from pathlib import Path

-from pydantic_settings import BaseSettings, SettingsConfigDict
 from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
 from functools import lru_cache
 # Keep configuration setup explicit so runtime behavior is easy to reason about.

@@ -33,18 +33,25 @@ class Settings(BaseSettings):
    # Keep configuration setup explicit so runtime behavior is easy to reason about.
    milvus_host: str = Field(default="localhost", description="Milvus服务地址")
    milvus_port: int = Field(default=19530, description="Milvus服务端口")
-    milvus_collection: str = Field(default="regulations_dense_1536", description="法规向量集合名称")
+    milvus_collection: str = Field(default="regulations_dense_1024_v1", description="法规向量集合名称")
    milvus_db_name: str = Field(default="default", description="Milvus数据库名称")

    # Keep configuration setup explicit so runtime behavior is easy to reason about.
    embedding_model: str = Field(default="text-embedding-v3", description="嵌入模型名称")
-    embedding_dim: int = Field(default=1536, description="嵌入向量维度")
+    embedding_dim: int = Field(default=1024, description="嵌入向量维度")
    embedding_api_key: str = Field(default="", description="Embedding API密钥")
    embedding_base_url: str = Field(default="http://6.86.80.4:30080/v1", description="Embedding API地址")
    embedding_timeout_seconds: int = Field(default=120, description="Embedding API超时时间(秒)")
    alibaba_access_key_id: str = Field(default="", description="阿里云文档解析 Access Key ID")
    alibaba_access_key_secret: str = Field(default="", description="阿里云文档解析 Access Key Secret")
    alibaba_endpoint: str = Field(default="docmind-api.cn-hangzhou.aliyuncs.com", description="阿里云文档解析 endpoint")
+    aliyun_parse_poll_interval_seconds: int = Field(default=5, description="阿里云文档解析轮询间隔(秒)")
+    aliyun_parse_timeout_seconds: int = Field(default=900, description="阿里云文档解析超时时间(秒)")
+    aliyun_parse_layout_step_size: int = Field(default=50, description="阿里云文档解析分页步长")
+    aliyun_llm_enhancement: bool = Field(default=True, description="是否启用阿里云解析增强")
+    aliyun_enhancement_mode: str = Field(default="VLM", description="阿里云解析增强模式")
+    document_parse_artifact_prefix: str = Field(default="artifacts", description="解析产物对象前缀")
+    parser_failure_mode: str = Field(default="fail", description="解析失败策略")

    # Keep configuration setup explicit so runtime behavior is easy to reason about.
    minio_endpoint: str = Field(default="localhost:9000", description="MinIO服务地址")
@@ -71,8 +78,8 @@ class Settings(BaseSettings):
    chunk_overlap: int = Field(default=50, description="分块重叠大小")
    max_file_size_mb: int = Field(default=100, description="最大文件大小(MB)")
    document_metadata_path: str = Field(default="backend/data/documents.json", description="文档元数据存储路径")
-    parser_backend: str = Field(default="local", description="解析后端(local/aliyun)")
-    chunk_backend: str = Field(default="local", description="分块后端(local/aliyun)")
+    parser_backend: str = Field(default="aliyun", description="解析后端(local/aliyun)")
+    chunk_backend: str = Field(default="aliyun", description="分块后端(local/aliyun)")

    # Keep configuration setup explicit so runtime behavior is easy to reason about.
    api_host: str = Field(default="0.0.0.0", description="API服务地址")
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -27,12 +27,12 @@ class Settings(BaseSettings):
    # Milvus
    milvus_host: str = "localhost"
    milvus_port: int = 19530
-    milvus_collection: str = "regulations_dense_1536"
+    milvus_collection: str = "regulations_dense_1024_v1"

    # LLM / embedding defaults aligned with the migrated backend path.
    llm_model: str = "qwen-max"
    embedding_model: str = "text-embedding-v3"
-    embedding_dim: int = 1536
+    embedding_dim: int = 1024

    # Legacy workflow compatibility only.
    vector_top_k: int = 10
@@ -47,7 +47,7 @@ class Settings(BaseSettings):
    api_port: int = 8000

    # Legacy aliases retained for old utility modules.
-    regulations_collection: str = "regulations_dense_1536"
+    regulations_collection: str = "regulations_dense_1024_v1"
    compliance_collection: str = "compliance_cache"

 # Preserve the legacy module API while keeping env resolution centralized at the repo root.
--- a/backend/app/domain/documents/models.py
+++ b/backend/app/domain/documents/models.py
@@ -56,6 +56,7 @@ class ParsedDocument:
    vector_chunks: list[dict[str, Any]]
    parser_name: str
    raw_text: str = ""
+    raw_layouts: list[dict[str, Any]] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)


--- a/backend/app/infrastructure/embedding/openai_compatible_embedding_provider.py
+++ b/backend/app/infrastructure/embedding/openai_compatible_embedding_provider.py
@@ -10,6 +10,8 @@ from app.config.settings import settings
 from app.domain.retrieval import EmbeddingProvider
 # Keep adapter behavior explicit so integration details remain easy to audit.

+EMBEDDING_BATCH_SIZE = 8
+


 class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
@@ -27,6 +29,18 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
        self.timeout = settings.embedding_timeout_seconds
        self.dimension = settings.embedding_dim

+    def _raise_for_status(self, response: httpx.Response, *, batch_size: int) -> None:
+        """Raise a detailed error so upstream gateway failures are easier to diagnose."""
+        try:
+            response.raise_for_status()  # Falls through (implicitly returning None) unless httpx raises HTTPStatusError.
+        except httpx.HTTPStatusError as exc:
+            response_preview = response.text[:500].strip()  # Cap the body preview at 500 characters so the error message stays bounded.
+            detail = (  # Includes the model, batch size, status code, request URL and body preview.
+                f"Embedding request failed for model={self.model}, batch_size={batch_size}, "
+                f"status={response.status_code}, url={response.request.url}, response={response_preview}"
+            )
+            raise httpx.HTTPStatusError(detail, request=exc.request, response=exc.response) from exc  # Same exception type, so existing HTTPStatusError handlers still match; "from exc" keeps the original as __cause__.
+
    def _request(self, texts: list[str]) -> list[list[float]]:
        """Handle request for this module for the Open A I Compatible Embedding Provider instance."""
        if not self.api_key:
@@ -40,7 +54,7 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
            json={"model": self.model, "input": texts},
            timeout=self.timeout,
        )
-        response.raise_for_status()
+        self._raise_for_status(response, batch_size=len(texts))
        data = response.json()
        vectors = [item["embedding"] for item in sorted(data.get("data", []), key=lambda item: item["index"])]
        if any(len(vector) != self.dimension for vector in vectors):
@@ -51,7 +65,12 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
        """Embed texts for the Open A I Compatible Embedding Provider instance."""
        if not texts:
            return []
-        return self._request(texts)
+        vectors: list[list[float]] = []
+        # Batch requests conservatively because some gateways reject larger embedding payloads.
+        for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
+            batch = texts[start:start + EMBEDDING_BATCH_SIZE]
+            vectors.extend(self._request(batch))
+        return vectors

    def embed_query(self, text: str) -> list[float]:
        """Embed query for the Open A I Compatible Embedding Provider instance."""
--- a/backend/app/infrastructure/parser/aliyun_docmind_gateway.py
+++ b/backend/app/infrastructure/parser/aliyun_docmind_gateway.py
@@ -0,0 +1,142 @@
+"""Aliyun Docmind gateway helpers for the document ingest pipeline."""
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from alibabacloud_docmind_api20220711 import models as docmind_models
+from alibabacloud_docmind_api20220711.client import Client as DocmindClient
+from alibabacloud_tea_openapi import models as open_api_models
+from alibabacloud_tea_util import models as util_models
+
+from app.config.settings import settings
+
+# Keep provider-specific behavior isolated so the rest of the backend can stay stable.
+
+
+@dataclass
+class AliyunParsePayload:
+    """Carry the outcome of one parse run as assembled by AliyunDocmindGateway.parse_document."""
+
+    task_id: str  # id returned when the parse job was submitted
+    layouts: list[dict[str, Any]]  # layout records gathered from every fetched result page
+    poll_attempts: int  # number of status queries issued before the job reported success
+    duration_ms: int  # elapsed time from just before submit until results were collected
+
+
+class AliyunDocmindGateway:
+    """Submit a document to Aliyun Docmind, poll the job to completion, and gather its layouts."""
+
+    def __init__(self) -> None:
+        """Copy endpoint, polling, paging, and enhancement options from the project settings."""
+        self.endpoint = settings.alibaba_endpoint
+        self.poll_interval_seconds = settings.aliyun_parse_poll_interval_seconds
+        self.timeout_seconds = settings.aliyun_parse_timeout_seconds
+        self.layout_step_size = settings.aliyun_parse_layout_step_size
+        self.llm_enhancement = settings.aliyun_llm_enhancement
+        self.enhancement_mode = settings.aliyun_enhancement_mode
+
+    def parse_document(self, *, file_path: str) -> AliyunParsePayload:
+        """Run submit -> poll -> collect for one file and return its layouts with timing metadata."""
+        client = self._create_client()
+        started_at = time.monotonic()  # the timeout budget and duration_ms both start here
+        task_id = self._submit_job(client=client, file_path=file_path)
+        poll_attempts = self._wait_for_completion(client=client, task_id=task_id, started_at=started_at)
+        layouts = self._collect_all_results(client=client, task_id=task_id)
+        duration_ms = int((time.monotonic() - started_at) * 1000)
+        return AliyunParsePayload(
+            task_id=task_id,
+            layouts=layouts,
+            poll_attempts=poll_attempts,
+            duration_ms=duration_ms,
+        )
+
+    def _create_client(self) -> DocmindClient:
+        """Build a Docmind client from the configured AccessKey pair; raise ValueError if either is unset."""
+        config = open_api_models.Config()
+        config.endpoint = self.endpoint
+
+        if not settings.alibaba_access_key_id or not settings.alibaba_access_key_secret:
+            raise ValueError(
+                "Missing Aliyun parser credentials. Set ALIBABA_ACCESS_KEY_ID and "
+                "ALIBABA_ACCESS_KEY_SECRET in the project root .env."
+            )
+
+        # Keep production behavior deterministic by using only project-configured credentials.
+        config.access_key_id = settings.alibaba_access_key_id
+        config.access_key_secret = settings.alibaba_access_key_secret
+        return DocmindClient(config)
+
+    def _submit_job(self, *, client: DocmindClient, file_path: str) -> str:
+        """Upload the file as a parse job and return its task id; raise RuntimeError if none is returned."""
+        path = Path(file_path)
+        with path.open("rb") as file_stream:  # the stream must stay open until the submit call returns
+            request = docmind_models.SubmitDocParserJobAdvanceRequest(
+                file_url_object=file_stream,
+                file_name=path.name,
+                file_name_extension=path.suffix.lstrip("."),
+                llm_enhancement=self.llm_enhancement,
+                enhancement_mode=self.enhancement_mode,
+            )
+            runtime = util_models.RuntimeOptions()
+            response = client.submit_doc_parser_job_advance(request, runtime)
+        task_id = response.body.data.id if response.body and response.body.data else ""
+        if not task_id:
+            raise RuntimeError("Aliyun Docmind did not return a parse task id.")
+        return task_id
+
+    def _query_status(self, *, client: DocmindClient, task_id: str) -> dict[str, Any] | None:
+        """Return the status data of a parse task as a dict, or None when the response carries no data."""
+        request = docmind_models.QueryDocParserStatusRequest(id=task_id)
+        response = client.query_doc_parser_status(request)
+        return response.body.data.to_map() if response.body and response.body.data else None
+
+    def _wait_for_completion(self, *, client: DocmindClient, task_id: str, started_at: float) -> int:
+        """Poll until success and return the poll count; raise RuntimeError on failure, TimeoutError on timeout."""
+        poll_attempts = 0
+        while True:
+            poll_attempts += 1
+            status_payload = self._query_status(client=client, task_id=task_id)
+            if not status_payload:
+                raise RuntimeError(f"Aliyun parse status payload is empty for task {task_id}.")
+
+            status = str(status_payload.get("Status", "")).lower()  # compare case-insensitively
+            if status == "success":
+                return poll_attempts
+            if status in {"fail", "failed"}:  # the API reports "Fail"; "failed" is kept for compatibility
+                raise RuntimeError(f"Aliyun parse task failed: {status_payload}")
+
+            elapsed = time.monotonic() - started_at
+            if elapsed > self.timeout_seconds:
+                raise TimeoutError(
+                    f"Aliyun parse task timed out after {self.timeout_seconds}s: task_id={task_id}"
+                )
+            time.sleep(self.poll_interval_seconds)
+
+    def _collect_all_results(self, *, client: DocmindClient, task_id: str) -> list[dict[str, Any]]:
+        """Fetch result pages until a short or empty page and return all layouts; raise RuntimeError if none."""
+        all_layouts: list[dict[str, Any]] = []
+        layout_num = 0  # offset of the next page; advanced by the number of layouts received
+        while True:
+            request = docmind_models.GetDocParserResultRequest(
+                id=task_id,
+                layout_step_size=self.layout_step_size,
+                layout_num=layout_num,
+            )
+            response = client.get_doc_parser_result(request)
+            payload = response.body.data if response.body else None
+            if not payload:
+                break
+            layouts = payload.get("layouts") or []  # a null "layouts" value ends paging like a missing one
+            if not layouts:
+                break
+            all_layouts.extend(layouts)
+            layout_num += len(layouts)
+            if len(layouts) < self.layout_step_size:  # a short page means there is nothing left to fetch
+                break
+        if not all_layouts:
+            raise RuntimeError(f"Aliyun parse task returned no layouts: task_id={task_id}")
+        return all_layouts
--- a/backend/app/infrastructure/parser/aliyun_document_parser.py
+++ b/backend/app/infrastructure/parser/aliyun_document_parser.py
@@ -1,19 +1,18 @@
-"""Implement infrastructure support for aliyun document parser."""
+"""Implement infrastructure support for Aliyun document parsing."""

 from __future__ import annotations

-from app.aliyun_parser.parse_pdf import (
+from app.config.settings import settings
+from app.domain.documents import DocumentParser, ParsedDocument
+from app.infrastructure.parser.aliyun_docmind_gateway import AliyunDocmindGateway
+from app.infrastructure.parser.aliyun_layout_normalizer import (
    MAX_CHARS,
    OVERLAP_CHARS,
    build_semantic_blocks,
    build_structure_nodes,
    build_vector_chunks,
-    collect_all_results,
-    init_client,
-    submit_job,
-    wait_for_completion,
 )
-from app.domain.documents import DocumentParser, ParsedDocument
+
 # Keep adapter behavior explicit so integration details remain easy to audit.


@@ -22,13 +21,14 @@ class AliyunDocumentParser(DocumentParser):
    """Provide the Aliyun Document Parser parser."""
    parser_name = "aliyun_docmind"

+    def __init__(self) -> None:
+        """Initialize the parser adapter and its gateway dependency."""
+        self.gateway = AliyunDocmindGateway()
+
    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Handle parse for the Aliyun Document Parser instance."""
-        client = init_client()
-        task_id = submit_job(client, file_path)
-        if not wait_for_completion(client, task_id):
-            raise RuntimeError("阿里云文档解析任务失败")
-        layouts = collect_all_results(client, task_id)
+        payload = self.gateway.parse_document(file_path=file_path)
+        layouts = payload.layouts
        structure_nodes = build_structure_nodes(layouts)
        semantic_blocks = build_semantic_blocks(layouts)
        vector_chunks = build_vector_chunks(
@@ -51,5 +51,13 @@ class AliyunDocumentParser(DocumentParser):
            vector_chunks=vector_chunks,
            parser_name=self.parser_name,
            raw_text=raw_text,
-            metadata={"task_id": task_id, "layout_count": len(layouts)},
+            raw_layouts=layouts,
+            metadata={
+                "task_id": payload.task_id,
+                "layout_count": len(layouts),
+                "poll_attempts": payload.poll_attempts,
+                "duration_ms": payload.duration_ms,
+                "parser_backend": self.parser_name,
+                "artifact_prefix": settings.document_parse_artifact_prefix,
+            },
        )
--- a/backend/app/infrastructure/parser/aliyun_layout_normalizer.py
+++ b/backend/app/infrastructure/parser/aliyun_layout_normalizer.py
@@ -0,0 +1,336 @@
+"""Normalize Aliyun Docmind layouts into production document structures."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+# Keep layout normalization rules centralized so parser and demos stay aligned.
+MAX_CHARS = 600
+OVERLAP_CHARS = 80
+
+TOC_TITLES = {"目次", "目录"}
+TITLE_SUBTYPES = {"doc_title", "para_title"}
+TEXT_SUBTYPES = {"para", "none"}
+FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
+FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
+
+
+def normalize_text(text: str) -> str:
+    """Turn CR and invisible spaces into plain ones, collapse newline and space/tab runs, then trim."""
+    text = text.replace("\r", "\n")
+    text = text.replace("\u00a0", " ").replace("\u3000", " ")  # explicit escapes: NBSP and ideographic space
+    text = re.sub(r"\n+", "\n", text)
+    text = re.sub(r"[ \t]+", " ", text)
+    return text.strip()
+
+
+def get_page(layout: dict[str, Any]) -> int:
+    """Read the page number from "pageNum", else from "pageNumber", else fall back to 0."""
+    return layout["pageNum"] if "pageNum" in layout else layout.get("pageNumber", 0)
+
+
+def get_text(layout: dict[str, Any]) -> str:
+    """Return normalized "text", falling back to "markdownContent"; missing or null fields count as ""."""
+    text = normalize_text(layout.get("text") or "")  # `or ""` also covers an explicit null value
+    if text:
+        return text
+    return normalize_text(layout.get("markdownContent") or "")
+
+
+def is_title(layout: dict[str, Any]) -> bool:
+    """Report True when the layout type is "title" or its subType is one of TITLE_SUBTYPES."""
+    return True if layout.get("type") == "title" else layout.get("subType") in TITLE_SUBTYPES
+
+
+def is_text(layout: dict[str, Any]) -> bool:
+    """Report True for "text" layouts whose subType (default "none") is one of TEXT_SUBTYPES."""
+    return layout.get("subType", "none") in TEXT_SUBTYPES if layout.get("type") == "text" else False
+
+
+def is_figure(layout: dict[str, Any]) -> bool:
+    """Report True when the type is in FIGURE_TYPES or the subType is in FIGURE_SUBTYPES."""
+    return True if layout.get("type") in FIGURE_TYPES else layout.get("subType") in FIGURE_SUBTYPES
+
+
+def is_table(layout: dict[str, Any]) -> bool:
+    """Report True only when the layout carries a "type" equal to "table"."""
+    return "type" in layout and layout["type"] == "table"
+
+
+def is_toc_layout(layout: dict[str, Any]) -> bool:
+    """Report True for a TOC heading, or for a page-1 line shaped like "1.2 Title .... 7"."""
+    content = get_text(layout)
+    if content in TOC_TITLES:
+        return True
+    if get_page(layout) != 1:  # the dotted-leader pattern is only trusted on page 1
+        return False
+    return re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", content) is not None
+
+
+def extract_table_text(layout: dict[str, Any]) -> str:
+    """Join each cell's non-empty normalized texts with spaces and emit one line per cell."""
+    cell_lines: list[str] = []
+    for cell in layout.get("cells", []):
+        fragments = [
+            normalize_text(inner.get("text", ""))
+            for inner in cell.get("layouts", [])
+        ]
+        fragments = [fragment for fragment in fragments if fragment]
+        if fragments:  # cells with no usable text produce no line at all
+            cell_lines.append(" ".join(fragments))
+    return "\n".join(cell_lines).strip()
+
+
+def build_structure_nodes(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Collect one node per title layout, skipping blank titles and table-of-contents headings."""
+    nodes: list[dict[str, Any]] = []
+
+    for layout in layouts:
+        # Non-title layouts get an empty title so the single check below drops them
+        # together with blank titles and TOC headings.
+        title = get_text(layout) if is_title(layout) else ""
+        if not title or title in TOC_TITLES:
+            continue
+        node = {
+            "unique_id": layout.get("uniqueId"),
+            "page": get_page(layout),
+            "index": layout.get("index", 0),
+            "level": layout.get("level", 0),
+            "title": title,
+            "type": layout.get("type"),
+            "sub_type": layout.get("subType"),
+        }
+        nodes.append(node)
+    return nodes
+
+
+def update_section_path(
+    section_stack: list[dict[str, Any]],
+    layout: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Pop headings at the same or a deeper level, push the new title, and return the mutated stack."""
+    level = layout.get("level", 0)
+    entry = {
+        "level": level,
+        "title": get_text(layout),
+        "page": get_page(layout),
+        "unique_id": layout.get("uniqueId"),
+    }
+    while section_stack:
+        if section_stack[-1]["level"] < level:  # a shallower heading stays as an ancestor
+            break
+        section_stack.pop()
+    section_stack.append(entry)
+    return section_stack
+
+
+def section_path_titles(section_stack: list[dict[str, Any]]) -> list[str]:
+    """List the "title" of every heading on the stack, outermost first."""
+    return [heading["title"] for heading in section_stack]
+
+
+def flush_text_block(
+    blocks: list[dict[str, Any]],
+    semantic_blocks: list[dict[str, Any]],
+    block_id: int,
+) -> int:
+    """Append the buffered paragraphs to semantic_blocks as one "section_text" block; return the next id."""
+    # An empty buffer joins to "", so this one guard covers "nothing buffered" and "only blank text".
+    merged_text = "\n".join(item["text"] for item in blocks if item["text"]).strip()
+    if not merged_text:
+        return block_id
+
+    first = blocks[0]  # section context is taken from the first buffered paragraph
+    pages = [item["page"] for item in blocks]
+    source_ids = [item["unique_id"] for item in blocks if item.get("unique_id")]
+    semantic_blocks.append(
+        {
+            "semantic_id": f"semantic-{block_id}",
+            "block_type": "section_text",
+            "page_start": min(pages),
+            "page_end": max(pages),
+            "section_path": first["section_path"],
+            "section_level": first["section_level"],
+            "section_title": first["section_title"],
+            "source_ids": source_ids,
+            "text": merged_text,
+        }
+    )
+    return block_id + 1
+
+
+def build_semantic_blocks(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Group layouts, in input order, into semantic blocks: merged paragraph runs, tables, and figures."""
+    semantic_blocks: list[dict[str, Any]] = []
+    section_stack: list[dict[str, Any]] = []  # current heading chain, maintained by update_section_path
+    pending_text_blocks: list[dict[str, Any]] = []  # paragraphs buffered until the next title/table/figure
+    block_id = 1  # numeric suffix of the next "semantic-N" id
+    skip_toc_page = False  # set by a TOC-looking layout; cleared at the first layout not on page 1
+
+    for layout in layouts:
+        text = get_text(layout)
+        page = get_page(layout)
+
+        if is_toc_layout(layout):
+            skip_toc_page = True
+            continue
+        if skip_toc_page and page == 1:  # drop everything else on page 1 once a TOC marker was seen
+            continue
+        if skip_toc_page and page != 1:
+            skip_toc_page = False
+
+        if is_title(layout):
+            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
+            pending_text_blocks = []
+            section_stack = update_section_path(section_stack, layout)
+            continue
+
+        section_path = section_path_titles(section_stack)
+        section_title = section_path[-1] if section_path else "未分类"
+        section_level = len(section_path)  # depth of the heading stack, not the layout's own "level"
+
+        if is_table(layout):
+            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
+            pending_text_blocks = []
+            table_text = extract_table_text(layout)
+            if table_text:  # tables with no extractable cell text are dropped
+                semantic_blocks.append(
+                    {
+                        "semantic_id": f"semantic-{block_id}",
+                        "block_type": "table",
+                        "page_start": page,
+                        "page_end": page,
+                        "section_path": section_path,
+                        "section_level": section_level,
+                        "section_title": section_title,
+                        "source_ids": [layout.get("uniqueId")],
+                        "text": table_text,
+                    }
+                )
+                block_id += 1
+            continue
+
+        if is_figure(layout):
+            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
+            pending_text_blocks = []
+            if text:  # figures without any text are dropped
+                semantic_blocks.append(
+                    {
+                        "semantic_id": f"semantic-{block_id}",
+                        "block_type": "figure",
+                        "page_start": page,
+                        "page_end": page,
+                        "section_path": section_path,
+                        "section_level": section_level,
+                        "section_title": section_title,
+                        "source_ids": [layout.get("uniqueId")],
+                        "text": text,
+                    }
+                )
+                block_id += 1
+            continue
+
+        if is_text(layout) and text:  # any other layout kind is ignored
+            pending_text_blocks.append(
+                {
+                    "page": page,
+                    "text": text,
+                    "unique_id": layout.get("uniqueId"),
+                    "section_path": section_path,
+                    "section_level": section_level,
+                    "section_title": section_title,
+                }
+            )
+
+    flush_text_block(pending_text_blocks, semantic_blocks, block_id)  # emit paragraphs still buffered at the end
+    return semantic_blocks
+
+
+def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> list[str]:
+    """Split text into windows of at most max_chars that overlap by overlap_chars; always terminates."""
+    text = text.strip()
+    if len(text) <= max_chars:
+        return [text] if text else []
+
+    # Clamp the stride to >= 1 so overlap_chars >= max_chars (or max_chars <= 0) cannot loop forever.
+    window = max(1, max_chars)
+    stride = max(1, window - max(0, overlap_chars))
+    parts: list[str] = []
+    for start in range(0, len(text), stride):
+        parts.append(text[start:start + window].strip())
+        if start + window >= len(text):  # this window reached the end of the text
+            break
+    return [part for part in parts if part]
+
+
+def build_vector_chunks(
+    semantic_blocks: list[dict[str, Any]],
+    *,
+    doc_id: str,
+    doc_title: str,
+    max_chars: int,
+    overlap_chars: int,
+) -> list[dict[str, Any]]:
+    """Split every semantic block into overlapping pieces and emit one chunk dict per piece."""
+    vector_chunks: list[dict[str, Any]] = []
+
+    for block in semantic_blocks:
+        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
+        for piece_index, piece in enumerate(pieces, start=1):
+            path = block["section_path"]
+            if path:
+                header = f"标准：{doc_title}\n章节：{' > '.join(path)}\n\n"
+            else:
+                header = f"标准：{doc_title}\n\n"
+
+            # Chunk numbering is global and 1-based: it follows the number of chunks emitted so far.
+            chunk_index = len(vector_chunks) + 1
+            chunk = {
+                "doc_id": doc_id,
+                "doc_title": doc_title,
+                "chunk_id": f"chunk-{chunk_index}",
+                "chunk_index": chunk_index,
+                "semantic_id": block["semantic_id"],
+                "chunk_type": block["block_type"],
+                "piece_index": piece_index,
+                "page_start": block["page_start"],
+                "page_end": block["page_end"],
+                "section_path": path,
+                "section_level": block["section_level"],
+                "section_title": block["section_title"],
+                "source_ids": block["source_ids"],
+                "text": piece,
+                # The header keeps document and section context inside the embedded text.
+                "embedding_text": header + piece,
+            }
+            vector_chunks.append(chunk)
+
+    return vector_chunks
+
+
+def convert_layouts(
+    layouts: list[dict[str, Any]],
+    *,
+    doc_id: str,
+    doc_title: str,
+    max_chars: int,
+    overlap_chars: int,
+) -> dict[str, Any]:
+    """Run the three builders over the layouts and bundle their outputs with the document identity."""
+    # Filled in key order; the builders run in the same sequence as their keys appear.
+    result: dict[str, Any] = {
+        "doc_id": doc_id,
+        "doc_title": doc_title,
+    }
+    result["structure_nodes"] = build_structure_nodes(layouts)
+    result["semantic_blocks"] = build_semantic_blocks(layouts)
+    result["vector_chunks"] = build_vector_chunks(
+        result["semantic_blocks"],
+        doc_id=doc_id,
+        doc_title=doc_title,
+        max_chars=max_chars,
+        overlap_chars=overlap_chars,
+    )
+
+    return result
--- a/backend/app/infrastructure/parser/local_document_parser.py
+++ b/backend/app/infrastructure/parser/local_document_parser.py
@@ -4,6 +4,7 @@ from __future__ import annotations

 from pathlib import Path

+from app.config.settings import settings
 from app.domain.documents import DocumentParser, ParsedDocument
 from app.services.parser.docx_parser import parse_docx_to_markdown
 from app.services.parser.pdf_parser import parse_pdf_to_markdown
@@ -34,5 +35,10 @@ class LocalDocumentParser(DocumentParser):
            vector_chunks=[],
            parser_name=self.parser_name,
            raw_text=markdown_text,
-            metadata={"source": "local_parser", "file_suffix": suffix},
+            raw_layouts=[],
+            metadata={
+                "source": "local_parser",
+                "file_suffix": suffix,
+                "artifact_prefix": settings.document_parse_artifact_prefix,
+            },
        )
--- a/backend/data/documents.json
+++ b/backend/data/documents.json
@@ -21,5 +21,365 @@
      "generate_summary": true,
      "structure_nodes": 0
    }
+  },
+  "44121fbb": {
+    "doc_id": "44121fbb",
+    "doc_name": "大众汽车手册.pdf",
+    "file_name": "大众汽车手册.pdf",
+    "object_name": "44121fbb/大众汽车手册.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 766565,
+    "status": "failed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 0,
+    "parser_name": "",
+    "index_name": "",
+    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+    "created_at": "2026-05-18T09:53:47.996183+00:00",
+    "updated_at": "2026-05-18T09:53:50.825868+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+      "processing_stage": "failed"
+    }
+  },
+  "77debb4a": {
+    "doc_id": "77debb4a",
+    "doc_name": "大众汽车手册.pdf",
+    "file_name": "大众汽车手册.pdf",
+    "object_name": "77debb4a/大众汽车手册.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 766565,
+    "status": "failed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 0,
+    "parser_name": "",
+    "index_name": "",
+    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+    "created_at": "2026-05-18T10:05:46.104259+00:00",
+    "updated_at": "2026-05-18T10:05:48.704061+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+      "processing_stage": "failed"
+    }
+  },
+  "d12bdcc8": {
+    "doc_id": "d12bdcc8",
+    "doc_name": "TCT算法接口.pdf",
+    "file_name": "TCT算法接口.pdf",
+    "object_name": "d12bdcc8/TCT算法接口.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 165557,
+    "status": "failed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 0,
+    "parser_name": "",
+    "index_name": "",
+    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+    "created_at": "2026-05-18T10:07:22.199824+00:00",
+    "updated_at": "2026-05-18T10:07:24.653751+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+      "processing_stage": "failed"
+    }
+  },
+  "3c2e8c9c": {
+    "doc_id": "3c2e8c9c",
+    "doc_name": "20260415_Continental tire mobile app solution.pdf",
+    "file_name": "20260415_Continental tire mobile app solution.pdf",
+    "object_name": "3c2e8c9c/20260415_Continental tire mobile app solution.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 2178074,
+    "status": "failed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 0,
+    "parser_name": "",
+    "index_name": "",
+    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+    "created_at": "2026-05-18T10:09:58.338274+00:00",
+    "updated_at": "2026-05-18T10:10:01.295502+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+      "processing_stage": "failed"
+    }
+  },
+  "d22d21a0": {
+    "doc_id": "d22d21a0",
+    "doc_name": "20260415_Continental tire mobile app solution.pdf",
+    "file_name": "20260415_Continental tire mobile app solution.pdf",
+    "object_name": "d22d21a0/20260415_Continental tire mobile app solution.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 2178074,
+    "status": "failed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 0,
+    "parser_name": "",
+    "index_name": "",
+    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+    "created_at": "2026-05-18T10:12:20.078027+00:00",
+    "updated_at": "2026-05-18T10:12:22.999843+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+      "processing_stage": "failed"
+    }
+  },
+  "35f129d3": {
+    "doc_id": "35f129d3",
+    "doc_name": "大众汽车手册.pdf",
+    "file_name": "大众汽车手册.pdf",
+    "object_name": "35f129d3/大众汽车手册.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 766565,
+    "status": "failed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 0,
+    "parser_name": "",
+    "index_name": "",
+    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+    "created_at": "2026-05-18T10:13:24.706512+00:00",
+    "updated_at": "2026-05-18T10:13:27.180509+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
+      "processing_stage": "failed"
+    }
+  },
+  "efc21515": {
+    "doc_id": "efc21515",
+    "doc_name": "大众汽车手册.pdf",
+    "file_name": "大众汽车手册.pdf",
+    "object_name": "efc21515/大众汽车手册.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 766565,
+    "status": "failed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 0,
+    "parser_name": "aliyun_docmind",
+    "index_name": "",
+    "error_message": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
+    "created_at": "2026-05-18T13:47:32.076786+00:00",
+    "updated_at": "2026-05-18T13:47:57.998073+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "parser_backend": "aliyun_docmind",
+      "parse_task_id": "docmind-20260518-a6e84447457f43cb85f95225cfc6495b",
+      "layout_count": 87,
+      "structure_node_count": 20,
+      "semantic_block_count": 27,
+      "vector_chunk_count": 27,
+      "artifact_keys": {
+        "layouts": "artifacts/efc21515/layouts.json",
+        "structure_nodes": "artifacts/efc21515/structure_nodes.json",
+        "semantic_blocks": "artifacts/efc21515/semantic_blocks.json",
+        "vector_chunks": "artifacts/efc21515/vector_chunks.json"
+      },
+      "processing_stage": "failed",
+      "failure_reason": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400"
+    }
+  },
+  "0d4b08bc": {
+    "doc_id": "0d4b08bc",
+    "doc_name": "大众汽车手册.pdf",
+    "file_name": "大众汽车手册.pdf",
+    "object_name": "0d4b08bc/大众汽车手册.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 766565,
+    "status": "failed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 0,
+    "parser_name": "aliyun_docmind",
+    "index_name": "",
+    "error_message": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
+    "created_at": "2026-05-18T14:03:15.134344+00:00",
+    "updated_at": "2026-05-18T14:03:34.843448+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "parser_backend": "aliyun_docmind",
+      "parse_task_id": "docmind-20260518-78353d85daa24147b68d8fb71895179f",
+      "layout_count": 87,
+      "structure_node_count": 20,
+      "semantic_block_count": 27,
+      "vector_chunk_count": 27,
+      "artifact_keys": {
+        "layouts": "artifacts/0d4b08bc/layouts.json",
+        "structure_nodes": "artifacts/0d4b08bc/structure_nodes.json",
+        "semantic_blocks": "artifacts/0d4b08bc/semantic_blocks.json",
+        "vector_chunks": "artifacts/0d4b08bc/vector_chunks.json"
+      },
+      "processing_stage": "failed",
+      "failure_reason": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404"
+    }
+  },
+  "4302f314": {
+    "doc_id": "4302f314",
+    "doc_name": "大众汽车手册.pdf",
+    "file_name": "大众汽车手册.pdf",
+    "object_name": "4302f314/大众汽车手册.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 766565,
+    "status": "failed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 0,
+    "parser_name": "aliyun_docmind",
+    "index_name": "",
+    "error_message": "embedding 维度不匹配，期望 1536",
+    "created_at": "2026-05-18T14:11:29.943973+00:00",
+    "updated_at": "2026-05-18T14:11:48.554500+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "parser_backend": "aliyun_docmind",
+      "parse_task_id": "docmind-20260518-23935ee455ac4b26ac4201ac4781ee52",
+      "layout_count": 87,
+      "structure_node_count": 20,
+      "semantic_block_count": 27,
+      "vector_chunk_count": 27,
+      "artifact_keys": {
+        "layouts": "artifacts/4302f314/layouts.json",
+        "structure_nodes": "artifacts/4302f314/structure_nodes.json",
+        "semantic_blocks": "artifacts/4302f314/semantic_blocks.json",
+        "vector_chunks": "artifacts/4302f314/vector_chunks.json"
+      },
+      "processing_stage": "failed",
+      "failure_reason": "embedding 维度不匹配，期望 1536"
+    }
+  },
+  "765ed1ee": {
+    "doc_id": "765ed1ee",
+    "doc_name": "大众汽车手册.pdf",
+    "file_name": "大众汽车手册.pdf",
+    "object_name": "765ed1ee/大众汽车手册.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 766565,
+    "status": "failed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 0,
+    "parser_name": "aliyun_docmind",
+    "index_name": "",
+    "error_message": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>",
+    "created_at": "2026-05-18T14:18:28.875138+00:00",
+    "updated_at": "2026-05-18T14:18:57.389110+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "parser_backend": "aliyun_docmind",
+      "parse_task_id": "docmind-20260518-f116856bc29245baa2531b245078a701",
+      "layout_count": 87,
+      "structure_node_count": 20,
+      "semantic_block_count": 27,
+      "vector_chunk_count": 27,
+      "artifact_keys": {
+        "layouts": "artifacts/765ed1ee/layouts.json",
+        "structure_nodes": "artifacts/765ed1ee/structure_nodes.json",
+        "semantic_blocks": "artifacts/765ed1ee/semantic_blocks.json",
+        "vector_chunks": "artifacts/765ed1ee/vector_chunks.json"
+      },
+      "processing_stage": "failed",
+      "failure_reason": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>"
+    }
+  },
+  "05cabe09": {
+    "doc_id": "05cabe09",
+    "doc_name": "大众汽车手册.pdf",
+    "file_name": "大众汽车手册.pdf",
+    "object_name": "05cabe09/大众汽车手册.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 766565,
+    "status": "failed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 0,
+    "parser_name": "aliyun_docmind",
+    "index_name": "",
+    "error_message": "embedding 维度不匹配，期望 1536",
+    "created_at": "2026-05-18T14:24:32.156500+00:00",
+    "updated_at": "2026-05-18T14:24:50.114138+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "parser_backend": "aliyun_docmind",
+      "parse_task_id": "docmind-20260518-897d858983df48e28e9819e563d46208",
+      "layout_count": 87,
+      "structure_node_count": 20,
+      "semantic_block_count": 27,
+      "vector_chunk_count": 27,
+      "artifact_keys": {
+        "layouts": "artifacts/05cabe09/layouts.json",
+        "structure_nodes": "artifacts/05cabe09/structure_nodes.json",
+        "semantic_blocks": "artifacts/05cabe09/semantic_blocks.json",
+        "vector_chunks": "artifacts/05cabe09/vector_chunks.json"
+      },
+      "processing_stage": "failed",
+      "failure_reason": "embedding 维度不匹配，期望 1536"
+    }
+  },
+  "9acb2ba0": {
+    "doc_id": "9acb2ba0",
+    "doc_name": "大众汽车手册.pdf",
+    "file_name": "大众汽车手册.pdf",
+    "object_name": "9acb2ba0/大众汽车手册.pdf",
+    "content_type": "application/pdf",
+    "size_bytes": 766565,
+    "status": "indexed",
+    "regulation_type": "",
+    "version": "",
+    "summary": "",
+    "summary_latency_ms": 0,
+    "chunk_count": 27,
+    "parser_name": "aliyun_docmind",
+    "index_name": "regulations_dense_1024_v1",
+    "error_message": "",
+    "created_at": "2026-05-18T14:29:01.368719+00:00",
+    "updated_at": "2026-05-18T14:29:23.699068+00:00",
+    "metadata": {
+      "generate_summary": true,
+      "parser_backend": "aliyun_docmind",
+      "parse_task_id": "docmind-20260518-e5fd4a5419e74d569c562e389e6ae72c",
+      "layout_count": 87,
+      "structure_node_count": 20,
+      "semantic_block_count": 27,
+      "vector_chunk_count": 27,
+      "artifact_keys": {
+        "layouts": "artifacts/9acb2ba0/layouts.json",
+        "structure_nodes": "artifacts/9acb2ba0/structure_nodes.json",
+        "semantic_blocks": "artifacts/9acb2ba0/semantic_blocks.json",
+        "vector_chunks": "artifacts/9acb2ba0/vector_chunks.json"
+      },
+      "processing_stage": "indexed",
+      "index_collection": "regulations_dense_1024_v1"
+    }
  }
 }
--- a/docs/architecture/aliyun-ingest-implementation.md
+++ b/docs/architecture/aliyun-ingest-implementation.md
@@ -0,0 +1,71 @@
+# 阿里云解析主链路实现说明
+
+本文档描述当前仓库已经落地的文档 ingest 主链路实现，作为迁移设计到代码实现之间的收口说明。
+
+## 1. 当前默认链路
+
+- 上传入口保持为 `/api/v1/documents/upload`
+- 默认 `PARSER_BACKEND=aliyun`
+- 默认 `CHUNK_BACKEND=aliyun`
+- 默认 Milvus collection 为 `regulations_dense_1024_v1`
+- 解析产物落到 MinIO `artifacts/{doc_id}/`
+
+完整主链路如下：
+
+1. 原始文件上传到 MinIO
+2. `AliyunDocmindGateway` 提交阿里云异步解析任务
+3. 轮询任务状态直到成功或超时
+4. 分页拉取 `layouts`
+5. 转换为 `structure_nodes / semantic_blocks / vector_chunks`
+6. 三层结构 JSON 回写 MinIO
+7. 使用 `vector_chunks[*].embedding_text` 调 embedding API
+8. 写入 `regulations_dense_1024_v1`
+9. 文档状态更新为 `indexed`
+
+运行时转换逻辑位于 `backend/app/infrastructure/parser/aliyun_layout_normalizer.py`。
+旧的 `backend/app/aliyun_parser/` 示例目录已移除，不参与生产运行时。
+
+## 2. 解析产物持久化
+
+每个文档会额外写入以下对象：
+
+- `artifacts/{doc_id}/layouts.json`
+- `artifacts/{doc_id}/structure_nodes.json`
+- `artifacts/{doc_id}/semantic_blocks.json`
+- `artifacts/{doc_id}/vector_chunks.json`
+
+`documents.json` 仅保留对象 key、统计信息和处理阶段，不保存完整大 JSON。
+
+## 3. 失败策略
+
+- 当前 `PARSER_FAILURE_MODE=fail`
+- 阿里云解析失败不自动回退到本地 parser
+- 失败时保留原始文件与已写入的 artifacts，便于排障
+
+## 4. 运行参数
+
+关键环境变量如下：
+
+- `ALIBABA_ACCESS_KEY_ID`
+- `ALIBABA_ACCESS_KEY_SECRET`
+- `ALIBABA_ENDPOINT`
+- `ALIYUN_PARSE_POLL_INTERVAL_SECONDS`
+- `ALIYUN_PARSE_TIMEOUT_SECONDS`
+- `ALIYUN_PARSE_LAYOUT_STEP_SIZE`
+- `ALIYUN_LLM_ENHANCEMENT`
+- `ALIYUN_ENHANCEMENT_MODE`
+- `DOCUMENT_PARSE_ARTIFACT_PREFIX`
+- `PARSER_BACKEND`
+- `CHUNK_BACKEND`
+
+## 5. 运行态确认
+
+可通过 `/api/v1/status/config` 确认以下字段：
+
+- `parser_backend`
+- `chunk_backend`
+- `milvus_collection`
+- `artifact_prefix`
+- `parser_failure_mode`
+
+这几个值用于确认服务是否实际运行在迁移后的默认链路上。
--- a/docs/rfc/backend-api-parsing-embedding-migration-requirements.md
+++ b/docs/rfc/backend-api-parsing-embedding-migration-requirements.md
@@ -29,7 +29,7 @@
 已确认的目标需求如下：

 - 文档解析统一改为阿里云文档智能能力
- 当前阿里云接入基础来自 `backend/app/aliyun_parser/parse_pdf.py`
+- 当前阿里云接入基础已经迁移到 `backend/app/infrastructure/parser/aliyun_layout_normalizer.py`
 - 解析结果以 `structure_nodes`、`semantic_blocks`、`vector_chunks` 三层结构为基础
 - 分块以阿里云 `vector_chunks` 为准，不再走当前本地 `RegulationChunker`
 - embedding 改为 OpenAI 兼容 API 调用，模型使用 `text-embedding-v3`
@@ -80,7 +80,7 @@
 受影响的解析能力范围包括：

 - 当前本地 parser 目录
- `backend/app/aliyun_parser`
+- `backend/app/infrastructure/parser`

 迁移后阿里云文档智能能力将成为主解析来源，本地 PDF/DOCX/MinerU 解析链路需要重新界定保留、下线或回退策略，但具体模块组织方式不在本文件内定义。

@@ -133,7 +133,7 @@
 以下风险和约束在本期已经明确，需要在后续架构和实施阶段优先处理：

 - 旧 Milvus collection 与新 `1536` 维 schema 不兼容，需要新 collection 和重建索引
- `backend/app/aliyun_parser` 现有脚本含硬编码密钥，后续必须全部移到环境变量
+- 阿里云凭据必须继续只通过环境变量或凭据链注入，不能回到脚本内硬编码
 - RAG 下游当前对 `clause_number` 有依赖，迁移后需要优先适配 `section_title` 和 Aliyun chunk metadata
 - 如果阿里云返回字段与当前样例不同，需要在架构阶段补充 adapter 层

--- a/tests/test_embedding.py
+++ b/tests/test_embedding.py
@@ -1,4 +1,4 @@
-"""新架构下的文档编排与 embedding 边界测试。"""
+"""Document orchestration and embedding boundary tests for the migrated backend."""

 from __future__ import annotations

@@ -80,6 +80,7 @@ class FakeParser:
        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
+            raw_layouts=[{"uniqueId": "layout-1", "type": "text"}],
            structure_nodes=[{"title": "第一章"}],
            semantic_blocks=[{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}],
            vector_chunks=[
@@ -95,6 +96,7 @@ class FakeParser:
                }
            ],
            parser_name="fake_parser",
+            metadata={"task_id": "task-123", "artifact_prefix": "artifacts", "layout_count": 1},
        )


@@ -125,10 +127,10 @@ class FakeEmbeddingProvider:

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        self.calls.append(texts)
-        return [[0.1] * 1536 for _ in texts]
+        return [[0.1] * 1024 for _ in texts]

    def embed_query(self, text: str) -> list[float]:
-        return [0.2] * 1536
+        return [0.2] * 1024


 class FakeVectorIndex:
@@ -146,10 +148,10 @@ class FakeVectorIndex:
        return []

    def health(self) -> dict:
-        return {"collection_name": "regulations_dense_1536"}
+        return {"collection_name": "regulations_dense_1024_v1"}


-def test_document_command_service_uses_1536_dense_embedding_and_updates_status():
+def test_document_command_service_uses_1024_dense_embedding_and_updates_status():
    repository = FakeRepository()
    binary_store = FakeBinaryStore()
    embedding_provider = FakeEmbeddingProvider()
@@ -183,15 +185,16 @@ def test_document_command_service_uses_1536_dense_embedding_and_updates_status()
    assert stored.status == DocumentStatus.INDEXED
    assert stored.chunk_count == 1
    assert stored.parser_name == "fake_parser"
-    assert stored.index_name == "regulations_dense_1536"
+    assert stored.index_name == "regulations_dense_1024_v1"
+    assert stored.metadata["parse_task_id"] == "task-123"
+    assert stored.metadata["artifact_keys"]["vector_chunks"].endswith("/vector_chunks.json")

-
-def test_bootstrap_defaults_to_local_parser_and_chunk_builder():
+def test_bootstrap_defaults_to_aliyun_parser_and_chunk_builder():
    bootstrap.get_parser.cache_clear()
    bootstrap.get_chunk_builder.cache_clear()

    parser = bootstrap.get_parser()
    chunk_builder = bootstrap.get_chunk_builder()

-    assert parser.__class__.__name__ == "LocalDocumentParser"
-    assert chunk_builder.__class__.__name__ == "LocalRegulationChunkBuilder"
+    assert parser.__class__.__name__ == "AliyunDocumentParser"
+    assert chunk_builder.__class__.__name__ == "AliyunVectorChunkBuilder"
--- a/tests/verify_mvp.py
+++ b/tests/verify_mvp.py
@@ -64,11 +64,16 @@ def verify_migration_config() -> bool:

    try:
        assert settings.embedding_model == "text-embedding-v3"
-        assert settings.embedding_dim == 1536
-        assert settings.milvus_collection == "regulations_dense_1536"
+        assert settings.embedding_dim == 1024
+        assert settings.milvus_collection == "regulations_dense_1024_v1"
+        assert settings.parser_backend == "aliyun"
+        assert settings.chunk_backend == "aliyun"
        logger.info(f"embedding_model={settings.embedding_model}")
+        logger.info(f"embedding_base_url={settings.embedding_base_url}")
        logger.info(f"embedding_dim={settings.embedding_dim}")
        logger.info(f"milvus_collection={settings.milvus_collection}")
+        logger.info(f"parser_backend={settings.parser_backend}")
+        logger.info(f"chunk_backend={settings.chunk_backend}")
        logger.success("migration config ok")
        return True
    except Exception as exc: