Refactor document handling and update Milvus collection settings

- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated the architecture documentation to reflect the new Milvus collection name.
- Adjusted requirements by removing the `sqlalchemy` dependency.
- Modified test cases to align with the new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
2026-05-26 20:21:31 +08:00
parent fec22a3a2c
commit 30c7bda389
42 changed files with 7482 additions and 569 deletions
--- a/backend/aliyun_parser/.claude/settings.local.json
+++ b/backend/aliyun_parser/.claude/settings.local.json
@@ -0,0 +1,8 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(python3 *)",
+      "Bash(PGPASSWORD=postgresql123456 psql *)"
+    ]
+  }
+}
--- a/backend/aliyun_parser/parse_pdf.py
+++ b/backend/aliyun_parser/parse_pdf.py
@@ -0,0 +1,475 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+阿里云文档智能 API 解析 PDF，输出三层结构 chunks
+- structure_nodes: 目录树结构
+- semantic_blocks: 语义块（章节文本、表格、图片）
+- vector_chunks: 检索块（带 overlap 切分）
+"""
+
+import argparse
+import json
+import os
+import re
+import time
+from pathlib import Path
+from typing import Dict, List
+
+from alibabacloud_docmind_api20220711.client import Client as DocmindClient
+from alibabacloud_tea_openapi import models as open_api_models
+from alibabacloud_docmind_api20220711 import models as docmind_models
+from alibabacloud_tea_util import models as util_models
+
+# ===================== Alibaba Cloud configuration =====================
+# Credentials are read from the environment so that no secret lives in the
+# repository. Any key pair that was ever committed here must be treated as
+# leaked and rotated in the Alibaba Cloud console.
+ALIBABA_ACCESS_KEY_ID = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_ID", "")
+ALIBABA_ACCESS_KEY_SECRET = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_SECRET", "")
+ALIBABA_ENDPOINT = os.environ.get("ALIBABA_DOCMIND_ENDPOINT", "docmind-api.cn-hangzhou.aliyuncs.com")
+
+# ===================== Chunking parameters =====================
+# Defaults for the --max-chars / --overlap-chars command-line options.
+MAX_CHARS = 600
+OVERLAP_CHARS = 80
+
+# ===================== Layout type constants =====================
+# Texts that mark a table-of-contents heading.
+TOC_TITLES = {"目次", "目录"}
+# Layout "subType" values treated as titles.
+TITLE_SUBTYPES = {"doc_title", "para_title"}
+# Layout "subType" values treated as body text ("none" is the fallback used when subType is absent).
+TEXT_SUBTYPES = {"para", "none"}
+# Layout "type" / "subType" values treated as figures.
+FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
+FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
+
+
+# ===================== 阿里云 API 客户端 =====================
+def init_client() -> DocmindClient:
+    """Build a Document Mind client from the module-level credentials and endpoint."""
+    client_config = open_api_models.Config(
+        access_key_id=ALIBABA_ACCESS_KEY_ID,
+        access_key_secret=ALIBABA_ACCESS_KEY_SECRET,
+    )
+    # The endpoint is set on the config object after construction.
+    client_config.endpoint = ALIBABA_ENDPOINT
+    docmind_client = DocmindClient(client_config)
+    return docmind_client
+
+
+def submit_job(client: DocmindClient, file_path: str) -> str:
+    """Submit a document-parsing job for a local file and return its task id.
+
+    Args:
+        client: Document Mind client used to submit the job.
+        file_path: Path of the local file to upload.
+
+    Returns:
+        The job id read from ``response.body.data.id``.
+    """
+    source = Path(file_path)
+    runtime = util_models.RuntimeOptions()
+    # Open the file in a context manager so the handle is released as soon as
+    # the upload call returns (or raises) instead of leaking.
+    with source.open("rb") as stream:
+        request = docmind_models.SubmitDocParserJobAdvanceRequest(
+            file_url_object=stream,
+            file_name=source.name,
+            file_name_extension=source.suffix.lstrip("."),
+            llm_enhancement=True,
+            enhancement_mode="VLM",
+        )
+        response = client.submit_doc_parser_job_advance(request, runtime)
+    return response.body.data.id
+
+
+def query_status(client: DocmindClient, task_id: str) -> Dict:
+    """Fetch the current status record of a parsing task.
+
+    Returns the status payload converted with ``to_map()``, or ``None`` when
+    the response carries no data.
+    """
+    status_request = docmind_models.QueryDocParserStatusRequest(id=task_id)
+    payload = client.query_doc_parser_status(status_request).body.data
+    if not payload:
+        return None
+    return payload.to_map()
+
+
+def wait_for_completion(client: DocmindClient, task_id: str, poll_interval: int = 5, timeout=None) -> bool:
+    """Poll a parsing task until it reaches a terminal state.
+
+    Args:
+        client: Document Mind client used for the status queries.
+        task_id: Id of the task to wait for.
+        poll_interval: Seconds to sleep between two status queries.
+        timeout: Optional upper bound, in seconds, for the whole wait.
+            ``None`` (the default) waits indefinitely.
+
+    Returns:
+        True when the task reports success; False when the status query
+        returns no data, the task reports failure, or the timeout elapses.
+    """
+    deadline = None if timeout is None else time.monotonic() + timeout
+    while True:
+        status_data = query_status(client, task_id)
+        if not status_data:
+            return False
+        # "Status" may be present but null; guard before lower-casing.
+        status = (status_data.get("Status") or "").lower()
+        if status == "success":
+            return True
+        # NOTE(review): the service is believed to report failure as "Fail";
+        # both spellings are accepted so a failed job cannot poll forever.
+        # Confirm the exact value against the API reference.
+        if status in ("fail", "failed"):
+            print(f"任务失败: {status_data}")
+            return False
+        if deadline is not None and time.monotonic() >= deadline:
+            print(f"任务超时: {status_data}")
+            return False
+        print(f"任务状态: {status}, 等待中...")
+        time.sleep(poll_interval)
+
+
+def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_step_size: int = 50) -> Dict:
+    """Fetch one page of parsing results.
+
+    ``layout_num`` and ``layout_step_size`` are forwarded unchanged to the
+    request. Returns ``response.body.data``, or ``None`` when it is empty.
+    """
+    result_request = docmind_models.GetDocParserResultRequest(
+        id=task_id,
+        layout_step_size=layout_step_size,
+        layout_num=layout_num,
+    )
+    data = client.get_doc_parser_result(result_request).body.data
+    if data:
+        return data
+    return None
+
+
+def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: int = 50) -> List[Dict]:
+    """Page through the parsing result and return every layout in order.
+
+    Paging stops when a request returns no data, no layouts, or fewer
+    layouts than ``layout_step_size``.
+    """
+    collected = []
+    offset = 0
+    while True:
+        page = get_result(client, task_id, offset, layout_step_size)
+        batch = page.get("layouts", []) if page else []
+        if not batch:
+            break
+        collected.extend(batch)
+        # The next request starts after the layouts already received.
+        offset += len(batch)
+        if len(batch) < layout_step_size:
+            break
+    return collected
+
+
+# ===================== 文本处理 =====================
+def normalize_text(text: str) -> str:
+    """Normalize line breaks and whitespace in extracted text.
+
+    Carriage returns become newlines, no-break and ideographic spaces become
+    plain spaces, runs of newlines collapse to one newline, runs of spaces
+    and tabs collapse to one space, and the result is stripped.
+    """
+    text = text.replace("\r", "\n")
+    # Special spaces are written as escapes so the replacement cannot
+    # degrade into a no-op space-for-space swap when the file is re-encoded.
+    text = text.replace("\u00a0", " ").replace("\u3000", " ")
+    text = re.sub(r"\n+", "\n", text)
+    text = re.sub(r"[ \t]+", " ", text)
+    return text.strip()
+
+
+def get_page(layout: Dict) -> int:
+    """Return the layout's ``pageNum``, else its ``pageNumber``, else 0."""
+    if "pageNum" in layout:
+        return layout["pageNum"]
+    return layout.get("pageNumber", 0)
+
+
+def get_text(layout: Dict) -> str:
+    """Return the layout's normalized text, falling back to its markdown content.
+
+    The ``text`` field is used when it is non-empty after normalization;
+    otherwise the normalized ``markdownContent`` field is returned (which may
+    itself be empty).
+    """
+    # `or ""` also covers keys that are present but null, which would
+    # otherwise raise inside normalize_text.
+    text = normalize_text(layout.get("text") or "")
+    if text:
+        return text
+    return normalize_text(layout.get("markdownContent") or "")
+
+
+# ===================== 布局类型判断 =====================
+def is_title(layout: Dict) -> bool:
+    """Tell whether the layout is a title, by its type or its subType."""
+    if layout.get("type") == "title":
+        return True
+    return layout.get("subType") in TITLE_SUBTYPES
+
+
+def is_text(layout: Dict) -> bool:
+    """Tell whether the layout is body text: type "text" with a text subType."""
+    if layout.get("type") != "text":
+        return False
+    # A missing subType is treated as "none".
+    return layout.get("subType", "none") in TEXT_SUBTYPES
+
+
+def is_figure(layout: Dict) -> bool:
+    """Tell whether the layout belongs to a figure, by its type or its subType."""
+    if layout.get("type") in FIGURE_TYPES:
+        return True
+    return layout.get("subType") in FIGURE_SUBTYPES
+
+
+def is_table(layout: Dict) -> bool:
+    """Tell whether the layout's type is "table"."""
+    layout_type = layout.get("type")
+    return layout_type == "table"
+
+
+def is_toc_layout(layout: Dict) -> bool:
+    """Tell whether the layout is part of a table of contents.
+
+    A layout qualifies when its text is one of ``TOC_TITLES``, or when it is
+    on page 1 and looks like a numbered entry followed by leader dots and a
+    page number.
+    """
+    text = get_text(layout)
+    if text in TOC_TITLES:
+        return True
+    if get_page(layout) != 1:
+        return False
+    entry_match = re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", text)
+    return entry_match is not None
+
+
+def extract_table_text(layout: Dict) -> str:
+    """Flatten a table layout into text, one line per non-empty cell.
+
+    The normalized texts of a cell's inner layouts are joined with spaces;
+    cells are joined with newlines.
+    """
+    lines = []
+    for cell in layout.get("cells", []):
+        fragments = [normalize_text(inner.get("text", "")) for inner in cell.get("layouts", [])]
+        fragments = [fragment for fragment in fragments if fragment]
+        if fragments:
+            lines.append(" ".join(fragments))
+    return "\n".join(lines).strip()
+
+
+# ===================== 结构层：目录树 =====================
+def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
+    """Build the structure layer: one node per title layout, in input order.
+
+    Title layouts with empty text, or whose text is a table-of-contents
+    heading, are left out.
+    """
+    nodes = []
+    for layout in layouts:
+        title = get_text(layout) if is_title(layout) else ""
+        if not title or title in TOC_TITLES:
+            continue
+        node = {
+            "unique_id": layout.get("uniqueId"),
+            "page": get_page(layout),
+            "index": layout.get("index", 0),
+            "level": layout.get("level", 0),
+            "title": title,
+            "type": layout.get("type"),
+            "sub_type": layout.get("subType"),
+        }
+        nodes.append(node)
+    return nodes
+
+
+# ===================== 语义层：章节内容 =====================
+def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
+    """Push a title layout onto the section stack and return the same stack.
+
+    Entries whose level is greater than or equal to the new title's level are
+    popped first. The stack is modified in place.
+    """
+    level = layout.get("level", 0)
+    entry = {
+        "level": level,
+        "title": get_text(layout),
+        "page": get_page(layout),
+        "unique_id": layout.get("uniqueId"),
+    }
+    while section_stack:
+        if section_stack[-1]["level"] < level:
+            break
+        section_stack.pop()
+    section_stack.append(entry)
+    return section_stack
+
+
+def section_path_titles(section_stack: List[Dict]) -> List[str]:
+    """Return the titles of the stack entries, outermost first."""
+    titles = []
+    for entry in section_stack:
+        titles.append(entry["title"])
+    return titles
+
+
+def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id: int) -> int:
+    """Merge buffered text layouts into one ``section_text`` block.
+
+    Appends the merged block to ``semantic_blocks`` and returns the next free
+    block id. When there is nothing to merge, nothing is appended and
+    ``block_id`` is returned unchanged. Section fields come from the first
+    buffered item.
+    """
+    merged_text = "\n".join(item["text"] for item in blocks if item["text"]).strip()
+    if not merged_text:
+        return block_id
+
+    first = blocks[0]
+    pages = [item["page"] for item in blocks]
+    semantic_blocks.append(
+        {
+            "semantic_id": f"semantic-{block_id}",
+            "block_type": "section_text",
+            "page_start": min(pages),
+            "page_end": max(pages),
+            "section_path": first["section_path"],
+            "section_level": first["section_level"],
+            "section_title": first["section_title"],
+            "source_ids": [item["unique_id"] for item in blocks if item.get("unique_id")],
+            "text": merged_text,
+        }
+    )
+    return block_id + 1
+
+
+def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
+    """Build the semantic layer: section text, table and figure blocks.
+
+    Layouts are processed in order. Consecutive body-text layouts are
+    buffered and merged into one ``section_text`` block; a title, table or
+    figure flushes the buffer first. Tables and figures become blocks of
+    their own when they have text. Each block records the section path that
+    was active when its layout was seen. Title layouts only update the
+    section path and are not emitted as blocks.
+    """
+    semantic_blocks = []
+    section_stack = []
+    pending_text_blocks = []
+    block_id = 1
+    skip_toc_page = False
+
+    for layout in layouts:
+        text = get_text(layout)
+        page = get_page(layout)
+
+        # Table-of-contents handling: once a TOC layout is seen, every later
+        # layout on page 1 is skipped; the flag is cleared by the first
+        # layout whose page is not 1.
+        # NOTE(review): page 1 is hard-coded, so a TOC on another page only
+        # has its matching layouts dropped — confirm this is intended.
+        if is_toc_layout(layout):
+            skip_toc_page = True
+            continue
+        if skip_toc_page and page == 1:
+            continue
+        if skip_toc_page and page != 1:
+            skip_toc_page = False
+
+        # A title closes the text buffered so far and opens a new section.
+        if is_title(layout):
+            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
+            pending_text_blocks = []
+            section_stack = update_section_path(section_stack, layout)
+            continue
+
+        section_path = section_path_titles(section_stack)
+        section_title = section_path[-1] if section_path else "未分类"
+        # The level stored on blocks is the depth of the path, not the title's own "level" field.
+        section_level = len(section_path)
+
+        # Tables: flush buffered text first so block ids follow document order.
+        if is_table(layout):
+            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
+            pending_text_blocks = []
+            table_text = extract_table_text(layout)
+            # Tables with no extractable cell text are dropped.
+            if table_text:
+                semantic_blocks.append(
+                    {
+                        "semantic_id": f"semantic-{block_id}",
+                        "block_type": "table",
+                        "page_start": page,
+                        "page_end": page,
+                        "section_path": section_path,
+                        "section_level": section_level,
+                        "section_title": section_title,
+                        "source_ids": [layout.get("uniqueId")],
+                        "text": table_text,
+                    }
+                )
+                block_id += 1
+            continue
+
+        # Figures (picture, caption, note): same handling as tables, using the layout's own text.
+        if is_figure(layout):
+            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
+            pending_text_blocks = []
+            if text:
+                semantic_blocks.append(
+                    {
+                        "semantic_id": f"semantic-{block_id}",
+                        "block_type": "figure",
+                        "page_start": page,
+                        "page_end": page,
+                        "section_path": section_path,
+                        "section_level": section_level,
+                        "section_title": section_title,
+                        "source_ids": [layout.get("uniqueId")],
+                        "text": text,
+                    }
+                )
+                block_id += 1
+            continue
+
+        # Body text is buffered until the next title, table or figure.
+        # Layouts that match none of the branches are ignored.
+        if is_text(layout) and text:
+            pending_text_blocks.append(
+                {
+                    "page": page,
+                    "text": text,
+                    "unique_id": layout.get("uniqueId"),
+                    "section_path": section_path,
+                    "section_level": section_level,
+                    "section_title": section_title,
+                }
+            )
+
+    # Emit whatever text is still buffered at the end of the document.
+    flush_text_block(pending_text_blocks, semantic_blocks, block_id)
+    return semantic_blocks
+
+
+# ===================== 检索层：向量 chunks =====================
+def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> List[str]:
+    """Split text into pieces of at most ``max_chars`` characters.
+
+    Consecutive pieces share ``overlap_chars`` characters. Text that fits in
+    one piece is returned as a single-element list; empty text gives an empty
+    list. Pieces are stripped, and pieces that are empty after stripping are
+    dropped.
+
+    Raises:
+        ValueError: If ``max_chars`` is not positive, or ``overlap_chars`` is
+            negative or not smaller than ``max_chars``.
+    """
+    if max_chars <= 0:
+        raise ValueError("max_chars must be a positive integer")
+    # An overlap >= max_chars would never advance the window (endless loop);
+    # a negative overlap would silently skip text between pieces.
+    if not 0 <= overlap_chars < max_chars:
+        raise ValueError("overlap_chars must satisfy 0 <= overlap_chars < max_chars")
+
+    text = text.strip()
+    if len(text) <= max_chars:
+        return [text] if text else []
+
+    step = max_chars - overlap_chars
+    parts = []
+    start = 0
+    while start < len(text):
+        end = min(len(text), start + max_chars)
+        parts.append(text[start:end].strip())
+        if end >= len(text):
+            break
+        start += step
+    return [part for part in parts if part]
+
+
+def build_vector_chunks(
+    semantic_blocks: List[Dict],
+    doc_id: str,
+    doc_title: str,
+    max_chars: int,
+    overlap_chars: int,
+) -> List[Dict]:
+    """Build the retrieval layer by splitting each semantic block into chunks.
+
+    Chunks are numbered globally from 1 (``chunk_index``) and from 1 within
+    their block (``piece_index``). ``embedding_text`` is the piece prefixed
+    with a header naming the document and, when present, the section path.
+    """
+    vector_chunks = []
+
+    for block in semantic_blocks:
+        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
+        for piece_index, piece in enumerate(pieces, start=1):
+            header = (
+                f"标准：{doc_title}\n章节：{' > '.join(block['section_path'])}\n\n"
+                if block["section_path"]
+                else f"标准：{doc_title}\n\n"
+            )
+            # One chunk is appended per iteration, so the list length gives the running index.
+            chunk_index = len(vector_chunks) + 1
+            chunk = {
+                "doc_id": doc_id,
+                "doc_title": doc_title,
+                "chunk_id": f"chunk-{chunk_index}",
+                "chunk_index": chunk_index,
+                "semantic_id": block["semantic_id"],
+                "chunk_type": block["block_type"],
+                "piece_index": piece_index,
+                "page_start": block["page_start"],
+                "page_end": block["page_end"],
+                "section_path": block["section_path"],
+                "section_level": block["section_level"],
+                "section_title": block["section_title"],
+                "source_ids": block["source_ids"],
+                "text": piece,
+                "embedding_text": header + piece,
+            }
+            vector_chunks.append(chunk)
+
+    return vector_chunks
+
+
+# ===================== 主转换函数 =====================
+def convert_layouts(
+    layouts: List[Dict],
+    doc_id: str,
+    doc_title: str,
+    max_chars: int,
+    overlap_chars: int,
+) -> Dict:
+    """Convert raw layouts into the three-layer output document.
+
+    The result holds the document id and title together with the structure
+    nodes, the semantic blocks, and the vector chunks derived from those
+    blocks.
+    """
+    result = {"doc_id": doc_id, "doc_title": doc_title}
+    result["structure_nodes"] = build_structure_nodes(layouts)
+    result["semantic_blocks"] = build_semantic_blocks(layouts)
+    result["vector_chunks"] = build_vector_chunks(
+        result["semantic_blocks"],
+        doc_id=doc_id,
+        doc_title=doc_title,
+        max_chars=max_chars,
+        overlap_chars=overlap_chars,
+    )
+    return result
+
+
+# ===================== CLI 入口 =====================
+def main() -> None:
+    """Command-line entry point: parse a PDF and write the three-layer JSON.
+
+    Submits the PDF, waits for the job, downloads all layouts, optionally
+    dumps the raw layouts, converts them and writes the result to ``--out``.
+
+    Raises:
+        FileNotFoundError: If the PDF path does not exist.
+        SystemExit: With status 1 when the parsing job does not succeed.
+    """
+    parser = argparse.ArgumentParser(description="阿里云文档智能解析 PDF，输出三层结构 chunks")
+    parser.add_argument("pdf_path", help="PDF 文件路径")
+    parser.add_argument("--out", default="vector_chunks.json", help="输出 JSON 文件路径")
+    parser.add_argument("--layouts-out", dest="layouts_output", help="输出原始 layouts JSON")
+    # NOTE(review): the id/title defaults name one specific standard; pass
+    # both options explicitly when parsing any other document.
+    parser.add_argument("--doc-id", default="GB14747-2006", help="文档 ID")
+    parser.add_argument("--doc-title", default="GB 14747—2006 儿童三轮车安全要求", help="文档标题")
+    parser.add_argument("--max-chars", type=int, default=MAX_CHARS, help="单个检索 chunk 最大字符数")
+    parser.add_argument("--overlap-chars", type=int, default=OVERLAP_CHARS, help="相邻检索 chunk 重叠字符数")
+    parser.add_argument("--poll-interval", type=int, default=5, help="轮询间隔（秒）")
+    args = parser.parse_args()
+
+    pdf_path = Path(args.pdf_path).expanduser().resolve()
+    if not pdf_path.exists():
+        raise FileNotFoundError(f"PDF 文件不存在: {pdf_path}")
+
+    # 1. Submit the parsing job.
+    client = init_client()
+    print(f"提交任务: {pdf_path}")
+    task_id = submit_job(client, str(pdf_path))
+    print(f"任务 ID: {task_id}")
+
+    # 2. Wait for the job to finish.
+    print("等待任务完成...")
+    if not wait_for_completion(client, task_id, args.poll_interval):
+        print("任务失败，退出")
+        # Exit non-zero so shells and pipelines can detect the failure.
+        raise SystemExit(1)
+
+    # 3. Download all layouts.
+    print("获取解析结果...")
+    layouts = collect_all_results(client, task_id)
+    print(f"获取到 {len(layouts)} 个布局块")
+
+    # 4. Optionally dump the raw layouts.
+    if args.layouts_output:
+        layouts_path = Path(args.layouts_output).expanduser().resolve()
+        layouts_path.write_text(json.dumps(layouts, ensure_ascii=False, indent=2), encoding="utf-8")
+        print(f"原始 layouts 已写入: {layouts_path}")
+
+    # 5. Convert to the three-layer structure.
+    print("转换为三层结构...")
+    data = convert_layouts(
+        layouts,
+        doc_id=args.doc_id,
+        doc_title=args.doc_title,
+        max_chars=args.max_chars,
+        overlap_chars=args.overlap_chars,
+    )
+
+    # 6. Write the result.
+    output_path = Path(args.out).expanduser().resolve()
+    output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    print(f"结构层节点数: {len(data['structure_nodes'])}")
+    print(f"语义层块数: {len(data['semantic_blocks'])}")
+    print(f"检索层块数: {len(data['vector_chunks'])}")
+    print(f"输出文件: {output_path}")
+
+if __name__ == "__main__":
+    main()
--- a/backend/aliyun_parser/rebuild_milvus_collection.py
+++ b/backend/aliyun_parser/rebuild_milvus_collection.py
@@ -0,0 +1,115 @@
+"""Rebuild the migrated Milvus collection from saved vector chunks."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, utility
+
+
+DEFAULT_COLLECTION = "regulations_dense_1024_v2"
+DEFAULT_DIM = 1024
+
+
+def build_collection(name: str, dim: int) -> Collection:
+    """Drop any existing collection called *name* and recreate it empty.
+
+    The new collection has a VARCHAR primary key, scalar metadata fields and
+    one FLOAT_VECTOR field of dimension *dim*, indexed with IVF_FLAT / COSINE.
+    """
+    if utility.has_collection(name):
+        utility.drop_collection(name)
+
+    def varchar(field_name: str, max_length: int, **extra) -> FieldSchema:
+        # Shorthand for a VARCHAR field.
+        return FieldSchema(name=field_name, dtype=DataType.VARCHAR, max_length=max_length, **extra)
+
+    def int64(field_name: str) -> FieldSchema:
+        # Shorthand for an INT64 field.
+        return FieldSchema(name=field_name, dtype=DataType.INT64)
+
+    fields = [
+        varchar("id", 128, is_primary=True, auto_id=False),
+        varchar("doc_id", 64),
+        varchar("doc_title", 256),
+        varchar("chunk_id", 128),
+        int64("chunk_index"),
+        int64("piece_index"),
+        varchar("text", 65535),
+        varchar("embedding_text", 65535),
+        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
+        varchar("semantic_id", 128),
+        varchar("chunk_type", 64),
+        int64("page_start"),
+        int64("page_end"),
+        int64("section_level"),
+        varchar("source_ids", 4096),
+        varchar("section_path", 4096),
+        varchar("section_title", 512),
+        varchar("metadata_json", 65535),
+        int64("created_at"),
+    ]
+    schema = CollectionSchema(
+        fields=fields,
+        description="Dense-only regulations index",
+        enable_dynamic_field=False,
+    )
+    index_params = {
+        "metric_type": "COSINE",
+        "index_type": "IVF_FLAT",
+        "params": {"nlist": 128},
+    }
+    collection = Collection(name=name, schema=schema)
+    collection.create_index(field_name="embedding", index_params=index_params)
+    return collection
+
+
+def load_chunks(payload_path: Path) -> list[dict]:
+    """Read vector chunks from a JSON file.
+
+    A JSON object contributes its ``vector_chunks`` entry (an empty list when
+    the key is absent); any other JSON value is taken as the chunk list
+    itself. Raises ValueError when the outcome is not a list.
+    """
+    payload = json.loads(payload_path.read_text(encoding="utf-8"))
+    chunks = payload.get("vector_chunks", []) if isinstance(payload, dict) else payload
+    if isinstance(chunks, list):
+        return chunks
+    raise ValueError("vector chunk payload must be a list or a dict containing vector_chunks")
+
+
+def main() -> None:
+    """Rebuild the target collection from a vector chunk payload.
+
+    The payload is read and checked before Milvus is touched, because
+    ``build_collection`` drops the existing collection: a missing file or an
+    incomplete payload must not destroy the current index.
+
+    Raises:
+        ValueError: If the payload is malformed or a chunk lacks one of the
+            fields that the insert reads without a default.
+    """
+    parser = argparse.ArgumentParser(description="Rebuild the migrated Milvus collection.")
+    parser.add_argument("--host", default="127.0.0.1", help="Milvus host")
+    parser.add_argument("--port", default="19530", help="Milvus port")
+    parser.add_argument("--collection", default=DEFAULT_COLLECTION, help="Milvus collection name")
+    parser.add_argument("--dim", type=int, default=DEFAULT_DIM, help="Embedding dimension")
+    parser.add_argument("--payload", required=True, help="Path to vector_chunks.json or a compatible JSON file")
+    args = parser.parse_args()
+
+    chunks = load_chunks(Path(args.payload))
+    required = ("chunk_id", "doc_id", "doc_title", "embedding")
+    for position, chunk in enumerate(chunks):
+        absent = [key for key in required if key not in chunk]
+        if absent:
+            raise ValueError(f"chunk #{position} is missing required field(s): {', '.join(absent)}")
+
+    connections.connect("default", host=args.host, port=args.port)
+    collection = build_collection(args.collection, args.dim)
+    if not chunks:
+        print("No vector chunks found; collection was created but remains empty.")
+        return
+
+    # Column-ordered data: one list per schema field, in the order the fields
+    # are declared in build_collection. chunk_id fills both "id" and "chunk_id".
+    data = [
+        [chunk["chunk_id"] for chunk in chunks],
+        [chunk["doc_id"] for chunk in chunks],
+        [chunk["doc_title"] for chunk in chunks],
+        [chunk["chunk_id"] for chunk in chunks],
+        [int(chunk.get("chunk_index", 0) or 0) for chunk in chunks],
+        [int(chunk.get("piece_index", 0) or 0) for chunk in chunks],
+        [str(chunk.get("text", ""))[:65535] for chunk in chunks],
+        [str(chunk.get("embedding_text", chunk.get("text", "")))[:65535] for chunk in chunks],
+        [chunk["embedding"] for chunk in chunks],
+        [str(chunk.get("semantic_id", "")) for chunk in chunks],
+        [str(chunk.get("chunk_type", "")) for chunk in chunks],
+        [int(chunk.get("page_start", 0) or 0) for chunk in chunks],
+        [int(chunk.get("page_end", 0) or 0) for chunk in chunks],
+        [int(chunk.get("section_level", 0) or 0) for chunk in chunks],
+        [json.dumps(chunk.get("source_ids", []), ensure_ascii=False) for chunk in chunks],
+        [json.dumps(chunk.get("section_path", []), ensure_ascii=False) for chunk in chunks],
+        [str(chunk.get("section_title", "")) for chunk in chunks],
+        # NOTE(review): metadata_json serializes the whole chunk, embedding
+        # included; confirm it stays within the field's 65535 limit.
+        [json.dumps(chunk, ensure_ascii=False) for chunk in chunks],
+        [int(chunk.get("created_at", 0) or 0) for chunk in chunks],
+    ]
+    collection.insert(data)
+    collection.flush()
+    collection.load()
+    print(f"Rebuilt collection {args.collection} with {len(chunks)} chunks.")
+
+
+if __name__ == "__main__":
+    main()
--- a/backend/aliyun_parser/schema.sql
+++ b/backend/aliyun_parser/schema.sql
@@ -0,0 +1,122 @@
+-- 法规文档向量检索系统数据库表结构
+-- PostgreSQL
+
+-- ==================== Documents table ====================
+-- One row of metadata per source document.
+CREATE TABLE documents (
+    id SERIAL PRIMARY KEY,
+    -- NOTE(review): doc_id is 128 wide here but the Milvus schema in rebuild_milvus_collection.py caps it at 64 — confirm.
+    doc_id VARCHAR(128) UNIQUE NOT NULL,       -- Unique document identifier, e.g. "GB14747-2006"
+    title VARCHAR(512) NOT NULL,               -- Document title
+    doc_type VARCHAR(32),                      -- Document type: standard / regulation / specification
+    standard_number VARCHAR(64),               -- Standard number, e.g. "GB 14747-2006"
+    publish_date DATE,                         -- Publication date
+    implement_date DATE,                       -- Date the document comes into force
+    status VARCHAR(32),                        -- Status: in force / abolished / revised
+    source_url VARCHAR(512),                   -- Source URL
+    file_path VARCHAR(512),                    -- Local PDF file path
+    -- NOTE(review): INT limits the recorded size to about 2 GiB; BIGINT may be safer.
+    file_size INT,                             -- File size in bytes
+    upload_time TIMESTAMP DEFAULT NOW(),       -- Upload time
+    created_at TIMESTAMP DEFAULT NOW(),
+    updated_at TIMESTAMP DEFAULT NOW()
+);
+
+COMMENT ON TABLE documents IS '文档元数据表';
+COMMENT ON COLUMN documents.doc_id IS '文档唯一标识，用于关联 Milvus 和其他表';
+COMMENT ON COLUMN documents.standard_number IS '标准编号，如 GB 14747-2006';
+
+-- ==================== Section structure table ====================
+-- One row per section heading; parent_id links rows into a tree.
+CREATE TABLE sections (
+    id SERIAL PRIMARY KEY,
+    doc_id VARCHAR(128) NOT NULL,
+    unique_id VARCHAR(64) NOT NULL,            -- Unique identifier returned by Alibaba Cloud
+    level INT NOT NULL,                        -- Level: 1, 2, 3...
+    title VARCHAR(512) NOT NULL,               -- Section title
+    page INT,                                  -- Page the section is on
+    index INT,                                 -- Order within the page
+    parent_id INT,                             -- Parent section id (tree structure)
+    created_at TIMESTAMP DEFAULT NOW(),
+
+    CONSTRAINT fk_sections_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
+    CONSTRAINT fk_sections_parent_id FOREIGN KEY (parent_id) REFERENCES sections(id),
+    CONSTRAINT uq_sections_doc_unique UNIQUE (doc_id, unique_id)
+);
+
+COMMENT ON TABLE sections IS '章节结构表，用于目录导航';
+COMMENT ON COLUMN sections.parent_id IS '父章节 ID，构建树形结构';
+COMMENT ON COLUMN sections.level IS '层级深度，1 为最顶层';
+
+-- ==================== Semantic blocks table ====================
+-- One row per semantic block, holding the full text before it is split into chunks.
+CREATE TABLE semantic_blocks (
+    id SERIAL PRIMARY KEY,
+    doc_id VARCHAR(128) NOT NULL,
+    semantic_id VARCHAR(64) NOT NULL,          -- Semantic block identifier (unique per document)
+    block_type VARCHAR(32) NOT NULL,           -- Type: section_text / table / figure
+    page_start INT NOT NULL,                   -- First page
+    page_end INT NOT NULL,                     -- Last page
+    section_id INT,                            -- Owning section
+    section_title VARCHAR(512),                -- Section title (denormalized for easier queries)
+    section_level INT,                         -- Section level
+    source_ids JSONB,                          -- Original layout ids (JSON array)
+    text TEXT NOT NULL,                        -- Full content (not split)
+    created_at TIMESTAMP DEFAULT NOW(),
+
+    CONSTRAINT fk_semantic_blocks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
+    CONSTRAINT fk_semantic_blocks_section_id FOREIGN KEY (section_id) REFERENCES sections(id),
+    -- This unique pair is the target of the composite foreign key declared on vector_chunks.
+    CONSTRAINT uq_semantic_blocks_doc_semantic UNIQUE (doc_id, semantic_id)
+);
+
+COMMENT ON TABLE semantic_blocks IS '语义块表，用于邻域扩展，恢复完整内容';
+COMMENT ON COLUMN semantic_blocks.block_type IS '类型：section_text（正文）、table（表格）、figure（图示）';
+COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组';
+COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容，未被切分';
+
+-- ==================== Vector chunk metadata table ====================
+-- One row per retrieval chunk, linking the Milvus entry to its semantic block.
+CREATE TABLE vector_chunks (
+    id SERIAL PRIMARY KEY,
+    doc_id VARCHAR(128) NOT NULL,
+    -- NOTE(review): 64 wide here, while the Milvus "id"/"chunk_id" fields in rebuild_milvus_collection.py allow 128 — confirm.
+    chunk_id VARCHAR(64) NOT NULL,             -- Milvus primary key
+    semantic_id VARCHAR(64) NOT NULL,          -- Owning semantic block
+    chunk_index INT NOT NULL,                  -- Chunk sequence number (global)
+    piece_index INT,                           -- Sequence number within the semantic block
+    page_start INT,
+    page_end INT,
+    section_title VARCHAR(512),
+    text VARCHAR(2048),                        -- Chunk text (optional, shortened version for display)
+    source_ids JSONB,                          -- Original layout ids (JSON array)
+    created_at TIMESTAMP DEFAULT NOW(),
+
+    CONSTRAINT fk_vector_chunks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
+    CONSTRAINT fk_vector_chunks_semantic_id FOREIGN KEY (doc_id, semantic_id)
+        REFERENCES semantic_blocks(doc_id, semantic_id),
+    CONSTRAINT uq_vector_chunks_doc_chunk UNIQUE (doc_id, chunk_id)
+);
+
+COMMENT ON TABLE vector_chunks IS '向量块元数据表，用于快速关联查询';
+COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键';
+COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号，用于按序拼接';
+
+-- ==================== Indexes ====================
+-- Secondary indexes on the lookup columns of the three child tables.
+CREATE INDEX idx_sections_doc_id ON sections(doc_id);
+CREATE INDEX idx_sections_parent_id ON sections(parent_id);
+CREATE INDEX idx_sections_level ON sections(level);
+
+CREATE INDEX idx_semantic_blocks_doc_id ON semantic_blocks(doc_id);
+CREATE INDEX idx_semantic_blocks_section_id ON semantic_blocks(section_id);
+CREATE INDEX idx_semantic_blocks_block_type ON semantic_blocks(block_type);
+CREATE INDEX idx_semantic_blocks_semantic_id ON semantic_blocks(semantic_id);
+
+CREATE INDEX idx_vector_chunks_doc_id ON vector_chunks(doc_id);
+CREATE INDEX idx_vector_chunks_semantic_id ON vector_chunks(semantic_id);
+CREATE INDEX idx_vector_chunks_chunk_id ON vector_chunks(chunk_id);
+
+-- ==================== Trigger: keep updated_at current ====================
+-- Sets updated_at to the current time on the row being written.
+CREATE OR REPLACE FUNCTION update_updated_at()
+RETURNS TRIGGER AS $$
+BEGIN
+    NEW.updated_at = NOW();
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Runs the function before every row update on documents.
+CREATE TRIGGER tr_documents_updated_at
+    BEFORE UPDATE ON documents
+    FOR EACH ROW EXECUTE FUNCTION update_updated_at();
--- a/backend/aliyun_parser/upload_to_milvus.py
+++ b/backend/aliyun_parser/upload_to_milvus.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+将 vector_chunks.json 向量化并上传到 Milvus 和 PostgreSQL
+使用中转站的 OpenAI 兼容 API
+"""
+
+import argparse
+import json
+import os
+import time
+from pathlib import Path
+from typing import List, Dict
+
+import psycopg2
+from psycopg2.extras import execute_values
+from pymilvus import (
+    connections,
+    Collection,
+    FieldSchema,
+    CollectionSchema,
+    DataType,
+    utility,
+)
+from openai import OpenAI
+
+# ===================== 配置 =====================
+# 中转站配置
+RELAY_BASE_URL = "http://6.86.80.4:30080/v1"
+RELAY_API_KEY = "sk-5HeY7gfSIlyZMacfuXOf5cphpymsNqufEu1ou4U3avbULcyY"
+EMBEDDING_MODEL = "text-embedding-v3"  # 中转站支持的 embedding 模型
+
+# Milvus 配置
+MILVUS_HOST = "localhost"
+MILVUS_PORT = "19530"
+COLLECTION_NAME = "regulation_chunks"
+
+# PostgreSQL 配置
+PG_HOST = "6.86.80.10"
+PG_PORT = 5432
+PG_USER = "postgresql"
+PG_PASSWORD = "postgresql123456"
+PG_DATABASE = "postgres"
+
+
+# ===================== Embedding =====================
+def get_openai_client(api_key: str, base_url: str) -> OpenAI:
+    """创建 OpenAI 客户端连接到中转站"""
+    return OpenAI(api_key=api_key, base_url=base_url)
+
+
+def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10) -> List[List[float]]:
+    """批量获取文本向量"""
+    all_embeddings = []
+
+    for i in range(0, len(texts), batch_size):
+        batch = texts[i:i + batch_size]
+        print(f"Embedding batch {i // batch_size + 1}/{(len(texts) - 1) // batch_size + 1}...")
+
+        response = client.embeddings.create(
+            model=EMBEDDING_MODEL,
+            input=batch,
+        )
+
+        embeddings = [item.embedding for item in response.data]
+        all_embeddings.extend(embeddings)
+
+    return all_embeddings
+
+
+# ===================== Milvus =====================
+def init_milvus(host: str, port: str):
+    connections.connect("default", host=host, port=port)
+    print(f"已连接 Milvus: {host}:{port}")
+
+
+def create_collection(name: str, dim: int) -> Collection:
+    """创建或获取 collection"""
+    if utility.has_collection(name):
+        print(f"Collection '{name}' 已存在，删除重建")
+        utility.drop_collection(name)
+
+    fields = [
+        FieldSchema(name="chunk_id", dtype=DataType.VARCHAR, max_length=64, is_primary=True),
+        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=128),
+        FieldSchema(name="doc_title", dtype=DataType.VARCHAR, max_length=512),
+        FieldSchema(name="chunk_index", dtype=DataType.INT64),
+        FieldSchema(name="semantic_id", dtype=DataType.VARCHAR, max_length=64),
+        FieldSchema(name="chunk_type", dtype=DataType.VARCHAR, max_length=32),
+        FieldSchema(name="page_start", dtype=DataType.INT64),
+        FieldSchema(name="page_end", dtype=DataType.INT64),
+        FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
+        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2048),
+        FieldSchema(name="source_ids", dtype=DataType.VARCHAR, max_length=4096),  # JSON 字符串
+        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
+    ]
+
+    schema = CollectionSchema(fields, description="法规文档检索 chunks")
+    collection = Collection(name, schema)
+
+    # 创建向量索引（IVF_FLAT，适合中小规模）
+    index_params = {
+        "metric_type": "COSINE",
+        "index_type": "IVF_FLAT",
+        "params": {"nlist": 128},
+    }
+    collection.create_index("embedding", index_params)
+    print(f"Collection '{name}' 创建完成，索引已建立")
+
+    return collection
+
+
+def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[List[float]]):
+    """插入 chunks 到 Milvus"""
+    data = [
+        [c["chunk_id"] for c in chunks],
+        [c["doc_id"] for c in chunks],
+        [c["doc_title"] for c in chunks],
+        [c["chunk_index"] for c in chunks],
+        [c["semantic_id"] for c in chunks],
+        [c["chunk_type"] for c in chunks],
+        [c["page_start"] for c in chunks],
+        [c["page_end"] for c in chunks],
+        [c["section_title"] for c in chunks],
+        [c["text"] for c in chunks],
+        [json.dumps(c.get("source_ids", [])) for c in chunks],  # JSON 字符串
+        embeddings,
+    ]
+
+    collection.insert(data)
+    collection.flush()
+    print(f"已插入 {len(chunks)} 个 chunks")
+
+
+def load_collection(collection: Collection):
+    """加载 collection 到内存（搜索前必须）"""
+    collection.load()
+    print(f"Collection 已加载到内存")
+
+
+# ===================== PostgreSQL =====================
+def get_pg_connection(host: str, port: int, user: str, password: str, database: str):
+    """获取 PostgreSQL 连接"""
+    conn = psycopg2.connect(
+        host=host,
+        port=port,
+        user=user,
+        password=password,
+        database=database,
+    )
+    print(f"已连接 PostgreSQL: {host}:{port}/{database}")
+    return conn
+
+
+def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
+    """插入 chunks 和相关数据到 PostgreSQL"""
+    cursor = conn.cursor()
+
+    try:
+        # 1. 插入文档
+        cursor.execute("""
+            INSERT INTO documents (doc_id, title, standard_number, upload_time)
+            VALUES (%s, %s, %s, NOW())
+            ON CONFLICT (doc_id) DO UPDATE SET title = EXCLUDED.title, updated_at = NOW()
+        """, (doc_data["doc_id"], doc_data["doc_title"], doc_data.get("standard_number")))
+
+        # 2. 插入语义块
+        semantic_blocks = doc_data.get("semantic_blocks", [])
+        if semantic_blocks:
+            block_rows = [
+                (
+                    doc_data["doc_id"],
+                    block["semantic_id"],
+                    block["block_type"],
+                    block["page_start"],
+                    block["page_end"],
+                    block.get("section_title"),
+                    block.get("section_level"),
+                    json.dumps(block.get("source_ids", [])),
+                    block["text"],
+                )
+                for block in semantic_blocks
+            ]
+            execute_values(
+                cursor,
+                """
+                INSERT INTO semantic_blocks
+                (doc_id, semantic_id, block_type, page_start, page_end, section_title, section_level, source_ids, text)
+                VALUES %s
+                ON CONFLICT (doc_id, semantic_id) DO UPDATE SET text = EXCLUDED.text
+                """,
+                block_rows,
+            )
+            print(f"已插入 {len(semantic_blocks)} 个语义块")
+
+        # 3. 插入向量块元数据
+        chunk_rows = [
+            (
+                doc_data["doc_id"],
+                chunk["chunk_id"],
+                chunk["semantic_id"],
+                chunk["chunk_index"],
+                chunk.get("piece_index"),
+                chunk["page_start"],
+                chunk["page_end"],
+                chunk.get("section_title"),
+                chunk["text"],
+                json.dumps(chunk.get("source_ids", [])),
+            )
+            for chunk in chunks
+        ]
+        execute_values(
+            cursor,
+            """
+            INSERT INTO vector_chunks
+            (doc_id, chunk_id, semantic_id, chunk_index, piece_index, page_start, page_end, section_title, text, source_ids)
+            VALUES %s
+            ON CONFLICT (doc_id, chunk_id) DO UPDATE SET text = EXCLUDED.text
+            """,
+            chunk_rows,
+        )
+        print(f"已插入 {len(chunks)} 个向量块元数据")
+
+        conn.commit()
+        print("PostgreSQL 数据插入完成")
+
+    except Exception as e:
+        conn.rollback()
+        raise e
+    finally:
+        cursor.close()
+
+
+# ===================== 主流程 =====================
+def load_data(file_path: Path) -> Dict:
+    """加载 vector_chunks.json，返回完整数据"""
+    data = json.loads(file_path.read_text(encoding="utf-8"))
+    return data
+
+
+def upload_to_milvus_and_pg(
+    chunks_file: str,
+    api_key: str,
+    base_url: str,
+    milvus_host: str,
+    milvus_port: str,
+    collection_name: str,
+    batch_size: int,
+    pg_host: str,
+    pg_port: int,
+    pg_user: str,
+    pg_password: str,
+    pg_database: str,
+):
+    # 1. 加载完整数据
+    chunks_path = Path(chunks_file).expanduser().resolve()
+    if not chunks_path.exists():
+        raise FileNotFoundError(f"文件不存在: {chunks_path}")
+
+    data = load_data(chunks_path)
+    chunks = data.get("vector_chunks", [])
+    if not chunks:
+        raise ValueError("vector_chunks 为空")
+    print(f"加载 {len(chunks)} 个 chunks")
+
+    # 2. 初始化连接
+    client = get_openai_client(api_key, base_url)
+    init_milvus(milvus_host, milvus_port)
+    pg_conn = get_pg_connection(pg_host, pg_port, pg_user, pg_password, pg_database)
+
+    # 3. 获取 embeddings
+    texts = [c["embedding_text"] for c in chunks]
+    embeddings = get_embeddings_batch(client, texts, batch_size)
+    print(f"生成 {len(embeddings)} 个向量")
+
+    # 4. 获取 embedding 维度
+    embedding_dim = len(embeddings[0])
+    print(f"Embedding 维度: {embedding_dim}")
+
+    # 5. 创建 collection 并插入 Milvus
+    collection = create_collection(collection_name, embedding_dim)
+    insert_chunks(collection, chunks, embeddings)
+    load_collection(collection)
+
+    # 6. 插入 PostgreSQL
+    insert_chunks_to_pg(pg_conn, chunks, data)
+
+    # 7. 关闭连接
+    pg_conn.close()
+
+    print("上传完成！")
+
+
+# ===================== CLI =====================
+def main():
+    parser = argparse.ArgumentParser(description="将 vector_chunks 向量化并上传到 Milvus 和 PostgreSQL")
+    parser.add_argument("chunks_file", help="vector_chunks.json 文件路径")
+    parser.add_argument("--api-key", default=RELAY_API_KEY, help="中转站 API Key")
+    parser.add_argument("--base-url", default=RELAY_BASE_URL, help="中转站 Base URL")
+    parser.add_argument("--milvus-host", default=MILVUS_HOST, help="Milvus host")
+    parser.add_argument("--milvus-port", default=MILVUS_PORT, help="Milvus port")
+    parser.add_argument("--collection", default=COLLECTION_NAME, help="Milvus collection 名称")
+    parser.add_argument("--batch-size", type=int, default=10, help="Embedding 批量大小（中转站限制最大10）")
+    parser.add_argument("--pg-host", default=PG_HOST, help="PostgreSQL host")
+    parser.add_argument("--pg-port", type=int, default=PG_PORT, help="PostgreSQL port")
+    parser.add_argument("--pg-user", default=PG_USER, help="PostgreSQL user")
+    parser.add_argument("--pg-password", default=PG_PASSWORD, help="PostgreSQL password")
+    parser.add_argument("--pg-database", default=PG_DATABASE, help="PostgreSQL database")
+    args = parser.parse_args()
+
+    upload_to_milvus_and_pg(
+        chunks_file=args.chunks_file,
+        api_key=args.api_key,
+        base_url=args.base_url,
+        milvus_host=args.milvus_host,
+        milvus_port=args.milvus_port,
+        collection_name=args.collection,
+        batch_size=args.batch_size,
+        pg_host=args.pg_host,
+        pg_port=args.pg_port,
+        pg_user=args.pg_user,
+        pg_password=args.pg_password,
+        pg_database=args.pg_database,
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/backend/aliyun_parser/vector_chunks.json
+++ b/backend/aliyun_parser/vector_chunks.json
--- a/backend/aliyun_parser/嵌入和召回.md
+++ b/backend/aliyun_parser/嵌入和召回.md
@@ -0,0 +1,263 @@
+# 文档解析与向量检索说明
+
+## 相关文件
+
+- `aliyun_doc_parser.py`：调用阿里云文档智能解析 PDF，生成原始 `layouts.json`
+- `layouts_to_vector_chunks.py`：把 `layouts.json` 转成适合向量数据库入库的三层结构
+- `layouts.json`：阿里云返回的原始布局结果
+- `vector_chunks.json`：转换后的结构化输出
+
+## 一、`layouts.json` 的结构
+
+`layouts.json` 顶层是一个数组，每个元素代表一个布局块（layout）。常见字段如下：
+
+- `type`：主类型，例如 `title`、`text`、`table`、`figure`
+- `subType`：更细的语义类型，例如 `doc_title`、`para_title`、`para`、`picture`、`pic_title`、`pic_caption`
+- `text`：当前布局块的纯文本
+- `markdownContent`：带 markdown 标记的文本
+- `pageNum`：页码
+- `index`：页内顺序
+- `level`：标题层级
+- `uniqueId`：布局块唯一标识
+- `blocks`：更细粒度的文本与样式信息
+- `cells`：表格单元格，仅 `table` 类型存在
+
+这个结构不是简单的 OCR 文本流，而是已经带有版面理解和语义分类的结构化数据。
+
+## 二、推荐的三层转换结构
+
+### 1. 结构层 `structure_nodes`
+
+结构层用于恢复文档标题树，不直接作为最终向量检索单元。
+
+示例：
+
+- `1 范围`
+- `2 规范性引用文件`
+- `3 术语和定义`
+  - `3.1 儿童三轮车`
+  - `3.2 轮距`
+
+结构层主要用于给下游 chunk 绑定 `section_path`。
+
+### 2. 语义层 `semantic_blocks`
+
+语义层是按文档意义聚合后的内容块，主要分为三类：
+
+- `section_text`：同一章节下连续正文聚合而成
+- `table`：表格内容单独成块
+- `figure`：图、图名、图注等单独成块
+
+这一层比单个 layout 更适合做语义理解，也适合后续做上下文扩展。
+
+### 3. 检索层 `vector_chunks`
+
+检索层是最终写进向量数据库的 chunk。
+
+处理方式：
+
+- 对 `semantic_blocks` 中较短的块直接入库
+- 对较长的块按 `max_chars` 再切分
+- 相邻切片保留 `overlap_chars` 重叠
+- 每个 chunk 都带完整 metadata，便于后续过滤、重排和邻域扩展
+
+## 三、当前转换脚本做了什么
+
+`layouts_to_vector_chunks.py` 当前已经实现：
+
+1. 过滤目录页噪声（如 `目次`）
+2. 根据标题层级维护章节路径
+3. 将正文聚合成 `section_text`
+4. 将表格单独转成 `table`
+5. 将图相关内容单独转成 `figure`
+6. 对长文本继续切分为最终 `vector_chunks`
+7. 为每个检索 chunk 生成 `embedding_text`
+
+## 四、为什么不要直接按 layout 入库
+
+如果把 `layouts.json` 的每条 layout 直接做向量：
+
+- 颗粒度太碎
+- 标题和正文容易分离
+- 表格会丢失结构上下文
+- 图示信息无法完整表达
+- 检索命中结果噪声较大
+
+对于标准文档，最合适的单位通常不是“句子”，而是“条款语义块”。
+
+## 五、建议的入库字段
+
+建议向量数据库每条记录至少保存：
+
+- `embedding_text`：用于生成向量
+- `text`：原始 chunk 文本
+- `chunk_id`
+- `semantic_id`
+- `chunk_type`：`section_text` / `table` / `figure`
+- `section_path`
+- `section_title`
+- `section_level`
+- `page_start`
+- `page_end`
+- `doc_id`
+- `doc_title`
+- `source_ids`
+
+其中：
+
+- 向量化字段：`embedding_text`
+- 展示字段：`text`
+- 检索增强字段：其余 metadata
+
+## 六、推荐的检索方式
+
+不要只做最简单的 top-k 向量搜索，建议采用：
+
+**向量召回 + metadata 重排 + 邻域扩展**
+
+### 1. 向量召回
+
+使用 `vector_chunks[*].embedding_text` 做 embedding，并在向量数据库中检索 top 10 ~ 15 条。
+
+查询时可以对用户问题做轻微改写，例如：
+
+原问题：
+
+`儿童三轮车的定义是什么？`
+
+可改写为：
+
+`请检索 GB 14747—2006 儿童三轮车安全要求 中关于“儿童三轮车定义”的条款、术语、表格或图示说明。`
+
+这样更适合标准文档检索。
+
+### 2. metadata 重排
+
+向量召回后，根据 metadata 做轻量规则重排。
+
+常见规则：
+
+- `chunk_type == section_text`：对定义类、要求类问题优先级更高
+- `section_path` 命中查询关键词：例如查询“定义”时，`术语和定义` 章节优先
+- `chunk_type == table`：对“尺寸 / 参数 / 数值 / 对照 / 要求”类问题加权
+- `chunk_type == figure`：对“图 / 结构 / 状态 / 示意”类问题加权
+
+### 3. 邻域扩展
+
+检索命中的是最终切片，但回答往往需要更完整上下文。
+
+建议命中某个 `vector_chunk` 后：
+
+1. 优先回捞同一个 `semantic_id` 下的所有 chunk
+2. 如果还不够，再补充同 `section_path`、相邻页码或相邻 `chunk_index` 的内容
+
+这样可以恢复完整条款，而不是只给模型一小段碎片。
+
+## 七、不同问题的检索重点
+
+### 1. 定义类问题
+
+例如：
+
+- `儿童三轮车的定义是什么？`
+- `轮距是什么意思？`
+
+优先检索：
+
+- `section_text`
+- `section_path` 中包含 `术语和定义` 的内容
+
+### 2. 要求类问题
+
+例如：
+
+- `外露突出物有什么要求？`
+- `辅助推杆有哪些安全要求？`
+
+优先检索：
+
+- `section_text`
+- `table`
+
+### 3. 数值 / 尺寸 / 对照类问题
+
+例如：
+
+- `鞍座到脚蹬距离要求是什么？`
+- `哪些项目需要满足规定尺寸？`
+
+优先检索：
+
+- `table`
+- `section_text`
+
+### 4. 图示说明类问题
+
+例如：
+
+- `正常乘骑状态是什么意思？`
+- `图1表示什么？`
+
+优先检索：
+
+- `figure`
+- 同章节相邻 `section_text`
+
+## 八、推荐的最终检索流程
+
+建议采用以下固定流程：
+
+1. 用 `vector_chunks.embedding_text` 做 embedding 检索
+2. 取 top 10 ~ 15 条候选
+3. 按 `chunk_type + section_path` 做规则重排
+4. 以 `semantic_id` 为中心回捞完整语义块
+5. 选 3 ~ 5 组上下文提供给大模型回答
+
+## 九、给大模型的上下文组织方式
+
+最终不要直接把原始 JSON 扔给模型，建议整理成如下格式：
+
+```text
+[命中片段 1]
+章节：3 术语和定义 > 3.1 儿童三轮车
+页码：1-2
+类型：section_text
+内容：
+......
+
+[命中片段 2]
+章节：4 要求 > 4.3 外露突出物
+页码：5
+类型：section_text
+内容：
+......
+
+[命中片段 3]
+章节：5 试验方法
+页码：8
+类型：table
+内容：
+......
+```
+
+这种格式更利于模型稳定回答并引用出处。
+
+## 十、转换命令
+
+生成三层结构：
+
+```bash
+python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
+  --layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
+  --out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json
+```
+
+自定义切片大小：
+
+```bash
+python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
+  --layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
+  --out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json \
+  --max-chars 500 \
+  --overlap-chars 80
+```
--- a/backend/app/api/main.py
+++ b/backend/app/api/main.py
@@ -3,6 +3,7 @@
 from contextlib import asynccontextmanager

 from fastapi import FastAPI, Request
+from fastapi.encoders import jsonable_encoder
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from loguru import logger
@@ -12,6 +13,7 @@ from app.api.routes import api_router
 from app.config.logging import setup_logging
 from app.config.settings import settings
 from app.shared.bootstrap import cleanup_runtime_dependencies, preload_runtime_dependencies
+from app.shared.errors import VectorStoreSchemaError
 # Keep module behavior explicit so the backend flow stays easy to audit.


@@ -55,16 +57,33 @@ app.add_middleware(
 app.include_router(api_router, prefix="/api/v1")


+@app.exception_handler(VectorStoreSchemaError)
+async def vector_store_schema_exception_handler(request: Request, exc: VectorStoreSchemaError):
+    """Return a stable JSON response for vector store schema/runtime errors."""
+    logger.error(f"向量库 schema 异常: {exc}")
+    return JSONResponse(
+        status_code=500,
+        content=jsonable_encoder(
+            ErrorResponse(
+                error="VectorStoreSchemaError",
+                message=str(exc),
+            )
+        ),
+    )
+
+
@app.exception_handler(Exception)
 async def global_exception_handler(request: Request, exc: Exception):
    """Global exception handler."""
    logger.error(f"未处理的异常: {exc}")
    return JSONResponse(
        status_code=500,
-        content=ErrorResponse(
-            error="InternalServerError",
-            message=str(exc),
-        ).model_dump(),
+        content=jsonable_encoder(
+            ErrorResponse(
+                error="InternalServerError",
+                message=str(exc),
+            )
+        ),
    )


--- a/backend/app/api/routes/init.py
+++ b/backend/app/api/routes/init.py
@@ -7,6 +7,7 @@ from .knowledge import router as knowledge_router
 from .agent import router as agent_router
 from .status import router as status_router
 from .perception import router as perception_router
+from .rag import router as rag_router
 # Keep package boundaries explicit so backend imports stay predictable.


@@ -20,6 +21,7 @@ api_router.include_router(agent_router)
 api_router.include_router(compliance_router)
 api_router.include_router(status_router)
 api_router.include_router(perception_router)
+api_router.include_router(rag_router)

 __all__ = [
    "api_router",
@@ -29,4 +31,5 @@ __all__ = [
    "compliance_router",
    "status_router",
    "perception_router",
+    "rag_router",
 ]
--- a/backend/app/api/routes/knowledge.py
+++ b/backend/app/api/routes/knowledge.py
@@ -29,14 +29,19 @@ async def search_knowledge(request: SearchRequest):
        results=[
            SearchResultItem(
                id=index + 1,
-                content=item.content,
+                content=item.text,
                score=item.score,
                metadata={
                    "doc_id": item.doc_id,
-                    "doc_name": item.doc_name,
+                    "doc_title": item.doc_title,
                    "chunk_id": item.chunk_id,
+                    "chunk_type": item.chunk_type,
                    "section_title": item.section_title,
-                    "page_number": item.page_number,
+                    "page_start": item.page_start,
+                    "page_end": item.page_end,
+                    "section_level": item.section_level,
+                    "chunk_index": item.chunk_index,
+                    "piece_index": item.piece_index,
                    **item.metadata,
                },
            )
--- a/backend/app/api/routes/rag.py
+++ b/backend/app/api/routes/rag.py
@@ -50,8 +50,8 @@ async def rag_chat(request: RagChatRequest):
                    {
                        "id": str(s.get("chunk_id") or s.get("doc_id") or idx + 1),
                        "score": s.get("score", 0),
-                        "preview": s.get("content", "")[:200],
-                        "doc_name": s.get("doc_name", ""),
+                        "preview": s.get("text", s.get("content", ""))[:200],
+                        "doc_name": s.get("doc_title", s.get("doc_name", "")),
                        "clause": s.get("section_title", "法规片段"),
                        "doc_id": s.get("doc_id"),
                        "download_url": (
--- a/backend/app/application/documents/services.py
+++ b/backend/app/application/documents/services.py
@@ -508,7 +508,7 @@ class DocumentQueryService:
        """Return documents with real-time state from Milvus as the authoritative source.

        Algorithm:
-        1. Query Milvus for all doc metadata (doc_id, doc_name, chunk_count, …).
+        1. Query Milvus for all doc metadata (doc_id, doc_title, chunk_count, …).
        2. Load JSON/PG metadata records and index them by doc_id.
        3. Merge: Milvus-present docs get status=INDEXED and live chunk_count;
           metadata-only docs with status=INDEXED are demoted to FAILED.
@@ -536,8 +536,8 @@ class DocumentQueryService:
                doc.chunk_count = row["chunk_count"]
                doc.status = DocumentStatus.INDEXED
                # Backfill fields that may be missing from older JSON records.
-                if not doc.doc_name and row.get("doc_name"):
-                    doc.doc_name = row["doc_name"]
+                if not doc.doc_name and row.get("doc_title"):
+                    doc.doc_name = row["doc_title"]
                if not doc.regulation_type and row.get("regulation_type"):
                    doc.regulation_type = row["regulation_type"]
                if not doc.version and row.get("version"):
@@ -553,8 +553,8 @@ class DocumentQueryService:
            if doc_id not in meta_by_id:
                synthetic = Document(
                    doc_id=doc_id,
-                    doc_name=row.get("doc_name", doc_id),
-                    file_name=row.get("doc_name", doc_id),
+                    doc_name=row.get("doc_title", doc_id),
+                    file_name=row.get("doc_title", doc_id),
                    object_name="",
                    content_type="",
                    size_bytes=0,
--- a/backend/app/application/knowledge/services.py
+++ b/backend/app/application/knowledge/services.py
@@ -29,11 +29,16 @@ def _reciprocal_rank_fusion(
        RetrievedChunk(
            chunk_id=chunk_map[ck].chunk_id,
            doc_id=chunk_map[ck].doc_id,
-            doc_name=chunk_map[ck].doc_name,
-            content=chunk_map[ck].content,
+            doc_title=chunk_map[ck].doc_title,
+            text=chunk_map[ck].text,
            score=scores[ck],
+            chunk_type=chunk_map[ck].chunk_type,
            section_title=chunk_map[ck].section_title,
-            page_number=chunk_map[ck].page_number,
+            page_start=chunk_map[ck].page_start,
+            page_end=chunk_map[ck].page_end,
+            section_level=chunk_map[ck].section_level,
+            chunk_index=chunk_map[ck].chunk_index,
+            piece_index=chunk_map[ck].piece_index,
            metadata=chunk_map[ck].metadata,
        )
        for ck in sorted_keys
--- a/backend/app/application/perception/services.py
+++ b/backend/app/application/perception/services.py
@@ -71,9 +71,9 @@ class PerceptionService:
                    affected_docs.append(
                        {
                            "doc_id": chunk.doc_id,
-                            "doc_name": chunk.doc_name,
+                            "doc_title": chunk.doc_title,
                            "score": round(float(chunk.score), 4),
-                            "snippet": (chunk.content or "")[:180],
+                            "snippet": (chunk.text or "")[:180],
                            "clause": getattr(chunk, "section_title", "") or "",
                        }
                    )
@@ -84,7 +84,7 @@ class PerceptionService:

        # --- 2. Build context from retrieved chunks ---
        context_parts = [
-            f"[文档{i}: {c.doc_name}]\n{(c.content or '')[:400]}"
+            f"[文档{i}: {c.doc_title}]\n{(c.text or '')[:400]}"
            for i, c in enumerate(chunks[:5], 1)
        ]
        context = "\n\n".join(context_parts) if context_parts else "（知识库中暂无相关文档）"
--- a/backend/app/config/settings.py
+++ b/backend/app/config/settings.py
@@ -33,7 +33,7 @@ class Settings(BaseSettings):
    # Keep configuration setup explicit so runtime behavior is easy to reason about.
    milvus_host: str = Field(default="6.86.80.8", description="Milvus服务地址")
    milvus_port: int = Field(default=19530, description="Milvus服务端口")
-    milvus_collection: str = Field(default="regulations_dense_1024_v1", description="法规向量集合名称")
+    milvus_collection: str = Field(default="regulations_dense_1024_v2", description="法规向量集合名称")
    milvus_db_name: str = Field(default="default", description="Milvus数据库名称")

    # Keep configuration setup explicit so runtime behavior is easy to reason about.
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -27,7 +27,7 @@ class Settings(BaseSettings):
    # Milvus
    milvus_host: str = "6.86.80.8"
    milvus_port: int = 19530
-    milvus_collection: str = "regulations_dense_1024_v1"
+    milvus_collection: str = "regulations_dense_1024_v2"

    # LLM / embedding defaults aligned with the migrated backend path.
    llm_model: str = "qwen-max"
@@ -47,7 +47,7 @@ class Settings(BaseSettings):
    api_port: int = 8000

    # Legacy aliases retained for old utility modules.
-    regulations_collection: str = "regulations_dense_1024_v1"
+    regulations_collection: str = "regulations_dense_1024_v2"
    compliance_collection: str = "compliance_cache"

 # Preserve the legacy module API while keeping env resolution centralized at the repo root.
--- a/backend/app/domain/conversation/models.py
+++ b/backend/app/domain/conversation/models.py
@@ -8,18 +8,91 @@ from typing import Any



-@dataclass
+@dataclass(init=False)
 class AnswerSource:
-    """Represent answer source data."""
+    """Represent answer source data with legacy aliases."""
+
    doc_id: str
-    doc_name: str
+    doc_title: str
    chunk_id: str
+    chunk_type: str
    section_title: str
-    page_number: int
+    page_start: int
+    page_end: int
+    section_level: int
+    chunk_index: int
+    piece_index: int
    score: float
-    content: str
+    text: str
    metadata: dict[str, Any] = field(default_factory=dict)

+    def __init__(
+        self,
+        *,
+        doc_id: str,
+        doc_title: str | None = None,
+        chunk_id: str,
+        chunk_type: str = "",
+        section_title: str = "",
+        page_start: int = 0,
+        page_end: int = 0,
+        section_level: int = 0,
+        chunk_index: int = 0,
+        piece_index: int = 0,
+        score: float = 0.0,
+        text: str | None = None,
+        metadata: dict[str, Any] | None = None,
+        doc_name: str | None = None,
+        content: str | None = None,
+        page_number: int | None = None,
+        **_: Any,
+    ) -> None:
+        """Initialize the answer source while accepting legacy field names."""
+        self.doc_id = doc_id
+        self.doc_title = doc_title if doc_title is not None else (doc_name or "")
+        self.chunk_id = chunk_id
+        self.chunk_type = chunk_type
+        self.section_title = section_title
+        self.page_start = int(page_start or page_number or 0)
+        self.page_end = int(page_end or self.page_start)
+        self.section_level = int(section_level or 0)
+        self.chunk_index = int(chunk_index or 0)
+        self.piece_index = int(piece_index or 0)
+        self.score = float(score)
+        self.text = text if text is not None else (content or "")
+        self.metadata = dict(metadata or {})
+
+    @property
+    def doc_name(self) -> str:
+        """Return the legacy document name alias."""
+        return self.doc_title
+
+    @doc_name.setter
+    def doc_name(self, value: str) -> None:
+        """Update the legacy document name alias."""
+        self.doc_title = value
+
+    @property
+    def content(self) -> str:
+        """Return the legacy content alias."""
+        return self.text
+
+    @content.setter
+    def content(self, value: str) -> None:
+        """Update the legacy content alias."""
+        self.text = value
+
+    @property
+    def page_number(self) -> int:
+        """Return the legacy page number alias."""
+        return self.page_start
+
+    @page_number.setter
+    def page_number(self, value: int) -> None:
+        """Update the legacy page number alias."""
+        self.page_start = value
+        self.page_end = max(self.page_end, value)
+

@dataclass
 class ConversationMessage:
--- a/backend/app/domain/documents/models.py
+++ b/backend/app/domain/documents/models.py
@@ -60,23 +60,117 @@ class ParsedDocument:
    metadata: dict[str, Any] = field(default_factory=dict)


-@dataclass
+@dataclass(init=False)
 class Chunk:
-    """Represent the Chunk type."""
+    """Represent one retrieval chunk with backward-compatible aliases."""
+
    chunk_id: str
    doc_id: str
-    doc_name: str
-    content: str
+    doc_title: str
+    text: str
    embedding_text: str
+    chunk_type: str = ""
+    chunk_index: int = 0
+    piece_index: int = 0
+    page_start: int = 0
+    page_end: int = 0
    section_title: str = ""
    section_path: list[str] = field(default_factory=list)
-    page_number: int = 0
+    section_level: int = 0
+    source_ids: list[str] = field(default_factory=list)
    regulation_type: str = ""
    version: str = ""
    semantic_id: str = ""
-    block_type: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)

+    def __init__(
+        self,
+        *,
+        chunk_id: str,
+        doc_id: str,
+        doc_title: str | None = None,
+        text: str | None = None,
+        embedding_text: str = "",
+        chunk_type: str = "",
+        chunk_index: int = 0,
+        piece_index: int = 0,
+        page_start: int = 0,
+        page_end: int = 0,
+        section_title: str = "",
+        section_path: list[str] | None = None,
+        section_level: int = 0,
+        source_ids: list[str] | None = None,
+        regulation_type: str = "",
+        version: str = "",
+        semantic_id: str = "",
+        metadata: dict[str, Any] | None = None,
+        doc_name: str | None = None,
+        content: str | None = None,
+        page_number: int | None = None,
+        block_type: str | None = None,
+        **_: Any,
+    ) -> None:
+        """Initialize the chunk while accepting legacy field names."""
+        self.chunk_id = chunk_id
+        self.doc_id = doc_id
+        self.doc_title = doc_title if doc_title is not None else (doc_name or "")
+        self.text = text if text is not None else (content or "")
+        self.embedding_text = embedding_text or self.text
+        self.chunk_type = chunk_type or (block_type or "")
+        self.chunk_index = int(chunk_index or 0)
+        self.piece_index = int(piece_index or 0)
+        self.page_start = int(page_start or page_number or 0)
+        self.page_end = int(page_end or self.page_start)
+        self.section_title = section_title
+        self.section_path = list(section_path or [])
+        self.section_level = int(section_level or 0)
+        self.source_ids = list(source_ids or [])
+        self.regulation_type = regulation_type
+        self.version = version
+        self.semantic_id = semantic_id
+        self.metadata = dict(metadata or {})
+
+    @property
+    def doc_name(self) -> str:
+        """Return the legacy document name alias."""
+        return self.doc_title
+
+    @doc_name.setter
+    def doc_name(self, value: str) -> None:
+        """Update the legacy document name alias."""
+        self.doc_title = value
+
+    @property
+    def content(self) -> str:
+        """Return the legacy content alias."""
+        return self.text
+
+    @content.setter
+    def content(self, value: str) -> None:
+        """Update the legacy content alias."""
+        self.text = value
+
+    @property
+    def page_number(self) -> int:
+        """Return the legacy page number alias."""
+        return self.page_start
+
+    @page_number.setter
+    def page_number(self, value: int) -> None:
+        """Update the legacy page number alias."""
+        self.page_start = value
+        self.page_end = max(self.page_end, value)
+
+    @property
+    def block_type(self) -> str:
+        """Return the legacy block type alias."""
+        return self.chunk_type
+
+    @block_type.setter
+    def block_type(self, value: str) -> None:
+        """Update the legacy block type alias."""
+        self.chunk_type = value
+

@dataclass
 class DocumentProcessingRun:
--- a/backend/app/domain/retrieval/models.py
+++ b/backend/app/domain/retrieval/models.py
@@ -16,14 +16,88 @@ class RetrievalQuery:
    filters: str | None = None


-@dataclass
+@dataclass(init=False)
 class RetrievedChunk:
-    """Represent the Retrieved Chunk type."""
+    """Represent the retrieved chunk payload with legacy aliases."""
+
    chunk_id: str
    doc_id: str
-    doc_name: str
-    content: str
+    doc_title: str
+    text: str
    score: float
+    chunk_type: str = ""
    section_title: str = ""
-    page_number: int = 0
+    page_start: int = 0
+    page_end: int = 0
+    section_level: int = 0
+    chunk_index: int = 0
+    piece_index: int = 0
    metadata: dict[str, Any] = field(default_factory=dict)
+
+    def __init__(
+        self,
+        *,
+        chunk_id: str,
+        doc_id: str,
+        doc_title: str | None = None,
+        text: str | None = None,
+        score: float = 0.0,
+        chunk_type: str = "",
+        section_title: str = "",
+        page_start: int = 0,
+        page_end: int = 0,
+        section_level: int = 0,
+        chunk_index: int = 0,
+        piece_index: int = 0,
+        metadata: dict[str, Any] | None = None,
+        doc_name: str | None = None,
+        content: str | None = None,
+        page_number: int | None = None,
+        block_type: str | None = None,
+        **_: Any,
+    ) -> None:
+        """Initialize the chunk; new names win over legacy aliases, and unknown keyword arguments are discarded."""
+        self.chunk_id = chunk_id
+        self.doc_id = doc_id
+        self.doc_title = doc_title if doc_title is not None else (doc_name or "")  # legacy alias: doc_name
+        self.text = text if text is not None else (content or "")  # legacy alias: content
+        self.score = float(score)
+        self.chunk_type = chunk_type or (block_type or "")  # legacy alias: block_type
+        self.section_title = section_title
+        self.page_start = int(page_start or page_number or 0)  # legacy alias: page_number
+        self.page_end = int(page_end or self.page_start)  # a missing/zero page_end collapses to page_start
+        self.section_level = int(section_level or 0)
+        self.chunk_index = int(chunk_index or 0)
+        self.piece_index = int(piece_index or 0)
+        self.metadata = dict(metadata or {})  # shallow copy of the caller's mapping
+
+    @property
+    def doc_name(self) -> str:
+        """Return the legacy document name alias."""
+        return self.doc_title
+
+    @doc_name.setter
+    def doc_name(self, value: str) -> None:
+        """Update the legacy document name alias."""
+        self.doc_title = value
+
+    @property
+    def content(self) -> str:
+        """Return the legacy content alias."""
+        return self.text
+
+    @content.setter
+    def content(self, value: str) -> None:
+        """Update the legacy content alias."""
+        self.text = value
+
+    @property
+    def page_number(self) -> int:
+        """Return the legacy page number alias."""
+        return self.page_start
+
+    @page_number.setter
+    def page_number(self, value: int) -> None:
+        """Set page_start through the legacy alias; page_end is only ever raised, never lowered."""
+        self.page_start = value
+        self.page_end = max(self.page_end, value)  # keep page_end >= the new page_start
--- a/backend/app/infrastructure/llm/openai_compatible_answer_generator.py
+++ b/backend/app/infrastructure/llm/openai_compatible_answer_generator.py
@@ -45,10 +45,10 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
        context_tokens = 0
        for idx, chunk in enumerate(retrieved_chunks, start=1):
            block = (
-                f"[{idx}] 文档: {chunk.doc_name}\n"
+                f"[{idx}] 文档: {chunk.doc_title}\n"
                f"章节: {chunk.section_title or '未标注'}\n"
-                f"页码: {chunk.page_number}\n"
-                f"内容: {chunk.content}"
+                f"页码: {chunk.page_start}" + (f"-{chunk.page_end}" if chunk.page_end and chunk.page_end != chunk.page_start else "") + "\n"
+                f"内容: {chunk.text}"
            )
            block_tokens = self._estimate_tokens(block)
            if context_tokens + block_tokens > settings.rag_max_context_tokens:
@@ -73,10 +73,10 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
            return False
        estimated_total_tokens = sum(
            self._estimate_tokens(
-                f"[{idx}] 文档: {chunk.doc_name}\n"
+                f"[{idx}] 文档: {chunk.doc_title}\n"
                f"章节: {chunk.section_title or '未标注'}\n"
-                f"页码: {chunk.page_number}\n"
-                f"内容: {chunk.content}"
+                f"页码: {chunk.page_start}" + (f"-{chunk.page_end}" if chunk.page_end and chunk.page_end != chunk.page_start else "") + "\n"
+                f"内容: {chunk.text}"
            )
            for idx, chunk in enumerate(retrieved_chunks, start=1)
        )
@@ -87,12 +87,17 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
        return [
            AnswerSource(
                doc_id=chunk.doc_id,
-                doc_name=chunk.doc_name,
+                doc_title=chunk.doc_title,
                chunk_id=chunk.chunk_id,
+                chunk_type=chunk.chunk_type,
                section_title=chunk.section_title,
-                page_number=chunk.page_number,
+                page_start=chunk.page_start,
+                page_end=chunk.page_end,
+                section_level=chunk.section_level,
+                chunk_index=chunk.chunk_index,
+                piece_index=chunk.piece_index,
                score=chunk.score,
-                content=chunk.content,
+                text=chunk.text,
                metadata=chunk.metadata,
            )
            for chunk in chunks
--- a/backend/app/infrastructure/parser/local_chunk_builder.py
+++ b/backend/app/infrastructure/parser/local_chunk_builder.py
@@ -10,6 +10,7 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
    """Adapt the existing markdown chunker to the new chunk builder port."""

    def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
+        """Initialize the local markdown chunk builder."""
        self.chunker = RegulationChunker(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
@@ -22,6 +23,7 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
+        """Build migrated chunk objects from the legacy markdown chunker output."""
        markdown_text = parsed_document.raw_text.strip()
        if not markdown_text:
            return []
@@ -50,16 +52,18 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
                Chunk(
                    chunk_id=item.metadata.chunk_id,
                    doc_id=parsed_document.doc_id,
-                    doc_name=parsed_document.doc_name,
-                    content=item.content,
+                    doc_title=parsed_document.doc_name,
+                    text=item.content,
                    embedding_text=item.content,
+                    chunk_type="local_markdown_chunk",
                    section_title=item.metadata.section_title or item.metadata.section_number,
                    section_path=section_path,
-                    page_number=item.metadata.page_number,
+                    page_start=item.metadata.page_number,
+                    page_end=item.metadata.page_number,
+                    section_level=len(section_path),
                    regulation_type=regulation_type,
                    version=version,
                    semantic_id=item.metadata.clause_number,
-                    block_type="local_markdown_chunk",
                    metadata=metadata,
                )
            )
--- a/backend/app/infrastructure/parser/vector_chunk_builder.py
+++ b/backend/app/infrastructure/parser/vector_chunk_builder.py
@@ -19,29 +19,35 @@ class AliyunVectorChunkBuilder(ChunkBuilder):
        """Handle build for the Aliyun Vector Chunk Builder instance."""
        chunks: list[Chunk] = []
        for index, item in enumerate(parsed_document.vector_chunks):
-            content = item.get("content") or item.get("text") or ""
-            embedding_text = item.get("embedding_text") or content
+            text = item.get("text") or ""
+            embedding_text = item.get("embedding_text") or text
            if not embedding_text.strip():
                continue
            section_path = item.get("section_path") or []
            section_title = item.get("section_title") or (section_path[-1] if section_path else "")
-            page_number = item.get("page_start") or item.get("page") or 0
            chunk_id = item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"
-            metadata = {k: v for k, v in item.items() if k not in {"content", "embedding_text"}}
+            metadata = dict(item)
+            metadata["regulation_type"] = regulation_type
+            metadata["version"] = version
            chunks.append(
                Chunk(
                    chunk_id=str(chunk_id),
                    doc_id=parsed_document.doc_id,
-                    doc_name=parsed_document.doc_name,
-                    content=content,
+                    doc_title=str(item.get("doc_title") or parsed_document.doc_name),
+                    text=text,
                    embedding_text=embedding_text,
+                    chunk_type=str(item.get("chunk_type", item.get("block_type", ""))),
+                    chunk_index=int(item.get("chunk_index") or 0),
+                    piece_index=int(item.get("piece_index") or 0),
+                    page_start=int(item.get("page_start") or 0),
+                    page_end=int(item.get("page_end") or 0),
                    section_title=section_title,
                    section_path=section_path,
-                    page_number=int(page_number or 0),
+                    section_level=int(item.get("section_level") or len(section_path)),
+                    source_ids=[str(v) for v in item.get("source_ids", [])],
                    regulation_type=regulation_type,
                    version=version,
                    semantic_id=item.get("semantic_id", ""),
-                    block_type=item.get("block_type", ""),
                    metadata=metadata,
                )
            )
--- a/backend/app/infrastructure/vectorstore/bm25_retriever.py
+++ b/backend/app/infrastructure/vectorstore/bm25_retriever.py
@@ -56,7 +56,21 @@ class BM25Retriever:
        try:
            rows = self._vector_index.collection.query(
                expr='doc_id != ""',
-                output_fields=["id", "doc_id", "doc_name", "content", "section_title", "page_number"],
+                output_fields=[
+                    "id",
+                    "chunk_id",
+                    "doc_id",
+                    "doc_title",
+                    "text",
+                    "chunk_type",
+                    "section_title",
+                    "page_start",
+                    "page_end",
+                    "section_level",
+                    "chunk_index",
+                    "piece_index",
+                    "metadata_json",
+                ],
                limit=16384,
            )
        except Exception:
@@ -64,19 +78,33 @@ class BM25Retriever:
            return []
        return [
            RetrievedChunk(
-                chunk_id=str(row.get("id", "")),
+                chunk_id=str(row.get("chunk_id") or row.get("id", "")),
                doc_id=str(row.get("doc_id", "")),
-                doc_name=str(row.get("doc_name", "")),
-                content=str(row.get("content", "")),
+                doc_title=str(row.get("doc_title", "")),
+                text=str(row.get("text", "")),
                score=0.0,
+                chunk_type=str(row.get("chunk_type", "")),
                section_title=str(row.get("section_title", "")),
-                page_number=int(row.get("page_number") or 0),
-                metadata={},
+                page_start=int(row.get("page_start") or 0),
+                page_end=int(row.get("page_end") or 0),
+                section_level=int(row.get("section_level") or 0),
+                chunk_index=int(row.get("chunk_index") or 0),
+                piece_index=int(row.get("piece_index") or 0),
+                metadata=self._parse_metadata_json(row.get("metadata_json", "")),
            )
            for row in rows
-            if row.get("content")
+            if row.get("text")
        ]

+    def _parse_metadata_json(self, raw_metadata: str) -> dict:
+        """Decode metadata_json; return {} when it is empty, malformed, or not a JSON object."""
+        import json  # function-scope import: the module's import block is outside this hunk
+        try:
+            parsed = json.loads(raw_metadata) if raw_metadata else {}
+        except (TypeError, ValueError):  # json.JSONDecodeError subclasses ValueError
+            return {}
+        return parsed if isinstance(parsed, dict) else {}
+
    def _ensure_built(self) -> None:
        if self._index is not None:
            return
@@ -93,7 +121,7 @@ class BM25Retriever:
            self._chunks = []
            self._index = BM25Okapi([[]])
            return
-        tokenized = [_tokenize(c.content) for c in chunks]
+        tokenized = [_tokenize(c.text) for c in chunks]
        self._chunks = chunks
        self._index = BM25Okapi(tokenized)
        logger.info("BM25Retriever: index built with %d chunks", len(chunks))
@@ -127,20 +155,26 @@ class BM25Retriever:
        for score, chunk in ranked[: top_k * 2]:
            if score <= 0:
                break
-            # Apply simple regulation_type filter if provided
-            if filters and chunk.metadata.get("regulation_type"):
-                types = [t.strip() for t in filters.split(",")]
-                if chunk.metadata.get("regulation_type") not in types:
-                    continue
+            if filters:
+                normalized_filter = filters.replace("doc_name", "doc_title").strip()
+                if normalized_filter.startswith('doc_title == "'):
+                    expected_title = normalized_filter[len('doc_title == "'):-1]
+                    if chunk.doc_title != expected_title:
+                        continue
            results.append(
                RetrievedChunk(
                    chunk_id=chunk.chunk_id,
                    doc_id=chunk.doc_id,
-                    doc_name=chunk.doc_name,
-                    content=chunk.content,
+                    doc_title=chunk.doc_title,
+                    text=chunk.text,
                    score=score,
+                    chunk_type=chunk.chunk_type,
                    section_title=chunk.section_title,
-                    page_number=chunk.page_number,
+                    page_start=chunk.page_start,
+                    page_end=chunk.page_end,
+                    section_level=chunk.section_level,
+                    chunk_index=chunk.chunk_index,
+                    piece_index=chunk.piece_index,
                    metadata=chunk.metadata,
                )
            )
--- a/backend/app/infrastructure/vectorstore/cross_encoder_reranker.py
+++ b/backend/app/infrastructure/vectorstore/cross_encoder_reranker.py
@@ -31,7 +31,7 @@ class OpenAICompatibleReranker(Reranker):
        if not chunks:
            return []

-        texts = [chunk.content for chunk in chunks]
+        texts = [chunk.text for chunk in chunks]
        start = time.time()
        try:
            scores = self._call_reranker(query, texts)
--- a/backend/app/infrastructure/vectorstore/milvus_vector_index.py
+++ b/backend/app/infrastructure/vectorstore/milvus_vector_index.py
@@ -4,57 +4,150 @@ from __future__ import annotations

 import json
 import time
+from typing import Iterable

-from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, utility
+from loguru import logger
+from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, MilvusException, connections, utility

 from app.config.settings import settings
 from app.domain.documents import Chunk
 from app.domain.retrieval import RetrievedChunk, VectorIndex
+from app.shared.errors import VectorStoreSchemaError
 # Keep adapter behavior explicit so integration details remain easy to audit.


+_REQUIRED_SCHEMA_FIELDS = (
+    "doc_id",
+    "doc_title",
+    "chunk_id",
+    "text",
+    "embedding",
+    "section_title",
+    "metadata_json",
+)
+_SCHEMA_RECOVERY_TOKENS = (
+    "field doc_title not exist",
+    "field text not exist",
+    "field embedding not exist",
+    "collection not loaded",
+    "can't find collection",
+    "not found[collection",
+)
+
+

 class MilvusVectorIndex(VectorIndex):
    """Provide the Milvus Vector Index index implementation."""
+
    def __init__(self) -> None:
        """Initialize the Milvus Vector Index instance."""
        self.collection_name = settings.milvus_collection
        self.db_name = settings.milvus_db_name
+        self.host = settings.milvus_host
+        self.port = settings.milvus_port
+        # Use an adapter-specific alias so this index never reuses unrelated global Milvus state.
+        self.alias = f"vector-index::{self.host}:{self.port}/{self.db_name}/{self.collection_name}"
+        self._connect()
+        self.collection = self._bind_collection()
+
+    def _connect(self, *, refresh: bool = False) -> None:
+        """Establish the Milvus connection for this adapter."""
+        if refresh:
+            try:
+                connections.disconnect(self.alias)
+            except Exception:
+                # Best-effort disconnect keeps refresh idempotent when no alias is active yet.
+                pass
        connections.connect(
-            alias="default",
-            host=settings.milvus_host,
-            port=settings.milvus_port,
+            alias=self.alias,
+            host=self.host,
+            port=self.port,
            db_name=self.db_name,
        )
-        self.collection = self._ensure_collection()
+
+    def _schema_field_names(self, collection: Collection) -> list[str]:
+        """Return the field names exposed by the bound Milvus collection."""
+        return [field.name for field in collection.schema.fields]
+
+    def _raise_schema_error(self, *, message: str, actual_fields: Iterable[str]) -> None:
+        """Raise a typed schema error for the active collection."""
+        raise VectorStoreSchemaError(
+            message=message,
+            host=self.host,
+            db_name=self.db_name,
+            collection_name=self.collection_name,
+            expected_fields=list(_REQUIRED_SCHEMA_FIELDS),
+            actual_fields=list(actual_fields),
+        )
+
+    def _validate_schema(self, collection: Collection) -> None:
+        """Ensure the collection schema matches the dense-only adapter contract."""
+        actual_fields = self._schema_field_names(collection)
+        missing_fields = [field_name for field_name in _REQUIRED_SCHEMA_FIELDS if field_name not in actual_fields]
+        if missing_fields:
+            self._raise_schema_error(
+                message=f"Milvus collection schema mismatch; missing required fields: {missing_fields}",
+                actual_fields=actual_fields,
+            )
+
+    def _log_collection_binding(self, collection: Collection, *, event: str) -> None:
+        """Record the bound collection details for runtime diagnostics."""
+        try:
+            num_entities = collection.num_entities
+        except Exception:
+            num_entities = "unknown"
+        logger.info(
+            "Milvus binding {} alias={} host={} db={} collection={} fields={} num_entities={}",
+            event,
+            self.alias,
+            self.host,
+            self.db_name,
+            self.collection_name,
+            self._schema_field_names(collection),
+            num_entities,
+        )
+
+    def _bind_collection(self, *, force_refresh: bool = False) -> Collection:
+        """Bind and validate the configured Milvus collection."""
+        if force_refresh:
+            self._connect(refresh=True)
+        collection = self._ensure_collection()
+        self._validate_schema(collection)
+        self._log_collection_binding(collection, event="refreshed" if force_refresh else "initialized")
+        return collection

    def _ensure_collection(self) -> Collection:
        """Handle ensure collection for this module for the Milvus Vector Index instance."""
-        if utility.has_collection(self.collection_name):
-            collection = Collection(self.collection_name)
+        if utility.has_collection(self.collection_name, using=self.alias):
+            collection = Collection(self.collection_name, using=self.alias)
            collection.load()
            return collection
        schema = CollectionSchema(
            fields=[
                FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=128, is_primary=True, auto_id=False),
                FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=64),
-                FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=256),
-                FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
+                FieldSchema(name="doc_title", dtype=DataType.VARCHAR, max_length=256),
+                FieldSchema(name="chunk_id", dtype=DataType.VARCHAR, max_length=128),
+                FieldSchema(name="chunk_index", dtype=DataType.INT64),
+                FieldSchema(name="piece_index", dtype=DataType.INT64),
+                FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
+                FieldSchema(name="embedding_text", dtype=DataType.VARCHAR, max_length=65535),
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=settings.embedding_dim),
-                FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
-                FieldSchema(name="section_path", dtype=DataType.VARCHAR, max_length=4096),
-                FieldSchema(name="page_number", dtype=DataType.INT64),
-                FieldSchema(name="regulation_type", dtype=DataType.VARCHAR, max_length=128),
-                FieldSchema(name="version", dtype=DataType.VARCHAR, max_length=64),
                FieldSchema(name="semantic_id", dtype=DataType.VARCHAR, max_length=128),
-                FieldSchema(name="block_type", dtype=DataType.VARCHAR, max_length=64),
+                FieldSchema(name="chunk_type", dtype=DataType.VARCHAR, max_length=64),
+                FieldSchema(name="page_start", dtype=DataType.INT64),
+                FieldSchema(name="page_end", dtype=DataType.INT64),
+                FieldSchema(name="section_level", dtype=DataType.INT64),
+                FieldSchema(name="source_ids", dtype=DataType.VARCHAR, max_length=4096),
+                FieldSchema(name="section_path", dtype=DataType.VARCHAR, max_length=4096),
+                FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
                FieldSchema(name="metadata_json", dtype=DataType.VARCHAR, max_length=65535),
                FieldSchema(name="created_at", dtype=DataType.INT64),
            ],
            description="Dense-only regulations index",
            enable_dynamic_field=False,
        )
-        collection = Collection(name=self.collection_name, schema=schema)
+        collection = Collection(name=self.collection_name, schema=schema, using=self.alias)
        collection.create_index(
            field_name="embedding",
            index_params={
@@ -73,21 +166,34 @@ class MilvusVectorIndex(VectorIndex):
        data = []
        now = int(time.time())
        for chunk, vector in zip(chunks, vectors):
+            metadata = dict(chunk.metadata)  # shallow copy; metadata values take precedence over chunk attributes
+            doc_title = str(metadata.get("doc_title", chunk.doc_title))
+            text = str(metadata.get("text", chunk.text))
+            embedding_text = str(metadata.get("embedding_text", chunk.embedding_text))
+            page_start = int(metadata.get("page_start") or chunk.page_start or 0)  # keep the chunk's page when metadata has none
+            page_end = int(metadata.get("page_end") or chunk.page_end or 0)  # same fallback as page_start
+            section_path = metadata.get("section_path", chunk.section_path)
+            source_ids = metadata.get("source_ids") or chunk.source_ids  # fall back to the chunk's ids, not a bare []
            data.append(
                {
                    "id": chunk.chunk_id,
                    "doc_id": chunk.doc_id,
-                    "doc_name": chunk.doc_name,
-                    "content": chunk.content[:65535],
+                    "doc_title": doc_title[:256],
+                    "chunk_id": chunk.chunk_id[:128],
+                    "chunk_index": int(metadata.get("chunk_index", chunk.chunk_index) or 0),
+                    "piece_index": int(metadata.get("piece_index", chunk.piece_index) or 0),
+                    "text": text[:65535],
+                    "embedding_text": embedding_text[:65535],
                    "embedding": vector,
-                    "section_title": chunk.section_title[:512],
-                    "section_path": json.dumps(chunk.section_path, ensure_ascii=False)[:4096],
-                    "page_number": chunk.page_number,
-                    "regulation_type": chunk.regulation_type[:128],
-                    "version": chunk.version[:64],
-                    "semantic_id": chunk.semantic_id[:128],
-                    "block_type": chunk.block_type[:64],
-                    "metadata_json": json.dumps(chunk.metadata, ensure_ascii=False)[:65535],
+                    "semantic_id": str(metadata.get("semantic_id", chunk.semantic_id))[:128],
+                    "chunk_type": str(metadata.get("chunk_type", chunk.chunk_type))[:64],
+                    "page_start": page_start,
+                    "page_end": page_end,
+                    "section_level": int(metadata.get("section_level", chunk.section_level) or 0),
+                    "source_ids": json.dumps(source_ids, ensure_ascii=False)[:4096],
+                    "section_path": json.dumps(section_path, ensure_ascii=False)[:4096],
+                    "section_title": str(metadata.get("section_title", chunk.section_title))[:512],
+                    "metadata_json": json.dumps(metadata, ensure_ascii=False)[:65535],
                    "created_at": now,
                }
            )
@@ -107,47 +213,97 @@ class MilvusVectorIndex(VectorIndex):

        filters = filters.strip()

+        # Normalize legacy field names so callers can keep older filter payloads.
+        replacements = {
+            "doc_name": "doc_title",
+            "content": "text",
+            "page_number": "page_start",
+            "block_type": "chunk_type",
+        }
+        for legacy_name, new_name in replacements.items():
+            filters = filters.replace(legacy_name, new_name)
+
        # Check if already a Milvus expression (contains operators)
        if any(op in filters for op in ["==", "!=", "in", "not in", ">", "<", ">=", "<=", "and", "or"]):
            return filters

-        # Parse simple regulation_type filter
-        # Support: "GB" or "GB,UN-ECE" or "GB, UN-ECE"
-        types = [t.strip() for t in filters.split(",") if t.strip()]
+        # Parse simple document-title filter.
+        titles = [title.strip() for title in filters.split(",") if title.strip()]

-        if not types:
+        if not titles:
            return None

-        if len(types) == 1:
-            # Single value: regulation_type == "GB"
-            return f'regulation_type == "{types[0]}"'
-        else:
-            # Multiple values: regulation_type in ["GB", "UN-ECE"]
-            quoted_types = [f'"{t}"' for t in types]
-            return f'regulation_type in [{", ".join(quoted_types)}]'
+        if len(titles) == 1:
+            return f'doc_title == "{titles[0]}"'
+
+        quoted_titles = [f'"{title}"' for title in titles]
+        return f'doc_title in [{", ".join(quoted_titles)}]'
+
+    def _should_refresh_after_exception(self, exc: Exception) -> bool:
+        """Return True only for a MilvusException whose lowercased text contains a _SCHEMA_RECOVERY_TOKENS entry."""
+        if not isinstance(exc, MilvusException):
+            return False
+        normalized = str(exc).lower()  # the tokens are lowercase, so compare case-insensitively
+        return any(token in normalized for token in _SCHEMA_RECOVERY_TOKENS)
+
+    def _run_with_refresh(self, operation):
+        """Call operation(); on a recoverable Milvus error, rebind the collection and call it exactly once more."""
+        try:
+            return operation()
+        except VectorStoreSchemaError:
+            raise  # already a typed schema error; pass it through untouched
+        except Exception as exc:
+            if not self._should_refresh_after_exception(exc):
+                raise  # not a stale-connection/collection symptom; no retry
+            logger.warning(
+                "Milvus operation failed for alias={} collection={}; forcing reconnect and retry: {}",
+                self.alias,
+                self.collection_name,
+                exc,
+            )
+            self.collection = self._bind_collection(force_refresh=True)  # operation must read self.collection when called
+            try:
+                return operation()
+            except VectorStoreSchemaError:
+                raise
+            except Exception as retry_exc:
+                if isinstance(retry_exc, MilvusException):  # a second Milvus failure is reported as a schema error
+                    self._raise_schema_error(
+                        message=f"Milvus operation failed after refresh: {retry_exc}",
+                        actual_fields=self._schema_field_names(self.collection),
+                    )
+                raise  # non-Milvus errors propagate unchanged

    def search(self, query_vector: list[float], top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
        """Handle search for the Milvus Vector Index instance."""
        milvus_expr = self._parse_filters(filters)

-        results = self.collection.search(
-            data=[query_vector],
-            anns_field="embedding",
-            param={"metric_type": "COSINE", "params": {"nprobe": settings.milvus_nprobe}},
-            limit=top_k,
-            expr=milvus_expr,
-            output_fields=[
-                "doc_id",
-                "doc_name",
-                "content",
-                "section_title",
-                "page_number",
-                "regulation_type",
-                "version",
-                "semantic_id",
-                "block_type",
-                "metadata_json",
-            ],
+        results = self._run_with_refresh(
+            lambda: self.collection.search(
+                data=[query_vector],
+                anns_field="embedding",
+                param={"metric_type": "COSINE", "params": {"nprobe": settings.milvus_nprobe}},
+                limit=top_k,
+                expr=milvus_expr,
+                output_fields=[
+                    "doc_id",
+                    "doc_title",
+                    "chunk_id",
+                    "chunk_index",
+                    "piece_index",
+                    "text",
+                    "embedding_text",
+                    "section_title",
+                    "semantic_id",
+                    "chunk_type",
+                    "page_start",
+                    "page_end",
+                    "section_level",
+                    "source_ids",
+                    "section_path",
+                    "metadata_json",
+                ],
+            )
        )
        payload: list[RetrievedChunk] = []
        for hits in results:
@@ -161,13 +317,18 @@ class MilvusVectorIndex(VectorIndex):
                        metadata = {"raw_metadata": raw_metadata}
                payload.append(
                    RetrievedChunk(
-                        chunk_id=str(hit.id),
+                        chunk_id=str(hit.entity.get("chunk_id", hit.id)),
                        doc_id=hit.entity.get("doc_id", ""),
-                        doc_name=hit.entity.get("doc_name", ""),
-                        content=hit.entity.get("content", ""),
+                        doc_title=hit.entity.get("doc_title", ""),
+                        text=hit.entity.get("text", ""),
                        score=float(hit.score),
+                        chunk_type=hit.entity.get("chunk_type", ""),
                        section_title=hit.entity.get("section_title", ""),
-                        page_number=int(hit.entity.get("page_number", 0) or 0),
+                        page_start=int(hit.entity.get("page_start", 0) or 0),
+                        page_end=int(hit.entity.get("page_end", 0) or 0),
+                        section_level=int(hit.entity.get("section_level", 0) or 0),
+                        chunk_index=int(hit.entity.get("chunk_index", 0) or 0),
+                        piece_index=int(hit.entity.get("piece_index", 0) or 0),
                        metadata=metadata,
                    )
                )
@@ -176,7 +337,9 @@ class MilvusVectorIndex(VectorIndex):
    def count_by_document(self) -> dict[str, int]:
        """Return doc_id -> chunk count from Milvus."""
        try:
-            rows = self.collection.query(expr="doc_id != \"\"", output_fields=["doc_id"])
+            rows = self._run_with_refresh(
+                lambda: self.collection.query(expr="doc_id != \"\"", output_fields=["doc_id", "doc_title"])
+            )
        except Exception:
            return {}
        counts: dict[str, int] = {}
@@ -189,9 +352,11 @@ class MilvusVectorIndex(VectorIndex):
    def list_document_metadata(self) -> list[dict]:
        """Return one metadata row per document from Milvus (single query, no embeddings)."""
        try:
-            rows = self.collection.query(
-                expr="doc_id != \"\"",
-                output_fields=["doc_id", "doc_name", "regulation_type", "version"],
+            rows = self._run_with_refresh(
+                lambda: self.collection.query(
+                    expr="doc_id != \"\"",
+                    output_fields=["doc_id", "doc_title", "metadata_json"],
+                )
            )
        except Exception:
            return []
@@ -204,15 +369,26 @@ class MilvusVectorIndex(VectorIndex):
                continue
            counts[doc_id] = counts.get(doc_id, 0) + 1
            if doc_id not in seen:
+                metadata: dict[str, object] = {}
+                raw_metadata = row.get("metadata_json", "")
+                if raw_metadata:
+                    try:
+                        metadata = json.loads(raw_metadata)
+                    except json.JSONDecodeError:
+                        metadata = {}
                seen[doc_id] = {
                    "doc_id": doc_id,
-                    "doc_name": row.get("doc_name", ""),
-                    "regulation_type": row.get("regulation_type", ""),
-                    "version": row.get("version", ""),
+                    "doc_title": row.get("doc_title", ""),
+                    "regulation_type": str(metadata.get("regulation_type", "")),
+                    "version": str(metadata.get("version", "")),
                }

        return [
-            {**meta, "chunk_count": counts[meta["doc_id"]]}
+            {
+                **meta,
+                "doc_name": meta.get("doc_title", ""),
+                "chunk_count": counts[meta["doc_id"]],
+            }
            for meta in seen.values()
        ]

--- a/backend/app/services/document_processor.py
+++ b/backend/app/services/document_processor.py
@@ -67,14 +67,14 @@ class DocumentProcessor:
        return [
            {
                "id": item.chunk_id,
-                "content": item.content,
+                "content": item.text,
                "score": item.score,
                "metadata": {
                    "doc_id": item.doc_id,
-                    "doc_name": item.doc_name,
+                    "doc_name": item.doc_title,
                    "chunk_id": item.chunk_id,
                    "section_title": item.section_title,
-                    "page_number": item.page_number,
+                    "page_number": item.page_start,
                    **item.metadata,
                },
            }
--- a/backend/app/shared/errors.py
+++ b/backend/app/shared/errors.py
@@ -0,0 +1,30 @@
+"""Define shared backend exception types."""
+
+from __future__ import annotations
+
+
+class VectorStoreSchemaError(RuntimeError):
+    """Signal that the active vector store schema does not match backend expectations."""
+
+    def __init__(
+        self,
+        *,
+        message: str,
+        host: str,
+        db_name: str,
+        collection_name: str,
+        expected_fields: list[str],
+        actual_fields: list[str],
+    ) -> None:
+        """Store the keyword-only mismatch context and build one combined error message from it."""
+        self.host = host
+        self.db_name = db_name
+        self.collection_name = collection_name
+        self.expected_fields = expected_fields  # caller's list is kept by reference, not copied
+        self.actual_fields = actual_fields  # caller's list is kept by reference, not copied
+        # Keep the message self-contained so runtime logs show the full mismatch context.
+        details = (
+            f"{message} | host={host} db={db_name} collection={collection_name} "
+            f"expected_fields={expected_fields} actual_fields={actual_fields}"
+        )
+        super().__init__(details)  # only the combined text reaches args; `message` alone is not stored
--- a/backend/backend/data/documents.json
+++ b/backend/backend/data/documents.json
@@ -1 +0,0 @@
-{}
--- a/backend/data/document_processing.json
+++ b/backend/data/document_processing.json
@@ -0,0 +1,131 @@
+{
+  "runs": {
+    "8e722053-5009-40fe-a483-535b40ebbb16": {
+      "run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
+      "doc_id": "7cbdfe3c",
+      "trigger_type": "upload",
+      "run_status": "succeeded",
+      "parser_backend": "aliyun_docmind",
+      "chunk_backend": "aliyun",
+      "embedding_model": "text-embedding-v3",
+      "index_name": "regulations_dense_1024_v2",
+      "started_at": "2026-05-26T12:18:27.208692+00:00",
+      "stored_at": "2026-05-26T12:18:27.712855+00:00",
+      "parsed_at": "2026-05-26T12:18:42.989238+00:00",
+      "indexed_at": "2026-05-26T12:18:51.172418+00:00",
+      "finished_at": "2026-05-26T12:18:51.172418+00:00",
+      "layout_count": 48,
+      "structure_node_count": 6,
+      "semantic_block_count": 33,
+      "vector_chunk_count": 34,
+      "chunk_count": 34,
+      "failure_stage": "",
+      "error_message": "",
+      "metadata": {
+        "generate_summary": true,
+        "parse_task_id": "docmind-20260526-10b94713ccb348498b12180a5dcf32ff"
+      }
+    }
+  },
+  "status_events": {
+    "d0532baf-0d65-4130-b282-ec51f04132fd": {
+      "event_id": "d0532baf-0d65-4130-b282-ec51f04132fd",
+      "doc_id": "7cbdfe3c",
+      "run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
+      "from_status": "",
+      "to_status": "pending",
+      "stage": "document_created",
+      "message": "Document record created",
+      "metadata": {},
+      "occurred_at": "2026-05-26T12:18:27.235921+00:00"
+    },
+    "a5e32db5-25c3-4c73-a987-7311f0e72a31": {
+      "event_id": "a5e32db5-25c3-4c73-a987-7311f0e72a31",
+      "doc_id": "7cbdfe3c",
+      "run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
+      "from_status": "pending",
+      "to_status": "stored",
+      "stage": "store",
+      "message": "Source file stored",
+      "metadata": {},
+      "occurred_at": "2026-05-26T12:18:27.741462+00:00"
+    },
+    "18e04ce7-9d7a-4008-8600-e2590100bd85": {
+      "event_id": "18e04ce7-9d7a-4008-8600-e2590100bd85",
+      "doc_id": "7cbdfe3c",
+      "run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
+      "from_status": "stored",
+      "to_status": "parsed",
+      "stage": "parse",
+      "message": "Document parsed",
+      "metadata": {
+        "artifact_count": 4
+      },
+      "occurred_at": "2026-05-26T12:18:43.218026+00:00"
+    },
+    "d3b06025-5c91-4a42-9e5f-dce1c5312b96": {
+      "event_id": "d3b06025-5c91-4a42-9e5f-dce1c5312b96",
+      "doc_id": "7cbdfe3c",
+      "run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
+      "from_status": "parsed",
+      "to_status": "indexed",
+      "stage": "index",
+      "message": "Document indexed",
+      "metadata": {
+        "chunk_count": 34,
+        "index_name": "regulations_dense_1024_v2"
+      },
+      "occurred_at": "2026-05-26T12:18:51.195442+00:00"
+    }
+  },
+  "artifacts": {
+    "47fe2877-a8f5-4e1d-901b-80cd0194ba96": {
+      "artifact_id": "47fe2877-a8f5-4e1d-901b-80cd0194ba96",
+      "doc_id": "7cbdfe3c",
+      "run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
+      "artifact_type": "layouts",
+      "object_name": "artifacts/7cbdfe3c/layouts.json",
+      "content_type": "application/json",
+      "byte_size": 0,
+      "checksum": "",
+      "metadata": {},
+      "created_at": "2026-05-26T12:18:43.188467+00:00"
+    },
+    "44aa075b-86b2-48a7-9d14-a2453bd53863": {
+      "artifact_id": "44aa075b-86b2-48a7-9d14-a2453bd53863",
+      "doc_id": "7cbdfe3c",
+      "run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
+      "artifact_type": "structure_nodes",
+      "object_name": "artifacts/7cbdfe3c/structure_nodes.json",
+      "content_type": "application/json",
+      "byte_size": 0,
+      "checksum": "",
+      "metadata": {},
+      "created_at": "2026-05-26T12:18:43.188494+00:00"
+    },
+    "dedcc8fe-fa58-4de6-984d-f44332af5204": {
+      "artifact_id": "dedcc8fe-fa58-4de6-984d-f44332af5204",
+      "doc_id": "7cbdfe3c",
+      "run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
+      "artifact_type": "semantic_blocks",
+      "object_name": "artifacts/7cbdfe3c/semantic_blocks.json",
+      "content_type": "application/json",
+      "byte_size": 0,
+      "checksum": "",
+      "metadata": {},
+      "created_at": "2026-05-26T12:18:43.188511+00:00"
+    },
+    "9b0d8bda-e69e-4a4e-ae06-a308afe43109": {
+      "artifact_id": "9b0d8bda-e69e-4a4e-ae06-a308afe43109",
+      "doc_id": "7cbdfe3c",
+      "run_id": "8e722053-5009-40fe-a483-535b40ebbb16",
+      "artifact_type": "vector_chunks",
+      "object_name": "artifacts/7cbdfe3c/vector_chunks.json",
+      "content_type": "application/json",
+      "byte_size": 0,
+      "checksum": "",
+      "metadata": {},
+      "created_at": "2026-05-26T12:18:43.188526+00:00"
+    }
+  }
+}
--- a/backend/data/documents.json
+++ b/backend/data/documents.json
@@ -1,392 +1,9 @@
 {
-  "69280841": {
-    "doc_id": "69280841",
-    "doc_name": "TCT算法接口.pdf",
-    "file_name": "TCT算法接口.pdf",
-    "object_name": "69280841/TCT算法接口.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 165557,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "local_markdown_parser",
-    "index_name": "",
-    "error_message": "embedding 维度不匹配，期望 1536",
-    "created_at": "2026-05-18T07:12:16.668306+00:00",
-    "updated_at": "2026-05-18T07:12:19.417142+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "structure_nodes": 0
-    }
-  },
-  "44121fbb": {
-    "doc_id": "44121fbb",
-    "doc_name": "大众汽车手册.pdf",
-    "file_name": "大众汽车手册.pdf",
-    "object_name": "44121fbb/大众汽车手册.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 766565,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "",
-    "index_name": "",
-    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-    "created_at": "2026-05-18T09:53:47.996183+00:00",
-    "updated_at": "2026-05-18T09:53:50.825868+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5cb9d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-      "processing_stage": "failed"
-    }
-  },
-  "77debb4a": {
-    "doc_id": "77debb4a",
-    "doc_name": "大众汽车手册.pdf",
-    "file_name": "大众汽车手册.pdf",
-    "object_name": "77debb4a/大众汽车手册.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 766565,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "",
-    "index_name": "",
-    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-    "created_at": "2026-05-18T10:05:46.104259+00:00",
-    "updated_at": "2026-05-18T10:05:48.704061+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a6dd480>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-      "processing_stage": "failed"
-    }
-  },
-  "d12bdcc8": {
-    "doc_id": "d12bdcc8",
-    "doc_name": "TCT算法接口.pdf",
-    "file_name": "TCT算法接口.pdf",
-    "object_name": "d12bdcc8/TCT算法接口.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 165557,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "",
-    "index_name": "",
-    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-    "created_at": "2026-05-18T10:07:22.199824+00:00",
-    "updated_at": "2026-05-18T10:07:24.653751+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bf570>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-      "processing_stage": "failed"
-    }
-  },
-  "3c2e8c9c": {
-    "doc_id": "3c2e8c9c",
-    "doc_name": "20260415_Continental tire mobile app solution.pdf",
-    "file_name": "20260415_Continental tire mobile app solution.pdf",
-    "object_name": "3c2e8c9c/20260415_Continental tire mobile app solution.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 2178074,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "",
-    "index_name": "",
-    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-    "created_at": "2026-05-18T10:09:58.338274+00:00",
-    "updated_at": "2026-05-18T10:10:01.295502+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614a5bc8d0>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-      "processing_stage": "failed"
-    }
-  },
-  "d22d21a0": {
-    "doc_id": "d22d21a0",
-    "doc_name": "20260415_Continental tire mobile app solution.pdf",
-    "file_name": "20260415_Continental tire mobile app solution.pdf",
-    "object_name": "d22d21a0/20260415_Continental tire mobile app solution.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 2178074,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "",
-    "index_name": "",
-    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-    "created_at": "2026-05-18T10:12:20.078027+00:00",
-    "updated_at": "2026-05-18T10:12:22.999843+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b994160>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-      "processing_stage": "failed"
-    }
-  },
-  "35f129d3": {
-    "doc_id": "35f129d3",
-    "doc_name": "大众汽车手册.pdf",
-    "file_name": "大众汽车手册.pdf",
-    "object_name": "35f129d3/大众汽车手册.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 766565,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "",
-    "index_name": "",
-    "error_message": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-    "created_at": "2026-05-18T10:13:24.706512+00:00",
-    "updated_at": "2026-05-18T10:13:27.180509+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "failure_reason": "unable to load credentials from any of the providers in the chain: ['EnvironmentVariableCredentialsProvider: Environment variable accessKeyId cannot be empty', 'CLIProfileCredentialsProvider: unable to open credentials file: C:\\\\Users\\\\A200477427\\\\.aliyun/config.json', 'ProfileCredentialsProvider: failed to get credential from credentials file: $C:\\\\Users\\\\A200477427\\\\.alibabacloud/credentials.ini', \"EcsRamRoleCredentialsProvider: HTTPConnectionPool(host='100.100.100.200', port=80): Max retries exceeded with url: /latest/meta-data/ram/security-credentials/ (Caused by ConnectTimeoutError(<HTTPConnection(host='100.100.100.200', port=80) at 0x2614b995370>, 'Connection to 100.100.100.200 timed out. (connect timeout=1.0)'))\"]",
-      "processing_stage": "failed"
-    }
-  },
-  "efc21515": {
-    "doc_id": "efc21515",
-    "doc_name": "大众汽车手册.pdf",
-    "file_name": "大众汽车手册.pdf",
-    "object_name": "efc21515/大众汽车手册.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 766565,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "aliyun_docmind",
-    "index_name": "",
-    "error_message": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400",
-    "created_at": "2026-05-18T13:47:32.076786+00:00",
-    "updated_at": "2026-05-18T13:47:57.998073+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "parser_backend": "aliyun_docmind",
-      "parse_task_id": "docmind-20260518-a6e84447457f43cb85f95225cfc6495b",
-      "layout_count": 87,
-      "structure_node_count": 20,
-      "semantic_block_count": 27,
-      "vector_chunk_count": 27,
-      "artifact_keys": {
-        "layouts": "artifacts/efc21515/layouts.json",
-        "structure_nodes": "artifacts/efc21515/structure_nodes.json",
-        "semantic_blocks": "artifacts/efc21515/semantic_blocks.json",
-        "vector_chunks": "artifacts/efc21515/vector_chunks.json"
-      },
-      "processing_stage": "failed",
-      "failure_reason": "Client error '400 Bad Request' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400"
-    }
-  },
-  "0d4b08bc": {
-    "doc_id": "0d4b08bc",
-    "doc_name": "大众汽车手册.pdf",
-    "file_name": "大众汽车手册.pdf",
-    "object_name": "0d4b08bc/大众汽车手册.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 766565,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "aliyun_docmind",
-    "index_name": "",
-    "error_message": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404",
-    "created_at": "2026-05-18T14:03:15.134344+00:00",
-    "updated_at": "2026-05-18T14:03:34.843448+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "parser_backend": "aliyun_docmind",
-      "parse_task_id": "docmind-20260518-78353d85daa24147b68d8fb71895179f",
-      "layout_count": 87,
-      "structure_node_count": 20,
-      "semantic_block_count": 27,
-      "vector_chunk_count": 27,
-      "artifact_keys": {
-        "layouts": "artifacts/0d4b08bc/layouts.json",
-        "structure_nodes": "artifacts/0d4b08bc/structure_nodes.json",
-        "semantic_blocks": "artifacts/0d4b08bc/semantic_blocks.json",
-        "vector_chunks": "artifacts/0d4b08bc/vector_chunks.json"
-      },
-      "processing_stage": "failed",
-      "failure_reason": "Client error '404 Not Found' for url 'http://6.86.80.4:30080/v1/embeddings'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404"
-    }
-  },
-  "4302f314": {
-    "doc_id": "4302f314",
-    "doc_name": "大众汽车手册.pdf",
-    "file_name": "大众汽车手册.pdf",
-    "object_name": "4302f314/大众汽车手册.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 766565,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "aliyun_docmind",
-    "index_name": "",
-    "error_message": "embedding 维度不匹配，期望 1536",
-    "created_at": "2026-05-18T14:11:29.943973+00:00",
-    "updated_at": "2026-05-18T14:11:48.554500+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "parser_backend": "aliyun_docmind",
-      "parse_task_id": "docmind-20260518-23935ee455ac4b26ac4201ac4781ee52",
-      "layout_count": 87,
-      "structure_node_count": 20,
-      "semantic_block_count": 27,
-      "vector_chunk_count": 27,
-      "artifact_keys": {
-        "layouts": "artifacts/4302f314/layouts.json",
-        "structure_nodes": "artifacts/4302f314/structure_nodes.json",
-        "semantic_blocks": "artifacts/4302f314/semantic_blocks.json",
-        "vector_chunks": "artifacts/4302f314/vector_chunks.json"
-      },
-      "processing_stage": "failed",
-      "failure_reason": "embedding 维度不匹配，期望 1536"
-    }
-  },
-  "765ed1ee": {
-    "doc_id": "765ed1ee",
-    "doc_name": "大众汽车手册.pdf",
-    "file_name": "大众汽车手册.pdf",
-    "object_name": "765ed1ee/大众汽车手册.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 766565,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "aliyun_docmind",
-    "index_name": "",
-    "error_message": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>",
-    "created_at": "2026-05-18T14:18:28.875138+00:00",
-    "updated_at": "2026-05-18T14:18:57.389110+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "parser_backend": "aliyun_docmind",
-      "parse_task_id": "docmind-20260518-f116856bc29245baa2531b245078a701",
-      "layout_count": 87,
-      "structure_node_count": 20,
-      "semantic_block_count": 27,
-      "vector_chunk_count": 27,
-      "artifact_keys": {
-        "layouts": "artifacts/765ed1ee/layouts.json",
-        "structure_nodes": "artifacts/765ed1ee/structure_nodes.json",
-        "semantic_blocks": "artifacts/765ed1ee/semantic_blocks.json",
-        "vector_chunks": "artifacts/765ed1ee/vector_chunks.json"
-      },
-      "processing_stage": "failed",
-      "failure_reason": "<MilvusException: (code=1100, message=the dim (1024) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=1024])>"
-    }
-  },
-  "05cabe09": {
-    "doc_id": "05cabe09",
-    "doc_name": "大众汽车手册.pdf",
-    "file_name": "大众汽车手册.pdf",
-    "object_name": "05cabe09/大众汽车手册.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 766565,
-    "status": "failed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 0,
-    "parser_name": "aliyun_docmind",
-    "index_name": "",
-    "error_message": "embedding 维度不匹配，期望 1536",
-    "created_at": "2026-05-18T14:24:32.156500+00:00",
-    "updated_at": "2026-05-18T14:24:50.114138+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "parser_backend": "aliyun_docmind",
-      "parse_task_id": "docmind-20260518-897d858983df48e28e9819e563d46208",
-      "layout_count": 87,
-      "structure_node_count": 20,
-      "semantic_block_count": 27,
-      "vector_chunk_count": 27,
-      "artifact_keys": {
-        "layouts": "artifacts/05cabe09/layouts.json",
-        "structure_nodes": "artifacts/05cabe09/structure_nodes.json",
-        "semantic_blocks": "artifacts/05cabe09/semantic_blocks.json",
-        "vector_chunks": "artifacts/05cabe09/vector_chunks.json"
-      },
-      "processing_stage": "failed",
-      "failure_reason": "embedding 维度不匹配，期望 1536"
-    }
-  },
-  "9acb2ba0": {
-    "doc_id": "9acb2ba0",
-    "doc_name": "大众汽车手册.pdf",
-    "file_name": "大众汽车手册.pdf",
-    "object_name": "9acb2ba0/大众汽车手册.pdf",
-    "content_type": "application/pdf",
-    "size_bytes": 766565,
-    "status": "indexed",
-    "regulation_type": "",
-    "version": "",
-    "summary": "",
-    "summary_latency_ms": 0,
-    "chunk_count": 27,
-    "parser_name": "aliyun_docmind",
-    "index_name": "regulations_dense_1024_v1",
-    "error_message": "",
-    "created_at": "2026-05-18T14:29:01.368719+00:00",
-    "updated_at": "2026-05-18T14:29:23.699068+00:00",
-    "metadata": {
-      "generate_summary": true,
-      "parser_backend": "aliyun_docmind",
-      "parse_task_id": "docmind-20260518-e5fd4a5419e74d569c562e389e6ae72c",
-      "layout_count": 87,
-      "structure_node_count": 20,
-      "semantic_block_count": 27,
-      "vector_chunk_count": 27,
-      "artifact_keys": {
-        "layouts": "artifacts/9acb2ba0/layouts.json",
-        "structure_nodes": "artifacts/9acb2ba0/structure_nodes.json",
-        "semantic_blocks": "artifacts/9acb2ba0/semantic_blocks.json",
-        "vector_chunks": "artifacts/9acb2ba0/vector_chunks.json"
-      },
-      "processing_stage": "indexed",
-      "index_collection": "regulations_dense_1024_v1"
-    }
-  },
-  "52bd970f": {
-    "doc_id": "52bd970f",
+  "7cbdfe3c": {
+    "doc_id": "7cbdfe3c",
    "doc_name": "使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
    "file_name": "使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
-    "object_name": "52bd970f/使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
+    "object_name": "7cbdfe3c/使用RSA Token连接CheckPoint VPN及PIN码设置_220.181.114.93 or 10.25.134.3.docx",
    "content_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "size_bytes": 1199920,
    "status": "indexed",
@@ -396,26 +13,26 @@
    "summary_latency_ms": 0,
    "chunk_count": 34,
    "parser_name": "aliyun_docmind",
-    "index_name": "regulations_dense_1024_v1",
+    "index_name": "regulations_dense_1024_v2",
    "error_message": "",
-    "created_at": "2026-05-25T07:45:12.777459+00:00",
-    "updated_at": "2026-05-25T07:45:37.314290+00:00",
+    "created_at": "2026-05-26T12:18:27.206125+00:00",
+    "updated_at": "2026-05-26T12:18:51.171308+00:00",
    "metadata": {
      "generate_summary": true,
      "parser_backend": "aliyun_docmind",
-      "parse_task_id": "docmind-20260525-6d782dc33f2748a4a1020df765b8182d",
+      "parse_task_id": "docmind-20260526-10b94713ccb348498b12180a5dcf32ff",
      "layout_count": 48,
      "structure_node_count": 6,
      "semantic_block_count": 33,
      "vector_chunk_count": 34,
      "artifact_keys": {
-        "layouts": "artifacts/52bd970f/layouts.json",
-        "structure_nodes": "artifacts/52bd970f/structure_nodes.json",
-        "semantic_blocks": "artifacts/52bd970f/semantic_blocks.json",
-        "vector_chunks": "artifacts/52bd970f/vector_chunks.json"
+        "layouts": "artifacts/7cbdfe3c/layouts.json",
+        "structure_nodes": "artifacts/7cbdfe3c/structure_nodes.json",
+        "semantic_blocks": "artifacts/7cbdfe3c/semantic_blocks.json",
+        "vector_chunks": "artifacts/7cbdfe3c/vector_chunks.json"
      },
      "processing_stage": "indexed",
-      "index_collection": "regulations_dense_1024_v1"
+      "index_collection": "regulations_dense_1024_v2"
    }
  }
 }