feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated the RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for the embedding model and backend.
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions
--- a/backend/app/infrastructure/parser/aliyun_docmind_gateway.py
+++ b/backend/app/infrastructure/parser/aliyun_docmind_gateway.py
@@ -0,0 +1,142 @@
+"""Aliyun Docmind gateway helpers for the document ingest pipeline."""
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from alibabacloud_docmind_api20220711 import models as docmind_models
+from alibabacloud_docmind_api20220711.client import Client as DocmindClient
+from alibabacloud_tea_openapi import models as open_api_models
+from alibabacloud_tea_util import models as util_models
+
+from app.config.settings import settings
+
+# Keep provider-specific behavior isolated so the rest of the backend can stay stable.
+
+
+@dataclass
+class AliyunParsePayload:
+    """Represent the raw Aliyun parse payload returned by the gateway.
+
+    Instances are produced by ``AliyunDocmindGateway.parse_document`` once a
+    parse task has completed and all of its layout pages have been collected.
+    """
+
+    # Identifier of the submitted Docmind parse task.
+    task_id: str
+    # Raw layout records accumulated across every fetched result page.
+    layouts: list[dict[str, Any]]
+    # Number of status queries issued before the task reported success.
+    poll_attempts: int
+    # Elapsed time in milliseconds, measured from just before submission
+    # until the last result page was collected.
+    duration_ms: int
+
+class AliyunDocmindGateway:
+    """Submit, poll, and collect results from the Aliyun Docmind API.
+
+    Endpoint, polling cadence, timeout, result page size, and the LLM
+    enhancement flags are read from the project ``settings`` object once,
+    when the gateway is constructed.
+    """
+
+    # Terminal failure statuses, compared after lower-casing. The original
+    # code only matched "failed".
+    # NOTE(review): the Docmind status reference appears to report failures
+    # as "Fail" - confirm against QueryDocParserStatus. Both spellings are
+    # accepted so a failed task is surfaced immediately instead of being
+    # polled until the timeout elapses.
+    _FAILURE_STATUSES = frozenset({"fail", "failed"})
+
+    def __init__(self) -> None:
+        """Initialize the gateway with runtime configuration."""
+        self.endpoint = settings.alibaba_endpoint
+        self.poll_interval_seconds = settings.aliyun_parse_poll_interval_seconds
+        self.timeout_seconds = settings.aliyun_parse_timeout_seconds
+        self.layout_step_size = settings.aliyun_parse_layout_step_size
+        self.llm_enhancement = settings.aliyun_llm_enhancement
+        self.enhancement_mode = settings.aliyun_enhancement_mode
+
+    def parse_document(self, *, file_path: str) -> AliyunParsePayload:
+        """Parse a single document and return the collected layouts.
+
+        Args:
+            file_path: Path of the local file to upload for parsing.
+
+        Returns:
+            The task id, all collected layouts, the number of status polls,
+            and the elapsed time in milliseconds.
+
+        Raises:
+            ValueError: If the AccessKey settings are missing.
+            RuntimeError: If no task id is returned, the status payload is
+                empty, the task fails, or no layouts are returned.
+            TimeoutError: If the task does not finish within the timeout.
+        """
+        client = self._create_client()
+        # The clock starts before submission so the timeout and the reported
+        # duration both cover upload, polling, and result collection.
+        started_at = time.monotonic()
+        task_id = self._submit_job(client=client, file_path=file_path)
+        poll_attempts = self._wait_for_completion(client=client, task_id=task_id, started_at=started_at)
+        layouts = self._collect_all_results(client=client, task_id=task_id)
+        duration_ms = int((time.monotonic() - started_at) * 1000)
+        return AliyunParsePayload(
+            task_id=task_id,
+            layouts=layouts,
+            poll_attempts=poll_attempts,
+            duration_ms=duration_ms,
+        )
+
+    def _create_client(self) -> DocmindClient:
+        """Create a Docmind client using explicit AccessKey settings only.
+
+        Raises:
+            ValueError: If either AccessKey setting is empty.
+        """
+        config = open_api_models.Config()
+        config.endpoint = self.endpoint
+
+        if not settings.alibaba_access_key_id or not settings.alibaba_access_key_secret:
+            raise ValueError(
+                "Missing Aliyun parser credentials. Set ALIBABA_ACCESS_KEY_ID and "
+                "ALIBABA_ACCESS_KEY_SECRET in the project root .env."
+            )
+
+        # Keep production behavior deterministic by using only project-configured credentials.
+        config.access_key_id = settings.alibaba_access_key_id
+        config.access_key_secret = settings.alibaba_access_key_secret
+        return DocmindClient(config)
+
+    def _submit_job(self, *, client: DocmindClient, file_path: str) -> str:
+        """Submit an asynchronous Docmind parse job and return its task id.
+
+        Raises:
+            RuntimeError: If the response carries no task id.
+        """
+        path = Path(file_path)
+        # The stream must stay open until the SDK call returns, so the
+        # request is built and sent inside the ``with`` block.
+        with path.open("rb") as file_stream:
+            request = docmind_models.SubmitDocParserJobAdvanceRequest(
+                file_url_object=file_stream,
+                file_name=path.name,
+                file_name_extension=path.suffix.lstrip("."),
+                llm_enhancement=self.llm_enhancement,
+                enhancement_mode=self.enhancement_mode,
+            )
+            runtime = util_models.RuntimeOptions()
+            response = client.submit_doc_parser_job_advance(request, runtime)
+        task_id = response.body.data.id if response.body and response.body.data else ""
+        if not task_id:
+            raise RuntimeError("Aliyun Docmind did not return a parse task id.")
+        return task_id
+
+    def _query_status(self, *, client: DocmindClient, task_id: str) -> dict[str, Any] | None:
+        """Query the current parse status; return ``None`` when the response has no data."""
+        request = docmind_models.QueryDocParserStatusRequest(id=task_id)
+        response = client.query_doc_parser_status(request)
+        return response.body.data.to_map() if response.body and response.body.data else None
+
+    def _wait_for_completion(self, *, client: DocmindClient, task_id: str, started_at: float) -> int:
+        """Poll until the parse job finishes or times out.
+
+        Args:
+            client: Docmind client used for the status queries.
+            task_id: Identifier of the submitted parse task.
+            started_at: ``time.monotonic()`` reading the timeout is measured from.
+
+        Returns:
+            The number of status queries issued, including the successful one.
+
+        Raises:
+            RuntimeError: If a status payload is empty or the task reports failure.
+            TimeoutError: If the task is still unfinished after ``timeout_seconds``.
+        """
+        poll_attempts = 0
+        while True:
+            poll_attempts += 1
+            status_payload = self._query_status(client=client, task_id=task_id)
+            if not status_payload:
+                raise RuntimeError(f"Aliyun parse status payload is empty for task {task_id}.")
+
+            # Compare case-insensitively so provider casing differences do not matter.
+            status = str(status_payload.get("Status", "")).lower()
+            if status == "success":
+                return poll_attempts
+            if status in self._FAILURE_STATUSES:
+                raise RuntimeError(f"Aliyun parse task failed: {status_payload}")
+
+            # Any other status is treated as "still running".
+            elapsed = time.monotonic() - started_at
+            if elapsed > self.timeout_seconds:
+                raise TimeoutError(
+                    f"Aliyun parse task timed out after {self.timeout_seconds}s: task_id={task_id}"
+                )
+            time.sleep(self.poll_interval_seconds)
+
+    def _collect_all_results(self, *, client: DocmindClient, task_id: str) -> list[dict[str, Any]]:
+        """Collect all paginated layout results from a completed parse task.
+
+        Raises:
+            RuntimeError: If no layouts were returned at all.
+        """
+        all_layouts: list[dict[str, Any]] = []
+        layout_num = 0
+        while True:
+            request = docmind_models.GetDocParserResultRequest(
+                id=task_id,
+                layout_step_size=self.layout_step_size,
+                layout_num=layout_num,
+            )
+            response = client.get_doc_parser_result(request)
+            payload = response.body.data if response.body else None
+            if not payload:
+                break
+            layouts = payload.get("layouts", [])
+            if not layouts:
+                break
+            all_layouts.extend(layouts)
+            # Advance the offset by what was actually received.
+            layout_num += len(layouts)
+            # A short page means the final page has been read.
+            if len(layouts) < self.layout_step_size:
+                break
+        if not all_layouts:
+            raise RuntimeError(f"Aliyun parse task returned no layouts: task_id={task_id}")
+        return all_layouts
--- a/backend/app/infrastructure/parser/aliyun_document_parser.py
+++ b/backend/app/infrastructure/parser/aliyun_document_parser.py
@@ -1,19 +1,18 @@
-"""Implement infrastructure support for aliyun document parser."""
+"""Implement infrastructure support for Aliyun document parsing."""

 from __future__ import annotations

-from app.aliyun_parser.parse_pdf import (
+from app.config.settings import settings
+from app.domain.documents import DocumentParser, ParsedDocument
+from app.infrastructure.parser.aliyun_docmind_gateway import AliyunDocmindGateway
+from app.infrastructure.parser.aliyun_layout_normalizer import (
    MAX_CHARS,
    OVERLAP_CHARS,
    build_semantic_blocks,
    build_structure_nodes,
    build_vector_chunks,
-    collect_all_results,
-    init_client,
-    submit_job,
-    wait_for_completion,
 )
-from app.domain.documents import DocumentParser, ParsedDocument
+
 # Keep adapter behavior explicit so integration details remain easy to audit.


@@ -22,13 +21,14 @@ class AliyunDocumentParser(DocumentParser):
    """Provide the Aliyun Document Parser parser."""
    parser_name = "aliyun_docmind"

+    def __init__(self) -> None:
+        """Initialize the parser adapter and its gateway dependency."""
+        # The gateway reads its own configuration from settings when constructed.
+        self.gateway = AliyunDocmindGateway()
+
    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Handle parse for the Aliyun Document Parser instance."""
-        client = init_client()
-        task_id = submit_job(client, file_path)
-        if not wait_for_completion(client, task_id):
-            raise RuntimeError("阿里云文档解析任务失败")
-        layouts = collect_all_results(client, task_id)
+        payload = self.gateway.parse_document(file_path=file_path)
+        layouts = payload.layouts
        structure_nodes = build_structure_nodes(layouts)
        semantic_blocks = build_semantic_blocks(layouts)
        vector_chunks = build_vector_chunks(
@@ -51,5 +51,13 @@ class AliyunDocumentParser(DocumentParser):
            vector_chunks=vector_chunks,
            parser_name=self.parser_name,
            raw_text=raw_text,
-            metadata={"task_id": task_id, "layout_count": len(layouts)},
+            raw_layouts=layouts,
+            metadata={
+                "task_id": payload.task_id,
+                "layout_count": len(layouts),
+                "poll_attempts": payload.poll_attempts,
+                "duration_ms": payload.duration_ms,
+                "parser_backend": self.parser_name,
+                "artifact_prefix": settings.document_parse_artifact_prefix,
+            },
        )
--- a/backend/app/infrastructure/parser/aliyun_layout_normalizer.py
+++ b/backend/app/infrastructure/parser/aliyun_layout_normalizer.py
@@ -0,0 +1,336 @@
+"""Normalize Aliyun Docmind layouts into production document structures."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+# Keep layout normalization rules centralized so parser and demos stay aligned.
+MAX_CHARS = 600
+OVERLAP_CHARS = 80
+
+TOC_TITLES = {"目次", "目录"}
+TITLE_SUBTYPES = {"doc_title", "para_title"}
+TEXT_SUBTYPES = {"para", "none"}
+FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
+FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
+
+
+def normalize_text(text: str) -> str:
+    """Normalize raw text content emitted by the parser.
+
+    Carriage returns become newlines, non-breaking spaces become ordinary
+    spaces, runs of newlines collapse to one newline, runs of spaces/tabs
+    collapse to one space, and the result is stripped.
+    """
+    text = text.replace("\r", "\n")
+    # Written as an escape on purpose: a literal non-breaking space is easily
+    # folded into an ASCII space by editors and tooling, which silently turns
+    # this replacement into a no-op.
+    text = text.replace("\u00a0", " ")
+    text = re.sub(r"\n+", "\n", text)
+    text = re.sub(r"[ \t]+", " ", text)
+    return text.strip()
+
+
+def get_page(layout: dict[str, Any]) -> int:
+    """Look up a layout's page number, preferring ``pageNum`` over ``pageNumber``."""
+    if "pageNum" in layout:
+        return layout["pageNum"]
+    # Fall back to the alternate key, then to 0 when neither is present.
+    return layout.get("pageNumber", 0)
+
+
+def get_text(layout: dict[str, Any]) -> str:
+    """Pick the normalized ``text`` of a layout, else its normalized ``markdownContent``."""
+    primary = normalize_text(layout.get("text", ""))
+    # The markdown field is only consulted when the plain text is empty.
+    return primary or normalize_text(layout.get("markdownContent", ""))
+
+
+def is_title(layout: dict[str, Any]) -> bool:
+    """Tell whether a layout is a heading, judged by its type or sub-type."""
+    if layout.get("type") == "title":
+        return True
+    return layout.get("subType") in TITLE_SUBTYPES
+
+
+def is_text(layout: dict[str, Any]) -> bool:
+    """Tell whether a layout is ordinary paragraph text."""
+    if layout.get("type") != "text":
+        return False
+    # A missing sub-type is treated the same as the explicit "none".
+    return layout.get("subType", "none") in TEXT_SUBTYPES
+
+
+def is_figure(layout: dict[str, Any]) -> bool:
+    """Tell whether a layout is a figure or one of its captions/notes."""
+    if layout.get("type") in FIGURE_TYPES:
+        return True
+    return layout.get("subType") in FIGURE_SUBTYPES
+
+
+def is_table(layout: dict[str, Any]) -> bool:
+    """Tell whether a layout's type marks it as a table."""
+    layout_type = layout.get("type")
+    return layout_type == "table"
+
+
+def is_toc_layout(layout: dict[str, Any]) -> bool:
+    """Tell whether a layout looks like part of a table of contents.
+
+    A layout qualifies when its text is a known TOC heading, or when it sits
+    on page 1 and reads like a numbered entry followed by leader dots and a
+    page number.
+    """
+    content = get_text(layout)
+    if content in TOC_TITLES:
+        return True
+    if get_page(layout) != 1:
+        return False
+    entry_match = re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", content)
+    return entry_match is not None
+
+
+def extract_table_text(layout: dict[str, Any]) -> str:
+    """Flatten a table layout's nested cells into retrievable plain text.
+
+    Each cell contributes one output line made of its non-empty nested texts
+    joined by spaces; cells with no text are omitted.
+    """
+    flattened: list[str] = []
+    for cell in layout.get("cells", []):
+        normalized = (
+            normalize_text(inner.get("text", "")) for inner in cell.get("layouts", [])
+        )
+        fragments = [fragment for fragment in normalized if fragment]
+        if fragments:
+            flattened.append(" ".join(fragments))
+    return "\n".join(flattened).strip()
+
+
+def build_structure_nodes(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Build the title hierarchy emitted to downstream storage.
+
+    Only title layouts with non-empty text that is not a TOC heading become
+    nodes; input order is preserved.
+    """
+    nodes: list[dict[str, Any]] = []
+    for layout in layouts:
+        # Non-title layouts never have their text inspected.
+        heading = get_text(layout) if is_title(layout) else ""
+        if not heading or heading in TOC_TITLES:
+            continue
+        node = {
+            "unique_id": layout.get("uniqueId"),
+            "page": get_page(layout),
+            "index": layout.get("index", 0),
+            "level": layout.get("level", 0),
+            "title": heading,
+            "type": layout.get("type"),
+            "sub_type": layout.get("subType"),
+        }
+        nodes.append(node)
+    return nodes
+
+
+def update_section_path(
+    section_stack: list[dict[str, Any]],
+    layout: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Push a newly seen title onto the heading stack.
+
+    Entries at the same or a deeper level are popped first, so the stack
+    always describes the path from the outermost heading to this one. The
+    stack is mutated in place and also returned.
+    """
+    entry = {
+        "level": layout.get("level", 0),
+        "title": get_text(layout),
+        "page": get_page(layout),
+        "unique_id": layout.get("uniqueId"),
+    }
+    # Close every open heading that is not a strict ancestor of the new one.
+    while section_stack and section_stack[-1]["level"] >= entry["level"]:
+        section_stack.pop()
+    section_stack.append(entry)
+    return section_stack
+
+
+def section_path_titles(section_stack: list[dict[str, Any]]) -> list[str]:
+    """Project the heading stack down to its titles, outermost first."""
+    titles: list[str] = []
+    for entry in section_stack:
+        titles.append(entry["title"])
+    return titles
+
+
+def flush_text_block(
+    blocks: list[dict[str, Any]],
+    semantic_blocks: list[dict[str, Any]],
+    block_id: int,
+) -> int:
+    """Merge buffered paragraph layouts into one ``section_text`` block.
+
+    The merged block is appended to ``semantic_blocks`` and takes its section
+    metadata from the first buffered paragraph. Returns the next free block
+    id: ``block_id + 1`` when a block was emitted, otherwise ``block_id``.
+    """
+    # An empty buffer joins to "", so one guard covers both "nothing buffered"
+    # and "only blank text buffered".
+    merged_text = "\n".join(item["text"] for item in blocks if item["text"]).strip()
+    if not merged_text:
+        return block_id
+
+    first = blocks[0]
+    pages = [item["page"] for item in blocks]
+    semantic_blocks.append(
+        {
+            "semantic_id": f"semantic-{block_id}",
+            "block_type": "section_text",
+            "page_start": min(pages),
+            "page_end": max(pages),
+            "section_path": first["section_path"],
+            "section_level": first["section_level"],
+            "section_title": first["section_title"],
+            "source_ids": [item["unique_id"] for item in blocks if item.get("unique_id")],
+            "text": merged_text,
+        }
+    )
+    return block_id + 1
+
+
+def build_semantic_blocks(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Build semantic content blocks from raw Aliyun layouts.
+
+    Layouts are processed in order. Table-of-contents layouts are dropped,
+    titles update the current heading stack, tables and figures each become
+    their own block, and consecutive paragraph layouts are buffered and
+    merged into a single ``section_text`` block. Layouts that match none of
+    these categories are ignored.
+
+    Returns:
+        The semantic blocks in document order, with ``semantic_id`` values
+        numbered from ``semantic-1``.
+    """
+    semantic_blocks: list[dict[str, Any]] = []
+    section_stack: list[dict[str, Any]] = []
+    pending_text_blocks: list[dict[str, Any]] = []
+    block_id = 1
+    skip_toc_page = False
+
+    for layout in layouts:
+        text = get_text(layout)
+        page = get_page(layout)
+
+        # Once a TOC layout is seen, every following layout on page 1 is
+        # skipped; the flag clears on the first layout from another page.
+        # NOTE(review): this assumes the table of contents sits on page 1 -
+        # confirm for documents with a cover page.
+        if is_toc_layout(layout):
+            skip_toc_page = True
+            continue
+        if skip_toc_page and page == 1:
+            continue
+        if skip_toc_page and page != 1:
+            skip_toc_page = False
+
+        # A title closes the paragraph run that belongs to the previous section.
+        if is_title(layout):
+            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
+            pending_text_blocks = []
+            section_stack = update_section_path(section_stack, layout)
+            continue
+
+        # section_level is the depth of the heading path, not the layout's own level.
+        section_path = section_path_titles(section_stack)
+        section_title = section_path[-1] if section_path else "未分类"
+        section_level = len(section_path)
+
+        # Tables interrupt the paragraph run and are emitted only when they contain text.
+        if is_table(layout):
+            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
+            pending_text_blocks = []
+            table_text = extract_table_text(layout)
+            if table_text:
+                semantic_blocks.append(
+                    {
+                        "semantic_id": f"semantic-{block_id}",
+                        "block_type": "table",
+                        "page_start": page,
+                        "page_end": page,
+                        "section_path": section_path,
+                        "section_level": section_level,
+                        "section_title": section_title,
+                        "source_ids": [layout.get("uniqueId")],
+                        "text": table_text,
+                    }
+                )
+                block_id += 1
+            continue
+
+        # Figures behave like tables: flush first, then emit when text is present.
+        if is_figure(layout):
+            block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
+            pending_text_blocks = []
+            if text:
+                semantic_blocks.append(
+                    {
+                        "semantic_id": f"semantic-{block_id}",
+                        "block_type": "figure",
+                        "page_start": page,
+                        "page_end": page,
+                        "section_path": section_path,
+                        "section_level": section_level,
+                        "section_title": section_title,
+                        "source_ids": [layout.get("uniqueId")],
+                        "text": text,
+                    }
+                )
+                block_id += 1
+            continue
+
+        # Paragraph text is buffered until the next title, table, or figure.
+        if is_text(layout) and text:
+            pending_text_blocks.append(
+                {
+                    "page": page,
+                    "text": text,
+                    "unique_id": layout.get("uniqueId"),
+                    "section_path": section_path,
+                    "section_level": section_level,
+                    "section_title": section_title,
+                }
+            )
+
+    # Emit whatever paragraph text is still buffered at the end of the document.
+    flush_text_block(pending_text_blocks, semantic_blocks, block_id)
+    return semantic_blocks
+
+
+def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> list[str]:
+    """Split long text into overlapping windows for embedding.
+
+    Args:
+        text: Text to split; surrounding whitespace is stripped first.
+        max_chars: Maximum window length in characters.
+        overlap_chars: Number of characters shared by consecutive windows.
+
+    Returns:
+        The stripped, non-empty windows in order. Text that already fits in
+        one window is returned as a single item; empty text yields ``[]``.
+
+    Raises:
+        ValueError: If ``max_chars`` is not positive while there is text to split.
+    """
+    text = text.strip()
+    if not text:
+        return []
+    if max_chars <= 0:
+        # A non-positive window can never make progress through the text.
+        raise ValueError(f"max_chars must be positive, got {max_chars}")
+    if len(text) <= max_chars:
+        return [text]
+
+    parts: list[str] = []
+    start = 0
+    while start < len(text):
+        end = min(len(text), start + max_chars)
+        parts.append(text[start:end].strip())
+        if end >= len(text):
+            break
+        # With overlap_chars >= max_chars the next start would land on or
+        # before the current one and the loop would never end, so always
+        # advance by at least one character.
+        start = max(end - overlap_chars, start + 1)
+    return [part for part in parts if part]
+
+
+def build_vector_chunks(
+    semantic_blocks: list[dict[str, Any]],
+    *,
+    doc_id: str,
+    doc_title: str,
+    max_chars: int,
+    overlap_chars: int,
+) -> list[dict[str, Any]]:
+    """Build retrieval chunks from semantic blocks.
+
+    Each block's text is split into overlapping pieces and every piece
+    becomes one chunk. Chunks are numbered from 1 across the whole document,
+    while ``piece_index`` restarts at 1 for each block.
+    """
+    chunks: list[dict[str, Any]] = []
+
+    for block in semantic_blocks:
+        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
+        if not pieces:
+            continue
+
+        # The context header is the same for every piece of a block.
+        path = block["section_path"]
+        if path:
+            header = f"标准：{doc_title}\n章节：{' > '.join(path)}\n\n"
+        else:
+            header = f"标准：{doc_title}\n\n"
+
+        for piece_index, piece in enumerate(pieces, start=1):
+            chunk_index = len(chunks) + 1
+            # Preserve enriched embedding text so retrieval keeps section context.
+            chunks.append(
+                {
+                    "doc_id": doc_id,
+                    "doc_title": doc_title,
+                    "chunk_id": f"chunk-{chunk_index}",
+                    "chunk_index": chunk_index,
+                    "semantic_id": block["semantic_id"],
+                    "chunk_type": block["block_type"],
+                    "piece_index": piece_index,
+                    "page_start": block["page_start"],
+                    "page_end": block["page_end"],
+                    "section_path": path,
+                    "section_level": block["section_level"],
+                    "section_title": block["section_title"],
+                    "source_ids": block["source_ids"],
+                    "text": piece,
+                    "embedding_text": header + piece,
+                }
+            )
+
+    return chunks
+
+
+def convert_layouts(
+    layouts: list[dict[str, Any]],
+    *,
+    doc_id: str,
+    doc_title: str,
+    max_chars: int,
+    overlap_chars: int,
+) -> dict[str, Any]:
+    """Convert raw Aliyun layouts into the three-layer ingest payload.
+
+    The result carries the document identity plus its structure nodes,
+    semantic blocks, and vector chunks, in that key order.
+    """
+    payload: dict[str, Any] = {"doc_id": doc_id, "doc_title": doc_title}
+    payload["structure_nodes"] = build_structure_nodes(layouts)
+    payload["semantic_blocks"] = build_semantic_blocks(layouts)
+    # Chunks are derived from the semantic blocks computed just above.
+    payload["vector_chunks"] = build_vector_chunks(
+        payload["semantic_blocks"],
+        doc_id=doc_id,
+        doc_title=doc_title,
+        max_chars=max_chars,
+        overlap_chars=overlap_chars,
+    )
+    return payload
--- a/backend/app/infrastructure/parser/local_document_parser.py
+++ b/backend/app/infrastructure/parser/local_document_parser.py
@@ -4,6 +4,7 @@ from __future__ import annotations

 from pathlib import Path

+from app.config.settings import settings
 from app.domain.documents import DocumentParser, ParsedDocument
 from app.services.parser.docx_parser import parse_docx_to_markdown
 from app.services.parser.pdf_parser import parse_pdf_to_markdown
@@ -34,5 +35,10 @@ class LocalDocumentParser(DocumentParser):
            vector_chunks=[],
            parser_name=self.parser_name,
            raw_text=markdown_text,
-            metadata={"source": "local_parser", "file_suffix": suffix},
+            raw_layouts=[],
+            metadata={
+                "source": "local_parser",
+                "file_suffix": suffix,
+                "artifact_prefix": settings.document_parse_artifact_prefix,
+            },
        )