Fix SSE route dependency and align architecture docs

2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions
--- a/backend/app/aliyun_parser/parse_pdf.py
+++ b/backend/app/aliyun_parser/parse_pdf.py
@@ -1,14 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
-阿里云文档智能 API 解析 PDF，输出三层结构 chunks
- structure_nodes: 目录树结构
- semantic_blocks: 语义块（章节文本、表格、图片）
- vector_chunks: 检索块（带 overlap 切分）
-"""
+"""Parse a PDF with the Aliyun Document Mind API and emit three-layer chunks (structure_nodes, semantic_blocks, vector_chunks)."""

 import argparse
 import json
+import os
 import re
 import time
 from pathlib import Path
@@ -19,16 +15,16 @@ from alibabacloud_tea_openapi import models as open_api_models
 from alibabacloud_docmind_api20220711 import models as docmind_models
 from alibabacloud_tea_util import models as util_models

-# ===================== 阿里云配置 =====================
-ALIBABA_ACCESS_KEY_ID = "LTAI5t6fWvAsvZkoF9WTbtys"
-ALIBABA_ACCESS_KEY_SECRET = "WX4oaE4FLYRa5L85TMQkqRPHeTJAF0"
-ALIBABA_ENDPOINT = "docmind-api.cn-hangzhou.aliyuncs.com"
+# Aliyun configuration (credentials and endpoint are read from environment variables).
+ALIBABA_ACCESS_KEY_ID = os.getenv("ALIBABA_ACCESS_KEY_ID", "")
+ALIBABA_ACCESS_KEY_SECRET = os.getenv("ALIBABA_ACCESS_KEY_SECRET", "")
+ALIBABA_ENDPOINT = os.getenv("ALIBABA_ENDPOINT", "docmind-api.cn-hangzhou.aliyuncs.com")

-# ===================== 切分参数 =====================
+# Chunking parameters.
 MAX_CHARS = 600
 OVERLAP_CHARS = 80

-# ===================== 布局类型常量 =====================
+# Layout type constants.
 TOC_TITLES = {"目次", "目录"}
 TITLE_SUBTYPES = {"doc_title", "para_title"}
 TEXT_SUBTYPES = {"para", "none"}
@@ -36,8 +32,11 @@ FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
 FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}


-# ===================== 阿里云 API 客户端 =====================
+# Aliyun API client.
 def init_client() -> DocmindClient:
+    """Build the Docmind client; raise ValueError when credentials are missing."""
+    if not ALIBABA_ACCESS_KEY_ID or not ALIBABA_ACCESS_KEY_SECRET:
+        raise ValueError("缺少阿里云文档解析凭据，请设置 ALIBABA_ACCESS_KEY_ID 和 ALIBABA_ACCESS_KEY_SECRET")
    config = open_api_models.Config(
        access_key_id=ALIBABA_ACCESS_KEY_ID,
        access_key_secret=ALIBABA_ACCESS_KEY_SECRET,
@@ -47,7 +46,7 @@ def init_client() -> DocmindClient:


 def submit_job(client: DocmindClient, file_path: str) -> str:
-    """提交文档解析任务"""
+    """Submit job."""
    file_name = Path(file_path).name
    request = docmind_models.SubmitDocParserJobAdvanceRequest(
        file_url_object=open(file_path, "rb"),
@@ -62,14 +61,14 @@ def submit_job(client: DocmindClient, file_path: str) -> str:


 def query_status(client: DocmindClient, task_id: str) -> Dict:
-    """查询任务状态"""
+    """Query the parsing task status; return None when the response carries no data."""
    request = docmind_models.QueryDocParserStatusRequest(id=task_id)
    response = client.query_doc_parser_status(request)
    return response.body.data.to_map() if response.body.data else None


 def wait_for_completion(client: DocmindClient, task_id: str, poll_interval: int = 5) -> bool:
-    """等待任务完成"""
+    """Wait for the task to complete."""
    while True:
        status_data = query_status(client, task_id)
        if not status_data:
@@ -85,7 +84,7 @@ def wait_for_completion(client: DocmindClient, task_id: str, poll_interval: int


 def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_step_size: int = 50) -> Dict:
-    """获取解析结果"""
+    """Return result."""
    request = docmind_models.GetDocParserResultRequest(
        id=task_id,
        layout_step_size=layout_step_size,
@@ -96,7 +95,7 @@ def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_


 def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: int = 50) -> List[Dict]:
-    """收集所有解析结果"""
+    """Collect all results."""
    all_layouts = []
    layout_num = 0
    while True:
@@ -113,8 +112,9 @@ def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: i
    return all_layouts


-# ===================== 文本处理 =====================
+# Text processing.
 def normalize_text(text: str) -> str:
+    """Normalize text."""
    text = text.replace("\r", "\n")
    text = text.replace(" ", " ")
    text = re.sub(r"\n+", "\n", text)
@@ -123,34 +123,41 @@ def normalize_text(text: str) -> str:


 def get_page(layout: Dict) -> int:
+    """Return the layout's page number (pageNum, else pageNumber, else 0)."""
    return layout.get("pageNum", layout.get("pageNumber", 0))


 def get_text(layout: Dict) -> str:
+    """Return the layout's normalized text, falling back to markdownContent."""
    text = normalize_text(layout.get("text", ""))
    if text:
        return text
    return normalize_text(layout.get("markdownContent", ""))


-# ===================== 布局类型判断 =====================
+# Layout type predicates.
 def is_title(layout: Dict) -> bool:
+    """Return whether the layout is a title."""
    return layout.get("type") == "title" or layout.get("subType") in TITLE_SUBTYPES


 def is_text(layout: Dict) -> bool:
+    """Return whether the layout is body text."""
    return layout.get("type") == "text" and layout.get("subType", "none") in TEXT_SUBTYPES


 def is_figure(layout: Dict) -> bool:
+    """Return whether the layout is a figure or figure-related element."""
    return layout.get("type") in FIGURE_TYPES or layout.get("subType") in FIGURE_SUBTYPES


 def is_table(layout: Dict) -> bool:
+    """Return whether the layout is a table."""
    return layout.get("type") == "table"


 def is_toc_layout(layout: Dict) -> bool:
+    """Return whether the layout belongs to the table of contents."""
    text = get_text(layout)
    if text in TOC_TITLES:
        return True
@@ -160,6 +167,7 @@ def is_toc_layout(layout: Dict) -> bool:


 def extract_table_text(layout: Dict) -> str:
+    """Extract table text."""
    rows = []
    for cell in layout.get("cells", []):
        texts = []
@@ -172,8 +180,9 @@ def extract_table_text(layout: Dict) -> str:
    return "\n".join(rows).strip()


-# ===================== 结构层：目录树 =====================
+# Structure layer: table-of-contents tree.
 def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
+    """Build structure nodes."""
    nodes = []
    for layout in layouts:
        if not is_title(layout):
@@ -195,8 +204,9 @@ def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
    return nodes


-# ===================== 语义层：章节内容 =====================
+# Semantic layer: section content.
 def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
+    """Update section path."""
    level = layout.get("level", 0)
    title = get_text(layout)
    while section_stack and section_stack[-1]["level"] >= level:
@@ -213,10 +223,12 @@ def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:


 def section_path_titles(section_stack: List[Dict]) -> List[str]:
+    """Return the titles of the entries in section_stack, in order."""
    return [item["title"] for item in section_stack]


 def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id: int) -> int:
+    """Handle flush text block."""
    if not blocks:
        return block_id

@@ -242,6 +254,7 @@ def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id:


 def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
+    """Build semantic blocks."""
    semantic_blocks = []
    section_stack = []
    pending_text_blocks = []
@@ -327,8 +340,9 @@ def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
    return semantic_blocks


-# ===================== 检索层：向量 chunks =====================
+# Retrieval layer: vector chunks.
 def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> List[str]:
+    """Handle split text with overlap."""
    text = text.strip()
    if len(text) <= max_chars:
        return [text] if text else []
@@ -351,6 +365,7 @@ def build_vector_chunks(
    max_chars: int,
    overlap_chars: int,
 ) -> List[Dict]:
+    """Build vector chunks."""
    vector_chunks = []
    chunk_index = 1

@@ -385,7 +400,31 @@ def build_vector_chunks(
    return vector_chunks


-# ===================== 主转换函数 =====================
+def parse_pdf_to_structured_chunks(
+    pdf_path: str,
+    *,
+    doc_id: str,
+    doc_title: str,
+    max_chars: int = MAX_CHARS,
+    overlap_chars: int = OVERLAP_CHARS,
+    poll_interval: int = 5,
+) -> Dict:
+    """Submit a PDF to Aliyun, wait for parsing, and convert the layouts into structured chunks; raise RuntimeError on task failure."""
+    client = init_client()
+    task_id = submit_job(client, pdf_path)
+    if not wait_for_completion(client, task_id, poll_interval):
+        raise RuntimeError("阿里云文档解析任务失败")
+    layouts = collect_all_results(client, task_id)
+    return convert_layouts(
+        layouts,
+        doc_id=doc_id,
+        doc_title=doc_title,
+        max_chars=max_chars,
+        overlap_chars=overlap_chars,
+    )
+
+
+# Main conversion function.
 def convert_layouts(
    layouts: List[Dict],
    doc_id: str,
@@ -393,6 +432,7 @@ def convert_layouts(
    max_chars: int,
    overlap_chars: int,
 ) -> Dict:
+    """Convert layouts into structure nodes, semantic blocks, and vector chunks."""
    structure_nodes = build_structure_nodes(layouts)
    semantic_blocks = build_semantic_blocks(layouts)
    vector_chunks = build_vector_chunks(
@@ -411,8 +451,9 @@ def convert_layouts(
    }


-# ===================== CLI 入口 =====================
+# CLI entry point.
 def main() -> None:
+    """Run the module entrypoint."""
    parser = argparse.ArgumentParser(description="阿里云文档智能解析 PDF，输出三层结构 chunks")
    parser.add_argument("pdf_path", help="PDF 文件路径")
    parser.add_argument("--out", default="vector_chunks.json", help="输出 JSON 文件路径")
@@ -428,30 +469,30 @@ def main() -> None:
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF 文件不存在: {pdf_path}")

-    # 1. 提交阿里云任务
+    # 1. Submit the Aliyun job.
    client = init_client()
    print(f"提交任务: {pdf_path}")
    task_id = submit_job(client, str(pdf_path))
    print(f"任务 ID: {task_id}")

-    # 2. 等待完成
+    # 2. Wait for completion.
    print("等待任务完成...")
    if not wait_for_completion(client, task_id, args.poll_interval):
        print("任务失败，退出")
        return

-    # 3. 获取 layouts
+    # 3. Fetch the layouts.
    print("获取解析结果...")
    layouts = collect_all_results(client, task_id)
    print(f"获取到 {len(layouts)} 个布局块")

-    # 4. 输出原始 layouts（可选）
+    # 4. Write the raw layouts (optional).
    if args.layouts_output:
        layouts_path = Path(args.layouts_output).expanduser().resolve()
        layouts_path.write_text(json.dumps(layouts, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"原始 layouts 已写入: {layouts_path}")

-    # 5. 转换为三层结构
+    # 5. Convert to the three-layer structure.
    print("转换为三层结构...")
    data = convert_layouts(
        layouts,
@@ -461,7 +502,7 @@ def main() -> None:
        overlap_chars=args.overlap_chars,
    )

-    # 6. 输出结果
+    # 6. Write the result.
    output_path = Path(args.out).expanduser().resolve()
    output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

@@ -472,4 +513,4 @@ def main() -> None:


 if __name__ == "__main__":
-    main()
+    main()