@
chore: delete old layout/common/tabs components before redesign @
This commit is contained in:
475
aliyun_parser/parse_pdf.py
Normal file
475
aliyun_parser/parse_pdf.py
Normal file
@@ -0,0 +1,475 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
阿里云文档智能 API 解析 PDF,输出三层结构 chunks
|
||||
- structure_nodes: 目录树结构
|
||||
- semantic_blocks: 语义块(章节文本、表格、图片)
|
||||
- vector_chunks: 检索块(带 overlap 切分)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
|
||||
from alibabacloud_tea_openapi import models as open_api_models
|
||||
from alibabacloud_docmind_api20220711 import models as docmind_models
|
||||
from alibabacloud_tea_util import models as util_models
|
||||
|
||||
# ===================== 阿里云配置 =====================
|
||||
ALIBABA_ACCESS_KEY_ID = "LTAI5t6fWvAsvZkoF9WTbtys"
|
||||
ALIBABA_ACCESS_KEY_SECRET = "WX4oaE4FLYRa5L85TMQkqRPHeTJAF0"
|
||||
ALIBABA_ENDPOINT = "docmind-api.cn-hangzhou.aliyuncs.com"
|
||||
|
||||
# ===================== 切分参数 =====================
|
||||
MAX_CHARS = 600
|
||||
OVERLAP_CHARS = 80
|
||||
|
||||
# ===================== 布局类型常量 =====================
|
||||
TOC_TITLES = {"目次", "目录"}
|
||||
TITLE_SUBTYPES = {"doc_title", "para_title"}
|
||||
TEXT_SUBTYPES = {"para", "none"}
|
||||
FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
|
||||
FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
|
||||
|
||||
|
||||
# ===================== 阿里云 API 客户端 =====================
|
||||
def init_client() -> DocmindClient:
|
||||
config = open_api_models.Config(
|
||||
access_key_id=ALIBABA_ACCESS_KEY_ID,
|
||||
access_key_secret=ALIBABA_ACCESS_KEY_SECRET,
|
||||
)
|
||||
config.endpoint = ALIBABA_ENDPOINT
|
||||
return DocmindClient(config)
|
||||
|
||||
|
||||
def submit_job(client: DocmindClient, file_path: str) -> str:
|
||||
"""提交文档解析任务"""
|
||||
file_name = Path(file_path).name
|
||||
request = docmind_models.SubmitDocParserJobAdvanceRequest(
|
||||
file_url_object=open(file_path, "rb"),
|
||||
file_name=file_name,
|
||||
file_name_extension=Path(file_path).suffix.lstrip("."),
|
||||
llm_enhancement=True,
|
||||
enhancement_mode="VLM",
|
||||
)
|
||||
runtime = util_models.RuntimeOptions()
|
||||
response = client.submit_doc_parser_job_advance(request, runtime)
|
||||
return response.body.data.id
|
||||
|
||||
|
||||
def query_status(client: DocmindClient, task_id: str) -> Dict:
|
||||
"""查询任务状态"""
|
||||
request = docmind_models.QueryDocParserStatusRequest(id=task_id)
|
||||
response = client.query_doc_parser_status(request)
|
||||
return response.body.data.to_map() if response.body.data else None
|
||||
|
||||
|
||||
def wait_for_completion(client: DocmindClient, task_id: str, poll_interval: int = 5) -> bool:
|
||||
"""等待任务完成"""
|
||||
while True:
|
||||
status_data = query_status(client, task_id)
|
||||
if not status_data:
|
||||
return False
|
||||
status = status_data.get("Status", "").lower()
|
||||
if status == "success":
|
||||
return True
|
||||
elif status == "failed":
|
||||
print(f"任务失败: {status_data}")
|
||||
return False
|
||||
print(f"任务状态: {status}, 等待中...")
|
||||
time.sleep(poll_interval)
|
||||
|
||||
|
||||
def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_step_size: int = 50) -> Dict:
|
||||
"""获取解析结果"""
|
||||
request = docmind_models.GetDocParserResultRequest(
|
||||
id=task_id,
|
||||
layout_step_size=layout_step_size,
|
||||
layout_num=layout_num,
|
||||
)
|
||||
response = client.get_doc_parser_result(request)
|
||||
return response.body.data if response.body.data else None
|
||||
|
||||
|
||||
def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: int = 50) -> List[Dict]:
|
||||
"""收集所有解析结果"""
|
||||
all_layouts = []
|
||||
layout_num = 0
|
||||
while True:
|
||||
result_data = get_result(client, task_id, layout_num, layout_step_size)
|
||||
if not result_data:
|
||||
break
|
||||
layouts = result_data.get("layouts", [])
|
||||
if not layouts:
|
||||
break
|
||||
all_layouts.extend(layouts)
|
||||
layout_num += len(layouts)
|
||||
if len(layouts) < layout_step_size:
|
||||
break
|
||||
return all_layouts
|
||||
|
||||
|
||||
# ===================== 文本处理 =====================
|
||||
def normalize_text(text: str) -> str:
|
||||
text = text.replace("\r", "\n")
|
||||
text = text.replace(" ", " ")
|
||||
text = re.sub(r"\n+", "\n", text)
|
||||
text = re.sub(r"[ \t]+", " ", text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def get_page(layout: Dict) -> int:
|
||||
return layout.get("pageNum", layout.get("pageNumber", 0))
|
||||
|
||||
|
||||
def get_text(layout: Dict) -> str:
|
||||
text = normalize_text(layout.get("text", ""))
|
||||
if text:
|
||||
return text
|
||||
return normalize_text(layout.get("markdownContent", ""))
|
||||
|
||||
|
||||
# ===================== 布局类型判断 =====================
|
||||
def is_title(layout: Dict) -> bool:
|
||||
return layout.get("type") == "title" or layout.get("subType") in TITLE_SUBTYPES
|
||||
|
||||
|
||||
def is_text(layout: Dict) -> bool:
|
||||
return layout.get("type") == "text" and layout.get("subType", "none") in TEXT_SUBTYPES
|
||||
|
||||
|
||||
def is_figure(layout: Dict) -> bool:
|
||||
return layout.get("type") in FIGURE_TYPES or layout.get("subType") in FIGURE_SUBTYPES
|
||||
|
||||
|
||||
def is_table(layout: Dict) -> bool:
|
||||
return layout.get("type") == "table"
|
||||
|
||||
|
||||
def is_toc_layout(layout: Dict) -> bool:
|
||||
text = get_text(layout)
|
||||
if text in TOC_TITLES:
|
||||
return True
|
||||
if get_page(layout) == 1 and re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", text):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def extract_table_text(layout: Dict) -> str:
|
||||
rows = []
|
||||
for cell in layout.get("cells", []):
|
||||
texts = []
|
||||
for cell_layout in cell.get("layouts", []):
|
||||
cell_text = normalize_text(cell_layout.get("text", ""))
|
||||
if cell_text:
|
||||
texts.append(cell_text)
|
||||
if texts:
|
||||
rows.append(" ".join(texts))
|
||||
return "\n".join(rows).strip()
|
||||
|
||||
|
||||
# ===================== 结构层:目录树 =====================
|
||||
def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
|
||||
nodes = []
|
||||
for layout in layouts:
|
||||
if not is_title(layout):
|
||||
continue
|
||||
text = get_text(layout)
|
||||
if not text or text in TOC_TITLES:
|
||||
continue
|
||||
nodes.append(
|
||||
{
|
||||
"unique_id": layout.get("uniqueId"),
|
||||
"page": get_page(layout),
|
||||
"index": layout.get("index", 0),
|
||||
"level": layout.get("level", 0),
|
||||
"title": text,
|
||||
"type": layout.get("type"),
|
||||
"sub_type": layout.get("subType"),
|
||||
}
|
||||
)
|
||||
return nodes
|
||||
|
||||
|
||||
# ===================== 语义层:章节内容 =====================
|
||||
def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
|
||||
level = layout.get("level", 0)
|
||||
title = get_text(layout)
|
||||
while section_stack and section_stack[-1]["level"] >= level:
|
||||
section_stack.pop()
|
||||
section_stack.append(
|
||||
{
|
||||
"level": level,
|
||||
"title": title,
|
||||
"page": get_page(layout),
|
||||
"unique_id": layout.get("uniqueId"),
|
||||
}
|
||||
)
|
||||
return section_stack
|
||||
|
||||
|
||||
def section_path_titles(section_stack: List[Dict]) -> List[str]:
|
||||
return [item["title"] for item in section_stack]
|
||||
|
||||
|
||||
def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id: int) -> int:
|
||||
if not blocks:
|
||||
return block_id
|
||||
|
||||
texts = [item["text"] for item in blocks if item["text"]]
|
||||
merged_text = "\n".join(texts).strip()
|
||||
if not merged_text:
|
||||
return block_id
|
||||
|
||||
semantic_blocks.append(
|
||||
{
|
||||
"semantic_id": f"semantic-{block_id}",
|
||||
"block_type": "section_text",
|
||||
"page_start": min(item["page"] for item in blocks),
|
||||
"page_end": max(item["page"] for item in blocks),
|
||||
"section_path": blocks[0]["section_path"],
|
||||
"section_level": blocks[0]["section_level"],
|
||||
"section_title": blocks[0]["section_title"],
|
||||
"source_ids": [item["unique_id"] for item in blocks if item.get("unique_id")],
|
||||
"text": merged_text,
|
||||
}
|
||||
)
|
||||
return block_id + 1
|
||||
|
||||
|
||||
def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
|
||||
semantic_blocks = []
|
||||
section_stack = []
|
||||
pending_text_blocks = []
|
||||
block_id = 1
|
||||
skip_toc_page = False
|
||||
|
||||
for layout in layouts:
|
||||
text = get_text(layout)
|
||||
page = get_page(layout)
|
||||
|
||||
if is_toc_layout(layout):
|
||||
skip_toc_page = True
|
||||
continue
|
||||
if skip_toc_page and page == 1:
|
||||
continue
|
||||
if skip_toc_page and page != 1:
|
||||
skip_toc_page = False
|
||||
|
||||
if is_title(layout):
|
||||
block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
|
||||
pending_text_blocks = []
|
||||
section_stack = update_section_path(section_stack, layout)
|
||||
continue
|
||||
|
||||
section_path = section_path_titles(section_stack)
|
||||
section_title = section_path[-1] if section_path else "未分类"
|
||||
section_level = len(section_path)
|
||||
|
||||
if is_table(layout):
|
||||
block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
|
||||
pending_text_blocks = []
|
||||
table_text = extract_table_text(layout)
|
||||
if table_text:
|
||||
semantic_blocks.append(
|
||||
{
|
||||
"semantic_id": f"semantic-{block_id}",
|
||||
"block_type": "table",
|
||||
"page_start": page,
|
||||
"page_end": page,
|
||||
"section_path": section_path,
|
||||
"section_level": section_level,
|
||||
"section_title": section_title,
|
||||
"source_ids": [layout.get("uniqueId")],
|
||||
"text": table_text,
|
||||
}
|
||||
)
|
||||
block_id += 1
|
||||
continue
|
||||
|
||||
if is_figure(layout):
|
||||
block_id = flush_text_block(pending_text_blocks, semantic_blocks, block_id)
|
||||
pending_text_blocks = []
|
||||
if text:
|
||||
semantic_blocks.append(
|
||||
{
|
||||
"semantic_id": f"semantic-{block_id}",
|
||||
"block_type": "figure",
|
||||
"page_start": page,
|
||||
"page_end": page,
|
||||
"section_path": section_path,
|
||||
"section_level": section_level,
|
||||
"section_title": section_title,
|
||||
"source_ids": [layout.get("uniqueId")],
|
||||
"text": text,
|
||||
}
|
||||
)
|
||||
block_id += 1
|
||||
continue
|
||||
|
||||
if is_text(layout) and text:
|
||||
pending_text_blocks.append(
|
||||
{
|
||||
"page": page,
|
||||
"text": text,
|
||||
"unique_id": layout.get("uniqueId"),
|
||||
"section_path": section_path,
|
||||
"section_level": section_level,
|
||||
"section_title": section_title,
|
||||
}
|
||||
)
|
||||
|
||||
flush_text_block(pending_text_blocks, semantic_blocks, block_id)
|
||||
return semantic_blocks
|
||||
|
||||
|
||||
# ===================== 检索层:向量 chunks =====================
|
||||
def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> List[str]:
|
||||
text = text.strip()
|
||||
if len(text) <= max_chars:
|
||||
return [text] if text else []
|
||||
|
||||
parts = []
|
||||
start = 0
|
||||
while start < len(text):
|
||||
end = min(len(text), start + max_chars)
|
||||
parts.append(text[start:end].strip())
|
||||
if end >= len(text):
|
||||
break
|
||||
start = max(0, end - overlap_chars)
|
||||
return [part for part in parts if part]
|
||||
|
||||
|
||||
def build_vector_chunks(
|
||||
semantic_blocks: List[Dict],
|
||||
doc_id: str,
|
||||
doc_title: str,
|
||||
max_chars: int,
|
||||
overlap_chars: int,
|
||||
) -> List[Dict]:
|
||||
vector_chunks = []
|
||||
chunk_index = 1
|
||||
|
||||
for block in semantic_blocks:
|
||||
pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
|
||||
for piece_index, piece in enumerate(pieces, start=1):
|
||||
if block["section_path"]:
|
||||
header = f"标准:{doc_title}\n章节:{' > '.join(block['section_path'])}\n\n"
|
||||
else:
|
||||
header = f"标准:{doc_title}\n\n"
|
||||
vector_chunks.append(
|
||||
{
|
||||
"doc_id": doc_id,
|
||||
"doc_title": doc_title,
|
||||
"chunk_id": f"chunk-{chunk_index}",
|
||||
"chunk_index": chunk_index,
|
||||
"semantic_id": block["semantic_id"],
|
||||
"chunk_type": block["block_type"],
|
||||
"piece_index": piece_index,
|
||||
"page_start": block["page_start"],
|
||||
"page_end": block["page_end"],
|
||||
"section_path": block["section_path"],
|
||||
"section_level": block["section_level"],
|
||||
"section_title": block["section_title"],
|
||||
"source_ids": block["source_ids"],
|
||||
"text": piece,
|
||||
"embedding_text": header + piece,
|
||||
}
|
||||
)
|
||||
chunk_index += 1
|
||||
|
||||
return vector_chunks
|
||||
|
||||
|
||||
# ===================== 主转换函数 =====================
|
||||
def convert_layouts(
|
||||
layouts: List[Dict],
|
||||
doc_id: str,
|
||||
doc_title: str,
|
||||
max_chars: int,
|
||||
overlap_chars: int,
|
||||
) -> Dict:
|
||||
structure_nodes = build_structure_nodes(layouts)
|
||||
semantic_blocks = build_semantic_blocks(layouts)
|
||||
vector_chunks = build_vector_chunks(
|
||||
semantic_blocks,
|
||||
doc_id=doc_id,
|
||||
doc_title=doc_title,
|
||||
max_chars=max_chars,
|
||||
overlap_chars=overlap_chars,
|
||||
)
|
||||
return {
|
||||
"doc_id": doc_id,
|
||||
"doc_title": doc_title,
|
||||
"structure_nodes": structure_nodes,
|
||||
"semantic_blocks": semantic_blocks,
|
||||
"vector_chunks": vector_chunks,
|
||||
}
|
||||
|
||||
|
||||
# ===================== CLI 入口 =====================
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="阿里云文档智能解析 PDF,输出三层结构 chunks")
|
||||
parser.add_argument("pdf_path", help="PDF 文件路径")
|
||||
parser.add_argument("--out", default="vector_chunks.json", help="输出 JSON 文件路径")
|
||||
parser.add_argument("--layouts-out", dest="layouts_output", help="输出原始 layouts JSON")
|
||||
parser.add_argument("--doc-id", default="GB14747-2006", help="文档 ID")
|
||||
parser.add_argument("--doc-title", default="GB 14747—2006 儿童三轮车安全要求", help="文档标题")
|
||||
parser.add_argument("--max-chars", type=int, default=MAX_CHARS, help="单个检索 chunk 最大字符数")
|
||||
parser.add_argument("--overlap-chars", type=int, default=OVERLAP_CHARS, help="相邻检索 chunk 重叠字符数")
|
||||
parser.add_argument("--poll-interval", type=int, default=5, help="轮询间隔(秒)")
|
||||
args = parser.parse_args()
|
||||
|
||||
pdf_path = Path(args.pdf_path).expanduser().resolve()
|
||||
if not pdf_path.exists():
|
||||
raise FileNotFoundError(f"PDF 文件不存在: {pdf_path}")
|
||||
|
||||
# 1. 提交阿里云任务
|
||||
client = init_client()
|
||||
print(f"提交任务: {pdf_path}")
|
||||
task_id = submit_job(client, str(pdf_path))
|
||||
print(f"任务 ID: {task_id}")
|
||||
|
||||
# 2. 等待完成
|
||||
print("等待任务完成...")
|
||||
if not wait_for_completion(client, task_id, args.poll_interval):
|
||||
print("任务失败,退出")
|
||||
return
|
||||
|
||||
# 3. 获取 layouts
|
||||
print("获取解析结果...")
|
||||
layouts = collect_all_results(client, task_id)
|
||||
print(f"获取到 {len(layouts)} 个布局块")
|
||||
|
||||
# 4. 输出原始 layouts(可选)
|
||||
if args.layouts_output:
|
||||
layouts_path = Path(args.layouts_output).expanduser().resolve()
|
||||
layouts_path.write_text(json.dumps(layouts, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
print(f"原始 layouts 已写入: {layouts_path}")
|
||||
|
||||
# 5. 转换为三层结构
|
||||
print("转换为三层结构...")
|
||||
data = convert_layouts(
|
||||
layouts,
|
||||
doc_id=args.doc_id,
|
||||
doc_title=args.doc_title,
|
||||
max_chars=args.max_chars,
|
||||
overlap_chars=args.overlap_chars,
|
||||
)
|
||||
|
||||
# 6. 输出结果
|
||||
output_path = Path(args.out).expanduser().resolve()
|
||||
output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
print(f"结构层节点数: {len(data['structure_nodes'])}")
|
||||
print(f"语义层块数: {len(data['semantic_blocks'])}")
|
||||
print(f"检索层块数: {len(data['vector_chunks'])}")
|
||||
print(f"输出文件: {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user