first commit
This commit is contained in:
7
rag_eval/dataset_builder/parser/__init__.py
Normal file
7
rag_eval/dataset_builder/parser/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
"""Parser integrations and layout normalization helpers for dataset build jobs."""
|
||||
|
||||
from .aliyun_document_parser import AliyunDocumentParser
|
||||
from .aliyun_docmind_gateway import AliyunDocmindGateway
|
||||
from .aliyun_layout_normalizer import normalize_layouts
|
||||
|
||||
__all__ = ["AliyunDocumentParser", "AliyunDocmindGateway", "normalize_layouts"]
|
||||
202
rag_eval/dataset_builder/parser/aliyun_docmind_gateway.py
Normal file
202
rag_eval/dataset_builder/parser/aliyun_docmind_gateway.py
Normal file
@@ -0,0 +1,202 @@
|
||||
"""Gateway abstraction for Alibaba Cloud document parsing workflows."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
try:
|
||||
from alibabacloud_docmind_api20220711 import models as docmind_models
|
||||
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
|
||||
from alibabacloud_tea_openapi import models as openapi_models
|
||||
from alibabacloud_tea_util import models as runtime_models
|
||||
except ImportError:
|
||||
# Keep Alibaba SDK optional so offline flows and tests can import this module.
|
||||
DocmindClient = None
|
||||
docmind_models = None
|
||||
openapi_models = None
|
||||
runtime_models = None
|
||||
|
||||
try:
|
||||
from alibabacloud_credentials.client import Client as CredentialClient
|
||||
except ImportError:
|
||||
CredentialClient = None
|
||||
|
||||
from rag_eval.settings import EvaluationSettings
|
||||
|
||||
|
||||
class AliyunDocmindGateway:
    """Thin gateway interface around the external Alibaba document parser service.

    The gateway builds one SDK client lazily per instance and exposes the
    submit / poll / fetch steps individually, plus ``parse_document`` which
    chains them together.
    """

    # Lower-cased task states that end the polling loop in ``parse_document``.
    _SUCCESS_STATES = frozenset({"succeeded", "success", "finished"})
    # "fail" is required: the Docmind status API reports failed jobs as "Fail",
    # which lower-cases to "fail". Without it a failed job would be polled
    # until the timeout instead of raising immediately.
    _FAILURE_STATES = frozenset({"failed", "fail", "error"})

    def __init__(self, settings: EvaluationSettings):
        """Store parser-related settings needed by the gateway implementation.

        No SDK import or network access happens here; the client is created on
        first use by ``_init_client``.
        """
        self.settings = settings
        self._client = None
        self._models = None
        self._runtime_models = None

    def _load_sdk(self) -> tuple[Any, Any, Any, Any]:
        """Return the optional Alibaba SDK pieces, failing loudly if any is missing.

        Returns:
            ``(client class, openapi models, docmind models, runtime models)``.

        Raises:
            ImportError: if any of the module-level SDK imports fell back to ``None``.
        """
        sdk_parts = (DocmindClient, openapi_models, docmind_models, runtime_models)
        if any(part is None for part in sdk_parts):
            raise ImportError(
                "Alibaba Cloud Docmind SDK is not installed. "
                "Install alibabacloud-docmind-api20220711, "
                "alibabacloud-tea-openapi, alibabacloud-tea-util, and "
                "alibabacloud-credentials."
            )
        return sdk_parts

    def _resolve_credentials(self) -> tuple[str, str]:
        """Resolve AccessKey credentials from settings or the Alibaba credentials client.

        Explicit settings win when both the id and the secret are set; otherwise
        the credentials SDK's default lookup is used.

        Raises:
            ImportError: if no explicit keys are configured and the credentials
                SDK is not installed.
        """
        key_id = self.settings.alibaba_access_key_id
        key_secret = self.settings.alibaba_access_key_secret
        if key_id and key_secret:
            return key_id, key_secret

        if CredentialClient is None:
            raise ImportError(
                "Alibaba Cloud credentials SDK is not installed and no explicit "
                "ALIBABA_ACCESS_KEY_ID / ALIBABA_ACCESS_KEY_SECRET were provided."
            )

        credential = CredentialClient().get_credential()
        return credential.get_access_key_id(), credential.get_access_key_secret()

    def _init_client(self) -> Any:
        """Create and cache the underlying Alibaba SDK client.

        Also caches the docmind and runtime model modules on the instance so the
        request-building methods can reach them.
        """
        if self._client is not None:
            return self._client

        client_class, sdk_openapi_models, sdk_docmind_models, sdk_runtime_models = self._load_sdk()
        access_key_id, access_key_secret = self._resolve_credentials()
        endpoint = (self.settings.alibaba_endpoint or "docmind-api.cn-hangzhou.aliyuncs.com").strip()

        config = sdk_openapi_models.Config(
            access_key_id=access_key_id,
            access_key_secret=access_key_secret,
        )
        config.endpoint = endpoint
        # NOTE(review): the region is fixed to cn-hangzhou even when a custom
        # endpoint is configured; confirm this is intended.
        config.region_id = "cn-hangzhou"
        config.type = "access_key"

        self._client = client_class(config)
        self._models = sdk_docmind_models
        self._runtime_models = sdk_runtime_models
        return self._client

    @staticmethod
    def _to_plain_dict(value: Any) -> dict[str, Any]:
        """Convert SDK response objects into ordinary dictionaries.

        ``None`` becomes ``{}``, dicts are returned as-is (not copied), objects
        with ``to_map`` use it, and any other object with a ``__dict__``
        contributes its public attributes. Anything else yields ``{}``.
        """
        if value is None:
            return {}
        if isinstance(value, dict):
            return value
        if hasattr(value, "to_map"):
            return value.to_map()
        if hasattr(value, "__dict__"):
            return {
                key: getattr(value, key)
                for key in vars(value)
                if not key.startswith("_")
            }
        return {}

    @staticmethod
    def _extract_layouts(payload: Any) -> list[dict[str, Any]]:
        """Convert layout collections from SDK payloads into plain dictionaries.

        Looks for ``layouts`` and then ``Layouts``, as a dict key or as an
        attribute, and converts every entry with ``_to_plain_dict``.
        """
        if payload is None:
            return []
        if isinstance(payload, dict):
            layouts = payload.get("layouts") or payload.get("Layouts") or []
        else:
            layouts = getattr(payload, "layouts", None) or getattr(payload, "Layouts", None) or []
        return [AliyunDocmindGateway._to_plain_dict(item) for item in layouts]

    def submit_parse_task(self, pdf_path: Path) -> str:
        """Submit one PDF parse task and return the remote task identifier.

        Raises:
            RuntimeError: if the response carries no ``id`` / ``Id`` field.
        """
        client = self._init_client()
        runtime = self._runtime_models.RuntimeOptions()
        # The request is sent inside the ``with`` block because it carries the
        # open file handle.
        with pdf_path.open("rb") as handle:
            request = self._models.SubmitDocParserJobAdvanceRequest(
                file_url_object=handle,
                file_name=pdf_path.name,
                file_name_extension=pdf_path.suffix.lstrip(".").lower() or "pdf",
                llm_enhancement=self.settings.aliyun_llm_enhancement,
                enhancement_mode=self.settings.aliyun_enhancement_mode,
            )
            response = client.submit_doc_parser_job_advance(request, runtime)

        payload = self._to_plain_dict(getattr(getattr(response, "body", None), "data", None))
        task_id = payload.get("id") or payload.get("Id")
        if not task_id:
            raise RuntimeError(f"Aliyun submit_doc_parser_job_advance returned no task id for {pdf_path.name}")
        return str(task_id)

    def get_task_status(self, task_id: str) -> dict[str, Any]:
        """Fetch the current parse task status from the remote service.

        The returned dict always exposes the state under the lower-case
        ``status`` key when the service supplied it as ``status`` or ``Status``.
        """
        client = self._init_client()
        request = self._models.QueryDocParserStatusRequest(id=task_id)
        response = client.query_doc_parser_status(request)
        payload = self._to_plain_dict(getattr(getattr(response, "body", None), "data", None))
        status = payload.get("status") or payload.get("Status")
        if status is not None and "status" not in payload:
            payload["status"] = status
        return payload

    def fetch_layouts(self, task_id: str) -> list[dict[str, Any]]:
        """Fetch every layout entry of a completed parse task as plain dicts.

        Results are requested in pages of ``aliyun_parse_layout_step_size``
        entries, clamped to the range 1..3000. Paging stops on the first empty
        or short page.
        """
        client = self._init_client()
        layout_step_size = min(max(1, self.settings.aliyun_parse_layout_step_size), 3000)
        collected: list[dict[str, Any]] = []
        layout_num = 0

        while True:
            request = self._models.GetDocParserResultRequest(
                id=task_id,
                layout_step_size=layout_step_size,
                layout_num=layout_num,
            )
            response = client.get_doc_parser_result(request)
            payload = getattr(getattr(response, "body", None), "data", None)
            layouts = self._extract_layouts(payload)
            if not layouts:
                break
            collected.extend(layouts)
            layout_num += len(layouts)
            if len(layouts) < layout_step_size:
                break
        return collected

    def parse_document(self, pdf_path: Path) -> dict[str, Any]:
        """Run the submit/poll/fetch cycle and return a raw parse payload.

        Returns:
            A dict with ``task_id``, ``status``, ``doc_id``, ``doc_name``,
            ``layouts`` and ``metadata`` (the last status payload).

        Raises:
            RuntimeError: if the service reports a failure state.
            TimeoutError: if no terminal state is reached within
                ``aliyun_parse_timeout_seconds`` (minimum 1 second).
        """
        task_id = self.submit_parse_task(pdf_path)
        started_at = time.monotonic()
        poll_interval = max(1, self.settings.aliyun_parse_poll_interval_seconds)
        timeout_seconds = max(1, self.settings.aliyun_parse_timeout_seconds)

        while True:
            status = self.get_task_status(task_id)
            state = str(status.get("status", "")).lower()
            if state in self._SUCCESS_STATES:
                layouts = self.fetch_layouts(task_id)
                return {
                    "task_id": task_id,
                    "status": state,
                    "doc_id": status.get("doc_id") or pdf_path.stem,
                    "doc_name": status.get("doc_name") or pdf_path.name,
                    "layouts": layouts,
                    "metadata": status,
                }
            if state in self._FAILURE_STATES:
                raise RuntimeError(f"Aliyun parse task failed for {pdf_path.name}: {status}")
            if time.monotonic() - started_at > timeout_seconds:
                raise TimeoutError(f"Aliyun parse task timed out for {pdf_path.name}")
            time.sleep(poll_interval)
|
||||
38
rag_eval/dataset_builder/parser/aliyun_document_parser.py
Normal file
38
rag_eval/dataset_builder/parser/aliyun_document_parser.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""Document parser that normalizes Alibaba layout results into internal models."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from rag_eval.dataset_builder.models import ParsedDocument
|
||||
|
||||
from .aliyun_docmind_gateway import AliyunDocmindGateway
|
||||
from .aliyun_layout_normalizer import normalize_layouts
|
||||
|
||||
|
||||
class AliyunDocumentParser:
    """Turn PDFs into normalized parsed documents using the Alibaba gateway."""

    def __init__(self, gateway: AliyunDocmindGateway):
        """Keep a reference to the gateway that performs the remote parse."""
        self.gateway = gateway

    def parse(self, pdf_path: Path) -> ParsedDocument:
        """Parse one PDF through the gateway and normalize its layouts.

        The document id and name fall back to the file stem and file name when
        the gateway payload does not provide them. The resulting document's
        metadata is extended with the remote task id and the provider name.

        Raises:
            ValueError: if the gateway payload contains no layouts.
        """
        result = self.gateway.parse_document(pdf_path)

        raw_layouts = result.get("layouts")
        if not raw_layouts:
            raise ValueError(f"No layouts returned for document: {pdf_path.name}")

        resolved_id = result.get("doc_id") or pdf_path.stem
        resolved_name = result.get("doc_name") or pdf_path.name
        parsed = normalize_layouts(
            doc_id=str(resolved_id),
            doc_name=str(resolved_name),
            layouts=list(raw_layouts),
        )

        provenance = {
            "task_id": result.get("task_id"),
            "provider": "aliyun_docmind",
        }
        parsed.metadata.update(provenance)
        return parsed
|
||||
181
rag_eval/dataset_builder/parser/aliyun_layout_normalizer.py
Normal file
181
rag_eval/dataset_builder/parser/aliyun_layout_normalizer.py
Normal file
@@ -0,0 +1,181 @@
|
||||
"""Normalization helpers that convert raw layout results into source chunks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from rag_eval.dataset_builder.models import ParsedDocument, SemanticBlock, SourceChunk, StructureNode
|
||||
|
||||
|
||||
def _clean_text(value: Any) -> str:
|
||||
"""Normalize free-form layout text into a compact string."""
|
||||
if value is None:
|
||||
return ""
|
||||
return re.sub(r"\s+", " ", str(value)).strip()
|
||||
|
||||
|
||||
def _is_catalog_entry(item_type: str, text: str) -> bool:
|
||||
"""Detect table-of-contents style entries that should be skipped."""
|
||||
lowered = text.lower()
|
||||
return item_type == "toc" or "目录" in text or lowered.startswith("table of contents")
|
||||
|
||||
|
||||
def _flatten_table(item: dict[str, Any]) -> str:
|
||||
"""Convert a table layout node into a searchable plain-text representation."""
|
||||
rows = item.get("rows") or []
|
||||
flattened_rows: list[str] = []
|
||||
for row in rows:
|
||||
cells = [str(cell).strip() for cell in row if str(cell).strip()]
|
||||
if cells:
|
||||
flattened_rows.append(" | ".join(cells))
|
||||
return "\n".join(flattened_rows)
|
||||
|
||||
|
||||
def _split_text(text: str, max_chars: int = 1200, overlap: int = 150) -> list[str]:
|
||||
"""Split long text into overlapping windows so each chunk stays reviewable."""
|
||||
if len(text) <= max_chars:
|
||||
return [text]
|
||||
|
||||
windows: list[str] = []
|
||||
start = 0
|
||||
while start < len(text):
|
||||
end = min(len(text), start + max_chars)
|
||||
windows.append(text[start:end].strip())
|
||||
if end >= len(text):
|
||||
break
|
||||
start = max(end - overlap, start + 1)
|
||||
return [window for window in windows if window]
|
||||
|
||||
|
||||
def normalize_layouts(
    *,
    doc_id: str,
    doc_name: str,
    layouts: list[dict[str, Any]],
    max_chunk_chars: int = 1200,
    overlap_chars: int = 150,
) -> ParsedDocument:
    """Convert raw layouts into structure nodes, semantic blocks, and source chunks.

    Layout entries are read in order. Entries with no text, and entries that
    look like table-of-contents material, are skipped. A ``heading`` entry
    closes the block in progress, updates the section stack and records a
    structure node. Every other entry is appended to the current block; a
    ``caption`` entry is prefixed with ``图注: `` first. Each finished block
    is split into chunks of at most ``max_chunk_chars`` characters with
    ``overlap_chars`` of overlap, and the section title is prepended to any
    chunk that does not already start with it.

    Args:
        doc_id: Identifier used as the prefix of generated node/block/chunk ids.
        doc_name: Document name copied onto every block and chunk.
        layouts: Layout dicts; the keys read are ``type``, ``page``,
            ``layout_id``, ``level``, ``text`` and, for tables, ``rows``.
        max_chunk_chars: Maximum characters per chunk before the heading prefix.
        overlap_chars: Characters shared between consecutive chunks of a block.

    Returns:
        A ``ParsedDocument`` whose ``raw_text`` is the chunk texts joined by
        newlines and whose metadata holds the four element counts.
    """
    structure_nodes: list[StructureNode] = []
    semantic_blocks: list[SemanticBlock] = []
    source_chunks: list[SourceChunk] = []
    section_stack: list[tuple[int, str]] = []

    current_block_text: list[str] = []
    current_block_layout_ids: list[str] = []
    current_page_start: int | None = None
    current_page_end: int | None = None
    current_section_path = ""
    current_section_title = ""

    def int_field(item: dict[str, Any], key: str, default: int = 1) -> int:
        """Read an integer field, treating an explicit ``None`` like a missing key."""
        value = item.get(key)
        return default if value is None else int(value)

    def flush_block() -> None:
        """Finalize the in-progress semantic block and emit source chunks."""
        nonlocal current_block_text, current_block_layout_ids, current_page_start, current_page_end

        text = _clean_text(" ".join(current_block_text))
        if text and current_page_start is not None and current_page_end is not None:
            block_id = f"{doc_id}-block-{len(semantic_blocks) + 1}"
            semantic_blocks.append(
                SemanticBlock(
                    block_id=block_id,
                    doc_id=doc_id,
                    doc_name=doc_name,
                    text=text,
                    page_start=current_page_start,
                    page_end=current_page_end,
                    section_path=current_section_path,
                    section_title=current_section_title,
                    source_layout_ids=list(current_block_layout_ids),
                )
            )

            # The heading prefix is the same for every chunk of this block.
            heading_prefix = current_section_title.strip()
            chunk_parts = _split_text(text, max_chars=max_chunk_chars, overlap=overlap_chars)
            for index, part in enumerate(chunk_parts, start=1):
                if heading_prefix and not part.startswith(heading_prefix):
                    chunk_text = f"{heading_prefix}\n{part}".strip()
                else:
                    chunk_text = part
                source_chunks.append(
                    SourceChunk(
                        chunk_id=f"{block_id}-chunk-{index}",
                        doc_id=doc_id,
                        doc_name=doc_name,
                        text=chunk_text,
                        page_start=current_page_start,
                        page_end=current_page_end,
                        section_path=current_section_path,
                        section_title=current_section_title,
                        source_layout_ids=list(current_block_layout_ids),
                    )
                )

        # Start a fresh accumulator whether or not a block was emitted.
        current_block_text = []
        current_block_layout_ids = []
        current_page_start = None
        current_page_end = None

    for index, item in enumerate(layouts, start=1):
        item_type = str(item.get("type") or "paragraph").lower()
        # An explicit None must not crash int(); fall back to the same
        # defaults used for missing keys.
        page = int_field(item, "page")
        level = int_field(item, "level")
        layout_id = str(item.get("layout_id") or f"layout-{index}")

        if item_type == "table":
            text = _flatten_table(item)
        else:
            text = _clean_text(item.get("text"))

        if not text or _is_catalog_entry(item_type, text):
            continue

        if item_type == "heading":
            flush_block()
            # Drop headings at the same or a deeper level so the stack always
            # describes the path from the top down to this heading.
            while section_stack and section_stack[-1][0] >= level:
                section_stack.pop()
            section_stack.append((level, text))
            current_section_title = text
            current_section_path = " > ".join(title for _, title in section_stack)
            structure_nodes.append(
                StructureNode(
                    node_id=f"{doc_id}-node-{len(structure_nodes) + 1}",
                    level=level,
                    title=text,
                    page_start=page,
                    page_end=page,
                    section_path=current_section_path,
                )
            )
            continue

        if item_type == "caption":
            text = f"图注: {text}"

        if current_page_start is None:
            current_page_start = page
        current_page_end = page
        current_block_text.append(text)
        current_block_layout_ids.append(layout_id)

    flush_block()

    raw_text = "\n".join(chunk.text for chunk in source_chunks)
    metadata = {
        "layout_count": len(layouts),
        "structure_node_count": len(structure_nodes),
        "semantic_block_count": len(semantic_blocks),
        "source_chunk_count": len(source_chunks),
    }
    return ParsedDocument(
        doc_id=doc_id,
        doc_name=doc_name,
        raw_text=raw_text,
        structure_nodes=structure_nodes,
        semantic_blocks=semantic_blocks,
        source_chunks=source_chunks,
        metadata=metadata,
    )
|
||||
Reference in New Issue
Block a user