first commit

2026-06-12 14:02:15 +08:00
commit 9cbdc1d95d
69 changed files with 9486 additions and 0 deletions
--- a/rag_eval/dataset_builder/parser/init.py
+++ b/rag_eval/dataset_builder/parser/init.py
@@ -0,0 +1,7 @@
+"""Parser integrations and layout normalization helpers for dataset build jobs."""
+
+from .aliyun_document_parser import AliyunDocumentParser
+from .aliyun_docmind_gateway import AliyunDocmindGateway
+from .aliyun_layout_normalizer import normalize_layouts
+
+__all__ = ["AliyunDocumentParser", "AliyunDocmindGateway", "normalize_layouts"]
--- a/rag_eval/dataset_builder/parser/aliyun_docmind_gateway.py
+++ b/rag_eval/dataset_builder/parser/aliyun_docmind_gateway.py
@@ -0,0 +1,202 @@
+"""Gateway abstraction for Alibaba Cloud document parsing workflows."""
+
+from __future__ import annotations
+
+import time
+from pathlib import Path
+from typing import Any
+
+try:
+    from alibabacloud_docmind_api20220711 import models as docmind_models
+    from alibabacloud_docmind_api20220711.client import Client as DocmindClient
+    from alibabacloud_tea_openapi import models as openapi_models
+    from alibabacloud_tea_util import models as runtime_models
+except ImportError:
+    # Keep Alibaba SDK optional so offline flows and tests can import this module.
+    DocmindClient = None
+    docmind_models = None
+    openapi_models = None
+    runtime_models = None
+
+try:
+    from alibabacloud_credentials.client import Client as CredentialClient
+except ImportError:
+    CredentialClient = None
+
+from rag_eval.settings import EvaluationSettings
+
+
+class AliyunDocmindGateway:
+    """Thin gateway interface around the external Alibaba document parser service."""
+
+    def __init__(self, settings: EvaluationSettings):
+        """Store parser-related settings needed by the gateway implementation."""
+        self.settings = settings
+        self._client = None
+        self._models = None
+        self._runtime_models = None
+
+    def _load_sdk(self) -> tuple[Any, Any, Any, Any]:
+        """Load Alibaba SDK modules lazily so tests and offline flows do not require them."""
+        if (
+            DocmindClient is None
+            or openapi_models is None
+            or docmind_models is None
+            or runtime_models is None
+        ):
+            raise ImportError(
+                "Alibaba Cloud Docmind SDK is not installed. "
+                "Install alibabacloud-docmind-api20220711, "
+                "alibabacloud-tea-openapi, alibabacloud-tea-util, and "
+                "alibabacloud-credentials."
+            )
+        return DocmindClient, openapi_models, docmind_models, runtime_models
+
+    def _resolve_credentials(self) -> tuple[str, str]:
+        """Resolve AccessKey credentials from settings or the Alibaba credentials client."""
+        if self.settings.alibaba_access_key_id and self.settings.alibaba_access_key_secret:
+            return self.settings.alibaba_access_key_id, self.settings.alibaba_access_key_secret
+
+        if CredentialClient is None:
+            raise ImportError(
+                "Alibaba Cloud credentials SDK is not installed and no explicit "
+                "ALIBABA_ACCESS_KEY_ID / ALIBABA_ACCESS_KEY_SECRET were provided."
+            )
+
+        credential_client = CredentialClient()
+        credential = credential_client.get_credential()
+        return credential.get_access_key_id(), credential.get_access_key_secret()
+
+    def _init_client(self) -> Any:
+        """Create and cache the underlying Alibaba SDK client."""
+        if self._client is not None:
+            return self._client
+
+        client_class, openapi_models, docmind_models, runtime_models = self._load_sdk()
+        access_key_id, access_key_secret = self._resolve_credentials()
+        endpoint = (self.settings.alibaba_endpoint or "docmind-api.cn-hangzhou.aliyuncs.com").strip()
+        config = openapi_models.Config(
+            access_key_id=access_key_id,
+            access_key_secret=access_key_secret,
+        )
+        config.endpoint = endpoint
+        config.region_id = "cn-hangzhou"
+        config.type = "access_key"
+
+        self._client = client_class(config)
+        self._models = docmind_models
+        self._runtime_models = runtime_models
+        return self._client
+
+    @staticmethod
+    def _to_plain_dict(value: Any) -> dict[str, Any]:
+        """Convert SDK response objects into ordinary dictionaries."""
+        if value is None:
+            return {}
+        if isinstance(value, dict):
+            return value
+        if hasattr(value, "to_map"):
+            return value.to_map()
+        if hasattr(value, "__dict__"):
+            return {
+                key: getattr(value, key)
+                for key in vars(value)
+                if not key.startswith("_")
+            }
+        return {}
+
+    @staticmethod
+    def _extract_layouts(payload: Any) -> list[dict[str, Any]]:
+        """Convert layout collections from SDK payloads into plain dictionaries."""
+        if payload is None:
+            return []
+        if isinstance(payload, dict):
+            layouts = payload.get("layouts") or payload.get("Layouts") or []
+        else:
+            layouts = getattr(payload, "layouts", None) or getattr(payload, "Layouts", None) or []
+        normalized: list[dict[str, Any]] = []
+        for item in layouts:
+            normalized.append(AliyunDocmindGateway._to_plain_dict(item))
+        return normalized
+
+    def submit_parse_task(self, pdf_path: Path) -> str:
+        """Submit one PDF parse task and return the remote task identifier."""
+        client = self._init_client()
+        runtime = self._runtime_models.RuntimeOptions()
+        file_name = pdf_path.name
+        with pdf_path.open("rb") as handle:
+            request = self._models.SubmitDocParserJobAdvanceRequest(
+                file_url_object=handle,
+                file_name=file_name,
+                file_name_extension=pdf_path.suffix.lstrip(".").lower() or "pdf",
+                llm_enhancement=self.settings.aliyun_llm_enhancement,
+                enhancement_mode=self.settings.aliyun_enhancement_mode,
+            )
+            response = client.submit_doc_parser_job_advance(request, runtime)
+
+        payload = self._to_plain_dict(getattr(getattr(response, "body", None), "data", None))
+        task_id = payload.get("id") or payload.get("Id")
+        if not task_id:
+            raise RuntimeError(f"Aliyun submit_doc_parser_job_advance returned no task id for {pdf_path.name}")
+        return str(task_id)
+
+    def get_task_status(self, task_id: str) -> dict[str, Any]:
+        """Fetch the current parse task status from the remote service."""
+        client = self._init_client()
+        request = self._models.QueryDocParserStatusRequest(id=task_id)
+        response = client.query_doc_parser_status(request)
+        payload = self._to_plain_dict(getattr(getattr(response, "body", None), "data", None))
+        status = payload.get("status") or payload.get("Status")
+        if status is not None and "status" not in payload:
+            payload["status"] = status
+        return payload
+
+    def fetch_layouts(self, task_id: str) -> list[dict[str, Any]]:
+        """Fetch normalized layout pages for a completed parse task."""
+        client = self._init_client()
+        layout_num = 0
+        layout_step_size = min(max(1, self.settings.aliyun_parse_layout_step_size), 3000)
+        collected: list[dict[str, Any]] = []
+
+        while True:
+            request = self._models.GetDocParserResultRequest(
+                id=task_id,
+                layout_step_size=layout_step_size,
+                layout_num=layout_num,
+            )
+            response = client.get_doc_parser_result(request)
+            payload = getattr(getattr(response, "body", None), "data", None)
+            layouts = self._extract_layouts(payload)
+            if not layouts:
+                break
+            collected.extend(layouts)
+            layout_num += len(layouts)
+            if len(layouts) < layout_step_size:
+                break
+        return collected
+
+    def parse_document(self, pdf_path: Path) -> dict[str, Any]:
+        """Run the submit/poll/fetch cycle and return a raw parse payload."""
+        task_id = self.submit_parse_task(pdf_path)
+        started_at = time.monotonic()
+        poll_interval = max(1, self.settings.aliyun_parse_poll_interval_seconds)
+        timeout_seconds = max(1, self.settings.aliyun_parse_timeout_seconds)
+
+        while True:
+            status = self.get_task_status(task_id)
+            state = str(status.get("status", "")).lower()
+            if state in {"succeeded", "success", "finished"}:
+                layouts = self.fetch_layouts(task_id)
+                return {
+                    "task_id": task_id,
+                    "status": state,
+                    "doc_id": status.get("doc_id") or pdf_path.stem,
+                    "doc_name": status.get("doc_name") or pdf_path.name,
+                    "layouts": layouts,
+                    "metadata": status,
+                }
+            if state in {"failed", "error"}:
+                raise RuntimeError(f"Aliyun parse task failed for {pdf_path.name}: {status}")
+            if time.monotonic() - started_at > timeout_seconds:
+                raise TimeoutError(f"Aliyun parse task timed out for {pdf_path.name}")
+            time.sleep(poll_interval)
--- a/rag_eval/dataset_builder/parser/aliyun_document_parser.py
+++ b/rag_eval/dataset_builder/parser/aliyun_document_parser.py
@@ -0,0 +1,38 @@
+"""Document parser that normalizes Alibaba layout results into internal models."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from rag_eval.dataset_builder.models import ParsedDocument
+
+from .aliyun_docmind_gateway import AliyunDocmindGateway
+from .aliyun_layout_normalizer import normalize_layouts
+
+
+class AliyunDocumentParser:
+    """Parse PDFs through the Alibaba gateway and normalize the returned layouts."""
+
+    def __init__(self, gateway: AliyunDocmindGateway):
+        """Store the gateway dependency used for remote parsing."""
+        self.gateway = gateway
+
+    def parse(self, pdf_path: Path) -> ParsedDocument:
+        """Parse one PDF file into a normalized parsed-document model."""
+        payload = self.gateway.parse_document(pdf_path)
+        layouts = payload.get("layouts") or []
+        if not layouts:
+            raise ValueError(f"No layouts returned for document: {pdf_path.name}")
+
+        document = normalize_layouts(
+            doc_id=str(payload.get("doc_id") or pdf_path.stem),
+            doc_name=str(payload.get("doc_name") or pdf_path.name),
+            layouts=list(layouts),
+        )
+        document.metadata.update(
+            {
+                "task_id": payload.get("task_id"),
+                "provider": "aliyun_docmind",
+            }
+        )
+        return document
--- a/rag_eval/dataset_builder/parser/aliyun_layout_normalizer.py
+++ b/rag_eval/dataset_builder/parser/aliyun_layout_normalizer.py
@@ -0,0 +1,181 @@
+"""Normalization helpers that convert raw layout results into source chunks."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from rag_eval.dataset_builder.models import ParsedDocument, SemanticBlock, SourceChunk, StructureNode
+
+
+def _clean_text(value: Any) -> str:
+    """Normalize free-form layout text into a compact string."""
+    if value is None:
+        return ""
+    return re.sub(r"\s+", " ", str(value)).strip()
+
+
+def _is_catalog_entry(item_type: str, text: str) -> bool:
+    """Detect table-of-contents style entries that should be skipped."""
+    lowered = text.lower()
+    return item_type == "toc" or "目录" in text or lowered.startswith("table of contents")
+
+
+def _flatten_table(item: dict[str, Any]) -> str:
+    """Convert a table layout node into a searchable plain-text representation."""
+    rows = item.get("rows") or []
+    flattened_rows: list[str] = []
+    for row in rows:
+        cells = [str(cell).strip() for cell in row if str(cell).strip()]
+        if cells:
+            flattened_rows.append(" | ".join(cells))
+    return "\n".join(flattened_rows)
+
+
+def _split_text(text: str, max_chars: int = 1200, overlap: int = 150) -> list[str]:
+    """Split long text into overlapping windows so each chunk stays reviewable."""
+    if len(text) <= max_chars:
+        return [text]
+
+    windows: list[str] = []
+    start = 0
+    while start < len(text):
+        end = min(len(text), start + max_chars)
+        windows.append(text[start:end].strip())
+        if end >= len(text):
+            break
+        start = max(end - overlap, start + 1)
+    return [window for window in windows if window]
+
+
+def normalize_layouts(
+    *,
+    doc_id: str,
+    doc_name: str,
+    layouts: list[dict[str, Any]],
+    max_chunk_chars: int = 1200,
+    overlap_chars: int = 150,
+) -> ParsedDocument:
+    """Convert raw layouts into structure nodes, semantic blocks, and source chunks."""
+    structure_nodes: list[StructureNode] = []
+    semantic_blocks: list[SemanticBlock] = []
+    source_chunks: list[SourceChunk] = []
+    section_stack: list[tuple[int, str]] = []
+
+    current_block_text: list[str] = []
+    current_block_layout_ids: list[str] = []
+    current_page_start: int | None = None
+    current_page_end: int | None = None
+    current_section_path = ""
+    current_section_title = ""
+
+    def flush_block() -> None:
+        """Finalize the in-progress semantic block and emit source chunks."""
+        nonlocal current_block_text, current_block_layout_ids, current_page_start, current_page_end
+        nonlocal current_section_path, current_section_title
+
+        text = _clean_text(" ".join(current_block_text))
+        if not text or current_page_start is None or current_page_end is None:
+            current_block_text = []
+            current_block_layout_ids = []
+            current_page_start = None
+            current_page_end = None
+            return
+
+        block_id = f"{doc_id}-block-{len(semantic_blocks) + 1}"
+        block = SemanticBlock(
+            block_id=block_id,
+            doc_id=doc_id,
+            doc_name=doc_name,
+            text=text,
+            page_start=current_page_start,
+            page_end=current_page_end,
+            section_path=current_section_path,
+            section_title=current_section_title,
+            source_layout_ids=list(current_block_layout_ids),
+        )
+        semantic_blocks.append(block)
+
+        chunk_parts = _split_text(text, max_chars=max_chunk_chars, overlap=overlap_chars)
+        for index, part in enumerate(chunk_parts, start=1):
+            heading_prefix = current_section_title.strip()
+            chunk_text = f"{heading_prefix}\n{part}".strip() if heading_prefix and not part.startswith(heading_prefix) else part
+            source_chunks.append(
+                SourceChunk(
+                    chunk_id=f"{block_id}-chunk-{index}",
+                    doc_id=doc_id,
+                    doc_name=doc_name,
+                    text=chunk_text,
+                    page_start=current_page_start,
+                    page_end=current_page_end,
+                    section_path=current_section_path,
+                    section_title=current_section_title,
+                    source_layout_ids=list(current_block_layout_ids),
+                )
+            )
+
+        current_block_text = []
+        current_block_layout_ids = []
+        current_page_start = None
+        current_page_end = None
+
+    for index, item in enumerate(layouts, start=1):
+        item_type = str(item.get("type", "paragraph")).lower()
+        page = int(item.get("page", 1))
+        layout_id = str(item.get("layout_id") or f"layout-{index}")
+        level = int(item.get("level", 1))
+
+        if item_type == "table":
+            text = _flatten_table(item)
+        else:
+            text = _clean_text(item.get("text"))
+
+        if not text or _is_catalog_entry(item_type, text):
+            continue
+
+        if item_type == "heading":
+            flush_block()
+            while section_stack and section_stack[-1][0] >= level:
+                section_stack.pop()
+            section_stack.append((level, text))
+            section_titles = [title for _, title in section_stack]
+            current_section_title = text
+            current_section_path = " > ".join(section_titles)
+            structure_nodes.append(
+                StructureNode(
+                    node_id=f"{doc_id}-node-{len(structure_nodes) + 1}",
+                    level=level,
+                    title=text,
+                    page_start=page,
+                    page_end=page,
+                    section_path=current_section_path,
+                )
+            )
+            continue
+
+        if item_type == "caption":
+            text = f"图注: {text}"
+
+        if current_page_start is None:
+            current_page_start = page
+        current_page_end = page
+        current_block_text.append(text)
+        current_block_layout_ids.append(layout_id)
+
+    flush_block()
+    raw_text = "\n".join(chunk.text for chunk in source_chunks)
+    metadata = {
+        "layout_count": len(layouts),
+        "structure_node_count": len(structure_nodes),
+        "semantic_block_count": len(semantic_blocks),
+        "source_chunk_count": len(source_chunks),
+    }
+    return ParsedDocument(
+        doc_id=doc_id,
+        doc_name=doc_name,
+        raw_text=raw_text,
+        structure_nodes=structure_nodes,
+        semantic_blocks=semantic_blocks,
+        source_chunks=source_chunks,
+        metadata=metadata,
+    )