feat: Migrate document parsing to Aliyun and update embedding configurations
- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
@@ -10,6 +10,8 @@ from app.config.settings import settings
|
||||
from app.domain.retrieval import EmbeddingProvider
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
EMBEDDING_BATCH_SIZE = 8
|
||||
|
||||
|
||||
|
||||
class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
|
||||
@@ -27,6 +29,18 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
|
||||
self.timeout = settings.embedding_timeout_seconds
|
||||
self.dimension = settings.embedding_dim
|
||||
|
||||
def _raise_for_status(self, response: httpx.Response, *, batch_size: int) -> None:
    """Re-raise HTTP error responses with the request context in the message.

    Delegates to ``response.raise_for_status()``. When that raises, a new
    ``httpx.HTTPStatusError`` is raised instead whose message names the model,
    the batch size, the status code, the request URL and the first 500
    characters of the response body; the original error is chained as its cause.

    Args:
        response: The HTTP response returned by the embedding endpoint.
        batch_size: Number of texts sent in the request, reported in the message.

    Raises:
        httpx.HTTPStatusError: If ``response.raise_for_status()`` raised.
    """
    try:
        response.raise_for_status()
    except httpx.HTTPStatusError as exc:
        # Cap the body preview so a large error page cannot flood the logs.
        body_preview = response.text[:500].strip()
        context = (
            f"model={self.model}, batch_size={batch_size}, "
            f"status={response.status_code}, url={response.request.url}, response={body_preview}"
        )
        raise httpx.HTTPStatusError(
            f"Embedding request failed for {context}",
            request=exc.request,
            response=exc.response,
        ) from exc
|
||||
|
||||
def _request(self, texts: list[str]) -> list[list[float]]:
|
||||
"""Handle request for this module for the Open A I Compatible Embedding Provider instance."""
|
||||
if not self.api_key:
|
||||
@@ -40,7 +54,7 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
|
||||
json={"model": self.model, "input": texts},
|
||||
timeout=self.timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
self._raise_for_status(response, batch_size=len(texts))
|
||||
data = response.json()
|
||||
vectors = [item["embedding"] for item in sorted(data.get("data", []), key=lambda item: item["index"])]
|
||||
if any(len(vector) != self.dimension for vector in vectors):
|
||||
@@ -51,7 +65,12 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
|
||||
"""Embed texts for the Open A I Compatible Embedding Provider instance."""
|
||||
if not texts:
|
||||
return []
|
||||
return self._request(texts)
|
||||
vectors: list[list[float]] = []
|
||||
# Batch requests conservatively because some gateways reject larger embedding payloads.
|
||||
for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
|
||||
batch = texts[start:start + EMBEDDING_BATCH_SIZE]
|
||||
vectors.extend(self._request(batch))
|
||||
return vectors
|
||||
|
||||
def embed_query(self, text: str) -> list[float]:
|
||||
"""Embed query for the Open A I Compatible Embedding Provider instance."""
|
||||
|
||||
142
backend/app/infrastructure/parser/aliyun_docmind_gateway.py
Normal file
142
backend/app/infrastructure/parser/aliyun_docmind_gateway.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""Aliyun Docmind gateway helpers for the document ingest pipeline."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from alibabacloud_docmind_api20220711 import models as docmind_models
|
||||
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
|
||||
from alibabacloud_tea_openapi import models as open_api_models
|
||||
from alibabacloud_tea_util import models as util_models
|
||||
|
||||
from app.config.settings import settings
|
||||
|
||||
# Keep provider-specific behavior isolated so the rest of the backend can stay stable.
|
||||
|
||||
|
||||
@dataclass
class AliyunParsePayload:
    """Carry the outcome of one Docmind parse run back to the caller."""

    # Identifier of the submitted parse job.
    task_id: str
    # Layout records gathered from the result endpoint, in the order they were fetched.
    layouts: list[dict[str, Any]]
    # Number of status queries issued while waiting for the job.
    poll_attempts: int
    # Elapsed time of the parse run, in milliseconds.
    duration_ms: int
|
||||
|
||||
|
||||
class AliyunDocmindGateway:
    """Submit, poll, and collect results from the Aliyun Docmind API.

    The parser settings are read once when the gateway is constructed; a new
    SDK client is created for every ``parse_document`` call.
    """

    # Lower-cased job statuses that mean the parse job ended unsuccessfully.
    # "failed" is the value this gateway originally checked for; "fail" is
    # accepted as well because the status API is understood to report ``Fail``.
    # NOTE(review): confirm the exact value against the QueryDocParserStatus reference.
    _FAILURE_STATUSES = frozenset({"fail", "failed"})

    def __init__(self) -> None:
        """Initialize the gateway with runtime configuration."""
        self.endpoint = settings.alibaba_endpoint
        self.poll_interval_seconds = settings.aliyun_parse_poll_interval_seconds
        self.timeout_seconds = settings.aliyun_parse_timeout_seconds
        self.layout_step_size = settings.aliyun_parse_layout_step_size
        self.llm_enhancement = settings.aliyun_llm_enhancement
        self.enhancement_mode = settings.aliyun_enhancement_mode

    def parse_document(self, *, file_path: str) -> AliyunParsePayload:
        """Parse a single document and return the collected layouts.

        Args:
            file_path: Path of the local file to upload for parsing.

        Returns:
            The task id, the collected layouts, the number of status polls and
            the elapsed time in milliseconds (measured from just before
            submission until all results were collected).

        Raises:
            ValueError: If the AccessKey settings are missing.
            RuntimeError: If no task id is returned, the status payload is
                empty, the job reports failure, or no layouts are returned.
            TimeoutError: If the job does not finish within the configured timeout.
        """
        client = self._create_client()
        started_at = time.monotonic()
        task_id = self._submit_job(client=client, file_path=file_path)
        poll_attempts = self._wait_for_completion(client=client, task_id=task_id, started_at=started_at)
        layouts = self._collect_all_results(client=client, task_id=task_id)
        duration_ms = int((time.monotonic() - started_at) * 1000)
        return AliyunParsePayload(
            task_id=task_id,
            layouts=layouts,
            poll_attempts=poll_attempts,
            duration_ms=duration_ms,
        )

    def _create_client(self) -> DocmindClient:
        """Create a Docmind client using explicit AccessKey settings only.

        Raises:
            ValueError: If either AccessKey setting is empty.
        """
        config = open_api_models.Config()
        config.endpoint = self.endpoint

        if not settings.alibaba_access_key_id or not settings.alibaba_access_key_secret:
            raise ValueError(
                "Missing Aliyun parser credentials. Set ALIBABA_ACCESS_KEY_ID and "
                "ALIBABA_ACCESS_KEY_SECRET in the project root .env."
            )

        # Keep production behavior deterministic by using only project-configured credentials.
        config.access_key_id = settings.alibaba_access_key_id
        config.access_key_secret = settings.alibaba_access_key_secret
        return DocmindClient(config)

    def _submit_job(self, *, client: DocmindClient, file_path: str) -> str:
        """Submit an asynchronous Docmind parse job and return its task id.

        Raises:
            RuntimeError: If the response carries no task id.
        """
        path = Path(file_path)
        # The stream must stay open until the SDK call has uploaded the file.
        with open(file_path, "rb") as file_stream:
            request = docmind_models.SubmitDocParserJobAdvanceRequest(
                file_url_object=file_stream,
                file_name=path.name,
                file_name_extension=path.suffix.lstrip("."),
                llm_enhancement=self.llm_enhancement,
                enhancement_mode=self.enhancement_mode,
            )
            runtime = util_models.RuntimeOptions()
            response = client.submit_doc_parser_job_advance(request, runtime)
        task_id = response.body.data.id if response.body and response.body.data else ""
        if not task_id:
            raise RuntimeError("Aliyun Docmind did not return a parse task id.")
        return task_id

    def _query_status(self, *, client: DocmindClient, task_id: str) -> dict[str, Any] | None:
        """Query the current Docmind parse status.

        Returns:
            The status data converted to a dict, or ``None`` when the response
            has no body or no data.
        """
        request = docmind_models.QueryDocParserStatusRequest(id=task_id)
        response = client.query_doc_parser_status(request)
        return response.body.data.to_map() if response.body and response.body.data else None

    def _wait_for_completion(self, *, client: DocmindClient, task_id: str, started_at: float) -> int:
        """Poll until the parse job finishes or times out.

        Args:
            client: Docmind client used for the status queries.
            task_id: Identifier of the submitted parse job.
            started_at: ``time.monotonic()`` reading the timeout is measured from.

        Returns:
            The number of status queries issued, including the successful one.

        Raises:
            RuntimeError: If a status payload is empty or the job reports a
                failure status.
            TimeoutError: If the job is still unfinished after ``timeout_seconds``.
        """
        poll_attempts = 0
        while True:
            poll_attempts += 1
            status_payload = self._query_status(client=client, task_id=task_id)
            if not status_payload:
                raise RuntimeError(f"Aliyun parse status payload is empty for task {task_id}.")

            # Compare case-insensitively so the check does not depend on the API's casing.
            status = str(status_payload.get("Status", "")).lower()
            if status == "success":
                return poll_attempts
            # Fail fast on any known failure spelling instead of polling until the timeout.
            if status in self._FAILURE_STATUSES:
                raise RuntimeError(f"Aliyun parse task failed: {status_payload}")

            elapsed = time.monotonic() - started_at
            if elapsed > self.timeout_seconds:
                raise TimeoutError(
                    f"Aliyun parse task timed out after {self.timeout_seconds}s: task_id={task_id}"
                )
            time.sleep(self.poll_interval_seconds)

    def _collect_all_results(self, *, client: DocmindClient, task_id: str) -> list[dict[str, Any]]:
        """Collect all paginated layout results from a completed parse task.

        Pages of ``layout_step_size`` layouts are requested starting at offset
        0; the offset advances by the number of layouts received. Fetching
        stops on an empty payload, an empty page, or a page shorter than the
        step size.

        Raises:
            RuntimeError: If no layouts were collected at all.
        """
        all_layouts: list[dict[str, Any]] = []
        layout_num = 0
        while True:
            request = docmind_models.GetDocParserResultRequest(
                id=task_id,
                layout_step_size=self.layout_step_size,
                layout_num=layout_num,
            )
            response = client.get_doc_parser_result(request)
            payload = response.body.data if response.body else None
            if not payload:
                break
            layouts = payload.get("layouts", [])
            if not layouts:
                break
            all_layouts.extend(layouts)
            layout_num += len(layouts)
            # A short page means the service has nothing further to return.
            if len(layouts) < self.layout_step_size:
                break
        if not all_layouts:
            raise RuntimeError(f"Aliyun parse task returned no layouts: task_id={task_id}")
        return all_layouts
|
||||
@@ -1,19 +1,18 @@
|
||||
"""Implement infrastructure support for aliyun document parser."""
|
||||
"""Implement infrastructure support for Aliyun document parsing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.aliyun_parser.parse_pdf import (
|
||||
from app.config.settings import settings
|
||||
from app.domain.documents import DocumentParser, ParsedDocument
|
||||
from app.infrastructure.parser.aliyun_docmind_gateway import AliyunDocmindGateway
|
||||
from app.infrastructure.parser.aliyun_layout_normalizer import (
|
||||
MAX_CHARS,
|
||||
OVERLAP_CHARS,
|
||||
build_semantic_blocks,
|
||||
build_structure_nodes,
|
||||
build_vector_chunks,
|
||||
collect_all_results,
|
||||
init_client,
|
||||
submit_job,
|
||||
wait_for_completion,
|
||||
)
|
||||
from app.domain.documents import DocumentParser, ParsedDocument
|
||||
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
@@ -22,13 +21,14 @@ class AliyunDocumentParser(DocumentParser):
|
||||
"""Provide the Aliyun Document Parser parser."""
|
||||
parser_name = "aliyun_docmind"
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize the parser adapter and its gateway dependency."""
|
||||
self.gateway = AliyunDocmindGateway()
|
||||
|
||||
def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
|
||||
"""Handle parse for the Aliyun Document Parser instance."""
|
||||
client = init_client()
|
||||
task_id = submit_job(client, file_path)
|
||||
if not wait_for_completion(client, task_id):
|
||||
raise RuntimeError("阿里云文档解析任务失败")
|
||||
layouts = collect_all_results(client, task_id)
|
||||
payload = self.gateway.parse_document(file_path=file_path)
|
||||
layouts = payload.layouts
|
||||
structure_nodes = build_structure_nodes(layouts)
|
||||
semantic_blocks = build_semantic_blocks(layouts)
|
||||
vector_chunks = build_vector_chunks(
|
||||
@@ -51,5 +51,13 @@ class AliyunDocumentParser(DocumentParser):
|
||||
vector_chunks=vector_chunks,
|
||||
parser_name=self.parser_name,
|
||||
raw_text=raw_text,
|
||||
metadata={"task_id": task_id, "layout_count": len(layouts)},
|
||||
raw_layouts=layouts,
|
||||
metadata={
|
||||
"task_id": payload.task_id,
|
||||
"layout_count": len(layouts),
|
||||
"poll_attempts": payload.poll_attempts,
|
||||
"duration_ms": payload.duration_ms,
|
||||
"parser_backend": self.parser_name,
|
||||
"artifact_prefix": settings.document_parse_artifact_prefix,
|
||||
},
|
||||
)
|
||||
|
||||
336
backend/app/infrastructure/parser/aliyun_layout_normalizer.py
Normal file
336
backend/app/infrastructure/parser/aliyun_layout_normalizer.py
Normal file
@@ -0,0 +1,336 @@
|
||||
"""Normalize Aliyun Docmind layouts into production document structures."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
# Keep layout normalization rules centralized so parser and demos stay aligned.
|
||||
# Default chunk window and overlap sizes exported for callers
# (presumably measured in characters; see split_text_with_overlap).
MAX_CHARS = 600
OVERLAP_CHARS = 80

# Heading texts that mark a table of contents.
TOC_TITLES = {"目次", "目录"}

# Layout "subType" values treated as headings.
TITLE_SUBTYPES = {"doc_title", "para_title"}

# Layout "subType" values accepted for plain paragraph text.
TEXT_SUBTYPES = {"para", "none"}

# Layout "type" / "subType" values treated as figure-related content.
FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
|
||||
|
||||
|
||||
def normalize_text(text: str | None) -> str:
    """Normalize raw text content emitted by the parser.

    Carriage returns become newlines, ideographic and no-break spaces become
    regular spaces, runs of newlines collapse to one newline, runs of spaces
    and tabs collapse to one space, and the result is stripped.

    Args:
        text: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The normalized text, or ``""`` when there is nothing to normalize.
    """
    if not text:
        return ""
    text = text.replace("\r", "\n")
    # Spelled as escapes so the characters being replaced stay visible in review.
    text = text.replace("\u3000", " ").replace("\u00a0", " ")
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()
|
||||
|
||||
|
||||
def get_page(layout: dict[str, Any]) -> int:
    """Return the page number recorded on a layout.

    ``pageNum`` wins when the key is present; otherwise ``pageNumber`` is
    used, and 0 when neither key exists.
    """
    if "pageNum" in layout:
        return layout["pageNum"]
    return layout.get("pageNumber", 0)
|
||||
|
||||
|
||||
def get_text(layout: dict[str, Any]) -> str:
    """Return the most useful text content for a layout record.

    The normalized ``text`` field is preferred; when it is empty the
    normalized ``markdownContent`` field is returned instead. A field that is
    missing or explicitly null counts as empty.

    Returns:
        The first non-empty normalized field, or ``""`` when both are empty.
    """
    for key in ("text", "markdownContent"):
        # `or ""` guards against an explicit null value, which bypasses .get's default.
        candidate = normalize_text(layout.get(key) or "")
        if candidate:
            return candidate
    return ""
|
||||
|
||||
|
||||
def is_title(layout: dict[str, Any]) -> bool:
    """Tell whether a layout is a heading.

    A layout qualifies when its ``type`` is ``"title"`` or its ``subType`` is
    one of ``TITLE_SUBTYPES``.
    """
    if layout.get("type") == "title":
        return True
    return layout.get("subType") in TITLE_SUBTYPES
|
||||
|
||||
|
||||
def is_text(layout: dict[str, Any]) -> bool:
    """Tell whether a layout is plain paragraph text.

    The ``type`` must be ``"text"`` and the ``subType`` (treated as ``"none"``
    when the key is absent) must be one of ``TEXT_SUBTYPES``.
    """
    if layout.get("type") != "text":
        return False
    return layout.get("subType", "none") in TEXT_SUBTYPES
|
||||
|
||||
|
||||
def is_figure(layout: dict[str, Any]) -> bool:
    """Tell whether a layout is figure-related content.

    A layout qualifies when its ``type`` is in ``FIGURE_TYPES`` or its
    ``subType`` is in ``FIGURE_SUBTYPES``.
    """
    if layout.get("type") in FIGURE_TYPES:
        return True
    return layout.get("subType") in FIGURE_SUBTYPES
|
||||
|
||||
|
||||
def is_table(layout: dict[str, Any]) -> bool:
    """Tell whether a layout's ``type`` is ``"table"``."""
    layout_type = layout.get("type")
    return layout_type == "table"
|
||||
|
||||
|
||||
def is_toc_layout(layout: dict[str, Any]) -> bool:
    """Tell whether a layout looks like part of a table of contents.

    Two cases match: the layout text is exactly one of ``TOC_TITLES`` (on any
    page), or the layout sits on page 1 and its text reads like a numbered
    entry followed by at least two leader characters and a trailing number.
    """
    text = get_text(layout)
    if text in TOC_TITLES:
        return True
    # The entry pattern is only trusted on page 1.
    if get_page(layout) != 1:
        return False
    return re.match(r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$", text) is not None
|
||||
|
||||
|
||||
def extract_table_text(layout: dict[str, Any]) -> str:
    """Flatten a table layout's nested cell text into plain text.

    Each entry of ``cells`` contributes one output line: the normalized,
    non-empty ``text`` values of its nested ``layouts`` joined by single
    spaces. Cells with no text are dropped, and the lines are joined with
    newlines.
    """
    lines: list[str] = []
    for cell in layout.get("cells", []):
        fragments = [normalize_text(inner.get("text", "")) for inner in cell.get("layouts", [])]
        kept = [fragment for fragment in fragments if fragment]
        if kept:
            lines.append(" ".join(kept))
    return "\n".join(lines).strip()
|
||||
|
||||
|
||||
def build_structure_nodes(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collect one structure node per usable heading layout.

    Headings with empty text, or whose text is one of ``TOC_TITLES``, are
    left out. Nodes keep the order of ``layouts``.
    """
    nodes: list[dict[str, Any]] = []
    for layout in layouts:
        # Non-title layouts yield an empty title and are filtered by the same check.
        title = get_text(layout) if is_title(layout) else ""
        if title and title not in TOC_TITLES:
            nodes.append(
                {
                    "unique_id": layout.get("uniqueId"),
                    "page": get_page(layout),
                    "index": layout.get("index", 0),
                    "level": layout.get("level", 0),
                    "title": title,
                    "type": layout.get("type"),
                    "sub_type": layout.get("subType"),
                }
            )
    return nodes
|
||||
|
||||
|
||||
def update_section_path(
    section_stack: list[dict[str, Any]],
    layout: dict[str, Any],
) -> list[dict[str, Any]]:
    """Push a newly seen heading onto the heading stack.

    Entries whose level is greater than or equal to the new heading's level
    are popped first. The stack is modified in place and also returned.
    """
    level = layout.get("level", 0)
    title = get_text(layout)
    # Drop every open heading that is not a strict ancestor of the new one.
    while section_stack:
        if section_stack[-1]["level"] < level:
            break
        section_stack.pop()
    entry = {
        "level": level,
        "title": title,
        "page": get_page(layout),
        "unique_id": layout.get("uniqueId"),
    }
    section_stack.append(entry)
    return section_stack
|
||||
|
||||
|
||||
def section_path_titles(section_stack: list[dict[str, Any]]) -> list[str]:
    """Return a new list holding the ``title`` of each stack entry, in order."""
    titles: list[str] = []
    for entry in section_stack:
        titles.append(entry["title"])
    return titles
|
||||
|
||||
|
||||
def flush_text_block(
    blocks: list[dict[str, Any]],
    semantic_blocks: list[dict[str, Any]],
    block_id: int,
) -> int:
    """Merge buffered paragraph entries into one ``section_text`` block.

    Non-empty texts are joined with newlines. When the merged text is empty
    (including an empty buffer) nothing is appended and ``block_id`` is
    returned unchanged; otherwise one block is appended to ``semantic_blocks``
    and ``block_id + 1`` is returned. Section fields come from the first
    buffered entry, and the page range spans every buffered entry.
    """
    merged_text = "\n".join(item["text"] for item in blocks if item["text"]).strip()
    if not merged_text:
        return block_id

    first = blocks[0]
    pages = [item["page"] for item in blocks]
    source_ids = [item["unique_id"] for item in blocks if item.get("unique_id")]
    semantic_blocks.append(
        {
            "semantic_id": f"semantic-{block_id}",
            "block_type": "section_text",
            "page_start": min(pages),
            "page_end": max(pages),
            "section_path": first["section_path"],
            "section_level": first["section_level"],
            "section_title": first["section_title"],
            "source_ids": source_ids,
            "text": merged_text,
        }
    )
    return block_id + 1
|
||||
|
||||
|
||||
def build_semantic_blocks(layouts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Turn raw layouts into ordered semantic blocks.

    Layouts are walked once, in order:

    * A table-of-contents layout is skipped, and so is every later page-1
      layout; skipping ends at the first layout that is not on page 1.
    * A heading flushes buffered paragraphs and updates the heading stack.
    * A table or figure flushes buffered paragraphs and, when it has text,
      becomes its own block.
    * Paragraph text is buffered until the next flush.

    Block ids count up from 1 in emission order. Content seen before any
    heading gets an empty section path and the section title ``"未分类"``.
    """
    semantic_blocks: list[dict[str, Any]] = []
    section_stack: list[dict[str, Any]] = []
    pending: list[dict[str, Any]] = []
    block_id = 1
    skip_toc_page = False

    for layout in layouts:
        text = get_text(layout)
        page = get_page(layout)

        if is_toc_layout(layout):
            skip_toc_page = True
            continue
        if skip_toc_page:
            if page == 1:
                continue
            # The first layout off page 1 ends the table-of-contents skip.
            skip_toc_page = False

        if is_title(layout):
            block_id = flush_text_block(pending, semantic_blocks, block_id)
            pending = []
            section_stack = update_section_path(section_stack, layout)
            continue

        section_path = section_path_titles(section_stack)
        section_fields = {
            "section_path": section_path,
            "section_level": len(section_path),
            "section_title": section_path[-1] if section_path else "未分类",
        }

        # Tables take precedence over figures when a layout matches both.
        if is_table(layout):
            standalone_type = "table"
        elif is_figure(layout):
            standalone_type = "figure"
        else:
            standalone_type = None

        if standalone_type is not None:
            block_id = flush_text_block(pending, semantic_blocks, block_id)
            pending = []
            body = extract_table_text(layout) if standalone_type == "table" else text
            if body:
                semantic_blocks.append(
                    {
                        "semantic_id": f"semantic-{block_id}",
                        "block_type": standalone_type,
                        "page_start": page,
                        "page_end": page,
                        **section_fields,
                        "source_ids": [layout.get("uniqueId")],
                        "text": body,
                    }
                )
                block_id += 1
            continue

        if is_text(layout) and text:
            pending.append(
                {
                    "page": page,
                    "text": text,
                    "unique_id": layout.get("uniqueId"),
                    **section_fields,
                }
            )

    # Emit whatever paragraph text is still buffered after the last layout.
    flush_text_block(pending, semantic_blocks, block_id)
    return semantic_blocks
|
||||
|
||||
|
||||
def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> list[str]:
    """Split long text into overlapping windows for embedding.

    Args:
        text: Text to split; it is stripped first.
        max_chars: Maximum window length. Must be positive whenever the text
            is longer than ``max_chars``.
        overlap_chars: Number of characters shared by consecutive windows.
            Values outside ``0 .. max_chars - 1`` are clamped into that range
            so every window starts after the previous one and no text is
            skipped.

    Returns:
        The stripped, non-empty windows in order. Text no longer than
        ``max_chars`` is returned as a single window, and empty text yields
        ``[]``.

    Raises:
        ValueError: If the text needs splitting but ``max_chars`` is not positive.
    """
    text = text.strip()
    if len(text) <= max_chars:
        return [text] if text else []

    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    # An overlap of max_chars or more would never advance the window (endless
    # loop); a negative overlap would skip text between windows.
    overlap = min(max(overlap_chars, 0), max_chars - 1)

    parts: list[str] = []
    start = 0
    while start < len(text):
        end = min(len(text), start + max_chars)
        parts.append(text[start:end].strip())
        if end >= len(text):
            break
        start = end - overlap
    return [part for part in parts if part]
|
||||
|
||||
|
||||
def build_vector_chunks(
    semantic_blocks: list[dict[str, Any]],
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> list[dict[str, Any]]:
    """Split semantic blocks into retrieval chunks.

    Each block's text is cut with ``split_text_with_overlap``. Every piece
    becomes one chunk that copies the block's page and section fields, and
    chunks are numbered from 1 across the whole document. ``embedding_text``
    is the piece prefixed with a header naming the document title and, when
    the block has one, its section path.
    """
    vector_chunks: list[dict[str, Any]] = []

    for block in semantic_blocks:
        pieces = split_text_with_overlap(block["text"], max_chars, overlap_chars)
        if not pieces:
            continue

        # The header depends only on the block, so build it once per block.
        section_path = block["section_path"]
        if section_path:
            header = f"标准:{doc_title}\n章节:{' > '.join(section_path)}\n\n"
        else:
            header = f"标准:{doc_title}\n\n"

        for piece_index, piece in enumerate(pieces, start=1):
            chunk_index = len(vector_chunks) + 1
            # Preserve enriched embedding text so retrieval keeps section context.
            vector_chunks.append(
                {
                    "doc_id": doc_id,
                    "doc_title": doc_title,
                    "chunk_id": f"chunk-{chunk_index}",
                    "chunk_index": chunk_index,
                    "semantic_id": block["semantic_id"],
                    "chunk_type": block["block_type"],
                    "piece_index": piece_index,
                    "page_start": block["page_start"],
                    "page_end": block["page_end"],
                    "section_path": section_path,
                    "section_level": block["section_level"],
                    "section_title": block["section_title"],
                    "source_ids": block["source_ids"],
                    "text": piece,
                    "embedding_text": header + piece,
                }
            )

    return vector_chunks
|
||||
|
||||
|
||||
def convert_layouts(
    layouts: list[dict[str, Any]],
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> dict[str, Any]:
    """Run the full layout conversion and bundle the three result layers.

    Returns:
        A dict with ``doc_id``, ``doc_title``, ``structure_nodes``,
        ``semantic_blocks`` and ``vector_chunks``; the chunks are built from
        the semantic blocks.
    """
    payload: dict[str, Any] = {"doc_id": doc_id, "doc_title": doc_title}
    payload["structure_nodes"] = build_structure_nodes(layouts)
    semantic_blocks = build_semantic_blocks(layouts)
    payload["semantic_blocks"] = semantic_blocks
    payload["vector_chunks"] = build_vector_chunks(
        semantic_blocks,
        doc_id=doc_id,
        doc_title=doc_title,
        max_chars=max_chars,
        overlap_chars=overlap_chars,
    )
    return payload
|
||||
@@ -4,6 +4,7 @@ from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from app.config.settings import settings
|
||||
from app.domain.documents import DocumentParser, ParsedDocument
|
||||
from app.services.parser.docx_parser import parse_docx_to_markdown
|
||||
from app.services.parser.pdf_parser import parse_pdf_to_markdown
|
||||
@@ -34,5 +35,10 @@ class LocalDocumentParser(DocumentParser):
|
||||
vector_chunks=[],
|
||||
parser_name=self.parser_name,
|
||||
raw_text=markdown_text,
|
||||
metadata={"source": "local_parser", "file_suffix": suffix},
|
||||
raw_layouts=[],
|
||||
metadata={
|
||||
"source": "local_parser",
|
||||
"file_suffix": suffix,
|
||||
"artifact_prefix": settings.document_parse_artifact_prefix,
|
||||
},
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user