Files

45 lines
1.5 KiB
Python
Raw Permalink Normal View History

"""Local parser adapter for the migrated backend architecture."""
from __future__ import annotations
from pathlib import Path
from app.config.settings import settings
from app.domain.documents import DocumentParser, ParsedDocument
from app.services.parser.docx_parser import parse_docx_to_markdown
from app.services.parser.pdf_parser import parse_pdf_to_markdown
class LocalDocumentParser(DocumentParser):
    """Parser port implementation backed by the local PDF/DOCX converters.

    The file is dispatched on its lower-cased extension to one of the
    imported markdown converters, and the resulting text is wrapped in a
    ``ParsedDocument`` whose structural fields are left empty.
    """

    parser_name = "local_markdown_parser"

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Convert the file at ``file_path`` to markdown and wrap the result.

        Args:
            file_path: Path of the document; only its extension is inspected
                here, the path itself is handed to the converter unchanged.
            doc_id: Identifier copied into the returned document.
            doc_name: Display name copied into the returned document.

        Returns:
            A ``ParsedDocument`` carrying the markdown in ``raw_text``, empty
            ``structure_nodes`` / ``semantic_blocks`` / ``vector_chunks`` /
            ``raw_layouts`` lists, and metadata recording the source, the
            file suffix and the configured artifact prefix.

        Raises:
            ValueError: If the extension is not ``.pdf``, ``.docx`` or
                ``.doc``, or if the converter output is empty or
                whitespace-only.
        """
        suffix = Path(file_path).suffix.lower()

        # Built per call so the converter names are resolved at call time.
        # Note that ``.doc`` is routed to the same converter as ``.docx``.
        converters = {
            ".pdf": parse_pdf_to_markdown,
            ".docx": parse_docx_to_markdown,
            ".doc": parse_docx_to_markdown,
        }
        to_markdown = converters.get(suffix)
        if to_markdown is None:
            raise ValueError(f"不支持的文件类型: {suffix}")

        extracted = to_markdown(file_path)
        if extracted.strip() == "":
            raise ValueError("本地解析完成但未提取到有效文本")

        document_metadata = {
            "source": "local_parser",
            "file_suffix": suffix,
            "artifact_prefix": settings.document_parse_artifact_prefix,
        }
        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            structure_nodes=[],
            semantic_blocks=[],
            vector_chunks=[],
            parser_name=self.parser_name,
            raw_text=extracted,
            raw_layouts=[],
            metadata=document_metadata,
        )