39 lines
1.3 KiB
Python
39 lines
1.3 KiB
Python
|
|
"""Local parser adapter for the migrated backend architecture."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from app.domain.documents import DocumentParser, ParsedDocument
|
||
|
|
from app.services.parser.docx_parser import parse_docx_to_markdown
|
||
|
|
from app.services.parser.pdf_parser import parse_pdf_to_markdown
|
||
|
|
|
||
|
|
|
||
|
|
class LocalDocumentParser(DocumentParser):
    """Parser-port adapter backed by the existing local PDF/DOCX converters.

    The lower-cased file extension selects which converter runs; the
    Markdown it produces is returned as the raw text of a
    ``ParsedDocument``.
    """

    parser_name = "local_markdown_parser"

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Convert the file at ``file_path`` to Markdown and wrap the result.

        Args:
            file_path: Path of the document; its extension (compared
                case-insensitively) selects the converter.
            doc_id: Identifier copied unchanged into the result.
            doc_name: Document name copied unchanged into the result.

        Returns:
            A ``ParsedDocument`` whose ``raw_text`` is the converted
            Markdown. ``structure_nodes``, ``semantic_blocks`` and
            ``vector_chunks`` are empty lists, and ``metadata`` records
            the source tag and the file extension.

        Raises:
            ValueError: If the extension is not ``.pdf``, ``.docx`` or
                ``.doc``, or if the converted text is empty or
                whitespace-only.
        """
        extension = Path(file_path).suffix.lower()

        # NOTE(review): ".doc" goes through the DOCX converter; confirm that
        # converter really handles legacy binary .doc files.
        converters = {
            ".pdf": parse_pdf_to_markdown,
            ".docx": parse_docx_to_markdown,
            ".doc": parse_docx_to_markdown,
        }
        convert = converters.get(extension)
        if convert is None:
            raise ValueError(f"不支持的文件类型: {extension}")

        text = convert(file_path)

        # Whitespace-only output counts as "nothing extracted".
        has_content = bool(text.strip())
        if not has_content:
            raise ValueError("本地解析完成但未提取到有效文本")

        details = {"source": "local_parser", "file_suffix": extension}
        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            structure_nodes=[],
            semantic_blocks=[],
            vector_chunks=[],
            parser_name=self.parser_name,
            raw_text=text,
            metadata=details,
        )
|