"""Local parser adapter for the migrated backend architecture.""" from __future__ import annotations from pathlib import Path from app.config.settings import settings from app.domain.documents import DocumentParser, ParsedDocument from app.services.parser.docx_parser import parse_docx_to_markdown from app.services.parser.pdf_parser import parse_pdf_to_markdown class LocalDocumentParser(DocumentParser): """Adapt the existing local PDF/DOCX parsers to the new parser port.""" parser_name = "local_markdown_parser" def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument: suffix = Path(file_path).suffix.lower() if suffix == ".pdf": markdown_text = parse_pdf_to_markdown(file_path) elif suffix in {".docx", ".doc"}: markdown_text = parse_docx_to_markdown(file_path) else: raise ValueError(f"不支持的文件类型: {suffix}") if not markdown_text.strip(): raise ValueError("本地解析完成但未提取到有效文本") return ParsedDocument( doc_id=doc_id, doc_name=doc_name, structure_nodes=[], semantic_blocks=[], vector_chunks=[], parser_name=self.parser_name, raw_text=markdown_text, raw_layouts=[], metadata={ "source": "local_parser", "file_suffix": suffix, "artifact_prefix": settings.document_parse_artifact_prefix, }, )