"""FastAPI service that converts uploaded PDF and Word files to Markdown.

PDF files are parsed with MinerU and fall back to PyMuPDF text extraction
when MinerU raises; Word files are parsed with python-docx.
"""

import logging
import os
import tempfile
from pathlib import Path
from typing import Tuple

from fastapi import FastAPI, UploadFile, File, HTTPException
from pydantic import BaseModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Runtime configuration, overridable through environment variables.
DEVICE = os.getenv("DEVICE", "cpu")
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", "/app/uploads"))
PARSED_DIR = Path(os.getenv("PARSED_DIR", "/app/parsed"))

# Both directories are created at import time so they exist before the
# first request is served.
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
PARSED_DIR.mkdir(parents=True, exist_ok=True)

app = FastAPI(title="MinerU 文档解析服务")

# MIME type -> short format name.
# NOTE(review): not referenced by the handlers in this file, which dispatch
# on the filename suffix instead — confirm whether other modules use it.
SUPPORTED_TYPES = {
    "application/pdf": "pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "application/msword": "doc",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
}

# Filename suffixes routed to parse_docx().
_WORD_SUFFIXES = (".docx", ".doc")

# Markdown defines heading levels 1 through 6 only.
_MAX_MARKDOWN_HEADING_LEVEL = 6


def _parse_pdf_with_fallback(pdf_path: str) -> Tuple[str, str]:
    """Parse a PDF with MinerU, falling back to PyMuPDF on any failure.

    Args:
        pdf_path: Filesystem path of the PDF to parse.

    Returns:
        A ``(markdown, parser)`` pair where ``parser`` is ``"mineru"`` when
        MinerU produced the text and ``"pymupdf"`` when the fallback did.
    """
    try:
        # Imported lazily so a missing or broken MinerU installation only
        # triggers the fallback instead of breaking module import.
        # NOTE(review): confirm these module paths exist in the installed
        # magic_pdf version; an ImportError here silently selects PyMuPDF.
        from magic_pdf.data.data_reader_writer import FileBasedDataWriter
        from magic_pdf.pipe.UnicodeFormulaPDFPipe import UnicodeFormulaPDFPipe

        with tempfile.TemporaryDirectory() as tmpdir:
            writer = FileBasedDataWriter(tmpdir)
            pipe = UnicodeFormulaPDFPipe(pdf_path, writer)
            pipe.pipe_classify()
            pipe.pipe_analyze()
            pipe.pipe_parse()
            md_content = pipe.pipe_mk_uni_format(tmpdir, drop_mode="none")
            return md_content or "", "mineru"
    except Exception as e:
        logger.warning("MinerU 解析失败,降级到 PyMuPDF:%s", e)
        return parse_pdf_pymupdf(pdf_path), "pymupdf"


def parse_pdf_mineru(pdf_path: str) -> str:
    """Parse a PDF to Markdown using MinerU.

    Any exception raised by MinerU (including a failed import) is logged as
    a warning and the result of :func:`parse_pdf_pymupdf` is returned
    instead.

    Args:
        pdf_path: Filesystem path of the PDF to parse.

    Returns:
        The Markdown text; an empty string when MinerU returns a falsy
        result.
    """
    markdown, _parser = _parse_pdf_with_fallback(pdf_path)
    return markdown


def parse_pdf_pymupdf(pdf_path: str) -> str:
    """Fallback: extract plain text from a PDF with PyMuPDF.

    Each page with non-blank text becomes a ``## 第 N 页`` section; pages
    without text are skipped.

    Args:
        pdf_path: Filesystem path of the PDF to parse.

    Returns:
        The page sections joined by blank lines, or a bracketed
        ``[解析失败:...]`` message when extraction raises.
    """
    try:
        import fitz  # PyMuPDF

        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            pages = []
            for i, page in enumerate(doc):
                text = page.get_text()
                if text.strip():
                    pages.append(f"## 第 {i+1} 页\n\n{text}")
        return "\n\n".join(pages)
    except Exception as e:
        # Best-effort by design: the error is returned as the document text
        # rather than raised, but it is also logged so it is not invisible.
        logger.warning("PyMuPDF 解析失败:%s", e)
        return f"[解析失败:{e}]"


def parse_docx(file_path: str) -> str:
    """Parse a Word document to Markdown-like text with python-docx.

    Non-blank paragraphs are emitted in order; paragraphs whose style name
    contains ``Heading`` get a ``#`` prefix matching the heading level
    (clamped to 1..6, ``##`` when the level is not numeric). All tables are
    appended after the paragraphs, one row per line with cells joined by
    `` | ``.

    Args:
        file_path: Filesystem path of the document to parse.

    Returns:
        The parts joined by blank lines, or a bracketed
        ``[Word 解析失败:...]`` message when parsing raises.
    """
    try:
        from docx import Document

        doc = Document(file_path)
        parts = []
        for para in doc.paragraphs:
            if not para.text.strip():
                continue
            # Guard against both a missing style and a style without a name.
            style = (para.style.name if para.style else "") or ""
            if "Heading" in style:
                level = style.replace("Heading ", "").strip()
                try:
                    depth = int(level)
                except ValueError:
                    prefix = "##"
                else:
                    # Keep the prefix a valid Markdown heading marker.
                    depth = min(max(depth, 1), _MAX_MARKDOWN_HEADING_LEVEL)
                    prefix = "#" * depth
                parts.append(f"{prefix} {para.text}")
            else:
                parts.append(para.text)

        for table in doc.tables:
            rows = [
                " | ".join(cell.text.strip() for cell in row.cells)
                for row in table.rows
            ]
            if rows:
                parts.append("\n".join(rows))
        return "\n\n".join(parts)
    except Exception as e:
        # Best-effort by design: the error is returned as the document text
        # rather than raised, but it is also logged so it is not invisible.
        logger.warning("Word 解析失败:%s", e)
        return f"[Word 解析失败:{e}]"


class ParseResponse(BaseModel):
    """Response body returned by the parsing endpoints."""

    filename: str  # Uploaded filename, or "unknown" when none was sent.
    markdown: str  # Parsed document text.
    page_count: int  # Heuristic count derived from the text, at least 1.
    parser: str  # Name of the parser that produced ``markdown``.


@app.post("/mineru-parse", response_model=ParseResponse)
async def mineru_parse(file: UploadFile = File(...)) -> ParseResponse:
    """Parse an uploaded PDF or Word file and return its Markdown.

    The parser is chosen from the lower-cased filename suffix (a missing
    filename is treated as ``doc.pdf``). The upload is written to a
    temporary file that is removed once parsing finishes.

    Args:
        file: The uploaded document.

    Returns:
        The parsed text together with the parser name and a page count.

    Raises:
        HTTPException: 415 when the suffix is not ``.pdf``, ``.docx`` or
            ``.doc``.
    """
    suffix = Path(file.filename or "doc.pdf").suffix.lower()
    # Reject unsupported types before reading the body or touching disk.
    if suffix != ".pdf" and suffix not in _WORD_SUFFIXES:
        raise HTTPException(status_code=415, detail=f"不支持的文件类型:{suffix}")

    content = await file.read()

    tmp_path = None
    try:
        # The write happens inside the try so a failed write cannot leave
        # the temporary file behind.
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(content)

        # NOTE(review): the parsers below are synchronous and are called
        # directly from this coroutine, so the event loop is blocked while
        # a document is being parsed.
        if suffix == ".pdf":
            markdown, parser = _parse_pdf_with_fallback(tmp_path)
            # Counts the "## 第 N 页" headings written by parse_pdf_pymupdf.
            # NOTE(review): MinerU output is not generated in this file, so
            # this count may not reflect pages for it — confirm.
            page_count = markdown.count("## 第")
        else:
            # NOTE(review): ".doc" is also routed to python-docx here —
            # confirm legacy binary .doc files are actually supported.
            markdown = parse_docx(tmp_path)
            parser = "python-docx"
            # Counts blank-line separators between parts, not real pages.
            page_count = markdown.count("\n\n")

        return ParseResponse(
            filename=file.filename or "unknown",
            markdown=markdown,
            page_count=max(page_count, 1),
            parser=parser,
        )
    finally:
        if tmp_path is not None:
            os.unlink(tmp_path)


@app.post("/parse-document", response_model=ParseResponse)
async def parse_document(file: UploadFile = File(...)) -> ParseResponse:
    """Alias endpoint that delegates to :func:`mineru_parse`."""
    return await mineru_parse(file)


@app.get("/health")
def health():
    """Report service status and the configured ``DEVICE`` value."""
    return {"status": "ok", "device": DEVICE}