first commit

This commit is contained in:
2026-04-23 09:58:47 +08:00
commit 448e078d99
49 changed files with 5188 additions and 0 deletions

View File

@@ -0,0 +1,38 @@
# syntax=docker/dockerfile:1
# The directive above must remain the first line of the file; it selects a
# BuildKit frontend that supports the heredoc used by the warm-up step below.
FROM python:3.12-slim

WORKDIR /app

# System dependencies: MinerU needs libGL; curl is used by the HEALTHCHECK.
# libgl1 replaces libgl1-mesa-glx, a transitional package that the Debian
# releases behind python:3.12-slim no longer provide (apt-get would fail).
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libxext6 \
        libxrender1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the application code so this
# layer stays cached until requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt \
        --index-url https://pypi.tuna.tsinghua.edu.cn/simple \
        --trusted-host pypi.tuna.tsinghua.edu.cn

# Best-effort MinerU warm-up at build time, intended to speed up startup.
# A heredoc is used because a quoted multi-line `python -c "..."` without line
# continuations is cut off at the first newline by the Dockerfile parser.
# `|| true` keeps the build going even if the interpreter itself fails.
# NOTE(review): this step only imports ModelSingleton; whether the import alone
# fetches any model weights is not visible here - confirm.
RUN python <<'PY' || true
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
try:
    from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
    print('MinerU 模型下载完成')
except Exception as e:
    print(f'模型下载跳过(将在运行时下载): {e}')
PY

COPY main.py .

# NOTE(review): the service runs as root. Consider adding a dedicated non-root
# USER once write access to /app/uploads, /app/parsed and the model cache
# location has been confirmed for the deployment.

# Generous start period: the service may need time before /health responds.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -f http://localhost:8011/health || exit 1

EXPOSE 8011

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8011", "--workers", "1"]

136
services/mcp-server/main.py Normal file
View File

@@ -0,0 +1,136 @@
import os
import tempfile
import logging
from pathlib import Path
from fastapi import FastAPI, UploadFile, File, HTTPException
from pydantic import BaseModel
# Logging: INFO and above, using a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Runtime settings, each overridable through an environment variable.
DEVICE = os.environ.get("DEVICE", "cpu")
UPLOAD_DIR = Path(os.environ.get("UPLOAD_DIR", "/app/uploads"))
PARSED_DIR = Path(os.environ.get("PARSED_DIR", "/app/parsed"))

# Make sure both working directories exist as soon as the module is imported.
UPLOAD_DIR.mkdir(exist_ok=True, parents=True)
PARSED_DIR.mkdir(exist_ok=True, parents=True)

app = FastAPI(title="MinerU 文档解析服务")

# MIME type -> short file-type label.
SUPPORTED_TYPES = dict(
    [
        ("application/pdf", "pdf"),
        (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "docx",
        ),
        ("application/msword", "doc"),
        (
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "xlsx",
        ),
    ]
)
def parse_pdf_mineru(pdf_path: str) -> str:
    """Parse a PDF with MinerU, falling back to PyMuPDF on any failure.

    Args:
        pdf_path: Filesystem path of the PDF to parse.

    Returns:
        The content produced by the MinerU pipeline (an empty string when the
        pipeline returns a falsy value), or the result of
        ``parse_pdf_pymupdf`` when MinerU cannot be imported or raises.
    """
    try:
        # Imported inside the try block so that a missing or incompatible
        # MinerU installation also triggers the PyMuPDF fallback.
        # NOTE(review): confirm that this module path and the pipe API below
        # exist in the installed MinerU version; otherwise every request
        # silently takes the fallback branch.
        from magic_pdf.data.data_reader_writer import FileBasedDataWriter
        from magic_pdf.pipe.UnicodeFormulaPDFPipe import UnicodeFormulaPDFPipe

        # Intermediate artifacts live in a throwaway directory that is removed
        # when the block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            writer = FileBasedDataWriter(tmpdir)
            pipe = UnicodeFormulaPDFPipe(pdf_path, writer)
            pipe.pipe_classify()
            pipe.pipe_analyze()
            pipe.pipe_parse()
            # NOTE(review): presumably returns Markdown text - verify that the
            # value is a str, since callers use string methods on it.
            md_content = pipe.pipe_mk_uni_format(tmpdir, drop_mode="none")
            return md_content or ""
    except Exception as e:
        # Separator added so the exception text is no longer glued directly to
        # the word "PyMuPDF" in the log line.
        logger.warning(f"MinerU 解析失败,降级到 PyMuPDF: {e}")
        return parse_pdf_pymupdf(pdf_path)
def parse_pdf_pymupdf(pdf_path: str) -> str:
    """Fallback PDF parser: extract plain text page by page with PyMuPDF.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of every non-blank page, each preceded by a ``## 第 N``
        heading (N is the 1-based page number) and joined by blank lines.
        On any error a bracketed failure message is returned instead of
        raising.
    """
    try:
        import fitz  # PyMuPDF

        doc = fitz.open(pdf_path)
        try:
            pages = []
            for i, page in enumerate(doc):
                text = page.get_text()
                # Pages without visible text are skipped entirely.
                if text.strip():
                    pages.append(f"## 第 {i+1}\n\n{text}")
            return "\n\n".join(pages)
        finally:
            # Always release the document handle, even if extraction fails.
            doc.close()
    except Exception as e:
        return f"[解析失败:{e}]"
def parse_docx(file_path: str) -> str:
    """Convert a Word document to lightweight Markdown using python-docx.

    Non-blank paragraphs are emitted first; paragraphs whose style name
    contains "Heading" become Markdown headings. All tables follow the
    paragraphs, one line per row with cells joined by " | ".

    Args:
        file_path: Filesystem path of the document.

    Returns:
        The converted text with parts separated by blank lines, or a
        bracketed failure message if anything goes wrong (never raises).
    """
    try:
        from docx import Document

        doc = Document(file_path)
        parts = []
        for para in doc.paragraphs:
            if not para.text.strip():
                continue
            style = para.style.name if para.style else ""
            if "Heading" in style:
                level_text = style.replace("Heading ", "").strip()
                try:
                    # Markdown only has heading levels 1-6; clamp so that
                    # "Heading 0" or "Heading 7".."Heading 9" still produce a
                    # valid heading instead of an empty or overlong prefix.
                    level = min(max(int(level_text), 1), 6)
                except ValueError:
                    # Style name without a numeric level: default to level 2.
                    level = 2
                parts.append(f"{'#' * level} {para.text}")
            else:
                parts.append(para.text)
        for table in doc.tables:
            rows = [
                " | ".join(cell.text.strip() for cell in row.cells)
                for row in table.rows
            ]
            if rows:
                parts.append("\n".join(rows))
        return "\n\n".join(parts)
    except Exception as e:
        return f"[Word 解析失败:{e}]"
class ParseResponse(BaseModel):
    # Response schema used by the /mineru-parse and /parse-document endpoints.

    # Name of the uploaded file; the endpoint substitutes "unknown" when the
    # upload carries no filename.
    filename: str
    # Extracted document content (or a bracketed failure message produced by
    # the parser helpers).
    markdown: str
    # Heuristic count derived from the extracted text by the endpoint; always
    # at least 1.
    page_count: int
    # Label of the parser branch chosen by the endpoint: "mineru" or
    # "python-docx".
    parser: str
@app.post("/mineru-parse", response_model=ParseResponse)
async def mineru_parse(file: UploadFile = File(...)) -> ParseResponse:
    """Parse an uploaded PDF or Word document and return its text content.

    The file type is decided by the filename suffix (".pdf" is assumed when
    the upload has no filename). The upload is written to a temporary file,
    parsed, and the temporary file is always removed afterwards.

    Args:
        file: The uploaded document.

    Returns:
        A ParseResponse with the extracted content, a heuristic page count
        (never below 1) and the label of the parser branch that was chosen.

    Raises:
        HTTPException: 415 when the suffix is not .pdf, .docx or .doc.
    """
    suffix = Path(file.filename or "doc.pdf").suffix.lower()
    # Reject unsupported types up front, before reading the upload into memory
    # or touching the disk.
    if suffix not in (".pdf", ".docx", ".doc"):
        raise HTTPException(status_code=415, detail=f"不支持的文件类型:{suffix}")

    content = await file.read()
    tmp_path = None
    try:
        # delete=False because the parsers reopen the file by path; cleanup is
        # handled by the finally clause, which also covers a failed write.
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(content)

        # NOTE(review): the parsers run synchronously inside this coroutine,
        # so the event loop is blocked for the duration of a parse - consider
        # off-loading once thread-safety of the parsers is confirmed.
        if suffix == ".pdf":
            markdown = parse_pdf_mineru(tmp_path)
            # NOTE(review): "mineru" is reported even when parse_pdf_mineru
            # fell back to PyMuPDF internally.
            parser = "mineru"
        else:
            # NOTE(review): ".doc" is routed to python-docx as well; verify
            # that it can read the legacy binary format.
            markdown = parse_docx(tmp_path)
            parser = "python-docx"

        # Heuristic: PDFs count the "## 第" page headings emitted by
        # parse_pdf_pymupdf; Word documents count blank-line separators.
        page_count = markdown.count("## 第") if suffix == ".pdf" else markdown.count("\n\n")
        return ParseResponse(
            filename=file.filename or "unknown",
            markdown=markdown,
            page_count=max(page_count, 1),
            parser=parser,
        )
    finally:
        if tmp_path is not None:
            os.unlink(tmp_path)
@app.post("/parse-document", response_model=ParseResponse)
async def parse_document(file: UploadFile = File(...)) -> ParseResponse:
    """Alias endpoint: hands the upload to ``mineru_parse`` and returns its result."""
    response = await mineru_parse(file)
    return response
@app.get("/health")
def health():
    """Liveness probe: report a fixed "ok" status and the configured device."""
    payload = {"status": "ok"}
    payload["device"] = DEVICE
    return payload

View File

@@ -0,0 +1,11 @@
fastapi>=0.115
uvicorn[standard]>=0.30
pydantic>=2.7
python-multipart>=0.0.9
httpx>=0.27
# MinerU document parsing
# NOTE(review): main.py imports from the `magic_pdf` package - confirm that the
# `mineru` distribution required here actually provides that module.
mineru[pipeline]>=1.0
# Fallback parsers for Word/Excel
python-docx>=1.1
openpyxl>=3.1
PyMuPDF>=1.24 # Fallback PDF parser