291 lines
8.9 KiB
Python
291 lines
8.9 KiB
Python
|
|
"""文档上传与处理接口"""
|
|||
|
|
|
|||
|
|
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
|
|||
|
|
from fastapi.responses import FileResponse, StreamingResponse
|
|||
|
|
from typing import Optional
|
|||
|
|
import os
|
|||
|
|
import uuid
|
|||
|
|
import tempfile
|
|||
|
|
from pathlib import Path
|
|||
|
|
from loguru import logger
|
|||
|
|
from io import BytesIO
|
|||
|
|
from urllib.parse import quote
|
|||
|
|
|
|||
|
|
from ..models import DocumentUploadResponse, ErrorResponse
|
|||
|
|
from app.services.document_processor import DocumentProcessor
|
|||
|
|
from app.services.storage.minio_client import MinIOClient
|
|||
|
|
from app.config.settings import settings
|
|||
|
|
|
|||
|
|
# Router for the document endpoints; every route below lives under /documents.
router = APIRouter(
    tags=["documents"],
    prefix="/documents",
)

# Shared MinIO client used for document storage; stays None until first use.
minio_client: Optional[MinIOClient] = None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_minio_client() -> MinIOClient:
    """Return the shared MinIO client, creating it on first use.

    The client is constructed, connected and its bucket ensured before it is
    stored in the module-level ``minio_client`` cache. If any of those steps
    raises, nothing is cached, so the next call retries the whole setup
    instead of handing out a client that never finished initializing.

    Returns:
        The module-wide ``MinIOClient`` instance.
    """
    global minio_client
    if minio_client is None:
        client = MinIOClient()
        client.connect()
        client.ensure_bucket()
        # Publish to the cache only after setup completed without raising.
        minio_client = client
    return minio_client
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _build_document_records(limit: Optional[int] = None):
    """Build the document listing from the MinIO bucket, newest first.

    Every object in the bucket is listed recursively. Object names that
    contain a "/" are split at the first "/" into ``doc_id`` and
    ``filename``; names without a "/" are skipped.

    Args:
        limit: When not None, only the first ``limit`` records of the
            sorted listing are returned.

    Returns:
        A list of dicts with the keys ``doc_id``, ``filename``, ``size``,
        ``object_name``, ``download_url`` and ``last_modified`` (ISO-format
        string, or None when the object carries no modification time),
        ordered by modification time descending.
    """
    storage = get_minio_client()
    listing = storage.client.list_objects(storage.bucket, recursive=True)

    # (sort key, record) pairs so the key never has to live inside the record.
    ranked = []
    for entry in listing:
        doc_id, slash, filename = entry.object_name.partition("/")
        if not slash:
            continue

        modified = getattr(entry, "last_modified", None)
        if modified:
            iso_stamp = modified.isoformat()
            sort_key = modified.timestamp()
        else:
            # Objects without a timestamp sort as the oldest.
            iso_stamp = None
            sort_key = 0

        record = {
            "doc_id": doc_id,
            "filename": filename,
            "size": getattr(entry, "size", 0) or 0,
            "object_name": entry.object_name,
            "download_url": f"/api/v1/documents/download/{doc_id}",
            "last_modified": iso_stamp,
        }
        ranked.append((sort_key, record))

    ranked.sort(key=lambda pair: pair[0], reverse=True)

    # Slicing with None keeps the whole list, so no separate branch is needed.
    return [record for _, record in ranked[:limit]]
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(
    file: UploadFile = File(..., description="上传的文档文件"),
    doc_name: Optional[str] = Form(None, description="文档名称"),
    regulation_type: Optional[str] = Form(None, description="法规类型"),
    version: Optional[str] = Form(None, description="文档版本"),
    generate_summary: bool = Form(False, description="是否生成摘要(默认不生成,可节省约60秒)"),
):
    """Upload a document, store it in MinIO and run the processing pipeline.

    Supported formats: PDF, DOCX, DOC.
    Pipeline: parse -> chunk -> embed -> store (summary optional).
    File storage: MinIO object storage.

    Args:
        file: The uploaded document.
        doc_name: Display name; falls back to the uploaded file's name.
        regulation_type: Forwarded to the processor ("" when omitted).
        version: Forwarded to the processor ("" when omitted).
        generate_summary: Whether to generate an LLM summary. Defaults to
            False; enabling it adds roughly 60 seconds of processing time.

    Returns:
        A ``DocumentUploadResponse`` with status "success" built from the
        processor result.

    Raises:
        HTTPException: 400 when the file name is missing, the extension is
            unsupported or the file exceeds the configured size limit;
            500 when storage or processing raises, or when the processor
            reports failure.
    """
    # The file name is client-controlled. Keep only its last path component
    # before it is used in a temp path or an object key, so a name such as
    # "../../x.pdf" cannot escape the temp directory. Backslashes are
    # normalized first because some clients send Windows-style paths.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if not safe_name:
        raise HTTPException(status_code=400, detail="文件名不能为空")

    # Validate the file type.
    ext = os.path.splitext(safe_name)[1].lower()
    if ext not in (".pdf", ".docx", ".doc"):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {ext},仅支持PDF、DOCX、DOC"
        )

    # Validate the declared size up front when the client provided one.
    max_bytes = settings.max_file_size_mb * 1024 * 1024
    too_large_detail = f"文件过大,最大支持{settings.max_file_size_mb}MB"
    if file.size and file.size > max_bytes:
        raise HTTPException(status_code=400, detail=too_large_detail)

    doc_id = str(uuid.uuid4())[:8]
    final_doc_name = doc_name or safe_name
    object_name = f"{doc_id}/{safe_name}"
    temp_path = os.path.join(tempfile.gettempdir(), f"{doc_id}_{safe_name}")

    logger.info(f"接收到文件上传: {final_doc_name}, 类型: {ext}, doc_id={doc_id}")

    try:
        content = await file.read()

        # file.size may be unset, so enforce the limit on the bytes that
        # were actually received as well.
        if len(content) > max_bytes:
            raise HTTPException(status_code=400, detail=too_large_detail)

        # The processor works on a file path, so spill the upload to disk.
        with open(temp_path, "wb") as f:
            f.write(content)
        logger.info(f"临时文件已保存到: {temp_path}")

        # Store the original file in MinIO. A failed upload is only logged;
        # processing continues from the local temp file.
        minio = get_minio_client()
        upload_success = minio.upload_bytes(
            data=content,
            object_name=object_name,
            content_type=minio._get_content_type(safe_name),
            metadata={
                "doc_id": doc_id  # pass ASCII-safe metadata only
            }
        )
        if upload_success:
            logger.success(f"文件已上传到MinIO: {object_name}")
        else:
            logger.warning("MinIO上传失败,仅使用本地临时文件")

        # Process with the same doc_id used for the MinIO object key.
        # NOTE(review): process() is called synchronously inside an async
        # handler; confirm whether it should run in a worker thread.
        processor = DocumentProcessor(generate_summary=generate_summary)
        try:
            result = processor.process(
                file_path=temp_path,
                doc_id=doc_id,
                doc_name=final_doc_name,
                regulation_type=regulation_type or "",
                version=version or ""
            )
        finally:
            # Release the processor even when processing raised.
            processor.close()

    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"文档处理失败: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"文档处理失败: {str(e)}"
        ) from e
    finally:
        # Always remove the temp file, on success and on failure alike.
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            pass  # the request failed before the file was written
        except OSError as cleanup_error:
            logger.warning(f"临时文件清理失败: {temp_path}, {cleanup_error}")

    if not result.success:
        raise HTTPException(
            status_code=500,
            detail=result.message
        )

    return DocumentUploadResponse(
        doc_id=result.doc_id,
        doc_name=result.doc_name,
        status="success",
        message=result.message,
        num_chunks=result.num_chunks,
        summary=result.summary,
        summary_latency_ms=result.summary_latency_ms
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get("/status/{doc_id}", response_model=DocumentUploadResponse)
async def get_document_status(doc_id: str):
    """Report the processing status of a document (placeholder).

    Args:
        doc_id: ID of the document to look up.

    Returns:
        A ``DocumentUploadResponse`` that echoes ``doc_id`` with an empty
        name and the status "unknown"; no real lookup is performed yet.
    """
    # TODO: implement the status lookup (requires database support).
    placeholder = {
        "doc_id": doc_id,
        "doc_name": "",
        "status": "unknown",
        "message": "状态查询功能待实现",
    }
    return DocumentUploadResponse(**placeholder)
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get("/download/{doc_id}")
async def download_document(doc_id: str):
    """Download a stored document from MinIO.

    Args:
        doc_id: ID of the document to download.

    Returns:
        A ``StreamingResponse`` carrying the file bytes as an attachment,
        with the file name percent-encoded in ``Content-Disposition``.

    Raises:
        HTTPException: 404 when no object exists under ``doc_id``; 500 when
            the object data cannot be fetched or any other error occurs.
    """
    logger.info(f"请求下载文档: doc_id={doc_id}")

    try:
        storage = get_minio_client()

        # Objects are keyed as "{doc_id}/{filename}", so search by prefix.
        matches = storage.list_objects(prefix=f"{doc_id}/")
        if not matches:
            logger.warning(f"MinIO中未找到文档: doc_id={doc_id}")
            raise HTTPException(
                status_code=404,
                detail=f"文档不存在: doc_id={doc_id}",
            )

        # Serve the first object found under the prefix.
        object_name = matches[0]
        logger.info(f"找到MinIO对象: {object_name}")

        payload = storage.get_object_data(object_name)
        if payload is None:
            raise HTTPException(status_code=500, detail="获取文档数据失败")

        # Everything after the first "/" is the original file name; a key
        # without "/" is used as the name unchanged.
        original_name = object_name.split("/", 1)[-1]
        content_type = storage._get_content_type(original_name)

        logger.success(f"文档下载成功: {original_name}, 大小={len(payload)}")

        # Percent-encode the name so non-ASCII file names survive the header.
        disposition = f"attachment; filename*=UTF-8''{quote(original_name)}"
        return StreamingResponse(
            BytesIO(payload),
            media_type=content_type,
            headers={"Content-Disposition": disposition},
        )

    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"文档下载失败: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"文档下载失败: {str(e)}",
        )
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get("/list")
async def list_documents():
    """List every uploaded document found in MinIO.

    Returns:
        A dict with ``documents`` (the records) and ``total`` (their count).
        If building the listing raises, the error is logged and an empty
        listing with an additional ``error`` string is returned instead.
    """
    try:
        records = _build_document_records()
    except Exception as e:
        logger.error(f"列出文档失败: {e}")
        return {"documents": [], "total": 0, "error": str(e)}

    return {"documents": records, "total": len(records)}
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get("/management-list")
async def get_document_management_list():
    """Return the document management list: the 10 most recent documents.

    Returns:
        A dict with ``documents``, ``total`` and ``limit``. If building the
        listing raises, the error is logged and an empty listing with an
        additional ``error`` string is returned instead.
    """
    recent_limit = 10

    try:
        records = _build_document_records(limit=recent_limit)
    except Exception as e:
        logger.error(f"获取文档管理清单失败: {e}")
        return {"documents": [], "total": 0, "limit": recent_limit, "error": str(e)}

    return {"documents": records, "total": len(records), "limit": recent_limit}
|