Files
AIRegulation-DocAnalysis/backend/app/api/routes/documents.py

291 lines
8.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""文档上传与处理接口"""
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
from fastapi.responses import FileResponse, StreamingResponse
from typing import Optional
import os
import uuid
import tempfile
from pathlib import Path
from loguru import logger
from io import BytesIO
from urllib.parse import quote
from ..models import DocumentUploadResponse, ErrorResponse
from app.services.document_processor import DocumentProcessor
from app.services.storage.minio_client import MinIOClient
from app.config.settings import settings
router = APIRouter(prefix="/documents", tags=["documents"])
# Module-wide MinIO client used for document storage. It starts as None and is
# created on first use by get_minio_client() below.
minio_client: Optional[MinIOClient] = None
def get_minio_client() -> MinIOClient:
    """Return the shared MinIO client, creating it on first use.

    The new client is stored in the module-level ``minio_client`` only after
    ``connect()`` and ``ensure_bucket()`` have both returned. If either call
    raises, nothing is cached, so the next call retries the whole setup
    instead of handing out a half-initialised client.

    Returns:
        The module-wide ``MinIOClient`` instance.

    Raises:
        Any exception raised by ``MinIOClient()``, ``connect()`` or
        ``ensure_bucket()`` propagates unchanged.
    """
    global minio_client
    if minio_client is None:
        # Build on a local name first; publish only once setup has succeeded.
        client = MinIOClient()
        client.connect()
        client.ensure_bucket()
        minio_client = client
    return minio_client
def _build_document_records(limit: Optional[int] = None):
    """Build the document listing from the objects stored in MinIO.

    Every object whose name contains a "/" is treated as
    ``"{doc_id}/{filename}"``; names without a "/" are skipped. Records are
    ordered by ``last_modified`` descending (objects without a timestamp sort
    as 0) and, when ``limit`` is not None, cut with ``[:limit]``.

    Args:
        limit: Maximum number of records to keep, or None for all of them.

    Returns:
        A list of dicts with the keys ``doc_id``, ``filename``, ``size``,
        ``object_name``, ``download_url`` and ``last_modified``.
    """
    storage = get_minio_client()
    keyed_records = []
    for entry in storage.client.list_objects(storage.bucket, recursive=True):
        doc_id, slash, filename = entry.object_name.partition("/")
        if not slash:
            # No "/" in the object name, so it cannot be split into id + file.
            continue
        modified_at = getattr(entry, "last_modified", None)
        record = {
            "doc_id": doc_id,
            "filename": filename,
            "size": getattr(entry, "size", 0) or 0,
            "object_name": entry.object_name,
            "download_url": f"/api/v1/documents/download/{doc_id}",
            "last_modified": modified_at.isoformat() if modified_at else None,
        }
        # Keep the sort key beside the record so it never leaks into the output.
        sort_key = modified_at.timestamp() if modified_at else 0
        keyed_records.append((sort_key, record))

    keyed_records.sort(key=lambda pair: pair[0], reverse=True)
    ordered = [record for _, record in keyed_records]
    return ordered if limit is None else ordered[:limit]
@router.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(
    file: UploadFile = File(..., description="上传的文档文件"),
    doc_name: Optional[str] = Form(None, description="文档名称"),
    regulation_type: Optional[str] = Form(None, description="法规类型"),
    version: Optional[str] = Form(None, description="文档版本"),
    generate_summary: bool = Form(False, description="是否生成摘要默认不生成可节省约60秒")
):
    """Upload a document, store it in MinIO and run it through processing.

    Supported formats: PDF, DOCX, DOC.
    Pipeline: parse -> chunk -> embed -> store (summary is optional).
    The raw file is stored in MinIO object storage under
    ``"{doc_id}/{filename}"``.

    Args:
        file: The uploaded document.
        doc_name: Display name; falls back to the uploaded file name.
        regulation_type: Regulation type, passed to the processor ("" if unset).
        version: Document version, passed to the processor ("" if unset).
        generate_summary: Whether to generate an LLM summary. Defaults to
            False; enabling it adds roughly 60 seconds of processing time.

    Returns:
        A ``DocumentUploadResponse`` describing the processed document.

    Raises:
        HTTPException: 400 for an unsupported extension or an oversized file,
            500 when storage or processing fails.
    """
    # The file name comes from the client. Keep only its final path component
    # so it cannot escape the temp directory or add extra levels to the object
    # key. Backslashes are normalised first because some clients send Windows
    # style paths. A missing file name becomes "" and is rejected below.
    safe_filename = os.path.basename((file.filename or "").replace("\\", "/"))

    # Validate the file type.
    ext = os.path.splitext(safe_filename)[1].lower()
    if ext not in (".pdf", ".docx", ".doc"):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {ext}仅支持PDF、DOCX、DOC"
        )

    # Validate the declared size up front (it may be absent).
    max_bytes = settings.max_file_size_mb * 1024 * 1024
    if file.size and file.size > max_bytes:
        raise HTTPException(
            status_code=400,
            detail=f"文件过大,最大支持{settings.max_file_size_mb}MB"
        )

    doc_id = str(uuid.uuid4())[:8]
    final_doc_name = doc_name or safe_filename
    object_name = f"{doc_id}/{safe_filename}"
    temp_path = os.path.join(tempfile.gettempdir(), f"{doc_id}_{safe_filename}")
    logger.info(f"接收到文件上传: {final_doc_name}, 类型: {ext}, doc_id={doc_id}")

    try:
        content = await file.read()
        # Enforce the limit on the real payload too, since the declared size
        # above is skipped when the client did not provide one.
        if len(content) > max_bytes:
            raise HTTPException(
                status_code=400,
                detail=f"文件过大,最大支持{settings.max_file_size_mb}MB"
            )

        # The processor works on a file path, so spill the bytes to a temp file.
        with open(temp_path, "wb") as f:
            f.write(content)
        logger.info(f"临时文件已保存到: {temp_path}")

        # Upload the original bytes to MinIO.
        minio = get_minio_client()
        upload_success = minio.upload_bytes(
            data=content,
            object_name=object_name,
            content_type=minio._get_content_type(safe_filename),
            metadata={
                "doc_id": doc_id  # only ASCII-safe metadata is passed
            }
        )
        if upload_success:
            logger.success(f"文件已上传到MinIO: {object_name}")
        else:
            logger.warning("MinIO上传失败仅使用本地临时文件")

        # Process the document with the same doc_id used for the object key.
        processor = DocumentProcessor(generate_summary=generate_summary)
        try:
            result = processor.process(
                file_path=temp_path,
                doc_id=doc_id,
                doc_name=final_doc_name,
                regulation_type=regulation_type or "",
                version=version or ""
            )
        finally:
            # Release processor resources even when process() raises.
            processor.close()

        if not result.success:
            raise HTTPException(
                status_code=500,
                detail=result.message
            )
        return DocumentUploadResponse(
            doc_id=result.doc_id,
            doc_name=result.doc_name,
            status="success",
            message=result.message,
            num_chunks=result.num_chunks,
            summary=result.summary,
            summary_latency_ms=result.summary_latency_ms
        )
    except HTTPException:
        # Deliberate HTTP errors keep their own status code and detail.
        raise
    except Exception as e:
        logger.error(f"文档处理失败: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"文档处理失败: {str(e)}"
        ) from e
    finally:
        # Best-effort temp file cleanup on every exit path.
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            pass
        except OSError as cleanup_err:
            logger.warning(f"临时文件清理失败: {temp_path}: {cleanup_err}")
@router.get("/status/{doc_id}", response_model=DocumentUploadResponse)
async def get_document_status(doc_id: str):
    """Report the processing status of a document (placeholder).

    Args:
        doc_id: Identifier of the document being queried.

    Returns:
        A ``DocumentUploadResponse`` that echoes ``doc_id`` with status
        ``"unknown"``; no real lookup is performed yet.
    """
    # TODO: implement the status lookup (requires database support).
    placeholder_fields = {
        "doc_id": doc_id,
        "doc_name": "",
        "status": "unknown",
        "message": "状态查询功能待实现",
    }
    return DocumentUploadResponse(**placeholder_fields)
@router.get("/download/{doc_id}")
async def download_document(doc_id: str):
    """Stream a stored document back to the caller as an attachment.

    Args:
        doc_id: Identifier of the document to download.

    Returns:
        A ``StreamingResponse`` carrying the bytes of the first object found
        under the ``"{doc_id}/"`` prefix.

    Raises:
        HTTPException: 404 when no object exists for ``doc_id``, 500 when the
            object data cannot be fetched or any other error occurs.
    """
    logger.info(f"请求下载文档: doc_id={doc_id}")
    try:
        storage = get_minio_client()

        # Objects are named "{doc_id}/{filename}", so look them up by prefix.
        matches = storage.list_objects(prefix=f"{doc_id}/")
        if not matches:
            logger.warning(f"MinIO中未找到文档: doc_id={doc_id}")
            raise HTTPException(
                status_code=404,
                detail=f"文档不存在: doc_id={doc_id}"
            )

        # Only the first matching object is served.
        stored_key = matches[0]
        logger.info(f"找到MinIO对象: {stored_key}")

        payload = storage.get_object_data(stored_key)
        if payload is None:
            raise HTTPException(
                status_code=500,
                detail="获取文档数据失败"
            )

        # Everything after the first "/" is the original file name.
        _, slash, remainder = stored_key.partition("/")
        original_name = remainder if slash else stored_key

        mime_type = storage._get_content_type(original_name)
        logger.success(f"文档下载成功: {original_name}, 大小={len(payload)}")

        # Percent-encode the name so non-ASCII file names survive the header.
        disposition = f"attachment; filename*=UTF-8''{quote(original_name)}"
        return StreamingResponse(
            BytesIO(payload),
            media_type=mime_type,
            headers={"Content-Disposition": disposition},
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"文档下载失败: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"文档下载失败: {str(e)}"
        )
@router.get("/list")
async def list_documents():
    """List every uploaded document found in MinIO.

    Returns:
        ``{"documents": [...], "total": n}`` on success. On failure the error
        is logged and an empty listing with an extra ``"error"`` string is
        returned instead of raising.
    """
    try:
        records = _build_document_records()
    except Exception as exc:
        logger.error(f"列出文档失败: {exc}")
        return {"documents": [], "total": 0, "error": str(exc)}
    return {"documents": records, "total": len(records)}
@router.get("/management-list")
async def get_document_management_list():
    """Return the document-management listing: the 10 most recent documents.

    Returns:
        ``{"documents": [...], "total": n, "limit": 10}`` on success, where
        ``total`` is the number of records returned. On failure the error is
        logged and an empty listing with an extra ``"error"`` string is
        returned instead of raising.
    """
    max_items = 10
    try:
        recent = _build_document_records(limit=max_items)
    except Exception as exc:
        logger.error(f"获取文档管理清单失败: {exc}")
        return {"documents": [], "total": 0, "limit": max_items, "error": str(exc)}
    return {"documents": recent, "total": len(recent), "limit": max_items}