291 lines
8.9 KiB
Python
291 lines
8.9 KiB
Python
"""文档上传与处理接口"""
|
||
|
||
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
|
||
from fastapi.responses import FileResponse, StreamingResponse
|
||
from typing import Optional
|
||
import os
|
||
import uuid
|
||
import tempfile
|
||
from pathlib import Path
|
||
from loguru import logger
|
||
from io import BytesIO
|
||
from urllib.parse import quote
|
||
|
||
from ..models import DocumentUploadResponse, ErrorResponse
|
||
from app.services.document_processor import DocumentProcessor
|
||
from app.services.storage.minio_client import MinIOClient
|
||
from app.config.settings import settings
|
||
|
||
# Router for the document endpoints; every route declared on it is served under the /documents prefix.
router = APIRouter(prefix="/documents", tags=["documents"])
|
||
|
||
# MinIO client (used for document storage)
|
||
minio_client: Optional[MinIOClient] = None  # stays None until get_minio_client() creates the instance
|
||
|
||
|
||
def get_minio_client() -> MinIOClient:
    """Return the shared MinIO client, creating and connecting it on first use.

    The module-level ``minio_client`` is assigned only after ``connect()`` and
    ``ensure_bucket()`` have both returned. If either of them raises, nothing
    is cached and the next call retries the whole initialisation instead of
    handing out a client that was never connected.

    Returns:
        The shared ``MinIOClient`` instance.

    Raises:
        Whatever ``MinIOClient()``, ``connect()`` or ``ensure_bucket()`` raise;
        the exception is propagated unchanged.
    """
    global minio_client
    if minio_client is None:
        client = MinIOClient()
        client.connect()
        client.ensure_bucket()
        # Publish the instance only once it is fully set up.
        minio_client = client
    return minio_client
|
||
|
||
|
||
def _build_document_records(limit: Optional[int] = None):
    """Build the document listing from the MinIO bucket, newest first.

    Every object in the bucket is listed recursively. Object names are
    expected to look like ``<doc_id>/<filename>``; names without a ``/`` are
    skipped.

    Args:
        limit: When not ``None``, the sorted list is sliced to ``[:limit]``.

    Returns:
        A list of dicts with the keys ``doc_id``, ``filename``, ``size``,
        ``object_name``, ``download_url`` and ``last_modified`` (ISO-8601
        string, or ``None`` when the object carries no modification time),
        ordered by modification time descending.
    """
    storage = get_minio_client()

    # Pair each record with its numeric sort key instead of storing the key
    # inside the record itself.
    keyed_records = []
    for entry in storage.client.list_objects(storage.bucket, recursive=True):
        doc_id, separator, filename = entry.object_name.partition("/")
        if not separator:
            continue

        modified_at = getattr(entry, "last_modified", None)
        if modified_at:
            modified_text = modified_at.isoformat()
            sort_key = modified_at.timestamp()
        else:
            # Objects without a timestamp sort after every dated object.
            modified_text = None
            sort_key = 0

        record = {
            "doc_id": doc_id,
            "filename": filename,
            "size": getattr(entry, "size", 0) or 0,
            "object_name": entry.object_name,
            "download_url": f"/api/v1/documents/download/{doc_id}",
            "last_modified": modified_text,
        }
        keyed_records.append((sort_key, record))

    # Stable sort on the key only, so ties keep their listing order.
    keyed_records.sort(key=lambda pair: pair[0], reverse=True)
    if limit is not None:
        keyed_records = keyed_records[:limit]

    return [record for _, record in keyed_records]
|
||
|
||
|
||
@router.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(
    file: UploadFile = File(..., description="上传的文档文件"),
    doc_name: Optional[str] = Form(None, description="文档名称"),
    regulation_type: Optional[str] = Form(None, description="法规类型"),
    version: Optional[str] = Form(None, description="文档版本"),
    generate_summary: bool = Form(False, description="是否生成摘要(默认不生成,可节省约60秒)")
):
    """
    Upload a document, store it in MinIO and run it through the processor.

    Supported formats: PDF, DOCX, DOC.
    Pipeline: parse -> chunk -> embed -> index (summary is optional).
    File storage: MinIO object storage, under ``{doc_id}/{filename}``.

    Args:
        file: The uploaded file. Only the base name of the client-supplied
            filename is used; directory components are dropped.
        doc_name: Display name of the document; defaults to the file name.
        regulation_type: Regulation type, passed to the processor ("" if omitted).
        version: Document version, passed to the processor ("" if omitted).
        generate_summary: Whether to generate an LLM summary. Defaults to
            False; enabling it adds roughly 60 seconds of processing time.

    Returns:
        A ``DocumentUploadResponse`` with status "success" and the processing
        result (chunk count, optional summary and its latency).

    Raises:
        HTTPException: 400 for a missing file name, an unsupported extension
            or an oversized file; 500 when storing or processing fails.
    """
    # Keep only the base name so a crafted filename cannot inject path
    # components into the temp path or the object key.
    raw_name = file.filename or ""
    safe_name = os.path.basename(raw_name.replace("\\", "/"))
    if not safe_name:
        raise HTTPException(status_code=400, detail="文件名不能为空")

    # Validate the file type.
    ext = os.path.splitext(safe_name)[1].lower()
    if ext not in [".pdf", ".docx", ".doc"]:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {ext},仅支持PDF、DOCX、DOC"
        )

    # Validate the declared file size (it may be absent; the real length is
    # checked again after reading).
    max_bytes = settings.max_file_size_mb * 1024 * 1024
    if file.size and file.size > max_bytes:
        raise HTTPException(
            status_code=400,
            detail=f"文件过大,最大支持{settings.max_file_size_mb}MB"
        )

    # Generate the document ID.
    doc_id = str(uuid.uuid4())[:8]

    # Document name.
    final_doc_name = doc_name or safe_name

    # MinIO object name.
    object_name = f"{doc_id}/{safe_name}"

    logger.info(f"接收到文件上传: {final_doc_name}, 类型: {ext}, doc_id={doc_id}")

    temp_path = None
    try:
        # Read the file content.
        content = await file.read()
        if len(content) > max_bytes:
            raise HTTPException(
                status_code=400,
                detail=f"文件过大,最大支持{settings.max_file_size_mb}MB"
            )

        # Save a temporary copy for the processor to read.
        temp_path = os.path.join(tempfile.gettempdir(), f"{doc_id}_{safe_name}")
        with open(temp_path, "wb") as f:
            f.write(content)

        logger.info(f"临时文件已保存到: {temp_path}")

        # Upload to MinIO.
        minio = get_minio_client()
        upload_success = minio.upload_bytes(
            data=content,
            object_name=object_name,
            content_type=minio._get_content_type(safe_name),
            metadata={
                "doc_id": doc_id  # only ASCII-safe metadata is passed
            }
        )

        if upload_success:
            logger.success(f"文件已上传到MinIO: {object_name}")
        else:
            logger.warning("MinIO上传失败,仅使用本地临时文件")

        # Process the document with the same doc_id so storage and index agree.
        # NOTE(review): process() is called synchronously inside an async
        # handler; if it blocks, it stalls the event loop - consider a threadpool.
        processor = DocumentProcessor(generate_summary=generate_summary)
        try:
            result = processor.process(
                file_path=temp_path,
                doc_id=doc_id,
                doc_name=final_doc_name,
                regulation_type=regulation_type or "",
                version=version or ""
            )
        finally:
            # Release the processor even when processing raised.
            processor.close()

    except HTTPException:
        # Let deliberate HTTP errors through instead of re-wrapping them.
        raise
    except Exception as e:
        logger.error(f"文档处理失败: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"文档处理失败: {str(e)}"
        )
    finally:
        # Always remove the temporary file, on success and on failure.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                logger.warning(f"清理临时文件失败: {temp_path}, {cleanup_error}")

    if not result.success:
        raise HTTPException(
            status_code=500,
            detail=result.message
        )

    return DocumentUploadResponse(
        doc_id=result.doc_id,
        doc_name=result.doc_name,
        status="success",
        message=result.message,
        num_chunks=result.num_chunks,
        summary=result.summary,
        summary_latency_ms=result.summary_latency_ms
    )
|
||
|
||
|
||
@router.get("/status/{doc_id}", response_model=DocumentUploadResponse)
async def get_document_status(doc_id: str):
    """
    Report the processing status of a document.

    Status tracking is not implemented yet, so this always answers with a
    placeholder response whose status is "unknown".

    Args:
        doc_id: Document ID, echoed back in the response.

    Returns:
        A ``DocumentUploadResponse`` placeholder.
    """
    # TODO: implement the real status lookup (needs database support)
    placeholder_fields = {
        "doc_id": doc_id,
        "doc_name": "",
        "status": "unknown",
        "message": "状态查询功能待实现",
    }
    return DocumentUploadResponse(**placeholder_fields)
|
||
|
||
|
||
@router.get("/download/{doc_id}")
async def download_document(doc_id: str):
    """
    Download a document by fetching it from MinIO.

    Args:
        doc_id: Document ID; objects are looked up under the ``{doc_id}/`` prefix.

    Returns:
        A ``StreamingResponse`` carrying the file bytes as an attachment.

    Raises:
        HTTPException: 404 when no object exists under the prefix; 500 when
            the object data cannot be fetched or any other error occurs.
    """
    logger.info(f"请求下载文档: doc_id={doc_id}")

    try:
        storage = get_minio_client()

        # Object names follow the pattern {doc_id}/{filename}.
        matches = storage.list_objects(prefix=f"{doc_id}/")
        if not matches:
            logger.warning(f"MinIO中未找到文档: doc_id={doc_id}")
            raise HTTPException(
                status_code=404,
                detail=f"文档不存在: doc_id={doc_id}",
            )

        # Only the first match is served.
        object_key = matches[0]
        logger.info(f"找到MinIO对象: {object_key}")

        payload = storage.get_object_data(object_key)
        if payload is None:
            raise HTTPException(status_code=500, detail="获取文档数据失败")

        # Everything after the first "/" is the original file name.
        _, separator, remainder = object_key.partition("/")
        original_name = remainder if separator else object_key

        media_type = storage._get_content_type(original_name)

        logger.success(f"文档下载成功: {original_name}, 大小={len(payload)}")

        # Percent-encode the name so non-ASCII file names survive the header.
        disposition = f"attachment; filename*=UTF-8''{quote(original_name)}"
        return StreamingResponse(
            BytesIO(payload),
            media_type=media_type,
            headers={"Content-Disposition": disposition},
        )

    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"文档下载失败: {exc}")
        raise HTTPException(
            status_code=500,
            detail=f"文档下载失败: {exc!s}",
        )
|
||
|
||
|
||
@router.get("/list")
async def list_documents():
    """
    List every uploaded document found in MinIO.

    Returns:
        A dict with ``documents`` and ``total``. If building the list fails,
        the error is logged and an empty list is returned together with an
        ``error`` message instead of raising.
    """
    try:
        records = _build_document_records()
    except Exception as exc:
        logger.error(f"列出文档失败: {exc}")
        return {"documents": [], "total": 0, "error": str(exc)}
    else:
        return {"documents": records, "total": len(records)}
|
||
|
||
|
||
@router.get("/management-list")
async def get_document_management_list():
    """
    Document management listing: returns only the 10 most recent documents.

    Returns:
        A dict with ``documents``, ``total`` and ``limit``. If building the
        list fails, the error is logged and an empty list is returned together
        with an ``error`` message instead of raising.
    """
    try:
        recent = _build_document_records(limit=10)
    except Exception as exc:
        logger.error(f"获取文档管理清单失败: {exc}")
        return {"documents": [], "total": 0, "limit": 10, "error": str(exc)}
    else:
        return {"documents": recent, "total": len(recent), "limit": 10}
|