Fix SSE route dependency and align architecture docs
This commit is contained in:
@@ -1,290 +1,140 @@
|
||||
"""文档上传与处理接口"""
|
||||
"""Define API routes for documents."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
|
||||
from fastapi.responses import FileResponse, StreamingResponse
|
||||
from typing import Optional
|
||||
import os
|
||||
import uuid
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from loguru import logger
|
||||
from io import BytesIO
|
||||
from urllib.parse import quote
|
||||
|
||||
from ..models import DocumentUploadResponse, ErrorResponse
|
||||
from app.services.document_processor import DocumentProcessor
|
||||
from app.services.storage.minio_client import MinIOClient
|
||||
from app.config.settings import settings
|
||||
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
||||
from fastapi.responses import StreamingResponse
|
||||
from loguru import logger
|
||||
|
||||
from app.api.models import DocumentUploadResponse
|
||||
from app.application.documents import DocumentProcessResult
|
||||
from app.shared.bootstrap import get_document_command_service, get_document_query_service
|
||||
# Keep route handlers close to their transport-layer wiring for easier auditing.
|
||||
|
||||
|
||||
router = APIRouter(prefix="/documents", tags=["documents"])
|
||||
|
||||
# MinIO客户端(用于文档存储)
|
||||
minio_client: Optional[MinIOClient] = None
|
||||
|
||||
|
||||
def get_minio_client() -> MinIOClient:
    """Return the shared MinIO client, creating it on first use.

    The client is stored in the module-level ``minio_client`` variable so
    later calls reuse the same instance.

    Returns:
        The cached ``MinIOClient`` after ``connect()`` and
        ``ensure_bucket()`` have been called on it.
    """
    global minio_client
    if minio_client is None:
        # Set up a local instance first and publish it only once setup has
        # finished: if connect() or ensure_bucket() raises, the cache stays
        # empty and the next call retries instead of returning a client that
        # was never connected.
        client = MinIOClient()
        client.connect()
        client.ensure_bucket()
        minio_client = client
    return minio_client
|
||||
|
||||
|
||||
def _build_document_records(limit: Optional[int] = None):
    """Build the document listing from the objects stored in MinIO.

    Every object whose name contains a ``/`` is split at the first ``/`` into
    a document id and a file name; objects without a ``/`` are skipped.
    Records are ordered by ``last_modified`` descending (objects without a
    timestamp sort as 0) and optionally truncated.

    Args:
        limit: Maximum number of records to return; ``None`` returns all.

    Returns:
        A list of dicts with the keys ``doc_id``, ``filename``, ``size``,
        ``object_name``, ``download_url`` and ``last_modified``.
    """
    storage = get_minio_client()

    # Keep the sort key next to each record instead of inside it, so nothing
    # has to be stripped from the records afterwards.
    keyed_records = []
    for entry in storage.client.list_objects(storage.bucket, recursive=True):
        doc_id, separator, filename = entry.object_name.partition("/")
        if not separator:
            continue

        modified_at = getattr(entry, "last_modified", None)
        record = {
            "doc_id": doc_id,
            "filename": filename,
            "size": getattr(entry, "size", 0) or 0,
            "object_name": entry.object_name,
            "download_url": f"/api/v1/documents/download/{doc_id}",
            "last_modified": modified_at.isoformat() if modified_at else None,
        }
        sort_key = modified_at.timestamp() if modified_at else 0
        keyed_records.append((sort_key, record))

    # Sort on the key only; the sort is stable, so ties keep listing order.
    keyed_records.sort(key=lambda pair: pair[0], reverse=True)
    newest_first = [record for _, record in keyed_records]

    if limit is None:
        return newest_first
    return newest_first[:limit]
|
||||
def _document_response(result: DocumentProcessResult) -> DocumentUploadResponse:
    """Convert a document processing result into the upload response model.

    The fields listed below are copied one-to-one from ``result`` onto a new
    ``DocumentUploadResponse``.

    Args:
        result: Outcome returned by the document command service.

    Returns:
        The response model populated from ``result``.
    """
    copied_fields = (
        "doc_id",
        "doc_name",
        "status",
        "message",
        "num_chunks",
        "summary",
        "summary_latency_ms",
    )
    payload = {name: getattr(result, name) for name in copied_fields}
    return DocumentUploadResponse(**payload)
|
||||
|
||||
|
||||
@router.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(
    file: UploadFile = File(..., description="上传的文档文件"),
    doc_name: str | None = Form(None, description="文档名称"),
    regulation_type: str | None = Form(None, description="法规类型"),
    version: str | None = Form(None, description="文档版本"),
    generate_summary: bool = Form(False, description="是否生成摘要"),
):
    """Accept an uploaded document and pass it to the document command service.

    The handler reads the whole upload into memory and forwards it, together
    with the optional form fields, to ``upload_and_process`` on the service
    returned by ``get_document_command_service()``.

    Args:
        file: The uploaded file.
        doc_name: Optional display name, forwarded unchanged.
        regulation_type: Optional regulation type; ``None`` is sent as ``""``.
        version: Optional document version; ``None`` is sent as ``""``.
        generate_summary: Whether a summary is requested; forwarded unchanged.

    Returns:
        A ``DocumentUploadResponse`` built from the service result.

    Raises:
        HTTPException: 400 when the file name or the content is empty; 500
            when the service reports ``status == "failed"`` or raises.
    """
    # NOTE(review): no file-type or size validation happens in this handler;
    # presumably the command service enforces it. Confirm.

    # Check the name before reading, so a nameless upload is rejected without
    # pulling its body into memory.
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名不能为空")
    content = await file.read()
    if not content:
        raise HTTPException(status_code=400, detail="上传文件为空")

    try:
        result = get_document_command_service().upload_and_process(
            file_name=file.filename,
            content=content,
            content_type=file.content_type or "application/octet-stream",
            doc_name=doc_name,
            regulation_type=regulation_type or "",
            version=version or "",
            generate_summary=generate_summary,
        )
        if result.status == "failed":
            raise HTTPException(status_code=500, detail=result.message)
        return _document_response(result)
    except HTTPException:
        # Let deliberate HTTP errors through instead of rewrapping them below.
        raise
    except Exception as exc:
        logger.exception("文档上传失败")
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
||||
|
||||
|
||||
@router.get("/status/{doc_id}", response_model=DocumentUploadResponse)
async def get_document_status(doc_id: str):
    """Return the stored processing status of one document.

    Args:
        doc_id: Identifier of the document to look up.

    Returns:
        A ``DocumentUploadResponse`` populated from the record returned by the
        document query service. ``message`` is the record's ``error_message``
        when that is set, otherwise a fixed success text.

    Raises:
        HTTPException: 404 when the query service returns no record.
    """
    document = get_document_query_service().get(doc_id)
    if not document:
        raise HTTPException(status_code=404, detail="文档不存在")

    return DocumentUploadResponse(
        doc_id=document.doc_id,
        doc_name=document.doc_name,
        status=document.status.value,
        message=document.error_message or "查询成功",
        num_chunks=document.chunk_count,
        summary=document.summary,
        summary_latency_ms=document.summary_latency_ms,
    )
|
||||
|
||||
|
||||
@router.get("/download/{doc_id}")
async def download_document(doc_id: str):
    """Return a stored document as a file download.

    The document record and its content come from ``download`` on the
    document query service. The content is wrapped in a ``BytesIO``, so the
    whole file is held in memory while it is sent.

    Args:
        doc_id: Identifier of the document to download.

    Returns:
        A ``StreamingResponse`` with an ``attachment`` Content-Disposition.

    Raises:
        HTTPException: 404 when the service raises ``FileNotFoundError``;
            500 for any other failure.
    """
    try:
        document, file_data = get_document_query_service().download(doc_id)
        # Percent-encode every reserved character, including "/", so the
        # value is valid in the RFC 5987 ``filename*`` parameter and
        # non-ASCII names survive.
        encoded_name = quote(document.file_name, safe="")
        return StreamingResponse(
            BytesIO(file_data),
            media_type=document.content_type or "application/octet-stream",
            headers={"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_name}"},
        )
    except FileNotFoundError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    except Exception as exc:
        logger.exception("文档下载失败")
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
||||
|
||||
|
||||
@router.get("/list")
async def list_documents():
    """List the documents known to the document query service.

    Returns:
        A dict with ``documents`` (one entry per document holding ``doc_id``,
        ``doc_name``, ``status``, ``chunk_count`` and an ISO-formatted
        ``updated_at``) and ``total``, the number of entries returned.
    """
    documents = get_document_query_service().list_documents()
    return {
        "documents": [
            {
                "doc_id": item.doc_id,
                "doc_name": item.doc_name,
                "status": item.status.value,
                "chunk_count": item.chunk_count,
                "updated_at": item.updated_at.isoformat(),
            }
            for item in documents
        ],
        "total": len(documents),
    }
|
||||
|
||||
|
||||
@router.get("/management-list")
async def get_document_management_list():
    """Return the document list for the management view, capped at 10 entries.

    The cap is passed to the query service as ``limit=10`` and repeated in
    the response as ``limit``.

    Returns:
        A dict with ``documents`` (one entry per document holding ``doc_id``,
        ``doc_name``, ``status``, ``chunk_count`` and an ISO-formatted
        ``updated_at``), ``total`` (the number of entries returned) and
        ``limit``.
    """
    documents = get_document_query_service().list_documents(limit=10)
    return {
        "documents": [
            {
                "doc_id": item.doc_id,
                "doc_name": item.doc_name,
                "status": item.status.value,
                "chunk_count": item.chunk_count,
                "updated_at": item.updated_at.isoformat(),
            }
            for item in documents
        ],
        "total": len(documents),
        "limit": 10,
    }
|
||||
|
||||
Reference in New Issue
Block a user