Fix SSE route dependency and align architecture docs

2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions
--- a/backend/app/api/routes/documents.py
+++ b/backend/app/api/routes/documents.py
@@ -1,290 +1,140 @@
-"""文档上传与处理接口"""
+"""Define API routes for documents."""
+
+from __future__ import annotations

-from fastapi import APIRouter, UploadFile, File, Form, HTTPException
-from fastapi.responses import FileResponse, StreamingResponse
-from typing import Optional
-import os
-import uuid
-import tempfile
-from pathlib import Path
-from loguru import logger
 from io import BytesIO
 from urllib.parse import quote

-from ..models import DocumentUploadResponse, ErrorResponse
-from app.services.document_processor import DocumentProcessor
-from app.services.storage.minio_client import MinIOClient
-from app.config.settings import settings
+from fastapi import APIRouter, File, Form, HTTPException, UploadFile
+from fastapi.responses import StreamingResponse
+from loguru import logger
+
+from app.api.models import DocumentUploadResponse
+from app.application.documents import DocumentProcessResult
+from app.shared.bootstrap import get_document_command_service, get_document_query_service
+# Keep route handlers close to their transport-layer wiring for easier auditing.
+

 router = APIRouter(prefix="/documents", tags=["documents"])

-# MinIO客户端（用于文档存储）
-minio_client: Optional[MinIOClient] = None

-
-def get_minio_client() -> MinIOClient:
-    """获取MinIO客户端实例"""
-    global minio_client
-    if minio_client is None:
-        minio_client = MinIOClient()
-        minio_client.connect()
-        minio_client.ensure_bucket()
-    return minio_client
-
-
-def _build_document_records(limit: Optional[int] = None):
-    """构建文档列表记录，支持按最近更新时间倒序截断。"""
-    minio = get_minio_client()
-
-    document_records = []
-    objects = minio.client.list_objects(minio.bucket, recursive=True)
-    for obj in objects:
-        parts = obj.object_name.split("/", 1)
-        if len(parts) != 2:
-            continue
-
-        doc_id, filename = parts
-        last_modified = getattr(obj, "last_modified", None)
-        document_records.append({
-            "doc_id": doc_id,
-            "filename": filename,
-            "size": getattr(obj, "size", 0) or 0,
-            "object_name": obj.object_name,
-            "download_url": f"/api/v1/documents/download/{doc_id}",
-            "last_modified": last_modified.isoformat() if last_modified else None,
-            "_sort_key": last_modified.timestamp() if last_modified else 0,
-        })
-
-    document_records.sort(key=lambda item: item["_sort_key"], reverse=True)
-    if limit is not None:
-        document_records = document_records[:limit]
-
-    for item in document_records:
-        item.pop("_sort_key", None)
-
-    return document_records
+def _document_response(result: DocumentProcessResult) -> DocumentUploadResponse:
+    """Map a DocumentProcessResult onto the DocumentUploadResponse API model."""
+    return DocumentUploadResponse(
+        doc_id=result.doc_id,
+        doc_name=result.doc_name,
+        status=result.status,
+        message=result.message,
+        num_chunks=result.num_chunks,
+        summary=result.summary,
+        summary_latency_ms=result.summary_latency_ms,
+    )


@router.post("/upload", response_model=DocumentUploadResponse)
 async def upload_document(
    file: UploadFile = File(..., description="上传的文档文件"),
-    doc_name: Optional[str] = Form(None, description="文档名称"),
-    regulation_type: Optional[str] = Form(None, description="法规类型"),
-    version: Optional[str] = Form(None, description="文档版本"),
-    generate_summary: bool = Form(False, description="是否生成摘要（默认不生成，可节省约60秒）")
+    doc_name: str | None = Form(None, description="文档名称"),
+    regulation_type: str | None = Form(None, description="法规类型"),
+    version: str | None = Form(None, description="文档版本"),
+    generate_summary: bool = Form(False, description="是否生成摘要"),
 ):
-    """
-    上传文档并处理
-
-    支持格式：PDF、DOCX、DOC
-    处理流程：解析 → 分块 → 嵌入 → 入库（摘要可选）
-    文件存储：MinIO对象存储
-
-    参数说明：
-    - generate_summary: 是否生成LLM摘要，默认False。勾选后处理时间增加约60秒。
-    """
-    # 验证文件类型
-    ext = os.path.splitext(file.filename)[1].lower()
-    if ext not in [".pdf", ".docx", ".doc"]:
-        raise HTTPException(
-            status_code=400,
-            detail=f"不支持的文件类型: {ext}，仅支持PDF、DOCX、DOC"
-        )
-
-    # 验证文件大小
-    if file.size and file.size > settings.max_file_size_mb * 1024 * 1024:
-        raise HTTPException(
-            status_code=400,
-            detail=f"文件过大，最大支持{settings.max_file_size_mb}MB"
-        )
-
-    # 生成文档ID
-    doc_id = str(uuid.uuid4())[:8]
-
-    # 文档名称
-    final_doc_name = doc_name or file.filename
-
-    # MinIO对象名称
-    object_name = f"{doc_id}/{file.filename}"
-
-    logger.info(f"接收到文件上传: {final_doc_name}, 类型: {ext}, doc_id={doc_id}")
+    """Upload a document and process it via the document command service."""
+    content = await file.read()
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="文件名不能为空")
+    if not content:
+        raise HTTPException(status_code=400, detail="上传文件为空")

    try:
-        # 读取文件内容
-        content = await file.read()
-
-        # 保存临时文件用于处理
-        temp_dir = tempfile.gettempdir()
-        temp_path = os.path.join(temp_dir, f"{doc_id}_{file.filename}")
-
-        with open(temp_path, "wb") as f:
-            f.write(content)
-
-        logger.info(f"临时文件已保存到: {temp_path}")
-
-        # 上传到MinIO
-        minio = get_minio_client()
-        upload_success = minio.upload_bytes(
-            data=content,
-            object_name=object_name,
-            content_type=minio._get_content_type(file.filename),
-            metadata={
-                "doc_id": doc_id  # 仅传递ASCII安全的metadata
-            }
-        )
-
-        if upload_success:
-            logger.success(f"文件已上传到MinIO: {object_name}")
-        else:
-            logger.warning(f"MinIO上传失败，仅使用本地临时文件")
-
-        # 处理文档（传入相同的doc_id，保持一致性）
-        processor = DocumentProcessor(generate_summary=generate_summary)
-        result = processor.process(
-            file_path=temp_path,
-            doc_id=doc_id,  # 使用相同的doc_id
-            doc_name=final_doc_name,
+        result = get_document_command_service().upload_and_process(
+            file_name=file.filename,
+            content=content,
+            content_type=file.content_type or "application/octet-stream",
+            doc_name=doc_name,
            regulation_type=regulation_type or "",
-            version=version or ""
-        )
-        processor.close()
-
-        # 清理临时文件
-        try:
-            os.remove(temp_path)
-        except:
-            pass
-
-        if result.success:
-            return DocumentUploadResponse(
-                doc_id=result.doc_id,
-                doc_name=result.doc_name,
-                status="success",
-                message=result.message,
-                num_chunks=result.num_chunks,
-                summary=result.summary,
-                summary_latency_ms=result.summary_latency_ms
-            )
-        else:
-            raise HTTPException(
-                status_code=500,
-                detail=result.message
-            )
-
-    except Exception as e:
-        logger.error(f"文档处理失败: {e}")
-        raise HTTPException(
-            status_code=500,
-            detail=f"文档处理失败: {str(e)}"
+            version=version or "",
+            generate_summary=generate_summary,
        )
+        if result.status == "failed":
+            raise HTTPException(status_code=500, detail=result.message)
+        return _document_response(result)
+    except HTTPException:
+        raise
+    except Exception as exc:
+        logger.exception("文档上传失败")
+        raise HTTPException(status_code=500, detail=str(exc))


@router.get("/status/{doc_id}", response_model=DocumentUploadResponse)
 async def get_document_status(doc_id: str):
-    """
-    查询文档处理状态
-
-    Args:
-        doc_id: 文档ID
-    """
-    # TODO: 实现状态查询（需要数据库支持）
+    """Return the status of the document identified by doc_id, or 404 if it does not exist."""
+    document = get_document_query_service().get(doc_id)
+    if not document:
+        raise HTTPException(status_code=404, detail="文档不存在")
    return DocumentUploadResponse(
-        doc_id=doc_id,
-        doc_name="",
-        status="unknown",
-        message="状态查询功能待实现"
+        doc_id=document.doc_id,
+        doc_name=document.doc_name,
+        status=document.status.value,
+        message=document.error_message or "查询成功",
+        num_chunks=document.chunk_count,
+        summary=document.summary,
+        summary_latency_ms=document.summary_latency_ms,
    )


@router.get("/download/{doc_id}")
 async def download_document(doc_id: str):
-    """
-    下载文档（从MinIO获取）
-
-    Args:
-        doc_id: 文档ID
-
-    Returns:
-        文件下载响应
-    """
-    logger.info(f"请求下载文档: doc_id={doc_id}")
-
+    """Stream the document's file data to the client as an attachment download."""
    try:
-        minio = get_minio_client()
-
-        # 查找该doc_id下的文件（MinIO对象名称格式: {doc_id}/{filename}）
-        objects = minio.list_objects(prefix=f"{doc_id}/")
-
-        if not objects:
-            logger.warning(f"MinIO中未找到文档: doc_id={doc_id}")
-            raise HTTPException(
-                status_code=404,
-                detail=f"文档不存在: doc_id={doc_id}"
-            )
-
-        # 获取第一个匹配的对象
-        object_name = objects[0]
-        logger.info(f"找到MinIO对象: {object_name}")
-
-        # 获取文件数据
-        file_data = minio.get_object_data(object_name)
-        if file_data is None:
-            raise HTTPException(
-                status_code=500,
-                detail=f"获取文档数据失败"
-            )
-
-        # 解析原始文件名
-        original_name = object_name.split("/", 1)[1] if "/" in object_name else object_name
-
-        # 获取Content-Type
-        content_type = minio._get_content_type(original_name)
-
-        logger.success(f"文档下载成功: {original_name}, 大小={len(file_data)}")
-
-        # 返回文件流（URL编码文件名以支持中文）
-        encoded_name = quote(original_name)
+        document, file_data = get_document_query_service().download(doc_id)
+        encoded_name = quote(document.file_name)
        return StreamingResponse(
            BytesIO(file_data),
-            media_type=content_type,
-            headers={
-                "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_name}"
-            }
-        )
-
-    except HTTPException:
-        raise
-    except Exception as e:
-        logger.error(f"文档下载失败: {e}")
-        raise HTTPException(
-            status_code=500,
-            detail=f"文档下载失败: {str(e)}"
+            media_type=document.content_type or "application/octet-stream",
+            headers={"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_name}"},
        )
+    except FileNotFoundError as exc:
+        raise HTTPException(status_code=404, detail=str(exc))
+    except Exception as exc:
+        logger.exception("文档下载失败")
+        raise HTTPException(status_code=500, detail=str(exc))


@router.get("/list")
 async def list_documents():
-    """
-    列出所有已上传的文档（从MinIO获取）
-    """
-    try:
-        documents = _build_document_records()
-        return {"documents": documents, "total": len(documents)}
-
-    except Exception as e:
-        logger.error(f"列出文档失败: {e}")
-        return {"documents": [], "total": 0, "error": str(e)}
+    """List documents."""
+    documents = get_document_query_service().list_documents()
+    return {
+        "documents": [
+            {
+                "doc_id": item.doc_id,
+                "doc_name": item.doc_name,
+                "status": item.status.value,
+                "chunk_count": item.chunk_count,
+                "updated_at": item.updated_at.isoformat(),
+            }
+            for item in documents
+        ],
+        "total": len(documents),
+    }


@router.get("/management-list")
 async def get_document_management_list():
-    """
-    文档管理清单接口：仅返回最近的10条文档。
-    """
-    try:
-        documents = _build_document_records(limit=10)
-        return {"documents": documents, "total": len(documents), "limit": 10}
-
-    except Exception as e:
-        logger.error(f"获取文档管理清单失败: {e}")
-        return {"documents": [], "total": 0, "limit": 10, "error": str(e)}
+    """Return the document management list, capped at 10 documents."""
+    documents = get_document_query_service().list_documents(limit=10)
+    return {
+        "documents": [
+            {
+                "doc_id": item.doc_id,
+                "doc_name": item.doc_name,
+                "status": item.status.value,
+                "chunk_count": item.chunk_count,
+                "updated_at": item.updated_at.isoformat(),
+            }
+            for item in documents
+        ],
+        "total": len(documents),
+        "limit": 10,
+    }