AIRegulation-DocAnalysis/backend/app/api/routes/documents.py

"""文档上传与处理接口"""

from fastapi import APIRouter, UploadFile, File, Form, HTTPException
from fastapi.responses import FileResponse, StreamingResponse
from typing import Optional
import os
import uuid
import tempfile
from pathlib import Path
from loguru import logger
from io import BytesIO
from urllib.parse import quote

from ..models import DocumentUploadResponse, ErrorResponse
from app.services.document_processor import DocumentProcessor
from app.services.storage.minio_client import MinIOClient
from app.config.settings import settings

# Router that groups the document endpoints defined in this module; all of
# them are registered under the "/documents" prefix.
router = APIRouter(prefix="/documents", tags=["documents"])

# Module-level MinIO client used for document storage. It stays None until
# get_minio_client() creates it on first use.
minio_client: Optional[MinIOClient] = None


def get_minio_client() -> MinIOClient:
    """Return the shared MinIO client, creating it on first use.

    The module-level ``minio_client`` is only assigned after ``connect()``
    and ``ensure_bucket()`` have both returned. If either of them raises,
    nothing is cached and the next call retries the whole setup instead of
    handing out a client whose initialization never completed.

    Returns:
        The module-wide ``MinIOClient`` instance.
    """
    global minio_client
    if minio_client is None:
        client = MinIOClient()
        client.connect()
        client.ensure_bucket()
        # Publish the instance only once setup finished without raising.
        minio_client = client
    return minio_client


def _build_document_records(limit: Optional[int] = None):
    """Collect one record per stored object, most recently modified first.

    The bucket is listed recursively. Object names are split on their first
    "/" into ``doc_id`` and ``filename``; names without a "/" are skipped.

    Args:
        limit: When not None, only the first ``limit`` records (after
            sorting) are returned.

    Returns:
        A list of dicts with the keys ``doc_id``, ``filename``, ``size``,
        ``object_name``, ``download_url`` and ``last_modified`` (ISO string,
        or None when the object carries no modification time).
    """
    storage = get_minio_client()

    # Each entry is (sort_key, record) so the key never has to live inside
    # the record that is handed back to the caller.
    keyed_records = []
    for entry in storage.client.list_objects(storage.bucket, recursive=True):
        doc_id, separator, filename = entry.object_name.partition("/")
        if not separator:
            continue

        modified_at = getattr(entry, "last_modified", None)
        record = {
            "doc_id": doc_id,
            "filename": filename,
            "size": getattr(entry, "size", 0) or 0,
            "object_name": entry.object_name,
            "download_url": f"/api/v1/documents/download/{doc_id}",
            "last_modified": modified_at.isoformat() if modified_at else None,
        }
        # Objects without a modification time sort as timestamp 0.
        sort_key = modified_at.timestamp() if modified_at else 0
        keyed_records.append((sort_key, record))

    keyed_records.sort(key=lambda pair: pair[0], reverse=True)
    if limit is not None:
        keyed_records = keyed_records[:limit]

    return [record for _, record in keyed_records]


@router.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(
    file: UploadFile = File(..., description="上传的文档文件"),
    doc_name: Optional[str] = Form(None, description="文档名称"),
    regulation_type: Optional[str] = Form(None, description="法规类型"),
    version: Optional[str] = Form(None, description="文档版本"),
    generate_summary: bool = Form(False, description="是否生成摘要（默认不生成，可节省约60秒）")
):
    """
    Upload a document and process it.

    Supported formats: PDF, DOCX, DOC.
    Pipeline: parse -> chunk -> embed -> store (summary optional).
    File storage: MinIO object storage, under ``{doc_id}/{filename}``.

    Args:
        file: The uploaded document.
        doc_name: Display name; falls back to the uploaded file's name.
        regulation_type: Regulation type forwarded to the processor
            (empty string when omitted).
        version: Document version forwarded to the processor (empty string
            when omitted).
        generate_summary: Whether to generate an LLM summary. Defaults to
            False; enabling it adds roughly 60 seconds of processing time.

    Returns:
        A ``DocumentUploadResponse`` describing the processed document.

    Raises:
        HTTPException: 400 for an unsupported extension or an oversized
            file; 500 when reading, storing or processing the file fails or
            the processor reports an unsuccessful result.
    """
    # The client-supplied name is untrusted: keep only its final path
    # component so it cannot escape the temp directory or add extra levels to
    # the MinIO object key. Backslashes are normalised first because
    # os.path.basename does not treat them as separators on POSIX. A missing
    # filename becomes "" and is rejected by the extension check below.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))

    # Validate the file type.
    ext = os.path.splitext(safe_name)[1].lower()
    if ext not in [".pdf", ".docx", ".doc"]:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {ext}，仅支持PDF、DOCX、DOC"
        )

    # Validate the declared file size (may be absent; re-checked after read).
    max_bytes = settings.max_file_size_mb * 1024 * 1024
    if file.size and file.size > max_bytes:
        raise HTTPException(
            status_code=400,
            detail=f"文件过大，最大支持{settings.max_file_size_mb}MB"
        )

    # Short document ID shared by the MinIO key, temp file and processor.
    doc_id = str(uuid.uuid4())[:8]
    final_doc_name = doc_name or safe_name
    object_name = f"{doc_id}/{safe_name}"
    temp_path = os.path.join(tempfile.gettempdir(), f"{doc_id}_{safe_name}")

    logger.info(f"接收到文件上传: {final_doc_name}, 类型: {ext}, doc_id={doc_id}")

    try:
        content = await file.read()

        # Enforce the limit on the real payload in case no size was declared.
        if len(content) > max_bytes:
            raise HTTPException(
                status_code=400,
                detail=f"文件过大，最大支持{settings.max_file_size_mb}MB"
            )

        # The processor works on a file path, so spill the bytes to disk.
        with open(temp_path, "wb") as f:
            f.write(content)

        logger.info(f"临时文件已保存到: {temp_path}")

        # Store the original file in MinIO.
        minio = get_minio_client()
        upload_success = minio.upload_bytes(
            data=content,
            object_name=object_name,
            content_type=minio._get_content_type(safe_name),
            metadata={
                "doc_id": doc_id  # only ASCII-safe metadata is passed
            }
        )

        if upload_success:
            logger.success(f"文件已上传到MinIO: {object_name}")
        else:
            # Best effort: processing continues from the local temp file.
            logger.warning("MinIO上传失败，仅使用本地临时文件")

        # Process the document with the same doc_id used for the MinIO key.
        processor = DocumentProcessor(generate_summary=generate_summary)
        try:
            result = processor.process(
                file_path=temp_path,
                doc_id=doc_id,
                doc_name=final_doc_name,
                regulation_type=regulation_type or "",
                version=version or ""
            )
        finally:
            # Release the processor even when process() raises.
            processor.close()

        if not result.success:
            raise HTTPException(
                status_code=500,
                detail=result.message
            )

        return DocumentUploadResponse(
            doc_id=result.doc_id,
            doc_name=result.doc_name,
            status="success",
            message=result.message,
            num_chunks=result.num_chunks,
            summary=result.summary,
            summary_latency_ms=result.summary_latency_ms
        )

    except HTTPException:
        # Deliberate HTTP errors must reach the client unchanged instead of
        # being re-wrapped by the generic handler below.
        raise
    except Exception as e:
        logger.error(f"文档处理失败: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"文档处理失败: {str(e)}"
        ) from e
    finally:
        # Always remove the temp file, on success and on every failure path.
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            # The file was never written (failure happened before the write).
            pass
        except OSError as cleanup_error:
            logger.warning(f"临时文件清理失败: {temp_path}, 原因: {cleanup_error}")


@router.get("/status/{doc_id}", response_model=DocumentUploadResponse)
async def get_document_status(doc_id: str):
    """
    Report the processing status of a document (placeholder endpoint).

    Args:
        doc_id: Document ID, echoed back in the response.

    Returns:
        A ``DocumentUploadResponse`` with an empty name and the fixed status
        "unknown", since no real lookup is performed yet.
    """
    # TODO: implement a real status lookup (requires database support)
    placeholder_fields = {
        "doc_id": doc_id,
        "doc_name": "",
        "status": "unknown",
        "message": "状态查询功能待实现",
    }
    return DocumentUploadResponse(**placeholder_fields)


@router.get("/download/{doc_id}")
async def download_document(doc_id: str):
    """
    Download a document by fetching it from MinIO.

    Args:
        doc_id: Document ID; objects are looked up under the ``{doc_id}/``
            prefix and the first match is served.

    Returns:
        A streaming attachment response carrying the file bytes.

    Raises:
        HTTPException: 404 when no object exists for ``doc_id``; 500 when the
            object data cannot be fetched or any other error occurs.
    """
    logger.info(f"请求下载文档: doc_id={doc_id}")

    try:
        storage = get_minio_client()

        # Object keys follow the "{doc_id}/{filename}" layout.
        matches = storage.list_objects(prefix=f"{doc_id}/")
        if not matches:
            logger.warning(f"MinIO中未找到文档: doc_id={doc_id}")
            raise HTTPException(
                status_code=404,
                detail=f"文档不存在: doc_id={doc_id}"
            )

        # Serve the first object found under the prefix.
        object_key = matches[0]
        logger.info(f"找到MinIO对象: {object_key}")

        payload = storage.get_object_data(object_key)
        if payload is None:
            raise HTTPException(
                status_code=500,
                detail="获取文档数据失败"
            )

        # Everything after the first "/" is the original file name; a key
        # without any "/" is used as the name unchanged.
        _, slash, remainder = object_key.partition("/")
        original_name = remainder if slash else object_key

        media_type = storage._get_content_type(original_name)

        logger.success(f"文档下载成功: {original_name}, 大小={len(payload)}")

        # Percent-encode the name so non-ASCII (e.g. Chinese) file names are
        # valid inside the header.
        disposition = f"attachment; filename*=UTF-8''{quote(original_name)}"
        return StreamingResponse(
            BytesIO(payload),
            media_type=media_type,
            headers={"Content-Disposition": disposition}
        )

    except HTTPException:
        # Pass deliberate HTTP errors (404/500 above) through untouched.
        raise
    except Exception as e:
        logger.error(f"文档下载失败: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"文档下载失败: {str(e)}"
        )


@router.get("/list")
async def list_documents():
    """
    List every uploaded document, as read from MinIO.

    Returns:
        A dict with ``documents`` and ``total``. When listing fails the
        error is logged and an empty result with an extra ``error`` string
        is returned instead of raising.
    """
    try:
        records = _build_document_records()
    except Exception as e:
        logger.error(f"列出文档失败: {e}")
        return {"documents": [], "total": 0, "error": str(e)}
    else:
        return {"documents": records, "total": len(records)}


@router.get("/management-list")
async def get_document_management_list():
    """
    Document-management listing: return only the 10 most recent documents.

    Returns:
        A dict with ``documents``, ``total`` and ``limit``. When listing
        fails the error is logged and an empty result with an extra
        ``error`` string is returned instead of raising.
    """
    max_items = 10
    try:
        recent = _build_document_records(limit=max_items)
    except Exception as e:
        logger.error(f"获取文档管理清单失败: {e}")
        return {"documents": [], "total": 0, "limit": max_items, "error": str(e)}
    else:
        return {"documents": recent, "total": len(recent), "limit": max_items}