# AIRegulation-DocAnalysis/backend/app/api/routes/docs.py

from fastapi import APIRouter, UploadFile, File, HTTPException
import os
import uuid
from datetime import datetime
from app.schemas.doc import (
    DocumentUploadResponse,
    DocumentListResponse,
    DocumentInfo,
    ParseResponse,
    EmbedResponse,
)
from app.services.mock_data import get_mock_documents, generate_doc_id

# Every endpoint in this module is registered under the /docs prefix.
router = APIRouter(prefix="/docs", tags=["文档管理"])

# Temporary in-memory registry of document records, keyed by document ID.
# It is seeded at import time with the preset mock documents.
documents_store: dict[str, dict] = {
    mock_doc["id"]: mock_doc for mock_doc in get_mock_documents()
}


@router.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(file: UploadFile = File(...)):
    """Upload a regulation document and register it in the in-memory store.

    The client-supplied filename is reduced to its final path component
    before it is used to build the on-disk path, so a name containing
    directory separators cannot address anything outside the upload
    directory.

    Args:
        file: The uploaded file taken from the multipart request body.

    Returns:
        DocumentUploadResponse carrying the new document ID, the sanitized
        filename, and the size of the content in bytes.

    Raises:
        HTTPException: 400 when the filename is missing or empty, or when
            its extension is not one of .pdf, .docx, .doc, .txt.
    """
    allowed_ext = (".pdf", ".docx", ".doc", ".txt")

    # The filename is optional and fully client-controlled: normalise
    # Windows separators, then keep only the last path component.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if not safe_name:
        raise HTTPException(400, "Missing filename")

    # Validate the file format by extension (case-insensitive).
    ext = os.path.splitext(safe_name)[1].lower()
    if ext not in allowed_ext:
        raise HTTPException(400, f"Unsupported file format: {ext}")

    # Generate the document ID.
    doc_id = generate_doc_id()

    # Persist the raw file; the ID prefix keeps same-named uploads apart.
    raw_dir = "/airegulation/demo-mao/backend/data/raw"
    os.makedirs(raw_dir, exist_ok=True)
    file_path = os.path.join(raw_dir, f"{doc_id}_{safe_name}")

    content = await file.read()
    with open(file_path, "wb") as f:
        f.write(content)
    size = len(content)

    # Record the document metadata; chunks stays 0 until the document is parsed.
    documents_store[doc_id] = {
        "id": doc_id,
        "name": safe_name,
        "path": file_path,
        "size": size,
        "status": "uploaded",
        "chunks": 0,
        "created_at": datetime.now(),
    }

    return DocumentUploadResponse(
        doc_id=doc_id,
        filename=safe_name,
        size=size,
    )


@router.get("/list", response_model=DocumentListResponse)
async def list_documents():
    """Return every document currently held in the in-memory store.

    Returns:
        DocumentListResponse whose ``docs`` list has one DocumentInfo per
        stored record, in the store's iteration order.
    """
    infos = []
    for entry in documents_store.values():
        info = DocumentInfo(
            id=entry["id"],
            name=entry["name"],
            chunks=entry["chunks"],
            status=entry["status"],
            # created_at is read with .get, so a record without it yields None.
            created_at=entry.get("created_at"),
        )
        infos.append(info)
    return DocumentListResponse(docs=infos)


@router.post("/parse/{doc_id}", response_model=ParseResponse)
async def parse_document(doc_id: str):
    """Mark a stored document as parsed and give it a simulated chunk count.

    No real parsing happens here: the chunk count is derived from the
    recorded file size only.

    Args:
        doc_id: ID of a document present in the store.

    Returns:
        ParseResponse with the document ID and the computed chunk count.

    Raises:
        HTTPException: 404 when no document with ``doc_id`` is stored.
    """
    try:
        record = documents_store[doc_id]
    except KeyError:
        raise HTTPException(404, "Document not found") from None

    # Simulated parsing: flip the status first, then size the chunk count.
    record["status"] = "parsed"

    # One chunk per 8000 bytes with a floor of 20; records without a
    # "size" entry are treated as 100000 bytes.
    size_in_bytes = record.get("size", 100000)
    chunk_count = max(20, size_in_bytes // 8000)
    record["chunks"] = chunk_count

    return ParseResponse(doc_id=doc_id, chunks=chunk_count)


@router.post("/embed/{doc_id}", response_model=EmbedResponse)
async def embed_document(doc_id: str):
    """Mark a stored document as indexed and report its vector count.

    Embedding is simulated: the number of vectors reported is simply the
    record's current chunk count.

    Args:
        doc_id: ID of a document present in the store.

    Returns:
        EmbedResponse with the document ID and the vector count.

    Raises:
        HTTPException: 404 when no document with ``doc_id`` is stored.
    """
    record = documents_store.get(doc_id)
    if record is None:
        raise HTTPException(404, "Document not found")

    # Simulated embedding: only the status changes.
    record["status"] = "indexed"
    vector_count = record["chunks"]

    return EmbedResponse(doc_id=doc_id, vectors=vector_count)


@router.delete("/delete/{doc_id}")
async def delete_document(doc_id: str):
    """Remove a document record from the in-memory store.

    Only the store entry is removed; this function does not touch any
    file on disk.

    Args:
        doc_id: ID of the document to remove.

    Returns:
        ``{"success": True}`` once the record has been removed.

    Raises:
        HTTPException: 404 when no document with ``doc_id`` is stored.
    """
    # A private sentinel distinguishes "key absent" from any stored value.
    absent = object()
    if documents_store.pop(doc_id, absent) is absent:
        raise HTTPException(404, "Document not found")
    return {"success": True}