import os
import uuid
from datetime import datetime

from fastapi import APIRouter, UploadFile, File, HTTPException

from app.schemas.doc import (
    DocumentUploadResponse,
    DocumentListResponse,
    DocumentInfo,
    ParseResponse,
    EmbedResponse,
)
from app.services.mock_data import get_mock_documents, generate_doc_id

router = APIRouter(prefix="/docs", tags=["文档管理"])

# Temporary in-memory registry of document metadata, keyed by document id.
# It lives only in this process, so its contents are lost on restart.
documents_store: dict[str, dict] = {}

# Seed the registry with the preset mock documents at import time.
for doc in get_mock_documents():
    documents_store[doc["id"]] = doc


@router.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(file: UploadFile = File(...)):
    """Upload a regulation document, save it to disk and register it.

    Args:
        file: The uploaded file. Only ``.pdf``, ``.docx``, ``.doc`` and
            ``.txt`` extensions (case-insensitive) are accepted.

    Returns:
        A ``DocumentUploadResponse`` carrying the new document id, the stored
        file name and the size in bytes.

    Raises:
        HTTPException: 400 when the file name is missing or its extension is
            not one of the accepted formats.
    """
    # The file name is supplied by the client: it may be absent and may
    # contain directory components. Keep only the final path component so the
    # saved file always lands directly inside raw_dir.
    filename = os.path.basename(file.filename or "")

    # Check the file format.
    allowed_ext = (".pdf", ".docx", ".doc", ".txt")
    ext = os.path.splitext(filename)[1].lower()
    if ext not in allowed_ext:
        raise HTTPException(400, f"Unsupported file format: {ext}")

    # Generate the document id.
    doc_id = generate_doc_id()

    # Save the file; the id prefix keeps same-named uploads from colliding.
    raw_dir = "/airegulation/demo-mao/backend/data/raw"
    os.makedirs(raw_dir, exist_ok=True)
    file_path = os.path.join(raw_dir, f"{doc_id}_{filename}")

    # NOTE(review): the whole upload is read into memory and written with
    # blocking I/O inside an async handler; acceptable for small demo files.
    content = await file.read()
    with open(file_path, "wb") as f:
        f.write(content)

    # Record the document metadata.
    documents_store[doc_id] = {
        "id": doc_id,
        "name": filename,
        "path": file_path,
        "size": len(content),
        "status": "uploaded",
        "chunks": 0,
        "created_at": datetime.now(),
    }

    return DocumentUploadResponse(
        doc_id=doc_id,
        filename=filename,
        size=len(content),
    )


@router.get("/list", response_model=DocumentListResponse)
async def list_documents():
    """Return every document currently held in the in-memory store.

    Returns:
        A ``DocumentListResponse`` whose ``docs`` has one ``DocumentInfo`` per
        stored document. ``created_at`` is ``None`` for entries that lack it.
    """
    docs = [
        DocumentInfo(
            id=entry["id"],
            name=entry["name"],
            chunks=entry["chunks"],
            status=entry["status"],
            created_at=entry.get("created_at"),
        )
        for entry in documents_store.values()
    ]
    return DocumentListResponse(docs=docs)


@router.post("/parse/{doc_id}", response_model=ParseResponse)
async def parse_document(doc_id: str):
    """Simulate parsing a document and splitting it into chunks.

    No file content is read: the document is marked ``"parsed"`` and its chunk
    count is derived from the recorded size.

    Args:
        doc_id: Id of a document in the store.

    Returns:
        A ``ParseResponse`` with the document id and the computed chunk count.

    Raises:
        HTTPException: 404 when ``doc_id`` is not in the store.
    """
    doc = documents_store.get(doc_id)
    if doc is None:
        raise HTTPException(404, "Document not found")

    # Simulated parsing logic.
    doc["status"] = "parsed"

    # One chunk per 8000 bytes with a floor of 20; entries without a recorded
    # size are treated as 100000 bytes.
    file_size = doc.get("size", 100000)
    doc["chunks"] = max(20, file_size // 8000)

    return ParseResponse(doc_id=doc_id, chunks=doc["chunks"])


@router.post("/embed/{doc_id}", response_model=EmbedResponse)
async def embed_document(doc_id: str):
    """Simulate embedding a document and storing it in the vector store.

    No vectors are computed: the document is marked ``"indexed"`` and the
    reported vector count is its current chunk count.

    Args:
        doc_id: Id of a document in the store.

    Returns:
        An ``EmbedResponse`` with the document id and the vector count.

    Raises:
        HTTPException: 404 when ``doc_id`` is not in the store.
    """
    doc = documents_store.get(doc_id)
    if doc is None:
        raise HTTPException(404, "Document not found")

    # Simulated embedding logic.
    doc["status"] = "indexed"

    return EmbedResponse(doc_id=doc_id, vectors=doc["chunks"])


@router.delete("/delete/{doc_id}")
async def delete_document(doc_id: str):
    """Remove a document's entry from the in-memory store.

    Args:
        doc_id: Id of a document in the store.

    Returns:
        ``{"success": True}`` once the entry has been removed.

    Raises:
        HTTPException: 404 when ``doc_id`` is not in the store.
    """
    # NOTE(review): only the registry entry is dropped; a file written by
    # upload_document stays on disk. Confirm whether it should be removed too.
    if documents_store.pop(doc_id, None) is None:
        raise HTTPException(404, "Document not found")

    return {"success": True}