From 80dcd070f71aba3757512965bcbe05173caa8271 Mon Sep 17 00:00:00 2001 From: "Yuemin.Mao" Date: Mon, 11 May 2026 11:22:55 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env | 52 +++ .env.example | 24 + .gitignore | 10 + .python-version | 1 + Dockerfile | 18 + app/__init__.py | 3 + app/api/__init__.py | 3 + app/api/routes/__init__.py | 13 + app/api/routes/compliance.py | 96 ++++ app/api/routes/docs.py | 115 +++++ app/api/routes/rag.py | 74 +++ app/api/routes/status.py | 28 ++ app/core/__init__.py | 3 + app/core/config.py | 41 ++ app/main.py | 68 +++ app/schemas/__init__.py | 49 ++ app/schemas/compliance.py | 69 +++ app/schemas/doc.py | 44 ++ app/schemas/rag.py | 31 ++ app/services/__init__.py | 50 +++ app/services/document.py | 64 +++ app/services/embedding.py | 33 ++ app/services/llm.py | 58 +++ app/services/milvus.py | 158 +++++++ app/services/mock_data.py | 425 ++++++++++++++++++ app/utils/__init__.py | 4 + app/utils/chunking.py | 78 ++++ app/utils/logger.py | 24 + app/workflows/__init__.py | 12 + app/workflows/compliance_workflow.py | 175 ++++++++ app/workflows/rag_workflow.py | 114 +++++ .../raw/compliance_task-32e64724_test_doc.txt | 1 + data/raw/doc-3b47abd7_requirement.txt | 2 + data/raw/doc-9b01a78a_requirement.txt | 2 + main.py | 6 + pyproject.toml | 7 + requirements.txt | 34 ++ tests/__init__.py | 0 uv.lock | 8 + 39 files changed, 1997 insertions(+) create mode 100644 .env create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 .python-version create mode 100644 Dockerfile create mode 100644 app/__init__.py create mode 100644 app/api/__init__.py create mode 100644 app/api/routes/__init__.py create mode 100644 app/api/routes/compliance.py create mode 100644 app/api/routes/docs.py create mode 100644 app/api/routes/rag.py create mode 100644 app/api/routes/status.py create mode 100644 app/core/__init__.py create mode 100644 app/core/config.py create mode 100644 app/main.py create mode 100644 app/schemas/__init__.py create mode 100644 app/schemas/compliance.py create mode 100644 app/schemas/doc.py create mode 100644 app/schemas/rag.py create mode 100644 app/services/__init__.py create mode 100644 app/services/document.py create mode 100644 app/services/embedding.py create mode 100644 app/services/llm.py create mode 100644 app/services/milvus.py create mode 100644 app/services/mock_data.py create mode 100644 app/utils/__init__.py create mode 100644 app/utils/chunking.py create mode 100644 app/utils/logger.py create mode 100644 app/workflows/__init__.py create mode 100644 app/workflows/compliance_workflow.py create mode 100644 app/workflows/rag_workflow.py create mode 100644 data/raw/compliance_task-32e64724_test_doc.txt create mode 100644 data/raw/doc-3b47abd7_requirement.txt create mode 100644 data/raw/doc-9b01a78a_requirement.txt create mode 100644 main.py create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 tests/__init__.py create mode 100644 uv.lock diff --git a/.env b/.env new file mode 100644 index 0000000..27a14c5 --- /dev/null +++ b/.env @@ -0,0 +1,52 @@ +# DashScope API +DASHSCOPE_API_KEY=your_api_key_here + +# PostgreSQL +POSTGRES_HOST=localhost +POSTGRES_PORT=5432 +POSTGRES_USER=postgresql +POSTGRES_PASSWORD=postgresql123456 +POSTGRES_DB=mydb + +# Redis +REDIS_HOST=localhost +REDIS_PORT=6379 +REDIS_PASSWORD=redis@123 + +# Milvus +MILVUS_HOST=localhost +MILVUS_PORT=19530 + +# MinIO +MINIO_ENDPOINT=localhost:9000 +MINIO_ACCESS_KEY=minioadmin +MINIO_SECRET_KEY=minioadmin + +# Neo4j +NEO4J_URI=bolt://localhost:7687 +NEO4J_USER=neo4j +NEO4J_PASSWORD=neo4j123 + +# RabbitMQ +RABBITMQ_HOST=localhost +RABBITMQ_PORT=5672 +RABBITMQ_USER=admin +RABBITMQ_PASSWORD=admin@123 + +# LLM配置 +LLM_MODEL=qwen-max +EMBEDDING_MODEL=text-embedding-v3 +EMBEDDING_DIM=1536 + +# 检索配置 +VECTOR_TOP_K=10 +BM25_TOP_K=10 +FINAL_TOP_K=5 + +# 分块配置 +CHUNK_SIZE=800 +CHUNK_OVERLAP=50 + +# 服务配置 +API_HOST=0.0.0.0 +API_PORT=8000 \ No newline at end of file diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..b3e4f6a --- /dev/null +++ b/.env.example @@ -0,0 +1,24 @@ +# DashScope API +DASHSCOPE_API_KEY=your_api_key_here + +# Milvus +MILVUS_HOST=localhost +MILVUS_PORT=19530 + +# LLM配置 +LLM_MODEL=qwen-max +EMBEDDING_MODEL=text-embedding-v3 +EMBEDDING_DIM=1536 + +# 检索配置 +VECTOR_TOP_K=10 +BM25_TOP_K=10 +FINAL_TOP_K=5 + +# 分块配置 +CHUNK_SIZE=800 +CHUNK_OVERLAP=50 + +# 服务配置 +API_HOST=0.0.0.0 +API_PORT=8000 \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..505a3b1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# Python-generated files +__pycache__/ +*.py[oc] +build/ +dist/ +wheels/ +*.egg-info + +# Virtual environments +.venv diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..bd28b9c --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.9 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..d41aaff --- /dev/null +++ b/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.11-slim + +WORKDIR /app + +# 安装依赖 +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# 复制代码 +COPY app/ ./app/ +COPY data/ ./data/ + +# 环境变量 +ENV API_HOST=0.0.0.0 +ENV API_PORT=8000 + +# 启动命令 +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..f01a49c --- /dev/null +++ b/app/__init__.py @@ -0,0 +1,3 @@ +from .main import app + +__all__ = ["app"] \ No newline at end of file diff --git a/app/api/__init__.py b/app/api/__init__.py new file mode 100644 index 0000000..03eadff --- /dev/null +++ b/app/api/__init__.py @@ -0,0 +1,3 @@ +from .routes import api_router + +__all__ = ["api_router"] \ No newline at end of file diff --git a/app/api/routes/__init__.py b/app/api/routes/__init__.py new file mode 100644 index 0000000..ccd15d9 --- /dev/null +++ b/app/api/routes/__init__.py @@ -0,0 +1,13 @@ +from fastapi import APIRouter +from .docs import router as docs_router +from .rag import router as rag_router +from .compliance import router as compliance_router +from .status import router as status_router + +api_router = APIRouter() +api_router.include_router(docs_router) +api_router.include_router(rag_router) +api_router.include_router(compliance_router) +api_router.include_router(status_router) + +__all__ = ["api_router"] \ No newline at end of file diff --git a/app/api/routes/compliance.py b/app/api/routes/compliance.py new file mode 100644 index 0000000..f9fa99d --- /dev/null +++ b/app/api/routes/compliance.py @@ -0,0 +1,96 @@ +from fastapi import APIRouter, UploadFile, File, HTTPException +from sse_starlette.sse import EventSourceResponse +import uuid +import os +import json +import asyncio +from app.schemas.compliance import ( + AnalyzeResponse, + ComplianceChatRequest, +) +from app.services.mock_data import ( + generate_task_id, + get_mock_compliance_result, + get_mock_compliance_chat_response, +) + +router = APIRouter(prefix="/compliance", tags=["合规分析"]) + +# 临时存储分析任务 +tasks_store: dict[str, dict] = {} + + +@router.post("/analyze", response_model=AnalyzeResponse) +async def analyze_document(file: UploadFile = File(...)): + """上传设计方案进行分析""" + # 生成任务ID + task_id = generate_task_id() + + # 保存文件 + raw_dir = "/airegulation/demo-mao/backend/data/raw" + os.makedirs(raw_dir, exist_ok=True) + file_path = os.path.join(raw_dir, f"compliance_{task_id}_{file.filename}") + + content = await file.read() + with open(file_path, "wb") as f: + f.write(content) + + # 记录任务 + tasks_store[task_id] = { + "task_id": task_id, + "file_path": file_path, + "status": "processing", + "result": None, + } + + # 模拟异步处理完成(立即返回结果) + # 实际应用中这应该是后台任务 + tasks_store[task_id]["status"] = "completed" + tasks_store[task_id]["result"] = get_mock_compliance_result(task_id) + + return AnalyzeResponse(task_id=task_id) + + +@router.get("/result/{task_id}") +async def get_result(task_id: str): + """获取分析结果""" + if task_id not in tasks_store: + # 如果任务ID不存在,返回默认mock结果 + return get_mock_compliance_result(task_id) + + task = tasks_store[task_id] + + if task["status"] == "processing": + return {"status": "processing", "message": "分析进行中"} + + return task["result"] + + +@router.post("/chat/{segment_id}") +async def compliance_chat(segment_id: int, request: ComplianceChatRequest): + """针对段落进行合规对话""" + # 根据segment_id获取对应的intent + intent_map = { + 1: "车身结构设计", + 2: "动力系统配置", + 3: "安全配置设计", + } + intent = intent_map.get(segment_id, "车身结构设计") + + async def generate(): + # 获取预设响应 + response = get_mock_compliance_chat_response(intent, request.query) + + # 流式输出响应 + sentences = response.split("\n\n") + for sentence in sentences: + if sentence.strip(): + chunks = sentence.split("\n") + for chunk in chunks: + if chunk.strip(): + await asyncio.sleep(0.05) + yield {"event": "message", "data": json.dumps({"type": "chunk", "text": chunk + "\n"})} + + yield {"event": "message", "data": json.dumps({"type": "done"})} + + return EventSourceResponse(generate()) \ No newline at end of file diff --git a/app/api/routes/docs.py b/app/api/routes/docs.py new file mode 100644 index 0000000..4fda6a8 --- /dev/null +++ b/app/api/routes/docs.py @@ -0,0 +1,115 @@ +from fastapi import APIRouter, UploadFile, File, HTTPException +import os +import uuid +from datetime import datetime +from app.schemas.doc import ( + DocumentUploadResponse, + DocumentListResponse, + DocumentInfo, + ParseResponse, + EmbedResponse, +) +from app.services.mock_data import get_mock_documents, generate_doc_id + +router = APIRouter(prefix="/docs", tags=["文档管理"]) + +# 临时存储文档信息(包含预设的mock文档) +documents_store: dict[str, dict] = {} + +# 初始化时加载mock文档 +for doc in get_mock_documents(): + documents_store[doc["id"]] = doc + + +@router.post("/upload", response_model=DocumentUploadResponse) +async def upload_document(file: UploadFile = File(...)): + """上传法规文档""" + # 检查文件格式 + allowed_ext = [".pdf", ".docx", ".doc", ".txt"] + ext = os.path.splitext(file.filename)[1].lower() + if ext not in allowed_ext: + raise HTTPException(400, f"Unsupported file format: {ext}") + + # 生成文档ID + doc_id = generate_doc_id() + + # 保存文件 + raw_dir = "/airegulation/demo-mao/backend/data/raw" + os.makedirs(raw_dir, exist_ok=True) + file_path = os.path.join(raw_dir, f"{doc_id}_{file.filename}") + + content = await file.read() + with open(file_path, "wb") as f: + f.write(content) + + # 记录文档信息 + documents_store[doc_id] = { + "id": doc_id, + "name": file.filename, + "path": file_path, + "size": len(content), + "status": "uploaded", + "chunks": 0, + "created_at": datetime.now(), + } + + return DocumentUploadResponse( + doc_id=doc_id, + filename=file.filename, + size=len(content), + ) + + +@router.get("/list", response_model=DocumentListResponse) +async def list_documents(): + """获取已索引文档列表""" + docs = [ + DocumentInfo( + id=d["id"], + name=d["name"], + chunks=d["chunks"], + status=d["status"], + created_at=d.get("created_at"), + ) + for d in documents_store.values() + ] + return DocumentListResponse(docs=docs) + + +@router.post("/parse/{doc_id}", response_model=ParseResponse) +async def parse_document(doc_id: str): + """解析文档并分块""" + if doc_id not in documents_store: + raise HTTPException(404, "Document not found") + + doc = documents_store[doc_id] + # 模拟解析逻辑 + doc["status"] = "parsed" + # 根据文件大小计算chunks数量 + file_size = doc.get("size", 100000) + doc["chunks"] = max(20, file_size // 8000) + + return ParseResponse(doc_id=doc_id, chunks=doc["chunks"]) + + +@router.post("/embed/{doc_id}", response_model=EmbedResponse) +async def embed_document(doc_id: str): + """嵌入并存入向量库""" + if doc_id not in documents_store: + raise HTTPException(404, "Document not found") + + doc = documents_store[doc_id] + # 模拟嵌入逻辑 + doc["status"] = "indexed" + + return EmbedResponse(doc_id=doc_id, vectors=doc["chunks"]) + + +@router.delete("/delete/{doc_id}") +async def delete_document(doc_id: str): + """删除文档""" + if doc_id not in documents_store: + raise HTTPException(404, "Document not found") + + del documents_store[doc_id] + return {"success": True} \ No newline at end of file diff --git a/app/api/routes/rag.py b/app/api/routes/rag.py new file mode 100644 index 0000000..a45626a --- /dev/null +++ b/app/api/routes/rag.py @@ -0,0 +1,74 @@ +from fastapi import APIRouter +from sse_starlette.sse import EventSourceResponse +from app.schemas.rag import RagChatRequest, QuickQuestionsResponse, QuickQuestion +from app.services.mock_data import ( + get_mock_quick_questions, + get_mock_retrieval, + get_mock_rag_answer, +) +import json +import asyncio + +router = APIRouter(prefix="/rag", tags=["RAG问答"]) + + +@router.post("/chat") +async def rag_chat(request: RagChatRequest): + """SSE流式问答""" + + async def generate(): + # 发送检索开始事件 + yield {"event": "message", "data": json.dumps({"type": "retrieving"})} + + # 模拟检索延迟 + await asyncio.sleep(0.3) + + # 执行检索 + docs = get_mock_retrieval(request.query, top_k=request.top_k) + + retrieved_data = [ + { + "id": d["id"], + "score": d["score"], + "preview": d["preview"], + "doc_name": d.get("doc_name", ""), + "clause": d.get("clause", ""), + } + for d in docs + ] + yield {"event": "message", "data": json.dumps({"type": "retrieved", "docs": retrieved_data})} + + # 发送生成开始事件 + yield {"event": "message", "data": json.dumps({"type": "generating", "text": "正在生成答案..."})} + + # 模拟生成延迟 + await asyncio.sleep(0.2) + + # 获取预设答案 + answer = get_mock_rag_answer(request.query) + + # 流式输出答案(按句子分割) + sentences = answer.split("\n\n") + for sentence in sentences: + if sentence.strip(): + # 进一步分割长句子 + chunks = sentence.split("\n") + for chunk in chunks: + if chunk.strip(): + await asyncio.sleep(0.05) # 模拟生成延迟 + yield {"event": "message", "data": json.dumps({"type": "chunk", "text": chunk + "\n"})} + + # 发送完成事件 + yield {"event": "message", "data": json.dumps({"type": "done"})} + + return EventSourceResponse(generate()) + + +@router.get("/quick-questions", response_model=QuickQuestionsResponse) +async def get_quick_questions(): + """获取预设快捷问题""" + questions = [ + QuickQuestion(id=q["id"], question=q["question"], category=q["category"]) + for q in get_mock_quick_questions() + ] + return QuickQuestionsResponse(questions=questions) \ No newline at end of file diff --git a/app/api/routes/status.py b/app/api/routes/status.py new file mode 100644 index 0000000..66b12e4 --- /dev/null +++ b/app/api/routes/status.py @@ -0,0 +1,28 @@ +from fastapi import APIRouter +from app.core.config import settings +from app.services.mock_data import MOCK_SYSTEM_STATS, MOCK_SYSTEM_CONFIG + +router = APIRouter(prefix="/status", tags=["系统状态"]) + + +@router.get("/stats") +async def get_stats(): + """获取系统统计""" + # 返回预设统计数据 + return MOCK_SYSTEM_STATS + + +@router.get("/config") +async def get_config(): + """获取当前配置""" + return MOCK_SYSTEM_CONFIG + + +@router.get("/milvus/health") +async def milvus_health(): + """Milvus健康检查""" + # 模拟连接状态(假数据模式下始终返回连接成功) + return { + "connected": True, + "collections": ["vehicle_regulations"], + } \ No newline at end of file diff --git a/app/core/__init__.py b/app/core/__init__.py new file mode 100644 index 0000000..e849b30 --- /dev/null +++ b/app/core/__init__.py @@ -0,0 +1,3 @@ +from .config import settings, Settings + +__all__ = ["settings", "Settings"] \ No newline at end of file diff --git a/app/core/config.py b/app/core/config.py new file mode 100644 index 0000000..099a31d --- /dev/null +++ b/app/core/config.py @@ -0,0 +1,41 @@ +from pydantic_settings import BaseSettings +from typing import Optional + + +class Settings(BaseSettings): + # DashScope API + dashscope_api_key: str = "" + + # Milvus + milvus_host: str = "localhost" + milvus_port: int = 19530 + + # LLM配置 + llm_model: str = "qwen-max" + embedding_model: str = "text-embedding-v3" + embedding_dim: int = 1536 + + # 检索配置 + vector_top_k: int = 10 + bm25_top_k: int = 10 + final_top_k: int = 5 + + # 分块配置 + chunk_size: int = 800 + chunk_overlap: int = 50 + + # 服务配置 + api_host: str = "0.0.0.0" + api_port: int = 8000 + + # Collection名称 + regulations_collection: str = "vehicle_regulations" + compliance_collection: str = "compliance_cache" + + class Config: + env_file = ".env" + env_file_encoding = "utf-8" + case_sensitive = False + + +settings = Settings() \ No newline at end of file diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..947cee6 --- /dev/null +++ b/app/main.py @@ -0,0 +1,68 @@ +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from app.api.routes import api_router +from app.core.config import settings +from app.utils.logger import logger +from app.services import milvus_service + +# 创建应用 +app = FastAPI( + title="车辆法规智能检索系统", + description="基于RAG技术的法规检索与合规分析后端API", + version="1.0.0", +) + +# CORS配置 +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# 注册路由 +app.include_router(api_router, prefix="/api") + + +@app.on_event("startup") +async def startup_event(): + """启动时初始化""" + logger.info("Starting application...") + + # 初始化Milvus集合(仅在服务可用时) + try: + if milvus_service is not None: + milvus_service.create_regulations_collection() + logger.info("Milvus collection initialized") + else: + logger.warning("Milvus service not available, using mock data") + except Exception as e: + logger.warning(f"Milvus initialization failed: {e}") + + +@app.on_event("shutdown") +async def shutdown_event(): + """关闭时清理""" + logger.info("Shutting down application...") + try: + if milvus_service is not None: + milvus_service.disconnect() + except Exception as e: + logger.warning(f"Shutdown cleanup error: {e}") + + +@app.get("/") +async def root(): + """根路径""" + return { + "message": "车辆法规智能检索系统 API", + "version": "1.0.0", + "docs": "/docs", + } + + +@app.get("/health") +async def health(): + """健康检查""" + return {"status": "healthy"} \ No newline at end of file diff --git a/app/schemas/__init__.py b/app/schemas/__init__.py new file mode 100644 index 0000000..1ef3d7a --- /dev/null +++ b/app/schemas/__init__.py @@ -0,0 +1,49 @@ +from .doc import ( + DocumentUploadResponse, + DocumentInfo, + DocumentListResponse, + ChunkInfo, + ParseResponse, + EmbedResponse, +) +from .rag import ( + RagChatRequest, + RetrievedDoc, + SourceInfo, + QuickQuestion, + QuickQuestionsResponse, +) +from .compliance import ( + RiskLevel, + ComplianceStatus, + Regulation, + ComplianceSegment, + RiskDashboard, + PriorityAction, + ComplianceResult, + ComplianceChatRequest, + AnalyzeResponse, +) + +__all__ = [ + "DocumentUploadResponse", + "DocumentInfo", + "DocumentListResponse", + "ChunkInfo", + "ParseResponse", + "EmbedResponse", + "RagChatRequest", + "RetrievedDoc", + "SourceInfo", + "QuickQuestion", + "QuickQuestionsResponse", + "RiskLevel", + "ComplianceStatus", + "Regulation", + "ComplianceSegment", + "RiskDashboard", + "PriorityAction", + "ComplianceResult", + "ComplianceChatRequest", + "AnalyzeResponse", +] \ No newline at end of file diff --git a/app/schemas/compliance.py b/app/schemas/compliance.py new file mode 100644 index 0000000..220f5c6 --- /dev/null +++ b/app/schemas/compliance.py @@ -0,0 +1,69 @@ +from pydantic import BaseModel +from typing import Optional +from enum import Enum + + +class RiskLevel(str, Enum): + high = "high" + medium = "medium" + low = "low" + + +class ComplianceStatus(str, Enum): + pass_status = "pass" + warning = "warning" + fail = "fail" + + +class Regulation(BaseModel): + id: int + name: str + clause: Optional[str] = None + score: float + match_keyword: str + category: RiskLevel + full_content: str + + +class ComplianceSegment(BaseModel): + id: int + index: int + intent: str + start_pos: int + end_pos: int + content: str + risk_level: RiskLevel + regulations: list[Regulation] + + +class RiskDashboard(BaseModel): + score: float + high_risk_count: int + medium_risk_count: int + low_risk_count: int + need_fix_segments: int + status: ComplianceStatus + status_label: str + + +class PriorityAction(BaseModel): + regulation: str + issue: str + suggestion: str + severity: RiskLevel + + +class ComplianceResult(BaseModel): + task_id: str + dashboard: RiskDashboard + segments: list[ComplianceSegment] + priority_actions: list[PriorityAction] + + +class ComplianceChatRequest(BaseModel): + query: str + + +class AnalyzeResponse(BaseModel): + task_id: str + status: str = "processing" \ No newline at end of file diff --git a/app/schemas/doc.py b/app/schemas/doc.py new file mode 100644 index 0000000..5ff7cc1 --- /dev/null +++ b/app/schemas/doc.py @@ -0,0 +1,44 @@ +from pydantic import BaseModel +from typing import Optional +from datetime import datetime + + +class DocumentUploadResponse(BaseModel): + doc_id: str + filename: str + size: int + status: str = "uploaded" + + +class DocumentInfo(BaseModel): + id: str + name: str + chunks: int + status: str + created_at: Optional[datetime] = None + + +class DocumentListResponse(BaseModel): + docs: list[DocumentInfo] + + +class ChunkInfo(BaseModel): + chunk_id: str + doc_name: str + clause_id: Optional[str] = None + chapter: Optional[str] = None + content: str + token_count: int + chunk_index: int + + +class ParseResponse(BaseModel): + doc_id: str + chunks: int + status: str = "parsed" + + +class EmbedResponse(BaseModel): + doc_id: str + vectors: int + status: str = "embedded" \ No newline at end of file diff --git a/app/schemas/rag.py b/app/schemas/rag.py new file mode 100644 index 0000000..9a92e5c --- /dev/null +++ b/app/schemas/rag.py @@ -0,0 +1,31 @@ +from pydantic import BaseModel +from typing import Optional + + +class RagChatRequest(BaseModel): + query: str + top_k: int = 5 + + +class RetrievedDoc(BaseModel): + id: str + doc_name: str + clause_id: Optional[str] = None + score: float + content: str + preview: str + + +class SourceInfo(BaseModel): + name: str + clause: Optional[str] = None + + +class QuickQuestion(BaseModel): + id: str + question: str + category: str + + +class QuickQuestionsResponse(BaseModel): + questions: list[QuickQuestion] \ No newline at end of file diff --git a/app/services/__init__.py b/app/services/__init__.py new file mode 100644 index 0000000..eb05bec --- /dev/null +++ b/app/services/__init__.py @@ -0,0 +1,50 @@ +# Import mock data service +from .mock_data import ( + get_mock_documents, + get_mock_quick_questions, + get_mock_retrieval, + get_mock_rag_answer, + get_mock_compliance_result, + get_mock_compliance_chat_response, + MOCK_SYSTEM_STATS, + MOCK_SYSTEM_CONFIG, +) + +# Try importing real services (may fail if dependencies not installed) +try: + from .llm import llm_service, LLMService + from .embedding import embedding_service, EmbeddingService + from .milvus import milvus_service, MilvusService + from .document import DocumentService, get_document_service + _real_services_available = True +except ImportError: + _real_services_available = False + llm_service = None + LLMService = None + embedding_service = None + EmbeddingService = None + milvus_service = None + MilvusService = None + DocumentService = None + get_document_service = None + +__all__ = [ + # Mock data services + "get_mock_documents", + "get_mock_quick_questions", + "get_mock_retrieval", + "get_mock_rag_answer", + "get_mock_compliance_result", + "get_mock_compliance_chat_response", + "MOCK_SYSTEM_STATS", + "MOCK_SYSTEM_CONFIG", + # Real services (may be None if not available) + "llm_service", + "LLMService", + "embedding_service", + "EmbeddingService", + "milvus_service", + "MilvusService", + "DocumentService", + "get_document_service", +] \ No newline at end of file diff --git a/app/services/document.py b/app/services/document.py new file mode 100644 index 0000000..cc812a2 --- /dev/null +++ b/app/services/document.py @@ -0,0 +1,64 @@ +import os +from typing import List, Optional +from PyPDF2 import PdfReader +from docx import Document +import pdfplumber + + +class DocumentService: + def __init__(self, raw_dir: str, parsed_dir: str): + self.raw_dir = raw_dir + self.parsed_dir = parsed_dir + + def parse_pdf(self, file_path: str) -> str: + """解析PDF文件""" + text = "" + try: + with pdfplumber.open(file_path) as pdf: + for page in pdf.pages: + page_text = page.extract_text() + if page_text: + text += page_text + "\n" + except Exception: + reader = PdfReader(file_path) + for page in reader.pages: + text += page.extract_text() + "\n" + + return text.strip() + + def parse_docx(self, file_path: str) -> str: + """解析Word文件""" + doc = Document(file_path) + text = "" + for paragraph in doc.paragraphs: + text += paragraph.text + "\n" + return text.strip() + + def parse_txt(self, file_path: str) -> str: + """解析TXT文件""" + with open(file_path, "r", encoding="utf-8") as f: + return f.read().strip() + + def parse_document(self, file_path: str) -> str: + """根据文件类型解析文档""" + ext = os.path.splitext(file_path)[1].lower() + + if ext == ".pdf": + return self.parse_pdf(file_path) + elif ext in [".docx", ".doc"]: + return self.parse_docx(file_path) + elif ext == ".txt": + return self.parse_txt(file_path) + else: + raise ValueError(f"Unsupported file format: {ext}") + + def save_parsed_text(self, doc_id: str, text: str) -> str: + """保存解析后的文本""" + parsed_path = os.path.join(self.parsed_dir, f"{doc_id}.txt") + with open(parsed_path, "w", encoding="utf-8") as f: + f.write(text) + return parsed_path + + +def get_document_service(raw_dir: str, parsed_dir: str) -> DocumentService: + return DocumentService(raw_dir, parsed_dir) \ No newline at end of file diff --git a/app/services/embedding.py b/app/services/embedding.py new file mode 100644 index 0000000..bcd1288 --- /dev/null +++ b/app/services/embedding.py @@ -0,0 +1,33 @@ +import dashscope +from dashscope import TextEmbedding +from typing import List + + +class EmbeddingService: + def __init__(self): + from app.core.config import settings + self.model = settings.embedding_model + self.dimension = settings.embedding_dim + dashscope.api_key = settings.dashscope_api_key + + def embed_texts(self, texts: List[str]) -> List[List[float]]: + """批量文本嵌入""" + response = TextEmbedding.call( + model=self.model, + input=texts, + ) + + if response.status_code == 200: + embeddings = [] + for item in response.output.embeddings: + embeddings.append(item.embedding) + return embeddings + raise Exception(f"Embedding failed: {response.code}") + + def embed_single(self, text: str) -> List[float]: + """单个文本嵌入""" + embeddings = self.embed_texts([text]) + return embeddings[0] + + +embedding_service = EmbeddingService() \ No newline at end of file diff --git a/app/services/llm.py b/app/services/llm.py new file mode 100644 index 0000000..05bcb96 --- /dev/null +++ b/app/services/llm.py @@ -0,0 +1,58 @@ +import dashscope +from dashscope import Generation +from typing import AsyncGenerator, Optional, Generator + + +class LLMService: + def __init__(self): + from app.core.config import settings + self.model = settings.llm_model + dashscope.api_key = settings.dashscope_api_key + + def generate_stream( + self, + prompt: str, + system_prompt: Optional[str] = None, + ) -> Generator[str, None, None]: + """流式生成文本""" + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) + + responses = Generation.call( + model=self.model, + messages=messages, + result_format="message", + stream=True, + ) + + for response in responses: + if response.status_code == 200: + content = response.output.choices[0].message.content + if content: + yield content + + async def generate( + self, + prompt: str, + system_prompt: Optional[str] = None, + ) -> str: + """一次性生成文本""" + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) + + response = Generation.call( + model=self.model, + messages=messages, + result_format="message", + ) + + if response.status_code == 200: + return response.output.choices[0].message.content + raise Exception(f"LLM generation failed: {response.code}") + + +llm_service = LLMService() \ No newline at end of file diff --git a/app/services/milvus.py b/app/services/milvus.py new file mode 100644 index 0000000..d01f6a5 --- /dev/null +++ b/app/services/milvus.py @@ -0,0 +1,158 @@ +from pymilvus import ( + connections, + Collection, + FieldSchema, + CollectionSchema, + DataType, + utility, +) +from typing import List, Optional + + +class MilvusService: + def __init__(self): + from app.core.config import settings + self.host = settings.milvus_host + self.port = settings.milvus_port + self.regulations_collection_name = settings.regulations_collection + self.compliance_collection_name = settings.compliance_collection + self._connected = False + + def connect(self): + """连接Milvus""" + if not self._connected: + connections.connect( + alias="default", + host=self.host, + port=self.port, + ) + self._connected = True + + def disconnect(self): + """断开连接""" + if self._connected: + connections.disconnect("default") + self._connected = False + + def create_regulations_collection(self): + """创建法规文档集合""" + from app.core.config import settings + self.connect() + + if utility.has_collection(self.regulations_collection_name): + return Collection(self.regulations_collection_name) + + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True), + FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=settings.embedding_dim), + FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=256), + FieldSchema(name="clause_id", dtype=DataType.VARCHAR, max_length=64), + FieldSchema(name="chapter", dtype=DataType.VARCHAR, max_length=128), + FieldSchema(name="source_file", dtype=DataType.VARCHAR, max_length=256), + FieldSchema(name="chunk_index", dtype=DataType.INT64), + FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535), + FieldSchema(name="token_count", dtype=DataType.INT64), + ] + + schema = CollectionSchema( + fields=fields, + description="法规文档向量集合", + ) + + collection = Collection( + name=self.regulations_collection_name, + schema=schema, + ) + + index_params = { + "metric_type": "COSINE", + "index_type": "IVF_FLAT", + "params": {"nlist": 128}, + } + collection.create_index(field_name="embedding", index_params=index_params) + + return collection + + def insert_chunks( + self, + embeddings: List[List[float]], + metadata: List[dict], + ) -> List[int]: + """插入向量数据""" + collection = Collection(self.regulations_collection_name) + collection.load() + + data = [ + embeddings, + [m.get("doc_name", "") for m in metadata], + [m.get("clause_id", "") for m in metadata], + [m.get("chapter", "") for m in metadata], + [m.get("source_file", "") for m in metadata], + [m.get("chunk_index", 0) for m in metadata], + [m.get("content", "") for m in metadata], + [m.get("token_count", 0) for m in metadata], + ] + + result = collection.insert(data) + collection.flush() + return result.primary_keys + + def search( + self, + query_embedding: List[float], + top_k: int = 10, + ) -> List[dict]: + """向量检索""" + collection = Collection(self.regulations_collection_name) + collection.load() + + search_params = {"metric_type": "COSINE", "params": {"nprobe": 16}} + + results = collection.search( + data=[query_embedding], + anns_field="embedding", + param=search_params, + limit=top_k, + output_fields=["doc_name", "clause_id", "chapter", "content", "chunk_index"], + ) + + hits = [] + for hit in results[0]: + hits.append({ + "id": hit.id, + "score": hit.score, + "doc_name": hit.entity.get("doc_name"), + "clause_id": hit.entity.get("clause_id"), + "chapter": hit.entity.get("chapter"), + "content": hit.entity.get("content"), + "chunk_index": hit.entity.get("chunk_index"), + }) + + return hits + + def get_collection_stats(self) -> dict: + """获取集合统计""" + self.connect() + + if not utility.has_collection(self.regulations_collection_name): + return {"exists": False} + + collection = Collection(self.regulations_collection_name) + collection.load() + + return { + "exists": True, + "name": self.regulations_collection_name, + "count": collection.num_entities, + } + + def health_check(self) -> bool: + """健康检查""" + try: + self.connect() + return True + except Exception: + return False + + +milvus_service = MilvusService() \ No newline at end of file diff --git a/app/services/mock_data.py b/app/services/mock_data.py new file mode 100644 index 0000000..d065d77 --- /dev/null +++ b/app/services/mock_data.py @@ -0,0 +1,425 @@ +""" +Mock数据服务 - 提供预设假数据供前后端对接测试 +""" + +from datetime import datetime +from typing import Dict, List, Any +import uuid + +# 预设法规文档列表 +MOCK_DOCUMENTS: List[Dict[str, Any]] = [ + { + "id": "doc-001", + "name": "道路交通安全法.pdf", + "chunks": 156, + "status": "indexed", + "created_at": datetime(2026, 5, 10, 10, 0, 0), + }, + { + "id": "doc-002", + "name": "机动车登记规定.docx", + "chunks": 89, + "status": "indexed", + "created_at": datetime(2026, 5, 10, 11, 0, 0), + }, + { + "id": "doc-003", + "name": "电动自行车规范.pdf", + "chunks": 42, + "status": "indexed", + "created_at": datetime(2026, 5, 10, 12, 0, 0), + }, + { + "id": "doc-004", + "name": "GB 38031-2020 电动汽车安全要求.pdf", + "chunks": 128, + "status": "indexed", + "created_at": datetime(2026, 5, 10, 13, 0, 0), + }, + { + "id": "doc-005", + "name": "C-NCAP管理规则(2021版).pdf", + "chunks": 95, + "status": "indexed", + "created_at": datetime(2026, 5, 10, 14, 0, 0), + }, +] + +# 预设快捷问题 +MOCK_QUICK_QUESTIONS: List[Dict[str, str]] = [ + {"id": "q1", "question": "电动自行车需要上牌照吗?", "category": "车辆登记"}, + {"id": "q2", "question": "新能源汽车有哪些补贴政策?", "category": "新能源"}, + {"id": "q3", "question": "车辆年检的规定是什么?", "category": "年检"}, + {"id": "q4", "question": "驾驶证过期了怎么处理?", "category": "驾驶证"}, +] + +# 预设检索结果 +MOCK_RETRIEVAL_RESULTS: List[Dict[str, Any]] = [ + { + "id": "chunk-001", + "score": 0.95, + "preview": "根据《道路交通安全法》第十八条规定,电动自行车经公安机关交通管理部门登记后,方可上道路行驶...", + "doc_name": "道路交通安全法", + "clause": "第十八条", + "content": "根据《道路交通安全法》第十八条规定,电动自行车经公安机关交通管理部门登记后,方可上道路行驶。电动自行车应当符合国家标准,最高设计车速不超过二十五公里每小时,整车质量不超过五十五千克。", + }, + { + "id": "chunk-002", + "score": 0.88, + "preview": "电动自行车需符合GB17761-2018国家标准,包括最高车速、整车质量、脚踏骑行能力等要求...", + "doc_name": "电动自行车规范", + "clause": "第4条", + "content": "电动自行车需符合GB17761-2018国家标准。主要技术要求包括:最高设计车速不超过25km/h,整车质量不超过55kg,具有脚踏骑行能力,蓄电池标称电压不超过48V,电动机额定连续输出功率不超过400W。", + }, + { + "id": "chunk-003", + "score": 0.82, + "preview": "机动车登记规定:初次申领机动车号牌、行驶证的,机动车所有人应当向住所地的车辆管理所申请注册登记...", + "doc_name": "机动车登记规定", + "clause": "第5条", + "content": "机动车登记规定:初次申领机动车号牌、行驶证的,机动车所有人应当向住所地的车辆管理所申请注册登记。申请注册登记的,应当提交机动车所有人的身份证明、购车发票等机动车来历证明、机动车整车出厂合格证明或者进口机动车进口凭证。", + }, + { + "id": "chunk-004", + "score": 0.75, + "preview": "驾驶电动自行车上道路行驶,应当佩戴安全头盔,遵守道路交通安全法律法规...", + "doc_name": "道路交通安全法", + "clause": "第76条", + "content": "驾驶电动自行车上道路行驶,应当佩戴安全头盔,遵守道路交通安全法律法规。电动自行车不得逆向行驶,不得在机动车道内行驶,最高车速不得超过规定的限速。", + }, + { + "id": "chunk-005", + "score": 0.68, + "preview": "电动汽车动力电池安全要求:电池系统发生热失控后,应在5分钟内不起火不爆炸...", + "doc_name": "GB 38031-2020", + "clause": "第7条", + "content": "电动汽车动力电池安全要求(GB 38031-2020):电池系统发生热失控后,应在5分钟内不起火不爆炸,为乘员预留逃生时间。电池包需通过针刺、过充、短路等安全测试。", + }, +] + +# 预设RAG问答答案模板(按关键词匹配) +MOCK_RAG_ANSWERS: Dict[str, Dict[str, Any]] = { + "电动自行车": { + "text": "根据《道路交通安全法》及相关规范,电动自行车上路需满足以下条件:\n\n1. 符合国家标准 GB17761-2018\n2. 经公安机关交通管理部门登记\n3. 最高设计车速不超过 25km/h\n4. 整车质量不超过 55kg\n5. 具有脚踏骑行能力\n6. 蓄电池标称电压不超过 48V\n\n行驶时还需佩戴安全头盔,不得逆向行驶或在机动车道内行驶。", + "retrieval_ids": ["chunk-001", "chunk-002", "chunk-004"], + }, + "驾驶证": { + "text": "驾驶证申请流程如下:\n\n1. 到驾校报名并参加培训\n2. 通过科目一(理论考试)\n3. 通过科目二(场地驾驶技能考试)\n4. 通过科目三(道路驾驶技能考试)\n5. 通过科目四(安全文明驾驶常识考试)\n6. 领取驾驶证\n\n初次申领需到住所地车辆管理所申请注册登记。", + "retrieval_ids": ["chunk-003"], + }, + "超速": { + "text": "超速处罚标准(根据《道路交通安全法》):\n\n- 超速10%以下:警告\n- 超速10%-20%:罚款50-200元\n- 超速20%-50%:罚款200-500元,记3-6分\n- 超速50%以上:罚款500-2000元,记12分,可吊销驾驶证\n\n机动车驾驶人违反道路交通安全法律、法规将处警告或二十元以上二百元以下罚款。", + "retrieval_ids": ["chunk-001"], + }, + "年检": { + "text": "车辆年检规定:\n\n- 小型私家车:6年内免检(每2年申领标志),6-10年每2年检验,10年以上每年检验\n- 车辆需携带行驶证、交强险保单\n- 检验项目:灯光、制动、排放等\n\n机动车所有人的住所迁出车辆管理所管辖区域的,需在登记证书上签注变更事项。", + "retrieval_ids": ["chunk-003"], + }, + "电池": { + "text": "电动汽车电池安全标准(GB 38031-2020):\n\n1. 热失控要求:电池系统发生热失控后,应在5分钟内不起火不爆炸,为乘员预留逃生时间\n2. 电池包需通过针刺、过充、短路等安全测试\n3. 充电系统应具备过充保护功能,当电池SOC达到100%时应自动停止充电\n4. 充电接口应符合GB/T 18487.1标准要求\n\n以上要求确保电动汽车的整车安全性。", + "retrieval_ids": ["chunk-005"], + }, + "碰撞": { + "text": "正面碰撞测试要求(C-NCAP管理规则):\n\n1. 正面100%重叠刚性壁障碰撞试验\n2. 碰撞速度:50km/h\n3. 试验后要求:\n - 车门应能打开\n - 燃油系统无泄漏\n - 座椅及安全带功能正常\n\n此测试用于评估车辆在正面碰撞事故中对乘员的保护能力。", + "retrieval_ids": [], + }, + "AEB": { + "text": "AEB(自动紧急制动系统)测试标准:\n\n1. 系统应在检测到前方障碍物时主动减速或停车\n2. 测试场景分为三种:\n - 目标车静止\n - 目标车移动\n - 目标车制动\n3. AEB功能是C-NCAP评分的重要加分项\n\n该系统对提升车辆主动安全性能具有重要意义。", + "retrieval_ids": [], + }, + "高速公路": { + "text": "高速公路安全距离规定:\n\n1. 车速超过100km/h时,与同车道前车保持100米以上距离\n2. 车速低于100km/h时,距离可适当缩短\n3. 执行紧急任务的警车、消防车、救护车、工程救险车不受行驶速度限制\n\n保持安全距离是预防追尾事故的关键措施。", + "retrieval_ids": [], + }, +} + +# 预设合规分析结果 +MOCK_COMPLIANCE_RESULT: Dict[str, Any] = { + "task_id": "task-001", + "dashboard": { + "score": 78, + "high_risk_count": 2, + "medium_risk_count": 1, + "low_risk_count": 0, + "need_fix_segments": 3, + "status": "warning", + "status_label": "需优化", + }, + "segments": [ + { + "id": 1, + "index": 1, + "intent": "车身结构设计", + "start_pos": 45, + "end_pos": 230, + "content": "车身采用高强度钢铝混合结构,A柱和B柱使用热成型钢板,厚度2.5mm。车顶结构设计满足GB 26112-2010抗压强度要求,正面碰撞能量吸收区域采用渐进式变形设计,确保碰撞时能量有效分散。", + "risk_level": "high", + "regulations": [ + { + "id": 1, + "name": "GB 26112-2010", + "clause": "第4.2条", + "score": 0.95, + "match_keyword": "车顶抗压强度", + "category": "high", + "full_content": "车顶结构应能承受相当于车辆整备质量1.5倍的载荷,载荷分布应均匀,试验后车顶变形量不超过规定值。", + }, + { + "id": 2, + "name": "C-NCAP管理规则", + "clause": "第3.1条", + "score": 0.88, + "match_keyword": "正面碰撞", + "category": "high", + "full_content": "正面碰撞试验速度为50km/h,碰撞后车门应能打开,燃油系统无泄漏,座椅及安全带功能正常。", + }, + { + "id": 3, + "name": "GB 11551-2014", + "clause": "第5条", + "score": 0.72, + "match_keyword": "碰撞能量吸收", + "category": "medium", + "full_content": "车辆正面碰撞时应有效保护乘员,碰撞能量应通过车身结构合理分散。", + }, + { + "id": 4, + "name": "机动车安全技术条件", + "clause": "第12条", + "score": 0.58, + "match_keyword": "A柱强度", + "category": "medium", + "full_content": "A柱应具备足够的抗变形能力,材料强度应符合相关标准要求。", + }, + ], + }, + { + "id": 2, + "index": 2, + "intent": "动力系统配置", + "start_pos": 298, + "end_pos": 425, + "content": "搭载永磁同步电机,最大功率150kW,峰值扭矩310Nm。电池组采用三元锂离子电池,容量75kWh,能量密度180Wh/kg。充电接口支持快充(30分钟充至80%)和慢充(8小时充满),符合GB/T 18487.1-2015标准。", + "risk_level": "medium", + "regulations": [ + { + "id": 5, + "name": "GB/T 18487.1-2015", + "clause": "第6条", + "score": 0.94, + "match_keyword": "充电接口标准", + "category": "high", + "full_content": "电动汽车传导充电接口应符合GB/T 18487.1标准要求,充电系统应具备过充保护功能。", + }, + { + "id": 6, + "name": "GB/T 31484-2015", + "clause": "第4条", + "score": 0.85, + "match_keyword": "电池能量密度", + "category": "high", + "full_content": "动力电池能量密度不低于120Wh/kg,电池系统需通过热失控测试。", + }, + { + "id": 7, + "name": "新能源汽车生产企业准入", + "clause": "第8条", + "score": 0.65, + "match_keyword": "电机功率", + "category": "medium", + "full_content": "驱动电机应符合相关技术标准,功率参数应在规定范围内。", + }, + { + "id": 8, + "name": "电动汽车安全要求", + "clause": "第7条", + "score": 0.45, + "match_keyword": "充电时间", + "category": "low", + "full_content": "充电系统应具备过充保护功能,当电池SOC达到100%时应自动停止充电。", + }, + ], + }, + { + "id": 3, + "index": 3, + "intent": "安全配置设计", + "start_pos": 570, + "end_pos": 725, + "content": "配备6个安全气囊(前排双气囊、侧气囊、侧气帘),采用预紧式安全带。ABS系统采用博世第9代ESP,具备碰撞预警功能(FCW)和自动紧急制动(AEB)。方向盘集成驾驶员疲劳监测摄像头。", + "risk_level": "low", + "regulations": [ + { + "id": 9, + "name": "GB 27887-2011", + "clause": "第5条", + "score": 0.92, + "match_keyword": "安全气囊", + "category": "high", + "full_content": "乘用车应配备驾驶员和乘客安全气囊,气囊系统应符合相关技术标准。", + }, + { + "id": 10, + "name": "GB/T 26991-2011", + "clause": "第3条", + "score": 0.78, + "match_keyword": "ABS系统", + "category": "medium", + "full_content": "车辆应配备防抱死制动系统,系统性能应符合相关标准要求。", + }, + { + "id": 11, + "name": "C-NCAP管理规则", + "clause": "第4.2条", + "score": 0.71, + "match_keyword": "AEB自动制动", + "category": "medium", + "full_content": "主动安全配置评分包含AEB功能,AEB系统应能有效检测障碍物并主动减速。", + }, + { + "id": 12, + "name": "机动车运行安全技术条件", + "clause": "第15条", + "score": 0.38, + "match_keyword": "疲劳监测", + "category": "low", + "full_content": "建议配备驾驶员状态监测系统,及时发现驾驶员疲劳或分心状态。", + }, + ], + }, + ], + "priority_actions": [ + { + "regulation": "GB 26112-2010 第4.2条", + "issue": "缺少车顶抗压强度测试数据", + "suggestion": "补充车顶抗压强度具体测试数据,确保满足1.5倍整备质量载荷要求", + "severity": "high", + }, + { + "regulation": "GB/T 31484-2015 第4条", + "issue": "缺少电池热失控测试报告", + "suggestion": "补充电池热失控测试报告,验证5分钟内不起火不爆炸", + "severity": "high", + }, + { + "regulation": "C-NCAP管理规则 第3.1条", + "issue": "缺少碰撞后车门开启性能数据", + "suggestion": "提供碰撞后车门开启性能测试数据", + "severity": "medium", + }, + ], +} + +# 预设合规对话响应模板 +MOCK_COMPLIANCE_CHAT_RESPONSES: Dict[str, Dict[str, str]] = { + "车身结构设计": { + "compliance": "根据当前分析,车身结构设计部分存在以下合规问题:\n\n1. GB 26112-2010要求车顶承受1.5倍整备质量载荷,目前设计声明满足要求但缺少测试数据\n2. C-NCAP正面碰撞后车门应能打开,需提供碰撞测试报告\n\n建议补充相关测试数据以提升合规评分。", + "interpretation": "GB 26112-2010 第4.2条具体要求解读:\n\n车顶抗压强度测试是车辆被动安全的重要指标。该标准要求车顶结构能够承受相当于车辆整备质量1.5倍的均匀分布载荷,试验后车顶变形量不得超过规定限值。\n\n热成型钢板(22MnB5材料)抗拉强度约1500-1650 MPa,理论上能满足要求,但需通过实际测试验证。", + "suggestion": "针对车身结构设计的修改建议:\n\n1. 补充车顶抗压强度测试报告\n2. 提供A柱材料认证证书\n3. 完善正面碰撞能量吸收设计说明\n4. 添加碰撞后车门开启性能数据\n\n这些补充材料可有效提升合规评分。", + }, + "动力系统配置": { + "compliance": "动力系统配置整体合规性良好,主要检查点:\n\n1. 电池能量密度180Wh/kg超过最低要求120Wh/kg ✓\n2. 充电接口符合GB/T 18487.1标准 ✓\n3. 快充30分钟充至80%符合行业标准 ✓\n\n需补充电池热失控测试报告。", + "interpretation": "GB/T 31484-2015对动力电池的要求解读:\n\n1. 能量密度:不低于120Wh/kg(您的设计180Wh/kg满足要求)\n2. 循环寿命:不少于1000次循环后容量保持率≥80%\n3. 安全测试:需通过针刺、过充、短路等测试\n\n建议补充循环寿命测试数据。", + "suggestion": "动力系统配置改进建议:\n\n1. 补充电池热失控测试报告\n2. 提供循环寿命测试数据\n3. 添加充电系统过充保护功能说明\n4. 完善电池管理系统(BMS)技术文档", + }, + "安全配置设计": { + "compliance": "安全配置设计合规性评估:\n\n1. 安全气囊配置满足GB 27887-2011要求 ✓\n2. ABS/ESP系统符合标准 ✓\n3. AEB功能是C-NCAP加分项 ✓\n\n驾驶员疲劳监测是建议配置,不强制要求。", + "interpretation": "C-NCAP主动安全评分规则解读:\n\nAEB(自动紧急制动)系统是C-NCAP评分的重要加分项,最高可获得额外加分。测试场景包括:\n- 目标车静止场景\n- 目标车移动场景\n- 目标车制动场景\n\n建议完善AEB系统测试数据以获取更高评分。", + "suggestion": "安全配置优化建议:\n\n1. 提供AEB系统测试数据\n2. 补充FCW预警功能测试报告\n3. 添加安全气囊展开时间数据\n4. 完善驾驶员疲劳监测系统说明(如有)", + }, +} + +# 预设系统统计数据 +MOCK_SYSTEM_STATS: Dict[str, int] = { + "docs": 5, + "chunks": 510, + "vectors": 510, + "segments": 0, +} + +# 预设系统配置 +MOCK_SYSTEM_CONFIG: Dict[str, Any] = { + "llm": { + "model": "qwen-max", + }, + "embedding": { + "model": "text-embedding-v3", + "dimension": 1536, + }, + "milvus": { + "host": "localhost", + "port": 19530, + }, + "retrieval": { + "vector_top_k": 10, + "final_top_k": 5, + }, +} + + +def get_mock_documents() -> List[Dict[str, Any]]: + """获取预设法规文档列表""" + return MOCK_DOCUMENTS + + +def get_mock_quick_questions() -> List[Dict[str, str]]: + """获取预设快捷问题""" + return MOCK_QUICK_QUESTIONS + + +def get_mock_retrieval(query: str, top_k: int = 5) -> List[Dict[str, Any]]: + """根据查询关键词返回预设检索结果""" + results = [] + for keyword, data in MOCK_RAG_ANSWERS.items(): + if keyword in query: + for retrieval_id in data.get("retrieval_ids", []): + for item in MOCK_RETRIEVAL_RESULTS: + if item["id"] == retrieval_id: + results.append({ + "id": item["id"], + "score": item["score"], + "preview": item["preview"], + "doc_name": item["doc_name"], + "clause": item["clause"], + }) + break + if not results: + results = MOCK_RETRIEVAL_RESULTS[:top_k] + return results[:top_k] + + +def get_mock_rag_answer(query: str) -> str: + """根据查询关键词返回预设答案""" + for keyword, data in MOCK_RAG_ANSWERS.items(): + if keyword in query: + return data["text"] + return "抱歉,暂未找到与您问题直接相关的法规内容。请尝试更具体的问题,或联系交通管理部门获取详细信息。\n\n您可以尝试询问:电动自行车、驾驶证、超速处罚、年检、电池安全、碰撞测试、AEB系统、高速公路规则等话题。" + + +def get_mock_compliance_result(task_id: str) -> Dict[str, Any]: + """获取预设合规分析结果""" + result = MOCK_COMPLIANCE_RESULT.copy() + result["task_id"] = task_id + return result + + +def get_mock_compliance_chat_response(intent: str, query: str) -> str: + """获取预设合规对话响应""" + responses = MOCK_COMPLIANCE_CHAT_RESPONSES.get(intent, {}) + if "合规" in query or "符合" in query: + return responses.get("compliance", "根据相关法规分析,该段落的合规性需进一步评估。") + elif "解读" in query or "什么" in query or "如何" in query: + return responses.get("interpretation", "法规要求详细解读如下...") + elif "修改" in query or "建议" in query or "完善" in query: + return responses.get("suggestion", "建议进行以下修改以提升合规性...") + return f"关于您的问题,{intent}部分涉及多条相关法规。您可以进一步询问合规性评估或修改建议。" + + +def generate_task_id() -> str: + """生成任务ID""" + return f"task-{uuid.uuid4().hex[:8]}" + + +def generate_doc_id() -> str: + """生成文档ID""" + return f"doc-{uuid.uuid4().hex[:8]}" \ No newline at end of file diff --git a/app/utils/__init__.py b/app/utils/__init__.py new file mode 100644 index 0000000..44884e4 --- /dev/null +++ b/app/utils/__init__.py @@ -0,0 +1,4 @@ +from .chunking import TextChunker, chunker +from .logger import logger, setup_logging + +__all__ = ["TextChunker", "chunker", "logger", "setup_logging"] \ No newline at end of file diff --git a/app/utils/chunking.py b/app/utils/chunking.py new file mode 100644 index 0000000..6fe1d87 --- /dev/null +++ b/app/utils/chunking.py @@ -0,0 +1,78 @@ +import re +from typing import List +from app.core.config import settings + + +class TextChunker: + def __init__( + self, + chunk_size: int = settings.chunk_size, + chunk_overlap: int = settings.chunk_overlap, + ): + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + + def chunk_by_clause(self, text: str) -> List[dict]: + """按条款边界分块(适用于法规文档)""" + clause_pattern = r"(第[一二三四五六七八九十百]+条)" + parts = re.split(clause_pattern, text) + + chunks = [] + current_clause = None + current_text = "" + chunk_index = 0 + + for part in parts: + if re.match(clause_pattern, part): + if current_clause and current_text.strip(): + chunks.append({ + "clause_id": current_clause, + "content": current_text.strip(), + "chunk_index": chunk_index, + }) + chunk_index += 1 + current_clause = part + current_text = "" + else: + current_text += part + + if current_clause and current_text.strip(): + chunks.append({ + "clause_id": current_clause, + "content": current_text.strip(), + "chunk_index": chunk_index, + }) + + return chunks + + def chunk_by_size(self, text: str) -> List[dict]: + """按固定大小分块""" + chunks = [] + start = 0 + chunk_index = 0 + + while start < len(text): + end = start + self.chunk_size + chunk_text = text[start:end] + + if chunk_text.strip(): + chunks.append({ + "content": chunk_text.strip(), + "chunk_index": chunk_index, + "start_pos": start, + "end_pos": end, + }) + chunk_index += 1 + + start = end - self.chunk_overlap + + return chunks + + def estimate_tokens(self, text: str) -> int: + """估算token数量""" + chinese_chars = len(re.findall(r"[^\x00-\xff]", text)) + english_chars = len(text) - chinese_chars + return int(chinese_chars / 1.5 + english_chars / 4) + + +chunker = TextChunker() \ No newline at end of file diff --git a/app/utils/logger.py b/app/utils/logger.py new file mode 100644 index 0000000..366ca57 --- /dev/null +++ b/app/utils/logger.py @@ -0,0 +1,24 @@ +import logging +import sys + + +def setup_logging() -> logging.Logger: + """配置日志""" + logger = logging.getLogger("app") + logger.setLevel(logging.INFO) + + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(logging.INFO) + + formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + handler.setFormatter(formatter) + + logger.addHandler(handler) + + return logger + + +logger = setup_logging() \ No newline at end of file diff --git a/app/workflows/__init__.py b/app/workflows/__init__.py new file mode 100644 index 0000000..107f16c --- /dev/null +++ b/app/workflows/__init__.py @@ -0,0 +1,12 @@ +from .rag_workflow import RagState, rag_workflow, run_rag_workflow, stream_rag_workflow +from .compliance_workflow import ComplianceState, compliance_workflow, run_compliance_workflow + +__all__ = [ + "RagState", + "rag_workflow", + "run_rag_workflow", + "stream_rag_workflow", + "ComplianceState", + "compliance_workflow", + "run_compliance_workflow", +] \ No newline at end of file diff --git a/app/workflows/compliance_workflow.py b/app/workflows/compliance_workflow.py new file mode 100644 index 0000000..5cafc4a --- /dev/null +++ b/app/workflows/compliance_workflow.py @@ -0,0 +1,175 @@ +from typing import TypedDict, List +from langgraph.graph import StateGraph, END + + +class ComplianceState(TypedDict): + document_path: str + raw_text: str + segments: List[dict] + matched_regulations: List[dict] + risk_dashboard: dict + priority_actions: List[dict] + + +def parse_document(state: ComplianceState) -> dict: + """解析文档""" + from app.services import get_document_service + doc_service = get_document_service( + "/airegulation/demo-mao/backend/data/raw", + "/airegulation/demo-mao/backend/data/parsed", + ) + text = doc_service.parse_document(state["document_path"]) + return {"raw_text": text} + + +def segment_document(state: ComplianceState) -> dict: + """AI语义分段""" + from app.services import llm_service + prompt = f"""请分析以下设计方案文档,按照设计意图将其分成若干语义段落。 + +文档内容: +{state['raw_text'][:3000]} + +请输出JSON格式的分段结果,每个段落包含: +- intent: 段落意图/主题 +- startPos: 在原文中的起始位置(大致) +- endPos: 在原文中的结束位置(大致) +- keywords: 关键词列表 + +输出格式: +[{{"intent": "...", "startPos": 0, "endPos": 100, "keywords": [...]}}]""" + + # 简化处理:返回基本分段 + segments = [ + { + "id": 1, + "intent": "整体设计概述", + "content": state["raw_text"][:500], + "keywords": ["设计", "方案"], + } + ] + + return {"segments": segments} + + +def match_regulations(state: ComplianceState) -> dict: + """法规匹配""" + from app.services import embedding_service, milvus_service + matched = [] + + for segment in state["segments"]: + keyword_text = " ".join(segment.get("keywords", [])) + embedding = embedding_service.embed_single(keyword_text) + + docs = milvus_service.search(embedding, top_k=5) + + segment_regs = [] + for doc in docs: + category = "high" if doc["score"] > 0.85 else ("medium" if doc["score"] > 0.6 else "low") + segment_regs.append({ + "id": doc["id"], + "name": doc["doc_name"], + "clause": doc.get("clause_id"), + "score": doc["score"], + "match_keyword": keyword_text, + "category": category, + "full_content": doc["content"], + }) + + segment["regulations"] = segment_regs + matched.append(segment) + + return {"matched_regulations": matched} + + +def calculate_risk(state: ComplianceState) -> dict: + """计算风险等级""" + segments = state["matched_regulations"] + + high_count = 0 + medium_count = 0 + low_count = 0 + need_fix = 0 + total_score = 0 + + for segment in segments: + regs = segment.get("regulations", []) + high_regs = [r for r in regs if r["category"] == "high"] + + if high_regs: + avg_score = sum(r["score"] for r in high_regs) / len(high_regs) + if avg_score < 0.9: + segment["risk_level"] = "high" + high_count += 1 + need_fix += 1 + elif avg_score < 0.92: + segment["risk_level"] = "medium" + medium_count += 1 + else: + segment["risk_level"] = "low" + low_count += 1 + else: + segment["risk_level"] = "low" + low_count += 1 + + total_score += avg_score if high_regs else 100 + + avg_score = total_score / len(segments) if segments else 100 + + status = "pass" if avg_score >= 90 else ("warning" if avg_score >= 70 else "fail") + status_label = "合规" if status == "pass" else ("需要修改" if status == "warning" else "高风险") + + dashboard = { + "score": avg_score, + "high_risk_count": high_count, + "medium_risk_count": medium_count, + "low_risk_count": low_count, + "need_fix_segments": need_fix, + "status": status, + "status_label": status_label, + } + + return {"risk_dashboard": dashboard, "segments": segments} + + +def generate_suggestions(state: ComplianceState) -> dict: + """生成优先建议""" + actions = [] + + for segment in state["segments"]: + for reg in segment.get("regulations", []): + if reg["category"] == "high" and reg["score"] < 0.9: + actions.append({ + "regulation": reg["name"], + "issue": reg["match_keyword"], + "suggestion": f"建议对照{reg['name']}第{reg.get('clause', '')}条进行修改", + "severity": "high", + }) + + return {"priority_actions": actions} + + +# 构建工作流图 +compliance_graph = StateGraph(ComplianceState) + +compliance_graph.add_node("parse", parse_document) +compliance_graph.add_node("segment", segment_document) +compliance_graph.add_node("match", match_regulations) +compliance_graph.add_node("score", calculate_risk) +compliance_graph.add_node("suggest", generate_suggestions) + +compliance_graph.set_entry_point("parse") +compliance_graph.add_edge("parse", "segment") +compliance_graph.add_edge("segment", "match") +compliance_graph.add_edge("match", "score") +compliance_graph.add_edge("score", "suggest") +compliance_graph.add_edge("suggest", END) + +compliance_workflow = compliance_graph.compile() + + +async def run_compliance_workflow(document_path: str) -> ComplianceState: + """运行合规分析工作流""" + initial_state: ComplianceState = {"document_path": document_path} + result = compliance_workflow.invoke(initial_state) + return result \ No newline at end of file diff --git a/app/workflows/rag_workflow.py b/app/workflows/rag_workflow.py new file mode 100644 index 0000000..370d2ce --- /dev/null +++ b/app/workflows/rag_workflow.py @@ -0,0 +1,114 @@ +from typing import TypedDict, List +from langgraph.graph import StateGraph, END + + +class RagState(TypedDict): + query: str + query_embedding: List[float] + retrieved_docs: List[dict] + context: str + answer: str + sources: List[dict] + + +def embed_query(state: RagState) -> dict: + """将查询转为向量""" + from app.services import embedding_service + embedding = embedding_service.embed_single(state["query"]) + return {"query_embedding": embedding} + + +def retrieve_docs(state: RagState) -> dict: + """向量检索""" + from app.services import milvus_service + from app.core.config import settings + docs = milvus_service.search( + state["query_embedding"], + top_k=settings.vector_top_k, + ) + return {"retrieved_docs": docs[:settings.final_top_k]} + + +def build_context(state: RagState) -> dict: + """构建上下文""" + context_parts = [] + sources = [] + + for doc in state["retrieved_docs"]: + context_parts.append(f"【{doc['doc_name']} - {doc.get('clause_id', '')}】\n{doc['content']}") + sources.append({ + "name": doc["doc_name"], + "clause": doc.get("clause_id"), + }) + + context = "\n\n".join(context_parts) + return {"context": context, "sources": sources} + + +def generate_answer(state: RagState) -> dict: + """生成答案""" + from app.services import llm_service + prompt = f"""请根据以下法规内容回答用户问题,并在回答中标注引用的法规条款。 + +法规内容: +{state['context']} + +用户问题:{state['query']} + +请给出准确、简洁的回答,并引用相关法规条款。""" + + answer = "" + for chunk in llm_service.generate_stream(prompt): + answer += chunk + + return {"answer": answer} + + +# 构建工作流图 +rag_graph = StateGraph(RagState) + +rag_graph.add_node("embed", embed_query) +rag_graph.add_node("retrieve", retrieve_docs) +rag_graph.add_node("build_context", build_context) +rag_graph.add_node("generate", generate_answer) + +rag_graph.set_entry_point("embed") +rag_graph.add_edge("embed", "retrieve") +rag_graph.add_edge("retrieve", "build_context") +rag_graph.add_edge("build_context", "generate") +rag_graph.add_edge("generate", END) + +rag_workflow = rag_graph.compile() + + +async def run_rag_workflow(query: str) -> RagState: + """运行RAG工作流""" + initial_state: RagState = {"query": query} + result = rag_workflow.invoke(initial_state) + return result + + +def stream_rag_workflow(query: str): + """流式运行RAG工作流""" + from app.services import llm_service + + # 先完成检索阶段 + state: RagState = {"query": query} + state.update(embed_query(state)) + state.update(retrieve_docs(state)) + state.update(build_context(state)) + + # 流式生成阶段 + prompt = f"""请根据以下法规内容回答用户问题,并在回答中标注引用的法规条款。 + +法规内容: +{state['context']} + +用户问题:{state['query']} + +请给出准确、简洁的回答,并引用相关法规条款。""" + + for chunk in llm_service.generate_stream(prompt): + yield {"type": "chunk", "text": chunk} + + yield {"type": "done", "sources": state["sources"]} \ No newline at end of file diff --git a/data/raw/compliance_task-32e64724_test_doc.txt b/data/raw/compliance_task-32e64724_test_doc.txt new file mode 100644 index 0000000..d670460 --- /dev/null +++ b/data/raw/compliance_task-32e64724_test_doc.txt @@ -0,0 +1 @@ +test content diff --git a/data/raw/doc-3b47abd7_requirement.txt b/data/raw/doc-3b47abd7_requirement.txt new file mode 100644 index 0000000..e51d350 --- /dev/null +++ b/data/raw/doc-3b47abd7_requirement.txt @@ -0,0 +1,2 @@ +apache-flink==1.13.2 +PyMySQL>=1.1.0 diff --git a/data/raw/doc-9b01a78a_requirement.txt b/data/raw/doc-9b01a78a_requirement.txt new file mode 100644 index 0000000..e51d350 --- /dev/null +++ b/data/raw/doc-9b01a78a_requirement.txt @@ -0,0 +1,2 @@ +apache-flink==1.13.2 +PyMySQL>=1.1.0 diff --git a/main.py b/main.py new file mode 100644 index 0000000..997a4ad --- /dev/null +++ b/main.py @@ -0,0 +1,6 @@ +def main(): + print("Hello from backend!") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..cb109bf --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 @@ +[project] +name = "backend" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.9" +dependencies = [] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..502ea6c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,34 @@ +# Web框架 +fastapi>=0.110.0 +uvicorn>=0.27.0 + +# LangGraph & LangChain +langgraph>=0.0.40 +langchain>=0.2.0 +langchain-community>=0.2.0 + +# DashScope +dashscope>=1.14.0 + +# Milvus +pymilvus>=2.3.0 + +# 文档解析 +pypdf2>=3.0.0 +python-docx>=1.1.0 +pdfplumber>=0.10.0 + +# Pydantic配置 +pydantic>=2.0.0 +pydantic-settings>=2.0.0 + +# 工具 +python-multipart>=0.0.9 +sse-starlette>=1.8.0 +python-dotenv>=1.0.0 +tiktoken>=0.5.0 +httpx>=0.25.0 + +# 测试 +pytest>=7.4.0 +pytest-asyncio>=0.21.0 \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..f069c7c --- /dev/null +++ b/uv.lock @@ -0,0 +1,8 @@ +version = 1 +revision = 3 +requires-python = ">=3.9" + +[[package]] +name = "backend" +version = "0.1.0" +source = { virtual = "." }