初始化

This commit is contained in:
2026-05-11 11:22:55 +08:00
parent 5f6c571434
commit 80dcd070f7
39 changed files with 1997 additions and 0 deletions

52
.env Normal file
View File

@@ -0,0 +1,52 @@
# DashScope API
DASHSCOPE_API_KEY=your_api_key_here
# PostgreSQL
POSTGRES_HOST=localhost
POSTGRES_PORT=5432
POSTGRES_USER=postgresql
POSTGRES_PASSWORD=postgresql123456
POSTGRES_DB=mydb
# Redis
REDIS_HOST=localhost
REDIS_PORT=6379
REDIS_PASSWORD=redis@123
# Milvus
MILVUS_HOST=localhost
MILVUS_PORT=19530
# MinIO
MINIO_ENDPOINT=localhost:9000
MINIO_ACCESS_KEY=minioadmin
MINIO_SECRET_KEY=minioadmin
# Neo4j
NEO4J_URI=bolt://localhost:7687
NEO4J_USER=neo4j
NEO4J_PASSWORD=neo4j123
# RabbitMQ
RABBITMQ_HOST=localhost
RABBITMQ_PORT=5672
RABBITMQ_USER=admin
RABBITMQ_PASSWORD=admin@123
# LLM配置
LLM_MODEL=qwen-max
EMBEDDING_MODEL=text-embedding-v3
EMBEDDING_DIM=1536
# 检索配置
VECTOR_TOP_K=10
BM25_TOP_K=10
FINAL_TOP_K=5
# 分块配置
CHUNK_SIZE=800
CHUNK_OVERLAP=50
# 服务配置
API_HOST=0.0.0.0
API_PORT=8000

24
.env.example Normal file
View File

@@ -0,0 +1,24 @@
# DashScope API
DASHSCOPE_API_KEY=your_api_key_here
# Milvus
MILVUS_HOST=localhost
MILVUS_PORT=19530
# LLM配置
LLM_MODEL=qwen-max
EMBEDDING_MODEL=text-embedding-v3
EMBEDDING_DIM=1536
# 检索配置
VECTOR_TOP_K=10
BM25_TOP_K=10
FINAL_TOP_K=5
# 分块配置
CHUNK_SIZE=800
CHUNK_OVERLAP=50
# 服务配置
API_HOST=0.0.0.0
API_PORT=8000

10
.gitignore vendored Normal file
View File

@@ -0,0 +1,10 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info
# Virtual environments
.venv

1
.python-version Normal file
View File

@@ -0,0 +1 @@
3.9

18
Dockerfile Normal file
View File

@@ -0,0 +1,18 @@
FROM python:3.11-slim
WORKDIR /app
# 安装依赖
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# 复制代码
COPY app/ ./app/
COPY data/ ./data/
# 环境变量
ENV API_HOST=0.0.0.0
ENV API_PORT=8000
# 启动命令
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

3
app/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from .main import app
__all__ = ["app"]

3
app/api/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from .routes import api_router
__all__ = ["api_router"]

View File

@@ -0,0 +1,13 @@
from fastapi import APIRouter
from .docs import router as docs_router
from .rag import router as rag_router
from .compliance import router as compliance_router
from .status import router as status_router
api_router = APIRouter()
api_router.include_router(docs_router)
api_router.include_router(rag_router)
api_router.include_router(compliance_router)
api_router.include_router(status_router)
__all__ = ["api_router"]

View File

@@ -0,0 +1,96 @@
from fastapi import APIRouter, UploadFile, File, HTTPException
from sse_starlette.sse import EventSourceResponse
import uuid
import os
import json
import asyncio
from app.schemas.compliance import (
AnalyzeResponse,
ComplianceChatRequest,
)
from app.services.mock_data import (
generate_task_id,
get_mock_compliance_result,
get_mock_compliance_chat_response,
)
router = APIRouter(prefix="/compliance", tags=["合规分析"])
# 临时存储分析任务
tasks_store: dict[str, dict] = {}
@router.post("/analyze", response_model=AnalyzeResponse)
async def analyze_document(file: UploadFile = File(...)):
"""上传设计方案进行分析"""
# 生成任务ID
task_id = generate_task_id()
# 保存文件
raw_dir = "/airegulation/demo-mao/backend/data/raw"
os.makedirs(raw_dir, exist_ok=True)
file_path = os.path.join(raw_dir, f"compliance_{task_id}_{file.filename}")
content = await file.read()
with open(file_path, "wb") as f:
f.write(content)
# 记录任务
tasks_store[task_id] = {
"task_id": task_id,
"file_path": file_path,
"status": "processing",
"result": None,
}
# 模拟异步处理完成(立即返回结果)
# 实际应用中这应该是后台任务
tasks_store[task_id]["status"] = "completed"
tasks_store[task_id]["result"] = get_mock_compliance_result(task_id)
return AnalyzeResponse(task_id=task_id)
@router.get("/result/{task_id}")
async def get_result(task_id: str):
"""获取分析结果"""
if task_id not in tasks_store:
# 如果任务ID不存在返回默认mock结果
return get_mock_compliance_result(task_id)
task = tasks_store[task_id]
if task["status"] == "processing":
return {"status": "processing", "message": "分析进行中"}
return task["result"]
@router.post("/chat/{segment_id}")
async def compliance_chat(segment_id: int, request: ComplianceChatRequest):
"""针对段落进行合规对话"""
# 根据segment_id获取对应的intent
intent_map = {
1: "车身结构设计",
2: "动力系统配置",
3: "安全配置设计",
}
intent = intent_map.get(segment_id, "车身结构设计")
async def generate():
# 获取预设响应
response = get_mock_compliance_chat_response(intent, request.query)
# 流式输出响应
sentences = response.split("\n\n")
for sentence in sentences:
if sentence.strip():
chunks = sentence.split("\n")
for chunk in chunks:
if chunk.strip():
await asyncio.sleep(0.05)
yield {"event": "message", "data": json.dumps({"type": "chunk", "text": chunk + "\n"})}
yield {"event": "message", "data": json.dumps({"type": "done"})}
return EventSourceResponse(generate())

115
app/api/routes/docs.py Normal file
View File

@@ -0,0 +1,115 @@
from fastapi import APIRouter, UploadFile, File, HTTPException
import os
import uuid
from datetime import datetime
from app.schemas.doc import (
DocumentUploadResponse,
DocumentListResponse,
DocumentInfo,
ParseResponse,
EmbedResponse,
)
from app.services.mock_data import get_mock_documents, generate_doc_id
router = APIRouter(prefix="/docs", tags=["文档管理"])
# 临时存储文档信息包含预设的mock文档
documents_store: dict[str, dict] = {}
# 初始化时加载mock文档
for doc in get_mock_documents():
documents_store[doc["id"]] = doc
@router.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(file: UploadFile = File(...)):
"""上传法规文档"""
# 检查文件格式
allowed_ext = [".pdf", ".docx", ".doc", ".txt"]
ext = os.path.splitext(file.filename)[1].lower()
if ext not in allowed_ext:
raise HTTPException(400, f"Unsupported file format: {ext}")
# 生成文档ID
doc_id = generate_doc_id()
# 保存文件
raw_dir = "/airegulation/demo-mao/backend/data/raw"
os.makedirs(raw_dir, exist_ok=True)
file_path = os.path.join(raw_dir, f"{doc_id}_{file.filename}")
content = await file.read()
with open(file_path, "wb") as f:
f.write(content)
# 记录文档信息
documents_store[doc_id] = {
"id": doc_id,
"name": file.filename,
"path": file_path,
"size": len(content),
"status": "uploaded",
"chunks": 0,
"created_at": datetime.now(),
}
return DocumentUploadResponse(
doc_id=doc_id,
filename=file.filename,
size=len(content),
)
@router.get("/list", response_model=DocumentListResponse)
async def list_documents():
"""获取已索引文档列表"""
docs = [
DocumentInfo(
id=d["id"],
name=d["name"],
chunks=d["chunks"],
status=d["status"],
created_at=d.get("created_at"),
)
for d in documents_store.values()
]
return DocumentListResponse(docs=docs)
@router.post("/parse/{doc_id}", response_model=ParseResponse)
async def parse_document(doc_id: str):
"""解析文档并分块"""
if doc_id not in documents_store:
raise HTTPException(404, "Document not found")
doc = documents_store[doc_id]
# 模拟解析逻辑
doc["status"] = "parsed"
# 根据文件大小计算chunks数量
file_size = doc.get("size", 100000)
doc["chunks"] = max(20, file_size // 8000)
return ParseResponse(doc_id=doc_id, chunks=doc["chunks"])
@router.post("/embed/{doc_id}", response_model=EmbedResponse)
async def embed_document(doc_id: str):
"""嵌入并存入向量库"""
if doc_id not in documents_store:
raise HTTPException(404, "Document not found")
doc = documents_store[doc_id]
# 模拟嵌入逻辑
doc["status"] = "indexed"
return EmbedResponse(doc_id=doc_id, vectors=doc["chunks"])
@router.delete("/delete/{doc_id}")
async def delete_document(doc_id: str):
"""删除文档"""
if doc_id not in documents_store:
raise HTTPException(404, "Document not found")
del documents_store[doc_id]
return {"success": True}

74
app/api/routes/rag.py Normal file
View File

@@ -0,0 +1,74 @@
from fastapi import APIRouter
from sse_starlette.sse import EventSourceResponse
from app.schemas.rag import RagChatRequest, QuickQuestionsResponse, QuickQuestion
from app.services.mock_data import (
get_mock_quick_questions,
get_mock_retrieval,
get_mock_rag_answer,
)
import json
import asyncio
router = APIRouter(prefix="/rag", tags=["RAG问答"])
@router.post("/chat")
async def rag_chat(request: RagChatRequest):
"""SSE流式问答"""
async def generate():
# 发送检索开始事件
yield {"event": "message", "data": json.dumps({"type": "retrieving"})}
# 模拟检索延迟
await asyncio.sleep(0.3)
# 执行检索
docs = get_mock_retrieval(request.query, top_k=request.top_k)
retrieved_data = [
{
"id": d["id"],
"score": d["score"],
"preview": d["preview"],
"doc_name": d.get("doc_name", ""),
"clause": d.get("clause", ""),
}
for d in docs
]
yield {"event": "message", "data": json.dumps({"type": "retrieved", "docs": retrieved_data})}
# 发送生成开始事件
yield {"event": "message", "data": json.dumps({"type": "generating", "text": "正在生成答案..."})}
# 模拟生成延迟
await asyncio.sleep(0.2)
# 获取预设答案
answer = get_mock_rag_answer(request.query)
# 流式输出答案(按句子分割)
sentences = answer.split("\n\n")
for sentence in sentences:
if sentence.strip():
# 进一步分割长句子
chunks = sentence.split("\n")
for chunk in chunks:
if chunk.strip():
await asyncio.sleep(0.05) # 模拟生成延迟
yield {"event": "message", "data": json.dumps({"type": "chunk", "text": chunk + "\n"})}
# 发送完成事件
yield {"event": "message", "data": json.dumps({"type": "done"})}
return EventSourceResponse(generate())
@router.get("/quick-questions", response_model=QuickQuestionsResponse)
async def get_quick_questions():
"""获取预设快捷问题"""
questions = [
QuickQuestion(id=q["id"], question=q["question"], category=q["category"])
for q in get_mock_quick_questions()
]
return QuickQuestionsResponse(questions=questions)

28
app/api/routes/status.py Normal file
View File

@@ -0,0 +1,28 @@
from fastapi import APIRouter
from app.core.config import settings
from app.services.mock_data import MOCK_SYSTEM_STATS, MOCK_SYSTEM_CONFIG
router = APIRouter(prefix="/status", tags=["系统状态"])
@router.get("/stats")
async def get_stats():
"""获取系统统计"""
# 返回预设统计数据
return MOCK_SYSTEM_STATS
@router.get("/config")
async def get_config():
"""获取当前配置"""
return MOCK_SYSTEM_CONFIG
@router.get("/milvus/health")
async def milvus_health():
"""Milvus健康检查"""
# 模拟连接状态(假数据模式下始终返回连接成功)
return {
"connected": True,
"collections": ["vehicle_regulations"],
}

3
app/core/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from .config import settings, Settings
__all__ = ["settings", "Settings"]

41
app/core/config.py Normal file
View File

@@ -0,0 +1,41 @@
from pydantic_settings import BaseSettings
from typing import Optional
class Settings(BaseSettings):
# DashScope API
dashscope_api_key: str = ""
# Milvus
milvus_host: str = "localhost"
milvus_port: int = 19530
# LLM配置
llm_model: str = "qwen-max"
embedding_model: str = "text-embedding-v3"
embedding_dim: int = 1536
# 检索配置
vector_top_k: int = 10
bm25_top_k: int = 10
final_top_k: int = 5
# 分块配置
chunk_size: int = 800
chunk_overlap: int = 50
# 服务配置
api_host: str = "0.0.0.0"
api_port: int = 8000
# Collection名称
regulations_collection: str = "vehicle_regulations"
compliance_collection: str = "compliance_cache"
class Config:
env_file = ".env"
env_file_encoding = "utf-8"
case_sensitive = False
settings = Settings()

68
app/main.py Normal file
View File

@@ -0,0 +1,68 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.api.routes import api_router
from app.core.config import settings
from app.utils.logger import logger
from app.services import milvus_service
# 创建应用
app = FastAPI(
title="车辆法规智能检索系统",
description="基于RAG技术的法规检索与合规分析后端API",
version="1.0.0",
)
# CORS配置
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# 注册路由
app.include_router(api_router, prefix="/api")
@app.on_event("startup")
async def startup_event():
"""启动时初始化"""
logger.info("Starting application...")
# 初始化Milvus集合仅在服务可用时
try:
if milvus_service is not None:
milvus_service.create_regulations_collection()
logger.info("Milvus collection initialized")
else:
logger.warning("Milvus service not available, using mock data")
except Exception as e:
logger.warning(f"Milvus initialization failed: {e}")
@app.on_event("shutdown")
async def shutdown_event():
"""关闭时清理"""
logger.info("Shutting down application...")
try:
if milvus_service is not None:
milvus_service.disconnect()
except Exception as e:
logger.warning(f"Shutdown cleanup error: {e}")
@app.get("/")
async def root():
"""根路径"""
return {
"message": "车辆法规智能检索系统 API",
"version": "1.0.0",
"docs": "/docs",
}
@app.get("/health")
async def health():
"""健康检查"""
return {"status": "healthy"}

49
app/schemas/__init__.py Normal file
View File

@@ -0,0 +1,49 @@
from .doc import (
DocumentUploadResponse,
DocumentInfo,
DocumentListResponse,
ChunkInfo,
ParseResponse,
EmbedResponse,
)
from .rag import (
RagChatRequest,
RetrievedDoc,
SourceInfo,
QuickQuestion,
QuickQuestionsResponse,
)
from .compliance import (
RiskLevel,
ComplianceStatus,
Regulation,
ComplianceSegment,
RiskDashboard,
PriorityAction,
ComplianceResult,
ComplianceChatRequest,
AnalyzeResponse,
)
__all__ = [
"DocumentUploadResponse",
"DocumentInfo",
"DocumentListResponse",
"ChunkInfo",
"ParseResponse",
"EmbedResponse",
"RagChatRequest",
"RetrievedDoc",
"SourceInfo",
"QuickQuestion",
"QuickQuestionsResponse",
"RiskLevel",
"ComplianceStatus",
"Regulation",
"ComplianceSegment",
"RiskDashboard",
"PriorityAction",
"ComplianceResult",
"ComplianceChatRequest",
"AnalyzeResponse",
]

69
app/schemas/compliance.py Normal file
View File

@@ -0,0 +1,69 @@
from pydantic import BaseModel
from typing import Optional
from enum import Enum
class RiskLevel(str, Enum):
high = "high"
medium = "medium"
low = "low"
class ComplianceStatus(str, Enum):
pass_status = "pass"
warning = "warning"
fail = "fail"
class Regulation(BaseModel):
id: int
name: str
clause: Optional[str] = None
score: float
match_keyword: str
category: RiskLevel
full_content: str
class ComplianceSegment(BaseModel):
id: int
index: int
intent: str
start_pos: int
end_pos: int
content: str
risk_level: RiskLevel
regulations: list[Regulation]
class RiskDashboard(BaseModel):
score: float
high_risk_count: int
medium_risk_count: int
low_risk_count: int
need_fix_segments: int
status: ComplianceStatus
status_label: str
class PriorityAction(BaseModel):
regulation: str
issue: str
suggestion: str
severity: RiskLevel
class ComplianceResult(BaseModel):
task_id: str
dashboard: RiskDashboard
segments: list[ComplianceSegment]
priority_actions: list[PriorityAction]
class ComplianceChatRequest(BaseModel):
query: str
class AnalyzeResponse(BaseModel):
task_id: str
status: str = "processing"

44
app/schemas/doc.py Normal file
View File

@@ -0,0 +1,44 @@
from pydantic import BaseModel
from typing import Optional
from datetime import datetime
class DocumentUploadResponse(BaseModel):
doc_id: str
filename: str
size: int
status: str = "uploaded"
class DocumentInfo(BaseModel):
id: str
name: str
chunks: int
status: str
created_at: Optional[datetime] = None
class DocumentListResponse(BaseModel):
docs: list[DocumentInfo]
class ChunkInfo(BaseModel):
chunk_id: str
doc_name: str
clause_id: Optional[str] = None
chapter: Optional[str] = None
content: str
token_count: int
chunk_index: int
class ParseResponse(BaseModel):
doc_id: str
chunks: int
status: str = "parsed"
class EmbedResponse(BaseModel):
doc_id: str
vectors: int
status: str = "embedded"

31
app/schemas/rag.py Normal file
View File

@@ -0,0 +1,31 @@
from pydantic import BaseModel
from typing import Optional
class RagChatRequest(BaseModel):
query: str
top_k: int = 5
class RetrievedDoc(BaseModel):
id: str
doc_name: str
clause_id: Optional[str] = None
score: float
content: str
preview: str
class SourceInfo(BaseModel):
name: str
clause: Optional[str] = None
class QuickQuestion(BaseModel):
id: str
question: str
category: str
class QuickQuestionsResponse(BaseModel):
questions: list[QuickQuestion]

50
app/services/__init__.py Normal file
View File

@@ -0,0 +1,50 @@
# Import mock data service
from .mock_data import (
get_mock_documents,
get_mock_quick_questions,
get_mock_retrieval,
get_mock_rag_answer,
get_mock_compliance_result,
get_mock_compliance_chat_response,
MOCK_SYSTEM_STATS,
MOCK_SYSTEM_CONFIG,
)
# Try importing real services (may fail if dependencies not installed)
try:
from .llm import llm_service, LLMService
from .embedding import embedding_service, EmbeddingService
from .milvus import milvus_service, MilvusService
from .document import DocumentService, get_document_service
_real_services_available = True
except ImportError:
_real_services_available = False
llm_service = None
LLMService = None
embedding_service = None
EmbeddingService = None
milvus_service = None
MilvusService = None
DocumentService = None
get_document_service = None
__all__ = [
# Mock data services
"get_mock_documents",
"get_mock_quick_questions",
"get_mock_retrieval",
"get_mock_rag_answer",
"get_mock_compliance_result",
"get_mock_compliance_chat_response",
"MOCK_SYSTEM_STATS",
"MOCK_SYSTEM_CONFIG",
# Real services (may be None if not available)
"llm_service",
"LLMService",
"embedding_service",
"EmbeddingService",
"milvus_service",
"MilvusService",
"DocumentService",
"get_document_service",
]

64
app/services/document.py Normal file
View File

@@ -0,0 +1,64 @@
import os
from typing import List, Optional
from PyPDF2 import PdfReader
from docx import Document
import pdfplumber
class DocumentService:
def __init__(self, raw_dir: str, parsed_dir: str):
self.raw_dir = raw_dir
self.parsed_dir = parsed_dir
def parse_pdf(self, file_path: str) -> str:
"""解析PDF文件"""
text = ""
try:
with pdfplumber.open(file_path) as pdf:
for page in pdf.pages:
page_text = page.extract_text()
if page_text:
text += page_text + "\n"
except Exception:
reader = PdfReader(file_path)
for page in reader.pages:
text += page.extract_text() + "\n"
return text.strip()
def parse_docx(self, file_path: str) -> str:
"""解析Word文件"""
doc = Document(file_path)
text = ""
for paragraph in doc.paragraphs:
text += paragraph.text + "\n"
return text.strip()
def parse_txt(self, file_path: str) -> str:
"""解析TXT文件"""
with open(file_path, "r", encoding="utf-8") as f:
return f.read().strip()
def parse_document(self, file_path: str) -> str:
"""根据文件类型解析文档"""
ext = os.path.splitext(file_path)[1].lower()
if ext == ".pdf":
return self.parse_pdf(file_path)
elif ext in [".docx", ".doc"]:
return self.parse_docx(file_path)
elif ext == ".txt":
return self.parse_txt(file_path)
else:
raise ValueError(f"Unsupported file format: {ext}")
def save_parsed_text(self, doc_id: str, text: str) -> str:
"""保存解析后的文本"""
parsed_path = os.path.join(self.parsed_dir, f"{doc_id}.txt")
with open(parsed_path, "w", encoding="utf-8") as f:
f.write(text)
return parsed_path
def get_document_service(raw_dir: str, parsed_dir: str) -> DocumentService:
return DocumentService(raw_dir, parsed_dir)

33
app/services/embedding.py Normal file
View File

@@ -0,0 +1,33 @@
import dashscope
from dashscope import TextEmbedding
from typing import List
class EmbeddingService:
def __init__(self):
from app.core.config import settings
self.model = settings.embedding_model
self.dimension = settings.embedding_dim
dashscope.api_key = settings.dashscope_api_key
def embed_texts(self, texts: List[str]) -> List[List[float]]:
"""批量文本嵌入"""
response = TextEmbedding.call(
model=self.model,
input=texts,
)
if response.status_code == 200:
embeddings = []
for item in response.output.embeddings:
embeddings.append(item.embedding)
return embeddings
raise Exception(f"Embedding failed: {response.code}")
def embed_single(self, text: str) -> List[float]:
"""单个文本嵌入"""
embeddings = self.embed_texts([text])
return embeddings[0]
embedding_service = EmbeddingService()

58
app/services/llm.py Normal file
View File

@@ -0,0 +1,58 @@
import dashscope
from dashscope import Generation
from typing import AsyncGenerator, Optional, Generator
class LLMService:
def __init__(self):
from app.core.config import settings
self.model = settings.llm_model
dashscope.api_key = settings.dashscope_api_key
def generate_stream(
self,
prompt: str,
system_prompt: Optional[str] = None,
) -> Generator[str, None, None]:
"""流式生成文本"""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
responses = Generation.call(
model=self.model,
messages=messages,
result_format="message",
stream=True,
)
for response in responses:
if response.status_code == 200:
content = response.output.choices[0].message.content
if content:
yield content
async def generate(
self,
prompt: str,
system_prompt: Optional[str] = None,
) -> str:
"""一次性生成文本"""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
response = Generation.call(
model=self.model,
messages=messages,
result_format="message",
)
if response.status_code == 200:
return response.output.choices[0].message.content
raise Exception(f"LLM generation failed: {response.code}")
llm_service = LLMService()

158
app/services/milvus.py Normal file
View File

@@ -0,0 +1,158 @@
from pymilvus import (
connections,
Collection,
FieldSchema,
CollectionSchema,
DataType,
utility,
)
from typing import List, Optional
class MilvusService:
def __init__(self):
from app.core.config import settings
self.host = settings.milvus_host
self.port = settings.milvus_port
self.regulations_collection_name = settings.regulations_collection
self.compliance_collection_name = settings.compliance_collection
self._connected = False
def connect(self):
"""连接Milvus"""
if not self._connected:
connections.connect(
alias="default",
host=self.host,
port=self.port,
)
self._connected = True
def disconnect(self):
"""断开连接"""
if self._connected:
connections.disconnect("default")
self._connected = False
def create_regulations_collection(self):
"""创建法规文档集合"""
from app.core.config import settings
self.connect()
if utility.has_collection(self.regulations_collection_name):
return Collection(self.regulations_collection_name)
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=settings.embedding_dim),
FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=256),
FieldSchema(name="clause_id", dtype=DataType.VARCHAR, max_length=64),
FieldSchema(name="chapter", dtype=DataType.VARCHAR, max_length=128),
FieldSchema(name="source_file", dtype=DataType.VARCHAR, max_length=256),
FieldSchema(name="chunk_index", dtype=DataType.INT64),
FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
FieldSchema(name="token_count", dtype=DataType.INT64),
]
schema = CollectionSchema(
fields=fields,
description="法规文档向量集合",
)
collection = Collection(
name=self.regulations_collection_name,
schema=schema,
)
index_params = {
"metric_type": "COSINE",
"index_type": "IVF_FLAT",
"params": {"nlist": 128},
}
collection.create_index(field_name="embedding", index_params=index_params)
return collection
def insert_chunks(
self,
embeddings: List[List[float]],
metadata: List[dict],
) -> List[int]:
"""插入向量数据"""
collection = Collection(self.regulations_collection_name)
collection.load()
data = [
embeddings,
[m.get("doc_name", "") for m in metadata],
[m.get("clause_id", "") for m in metadata],
[m.get("chapter", "") for m in metadata],
[m.get("source_file", "") for m in metadata],
[m.get("chunk_index", 0) for m in metadata],
[m.get("content", "") for m in metadata],
[m.get("token_count", 0) for m in metadata],
]
result = collection.insert(data)
collection.flush()
return result.primary_keys
def search(
self,
query_embedding: List[float],
top_k: int = 10,
) -> List[dict]:
"""向量检索"""
collection = Collection(self.regulations_collection_name)
collection.load()
search_params = {"metric_type": "COSINE", "params": {"nprobe": 16}}
results = collection.search(
data=[query_embedding],
anns_field="embedding",
param=search_params,
limit=top_k,
output_fields=["doc_name", "clause_id", "chapter", "content", "chunk_index"],
)
hits = []
for hit in results[0]:
hits.append({
"id": hit.id,
"score": hit.score,
"doc_name": hit.entity.get("doc_name"),
"clause_id": hit.entity.get("clause_id"),
"chapter": hit.entity.get("chapter"),
"content": hit.entity.get("content"),
"chunk_index": hit.entity.get("chunk_index"),
})
return hits
def get_collection_stats(self) -> dict:
"""获取集合统计"""
self.connect()
if not utility.has_collection(self.regulations_collection_name):
return {"exists": False}
collection = Collection(self.regulations_collection_name)
collection.load()
return {
"exists": True,
"name": self.regulations_collection_name,
"count": collection.num_entities,
}
def health_check(self) -> bool:
"""健康检查"""
try:
self.connect()
return True
except Exception:
return False
milvus_service = MilvusService()

425
app/services/mock_data.py Normal file
View File

@@ -0,0 +1,425 @@
"""
Mock数据服务 - 提供预设假数据供前后端对接测试
"""
from datetime import datetime
from typing import Dict, List, Any
import uuid
# 预设法规文档列表
MOCK_DOCUMENTS: List[Dict[str, Any]] = [
{
"id": "doc-001",
"name": "道路交通安全法.pdf",
"chunks": 156,
"status": "indexed",
"created_at": datetime(2026, 5, 10, 10, 0, 0),
},
{
"id": "doc-002",
"name": "机动车登记规定.docx",
"chunks": 89,
"status": "indexed",
"created_at": datetime(2026, 5, 10, 11, 0, 0),
},
{
"id": "doc-003",
"name": "电动自行车规范.pdf",
"chunks": 42,
"status": "indexed",
"created_at": datetime(2026, 5, 10, 12, 0, 0),
},
{
"id": "doc-004",
"name": "GB 38031-2020 电动汽车安全要求.pdf",
"chunks": 128,
"status": "indexed",
"created_at": datetime(2026, 5, 10, 13, 0, 0),
},
{
"id": "doc-005",
"name": "C-NCAP管理规则(2021版).pdf",
"chunks": 95,
"status": "indexed",
"created_at": datetime(2026, 5, 10, 14, 0, 0),
},
]
# 预设快捷问题
MOCK_QUICK_QUESTIONS: List[Dict[str, str]] = [
{"id": "q1", "question": "电动自行车需要上牌照吗?", "category": "车辆登记"},
{"id": "q2", "question": "新能源汽车有哪些补贴政策?", "category": "新能源"},
{"id": "q3", "question": "车辆年检的规定是什么?", "category": "年检"},
{"id": "q4", "question": "驾驶证过期了怎么处理?", "category": "驾驶证"},
]
# 预设检索结果
MOCK_RETRIEVAL_RESULTS: List[Dict[str, Any]] = [
{
"id": "chunk-001",
"score": 0.95,
"preview": "根据《道路交通安全法》第十八条规定,电动自行车经公安机关交通管理部门登记后,方可上道路行驶...",
"doc_name": "道路交通安全法",
"clause": "第十八条",
"content": "根据《道路交通安全法》第十八条规定,电动自行车经公安机关交通管理部门登记后,方可上道路行驶。电动自行车应当符合国家标准,最高设计车速不超过二十五公里每小时,整车质量不超过五十五千克。",
},
{
"id": "chunk-002",
"score": 0.88,
"preview": "电动自行车需符合GB17761-2018国家标准包括最高车速、整车质量、脚踏骑行能力等要求...",
"doc_name": "电动自行车规范",
"clause": "第4条",
"content": "电动自行车需符合GB17761-2018国家标准。主要技术要求包括最高设计车速不超过25km/h整车质量不超过55kg具有脚踏骑行能力蓄电池标称电压不超过48V电动机额定连续输出功率不超过400W。",
},
{
"id": "chunk-003",
"score": 0.82,
"preview": "机动车登记规定:初次申领机动车号牌、行驶证的,机动车所有人应当向住所地的车辆管理所申请注册登记...",
"doc_name": "机动车登记规定",
"clause": "第5条",
"content": "机动车登记规定:初次申领机动车号牌、行驶证的,机动车所有人应当向住所地的车辆管理所申请注册登记。申请注册登记的,应当提交机动车所有人的身份证明、购车发票等机动车来历证明、机动车整车出厂合格证明或者进口机动车进口凭证。",
},
{
"id": "chunk-004",
"score": 0.75,
"preview": "驾驶电动自行车上道路行驶,应当佩戴安全头盔,遵守道路交通安全法律法规...",
"doc_name": "道路交通安全法",
"clause": "第76条",
"content": "驾驶电动自行车上道路行驶,应当佩戴安全头盔,遵守道路交通安全法律法规。电动自行车不得逆向行驶,不得在机动车道内行驶,最高车速不得超过规定的限速。",
},
{
"id": "chunk-005",
"score": 0.68,
"preview": "电动汽车动力电池安全要求电池系统发生热失控后应在5分钟内不起火不爆炸...",
"doc_name": "GB 38031-2020",
"clause": "第7条",
"content": "电动汽车动力电池安全要求GB 38031-2020电池系统发生热失控后应在5分钟内不起火不爆炸为乘员预留逃生时间。电池包需通过针刺、过充、短路等安全测试。",
},
]
# 预设RAG问答答案模板按关键词匹配
MOCK_RAG_ANSWERS: Dict[str, Dict[str, Any]] = {
"电动自行车": {
"text": "根据《道路交通安全法》及相关规范,电动自行车上路需满足以下条件:\n\n1. 符合国家标准 GB17761-2018\n2. 经公安机关交通管理部门登记\n3. 最高设计车速不超过 25km/h\n4. 整车质量不超过 55kg\n5. 具有脚踏骑行能力\n6. 蓄电池标称电压不超过 48V\n\n行驶时还需佩戴安全头盔,不得逆向行驶或在机动车道内行驶。",
"retrieval_ids": ["chunk-001", "chunk-002", "chunk-004"],
},
"驾驶证": {
"text": "驾驶证申请流程如下:\n\n1. 到驾校报名并参加培训\n2. 通过科目一(理论考试)\n3. 通过科目二(场地驾驶技能考试)\n4. 通过科目三(道路驾驶技能考试)\n5. 通过科目四(安全文明驾驶常识考试)\n6. 领取驾驶证\n\n初次申领需到住所地车辆管理所申请注册登记。",
"retrieval_ids": ["chunk-003"],
},
"超速": {
"text": "超速处罚标准(根据《道路交通安全法》):\n\n- 超速10%以下:警告\n- 超速10%-20%罚款50-200元\n- 超速20%-50%罚款200-500元记3-6分\n- 超速50%以上罚款500-2000元记12分可吊销驾驶证\n\n机动车驾驶人违反道路交通安全法律、法规将处警告或二十元以上二百元以下罚款。",
"retrieval_ids": ["chunk-001"],
},
"年检": {
"text": "车辆年检规定:\n\n- 小型私家车6年内免检每2年申领标志6-10年每2年检验10年以上每年检验\n- 车辆需携带行驶证、交强险保单\n- 检验项目:灯光、制动、排放等\n\n机动车所有人的住所迁出车辆管理所管辖区域的,需在登记证书上签注变更事项。",
"retrieval_ids": ["chunk-003"],
},
"电池": {
"text": "电动汽车电池安全标准GB 38031-2020\n\n1. 热失控要求电池系统发生热失控后应在5分钟内不起火不爆炸为乘员预留逃生时间\n2. 电池包需通过针刺、过充、短路等安全测试\n3. 充电系统应具备过充保护功能当电池SOC达到100%时应自动停止充电\n4. 充电接口应符合GB/T 18487.1标准要求\n\n以上要求确保电动汽车的整车安全性。",
"retrieval_ids": ["chunk-005"],
},
"碰撞": {
"text": "正面碰撞测试要求C-NCAP管理规则\n\n1. 正面100%重叠刚性壁障碰撞试验\n2. 碰撞速度50km/h\n3. 试验后要求:\n - 车门应能打开\n - 燃油系统无泄漏\n - 座椅及安全带功能正常\n\n此测试用于评估车辆在正面碰撞事故中对乘员的保护能力。",
"retrieval_ids": [],
},
"AEB": {
"text": "AEB自动紧急制动系统测试标准\n\n1. 系统应在检测到前方障碍物时主动减速或停车\n2. 测试场景分为三种:\n - 目标车静止\n - 目标车移动\n - 目标车制动\n3. AEB功能是C-NCAP评分的重要加分项\n\n该系统对提升车辆主动安全性能具有重要意义。",
"retrieval_ids": [],
},
"高速公路": {
"text": "高速公路安全距离规定:\n\n1. 车速超过100km/h时与同车道前车保持100米以上距离\n2. 车速低于100km/h时距离可适当缩短\n3. 执行紧急任务的警车、消防车、救护车、工程救险车不受行驶速度限制\n\n保持安全距离是预防追尾事故的关键措施。",
"retrieval_ids": [],
},
}
# 预设合规分析结果
MOCK_COMPLIANCE_RESULT: Dict[str, Any] = {
"task_id": "task-001",
"dashboard": {
"score": 78,
"high_risk_count": 2,
"medium_risk_count": 1,
"low_risk_count": 0,
"need_fix_segments": 3,
"status": "warning",
"status_label": "需优化",
},
"segments": [
{
"id": 1,
"index": 1,
"intent": "车身结构设计",
"start_pos": 45,
"end_pos": 230,
"content": "车身采用高强度钢铝混合结构A柱和B柱使用热成型钢板厚度2.5mm。车顶结构设计满足GB 26112-2010抗压强度要求正面碰撞能量吸收区域采用渐进式变形设计确保碰撞时能量有效分散。",
"risk_level": "high",
"regulations": [
{
"id": 1,
"name": "GB 26112-2010",
"clause": "第4.2条",
"score": 0.95,
"match_keyword": "车顶抗压强度",
"category": "high",
"full_content": "车顶结构应能承受相当于车辆整备质量1.5倍的载荷,载荷分布应均匀,试验后车顶变形量不超过规定值。",
},
{
"id": 2,
"name": "C-NCAP管理规则",
"clause": "第3.1条",
"score": 0.88,
"match_keyword": "正面碰撞",
"category": "high",
"full_content": "正面碰撞试验速度为50km/h碰撞后车门应能打开燃油系统无泄漏座椅及安全带功能正常。",
},
{
"id": 3,
"name": "GB 11551-2014",
"clause": "第5条",
"score": 0.72,
"match_keyword": "碰撞能量吸收",
"category": "medium",
"full_content": "车辆正面碰撞时应有效保护乘员,碰撞能量应通过车身结构合理分散。",
},
{
"id": 4,
"name": "机动车安全技术条件",
"clause": "第12条",
"score": 0.58,
"match_keyword": "A柱强度",
"category": "medium",
"full_content": "A柱应具备足够的抗变形能力材料强度应符合相关标准要求。",
},
],
},
{
"id": 2,
"index": 2,
"intent": "动力系统配置",
"start_pos": 298,
"end_pos": 425,
"content": "搭载永磁同步电机最大功率150kW峰值扭矩310Nm。电池组采用三元锂离子电池容量75kWh能量密度180Wh/kg。充电接口支持快充30分钟充至80%和慢充8小时充满符合GB/T 18487.1-2015标准。",
"risk_level": "medium",
"regulations": [
{
"id": 5,
"name": "GB/T 18487.1-2015",
"clause": "第6条",
"score": 0.94,
"match_keyword": "充电接口标准",
"category": "high",
"full_content": "电动汽车传导充电接口应符合GB/T 18487.1标准要求,充电系统应具备过充保护功能。",
},
{
"id": 6,
"name": "GB/T 31484-2015",
"clause": "第4条",
"score": 0.85,
"match_keyword": "电池能量密度",
"category": "high",
"full_content": "动力电池能量密度不低于120Wh/kg电池系统需通过热失控测试。",
},
{
"id": 7,
"name": "新能源汽车生产企业准入",
"clause": "第8条",
"score": 0.65,
"match_keyword": "电机功率",
"category": "medium",
"full_content": "驱动电机应符合相关技术标准,功率参数应在规定范围内。",
},
{
"id": 8,
"name": "电动汽车安全要求",
"clause": "第7条",
"score": 0.45,
"match_keyword": "充电时间",
"category": "low",
"full_content": "充电系统应具备过充保护功能当电池SOC达到100%时应自动停止充电。",
},
],
},
{
"id": 3,
"index": 3,
"intent": "安全配置设计",
"start_pos": 570,
"end_pos": 725,
"content": "配备6个安全气囊前排双气囊、侧气囊、侧气帘采用预紧式安全带。ABS系统采用博世第9代ESP具备碰撞预警功能FCW和自动紧急制动AEB。方向盘集成驾驶员疲劳监测摄像头。",
"risk_level": "low",
"regulations": [
{
"id": 9,
"name": "GB 27887-2011",
"clause": "第5条",
"score": 0.92,
"match_keyword": "安全气囊",
"category": "high",
"full_content": "乘用车应配备驾驶员和乘客安全气囊,气囊系统应符合相关技术标准。",
},
{
"id": 10,
"name": "GB/T 26991-2011",
"clause": "第3条",
"score": 0.78,
"match_keyword": "ABS系统",
"category": "medium",
"full_content": "车辆应配备防抱死制动系统,系统性能应符合相关标准要求。",
},
{
"id": 11,
"name": "C-NCAP管理规则",
"clause": "第4.2条",
"score": 0.71,
"match_keyword": "AEB自动制动",
"category": "medium",
"full_content": "主动安全配置评分包含AEB功能AEB系统应能有效检测障碍物并主动减速。",
},
{
"id": 12,
"name": "机动车运行安全技术条件",
"clause": "第15条",
"score": 0.38,
"match_keyword": "疲劳监测",
"category": "low",
"full_content": "建议配备驾驶员状态监测系统,及时发现驾驶员疲劳或分心状态。",
},
],
},
],
"priority_actions": [
{
"regulation": "GB 26112-2010 第4.2条",
"issue": "缺少车顶抗压强度测试数据",
"suggestion": "补充车顶抗压强度具体测试数据确保满足1.5倍整备质量载荷要求",
"severity": "high",
},
{
"regulation": "GB/T 31484-2015 第4条",
"issue": "缺少电池热失控测试报告",
"suggestion": "补充电池热失控测试报告验证5分钟内不起火不爆炸",
"severity": "high",
},
{
"regulation": "C-NCAP管理规则 第3.1条",
"issue": "缺少碰撞后车门开启性能数据",
"suggestion": "提供碰撞后车门开启性能测试数据",
"severity": "medium",
},
],
}
# 预设合规对话响应模板
MOCK_COMPLIANCE_CHAT_RESPONSES: Dict[str, Dict[str, str]] = {
"车身结构设计": {
"compliance": "根据当前分析,车身结构设计部分存在以下合规问题:\n\n1. GB 26112-2010要求车顶承受1.5倍整备质量载荷,目前设计声明满足要求但缺少测试数据\n2. C-NCAP正面碰撞后车门应能打开需提供碰撞测试报告\n\n建议补充相关测试数据以提升合规评分。",
"interpretation": "GB 26112-2010 第4.2条具体要求解读:\n\n车顶抗压强度测试是车辆被动安全的重要指标。该标准要求车顶结构能够承受相当于车辆整备质量1.5倍的均匀分布载荷,试验后车顶变形量不得超过规定限值。\n\n热成型钢板22MnB5材料抗拉强度约1500-1650 MPa理论上能满足要求但需通过实际测试验证。",
"suggestion": "针对车身结构设计的修改建议:\n\n1. 补充车顶抗压强度测试报告\n2. 提供A柱材料认证证书\n3. 完善正面碰撞能量吸收设计说明\n4. 添加碰撞后车门开启性能数据\n\n这些补充材料可有效提升合规评分。",
},
"动力系统配置": {
"compliance": "动力系统配置整体合规性良好,主要检查点:\n\n1. 电池能量密度180Wh/kg超过最低要求120Wh/kg ✓\n2. 充电接口符合GB/T 18487.1标准 ✓\n3. 快充30分钟充至80%符合行业标准 ✓\n\n需补充电池热失控测试报告。",
"interpretation": "GB/T 31484-2015对动力电池的要求解读\n\n1. 能量密度不低于120Wh/kg您的设计180Wh/kg满足要求\n2. 循环寿命不少于1000次循环后容量保持率≥80%\n3. 安全测试:需通过针刺、过充、短路等测试\n\n建议补充循环寿命测试数据。",
"suggestion": "动力系统配置改进建议:\n\n1. 补充电池热失控测试报告\n2. 提供循环寿命测试数据\n3. 添加充电系统过充保护功能说明\n4. 完善电池管理系统BMS技术文档",
},
"安全配置设计": {
"compliance": "安全配置设计合规性评估:\n\n1. 安全气囊配置满足GB 27887-2011要求 ✓\n2. ABS/ESP系统符合标准 ✓\n3. AEB功能是C-NCAP加分项 ✓\n\n驾驶员疲劳监测是建议配置,不强制要求。",
"interpretation": "C-NCAP主动安全评分规则解读\n\nAEB自动紧急制动系统是C-NCAP评分的重要加分项最高可获得额外加分。测试场景包括\n- 目标车静止场景\n- 目标车移动场景\n- 目标车制动场景\n\n建议完善AEB系统测试数据以获取更高评分。",
"suggestion": "安全配置优化建议:\n\n1. 提供AEB系统测试数据\n2. 补充FCW预警功能测试报告\n3. 添加安全气囊展开时间数据\n4. 完善驾驶员疲劳监测系统说明(如有)",
},
}
# 预设系统统计数据
MOCK_SYSTEM_STATS: Dict[str, int] = {
"docs": 5,
"chunks": 510,
"vectors": 510,
"segments": 0,
}
# 预设系统配置
MOCK_SYSTEM_CONFIG: Dict[str, Any] = {
"llm": {
"model": "qwen-max",
},
"embedding": {
"model": "text-embedding-v3",
"dimension": 1536,
},
"milvus": {
"host": "localhost",
"port": 19530,
},
"retrieval": {
"vector_top_k": 10,
"final_top_k": 5,
},
}
def get_mock_documents() -> List[Dict[str, Any]]:
"""获取预设法规文档列表"""
return MOCK_DOCUMENTS
def get_mock_quick_questions() -> List[Dict[str, str]]:
"""获取预设快捷问题"""
return MOCK_QUICK_QUESTIONS
def get_mock_retrieval(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
"""根据查询关键词返回预设检索结果"""
results = []
for keyword, data in MOCK_RAG_ANSWERS.items():
if keyword in query:
for retrieval_id in data.get("retrieval_ids", []):
for item in MOCK_RETRIEVAL_RESULTS:
if item["id"] == retrieval_id:
results.append({
"id": item["id"],
"score": item["score"],
"preview": item["preview"],
"doc_name": item["doc_name"],
"clause": item["clause"],
})
break
if not results:
results = MOCK_RETRIEVAL_RESULTS[:top_k]
return results[:top_k]
def get_mock_rag_answer(query: str) -> str:
"""根据查询关键词返回预设答案"""
for keyword, data in MOCK_RAG_ANSWERS.items():
if keyword in query:
return data["text"]
return "抱歉,暂未找到与您问题直接相关的法规内容。请尝试更具体的问题,或联系交通管理部门获取详细信息。\n\n您可以尝试询问电动自行车、驾驶证、超速处罚、年检、电池安全、碰撞测试、AEB系统、高速公路规则等话题。"
def get_mock_compliance_result(task_id: str) -> Dict[str, Any]:
"""获取预设合规分析结果"""
result = MOCK_COMPLIANCE_RESULT.copy()
result["task_id"] = task_id
return result
def get_mock_compliance_chat_response(intent: str, query: str) -> str:
"""获取预设合规对话响应"""
responses = MOCK_COMPLIANCE_CHAT_RESPONSES.get(intent, {})
if "合规" in query or "符合" in query:
return responses.get("compliance", "根据相关法规分析,该段落的合规性需进一步评估。")
elif "解读" in query or "什么" in query or "如何" in query:
return responses.get("interpretation", "法规要求详细解读如下...")
elif "修改" in query or "建议" in query or "完善" in query:
return responses.get("suggestion", "建议进行以下修改以提升合规性...")
return f"关于您的问题,{intent}部分涉及多条相关法规。您可以进一步询问合规性评估或修改建议。"
def generate_task_id() -> str:
"""生成任务ID"""
return f"task-{uuid.uuid4().hex[:8]}"
def generate_doc_id() -> str:
"""生成文档ID"""
return f"doc-{uuid.uuid4().hex[:8]}"

4
app/utils/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
from .chunking import TextChunker, chunker
from .logger import logger, setup_logging
__all__ = ["TextChunker", "chunker", "logger", "setup_logging"]

78
app/utils/chunking.py Normal file
View File

@@ -0,0 +1,78 @@
import re
from typing import List
from app.core.config import settings
class TextChunker:
def __init__(
self,
chunk_size: int = settings.chunk_size,
chunk_overlap: int = settings.chunk_overlap,
):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
def chunk_by_clause(self, text: str) -> List[dict]:
"""按条款边界分块(适用于法规文档)"""
clause_pattern = r"(第[一二三四五六七八九十百]+条)"
parts = re.split(clause_pattern, text)
chunks = []
current_clause = None
current_text = ""
chunk_index = 0
for part in parts:
if re.match(clause_pattern, part):
if current_clause and current_text.strip():
chunks.append({
"clause_id": current_clause,
"content": current_text.strip(),
"chunk_index": chunk_index,
})
chunk_index += 1
current_clause = part
current_text = ""
else:
current_text += part
if current_clause and current_text.strip():
chunks.append({
"clause_id": current_clause,
"content": current_text.strip(),
"chunk_index": chunk_index,
})
return chunks
def chunk_by_size(self, text: str) -> List[dict]:
"""按固定大小分块"""
chunks = []
start = 0
chunk_index = 0
while start < len(text):
end = start + self.chunk_size
chunk_text = text[start:end]
if chunk_text.strip():
chunks.append({
"content": chunk_text.strip(),
"chunk_index": chunk_index,
"start_pos": start,
"end_pos": end,
})
chunk_index += 1
start = end - self.chunk_overlap
return chunks
def estimate_tokens(self, text: str) -> int:
"""估算token数量"""
chinese_chars = len(re.findall(r"[^\x00-\xff]", text))
english_chars = len(text) - chinese_chars
return int(chinese_chars / 1.5 + english_chars / 4)
chunker = TextChunker()

24
app/utils/logger.py Normal file
View File

@@ -0,0 +1,24 @@
import logging
import sys
def setup_logging() -> logging.Logger:
"""配置日志"""
logger = logging.getLogger("app")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
handler.setFormatter(formatter)
logger.addHandler(handler)
return logger
logger = setup_logging()

12
app/workflows/__init__.py Normal file
View File

@@ -0,0 +1,12 @@
from .rag_workflow import RagState, rag_workflow, run_rag_workflow, stream_rag_workflow
from .compliance_workflow import ComplianceState, compliance_workflow, run_compliance_workflow
__all__ = [
"RagState",
"rag_workflow",
"run_rag_workflow",
"stream_rag_workflow",
"ComplianceState",
"compliance_workflow",
"run_compliance_workflow",
]

View File

@@ -0,0 +1,175 @@
from typing import TypedDict, List
from langgraph.graph import StateGraph, END
class ComplianceState(TypedDict):
document_path: str
raw_text: str
segments: List[dict]
matched_regulations: List[dict]
risk_dashboard: dict
priority_actions: List[dict]
def parse_document(state: ComplianceState) -> dict:
"""解析文档"""
from app.services import get_document_service
doc_service = get_document_service(
"/airegulation/demo-mao/backend/data/raw",
"/airegulation/demo-mao/backend/data/parsed",
)
text = doc_service.parse_document(state["document_path"])
return {"raw_text": text}
def segment_document(state: ComplianceState) -> dict:
"""AI语义分段"""
from app.services import llm_service
prompt = f"""请分析以下设计方案文档,按照设计意图将其分成若干语义段落。
文档内容:
{state['raw_text'][:3000]}
请输出JSON格式的分段结果每个段落包含
- intent: 段落意图/主题
- startPos: 在原文中的起始位置(大致)
- endPos: 在原文中的结束位置(大致)
- keywords: 关键词列表
输出格式:
[{{"intent": "...", "startPos": 0, "endPos": 100, "keywords": [...]}}]"""
# 简化处理:返回基本分段
segments = [
{
"id": 1,
"intent": "整体设计概述",
"content": state["raw_text"][:500],
"keywords": ["设计", "方案"],
}
]
return {"segments": segments}
def match_regulations(state: ComplianceState) -> dict:
"""法规匹配"""
from app.services import embedding_service, milvus_service
matched = []
for segment in state["segments"]:
keyword_text = " ".join(segment.get("keywords", []))
embedding = embedding_service.embed_single(keyword_text)
docs = milvus_service.search(embedding, top_k=5)
segment_regs = []
for doc in docs:
category = "high" if doc["score"] > 0.85 else ("medium" if doc["score"] > 0.6 else "low")
segment_regs.append({
"id": doc["id"],
"name": doc["doc_name"],
"clause": doc.get("clause_id"),
"score": doc["score"],
"match_keyword": keyword_text,
"category": category,
"full_content": doc["content"],
})
segment["regulations"] = segment_regs
matched.append(segment)
return {"matched_regulations": matched}
def calculate_risk(state: ComplianceState) -> dict:
"""计算风险等级"""
segments = state["matched_regulations"]
high_count = 0
medium_count = 0
low_count = 0
need_fix = 0
total_score = 0
for segment in segments:
regs = segment.get("regulations", [])
high_regs = [r for r in regs if r["category"] == "high"]
if high_regs:
avg_score = sum(r["score"] for r in high_regs) / len(high_regs)
if avg_score < 0.9:
segment["risk_level"] = "high"
high_count += 1
need_fix += 1
elif avg_score < 0.92:
segment["risk_level"] = "medium"
medium_count += 1
else:
segment["risk_level"] = "low"
low_count += 1
else:
segment["risk_level"] = "low"
low_count += 1
total_score += avg_score if high_regs else 100
avg_score = total_score / len(segments) if segments else 100
status = "pass" if avg_score >= 90 else ("warning" if avg_score >= 70 else "fail")
status_label = "合规" if status == "pass" else ("需要修改" if status == "warning" else "高风险")
dashboard = {
"score": avg_score,
"high_risk_count": high_count,
"medium_risk_count": medium_count,
"low_risk_count": low_count,
"need_fix_segments": need_fix,
"status": status,
"status_label": status_label,
}
return {"risk_dashboard": dashboard, "segments": segments}
def generate_suggestions(state: ComplianceState) -> dict:
"""生成优先建议"""
actions = []
for segment in state["segments"]:
for reg in segment.get("regulations", []):
if reg["category"] == "high" and reg["score"] < 0.9:
actions.append({
"regulation": reg["name"],
"issue": reg["match_keyword"],
"suggestion": f"建议对照{reg['name']}{reg.get('clause', '')}条进行修改",
"severity": "high",
})
return {"priority_actions": actions}
# 构建工作流图
compliance_graph = StateGraph(ComplianceState)
compliance_graph.add_node("parse", parse_document)
compliance_graph.add_node("segment", segment_document)
compliance_graph.add_node("match", match_regulations)
compliance_graph.add_node("score", calculate_risk)
compliance_graph.add_node("suggest", generate_suggestions)
compliance_graph.set_entry_point("parse")
compliance_graph.add_edge("parse", "segment")
compliance_graph.add_edge("segment", "match")
compliance_graph.add_edge("match", "score")
compliance_graph.add_edge("score", "suggest")
compliance_graph.add_edge("suggest", END)
compliance_workflow = compliance_graph.compile()
async def run_compliance_workflow(document_path: str) -> ComplianceState:
"""运行合规分析工作流"""
initial_state: ComplianceState = {"document_path": document_path}
result = compliance_workflow.invoke(initial_state)
return result

View File

@@ -0,0 +1,114 @@
from typing import TypedDict, List
from langgraph.graph import StateGraph, END
class RagState(TypedDict):
query: str
query_embedding: List[float]
retrieved_docs: List[dict]
context: str
answer: str
sources: List[dict]
def embed_query(state: RagState) -> dict:
"""将查询转为向量"""
from app.services import embedding_service
embedding = embedding_service.embed_single(state["query"])
return {"query_embedding": embedding}
def retrieve_docs(state: RagState) -> dict:
"""向量检索"""
from app.services import milvus_service
from app.core.config import settings
docs = milvus_service.search(
state["query_embedding"],
top_k=settings.vector_top_k,
)
return {"retrieved_docs": docs[:settings.final_top_k]}
def build_context(state: RagState) -> dict:
"""构建上下文"""
context_parts = []
sources = []
for doc in state["retrieved_docs"]:
context_parts.append(f"{doc['doc_name']} - {doc.get('clause_id', '')}\n{doc['content']}")
sources.append({
"name": doc["doc_name"],
"clause": doc.get("clause_id"),
})
context = "\n\n".join(context_parts)
return {"context": context, "sources": sources}
def generate_answer(state: RagState) -> dict:
"""生成答案"""
from app.services import llm_service
prompt = f"""请根据以下法规内容回答用户问题,并在回答中标注引用的法规条款。
法规内容:
{state['context']}
用户问题:{state['query']}
请给出准确、简洁的回答,并引用相关法规条款。"""
answer = ""
for chunk in llm_service.generate_stream(prompt):
answer += chunk
return {"answer": answer}
# 构建工作流图
rag_graph = StateGraph(RagState)
rag_graph.add_node("embed", embed_query)
rag_graph.add_node("retrieve", retrieve_docs)
rag_graph.add_node("build_context", build_context)
rag_graph.add_node("generate", generate_answer)
rag_graph.set_entry_point("embed")
rag_graph.add_edge("embed", "retrieve")
rag_graph.add_edge("retrieve", "build_context")
rag_graph.add_edge("build_context", "generate")
rag_graph.add_edge("generate", END)
rag_workflow = rag_graph.compile()
async def run_rag_workflow(query: str) -> RagState:
"""运行RAG工作流"""
initial_state: RagState = {"query": query}
result = rag_workflow.invoke(initial_state)
return result
def stream_rag_workflow(query: str):
"""流式运行RAG工作流"""
from app.services import llm_service
# 先完成检索阶段
state: RagState = {"query": query}
state.update(embed_query(state))
state.update(retrieve_docs(state))
state.update(build_context(state))
# 流式生成阶段
prompt = f"""请根据以下法规内容回答用户问题,并在回答中标注引用的法规条款。
法规内容:
{state['context']}
用户问题:{state['query']}
请给出准确、简洁的回答,并引用相关法规条款。"""
for chunk in llm_service.generate_stream(prompt):
yield {"type": "chunk", "text": chunk}
yield {"type": "done", "sources": state["sources"]}

View File

@@ -0,0 +1 @@
test content

View File

@@ -0,0 +1,2 @@
apache-flink==1.13.2
PyMySQL>=1.1.0

View File

@@ -0,0 +1,2 @@
apache-flink==1.13.2
PyMySQL>=1.1.0

6
main.py Normal file
View File

@@ -0,0 +1,6 @@
def main():
print("Hello from backend!")
if __name__ == "__main__":
main()

7
pyproject.toml Normal file
View File

@@ -0,0 +1,7 @@
[project]
name = "backend"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.9"
dependencies = []

34
requirements.txt Normal file
View File

@@ -0,0 +1,34 @@
# Web框架
fastapi>=0.110.0
uvicorn>=0.27.0
# LangGraph & LangChain
langgraph>=0.0.40
langchain>=0.2.0
langchain-community>=0.2.0
# DashScope
dashscope>=1.14.0
# Milvus
pymilvus>=2.3.0
# 文档解析
pypdf2>=3.0.0
python-docx>=1.1.0
pdfplumber>=0.10.0
# Pydantic配置
pydantic>=2.0.0
pydantic-settings>=2.0.0
# 工具
python-multipart>=0.0.9
sse-starlette>=1.8.0
python-dotenv>=1.0.0
tiktoken>=0.5.0
httpx>=0.25.0
# 测试
pytest>=7.4.0
pytest-asyncio>=0.21.0

0
tests/__init__.py Normal file
View File

8
uv.lock generated Normal file
View File

@@ -0,0 +1,8 @@
version = 1
revision = 3
requires-python = ">=3.9"
[[package]]
name = "backend"
version = "0.1.0"
source = { virtual = "." }