first commit
This commit is contained in:
89
.env.example
Normal file
89
.env.example
Normal file
@@ -0,0 +1,89 @@
|
||||
# ══════════════════════════════════════════════════
|
||||
# AI合规智能中枢 — 环境变量配置
|
||||
# 复制本文件为 .env 并填写实际值
|
||||
# cp .env.example .env
|
||||
# ══════════════════════════════════════════════════
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────
|
||||
# LLM 云端 API 配置(至少填写一个)
|
||||
# ──────────────────────────────────────────────────
|
||||
|
||||
# LLM 提供商:deepseek 或 qwen
|
||||
LLM_PROVIDER=deepseek
|
||||
|
||||
# DeepSeek API(推荐,约¥1/百万tokens)
|
||||
# 申请地址:https://platform.deepseek.com
|
||||
DEEPSEEK_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||
# 可选模型:deepseek-chat(通用)、deepseek-reasoner(推理增强)
|
||||
DEEPSEEK_MODEL=deepseek-chat
|
||||
|
||||
# 阿里云 DashScope / Qwen API(备用)
|
||||
# 申请地址:https://dashscope.console.aliyun.com (控制台;dashscope.aliyuncs.com 为 API 调用域名)
|
||||
DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||
# 可选模型:qwen-plus、qwen-max、qwen-turbo
|
||||
QWEN_MODEL=qwen-plus
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────
|
||||
# 数据库密码
|
||||
# ──────────────────────────────────────────────────
|
||||
|
||||
# PostgreSQL 密码(生产环境请使用强密码)
|
||||
POSTGRES_PASSWORD=compliance_secure_2026
|
||||
|
||||
# Redis 密码
|
||||
REDIS_PASSWORD=redis_secure_2026
|
||||
|
||||
# Neo4j 密码(不能包含特殊字符)
|
||||
NEO4J_PASSWORD=neo4j_secure_2026
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────
|
||||
# AI 模型配置
|
||||
# ──────────────────────────────────────────────────
|
||||
|
||||
# HuggingFace 镜像(国内加速,默认使用 hf-mirror.com)
|
||||
HF_ENDPOINT=https://hf-mirror.com
|
||||
|
||||
# 嵌入服务设备:cpu 或 cuda(有 GPU 时改为 cuda)
|
||||
EMBEDDING_DEVICE=cpu
|
||||
|
||||
# MinerU 解析设备:cpu 或 cuda
|
||||
MCP_DEVICE=cpu
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────
|
||||
# 应用配置
|
||||
# ──────────────────────────────────────────────────
|
||||
|
||||
# 运行环境:development / production
|
||||
APP_ENV=development
|
||||
|
||||
# 日志级别:DEBUG / INFO / WARNING / ERROR
|
||||
LOG_LEVEL=INFO
|
||||
|
||||
# API 认证密钥(用于内部服务间调用;请替换为随机值,可用 `openssl rand -hex 32` 生成)
|
||||
API_SECRET_KEY=change_this_to_a_random_secret_key_32chars
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────
|
||||
# 监控配置(可选)
|
||||
# ──────────────────────────────────────────────────
|
||||
|
||||
# Grafana 管理员密码(默认值仅供本地调研,对外暴露前务必修改)
|
||||
GRAFANA_PASSWORD=admin
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────
|
||||
# 外部推送配置(闭环③法规监控推送用)
|
||||
# ──────────────────────────────────────────────────
|
||||
|
||||
# 邮件推送(可选)
|
||||
# SMTP_HOST=smtp.example.com
|
||||
# SMTP_PORT=587
|
||||
# SMTP_USER=your@email.com
|
||||
# SMTP_PASSWORD=your_smtp_password
|
||||
|
||||
# Webhook 推送(可选,支持飞书/钉钉/企业微信)
|
||||
# WEBHOOK_URL=https://open.feishu.cn/open-apis/bot/v2/hook/xxx
|
||||
277
00_整体部署规划.md
Normal file
277
00_整体部署规划.md
Normal file
@@ -0,0 +1,277 @@
|
||||
# AI合规智能中枢 — 整体部署规划
|
||||
|
||||
> **版本:** 调研版 v1.0 | **日期:** 2026.04 | **团队:** T-Systems AI Regulations Team
|
||||
|
||||
---
|
||||
|
||||
## 一、项目背景
|
||||
|
||||
AI+合规智能中枢面向车企与工厂,是一个全链路合规智能平台。主要解决以下痛点:
|
||||
|
||||
| 痛点 | 说明 |
|
||||
|------|------|
|
||||
| 法规来源复杂 | GB、MIIT、UN-ECE、IATF 16949、ISO 45001 等多源并行 |
|
||||
| 更新频率高 | 新能源、数据安全、碳排放法规频繁变动 |
|
||||
| 跨语言要求 | 中英德法多语言法规并存 |
|
||||
| 文档管理分散 | 内部文档与外部法规割裂,难以统一检索 |
|
||||
| 被动识别隐患 | EHS 合规靠人工排查,效率低下 |
|
||||
|
||||
**调研目标:** 以最小资源投入(Docker Compose 单机)验证三条核心业务闭环的技术可行性。
|
||||
|
||||
---
|
||||
|
||||
## 二、部署架构概览
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ 单台服务器 │
|
||||
│ ┌──────────────┐ ┌──────────────────────────────────────┐ │
|
||||
│ │ API 网关 │ │ Docker Compose │ │
|
||||
│ │ Nginx :80 │───▶│ │ │
|
||||
│ └──────────────┘ │ ┌──────────────────────────────┐ │ │
|
||||
│ │ │ 业务服务层 │ │ │
|
||||
│ │ │ compliance-backend :8000 │ │ │
|
||||
│ │ │ celery-worker │ │ │
|
||||
│ │ │ celery-beat │ │ │
|
||||
│ │ └──────────┬───────────────────┘ │ │
|
||||
│ │ │ │ │
|
||||
│ │ ┌──────────▼───────────────────┐ │ │
|
||||
│ │ │ AI 模型层 │ │ │
|
||||
│ │ │ embedding-service :8010 │ │ │
|
||||
│ │ │ mcp-server(MinerU) :8011 │ │ │
|
||||
│ │ │ LLM → DeepSeek API (云端) │ │ │
|
||||
│ │ └──────────┬───────────────────┘ │ │
|
||||
│ │ │ │ │
|
||||
│ │ ┌──────────▼───────────────────┐ │ │
|
||||
│ │ │ 数据层 │ │ │
|
||||
│ │ │ PostgreSQL :5432 │ │ │
|
||||
│ │ │ Redis :6379 │ │ │
|
||||
│ │ │ Milvus :19530 │ │ │
|
||||
│ │ │ Neo4j :7474/:7687 │ │ │
|
||||
│ │ │ MinIO (Milvus依赖) │ │ │
|
||||
│ │ └──────────────────────────────┘ │ │
|
||||
│ └──────────────────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
┌─────────▼──────────┐
|
||||
│ DeepSeek API │
|
||||
│ (云端 LLM) │
|
||||
└────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 三、原方案 vs 调研方案对比
|
||||
|
||||
| 维度 | 原方案(生产级)| 调研方案 | 降级理由 |
|
||||
|------|--------------|---------|---------|
|
||||
| 编排 | Kubernetes 1.36 + Helm | **Docker Compose** | 无需集群管理,`up -d` 一键启动 |
|
||||
| LLM | vLLM + DeepSeek-V3(4×A100)| **DeepSeek/Qwen 云端 API** | 无 GPU 依赖,秒级就绪 |
|
||||
| 嵌入模型 | BGE-M3 GPU 服务 | **BGE-M3 CPU 容器** | 调研数据量小,CPU 够用 |
|
||||
| Milvus | 分布式集群 + MinIO | **Milvus Standalone**(配套 etcd + MinIO 容器)| 单节点部署,无需分布式集群与独立的 MinIO 集群 |
|
||||
| 消息队列 | Kafka 3 节点 | **Redis + Celery**(复用已有 Redis)| 调研无需高吞吐,大幅简化 |
|
||||
| 监控 | Prometheus + Grafana + ELK | **仅 Prometheus + Grafana**(可选)| 轻量,后期按需加 |
|
||||
| 安全 | JWT + cert-manager + RBAC | **API Key 简单认证** | 调研期无需生产级安全 |
|
||||
| CI/CD | GitLab CI 完整流水线 | **无**(手动部署)| 调研期直接 compose up |
|
||||
|
||||
---
|
||||
|
||||
## 四、硬件最低要求
|
||||
|
||||
| 资源 | 最低配置 | 推荐配置 | 说明 |
|
||||
|------|---------|---------|------|
|
||||
| CPU | 8 核 | 16 核+ | BGE-M3 CPU 模式需要较多核心 |
|
||||
| 内存 | 32 GB | 64 GB | Milvus + BGE-M3 + Neo4j 内存消耗较大 |
|
||||
| 存储 | 200 GB SSD | 500 GB SSD | 含模型文件(约 5GB)+ 数据 |
|
||||
| GPU | **无需** | 1× RTX 3090(24GB)| 有 GPU 可加速嵌入/MinerU |
|
||||
| 网络 | 能访问 DeepSeek API | — | LLM 完全在云端 |
|
||||
| OS | Ubuntu 22.04 LTS | — | 或 Windows 11 + WSL2 |
|
||||
|
||||
**各组件内存估算:**
|
||||
|
||||
| 服务 | 内存占用 |
|
||||
|------|---------|
|
||||
| PostgreSQL | ~1 GB |
|
||||
| Redis | ~512 MB |
|
||||
| Milvus(含 etcd/minio)| ~4 GB |
|
||||
| Neo4j | ~2 GB |
|
||||
| BGE-M3(CPU 模式)| ~6 GB |
|
||||
| MinerU(CPU 模式)| ~4 GB |
|
||||
| compliance-backend | ~1 GB |
|
||||
| celery-worker × 1 | ~1 GB |
|
||||
| **合计** | **~20 GB** |
|
||||
|
||||
---
|
||||
|
||||
## 五、五阶段部署步骤(总览)
|
||||
|
||||
```
|
||||
阶段一:宿主机环境准备
|
||||
└─ 安装 Docker CE / Docker Desktop
|
||||
└─ 配置 nvidia-container-toolkit(有 GPU 时)
|
||||
└─ 创建项目目录,配置 .env
|
||||
|
||||
阶段二:基础中间件启动
|
||||
└─ PostgreSQL + Redis(优先启动)
|
||||
└─ etcd + MinIO(Milvus 依赖)
|
||||
└─ Milvus Standalone(向量检索核心)
|
||||
└─ Neo4j Community(知识图谱)
|
||||
|
||||
阶段三:AI 模型服务构建与启动
|
||||
└─ 构建 embedding-service(BGE-M3 封装)
|
||||
└─ 构建 mcp-server(MinerU 封装)
|
||||
└─ 预下载模型(BGE-M3 ~2.5GB,MinerU ~2GB)
|
||||
|
||||
阶段四:业务微服务启动
|
||||
└─ compliance-backend(FastAPI 主服务)
|
||||
└─ celery-worker(异步任务处理)
|
||||
└─ celery-beat(定时任务调度)
|
||||
└─ nginx(API 网关)
|
||||
|
||||
阶段五:验证与闭环测试
|
||||
└─ 健康检查(bash scripts/check_health.sh)
|
||||
└─ 端到端冒烟测试(bash scripts/07_smoke_test.sh)
|
||||
└─ 三条业务闭环验证
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 六、三条核心业务闭环
|
||||
|
||||
### 闭环①:法规入库 → 检索问答
|
||||
|
||||
```
|
||||
用户上传 PDF
|
||||
│
|
||||
▼
|
||||
API Gateway(Nginx)
|
||||
│
|
||||
▼
|
||||
kbmp-service(文件接收)
|
||||
│ 异步投递
|
||||
▼
|
||||
Celery Worker
|
||||
│
|
||||
├─► parse-worker ──► mcp-server(MinerU 解析)
|
||||
│ │ Markdown + 结构化文本
|
||||
│ ▼
|
||||
└─► vectorize-worker ──► embedding-service(BGE-M3)
|
||||
│ 1024维向量
|
||||
▼
|
||||
Milvus(向量存储)+ PostgreSQL(元数据)
|
||||
|
||||
用户提问
|
||||
│
|
||||
▼
|
||||
BM25 关键词检索 + BGE-M3 向量检索(Milvus hybrid search)
|
||||
│
|
||||
▼
|
||||
Cross-Encoder Reranker(精排 Top-K)
|
||||
│
|
||||
▼
|
||||
DeepSeek API(引文锚定生成)
|
||||
│
|
||||
▼
|
||||
返回答案(含原文引用 + 页码)
|
||||
```
|
||||
|
||||
### 闭环②:文档上传 → 合规审查
|
||||
|
||||
```
|
||||
上传供应商/内部文档
|
||||
│
|
||||
▼
|
||||
MinerU 解析 → 条款级分割
|
||||
│
|
||||
▼
|
||||
法规域匹配(vehicle_safety / data_security / ehs)
|
||||
│
|
||||
▼
|
||||
与法规库语义比对(向量相似度 + 关键字匹配)
|
||||
│
|
||||
▼
|
||||
DeepSeek API 风险评分(条款级分析)
|
||||
│
|
||||
▼
|
||||
生成 Markdown 审查报告(风险等级 + 整改建议)
|
||||
```
|
||||
|
||||
### 闭环③:法规监控 → 变更推送
|
||||
|
||||
```
|
||||
Celery Beat 定时触发(每天)
|
||||
│
|
||||
▼
|
||||
抓取监控源(国标委 / 工信部 / 应急管理部 / 生态环境部)
|
||||
│
|
||||
▼
|
||||
内容 Hash 比对(检测变更)
|
||||
│
|
||||
▼ [有变更]
|
||||
NLP Diff 分析(DeepSeek 提取新增/修订/废止条款)
|
||||
│
|
||||
▼
|
||||
增量入库(MinerU 解析 → BGE-M3 → Milvus + PostgreSQL + Neo4j)
|
||||
│
|
||||
▼
|
||||
差距分析(与企业现状比对)
|
||||
│
|
||||
▼
|
||||
推送通知(Email / Webhook / 飞书 / 钉钉)
|
||||
│
|
||||
▼
|
||||
记录变更日志 → 触发整改任务
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 七、技术选型决策依据
|
||||
|
||||
| 组件 | 选型 | 决策依据 |
|
||||
|------|------|---------|
|
||||
| 向量数据库 | Milvus 2.4 | 支持 Dense+Sparse 混合检索,BGE-M3 配套,生产可扩展 |
|
||||
| 图数据库 | Neo4j 5.x | 法规实体关系建模成熟,APOC 插件丰富,Cypher 查询友好 |
|
||||
| 嵌入模型 | BGE-M3 | 支持 100+ 种语言(覆盖中英德法),支持 dense+sparse+multi-vector,8192 token 上下文 |
|
||||
| LLM | DeepSeek API | 推理能力强,成本低(约¥1/百万 tokens),OpenAI 兼容 |
|
||||
| 文档解析 | MinerU | GPU 最快 0.21s/页,支持 109 种语言 OCR,布局感知 |
|
||||
| 任务队列 | Celery + Redis | 调研阶段够用,比 Kafka 轻量,Redis 可复用 |
|
||||
| API 框架 | FastAPI | 异步性能好,OpenAPI 自动生成,Pydantic 数据验证 |
|
||||
| 关系数据库 | PostgreSQL + pgvector | 元数据存储 + 备用向量检索,pgvector 镜像开箱即用 |
|
||||
|
||||
---
|
||||
|
||||
## 八、升级路径(调研 → 生产)
|
||||
|
||||
| 维度 | 升级内容 | 触发条件 |
|
||||
|------|---------|---------|
|
||||
| LLM | API → 本地 vLLM + DeepSeek-V3 | 数据安全要求/API成本超阈值 |
|
||||
| Milvus | Standalone → 分布式集群 | 向量数据 > 1000 万条 |
|
||||
| 消息队列 | Celery+Redis → Kafka | 并发任务 > 100/分钟 |
|
||||
| 编排 | Docker Compose → Kubernetes | 多节点部署/弹性伸缩需求 |
|
||||
| 安全 | API Key → JWT + RBAC | 对外提供服务/多租户 |
|
||||
| 监控 | Grafana → Grafana + ELK | 日志量大/需要复杂分析 |
|
||||
|
||||
---
|
||||
|
||||
## 九、文件结构说明
|
||||
|
||||
```
|
||||
Deployment/
|
||||
├── 00_整体部署规划.md ← 本文档
|
||||
├── 01_技术架构详解.md ← 六层架构 + 六大微服务详细说明
|
||||
├── 02_组件安装指南.md ← 每个组件的详细安装步骤
|
||||
├── 03_业务闭环说明.md ← 三条闭环的数据流和接口规范
|
||||
├── README.md ← 快速启动指南
|
||||
├── docker-compose.yml ← 全服务编排
|
||||
├── .env.example ← 环境变量模板
|
||||
├── scripts/ ← 安装与运维脚本(13 个)
|
||||
├── services/ ← 服务源码
|
||||
│ ├── embedding/ ← BGE-M3 嵌入服务
|
||||
│ ├── mcp-server/ ← MinerU 文档解析服务
|
||||
│ └── compliance-backend/ ← 核心业务后端
|
||||
├── config/ ← Nginx、Prometheus 配置
|
||||
├── init-sql/ ← PostgreSQL 初始化 SQL
|
||||
├── data/ ← 运行时数据
|
||||
├── logs/ ← 服务日志
|
||||
└── models/ ← AI 模型缓存
|
||||
```
|
||||
263
01_技术架构详解.md
Normal file
263
01_技术架构详解.md
Normal file
@@ -0,0 +1,263 @@
|
||||
# AI合规智能中枢 — 技术架构详解
|
||||
|
||||
> 本文档对应架构文档:`01_分层次技术架构图.html` 和 `02_详细技术架构图.html`
|
||||
|
||||
---
|
||||
|
||||
## 一、六层架构总览
|
||||
|
||||
```
|
||||
┌──────────────────────────────────────────────────────────────────┐
|
||||
│ L1 应用接入层:Web / Mobile / Bot / API Gateway / RBAC │
|
||||
├──────────────────────────────────────────────────────────────────┤
|
||||
│ L2 业务能力层:知识库问答 / 文档审查 / EHS / 法规监控 / 推荐 │
|
||||
├──────────────────────────────────────────────────────────────────┤
|
||||
│ L3 法规感知层:监控 → 感知 → 解析 → 图谱 → 分析 → 闭环 │
|
||||
├──────────────────────────────────────────────────────────────────┤
|
||||
│ L4 AI引擎层:RAG / LLM / 文档解析 / 知识图谱推理 / NLP │
|
||||
├──────────────────────────────────────────────────────────────────┤
|
||||
│ L5 数据知识层:Milvus / PostgreSQL / Neo4j / Redis / 知识库 │
|
||||
├──────────────────────────────────────────────────────────────────┤
|
||||
│ L6 基础设施层:安全治理 / 容器编排 / 运维观测 / CI/CD │
|
||||
└──────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 二、六大微服务详解
|
||||
|
||||
### 2.1 kbmp-service(知识库公开接口)
|
||||
|
||||
**职责:** 知识库的统一入口,处理文件上传、检索编排、任务投递。
|
||||
|
||||
**核心接口:**
|
||||
|
||||
| 方法 | 路径 | 功能 |
|
||||
|------|------|------|
|
||||
| POST | `/workspace/create` | 创建知识库工作空间 |
|
||||
| POST | `/files/upload` | 上传文件(触发解析任务) |
|
||||
| POST | `/files/parse` | 手动触发解析 |
|
||||
| POST | `/knowledge/retrieval` | 混合检索(BM25 + 向量)|
|
||||
| POST | `/chunks/recall` | 原始 Chunk 召回 |
|
||||
| POST | `/qa` | 检索 + LLM 问答生成 |
|
||||
|
||||
**内部流程:**
|
||||
```
|
||||
文件上传 → 存储 data/uploads → 投递 Celery 任务(parse-queue)
|
||||
→ parse-worker 调用 mcp-server 解析
|
||||
→ vectorize-worker 调用 embedding-service 向量化
|
||||
→ 写入 Milvus(向量)+ PostgreSQL(元数据)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2.2 mcp-server(文档解析服务)
|
||||
|
||||
**职责:** 将 PDF/Word/Excel 等文档转换为结构化 Markdown,供后续向量化。
|
||||
|
||||
**核心接口:**
|
||||
|
||||
| 方法 | 路径 | 功能 |
|
||||
|------|------|------|
|
||||
| POST | `/parse-document` | 通用解析(自动选择引擎)|
|
||||
| POST | `/mineru-parse` | MinerU 精准解析 |
|
||||
| GET | `/health` | 健康检查 |
|
||||
|
||||
**解析策略(降级链):**
|
||||
```
|
||||
1. 阿里云文档解析 API(云端高精度)→ [调研版暂不启用]
|
||||
2. MinerU(本地,GPU/CPU 均支持)→ 主用
|
||||
3. python-docx / PyMuPDF(纯文本降级)→ 兜底
|
||||
```
|
||||
|
||||
**MinerU 特性:**
|
||||
- GPU 最快:0.21 秒/页
|
||||
- CPU 模式:约 3-5 秒/页(调研阶段可接受)
|
||||
- 支持 109 种语言 OCR
|
||||
- 布局感知:区分正文/标题/表格/图片/页眉页脚
|
||||
- 输出格式:Markdown + JSON(含结构化元数据)
|
||||
|
||||
---
|
||||
|
||||
### 2.3 合规业务后端(compliance-backend)
|
||||
|
||||
**职责:** 核心业务逻辑,整合三条闭环的业务处理。
|
||||
|
||||
**核心接口:**
|
||||
|
||||
| 方法 | 路径 | 功能 |
|
||||
|------|------|------|
|
||||
| POST | `/compliance/upload` | 上传待审查文档 |
|
||||
| POST | `/compliance/check` | 智能合规审查 |
|
||||
| GET | `/compliance/report/{id}` | 获取审查报告 |
|
||||
| POST | `/compliance/regulations/download` | 下载法规 |
|
||||
| POST | `/compliance/regulations/update` | 更新法规版本 |
|
||||
| POST | `/compliance/access-control` | 权限分级管理 |
|
||||
| POST | `/compliance/subscribe` | 订阅变更推送 |
|
||||
|
||||
---
|
||||
|
||||
### 2.4 法规感知引擎(Regulation Awareness Engine)
|
||||
|
||||
**职责:** 定时监控法规源,自动检测变更,触发增量更新。
|
||||
|
||||
**六步感知闭环:**
|
||||
|
||||
```
|
||||
① 法规源监控
|
||||
- 定时抓取:国家标准委、工信部、UN-ECE、EUR-Lex、碳交易平台
|
||||
- 技术:requests + BeautifulSoup + Playwright(动态页面)
|
||||
|
||||
② 智能变更感知
|
||||
- Hash 对比(快速过滤)
|
||||
- NLP 版本 Diff(精确识别新增/修订/废止条款)
|
||||
|
||||
③ 自动解析入库
|
||||
- MinerU 解析 → 条款级分割
|
||||
- BGE-M3 向量化 → Milvus + PostgreSQL
|
||||
|
||||
④ 知识图谱同步
|
||||
- Neo4j 更新:法规-条款-义务关系
|
||||
- 影响分析:哪些企业文档受影响
|
||||
|
||||
⑤ 差距分析
|
||||
- AI 比对企业现状 vs 新法规要求
|
||||
- 生成差距报告
|
||||
|
||||
⑥ 推送与整改触发
|
||||
- 按角色推送(研发/EHS/采购/法务)
|
||||
- 自动生成整改任务
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2.5 AI 推理引擎(AI Inference Engine)
|
||||
|
||||
**职责:** 混合检索、精排、LLM 生成、知识图谱推理。
|
||||
|
||||
**混合检索流程:**
|
||||
|
||||
```
|
||||
用户查询
|
||||
│
|
||||
├─► BGE-M3 向量化(Dense 1024维)
|
||||
│ │
|
||||
│ └─► Milvus ANN 检索(HNSW,Cosine 相似度)
|
||||
│
|
||||
├─► BM25 关键词检索(稀疏向量/倒排索引)
|
||||
│
|
||||
└─► 结果融合(RRF 排名融合)
|
||||
│
|
||||
▼
|
||||
Cross-Encoder Reranker(精排 Top-K)
|
||||
│
|
||||
▼
|
||||
LLM 生成(DeepSeek API)
|
||||
- System Prompt:引文锚定要求
|
||||
- 输出:答案 + 原文引用 + 来源文档 + 页码
|
||||
```
|
||||
|
||||
**BGE-M3 三种向量输出:**
|
||||
- **Dense Vector**(1024维):语义相似度,主要用于向量检索
|
||||
- **Sparse Vector**(词汇权重):关键字匹配,等效 BM25
|
||||
- **Multi-Vector**(ColBERT 风格):精细粒度 token 级匹配
|
||||
|
||||
---
|
||||
|
||||
### 2.6 Worker 集群
|
||||
|
||||
**职责:** 异步任务处理,解耦主服务压力。
|
||||
|
||||
**Worker 类型:**
|
||||
|
||||
| Worker | 队列 | 职责 |
|
||||
|--------|------|------|
|
||||
| parse-worker | `parse` | 调用 mcp-server 解析文档 |
|
||||
| vectorize-worker | `vectorize` | BGE-M3 向量化 + Milvus 写入 |
|
||||
| compliance-worker | `compliance` | 合规比对 + 风险评分 |
|
||||
| monitor-worker | `monitor` | 法规源定时抓取 |
|
||||
| push-worker | `push` | 推送通知(Email/Webhook)|
|
||||
|
||||
**调度配置(Celery Beat):**
|
||||
```python
|
||||
CELERY_BEAT_SCHEDULE = {
|
||||
"regulation-monitor": {
|
||||
"task": "app.worker.fetch_regulation_updates",
|
||||
"schedule": crontab(hour=2, minute=0), # 每天凌晨2点
|
||||
},
|
||||
"push-notifications": {
|
||||
"task": "app.worker.send_pending_notifications",
|
||||
"schedule": crontab(minute="*/30"), # 每30分钟
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 三、数据模型
|
||||
|
||||
### 3.1 PostgreSQL 表结构
|
||||
|
||||
```
|
||||
workspaces → 知识库工作空间
|
||||
files → 上传文件记录(含解析状态)
|
||||
tasks → 异步任务状态追踪
|
||||
compliance_reports → 合规审查报告
|
||||
regulation_sources → 法规监控源配置
|
||||
regulation_updates → 法规变更记录
|
||||
subscriptions → 推送订阅配置
|
||||
audit_logs → 全链路审计日志
|
||||
```
|
||||
|
||||
### 3.2 Milvus Collection 结构
|
||||
|
||||
```python
|
||||
# regulation_chunks / doc_chunks / case_library 共用相同 Schema
|
||||
fields = [
|
||||
FieldSchema("id", VARCHAR, primary_key=True),
|
||||
FieldSchema("file_id", VARCHAR), # 关联文件
|
||||
FieldSchema("workspace_id", VARCHAR), # 所属工作空间
|
||||
FieldSchema("chunk_idx", INT64), # 块序号
|
||||
FieldSchema("content", VARCHAR(65535)), # 原文内容
|
||||
FieldSchema("dense_vec", FLOAT_VECTOR(1024)), # BGE-M3 向量
|
||||
FieldSchema("metadata", JSON), # 扩展元数据
|
||||
]
|
||||
# 索引:HNSW,Cosine 相似度,M=16,efConstruction=200
|
||||
```
|
||||
|
||||
### 3.3 Neo4j 图模型
|
||||
|
||||
```cypher
|
||||
// 节点类型
|
||||
(:Regulation {id, title, code, version, domain, effective_date})
|
||||
(:Clause {id, number, content, clause_type})
|
||||
(:Obligation {id, description, obligation_type, subject})
|
||||
(:Enterprise {id, name, industry})
|
||||
(:RiskItem {id, description, severity, domain})
|
||||
(:Domain {name, label})
|
||||
|
||||
// 关系类型
|
||||
(Regulation)-[:CONTAINS]->(Clause)
|
||||
(Clause)-[:REQUIRES]->(Obligation)
|
||||
(Regulation)-[:SUPERSEDES]->(Regulation) // 版本替代
|
||||
(Clause)-[:MAPS_TO]->(RiskItem)
|
||||
(Enterprise)-[:SUBJECT_TO]->(Regulation)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 四、核心技术栈版本锁定
|
||||
|
||||
| 组件 | 版本 | Docker 镜像 |
|
||||
|------|------|------------|
|
||||
| PostgreSQL | 16 + pgvector | `pgvector/pgvector:pg16` |
|
||||
| Redis | 7.x | `redis:7-alpine` |
|
||||
| Milvus | 2.4.13 | `milvusdb/milvus:v2.4.13` |
|
||||
| Neo4j | 5.20 Community | `neo4j:5.20-community` |
|
||||
| BGE-M3 | 最新 | BAAI/bge-m3(HuggingFace)|
|
||||
| MinerU | 1.x | opendatalab/MinerU(pip)|
|
||||
| LangChain | 0.3+ | pip install "langchain>=0.3" |
|
||||
| FastAPI | 0.115+ | pip install "fastapi>=0.115" |
|
||||
| Celery | 5.4+ | pip install "celery[redis]>=5.4" |
|
||||
| Python | 3.12 | python:3.12-slim(Docker)|
|
||||
| Nginx | 1.25 | `nginx:1.25-alpine` |
|
||||
569
02_组件安装指南.md
Normal file
569
02_组件安装指南.md
Normal file
@@ -0,0 +1,569 @@
|
||||
# AI合规智能中枢 — 组件安装指南
|
||||
|
||||
> 本文档提供每个组件的详细安装步骤、配置说明和验证方法。
|
||||
|
||||
---
|
||||
|
||||
## 前置:Docker 环境安装
|
||||
|
||||
### Ubuntu 22.04 LTS
|
||||
|
||||
```bash
|
||||
# 1. 更新包列表
|
||||
sudo apt-get update
|
||||
|
||||
# 2. 安装依赖
|
||||
sudo apt-get install -y ca-certificates curl gnupg lsb-release
|
||||
|
||||
# 3. 添加 Docker GPG 密钥
|
||||
sudo install -m 0755 -d /etc/apt/keyrings
|
||||
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \
|
||||
sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
|
||||
sudo chmod a+r /etc/apt/keyrings/docker.gpg
|
||||
|
||||
# 4. 添加 Docker 仓库
|
||||
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \
|
||||
https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | \
|
||||
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
|
||||
|
||||
# 5. 安装 Docker CE
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y docker-ce docker-ce-cli containerd.io \
|
||||
docker-buildx-plugin docker-compose-plugin
|
||||
|
||||
# 6. 加入 docker 组(免 sudo)
|
||||
sudo usermod -aG docker $USER
|
||||
newgrp docker
|
||||
|
||||
# 7. 验证
|
||||
docker --version # Docker version 27.x.x
|
||||
docker compose version # Docker Compose version v2.x.x
|
||||
```
|
||||
|
||||
### Windows 11 + WSL2
|
||||
|
||||
```powershell
|
||||
# PowerShell(管理员)
|
||||
|
||||
# 1. 启用 WSL2
|
||||
wsl --install -d Ubuntu-22.04
|
||||
wsl --set-default-version 2
|
||||
|
||||
# 2. 安装 Docker Desktop(需重启)
|
||||
winget install -e --id Docker.DockerDesktop
|
||||
|
||||
# 3. 重启后,Docker Desktop 设置:
|
||||
# Settings → General → "Use WSL 2 based engine" ✓
|
||||
# Settings → Resources → WSL Integration → Ubuntu-22.04 ✓
|
||||
```
|
||||
|
||||
### GPU 支持(可选,有 NVIDIA GPU 时)
|
||||
|
||||
```bash
|
||||
# Ubuntu 安装 nvidia-container-toolkit
|
||||
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
|
||||
sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
|
||||
|
||||
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
|
||||
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
|
||||
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
|
||||
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y nvidia-container-toolkit
|
||||
sudo nvidia-ctk runtime configure --runtime=docker
|
||||
sudo systemctl restart docker
|
||||
|
||||
# 验证
|
||||
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件一:PostgreSQL 16 + pgvector
|
||||
|
||||
**用途:** 存储元数据(文件记录、任务状态、合规报告、法规变更)
|
||||
|
||||
### 配置参数
|
||||
|
||||
```yaml
|
||||
# docker-compose.yml 中的关键配置
|
||||
image: pgvector/pgvector:pg16 # 内置 pgvector 扩展
|
||||
POSTGRES_USER: compliance
|
||||
POSTGRES_PASSWORD: <your_password>
|
||||
POSTGRES_DB: compliance_db
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data # 数据持久化
|
||||
- ./init-sql:/docker-entrypoint-initdb.d # 自动执行初始化 SQL
|
||||
ports:
|
||||
- "5432:5432"
|
||||
```
|
||||
|
||||
### 启动与验证
|
||||
|
||||
```bash
|
||||
# 启动
|
||||
docker compose up -d postgres
|
||||
|
||||
# 等待健康(约10秒)
|
||||
docker compose ps postgres
|
||||
|
||||
# 连接测试
|
||||
docker compose exec postgres psql -U compliance -d compliance_db -c "\dt"
|
||||
|
||||
# 验证扩展
|
||||
docker compose exec postgres psql -U compliance -d compliance_db \
|
||||
-c "SELECT extname FROM pg_extension WHERE extname IN ('vector', 'uuid-ossp');"
|
||||
```
|
||||
|
||||
### 常用操作
|
||||
|
||||
```bash
|
||||
# 查看所有表
|
||||
docker compose exec postgres psql -U compliance -d compliance_db \
|
||||
-c "\dt"
|
||||
|
||||
# 查询任务状态
|
||||
docker compose exec postgres psql -U compliance -d compliance_db \
|
||||
-c "SELECT id, task_type, status, created_at FROM tasks ORDER BY created_at DESC LIMIT 10;"
|
||||
|
||||
# 备份数据库
|
||||
docker compose exec -T postgres pg_dump -U compliance compliance_db > backup_$(date +%Y%m%d).sql
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件二:Redis 7
|
||||
|
||||
**用途:** Celery 消息中间件、热数据缓存、分布式锁、会话存储
|
||||
|
||||
### 配置参数
|
||||
|
||||
```yaml
|
||||
image: redis:7-alpine
|
||||
# allkeys-lru:内存满时淘汰最近最少使用的 key(注释须写在折叠块 `>` 之外,否则会被并入命令参数)
|
||||
command: >
|
||||
redis-server
|
||||
--requirepass <your_password>
|
||||
--maxmemory 2gb
|
||||
--maxmemory-policy allkeys-lru
|
||||
ports:
|
||||
- "6379:6379"
|
||||
```
|
||||
|
||||
### 启动与验证
|
||||
|
||||
```bash
|
||||
# 启动
|
||||
docker compose up -d redis
|
||||
|
||||
# 连接测试
|
||||
docker compose exec redis redis-cli -a <password> ping
|
||||
# 应返回:PONG
|
||||
|
||||
# 查看 Celery 队列长度
|
||||
docker compose exec redis redis-cli -a <password> llen celery
|
||||
|
||||
# 查看内存使用
|
||||
docker compose exec redis redis-cli -a <password> info memory | grep used_memory_human
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件三:Milvus 2.4 Standalone
|
||||
|
||||
**用途:** 向量数据库,存储 BGE-M3 嵌入向量,支持混合检索
|
||||
|
||||
### 架构说明
|
||||
|
||||
Milvus Standalone 部署由三个容器组成(均在 docker-compose.yml 中定义):
|
||||
- **etcd**:元数据存储(Collection 定义、索引配置)
|
||||
- **MinIO**:向量段文件存储
|
||||
- **milvus**:查询/写入引擎
|
||||
|
||||
### 启动顺序(严格按顺序)
|
||||
|
||||
```bash
|
||||
# 1. 先启动 etcd
|
||||
docker compose up -d etcd
|
||||
sleep 10
|
||||
|
||||
# 2. 再启动 MinIO
|
||||
docker compose up -d minio
|
||||
sleep 10
|
||||
|
||||
# 3. 最后启动 Milvus(依赖前两者)
|
||||
docker compose up -d milvus
|
||||
# Milvus 冷启动约需 60 秒,请耐心等待
|
||||
```
|
||||
|
||||
### 验证
|
||||
|
||||
```bash
|
||||
# HTTP 健康检查
|
||||
curl http://localhost:9091/healthz
|
||||
# 应返回:OK
|
||||
|
||||
# Python 连接测试
|
||||
python3 -c "
|
||||
from pymilvus import connections, utility
|
||||
connections.connect(host='localhost', port='19530')
|
||||
print('Collections:', utility.list_collections())
|
||||
print('Milvus 连接成功')
|
||||
"
|
||||
```
|
||||
|
||||
### 创建 Collection(向量索引)
|
||||
|
||||
```python
|
||||
from pymilvus import (connections, Collection, CollectionSchema,
|
||||
FieldSchema, DataType, utility)
|
||||
|
||||
connections.connect(host='localhost', port='19530')
|
||||
|
||||
fields = [
|
||||
FieldSchema('id', DataType.VARCHAR, is_primary=True, max_length=128),
|
||||
FieldSchema('content', DataType.VARCHAR, max_length=65535),
|
||||
FieldSchema('dense_vec', DataType.FLOAT_VECTOR, dim=1024), # BGE-M3
|
||||
FieldSchema('metadata', DataType.JSON),
|
||||
]
|
||||
|
||||
schema = CollectionSchema(fields, description='法规条款向量库')
|
||||
col = Collection('regulation_chunks', schema)
|
||||
|
||||
# 创建 HNSW 索引(速度/精度平衡)
|
||||
col.create_index('dense_vec', {
|
||||
'metric_type': 'COSINE',
|
||||
'index_type': 'HNSW',
|
||||
'params': {'M': 16, 'efConstruction': 200}
|
||||
})
|
||||
col.load()
|
||||
print('Collection 创建完成')
|
||||
```
|
||||
|
||||
### 常用查询
|
||||
|
||||
```python
|
||||
# 向量相似度检索
|
||||
results = col.search(
|
||||
data=[query_vector], # 查询向量(1024维)
|
||||
anns_field='dense_vec',
|
||||
param={'metric_type': 'COSINE', 'params': {'ef': 100}},
|
||||
limit=10,
|
||||
output_fields=['content', 'metadata']
|
||||
)
|
||||
|
||||
# 查看 Collection 统计
|
||||
print(col.num_entities) # 向量总数
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件四:Neo4j 5 Community
|
||||
|
||||
**用途:** 知识图谱存储,法规-条款-义务实体关系
|
||||
|
||||
### 配置参数
|
||||
|
||||
```yaml
|
||||
image: neo4j:5.20-community
|
||||
environment:
|
||||
NEO4J_AUTH: neo4j/<your_password>
|
||||
NEO4J_PLUGINS: '["apoc"]' # 必须安装 APOC 插件
|
||||
NEO4J_server_memory_heap_max__size: 2G
|
||||
ports:
|
||||
- "7474:7474" # Browser UI
|
||||
- "7687:7687" # Bolt 协议(应用连接用)
|
||||
```
|
||||
|
||||
### 启动与验证
|
||||
|
||||
```bash
|
||||
# 启动
|
||||
docker compose up -d neo4j
|
||||
# 首次启动约需 60 秒(下载 APOC 插件)
|
||||
|
||||
# 浏览器访问:http://localhost:7474
|
||||
# 用户名:neo4j,密码:见 .env 中 NEO4J_PASSWORD
|
||||
|
||||
# 命令行连接
|
||||
docker compose exec neo4j cypher-shell -u neo4j -p <password>
|
||||
```
|
||||
|
||||
### 常用 Cypher 查询
|
||||
|
||||
```cypher
|
||||
// 查看所有节点类型
|
||||
CALL apoc.meta.schema() YIELD value RETURN value;
|
||||
|
||||
// 创建法规节点
|
||||
CREATE (r:Regulation {
|
||||
id: 'GB18384-2020',
|
||||
title: 'GB 18384-2020 电动汽车安全要求',
|
||||
domain: 'vehicle_safety',
|
||||
effective_date: date('2021-01-01'),
|
||||
version: '2020'
|
||||
});
|
||||
|
||||
// 法规-条款关系
|
||||
MATCH (r:Regulation {id: 'GB18384-2020'})
|
||||
CREATE (c:Clause {
|
||||
id: 'GB18384-2020-2.1',
|
||||
number: '2.1',
|
||||
content: '绝缘电阻要求:直流电路绝缘电阻不得低于100Ω/V'
|
||||
})
|
||||
CREATE (r)-[:CONTAINS]->(c);
|
||||
|
||||
// 多跳查询:查找某法规所有义务
|
||||
MATCH (r:Regulation {domain: 'vehicle_safety'})-[:CONTAINS]->(c)-[:REQUIRES]->(o)
|
||||
RETURN r.title, c.number, o.description LIMIT 20;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件五:BGE-M3 嵌入服务
|
||||
|
||||
**用途:** 将文本转换为 1024 维向量,支持 100+ 种语言(含中英德法),支持 Dense+Sparse 混合检索
|
||||
|
||||
### 服务构建
|
||||
|
||||
```bash
|
||||
# 构建镜像
|
||||
docker compose build embedding-service
|
||||
|
||||
# 首次启动(会自动下载 BGE-M3 模型约 2.5GB)
|
||||
docker compose up -d embedding-service
|
||||
|
||||
# 查看下载进度
|
||||
docker compose logs -f embedding-service
|
||||
```
|
||||
|
||||
### 模型预下载(推荐,避免启动超时)
|
||||
|
||||
```bash
|
||||
# 方法1:通过 hf-mirror.com 加速
|
||||
bash scripts/download_models.sh
|
||||
|
||||
# 方法2:通过 ModelScope(国内最快)
|
||||
pip install modelscope
|
||||
python3 -c "
|
||||
from modelscope import snapshot_download
|
||||
snapshot_download('AI-ModelScope/bge-m3', cache_dir='./models/modelscope')
|
||||
"
|
||||
```
|
||||
|
||||
### API 使用
|
||||
|
||||
```bash
|
||||
# 健康检查
|
||||
curl http://localhost:8010/health
|
||||
|
||||
# 生成嵌入向量
|
||||
curl -X POST http://localhost:8010/embed \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"texts": ["GB 18384 电动汽车碰撞安全", "vehicle crash safety requirements"],
|
||||
"batch_size": 2
|
||||
}'
|
||||
# 返回:{"dense": [[...1024个浮点数...], [...]], "sparse": [{...词汇权重...}, {...}]}
|
||||
```
|
||||
|
||||
### 性能参考
|
||||
|
||||
| 模式 | 硬件 | 速度 |
|
||||
|------|------|------|
|
||||
| CPU | 16核,64GB RAM | 约 2-5 秒/批(batch=16)|
|
||||
| GPU | RTX 3090 24GB | 约 0.2-0.5 秒/批(batch=32)|
|
||||
|
||||
---
|
||||
|
||||
## 组件六:MinerU 文档解析服务
|
||||
|
||||
**用途:** 将 PDF/Word/Excel 解析为 Markdown + 结构化 JSON
|
||||
|
||||
### 服务构建
|
||||
|
||||
```bash
|
||||
# 构建镜像(首次约需 10-20 分钟,下载大量依赖)
|
||||
docker compose build mcp-server
|
||||
|
||||
# 启动服务(首次会下载 MinerU 模型约 2GB)
|
||||
docker compose up -d mcp-server
|
||||
|
||||
# 查看启动日志
|
||||
docker compose logs -f mcp-server
|
||||
```
|
||||
|
||||
### API 使用
|
||||
|
||||
```bash
|
||||
# 解析 PDF
|
||||
curl -X POST http://localhost:8011/mineru-parse \
|
||||
-F "file=@/path/to/regulation.pdf"
|
||||
# 返回:{"markdown": "# 法规标题\n\n## 第一章...", "filename": "regulation.pdf"}
|
||||
|
||||
# 解析 Word 文档
|
||||
curl -X POST http://localhost:8011/parse-document \
|
||||
-F "file=@/path/to/document.docx"
|
||||
```
|
||||
|
||||
### 性能参考
|
||||
|
||||
| 模式 | 速度 | 说明 |
|
||||
|------|------|------|
|
||||
| CPU | 3-5 秒/页 | 调研阶段可接受 |
|
||||
| GPU(RTX 3090)| 0.21 秒/页 | 生产推荐 |
|
||||
|
||||
---
|
||||
|
||||
## 组件七:业务后端(compliance-backend)
|
||||
|
||||
**用途:** FastAPI 主服务,整合所有业务逻辑
|
||||
|
||||
### 关键依赖配置
|
||||
|
||||
```bash
|
||||
# .env 中必须设置
|
||||
DEEPSEEK_API_KEY=sk-xxxx # DeepSeek API Key
|
||||
LLM_PROVIDER=deepseek # 或 qwen
|
||||
DATABASE_URL=postgresql+asyncpg://...
|
||||
REDIS_URL=redis://:password@redis:6379/0
|
||||
MILVUS_HOST=milvus
|
||||
NEO4J_URI=bolt://neo4j:7687
|
||||
EMBEDDING_SERVICE_URL=http://embedding-service:8010
|
||||
MCP_SERVER_URL=http://mcp-server:8011
|
||||
```
|
||||
|
||||
### 启动与验证
|
||||
|
||||
```bash
|
||||
# 启动服务
|
||||
docker compose up -d compliance-backend celery-worker celery-beat
|
||||
|
||||
# 验证 API 文档
|
||||
xdg-open http://localhost:8000/docs   # macOS 请改用 open;WSL 下可直接在 Windows 浏览器中访问该地址
|
||||
|
||||
# 查看健康状态(包含所有依赖)
|
||||
curl http://localhost:8000/health
|
||||
```
|
||||
|
||||
### Celery Worker 监控
|
||||
|
||||
```bash
|
||||
# 查看 Worker 状态
|
||||
docker compose exec celery-worker celery -A app.worker inspect active
|
||||
|
||||
# 查看队列积压
|
||||
docker compose exec redis redis-cli -a <password> llen celery
|
||||
|
||||
# Worker 日志
|
||||
docker compose logs -f celery-worker
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件八:Nginx API 网关
|
||||
|
||||
**用途:** 反向代理,统一路由,TLS 终止(生产)
|
||||
|
||||
### 配置说明(config/nginx.conf)
|
||||
|
||||
```nginx
|
||||
upstream compliance_backend {
|
||||
server compliance-backend:8000;
|
||||
}
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
client_max_body_size 100M; # 支持大 PDF 上传
|
||||
proxy_read_timeout 300s; # LLM 推理超时设置
|
||||
|
||||
location /api/kb/ { proxy_pass http://compliance_backend; }
|
||||
location /api/compliance/ { proxy_pass http://compliance_backend; }
|
||||
location /api/regulation/ { proxy_pass http://compliance_backend; }
|
||||
location /health { proxy_pass http://compliance_backend; }
|
||||
location /docs { proxy_pass http://compliance_backend; }
|
||||
}
|
||||
```
|
||||
|
||||
### 启动与验证
|
||||
|
||||
```bash
|
||||
# 启动
|
||||
docker compose up -d nginx
|
||||
|
||||
# 测试路由
|
||||
curl http://localhost/health
|
||||
curl http://localhost/docs # 应返回 Swagger UI HTML
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 完整启动顺序
|
||||
|
||||
```bash
|
||||
# 方式1:分步启动(推荐,含健康等待)
|
||||
bash scripts/06_start_all.sh
|
||||
|
||||
# 方式2:手动分步
|
||||
docker compose up -d postgres redis # 等30s
|
||||
docker compose up -d etcd minio # 等30s
|
||||
docker compose up -d milvus # 等60s
|
||||
docker compose up -d neo4j # 等60s
|
||||
docker compose build embedding-service mcp-server compliance-backend
|
||||
docker compose up -d embedding-service mcp-server # 等120s(模型加载)
|
||||
bash scripts/05_init_db.sh # 初始化数据库
|
||||
docker compose up -d compliance-backend celery-worker celery-beat nginx
|
||||
|
||||
# 验证
|
||||
bash scripts/check_health.sh
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 常见问题
|
||||
|
||||
### Q: Milvus 启动失败
|
||||
|
||||
```bash
|
||||
# 检查 etcd 和 minio 是否健康
|
||||
docker compose ps etcd minio
|
||||
|
||||
# 查看 Milvus 日志
|
||||
docker compose logs milvus | tail -50
|
||||
|
||||
# 常见原因:内存不足(Milvus 需要至少 4GB 可用内存)
|
||||
free -h
|
||||
```
|
||||
|
||||
### Q: BGE-M3 模型下载失败
|
||||
|
||||
```bash
|
||||
# 使用镜像加速
|
||||
export HF_ENDPOINT=https://hf-mirror.com
|
||||
docker compose up -d embedding-service
|
||||
|
||||
# 或使用 ModelScope
|
||||
bash scripts/download_models.sh
|
||||
```
|
||||
|
||||
### Q: DeepSeek API 连接超时
|
||||
|
||||
```bash
|
||||
# 测试连通性
|
||||
curl -X POST https://api.deepseek.com/v1/chat/completions \
|
||||
-H "Authorization: Bearer $DEEPSEEK_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model": "deepseek-chat", "messages": [{"role": "user", "content": "ping"}]}'
|
||||
|
||||
# 常见原因:API Key 未设置或网络问题
|
||||
```
|
||||
|
||||
### Q: 内存不足
|
||||
|
||||
```bash
|
||||
# 查看内存使用
|
||||
docker stats --no-stream
|
||||
|
||||
# 临时解决:减少 BGE-M3 批大小(降低内存峰值)
|
||||
# 编辑 docker-compose.yml,将 embedding-service 的环境变量改为:
|
||||
#   MAX_BATCH_SIZE: "4"  (默认 "16"),然后执行 docker compose up -d embedding-service
|
||||
```
|
||||
536
03_业务闭环说明.md
Normal file
536
03_业务闭环说明.md
Normal file
@@ -0,0 +1,536 @@
|
||||
# AI合规智能中枢 — 三条业务闭环说明
|
||||
|
||||
> 本文档详细描述三条核心业务闭环的数据流、接口规范和验证方法。
|
||||
|
||||
---
|
||||
|
||||
## 一、闭环①:法规入库 → 检索问答
|
||||
|
||||
### 1.1 业务场景
|
||||
|
||||
**触发场景:**
|
||||
- 法务/研发人员上传新法规 PDF(如 GB 18384-2020、UN-ECE R155)
|
||||
- 系统自动解析、分块、向量化,建立可检索知识库
|
||||
- 用户用自然语言提问,系统返回精准答案并标注来源
|
||||
|
||||
**用户角色:** 车企研发、法务、合规管理员
|
||||
|
||||
### 1.2 数据流
|
||||
|
||||
```
|
||||
[用户] 上传 PDF
|
||||
│
|
||||
▼
|
||||
POST /api/kb/files/upload
|
||||
{workspace_id, file}
|
||||
│
|
||||
▼
|
||||
[kbmp-service]
|
||||
- 存储文件 → data/uploads/{file_id}.pdf
|
||||
- 写入 files 表(status: uploaded)
|
||||
- 投递 Celery 任务 → parse-queue
|
||||
- 返回 {task_id, file_id}
|
||||
│
|
||||
▼ 异步
|
||||
[celery: parse-worker]
|
||||
- 调用 POST http://mcp-server:8011/mineru-parse
|
||||
- 获取 Markdown 文本
|
||||
- 更新 files 表(status: parsed)
|
||||
- 投递 vectorize-queue
|
||||
│
|
||||
▼ 异步
|
||||
[celery: vectorize-worker]
|
||||
- 文本分块(chunk_size=512,overlap=64)
|
||||
- 调用 POST http://embedding-service:8010/embed
|
||||
- 获取 1024维 Dense + Sparse 向量
|
||||
- 写入 Milvus regulation_chunks
|
||||
- 写入 PostgreSQL(chunk 元数据)
|
||||
- 更新 files 表(status: vectorized)
|
||||
- 更新 tasks 表(status: completed)
|
||||
|
||||
[用户] 提问
|
||||
│
|
||||
▼
|
||||
POST /api/kb/qa
|
||||
{query, workspace_id, top_k=5}
|
||||
│
|
||||
▼
|
||||
[rag-service]
|
||||
1. BGE-M3 向量化查询
|
||||
2. Milvus Dense 向量检索(Cosine,top-20)
|
||||
3. Milvus Sparse 向量检索(BM25 等效,top-20)
|
||||
4. RRF 融合(Reciprocal Rank Fusion)
|
||||
5. Cross-Encoder Reranker 精排(top-5)
|
||||
6. 构建 RAG Prompt(含检索片段)
|
||||
7. DeepSeek API 生成答案(引文锚定)
|
||||
│
|
||||
▼
|
||||
返回:{answer, sources: [{content, file, page, score}], tokens_used}
|
||||
```
|
||||
|
||||
### 1.3 关键接口
|
||||
|
||||
```http
|
||||
### 创建工作空间
|
||||
POST /api/kb/workspaces
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"name": "汽车安全法规库",
|
||||
"description": "GB、UN-ECE 系列法规",
|
||||
"domain": "vehicle_safety"
|
||||
}
|
||||
|
||||
### 响应
|
||||
{
|
||||
"id": "uuid-xxx",
|
||||
"name": "汽车安全法规库",
|
||||
"created_at": "2026-04-22T10:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
```http
|
||||
### 上传文件
|
||||
POST /api/kb/files/upload
|
||||
Content-Type: multipart/form-data
|
||||
|
||||
file: <binary>
|
||||
workspace_id: uuid-xxx
|
||||
|
||||
### 响应
|
||||
{
|
||||
"file_id": "uuid-yyy",
|
||||
"task_id": "uuid-zzz",
|
||||
"filename": "GB18384-2020.pdf",
|
||||
"status": "processing"
|
||||
}
|
||||
```
|
||||
|
||||
```http
|
||||
### 查询任务状态
|
||||
GET /api/kb/tasks/{task_id}
|
||||
|
||||
### 响应
|
||||
{
|
||||
"task_id": "uuid-zzz",
|
||||
"status": "completed", // pending / running / completed / failed
|
||||
"progress": 100,
|
||||
"file_id": "uuid-yyy",
|
||||
"completed_at": "2026-04-22T10:05:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
```http
|
||||
### 智能问答
|
||||
POST /api/kb/qa
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"query": "电动汽车碰撞后高压系统的断电时间要求是多少?",
|
||||
"workspace_id": "uuid-xxx",
|
||||
"top_k": 5,
|
||||
"return_sources": true
|
||||
}
|
||||
|
||||
### 响应
|
||||
{
|
||||
"answer": "根据 GB 18384-2020 第 2.2 条,碰撞后 5 秒内,高压系统电压应降至 60V 以下。[来源:GB18384-2020.pdf,第3页]",
|
||||
"sources": [
|
||||
{
|
||||
"content": "碰撞后5秒内,高压系统电压应降至60V以下。",
|
||||
"file": "GB18384-2020.pdf",
|
||||
"page": 3,
|
||||
"chunk_idx": 12,
|
||||
"score": 0.94
|
||||
}
|
||||
],
|
||||
"tokens_used": 1250
|
||||
}
|
||||
```
|
||||
|
||||
### 1.4 分块策略
|
||||
|
||||
```python
|
||||
# 推荐分块配置(调研阶段)
|
||||
CHUNK_SIZE = 512 # 每块最大 token 数
|
||||
CHUNK_OVERLAP = 64 # 块间重叠(保留上下文)
|
||||
SEPARATOR = "\n\n" # 优先按段落分割
|
||||
|
||||
# 法规文档特殊处理
|
||||
# - 识别条款编号(1.1, 2.3.1 等),保证条款完整性
|
||||
# - 表格单独处理(不与正文混合)
|
||||
# - 图片提取 alt text
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 二、闭环②:文档上传 → 合规审查
|
||||
|
||||
### 2.1 业务场景
|
||||
|
||||
**触发场景:**
|
||||
- 采购/供应链人员上传供应商文件(技术规格书、合规声明等)
|
||||
- 研发人员上传设计文档,检查是否符合最新法规
|
||||
- EHS 工程师上传安全操作规程,验证 ISO 45001 合规性
|
||||
|
||||
**用户角色:** 采购、供应链、研发、EHS 工程师
|
||||
|
||||
### 2.2 数据流
|
||||
|
||||
```
|
||||
[用户] 上传供应商文件
|
||||
│
|
||||
▼
|
||||
POST /api/compliance/upload
|
||||
{file, regulation_domains}
|
||||
│
|
||||
▼
|
||||
[compliance-backend]
|
||||
- MinerU 解析文档
|
||||
- 条款级分割(识别条款结构)
|
||||
- 法规域匹配(根据内容自动识别:vehicle_safety / data_security / ehs)
|
||||
- 投递 compliance-queue
|
||||
│
|
||||
▼ 异步
|
||||
[celery: compliance-worker]
|
||||
1. 对每个条款,在 Milvus 中检索相关法规要求
|
||||
2. DeepSeek API 评估合规性
|
||||
Prompt: "对比以下供应商条款与法规要求,评估合规性..."
|
||||
3. 生成风险评分(0-100)
|
||||
4. 汇总生成 Markdown 报告
|
||||
5. 存储 compliance_reports 表
|
||||
│
|
||||
▼
|
||||
[用户] 获取报告
|
||||
GET /api/compliance/report/{id}
|
||||
```
|
||||
|
||||
### 2.3 关键接口
|
||||
|
||||
```http
|
||||
### 上传并审查文档
|
||||
POST /api/compliance/upload
|
||||
Content-Type: multipart/form-data
|
||||
|
||||
file: <binary>
|
||||
regulation_domains: ["vehicle_safety", "data_security"] # 可多选
|
||||
|
||||
### 响应
|
||||
{
|
||||
"report_id": "uuid-aaa",
|
||||
"file_id": "uuid-bbb",
|
||||
"status": "analyzing",
|
||||
"estimated_time_seconds": 60
|
||||
}
|
||||
```
|
||||
|
||||
```http
|
||||
### 直接合规检查(文本输入)
|
||||
POST /api/compliance/check
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"query": "供应商声明:产品绝缘电阻为50Ω/V,满足行业标准",
|
||||
"regulation_domains": ["vehicle_safety"],
|
||||
"top_k": 3
|
||||
}
|
||||
|
||||
### 响应
|
||||
{
|
||||
"risk_level": "high",
|
||||
"risk_score": 78,
|
||||
"findings": [
|
||||
{
|
||||
"clause": "GB 18384-2020 第2.1条",
|
||||
"requirement": "直流电路绝缘电阻不得低于100Ω/V",
|
||||
"actual": "供应商声明50Ω/V",
|
||||
"gap": "不满足,差距50Ω/V",
|
||||
"severity": "critical"
|
||||
}
|
||||
],
|
||||
"recommendations": [
|
||||
"要求供应商提升绝缘电阻至100Ω/V以上",
|
||||
"提供经第三方认证的测试报告"
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
```http
|
||||
### 获取完整审查报告
|
||||
GET /api/compliance/report/{report_id}
|
||||
|
||||
### 响应
|
||||
{
|
||||
"report_id": "uuid-aaa",
|
||||
"overall_risk_level": "high",
|
||||
"risk_score": 78,
|
||||
"findings": [...],
|
||||
"recommendations": [...],
|
||||
"report_markdown": "# 合规审查报告\n\n## 总体评估\n...",
|
||||
"regulation_domains": ["vehicle_safety"],
|
||||
"llm_model": "deepseek-chat",
|
||||
"created_at": "2026-04-22T11:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
### 2.4 风险等级定义
|
||||
|
||||
| 风险等级 | 分数 | 说明 | 建议行动 |
|
||||
|---------|------|------|---------|
|
||||
| low | 0-30 | 基本合规,小幅优化 | 记录并监控 |
|
||||
| medium | 31-60 | 部分不符合,需要整改 | 制定整改计划 |
|
||||
| high | 61-80 | 重大不符合,需立即处理 | 暂停合作/紧急整改 |
|
||||
| critical | 81-100 | 严重违规,可能造成法律风险 | 立即停止/上报管理层 |
|
||||
|
||||
---
|
||||
|
||||
## 三、闭环③:法规监控 → 变更推送
|
||||
|
||||
### 3.1 业务场景
|
||||
|
||||
**触发场景:**
|
||||
- 国家发布新的新能源汽车数据安全法规
|
||||
- 现有法规(如 GB 7258)进行修订
|
||||
- 碳排放法规新增企业义务
|
||||
|
||||
系统自动检测变更,分析影响,推送给相关角色。
|
||||
|
||||
**用户角色:** 合规管理员、法务专员、EHS 工程师(订阅对应域)
|
||||
|
||||
### 3.2 数据流
|
||||
|
||||
```
|
||||
[Celery Beat] 每天凌晨 2:00 触发
|
||||
│
|
||||
▼
|
||||
[celery: monitor-worker]
|
||||
- 读取 regulation_sources 表(所有 is_active=True 的监控源)
|
||||
- 对每个监控源:
|
||||
a. HTTP 抓取页面内容
|
||||
b. 计算 MD5 Hash
|
||||
c. 与 last_hash 对比
|
||||
d. 有变化 → 投递变更分析任务
|
||||
│
|
||||
▼ [有变更时]
|
||||
[celery: compliance-worker]
|
||||
- DeepSeek API 分析变更内容
|
||||
- 提取新增/修订/废止条款
|
||||
- 生成变更摘要
|
||||
- 写入 regulation_updates 表
|
||||
- 触发增量入库(重新向量化变更条款)
|
||||
- 更新 Neo4j 知识图谱
|
||||
│
|
||||
▼
|
||||
[celery: push-worker]
|
||||
- 读取 subscriptions 表
|
||||
- 按域、重要性过滤
|
||||
- 发送推送(Email / Webhook / 飞书)
|
||||
- 标记 is_notified=True
|
||||
```
|
||||
|
||||
### 3.3 关键接口
|
||||
|
||||
```http
|
||||
### 配置监控源
|
||||
POST /api/regulation/sources
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"name": "国家标准全文公开系统",
|
||||
"url": "https://std.samr.gov.cn",
|
||||
"domain": "vehicle_safety",
|
||||
"fetch_interval": 86400,
|
||||
"fetch_config": {
|
||||
"css_selector": ".standard-list .item",
|
||||
"title_selector": ".title",
|
||||
"date_selector": ".date"
|
||||
}
|
||||
}
|
||||
|
||||
### 响应
|
||||
{
|
||||
"id": "uuid-src1",
|
||||
"name": "国家标准全文公开系统",
|
||||
"status": "active",
|
||||
"next_fetch_at": "2026-04-23T02:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
```http
|
||||
### 查看法规变更记录
|
||||
GET /api/regulation/updates?domain=vehicle_safety&limit=10&offset=0
|
||||
|
||||
### 响应
|
||||
{
|
||||
"total": 25,
|
||||
"updates": [
|
||||
{
|
||||
"id": "uuid-upd1",
|
||||
"title": "GB 18384-2022 电动汽车安全要求(修订版)",
|
||||
"url": "https://std.samr.gov.cn/xxxx",
|
||||
"change_type": "revised",
|
||||
"summary": "主要变更:碰撞断电时间由5秒缩短至3秒;新增涉水安全要求",
|
||||
"importance": "high",
|
||||
"fetched_at": "2026-04-22T02:00:00Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
```http
|
||||
### 手动触发法规源采集(测试用)
|
||||
POST /api/regulation/sources/{source_id}/fetch
|
||||
|
||||
### 响应
|
||||
{
|
||||
"task_id": "uuid-task1",
|
||||
"status": "queued",
|
||||
"source_id": "uuid-src1"
|
||||
}
|
||||
```
|
||||
|
||||
```http
|
||||
### 订阅变更推送
|
||||
POST /api/regulation/subscribe
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"name": "EHS 工程师推送",
|
||||
"channel": "webhook",
|
||||
"target": "https://open.feishu.cn/open-apis/bot/v2/hook/xxxx",
|
||||
"domains": ["ehs", "carbon"],
|
||||
"importance_min": "normal"
|
||||
}
|
||||
```
|
||||
|
||||
### 3.4 内置监控源列表
|
||||
|
||||
| 名称 | URL | 域 |
|
||||
|------|-----|-----|
|
||||
| 国家标准全文公开系统 | https://std.samr.gov.cn | vehicle_safety |
|
||||
| 工信部政策法规 | https://www.miit.gov.cn/jgsj/fgs/zcfg | vehicle_safety |
|
||||
| 应急管理部法规 | https://www.mem.gov.cn/gk/zcfg | ehs |
|
||||
| 生态环境部法规 | https://www.mee.gov.cn/ywgz/fgbz/fl | carbon |
|
||||
| 网信办法规 | https://www.cac.gov.cn/zcfg/index.htm | data_security |
|
||||
|
||||
---
|
||||
|
||||
## 四、接口认证说明(调研版)
|
||||
|
||||
调研版使用简单 API Key 认证(在 `Authorization` 头传入):
|
||||
|
||||
```http
|
||||
# 所有请求需要携带 API Key
|
||||
Authorization: Bearer <API_SECRET_KEY>
|
||||
```
|
||||
|
||||
> `API_SECRET_KEY` 在 `.env` 中配置,默认值仅供本地调研使用,生产环境必须更换。
|
||||
|
||||
---
|
||||
|
||||
## 五、完整冒烟测试脚本
|
||||
|
||||
```bash
|
||||
#!/usr/bin/env bash
|
||||
# 完整三条闭环验证
|
||||
API="http://localhost"
|
||||
KEY="${API_SECRET_KEY:-your_api_secret_key}"  # override via the API_SECRET_KEY environment variable; falls back to the placeholder
|
||||
HEADER=(-H "Authorization: Bearer $KEY" -H "Content-Type: application/json")  # bash array so quoting survives expansion; use as "${HEADER[@]}"
|
||||
|
||||
# ── 闭环①测试 ────────────────────────────────
|
||||
echo "=== 测试闭环①:法规入库 → 问答 ==="
|
||||
|
||||
# 1. 创建工作空间
|
||||
WS=$(curl -sf -X POST $API/api/kb/workspaces \
|
||||
-H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \
|
||||
-d '{"name":"测试法规库","domain":"vehicle_safety"}')
|
||||
WS_ID=$(echo $WS | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])")
|
||||
echo "工作空间:$WS_ID"
|
||||
|
||||
# 2. 上传测试法规文件(示例使用 .txt 文本)
|
||||
UPLOAD=$(curl -sf -X POST $API/api/kb/files/upload \
|
||||
-H "Authorization: Bearer $KEY" \
|
||||
-F "file=@data/uploads/test_regulation.txt" \
|
||||
-F "workspace_id=$WS_ID")
|
||||
TASK_ID=$(echo $UPLOAD | python3 -c "import sys,json; print(json.load(sys.stdin)['task_id'])")
|
||||
echo "任务ID:$TASK_ID"
|
||||
|
||||
# 3. 等待处理
|
||||
for i in {1..30}; do
|
||||
STATUS=$(curl -sf $API/api/kb/tasks/$TASK_ID -H "Authorization: Bearer $KEY" | \
|
||||
python3 -c "import sys,json; print(json.load(sys.stdin)['status'])")
|
||||
[[ "$STATUS" == "completed" ]] && echo "处理完成" && break
|
||||
sleep 5
|
||||
done
|
||||
|
||||
# 4. 问答测试
|
||||
QA=$(curl -sf -X POST $API/api/kb/qa \
|
||||
-H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \
|
||||
-d "{\"query\":\"碰撞后高压系统要求\",\"workspace_id\":\"$WS_ID\"}")
|
||||
echo "问答结果:$(echo $QA | python3 -c "import sys,json; print(json.load(sys.stdin).get('answer','')[:100])")"
|
||||
|
||||
# ── 闭环②测试 ────────────────────────────────
|
||||
echo ""
|
||||
echo "=== 测试闭环②:合规审查 ==="
|
||||
CHECK=$(curl -sf -X POST $API/api/compliance/check \
|
||||
-H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \
|
||||
-d '{"query":"绝缘电阻50Ω/V","regulation_domains":["vehicle_safety"]}')
|
||||
echo "风险等级:$(echo $CHECK | python3 -c "import sys,json; print(json.load(sys.stdin).get('risk_level','unknown'))")"
|
||||
|
||||
# ── 闭环③测试 ────────────────────────────────
|
||||
echo ""
|
||||
echo "=== 测试闭环③:法规监控 ==="
|
||||
SRC=$(curl -sf -X POST $API/api/regulation/sources \
|
||||
-H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \
|
||||
-d '{"name":"测试源","url":"https://std.samr.gov.cn","domain":"vehicle_safety"}')
|
||||
echo "监控源:$(echo $SRC | python3 -c "import sys,json; print(json.load(sys.stdin).get('id','failed'))")"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 六、数据流示意图(完整版)
|
||||
|
||||
```
|
||||
┌─────────────────────────────────┐
|
||||
│ 用户请求 │
|
||||
│ Web / API / Mobile / Bot │
|
||||
└──────────────┬──────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────┐
|
||||
│ Nginx API Gateway │
|
||||
│ 路由 / 限流 / 认证 │
|
||||
└──────────────┬──────────────────┘
|
||||
│
|
||||
┌────────────────────┼────────────────────┐
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
┌──────────────┐ ┌──────────────────┐ ┌────────────────┐
|
||||
│ 知识库 │ │ 合规审查 │ │ 法规监控 │
|
||||
│ /api/kb/* │ │ /api/compliance/* │ │/api/regulation/│
|
||||
└──────┬───────┘ └────────┬─────────┘ └───────┬────────┘
|
||||
│ │ │
|
||||
└──────────┬──────────┘ │
|
||||
│ │
|
||||
▼ ▼
|
||||
┌──────────────────┐ ┌──────────────────┐
|
||||
│ compliance- │ │ Celery Beat │
|
||||
│ backend │ │ 定时调度 │
|
||||
└──────┬───────────┘ └────────┬─────────┘
|
||||
│ │
|
||||
┌──────────┼──────────┐ ┌──────────┼──────────┐
|
||||
│ │ │ │ │ │
|
||||
▼ ▼ ▼ ▼ ▼ ▼
|
||||
parse-w vectorize-w compliance-w monitor-w push-w
|
||||
│ │ │ │ │
|
||||
▼ ▼ │ │ ▼
|
||||
mcp-server embedding LLM API 网络抓取 通知推送
|
||||
(MinerU) (BGE-M3) (DeepSeek) (requests) (Email/Bot)
|
||||
│ │
|
||||
└────┬─────┘
|
||||
│
|
||||
┌──────────┼──────────────┐
|
||||
▼ ▼ ▼
|
||||
PostgreSQL Milvus Neo4j
|
||||
(元数据/报告) (向量检索) (知识图谱)
|
||||
```
|
||||
190
README.md
Normal file
190
README.md
Normal file
@@ -0,0 +1,190 @@
|
||||
# AI合规智能中枢 — 调研版部署指南
|
||||
|
||||
面向车企与工厂的全链路合规智能平台,Docker Compose 单机部署版本,用于验证三条业务闭环。
|
||||
|
||||
## 快速开始
|
||||
|
||||
### 前置要求
|
||||
|
||||
| 资源 | 最低 | 推荐 |
|
||||
|------|------|------|
|
||||
| CPU | 8核 | 16核+ |
|
||||
| 内存 | 32 GB | 64 GB |
|
||||
| 存储 | 200 GB SSD | 500 GB SSD |
|
||||
| GPU | 无需 | 1× RTX 3090(加速嵌入)|
|
||||
| OS | Ubuntu 22.04 LTS 或 Windows 11 + WSL2 | — |
|
||||
|
||||
### 1. 安装 Docker
|
||||
|
||||
**Ubuntu/Linux:**
|
||||
```bash
|
||||
bash scripts/00_install_docker_ubuntu.sh
|
||||
```
|
||||
|
||||
**Windows(PowerShell 管理员):**
|
||||
```powershell
|
||||
.\scripts\00_install_docker_windows.ps1
|
||||
```
|
||||
|
||||
### 2. 配置环境变量
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
# 编辑 .env,至少填写:
|
||||
# - DEEPSEEK_API_KEY(在 https://platform.deepseek.com 申请)
|
||||
nano .env
|
||||
```
|
||||
|
||||
### 3. 一键启动
|
||||
|
||||
```bash
|
||||
# 拉取镜像(可选,加速首次启动)
|
||||
bash scripts/02_pull_images.sh
|
||||
|
||||
# 分步启动(推荐,含健康等待)
|
||||
bash scripts/06_start_all.sh
|
||||
```
|
||||
|
||||
### 4. 验证部署
|
||||
|
||||
```bash
|
||||
# 检查所有服务状态
|
||||
bash scripts/check_health.sh
|
||||
|
||||
# 运行端到端冒烟测试
|
||||
bash scripts/07_smoke_test.sh
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 服务访问地址
|
||||
|
||||
| 服务 | 地址 | 说明 |
|
||||
|------|------|------|
|
||||
| API 网关 | http://localhost | Nginx 网关,所有 API 请求的统一入口 |
|
||||
| 业务后端 | http://localhost:8000/docs | FastAPI Swagger UI |
|
||||
| Neo4j 浏览器 | http://localhost:7474 | 知识图谱可视化 |
|
||||
| Grafana | http://localhost:3000 | 监控面板(`--profile monitoring` 启动)|
|
||||
| Milvus | localhost:19530 | 向量数据库 gRPC 端口 |
|
||||
|
||||
---
|
||||
|
||||
## 三条业务闭环
|
||||
|
||||
### 闭环①:法规入库 → 检索问答
|
||||
|
||||
```bash
|
||||
# 上传法规PDF(workspace_id 为 POST /api/kb/workspaces 返回的工作空间 UUID)
|
||||
curl -X POST http://localhost/api/kb/files/upload \
|
||||
  -F "file=@your_regulation.pdf" \
|
||||
  -F "workspace_id={workspace_id}"
|
||||
|
||||
# 查询任务状态
|
||||
curl http://localhost/api/kb/tasks/{task_id}
|
||||
|
||||
# 检索问答
|
||||
curl -X POST http://localhost/api/kb/qa \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"query": "GB 18384 电动汽车碰撞安全要求", "top_k": 5}'
|
||||
```
|
||||
|
||||
### 闭环②:文档上传 → 合规审查
|
||||
|
||||
```bash
|
||||
# 上传供应商文件
|
||||
curl -X POST http://localhost/api/compliance/upload \
|
||||
-F "file=@supplier_document.pdf"
|
||||
|
||||
# 触发合规审查
|
||||
curl -X POST http://localhost/api/compliance/check \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"doc_id": "xxx", "regulation_domains": ["vehicle_safety", "data_security"]}'
|
||||
|
||||
# 获取审查报告
|
||||
curl http://localhost/api/compliance/report/{id}
|
||||
```
|
||||
|
||||
### 闭环③:法规监控 → 变更推送
|
||||
|
||||
```bash
|
||||
# 配置监控源
|
||||
curl -X POST http://localhost/api/regulation/sources \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url": "https://std.samr.gov.cn", "name": "国家标准全文公开"}'
|
||||
|
||||
# 查看变更记录
|
||||
curl http://localhost/api/regulation/updates
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 目录结构
|
||||
|
||||
```
|
||||
Depolyment/
|
||||
├── README.md # 本文件
|
||||
├── docker-compose.yml # 全服务编排
|
||||
├── .env.example # 环境变量模板
|
||||
├── scripts/ # 安装与运维脚本
|
||||
├── services/
|
||||
│ ├── embedding/ # BGE-M3 嵌入服务
|
||||
│ ├── mcp-server/ # MinerU 文档解析服务
|
||||
│ └── compliance-backend/ # 核心业务后端
|
||||
├── config/ # Nginx、Prometheus 配置
|
||||
├── init-sql/ # PostgreSQL 初始化 SQL
|
||||
├── data/ # 运行时数据(上传文件、解析结果)
|
||||
├── logs/ # 服务日志
|
||||
└── models/ # AI 模型缓存(BGE-M3、MinerU)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 常用操作
|
||||
|
||||
```bash
|
||||
# 查看所有服务状态
|
||||
docker compose ps
|
||||
|
||||
# 查看某个服务日志
|
||||
docker compose logs -f compliance-backend
|
||||
|
||||
# 重启某个服务
|
||||
docker compose restart embedding-service
|
||||
|
||||
# 停止所有服务(保留数据)
|
||||
docker compose stop
|
||||
|
||||
# 完全重置(删除所有数据,慎用)
|
||||
bash scripts/reset_all.sh
|
||||
|
||||
# 启动监控面板
|
||||
docker compose --profile monitoring up -d grafana
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## LLM 切换
|
||||
|
||||
默认使用 DeepSeek API,如需切换到 Qwen(阿里云):
|
||||
|
||||
编辑 `.env`:
|
||||
```bash
|
||||
LLM_PROVIDER=qwen
|
||||
DASHSCOPE_API_KEY=your_key_here
|
||||
QWEN_MODEL=qwen-plus
|
||||
```
|
||||
|
||||
然后重启业务服务:
|
||||
```bash
|
||||
docker compose restart compliance-backend celery-worker
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 生产升级路径
|
||||
|
||||
调研验证通过后,升级要点:
|
||||
1. **LLM**:从 API 切换到本地 vLLM + DeepSeek-V3(需要 4×A100)
|
||||
2. **Milvus**:从 Standalone 升级到分布式集群(加独立 MinIO)
|
||||
3. **编排**:从 Docker Compose 迁移到 Kubernetes(服务配置文件可复用)
|
||||
4. **安全**:启用完整 JWT/RBAC,添加 TLS 证书
|
||||
63
config/nginx.conf
Normal file
63
config/nginx.conf
Normal file
@@ -0,0 +1,63 @@
|
||||
upstream compliance_backend {
|
||||
server compliance-backend:8000;
|
||||
keepalive 32;
|
||||
}
|
||||
|
||||
# 文件上传大小限制(法规PDF可能较大)
|
||||
client_max_body_size 100M;
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
server_name _;
|
||||
|
||||
# 访问日志
|
||||
access_log /var/log/nginx/access.log;
|
||||
error_log /var/log/nginx/error.log;
|
||||
|
||||
# 超时配置(LLM推理可能较慢)
|
||||
proxy_connect_timeout 60s;
|
||||
proxy_send_timeout 300s;
|
||||
proxy_read_timeout 300s;
|
||||
|
||||
# 通用代理头
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Connection "";
|
||||
|
||||
# ── 知识库接口 ─────────────────────────────
|
||||
location /api/kb/ {
|
||||
proxy_pass http://compliance_backend/api/kb/;
|
||||
}
|
||||
|
||||
# ── 合规审查接口 ───────────────────────────
|
||||
location /api/compliance/ {
|
||||
proxy_pass http://compliance_backend/api/compliance/;
|
||||
}
|
||||
|
||||
# ── 法规监控接口 ───────────────────────────
|
||||
location /api/regulation/ {
|
||||
proxy_pass http://compliance_backend/api/regulation/;
|
||||
}
|
||||
|
||||
# ── 健康检查 ───────────────────────────────
|
||||
location /health {
|
||||
proxy_pass http://compliance_backend/health;
|
||||
}
|
||||
|
||||
# ── API 文档(开发环境)────────────────────
|
||||
location /docs {
|
||||
proxy_pass http://compliance_backend/docs;
|
||||
}
|
||||
|
||||
location /openapi.json {
|
||||
proxy_pass http://compliance_backend/openapi.json;
|
||||
}
|
||||
|
||||
# ── 根路径 ─────────────────────────────────
|
||||
location / {
|
||||
proxy_pass http://compliance_backend/;
|
||||
}
|
||||
}
|
||||
22
config/prometheus.yml
Normal file
22
config/prometheus.yml
Normal file
@@ -0,0 +1,22 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "compliance-backend"
|
||||
static_configs:
|
||||
- targets: ["compliance-backend:8000"]
|
||||
metrics_path: /metrics
|
||||
|
||||
- job_name: "milvus"
|
||||
static_configs:
|
||||
- targets: ["milvus:9091"]
|
||||
metrics_path: /metrics
|
||||
|
||||
  - job_name: "redis"  # NOTE(review): redis:6379 is the Redis server port published in docker-compose, not an HTTP /metrics endpoint — presumably needs a redis exporter target instead; confirm
|
||||
    static_configs:
|
||||
      - targets: ["redis:6379"]
|
||||
|
||||
  - job_name: "postgres"  # NOTE(review): postgres:5432 is the PostgreSQL wire-protocol port, not an HTTP /metrics endpoint — presumably needs a postgres exporter target instead; confirm
|
||||
    static_configs:
|
||||
      - targets: ["postgres:5432"]
|
||||
380
docker-compose.yml
Normal file
380
docker-compose.yml
Normal file
@@ -0,0 +1,380 @@
|
||||
version: "3.9"
|
||||
|
||||
networks:
|
||||
compliance-net:
|
||||
driver: bridge
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
redis_data:
|
||||
milvus_data:
|
||||
minio_data:
|
||||
neo4j_data:
|
||||
neo4j_logs:
|
||||
|
||||
services:
|
||||
|
||||
# ═══════════════════════════════════════════════
|
||||
# 基础数据层
|
||||
# ═══════════════════════════════════════════════
|
||||
|
||||
postgres:
|
||||
image: pgvector/pgvector:pg16
|
||||
container_name: compliance-postgres
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
POSTGRES_USER: compliance
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-compliance123}
|
||||
POSTGRES_DB: compliance_db
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data
|
||||
- ./init-sql:/docker-entrypoint-initdb.d
|
||||
ports:
|
||||
- "5432:5432"
|
||||
networks: [compliance-net]
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U compliance -d compliance_db"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: compliance-redis
|
||||
restart: unless-stopped
|
||||
command: >
|
||||
redis-server
|
||||
--requirepass ${REDIS_PASSWORD:-redis123}
|
||||
--maxmemory 2gb
|
||||
--maxmemory-policy allkeys-lru
|
||||
volumes:
|
||||
- redis_data:/data
|
||||
ports:
|
||||
- "6379:6379"
|
||||
networks: [compliance-net]
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD:-redis123}", "ping"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
# ═══════════════════════════════════════════════
|
||||
# Milvus 向量数据库(Standalone,含 etcd + minio)
|
||||
# ═══════════════════════════════════════════════
|
||||
|
||||
etcd:
|
||||
image: quay.io/coreos/etcd:v3.5.5
|
||||
container_name: milvus-etcd
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
ETCD_AUTO_COMPACTION_MODE: revision
|
||||
ETCD_AUTO_COMPACTION_RETENTION: "1000"
|
||||
ETCD_QUOTA_BACKEND_BYTES: "4294967296"
|
||||
ETCD_SNAPSHOT_COUNT: "50000"
|
||||
    volumes:
|
||||
      - milvus_data:/etcd  # NOTE(review): same named volume the milvus service mounts at /var/lib/milvus, so etcd and Milvus data share one volume root — consider a dedicated etcd volume; confirm intent
|
||||
command: >
|
||||
etcd
|
||||
-advertise-client-urls=http://127.0.0.1:2379
|
||||
-listen-client-urls=http://0.0.0.0:2379
|
||||
--data-dir=/etcd
|
||||
networks: [compliance-net]
|
||||
healthcheck:
|
||||
test: ["CMD", "etcdctl", "endpoint", "health"]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
|
||||
minio:
|
||||
image: minio/minio:RELEASE.2023-03-13T19-46-17Z
|
||||
container_name: milvus-minio
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
MINIO_ACCESS_KEY: minioadmin
|
||||
MINIO_SECRET_KEY: minioadmin
|
||||
volumes:
|
||||
- minio_data:/minio_data
|
||||
command: minio server /minio_data --console-address ":9001"
|
||||
ports:
|
||||
- "9001:9001" # MinIO 控制台(可选访问)
|
||||
networks: [compliance-net]
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
|
||||
milvus:
|
||||
image: milvusdb/milvus:v2.4.13
|
||||
container_name: compliance-milvus
|
||||
restart: unless-stopped
|
||||
command: ["milvus", "run", "standalone"]
|
||||
environment:
|
||||
ETCD_ENDPOINTS: etcd:2379
|
||||
MINIO_ADDRESS: minio:9000
|
||||
volumes:
|
||||
- milvus_data:/var/lib/milvus
|
||||
ports:
|
||||
- "19530:19530" # gRPC API
|
||||
- "9091:9091" # HTTP API
|
||||
depends_on:
|
||||
etcd:
|
||||
condition: service_healthy
|
||||
minio:
|
||||
condition: service_healthy
|
||||
networks: [compliance-net]
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 10
|
||||
start_period: 60s
|
||||
|
||||
# ═══════════════════════════════════════════════
|
||||
# Neo4j 知识图谱数据库
|
||||
# ═══════════════════════════════════════════════
|
||||
|
||||
neo4j:
|
||||
image: neo4j:5.20-community
|
||||
container_name: compliance-neo4j
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
NEO4J_AUTH: neo4j/${NEO4J_PASSWORD:-neo4j123}
|
||||
NEO4J_PLUGINS: '["apoc"]'
|
||||
NEO4J_dbms_security_procedures_unrestricted: apoc.*
|
||||
NEO4J_dbms_memory_heap_initial__size: 512m
|
||||
NEO4J_dbms_memory_heap_max__size: 2G
|
||||
NEO4J_dbms_memory_pagecache_size: 1G
|
||||
volumes:
|
||||
- neo4j_data:/data
|
||||
- neo4j_logs:/logs
|
||||
ports:
|
||||
- "7474:7474" # Browser UI
|
||||
- "7687:7687" # Bolt 协议
|
||||
networks: [compliance-net]
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -q --spider http://localhost:7474 || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 10
|
||||
start_period: 60s
|
||||
|
||||
# ═══════════════════════════════════════════════
|
||||
# AI 模型服务
|
||||
# ═══════════════════════════════════════════════
|
||||
|
||||
embedding-service:
|
||||
build:
|
||||
context: ./services/embedding
|
||||
dockerfile: Dockerfile
|
||||
image: compliance-embedding:latest
|
||||
container_name: compliance-embedding
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
MODEL_NAME: BAAI/bge-m3
|
||||
HF_ENDPOINT: ${HF_ENDPOINT:-https://hf-mirror.com}
|
||||
DEVICE: ${EMBEDDING_DEVICE:-cpu}
|
||||
MAX_BATCH_SIZE: "16"
|
||||
volumes:
|
||||
- ./models:/app/models
|
||||
ports:
|
||||
- "8010:8010"
|
||||
networks: [compliance-net]
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 8G
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8010/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
start_period: 120s # 模型加载需要时间
|
||||
|
||||
mcp-server:
|
||||
build:
|
||||
context: ./services/mcp-server
|
||||
dockerfile: Dockerfile
|
||||
image: compliance-mcp:latest
|
||||
container_name: compliance-mcp
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
DEVICE: ${MCP_DEVICE:-cpu}
|
||||
HF_ENDPOINT: ${HF_ENDPOINT:-https://hf-mirror.com}
|
||||
volumes:
|
||||
- ./models:/app/models
|
||||
- ./data/uploads:/app/uploads
|
||||
- ./data/parsed:/app/parsed
|
||||
ports:
|
||||
- "8011:8011"
|
||||
networks: [compliance-net]
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 8G
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8011/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
start_period: 120s
|
||||
|
||||
# ═══════════════════════════════════════════════
|
||||
# 业务服务层
|
||||
# ═══════════════════════════════════════════════
|
||||
|
||||
compliance-backend:
|
||||
build:
|
||||
context: ./services/compliance-backend
|
||||
dockerfile: Dockerfile
|
||||
image: compliance-backend:latest
|
||||
container_name: compliance-backend
|
||||
restart: unless-stopped
|
||||
env_file: .env
|
||||
environment:
|
||||
DATABASE_URL: postgresql+asyncpg://compliance:${POSTGRES_PASSWORD:-compliance123}@postgres:5432/compliance_db
|
||||
REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/0
|
||||
MILVUS_HOST: milvus
|
||||
MILVUS_PORT: "19530"
|
||||
NEO4J_URI: bolt://neo4j:7687
|
||||
NEO4J_USER: neo4j
|
||||
NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123}
|
||||
EMBEDDING_SERVICE_URL: http://embedding-service:8010
|
||||
MCP_SERVER_URL: http://mcp-server:8011
|
||||
LLM_PROVIDER: ${LLM_PROVIDER:-deepseek}
|
||||
DEEPSEEK_API_KEY: ${DEEPSEEK_API_KEY:-}
|
||||
DEEPSEEK_MODEL: ${DEEPSEEK_MODEL:-deepseek-chat}
|
||||
DASHSCOPE_API_KEY: ${DASHSCOPE_API_KEY:-}
|
||||
QWEN_MODEL: ${QWEN_MODEL:-qwen-plus}
|
||||
LOG_LEVEL: ${LOG_LEVEL:-INFO}
|
||||
APP_ENV: ${APP_ENV:-development}
|
||||
volumes:
|
||||
- ./data:/app/data
|
||||
- ./logs:/app/logs
|
||||
ports:
|
||||
- "8000:8000"
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
milvus:
|
||||
condition: service_healthy
|
||||
embedding-service:
|
||||
condition: service_healthy
|
||||
networks: [compliance-net]
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
start_period: 30s
|
||||
|
||||
celery-worker:
|
||||
build:
|
||||
context: ./services/compliance-backend
|
||||
dockerfile: Dockerfile
|
||||
image: compliance-backend:latest
|
||||
container_name: compliance-worker
|
||||
restart: unless-stopped
|
||||
command: >
|
||||
celery -A app.worker worker
|
||||
--loglevel=info
|
||||
--concurrency=4
|
||||
--queues=default,parse,vectorize,compliance,monitor,push
|
||||
env_file: .env
|
||||
environment:
|
||||
DATABASE_URL: postgresql+asyncpg://compliance:${POSTGRES_PASSWORD:-compliance123}@postgres:5432/compliance_db
|
||||
REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/0
|
||||
MILVUS_HOST: milvus
|
||||
MILVUS_PORT: "19530"
|
||||
NEO4J_URI: bolt://neo4j:7687
|
||||
NEO4J_USER: neo4j
|
||||
NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123}
|
||||
EMBEDDING_SERVICE_URL: http://embedding-service:8010
|
||||
MCP_SERVER_URL: http://mcp-server:8011
|
||||
LLM_PROVIDER: ${LLM_PROVIDER:-deepseek}
|
||||
DEEPSEEK_API_KEY: ${DEEPSEEK_API_KEY:-}
|
||||
DASHSCOPE_API_KEY: ${DASHSCOPE_API_KEY:-}
|
||||
volumes:
|
||||
- ./data:/app/data
|
||||
- ./logs:/app/logs
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
compliance-backend:
|
||||
condition: service_healthy
|
||||
networks: [compliance-net]
|
||||
|
||||
celery-beat:
|
||||
build:
|
||||
context: ./services/compliance-backend
|
||||
dockerfile: Dockerfile
|
||||
image: compliance-backend:latest
|
||||
container_name: compliance-beat
|
||||
restart: unless-stopped
|
||||
command: >
|
||||
celery -A app.worker beat
|
||||
--loglevel=info
|
||||
--scheduler celery.beat.PersistentScheduler
|
||||
env_file: .env
|
||||
environment:
|
||||
DATABASE_URL: postgresql+asyncpg://compliance:${POSTGRES_PASSWORD:-compliance123}@postgres:5432/compliance_db
|
||||
REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/0
|
||||
DEEPSEEK_API_KEY: ${DEEPSEEK_API_KEY:-}
|
||||
volumes:
|
||||
- ./data:/app/data
|
||||
- ./logs:/app/logs
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
networks: [compliance-net]
|
||||
|
||||
# ═══════════════════════════════════════════════
|
||||
# API 网关
|
||||
# ═══════════════════════════════════════════════
|
||||
|
||||
nginx:
|
||||
image: nginx:1.25-alpine
|
||||
container_name: compliance-nginx
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./config/nginx.conf:/etc/nginx/conf.d/default.conf:ro
|
||||
ports:
|
||||
- "80:80"
|
||||
depends_on:
|
||||
compliance-backend:
|
||||
condition: service_healthy
|
||||
networks: [compliance-net]
|
||||
healthcheck:
|
||||
test: ["CMD", "nginx", "-t"]
|
||||
interval: 30s
|
||||
|
||||
# ═══════════════════════════════════════════════
|
||||
# 监控(可选,--profile monitoring 启动)
|
||||
# ═══════════════════════════════════════════════
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:11.0.0
|
||||
container_name: compliance-grafana
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD:-admin}
|
||||
GF_USERS_ALLOW_SIGN_UP: "false"
|
||||
    volumes:
|
||||
      - ./config/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml:ro  # NOTE(review): config/prometheus.yml holds Prometheus global/scrape_configs, not a Grafana datasource definition — presumably a separate datasource provisioning file is intended; confirm
|
||||
ports:
|
||||
- "3000:3000"
|
||||
networks: [compliance-net]
|
||||
profiles: [monitoring]
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.51.0
|
||||
container_name: compliance-prometheus
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
ports:
|
||||
- "9090:9090"
|
||||
networks: [compliance-net]
|
||||
profiles: [monitoring]
|
||||
192
init-sql/01_init_schema.sql
Normal file
192
init-sql/01_init_schema.sql
Normal file
@@ -0,0 +1,192 @@
|
||||
-- AI合规智能中枢 — PostgreSQL 初始化 Schema
|
||||
-- 执行时机:容器首次启动时自动执行
|
||||
|
||||
-- 启用扩展
|
||||
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
||||
CREATE EXTENSION IF NOT EXISTS vector; -- pgvector(pgvector/pgvector:pg16 镜像已内置)
|
||||
CREATE EXTENSION IF NOT EXISTS pg_trgm; -- 全文检索支持
|
||||
|
||||
-- ══════════════════════════════════════════════════
|
||||
-- 工作空间(知识库)
|
||||
-- ══════════════════════════════════════════════════
|
||||
CREATE TABLE IF NOT EXISTS workspaces (
|
||||
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||
name VARCHAR(255) NOT NULL,
|
||||
description TEXT,
|
||||
domain VARCHAR(100), -- vehicle_safety / data_security / ehs / carbon
|
||||
created_by VARCHAR(255),
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
-- ══════════════════════════════════════════════════
|
||||
-- 文件记录
|
||||
-- ══════════════════════════════════════════════════
|
||||
CREATE TABLE IF NOT EXISTS files (
|
||||
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||
workspace_id UUID REFERENCES workspaces(id) ON DELETE CASCADE,
|
||||
filename VARCHAR(500) NOT NULL,
|
||||
original_name VARCHAR(500) NOT NULL,
|
||||
file_type VARCHAR(50), -- pdf / docx / xlsx
|
||||
file_size BIGINT,
|
||||
storage_path TEXT, -- data/uploads/相对路径
|
||||
parsed_path TEXT, -- data/parsed/相对路径
|
||||
status VARCHAR(50) DEFAULT 'uploaded', -- uploaded/parsing/parsed/vectorized/failed
|
||||
error_msg TEXT,
|
||||
metadata JSONB DEFAULT '{}',
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_files_workspace ON files(workspace_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_files_status ON files(status);
|
||||
|
||||
-- ══════════════════════════════════════════════════
|
||||
-- 异步任务记录
|
||||
-- ══════════════════════════════════════════════════
|
||||
CREATE TABLE IF NOT EXISTS tasks (
|
||||
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||
task_type VARCHAR(100) NOT NULL, -- parse / vectorize / compliance_check / regulation_fetch
|
||||
status VARCHAR(50) DEFAULT 'pending', -- pending/running/completed/failed
|
||||
payload JSONB DEFAULT '{}',
|
||||
result JSONB,
|
||||
error_msg TEXT,
|
||||
progress INTEGER DEFAULT 0, -- 0-100
|
||||
file_id UUID REFERENCES files(id),
|
||||
celery_task_id VARCHAR(255),
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
completed_at TIMESTAMPTZ
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_tasks_status ON tasks(status);
|
||||
CREATE INDEX IF NOT EXISTS idx_tasks_type ON tasks(task_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_tasks_file ON tasks(file_id);
|
||||
|
||||
-- ══════════════════════════════════════════════════
|
||||
-- 合规审查报告
|
||||
-- ══════════════════════════════════════════════════
|
||||
CREATE TABLE IF NOT EXISTS compliance_reports (
|
||||
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||
file_id UUID REFERENCES files(id),
|
||||
regulation_domains TEXT[], -- 适用法规域
|
||||
overall_risk_level VARCHAR(20), -- high / medium / low
|
||||
risk_score DECIMAL(5,2), -- 0-100
|
||||
findings JSONB DEFAULT '[]', -- 问题列表
|
||||
recommendations JSONB DEFAULT '[]', -- 整改建议
|
||||
report_markdown TEXT, -- 完整报告(Markdown格式)
|
||||
llm_model VARCHAR(100), -- 生成时使用的模型
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_reports_file ON compliance_reports(file_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_reports_risk ON compliance_reports(overall_risk_level);
|
||||
|
||||
-- ══════════════════════════════════════════════════
|
||||
-- 法规监控源
|
||||
-- ══════════════════════════════════════════════════
|
||||
CREATE TABLE IF NOT EXISTS regulation_sources (
|
||||
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||
name VARCHAR(255) NOT NULL,
|
||||
url TEXT NOT NULL,
|
||||
source_type VARCHAR(50) DEFAULT 'webpage', -- webpage / rss / api
|
||||
domain VARCHAR(100), -- vehicle_safety / ehs 等
|
||||
fetch_interval INTEGER DEFAULT 86400, -- 抓取间隔(秒),默认每天
|
||||
is_active BOOLEAN DEFAULT TRUE,
|
||||
last_fetched_at TIMESTAMPTZ,
|
||||
last_hash VARCHAR(64), -- 内容hash,用于变更检测
|
||||
fetch_config JSONB DEFAULT '{}', -- 抓取配置(CSS选择器等)
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_sources_active ON regulation_sources(is_active);
|
||||
CREATE INDEX IF NOT EXISTS idx_sources_domain ON regulation_sources(domain);
|
||||
|
||||
-- ══════════════════════════════════════════════════
|
||||
-- 法规变更记录
|
||||
-- ══════════════════════════════════════════════════
|
||||
CREATE TABLE IF NOT EXISTS regulation_updates (
|
||||
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||
source_id UUID REFERENCES regulation_sources(id),
|
||||
title VARCHAR(500),
|
||||
url TEXT,
|
||||
change_type VARCHAR(50), -- new / revised / revoked / notice
|
||||
summary TEXT, -- AI生成的变更摘要
|
||||
raw_content TEXT, -- 原始抓取内容
|
||||
diff_content TEXT, -- 与上次内容的差异
|
||||
is_notified BOOLEAN DEFAULT FALSE,
|
||||
importance VARCHAR(20) DEFAULT 'normal', -- high / normal / low
|
||||
fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
published_at TIMESTAMPTZ
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_updates_source ON regulation_updates(source_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_updates_notified ON regulation_updates(is_notified);
|
||||
CREATE INDEX IF NOT EXISTS idx_updates_fetched ON regulation_updates(fetched_at DESC);
|
||||
|
||||
-- ══════════════════════════════════════════════════
|
||||
-- 推送订阅
|
||||
-- ══════════════════════════════════════════════════
|
||||
CREATE TABLE IF NOT EXISTS subscriptions (
|
||||
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||
name VARCHAR(255),
|
||||
channel VARCHAR(50) NOT NULL, -- email / webhook / feishu / dingtalk
|
||||
target TEXT NOT NULL, -- 邮件地址 或 Webhook URL
|
||||
domains TEXT[], -- 订阅的法规域,为空则订阅全部
|
||||
importance_min VARCHAR(20) DEFAULT 'normal',
|
||||
is_active BOOLEAN DEFAULT TRUE,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
-- ══════════════════════════════════════════════════
|
||||
-- 全链路审计日志
|
||||
-- ══════════════════════════════════════════════════
|
||||
CREATE TABLE IF NOT EXISTS audit_logs (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
action VARCHAR(100) NOT NULL, -- upload / query / compliance_check / etc
|
||||
resource VARCHAR(100),
|
||||
resource_id UUID,
|
||||
user_id VARCHAR(255),
|
||||
ip_address INET,
|
||||
request JSONB,
|
||||
response JSONB,
|
||||
duration_ms INTEGER,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_audit_action ON audit_logs(action);
|
||||
CREATE INDEX IF NOT EXISTS idx_audit_created ON audit_logs(created_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_audit_user ON audit_logs(user_id);
|
||||
|
||||
-- ══════════════════════════════════════════════════
|
||||
-- 更新时间自动维护
|
||||
-- ══════════════════════════════════════════════════
|
||||
CREATE OR REPLACE FUNCTION update_updated_at_column()
|
||||
RETURNS TRIGGER AS $$
|
||||
BEGIN
|
||||
NEW.updated_at = NOW();
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Attach update_updated_at_column() to every table that carries an
-- updated_at column, so the timestamp is refreshed on each UPDATE.
--
-- CREATE OR REPLACE TRIGGER (PostgreSQL 14+) keeps this file re-runnable:
-- it is executed on first container start and again by scripts/05_init_db.sh,
-- and a plain CREATE TRIGGER fails with "already exists" on the second run.
CREATE OR REPLACE TRIGGER update_workspaces_updated_at
    BEFORE UPDATE ON workspaces
    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

CREATE OR REPLACE TRIGGER update_files_updated_at
    BEFORE UPDATE ON files
    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

CREATE OR REPLACE TRIGGER update_tasks_updated_at
    BEFORE UPDATE ON tasks
    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
|
||||
|
||||
-- ══════════════════════════════════════════════════
|
||||
-- 初始数据:预置监控源
|
||||
-- ══════════════════════════════════════════════════
|
||||
-- Seed the default monitoring sources.
--
-- regulation_sources has no unique constraint other than its generated UUID
-- primary key, so ON CONFLICT DO NOTHING can never fire and every re-run of
-- this file would insert the same four rows again. Guard on url instead so
-- the statement is idempotent without requiring a schema change.
INSERT INTO regulation_sources (name, url, domain, fetch_interval)
SELECT seed.name, seed.url, seed.domain, seed.fetch_interval
FROM (VALUES
    ('国家标准全文公开系统', 'https://std.samr.gov.cn', 'vehicle_safety', 86400),
    ('工信部政策法规', 'https://www.miit.gov.cn/jgsj/fgs/zcfg/index.html', 'vehicle_safety', 86400),
    ('应急管理部政策法规', 'https://www.mem.gov.cn/gk/zcfg/', 'ehs', 86400),
    ('生态环境部政策法规', 'https://www.mee.gov.cn/ywgz/fgbz/fl/', 'carbon', 86400)
) AS seed(name, url, domain, fetch_interval)
WHERE NOT EXISTS (
    SELECT 1
    FROM regulation_sources existing
    WHERE existing.url = seed.url
);
|
||||
117
scripts/00_install_docker_ubuntu.sh
Normal file
117
scripts/00_install_docker_ubuntu.sh
Normal file
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env bash
|
||||
# ══════════════════════════════════════════════════
|
||||
# 00_install_docker_ubuntu.sh
|
||||
# Ubuntu 22.04 LTS 安装 Docker CE + nvidia-container-toolkit
|
||||
# 用法:bash scripts/00_install_docker_ubuntu.sh
|
||||
# ══════════════════════════════════════════════════
|
||||
set -euo pipefail
|
||||
|
||||
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||
error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
|
||||
|
||||
# ── 检查 root 权限 ──────────────────────────────
|
||||
if [[ $EUID -ne 0 ]]; then
|
||||
error "请以 root 或 sudo 运行:sudo bash scripts/00_install_docker_ubuntu.sh"
|
||||
fi
|
||||
|
||||
# ── 检测 Ubuntu 版本 ────────────────────────────
|
||||
. /etc/os-release
|
||||
info "检测到 OS:$NAME $VERSION_ID"
|
||||
if [[ "$ID" != "ubuntu" ]]; then
|
||||
warn "非 Ubuntu 系统,脚本可能不适用。继续(y/n)?"
|
||||
read -r ans; [[ "$ans" != "y" ]] && exit 0
|
||||
fi
|
||||
|
||||
# ── Step 1:换国内源(可选)──────────────────────
|
||||
info "Step 1/5:配置 APT 源..."
|
||||
if [[ "${USE_MIRROR:-false}" == "true" ]]; then
|
||||
sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list
|
||||
sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list
|
||||
ok "已切换到阿里云镜像"
|
||||
fi
|
||||
apt-get update -qq
|
||||
|
||||
# ── Step 2:安装依赖 ────────────────────────────
|
||||
info "Step 2/5:安装依赖包..."
|
||||
apt-get install -y -qq \
|
||||
ca-certificates \
|
||||
curl \
|
||||
gnupg \
|
||||
lsb-release \
|
||||
apt-transport-https
|
||||
|
||||
# ── Step 3: install Docker CE ───────────────────
# Skipped when a docker binary is already on PATH; otherwise adds Docker's
# APT repository, installs the engine plus the buildx/compose plugins, and
# enables the service at boot.
info "Step 3/5:安装 Docker CE..."
if command -v docker &>/dev/null; then
    DOCKER_VER=$(docker --version)
    warn "Docker 已安装:$DOCKER_VER"
    warn "跳过 Docker 安装。如需重装,请先运行:apt-get remove docker docker-engine docker.io containerd"
else
    # Import Docker's official GPG key.
    # --yes lets gpg overwrite a keyring left behind by an earlier, interrupted
    # run; without it gpg stops on an interactive "overwrite?" prompt and the
    # script aborts under `set -e`.
    install -m 0755 -d /etc/apt/keyrings
    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \
        gpg --dearmor --yes -o /etc/apt/keyrings/docker.gpg
    chmod a+r /etc/apt/keyrings/docker.gpg

    # Register the Docker APT repository for this architecture and release.
    echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | \
        tee /etc/apt/sources.list.d/docker.list > /dev/null

    apt-get update -qq
    apt-get install -y -qq docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

    # Start now and on every boot.
    systemctl enable docker
    systemctl start docker
    ok "Docker CE 安装完成"
fi
|
||||
|
||||
# 验证
|
||||
docker --version
|
||||
docker compose version
|
||||
|
||||
# ── Step 4:将当前用户加入 docker 组 ────────────
|
||||
info "Step 4/5:配置 Docker 用户组..."
|
||||
CURRENT_USER=${SUDO_USER:-$USER}
|
||||
if [[ -n "$CURRENT_USER" && "$CURRENT_USER" != "root" ]]; then
|
||||
usermod -aG docker "$CURRENT_USER"
|
||||
ok "用户 $CURRENT_USER 已加入 docker 组(重新登录后生效)"
|
||||
fi
|
||||
|
||||
# ── Step 5: install nvidia-container-toolkit (optional) ─
# Runs only when nvidia-smi is present, i.e. the NVIDIA driver is installed.
info "Step 5/5:检查 NVIDIA GPU..."
if command -v nvidia-smi &>/dev/null; then
    info "检测到 NVIDIA GPU,安装 nvidia-container-toolkit..."
    nvidia-smi --query-gpu=name --format=csv,noheader

    # Add the NVIDIA repository.
    # --yes: overwrite an existing keyring so re-running the script does not
    # stop on gpg's interactive "overwrite?" prompt.
    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
        gpg --dearmor --yes -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
    # -f: fail on HTTP errors instead of writing an error page into the APT
    # sources list (pipefail then aborts the script at this point).
    curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
        sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
        tee /etc/apt/sources.list.d/nvidia-container-toolkit.list > /dev/null

    apt-get update -qq
    apt-get install -y -qq nvidia-container-toolkit
    # Register the NVIDIA runtime with Docker, then restart to pick it up.
    nvidia-ctk runtime configure --runtime=docker
    systemctl restart docker
    ok "nvidia-container-toolkit 安装完成"
else
    warn "未检测到 NVIDIA GPU,跳过 nvidia-container-toolkit 安装"
    warn "如有 GPU 请手动安装驱动后重新运行本脚本"
fi
|
||||
|
||||
echo ""
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo -e "${GREEN} Docker 安装完成!${NC}"
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo ""
|
||||
echo " Docker 版本:$(docker --version)"
|
||||
echo " Compose 版本:$(docker compose version)"
|
||||
echo ""
|
||||
echo -e "${YELLOW} 注意:${NC}请重新登录以使 docker 组权限生效"
|
||||
echo " 验证命令:docker run hello-world"
|
||||
105
scripts/00_install_docker_windows.ps1
Normal file
105
scripts/00_install_docker_windows.ps1
Normal file
@@ -0,0 +1,105 @@
|
||||
# ══════════════════════════════════════════════════
|
||||
# 00_install_docker_windows.ps1
|
||||
# Windows 11 安装 Docker Desktop + WSL2 配置
|
||||
# 用法:以管理员身份运行 PowerShell,执行:
|
||||
# .\scripts\00_install_docker_windows.ps1
|
||||
# ══════════════════════════════════════════════════
|
||||
#Requires -RunAsAdministrator
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
function Write-Info { Write-Host "[INFO] $args" -ForegroundColor Cyan }
|
||||
function Write-Ok { Write-Host "[OK] $args" -ForegroundColor Green }
|
||||
function Write-Warn { Write-Host "[WARN] $args" -ForegroundColor Yellow }
|
||||
function Write-Err { Write-Host "[ERR] $args" -ForegroundColor Red; exit 1 }
|
||||
|
||||
Write-Info "============================================"
|
||||
Write-Info "AI合规智能中枢 — Windows Docker 环境安装"
|
||||
Write-Info "============================================"
|
||||
|
||||
# ── Step 1:启用 WSL2 ──────────────────────────
|
||||
Write-Info "Step 1/4:检查并启用 WSL2..."
|
||||
$wslFeature = Get-WindowsOptionalFeature -Online -FeatureName Microsoft-Windows-Subsystem-Linux
|
||||
$vmFeature = Get-WindowsOptionalFeature -Online -FeatureName VirtualMachinePlatform
|
||||
|
||||
if ($wslFeature.State -ne "Enabled") {
|
||||
Write-Info "启用 WSL 功能..."
|
||||
Enable-WindowsOptionalFeature -Online -FeatureName Microsoft-Windows-Subsystem-Linux -NoRestart
|
||||
}
|
||||
if ($vmFeature.State -ne "Enabled") {
|
||||
Write-Info "启用虚拟机平台..."
|
||||
Enable-WindowsOptionalFeature -Online -FeatureName VirtualMachinePlatform -NoRestart
|
||||
}
|
||||
|
||||
# 更新 WSL 内核
|
||||
Write-Info "更新 WSL2 内核..."
|
||||
wsl --update
|
||||
wsl --set-default-version 2
|
||||
Write-Ok "WSL2 配置完成"
|
||||
|
||||
# ── Step 2: install the Ubuntu WSL distribution ─
Write-Info "Step 2/4:检查 Ubuntu WSL..."

# wsl.exe writes UTF-16LE, so PowerShell captures each name with embedded NUL
# characters ("U`0b`0u`0...") and a plain -match "Ubuntu" never succeeds.
# Strip the NULs and drop empty lines before testing.
$wslList = @(wsl --list --quiet 2>$null) |
    ForEach-Object { ($_ -replace "`0", "").Trim() } |
    Where-Object { $_ }

# -notmatch applied to an array returns the non-matching elements, which is
# truthy whenever any other distro (e.g. docker-desktop) exists. Reduce the
# list to an explicit boolean instead.
$hasUbuntu = [bool]($wslList | Where-Object { $_ -match "Ubuntu" })

if (-not $hasUbuntu) {
    Write-Info "安装 Ubuntu 22.04..."
    wsl --install -d Ubuntu-22.04
    Write-Ok "Ubuntu 22.04 安装完成(首次运行需要设置用户名和密码)"
} else {
    Write-Ok "Ubuntu WSL 已安装"
    wsl --list --verbose
}
|
||||
|
||||
# ── Step 3:安装 Docker Desktop ────────────────
|
||||
Write-Info "Step 3/4:检查 Docker Desktop..."
|
||||
$dockerCmd = Get-Command docker -ErrorAction SilentlyContinue
|
||||
if ($dockerCmd) {
|
||||
Write-Ok "Docker 已安装:$(docker --version)"
|
||||
} else {
|
||||
# 尝试用 winget 安装
|
||||
$winget = Get-Command winget -ErrorAction SilentlyContinue
|
||||
if ($winget) {
|
||||
Write-Info "通过 winget 安装 Docker Desktop..."
|
||||
winget install -e --id Docker.DockerDesktop --accept-package-agreements --accept-source-agreements
|
||||
Write-Ok "Docker Desktop 安装完成"
|
||||
} else {
|
||||
Write-Warn "未找到 winget,请手动安装 Docker Desktop:"
|
||||
Write-Warn "下载地址:https://www.docker.com/products/docker-desktop/"
|
||||
Write-Warn "安装时勾选:Use WSL 2 instead of Hyper-V"
|
||||
Start-Process "https://www.docker.com/products/docker-desktop/"
|
||||
Read-Host "安装完成后按 Enter 继续"
|
||||
}
|
||||
}
|
||||
|
||||
# ── Step 4:配置 Docker Desktop WSL 集成 ───────
|
||||
Write-Info "Step 4/4:提示 Docker Desktop 配置..."
|
||||
Write-Warn ""
|
||||
Write-Warn "请确认 Docker Desktop 已进行以下配置:"
|
||||
Write-Warn " 1. Settings → General → 勾选 'Use WSL 2 based engine'"
|
||||
Write-Warn " 2. Settings → Resources → WSL Integration → 开启 Ubuntu-22.04"
|
||||
Write-Warn " 3. 如有 NVIDIA GPU:"
|
||||
Write-Warn " Settings → General → 勾选 'Use GPU with WSL 2'"
|
||||
Write-Warn ""
|
||||
|
||||
# ── 验证 ───────────────────────────────────────
|
||||
Write-Info "验证安装..."
|
||||
try {
|
||||
$dockerVer = docker --version
|
||||
$composeVer = docker compose version
|
||||
Write-Ok "Docker: $dockerVer"
|
||||
Write-Ok "Compose: $composeVer"
|
||||
} catch {
|
||||
Write-Warn "Docker 命令不可用,可能需要重启后再验证"
|
||||
Write-Warn "重启后运行:docker run hello-world"
|
||||
}
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "============================================" -ForegroundColor Green
|
||||
Write-Host " 安装完成!" -ForegroundColor Green
|
||||
Write-Host "============================================" -ForegroundColor Green
|
||||
Write-Host ""
|
||||
Write-Host "后续步骤(在 WSL2 Ubuntu 中执行):" -ForegroundColor Yellow
|
||||
Write-Host " 1. 打开 Ubuntu WSL 终端"
|
||||
Write-Host " 2. cd /mnt/c/Projects/AIProjects/AIRegulations/Depolyment"
|
||||
Write-Host " 3. bash scripts/01_setup_project.sh"
|
||||
Write-Host ""
|
||||
Write-Host "如需重启系统请现在重启,然后继续操作。" -ForegroundColor Yellow
|
||||
73
scripts/01_setup_project.sh
Normal file
73
scripts/01_setup_project.sh
Normal file
@@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env bash
|
||||
# ══════════════════════════════════════════════════
|
||||
# 01_setup_project.sh
|
||||
# 初始化项目:创建目录、生成 .env 文件
|
||||
# 用法:bash scripts/01_setup_project.sh
|
||||
# ══════════════════════════════════════════════════
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
|
||||
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||
|
||||
cd "$PROJECT_DIR"
|
||||
info "项目目录:$PROJECT_DIR"
|
||||
|
||||
# ── 创建运行时目录 ──────────────────────────────
|
||||
info "创建运行时目录..."
|
||||
mkdir -p data/uploads data/parsed logs models
|
||||
mkdir -p services/embedding services/mcp-server
|
||||
mkdir -p services/compliance-backend/app/{core,api,services,models}
|
||||
ok "目录结构创建完成"
|
||||
|
||||
# ── 复制 .env 文件 ──────────────────────────────
|
||||
if [[ ! -f ".env" ]]; then
|
||||
cp .env.example .env
|
||||
warn "已创建 .env 文件,请编辑并填写必要配置:"
|
||||
warn " 必填:DEEPSEEK_API_KEY(或 DASHSCOPE_API_KEY)"
|
||||
warn " 可选:修改各组件密码"
|
||||
echo ""
|
||||
echo -e "${YELLOW}是否现在编辑 .env 文件?(y/n)${NC}"
|
||||
read -r ans
|
||||
if [[ "$ans" == "y" ]]; then
|
||||
${EDITOR:-nano} .env
|
||||
fi
|
||||
else
|
||||
ok ".env 文件已存在,跳过复制"
|
||||
fi
|
||||
|
||||
# ── Validate key fields in .env ─────────────────
info "验证 .env 配置..."
# Best effort: a missing or malformed .env must not abort the setup.
source .env 2>/dev/null || true

# Return 0 when $1 looks like a real API key.
# Rejects empty values and the untouched "sk-xxxx..." placeholder that
# .env.example ships with; a freshly copied .env would otherwise be reported
# as configured. A trailing CR is stripped so a .env saved with Windows line
# endings is judged by its actual value.
is_real_api_key() {
    local key=${1:-}
    key=${key%$'\r'}
    [[ -n "$key" && ! "$key" =~ ^sk-x+$ ]]
}

if ! is_real_api_key "${DEEPSEEK_API_KEY:-}" && ! is_real_api_key "${DASHSCOPE_API_KEY:-}"; then
    warn "⚠️ 未设置 LLM API Key!"
    warn " 请在 .env 中设置 DEEPSEEK_API_KEY 或 DASHSCOPE_API_KEY"
    warn " DeepSeek 申请:https://platform.deepseek.com"
else
    ok "LLM API Key 已配置"
fi
|
||||
|
||||
# ── 验证 Docker ─────────────────────────────────
|
||||
info "检查 Docker 环境..."
|
||||
if ! command -v docker &>/dev/null; then
|
||||
warn "Docker 未安装,请先运行:bash scripts/00_install_docker_ubuntu.sh"
|
||||
exit 1
|
||||
fi
|
||||
docker compose version > /dev/null
|
||||
ok "Docker Compose 可用:$(docker compose version)"
|
||||
|
||||
# ── 显示下一步 ──────────────────────────────────
|
||||
echo ""
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo -e "${GREEN} 项目初始化完成!${NC}"
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo ""
|
||||
echo "下一步操作:"
|
||||
echo " 1. 拉取镜像(可选,较慢):bash scripts/02_pull_images.sh"
|
||||
echo " 2. 启动全部服务: bash scripts/06_start_all.sh"
|
||||
echo " 3. 检查健康状态: bash scripts/check_health.sh"
|
||||
46
scripts/02_pull_images.sh
Normal file
46
scripts/02_pull_images.sh
Normal file
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env bash
|
||||
# ══════════════════════════════════════════════════
|
||||
# 02_pull_images.sh
|
||||
# 预拉取所有 Docker 镜像(离线/弱网环境准备)
|
||||
# 用法:bash scripts/02_pull_images.sh
|
||||
# ══════════════════════════════════════════════════
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
cd "$PROJECT_DIR"
|
||||
|
||||
BLUE='\033[0;34m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m'
|
||||
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||
|
||||
# 所有基础镜像列表
|
||||
IMAGES=(
|
||||
"pgvector/pgvector:pg16"
|
||||
"redis:7-alpine"
|
||||
"quay.io/coreos/etcd:v3.5.5"
|
||||
"minio/minio:RELEASE.2023-03-13T19-46-17Z"
|
||||
"milvusdb/milvus:v2.4.13"
|
||||
"neo4j:5.20-community"
|
||||
"nginx:1.25-alpine"
|
||||
"grafana/grafana:11.0.0"
|
||||
"prom/prometheus:v2.51.0"
|
||||
)
|
||||
|
||||
info "开始拉取 ${#IMAGES[@]} 个基础镜像..."
|
||||
echo ""
|
||||
|
||||
for img in "${IMAGES[@]}"; do
|
||||
info "拉取:$img"
|
||||
docker pull "$img"
|
||||
ok "完成:$img"
|
||||
echo ""
|
||||
done
|
||||
|
||||
info "所有基础镜像拉取完成"
|
||||
echo ""
|
||||
info "自定义服务镜像(embedding/mcp/backend)将在 build 时自动拉取基础层"
|
||||
echo ""
|
||||
echo -e "${YELLOW}提示:如在国内网络环境下 quay.io 或 milvusdb 拉取慢,${NC}"
|
||||
echo -e "${YELLOW}可配置 Docker 镜像加速器:/etc/docker/daemon.json${NC}"
|
||||
echo ' {"registry-mirrors": ["https://docker.mirrors.ustc.edu.cn"]}'
|
||||
93
scripts/03_start_infra.sh
Normal file
93
scripts/03_start_infra.sh
Normal file
@@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env bash
|
||||
# ══════════════════════════════════════════════════
|
||||
# 03_start_infra.sh
|
||||
# 分步启动基础设施(含健康等待),顺序:
|
||||
# PostgreSQL + Redis → etcd + MinIO → Milvus → Neo4j
|
||||
# 用法:bash scripts/03_start_infra.sh
|
||||
# ══════════════════════════════════════════════════
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
cd "$PROJECT_DIR"
|
||||
|
||||
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||
error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
|
||||
|
||||
# Block until a Compose service's healthcheck reports "healthy".
#
# Arguments:
#   $1  Compose service name
#   $2  Maximum time to wait in seconds (optional, default 120)
# Returns:
#   0 as soon as the service is healthy. On timeout it calls error(), which
#   prints the message and exits the script with status 1.
#
# The health state is read with `docker inspect` on the service's container
# rather than by parsing `docker compose ps --format json`: that output is a
# JSON array on older Compose releases and newline-delimited objects on newer
# ones, so a single json.load(...).get(...) breaks on one of the two shapes
# and the function would poll "unknown" until it timed out.
wait_healthy() {
    local service=$1
    local max_wait=${2:-120}
    local interval=5
    local elapsed=0
    local container_id status

    info "等待 $service 健康就绪..."
    while [[ $elapsed -lt $max_wait ]]; do
        # The container may not exist yet right after `up -d`; treat every
        # lookup failure as "unknown" and keep polling.
        container_id=$(docker compose ps -q "$service" 2>/dev/null | head -n 1 || true)
        status="unknown"
        if [[ -n "$container_id" ]]; then
            # Containers without a healthcheck have no .State.Health; the
            # template guard reports "none" for them instead of failing.
            status=$(docker inspect \
                --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' \
                "$container_id" 2>/dev/null || echo "unknown")
        fi

        if [[ "$status" == "healthy" ]]; then
            ok "$service 已就绪"
            return 0
        fi

        echo -n "."
        sleep "$interval"
        elapsed=$((elapsed + interval))
    done
    echo ""
    error "$service 等待超时(${max_wait}s),请检查:docker compose logs $service"
}
|
||||
|
||||
info "══════════════════════════════════════════"
|
||||
info " 启动基础设施层"
|
||||
info "══════════════════════════════════════════"
|
||||
|
||||
# ── Step 1:PostgreSQL + Redis ──────────────────
|
||||
info "Step 1/4:启动 PostgreSQL 和 Redis..."
|
||||
docker compose up -d postgres redis
|
||||
|
||||
wait_healthy postgres 90
|
||||
wait_healthy redis 30
|
||||
ok "数据层就绪"
|
||||
|
||||
# ── Step 2:etcd + MinIO(Milvus 依赖)─────────
|
||||
info "Step 2/4:启动 etcd 和 MinIO(Milvus 依赖)..."
|
||||
docker compose up -d etcd minio
|
||||
|
||||
wait_healthy etcd 60
|
||||
wait_healthy minio 60
|
||||
ok "对象存储层就绪"
|
||||
|
||||
# ── Step 3:Milvus ──────────────────────────────
|
||||
info "Step 3/4:启动 Milvus(向量数据库)..."
|
||||
docker compose up -d milvus
|
||||
|
||||
info "Milvus 初始化需要约 60 秒,请耐心等待..."
|
||||
wait_healthy milvus 180
|
||||
ok "Milvus 就绪"
|
||||
|
||||
# ── Step 4:Neo4j ───────────────────────────────
|
||||
info "Step 4/4:启动 Neo4j(知识图谱)..."
|
||||
docker compose up -d neo4j
|
||||
|
||||
wait_healthy neo4j 120
|
||||
ok "Neo4j 就绪"
|
||||
|
||||
# ── 汇总 ────────────────────────────────────────
|
||||
echo ""
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo -e "${GREEN} 基础设施启动完成!${NC}"
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo ""
|
||||
echo " PostgreSQL : localhost:5432"
|
||||
echo " Redis : localhost:6379"
|
||||
echo " Milvus : localhost:19530 (gRPC), localhost:9091 (HTTP)"
|
||||
echo " Neo4j : localhost:7474 (Browser), localhost:7687 (Bolt)"
|
||||
echo " MinIO 控制台: localhost:9001 (admin/minioadmin)"
|
||||
echo ""
|
||||
echo "下一步:bash scripts/04_build_services.sh"
|
||||
59
scripts/04_build_services.sh
Normal file
59
scripts/04_build_services.sh
Normal file
@@ -0,0 +1,59 @@
|
||||
#!/usr/bin/env bash
|
||||
# ══════════════════════════════════════════════════
|
||||
# 04_build_services.sh
|
||||
# 构建自定义服务 Docker 镜像
|
||||
# embedding-service / mcp-server / compliance-backend
|
||||
# 用法:bash scripts/04_build_services.sh
|
||||
# ══════════════════════════════════════════════════
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
cd "$PROJECT_DIR"
|
||||
|
||||
BLUE='\033[0;34m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m'
|
||||
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||
|
||||
info "══════════════════════════════════════════"
|
||||
info " 构建自定义服务镜像"
|
||||
info "══════════════════════════════════════════"
|
||||
warn "首次构建较慢(需下载 Python 依赖 + AI 模型)"
|
||||
warn "BGE-M3 模型约 2.5GB,MinerU 模型约 2GB"
|
||||
echo ""
|
||||
|
||||
# Build one Compose service image and report how long the build took.
#
# Arguments:
#   $1  Compose service name passed to `docker compose build`
#   $2  Label shown in the "building" message (optional, defaults to $1)
build_service() {
    local service=$1
    local label=${2:-$1}
    local started finished

    info "构建 ${label}..."
    started=$(date +%s)
    docker compose build "$service"
    finished=$(date +%s)
    ok "${service} 构建完成($(( finished - started ))s)"
    echo ""
}

# ── Embedding service ───────────────────────────
build_service embedding-service "embedding-service(BGE-M3)"

# ── MinerU parsing service ──────────────────────
build_service mcp-server "mcp-server(MinerU)"

# ── Business backend ────────────────────────────
build_service compliance-backend
|
||||
|
||||
# ── 列出构建的镜像 ──────────────────────────────
|
||||
info "已构建的镜像:"
|
||||
docker images | grep -E "compliance-(embedding|mcp|backend)" || true
|
||||
|
||||
echo ""
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo -e "${GREEN} 所有服务镜像构建完成!${NC}"
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo ""
|
||||
echo "下一步:bash scripts/05_init_db.sh"
|
||||
124
scripts/05_init_db.sh
Normal file
124
scripts/05_init_db.sh
Normal file
@@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env bash
|
||||
# ══════════════════════════════════════════════════
|
||||
# 05_init_db.sh
|
||||
# 初始化数据库:PostgreSQL Schema + Milvus Collections + Neo4j Constraints
|
||||
# 用法:bash scripts/05_init_db.sh
|
||||
# 前提:postgres / milvus / neo4j 已运行且健康
|
||||
# ══════════════════════════════════════════════════
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
cd "$PROJECT_DIR"
|
||||
|
||||
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||
error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
|
||||
|
||||
source .env 2>/dev/null || true
|
||||
POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-compliance123}
|
||||
NEO4J_PASSWORD=${NEO4J_PASSWORD:-neo4j123}
|
||||
|
||||
# Return 0 only when `docker compose ps` lists service $1 as healthy.
#
# The output is captured first and matched as a whole word:
#   * a plain `grep "healthy"` also matches "(unhealthy)", so a failing
#     service would be treated as ready;
#   * piping straight into `grep -q` can make the producer die of SIGPIPE,
#     which `set -o pipefail` then reports as a failed check.
service_is_healthy() {
    local ps_output
    ps_output=$(docker compose ps "$1" 2>/dev/null || true)
    grep -qw "healthy" <<<"$ps_output"
}

# ── Step 1: PostgreSQL schema ───────────────────
info "Step 1/3:初始化 PostgreSQL Schema..."
if service_is_healthy postgres; then
    # Re-run the init script that is mounted into the container. Only the
    # last lines of psql output are shown; a failing pipeline is downgraded
    # to a warning so that re-runs against an existing schema do not abort.
    docker compose exec -T postgres psql \
        -U compliance -d compliance_db \
        -f /docker-entrypoint-initdb.d/01_init_schema.sql \
        2>&1 | tail -5 || warn "SQL 可能部分已存在(IF NOT EXISTS),这是正常的"
    ok "PostgreSQL Schema 初始化完成"
else
    error "PostgreSQL 未运行,请先执行:bash scripts/03_start_infra.sh"
fi

# ── Step 2: Milvus collections ──────────────────
info "Step 2/3:初始化 Milvus Collections..."
if service_is_healthy milvus; then
    # The inline Python runs in a throwaway backend container, which provides
    # pymilvus and can reach the milvus host on the Compose network.
    docker compose run --rm --no-deps compliance-backend \
        python3 -c "
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility

connections.connect(host='milvus', port='19530')
print('Milvus 连接成功')

def create_collection(name, description):
    # Idempotent: an existing collection is left untouched.
    if utility.has_collection(name):
        print(f' Collection {name} 已存在,跳过')
        return

    fields = [
        FieldSchema(name='id', dtype=DataType.VARCHAR, is_primary=True, max_length=128),
        FieldSchema(name='file_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='workspace_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='chunk_idx', dtype=DataType.INT64),
        FieldSchema(name='content', dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name='dense_vec', dtype=DataType.FLOAT_VECTOR, dim=1024),  # BGE-M3 dense
        FieldSchema(name='metadata', dtype=DataType.JSON),
    ]
    schema = CollectionSchema(fields, description=description)
    col = Collection(name, schema)

    # Vector index: HNSW with cosine similarity (chosen for the evaluation phase).
    index_params = {
        'metric_type': 'COSINE',
        'index_type': 'HNSW',
        'params': {'M': 16, 'efConstruction': 200}
    }
    col.create_index('dense_vec', index_params)
    col.load()
    print(f' Collection {name} 创建完成')

create_collection('regulation_chunks', '法规条款向量库')
create_collection('doc_chunks', '企业文档向量库')
create_collection('case_library', '行业案例库')

print('Milvus 初始化完成')
" 2>&1
    ok "Milvus Collections 初始化完成"
else
    error "Milvus 未运行,请先执行:bash scripts/03_start_infra.sh"
fi
|
||||
|
||||
# ── Step 3: Neo4j constraints and indexes ────────────────────
info "Step 3/3:初始化 Neo4j 约束和索引..."

# Neo4j may still be warming up. Poll with a trivial query (up to 12 × 5 s)
# instead of sleeping a fixed 5 s and hoping the server is ready.
neo4j_ready=false
for _ in {1..12}; do
  if docker compose exec -T neo4j cypher-shell \
      -u neo4j -p "$NEO4J_PASSWORD" \
      --format plain "RETURN 1;" < /dev/null > /dev/null 2>&1; then
    neo4j_ready=true
    break
  fi
  sleep 5
done
if [[ "$neo4j_ready" != true ]]; then
  error "Neo4j 在 60 秒内未就绪,请检查:docker compose logs neo4j"
fi

docker compose exec -T neo4j cypher-shell \
  -u neo4j -p "$NEO4J_PASSWORD" \
  --format plain <<'CYPHER'
// Node constraints (uniqueness)
CREATE CONSTRAINT regulation_id IF NOT EXISTS
FOR (r:Regulation) REQUIRE r.id IS UNIQUE;
CREATE CONSTRAINT clause_id IF NOT EXISTS
FOR (c:Clause) REQUIRE c.id IS UNIQUE;
CREATE CONSTRAINT obligation_id IF NOT EXISTS
FOR (o:Obligation) REQUIRE o.id IS UNIQUE;

// Full-text indexes (fuzzy lookup)
CREATE FULLTEXT INDEX regulation_fulltext IF NOT EXISTS
FOR (r:Regulation) ON EACH [r.title, r.code, r.domain];
CREATE FULLTEXT INDEX clause_fulltext IF NOT EXISTS
FOR (c:Clause) ON EACH [c.content, c.title];

// Seed domain nodes (also verifies connectivity).
// MERGE matches on name only, so re-running after a label edit does not
// create a second node for the same domain.
MERGE (d:Domain {name: 'vehicle_safety'}) ON CREATE SET d.label = '车辆安全法规';
MERGE (d:Domain {name: 'data_security'}) ON CREATE SET d.label = '数据安全法规';
MERGE (d:Domain {name: 'ehs'}) ON CREATE SET d.label = 'EHS安全法规';
MERGE (d:Domain {name: 'carbon'}) ON CREATE SET d.label = '碳排放法规';
RETURN '初始化完成' AS result;
CYPHER
ok "Neo4j 约束和索引初始化完成"
|
||||
|
||||
echo ""
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo -e "${GREEN} 数据库初始化完成!${NC}"
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo ""
|
||||
echo " PostgreSQL: 所有表已创建"
|
||||
echo " Milvus: regulation_chunks / doc_chunks / case_library"
|
||||
echo " Neo4j: 约束 + 全文索引 + 基础域节点"
|
||||
echo ""
|
||||
echo "下一步:bash scripts/06_start_all.sh"
|
||||
98
scripts/06_start_all.sh
Normal file
98
scripts/06_start_all.sh
Normal file
@@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env bash
# ══════════════════════════════════════════════════
# 06_start_all.sh
# Start every service in one go (full pipeline):
#   infrastructure → image build → database init → AI and business services.
# Usage: bash scripts/06_start_all.sh
# Exit status is non-zero when a step fails or a service never reports healthy.
# ══════════════════════════════════════════════════
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_DIR"

RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }

# Poll a health URL until it answers with a 2xx status.
#   $1 service name (for messages)   $2 URL
#   $3 maximum number of attempts    $4 seconds to sleep between attempts
# Returns 0 once the service answers, 1 when every attempt failed.
wait_for_http() {
  local name=$1 url=$2 attempts=$3 interval=$4
  local attempt
  for ((attempt = 1; attempt <= attempts; attempt++)); do
    if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
      ok "$name 就绪"
      return 0
    fi
    echo -n "."
    sleep "$interval"
  done
  echo ""
  return 1
}

# Space-separated names of services that never became healthy.
NOT_READY=""

echo ""
echo -e "${BLUE}╔══════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ AI合规智能中枢 — 全服务启动 ║${NC}"
echo -e "${BLUE}╚══════════════════════════════════════════╝${NC}"
echo ""

# ── Pre-flight checks ────────────────────────────────────
if [[ ! -f ".env" ]]; then
  error ".env 文件不存在,请先运行:bash scripts/01_setup_project.sh"
fi

# Loaded only to inspect the API keys below; a parse error must not abort startup.
source .env 2>/dev/null || true
if [[ -z "${DEEPSEEK_API_KEY:-}" && -z "${DASHSCOPE_API_KEY:-}" ]]; then
  warn "⚠️ 未设置 LLM API Key,LLM 功能将不可用"
  warn "请在 .env 中设置 DEEPSEEK_API_KEY 或 DASHSCOPE_API_KEY"
  echo ""
fi

# ── Phase 1: infrastructure ────────────────────────────
info "Phase 1/4:启动基础设施..."
bash "$SCRIPT_DIR/03_start_infra.sh"
echo ""

# ── Phase 2: build service images ────────────────────────
info "Phase 2/4:构建服务镜像(首次较慢)..."
# Only grep is allowed to "fail" (no matching lines). With pipefail the
# pipeline status is then the build's own status, so a broken build is
# reported instead of being masked by a trailing "|| true".
if ! docker compose build embedding-service mcp-server compliance-backend 2>&1 | \
    { grep -E "(Step|Successfully|=>|ERROR)" || true; }; then
  error "镜像构建失败,请检查上方构建日志"
fi
ok "镜像构建完成"
echo ""

# ── Phase 3: initialise databases ────────────────────────
info "Phase 3/4:初始化数据库..."
bash "$SCRIPT_DIR/05_init_db.sh"
echo ""

# ── Phase 4: start all services ────────────────────────
info "Phase 4/4:启动 AI 模型服务和业务服务..."
docker compose up -d embedding-service mcp-server
info "等待 AI 模型加载(BGE-M3/MinerU 约需 2-3 分钟)..."
sleep 30

# Wait for the embedding service (20 attempts × 10 s).
if ! wait_for_http "embedding-service" "http://localhost:8010/health" 20 10; then
  warn "embedding-service 在限定时间内未就绪"
  NOT_READY="$NOT_READY embedding-service"
fi

docker compose up -d compliance-backend celery-worker celery-beat nginx
info "等待业务服务启动..."
sleep 15

# Wait for the backend (12 attempts × 5 s).
if ! wait_for_http "compliance-backend" "http://localhost:8000/health" 12 5; then
  warn "compliance-backend 在限定时间内未就绪"
  NOT_READY="$NOT_READY compliance-backend"
fi

# ── Final status ────────────────────────────────────
echo ""
if [[ -z "$NOT_READY" ]]; then
  echo -e "${GREEN}╔══════════════════════════════════════════╗${NC}"
  echo -e "${GREEN}║ 所有服务启动完成! ║${NC}"
  echo -e "${GREEN}╚══════════════════════════════════════════╝${NC}"
else
  echo -e "${YELLOW}以下服务未就绪:${NOT_READY}${NC}"
  echo -e "${YELLOW}请查看日志:docker compose logs -f <service>${NC}"
fi
echo ""
docker compose ps --format "table {{.Service}}\t{{.Status}}\t{{.Ports}}"
echo ""
echo -e "${BLUE}访问地址:${NC}"
echo " API 网关 : http://localhost"
echo " API 文档 : http://localhost/docs"
echo " Neo4j 浏览器 : http://localhost:7474"
echo " MinIO 控制台 : http://localhost:9001"
echo ""
echo -e "${YELLOW}运行冒烟测试:${NC}"
echo " bash scripts/07_smoke_test.sh"

# Let callers (CI, wrapper scripts) see that startup was incomplete.
[[ -z "$NOT_READY" ]] || exit 1
|
||||
183
scripts/07_smoke_test.sh
Normal file
183
scripts/07_smoke_test.sh
Normal file
@@ -0,0 +1,183 @@
|
||||
#!/usr/bin/env bash
|
||||
# ══════════════════════════════════════════════════
|
||||
# 07_smoke_test.sh
|
||||
# 端到端冒烟测试:验证三条业务闭环
|
||||
# 用法:bash scripts/07_smoke_test.sh
|
||||
# ══════════════════════════════════════════════════
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
cd "$PROJECT_DIR"
|
||||
|
||||
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||
# Number of hard failures so far; incremented by fail().
FAILED=0

# Informational message.
info() {
  echo -e "${BLUE}[INFO]${NC} $*"
}

# A check passed.
ok() {
  echo -e "${GREEN}[✓]${NC} $*"
}

# A check failed: print it and count it.
fail() {
  echo -e "${RED}[✗]${NC} $*"
  FAILED=$((FAILED+1))
}

# Soft failure (e.g. feature not implemented yet); not counted.
warn() {
  echo -e "${YELLOW}[~]${NC} $*"
}
|
||||
API_BASE="http://localhost"
|
||||
|
||||
echo ""
|
||||
echo -e "${BLUE}══════════════════════════════════════════${NC}"
|
||||
echo -e "${BLUE} AI合规智能中枢 端到端冒烟测试${NC}"
|
||||
echo -e "${BLUE}══════════════════════════════════════════${NC}"
|
||||
echo ""
|
||||
|
||||
# ── 基础健康检查 ────────────────────────────────
|
||||
info "=== 基础设施健康检查 ==="
|
||||
|
||||
# Probe a health endpoint and record the outcome.
#   $1 display name   $2 URL
# Calls ok on a 2xx answer, otherwise fail (which increments FAILED).
check_service() {
  local name=$1
  local url=$2
  # --max-time keeps one hung service from stalling the whole smoke test.
  if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
    ok "$name"
  else
    fail "$name($url 不可达)"
  fi
}
|
||||
|
||||
check_service "API 网关 (Nginx)" "http://localhost/health"
|
||||
check_service "业务后端 (FastAPI)" "http://localhost:8000/health"
|
||||
check_service "嵌入服务 (BGE-M3)" "http://localhost:8010/health"
|
||||
check_service "解析服务 (MinerU)" "http://localhost:8011/health"
|
||||
check_service "Milvus HTTP" "http://localhost:9091/healthz"
|
||||
check_service "Neo4j Browser" "http://localhost:7474"
|
||||
echo ""
|
||||
|
||||
# ── Embedding service test ────────────────────────────────
info "=== 嵌入服务测试 ==="
EMBED_RESP=$(curl -sf -X POST http://localhost:8010/embed \
  -H "Content-Type: application/json" \
  -d '{"texts": ["GB 18384 电动汽车碰撞安全要求"], "batch_size": 1}' 2>/dev/null || echo "{}")

# The first entry of "dense" must be a 1024-dimensional vector.
EMBED_CHECK="import sys,json; d=json.load(sys.stdin); assert len(d.get('dense',[])[0])==1024"
if ! echo "$EMBED_RESP" | python3 -c "$EMBED_CHECK" 2>/dev/null; then
  fail "BGE-M3 嵌入失败,响应:${EMBED_RESP:0:200}"
else
  ok "BGE-M3 嵌入:返回 1024 维向量"
fi
echo ""
|
||||
|
||||
# ── Create the test document ────────────────────────────────
# Despite the variable name this is a plain-text file; the name is kept
# because the upload step further down refers to it.
info "=== 创建测试文档 ==="
TEST_PDF="$PROJECT_DIR/data/uploads/test_regulation.txt"
# The uploads directory may not exist on a fresh checkout; without it the
# redirection below fails and set -e aborts the whole smoke test.
mkdir -p "$(dirname "$TEST_PDF")"
cat > "$TEST_PDF" << 'EOF'
GB 18384-2020 电动汽车安全要求

第一章 总则
本标准规定了电动汽车的安全要求,适用于M1类纯电动汽车。

第二章 电气安全
2.1 绝缘电阻要求
直流电路绝缘电阻不得低于100Ω/V。
2.2 碰撞安全
车辆碰撞后,高压电系统应自动断电。
碰撞后5秒内,高压系统电压应降至60V以下。

第三章 防水要求
高压系统防护等级应达到IP67。
EOF
ok "测试文档创建:$TEST_PDF"
echo ""
|
||||
|
||||
# ── Loop ①: file upload → vectorisation → Q&A ───────────
info "=== 闭环①:法规入库 → 检索问答 ==="

# Create a workspace
WORKSPACE_RESP=$(curl -sf -X POST "$API_BASE/api/kb/workspaces" \
  -H "Content-Type: application/json" \
  -d '{"name": "测试法规库", "domain": "vehicle_safety"}' 2>/dev/null || echo "{}")
WS_ID=$(echo "$WORKSPACE_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")

TASK_ID=""
if [[ -n "$WS_ID" ]]; then
  ok "工作空间创建:$WS_ID"

  # Upload the file
  UPLOAD_RESP=$(curl -sf -X POST "$API_BASE/api/kb/files/upload" \
    -F "file=@$TEST_PDF" \
    -F "workspace_id=$WS_ID" 2>/dev/null || echo "{}")
  TASK_ID=$(echo "$UPLOAD_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('task_id',''))" 2>/dev/null || echo "")
  if [[ -z "$TASK_ID" ]]; then
    warn "文件上传失败(接口可能未实现)"
  fi
else
  # Really skip the upload, as the message says: the backend parses
  # workspace_id as a UUID, so a made-up placeholder id can only fail.
  warn "工作空间创建失败(可能接口未完全实现),跳过后续上传测试"
fi

if [[ -n "$TASK_ID" ]]; then
  ok "文件上传任务已创建:$TASK_ID"

  # Poll the task status (at most 24 × 5 s = 120 s)
  info "等待向量化完成..."
  TASK_STATUS="unknown"
  for i in {1..24}; do
    TASK_STATUS=$(curl -sf "$API_BASE/api/kb/tasks/$TASK_ID" 2>/dev/null | \
      python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" 2>/dev/null || echo "unknown")
    if [[ "$TASK_STATUS" == "completed" ]]; then
      ok "向量化完成(${i}×5s)"
      break
    elif [[ "$TASK_STATUS" == "failed" ]]; then
      fail "向量化失败"
      break
    fi
    echo -n "."
    sleep 5
  done
  echo ""
  # Say so when the loop simply ran out of attempts.
  if [[ "$TASK_STATUS" != "completed" && "$TASK_STATUS" != "failed" ]]; then
    warn "向量化在 120 秒内未完成(当前状态:$TASK_STATUS)"
  fi

  # Retrieval Q&A
  QA_RESP=$(curl -sf -X POST "$API_BASE/api/kb/qa" \
    -H "Content-Type: application/json" \
    -d "{\"query\": \"碰撞后高压系统电压要求\", \"workspace_id\": \"$WS_ID\", \"top_k\": 3}" 2>/dev/null || echo "{}")
  ANSWER=$(echo "$QA_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('answer','')[:100])" 2>/dev/null || echo "")

  if [[ -n "$ANSWER" ]]; then
    ok "问答成功:${ANSWER}..."
  else
    warn "问答返回空(LLM API 可能未配置或响应缓慢)"
  fi
fi
echo ""
|
||||
|
||||
# ── Loop ②: compliance check ────────────────────────────
info "=== 闭环②:文档上传 → 合规审查 ==="

# The request model's field is "regulation_domains"; a "domains" key would
# be ignored and the server-side default used instead.
CHECK_RESP=$(curl -sf -X POST "$API_BASE/api/compliance/check" \
  -H "Content-Type: application/json" \
  -d '{"query": "供应商文件是否符合GB 18384碰撞安全要求", "regulation_domains": ["vehicle_safety"]}' 2>/dev/null || echo "{}")
RISK=$(echo "$CHECK_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('risk_level','unknown'))" 2>/dev/null || echo "unknown")

if [[ "$RISK" != "unknown" && -n "$RISK" ]]; then
  ok "合规审查完成,风险等级:$RISK"
else
  warn "合规审查接口返回空(功能可能未完全实现)"
fi
echo ""
|
||||
|
||||
# ── Loop ③: regulation monitoring ────────────────────────────
info "=== 闭环③:法规监控源配置 ==="

SOURCE_PAYLOAD='{"name": "测试监控源", "url": "https://std.samr.gov.cn", "domain": "vehicle_safety"}'
SOURCE_RESP=$(curl -sf -X POST "$API_BASE/api/regulation/sources" \
  -H "Content-Type: application/json" \
  -d "$SOURCE_PAYLOAD" 2>/dev/null || echo "{}")
# Pull the id of the created source out of the JSON reply (empty on any error).
SOURCE_ID=$(echo "$SOURCE_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")

if [[ -z "$SOURCE_ID" ]]; then
  warn "监控源配置返回空(功能可能未完全实现)"
else
  ok "监控源配置成功:$SOURCE_ID"
fi
echo ""
|
||||
|
||||
# ── Summary ────────────────────────────────────────
echo ""
echo -e "${BLUE}══════════════════════════════════════════${NC}"
if [[ $FAILED -eq 0 ]]; then
  echo -e "${GREEN} 全部检查通过!${NC}"
else
  echo -e "${YELLOW} 完成,${FAILED} 项失败${NC}(部分功能可能尚未实现)"
fi
echo -e "${BLUE}══════════════════════════════════════════${NC}"
echo ""
echo "查看服务日志:"
echo " docker compose logs -f compliance-backend"
echo " docker compose logs -f celery-worker"

# Report hard failures through the exit status so CI and wrapper scripts can
# react; soft problems reported with warn are not counted in FAILED.
if [[ $FAILED -ne 0 ]]; then
  exit 1
fi
|
||||
66
scripts/check_health.sh
Normal file
66
scripts/check_health.sh
Normal file
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env bash
# ══════════════════════════════════════════════════
# check_health.sh
# Report the health and resource usage of all services.
# Usage: bash scripts/check_health.sh
# Runs without "set -e" on purpose: every section is printed even when an
# earlier probe fails.
# ══════════════════════════════════════════════════
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
# Everything below (docker compose, df, .env) assumes the project root.
cd "$PROJECT_DIR" || exit 1

GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'

echo ""
echo -e "${BLUE}══════════════════════════════════════════${NC}"
echo -e "${BLUE} 服务健康检查报告${NC}"
echo -e "${BLUE}══════════════════════════════════════════${NC}"
echo ""

# Docker service status
echo -e "${BLUE}【Docker Compose 服务状态】${NC}"
docker compose ps --format "table {{.Service}}\t{{.Status}}\t{{.Ports}}"
echo ""

# HTTP endpoint checks
echo -e "${BLUE}【HTTP 健康端点】${NC}"
# Print [OK] or [FAIL] for one endpoint.  $1 display name, $2 URL.
check_http() {
  local name=$1; local url=$2
  if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
    echo -e " ${GREEN}[OK]${NC} $name ($url)"
  else
    echo -e " ${RED}[FAIL]${NC} $name ($url)"
  fi
}

check_http "API 网关" "http://localhost/health"
check_http "业务后端" "http://localhost:8000/health"
check_http "嵌入服务" "http://localhost:8010/health"
check_http "解析服务" "http://localhost:8011/health"
check_http "Milvus" "http://localhost:9091/healthz"
check_http "Neo4j" "http://localhost:7474"
echo ""

# Resource usage
echo -e "${BLUE}【容器资源使用】${NC}"
docker stats --no-stream --format \
  "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" \
  2>/dev/null | head -15
echo ""

# Disk usage
echo -e "${BLUE}【磁盘使用】${NC}"
df -h . | tail -1 | awk '{print " 项目目录:已用 "$3",可用 "$4"(" $5 " 使用率)"}'
docker system df 2>/dev/null | head -6
echo ""

# LLM configuration check
echo -e "${BLUE}【LLM API 配置】${NC}"
# True when $1 is non-empty and is not the "sk-xxxx…" placeholder shipped in
# .env.example, which would otherwise be reported as a configured key.
is_real_key() {
  [[ -n "$1" && ! "$1" =~ ^sk-x+$ ]]
}
source .env 2>/dev/null || true
if is_real_key "${DEEPSEEK_API_KEY:-}"; then
  echo -e " ${GREEN}[OK]${NC} DeepSeek API Key 已配置"
elif is_real_key "${DASHSCOPE_API_KEY:-}"; then
  echo -e " ${GREEN}[OK]${NC} DashScope (Qwen) API Key 已配置"
else
  echo -e " ${YELLOW}[WARN]${NC} 未配置 LLM API Key(LLM 功能不可用)"
fi
echo ""
|
||||
91
scripts/download_models.sh
Normal file
91
scripts/download_models.sh
Normal file
@@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env bash
|
||||
# ══════════════════════════════════════════════════
|
||||
# download_models.sh
|
||||
# 预下载 AI 模型到 ./models 目录(加速容器启动)
|
||||
# 支持 HuggingFace 镜像加速(国内网络)
|
||||
# 用法:bash scripts/download_models.sh
|
||||
# ══════════════════════════════════════════════════
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
cd "$PROJECT_DIR"
|
||||
|
||||
BLUE='\033[0;34m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m'
|
||||
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||
|
||||
MODELS_DIR="$PROJECT_DIR/models"
|
||||
mkdir -p "$MODELS_DIR"
|
||||
|
||||
# 设置镜像加速
|
||||
export HF_ENDPOINT="${HF_ENDPOINT:-https://hf-mirror.com}"
|
||||
export HF_HOME="$MODELS_DIR"
|
||||
info "HuggingFace 镜像:$HF_ENDPOINT"
|
||||
info "模型保存路径:$MODELS_DIR"
|
||||
echo ""
|
||||
|
||||
# ── 方法1:通过 huggingface_hub 下载 ────────────
|
||||
# Download a HuggingFace repository snapshot into $MODELS_DIR.
#   $1 HuggingFace repo id   $2 model id to try on ModelScope if that fails
# The repo id and target directory reach Python through argv rather than
# being spliced into the source text, so quotes or other special characters
# in them cannot break or alter the Python program.
download_hf() {
  local repo=$1
  local local_name=$2
  info "下载 $repo..."
  if HF_ENDPOINT="$HF_ENDPOINT" HF_HOME="$MODELS_DIR" \
      python3 - "$repo" "$MODELS_DIR" 2>&1 <<'PY'
import sys

from huggingface_hub import snapshot_download

repo_id, cache_dir = sys.argv[1], sys.argv[2]
snapshot_download(repo_id=repo_id, cache_dir=cache_dir)
print('下载完成')
PY
  then
    ok "$repo 下载成功"
  else
    warn "$repo HuggingFace 下载失败,尝试 ModelScope..."
    download_modelscope "$repo" "$local_name"
  fi
}
|
||||
|
||||
# ── 方法2:通过 ModelScope 下载(备用)──────────
|
||||
# Fallback download through ModelScope (best effort).
#   $1 HuggingFace repo id   $2 ModelScope model id (defaults to $1)
# The Python helper exits non-zero when ModelScope is missing or the download
# fails, so the warning below is actually shown; previously every error was
# swallowed inside Python and the "|| warn" branch could never run.
# Always returns 0 so a failed pre-download does not abort the calling script.
download_modelscope() {
  local ms_name=${2:-$1}
  python3 - "$ms_name" "${MODELS_DIR}/modelscope" 2>&1 <<'PY' || warn "ModelScope 下载也失败,模型将在容器启动时自动下载"
import sys

try:
    from modelscope import snapshot_download
except ImportError:
    print('ModelScope 未安装,跳过')
    sys.exit(1)

try:
    snapshot_download(model_id=sys.argv[1], cache_dir=sys.argv[2])
except Exception as e:
    print(f'ModelScope 下载失败: {e}')
    sys.exit(1)
print('ModelScope 下载完成')
PY
}
|
||||
|
||||
# ── 检查 Python 环境 ────────────────────────────
|
||||
if ! python3 -c "import huggingface_hub" 2>/dev/null; then
|
||||
warn "未安装 huggingface_hub,尝试安装..."
|
||||
pip3 install -q huggingface_hub modelscope 2>/dev/null || \
|
||||
warn "安装失败,模型将在容器首次启动时下载"
|
||||
fi
|
||||
|
||||
# ── 下载模型列表 ────────────────────────────────
|
||||
info "=== 下载 BGE-M3 嵌入模型(约 2.5GB)==="
|
||||
download_hf "BAAI/bge-m3" "BAAI/bge-m3"
|
||||
echo ""
|
||||
|
||||
info "=== 下载 BGE-Reranker 精排模型(约 1.1GB)==="
|
||||
download_hf "BAAI/bge-reranker-v2-m3" "BAAI/bge-reranker-v2-m3"
|
||||
echo ""
|
||||
|
||||
# MinerU 模型通过容器内脚本下载(依赖 magic-pdf 配置)
|
||||
info "=== MinerU 模型说明 ==="
|
||||
warn "MinerU 模型(约 2GB)将在 mcp-server 容器首次启动时自动下载"
|
||||
warn "如需预下载,请在 mcp-server 容器内运行:mineru-models-download"
|
||||
echo ""
|
||||
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo -e "${GREEN} 模型下载完成!${NC}"
|
||||
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||
echo ""
|
||||
echo "已下载到:$MODELS_DIR"
|
||||
du -sh "$MODELS_DIR" 2>/dev/null || true
|
||||
37
scripts/reset_all.sh
Normal file
37
scripts/reset_all.sh
Normal file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env bash
# ══════════════════════════════════════════════════
# reset_all.sh
# ⚠️ DANGEROUS: stops every service and deletes all data. Use with care!
# Usage: bash scripts/reset_all.sh
# ══════════════════════════════════════════════════
# Abort on the first error: nothing may be deleted if changing into the
# project directory or stopping the containers fails.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_DIR"

RED='\033[0;31m'; YELLOW='\033[1;33m'; NC='\033[0m'

echo ""
echo -e "${RED}╔══════════════════════════════════════════╗${NC}"
echo -e "${RED}║ ⚠️ 警告:此操作将删除所有数据! ║${NC}"
echo -e "${RED}║ 包括:PostgreSQL / Milvus / Neo4j 数据 ║${NC}"
echo -e "${RED}║ 以及所有上传的文件和日志 ║${NC}"
echo -e "${RED}╚══════════════════════════════════════════╝${NC}"
echo ""
echo -e "${YELLOW}确认要重置所有数据吗?(输入 'yes' 确认,其他取消)${NC}"
# End-of-input (e.g. a closed stdin) counts as "cancel".
read -r CONFIRM || CONFIRM=""

if [[ "$CONFIRM" != "yes" ]]; then
  echo "已取消"
  exit 0
fi

echo ""
echo "停止所有服务..."
docker compose down --volumes --remove-orphans

echo "清理数据目录..."
# Absolute paths guarded by :? so an empty PROJECT_DIR can never turn these
# globs into /data/... or /logs/...
rm -rf -- "${PROJECT_DIR:?}"/data/uploads/* "${PROJECT_DIR:?}"/data/parsed/* "${PROJECT_DIR:?}"/logs/*
echo "✓ 数据目录已清空(保留目录结构)"

echo ""
echo -e "${YELLOW}重置完成。重新启动:bash scripts/06_start_all.sh${NC}"
|
||||
24
services/compliance-backend/Dockerfile
Normal file
24
services/compliance-backend/Dockerfile
Normal file
@@ -0,0 +1,24 @@
|
||||
FROM python:3.12-slim

WORKDIR /app

# curl is used by the HEALTHCHECK below.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# uv speeds up dependency installation.
RUN pip install --no-cache-dir uv

# Install dependencies from pyproject.toml before the application sources
# are copied in.
COPY pyproject.toml .
RUN uv pip install --system --no-cache -r pyproject.toml \
    --index-url https://pypi.tuna.tsinghua.edu.cn/simple \
    --trusted-host pypi.tuna.tsinghua.edu.cn

COPY app/ ./app/

EXPOSE 8000

HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=5 \
    CMD curl -f http://localhost:8000/health || exit 1

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
|
||||
0
services/compliance-backend/app/__init__.py
Normal file
0
services/compliance-backend/app/__init__.py
Normal file
0
services/compliance-backend/app/api/__init__.py
Normal file
0
services/compliance-backend/app/api/__init__.py
Normal file
95
services/compliance-backend/app/api/compliance.py
Normal file
95
services/compliance-backend/app/api/compliance.py
Normal file
@@ -0,0 +1,95 @@
|
||||
import uuid
|
||||
import logging
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from langchain.schema import HumanMessage, SystemMessage
|
||||
|
||||
from ..core.llm import get_llm, COMPLIANCE_CHECK_PROMPT
|
||||
from ..services.rag import hybrid_search
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/api/compliance", tags=["合规审查"])
|
||||
|
||||
|
||||
class ComplianceCheckRequest(BaseModel):
    """Request body of ``POST /api/compliance/check``."""

    # Text to assess; it is also used as the retrieval query.
    query: str
    # Regulation domains named by the caller.
    regulation_domains: list[str] = ["vehicle_safety"]
    # Number of regulation chunks retrieved and passed on to the LLM.
    top_k: int = 5
|
||||
|
||||
|
||||
class ComplianceCheckResponse(BaseModel):
    """Response body of ``POST /api/compliance/check``."""

    # One of "critical", "high", "medium", "low" or "unknown".
    risk_level: str
    # Numeric companion of risk_level (the endpoint uses 0, 20, 50, 70, 90).
    risk_score: float
    # Analysis results; each entry is a small dict.
    findings: list[dict]
    # Suggested follow-up actions.
    recommendations: list[str]
    # Regulation excerpts, with scores, that the assessment was based on.
    sources: list[dict]
|
||||
|
||||
|
||||
def _classify_risk(analysis: str) -> tuple[str, float]:
    """Derive ``(risk_level, risk_score)`` from free-text LLM output.

    English keywords must stand alone as words, so "follow", "below" or
    "highlight" no longer count as "low" / "high". The guard only looks at
    ASCII letters, which keeps keywords embedded in Chinese text matching.
    The first hit in the order critical → high → low wins; the fallback is
    medium.
    """
    import re

    lowered = analysis.lower()

    def mentions(keyword: str, phrase: str) -> bool:
        pattern = rf"(?<![a-z]){keyword}(?![a-z])"
        return re.search(pattern, lowered) is not None or phrase in analysis

    if mentions("critical", "严重"):
        return "critical", 90.0
    if mentions("high", "高风险"):
        return "high", 70.0
    if mentions("low", "低风险"):
        return "low", 20.0
    return "medium", 50.0


@router.post("/check", response_model=ComplianceCheckResponse)
async def check_compliance(req: ComplianceCheckRequest):
    """Assess ``req.query`` against the regulation library.

    Retrieves regulation chunks, asks the LLM for an analysis and maps the
    answer to a coarse risk level. When nothing is retrieved, or the LLM call
    fails, the response carries ``risk_level="unknown"`` rather than a
    fabricated rating.
    """
    # NOTE(review): the search call takes no domain argument here, so one
    # search per domain would repeat the identical query. Run it once.
    # TODO: pass the domain through once hybrid_search supports filtering.
    candidates = []
    if req.regulation_domains:
        candidates = await hybrid_search(
            req.query,
            collection_name="regulation_chunks",
            top_k=req.top_k,
        )

    # De-duplicate by id, best score first.
    seen_ids = set()
    unique_chunks = []
    for chunk in sorted(candidates, key=lambda c: c["score"], reverse=True):
        if chunk["id"] not in seen_ids:
            seen_ids.add(chunk["id"])
            unique_chunks.append(chunk)
    top_chunks = unique_chunks[:req.top_k]

    if not top_chunks:
        return ComplianceCheckResponse(
            risk_level="unknown",
            risk_score=0,
            findings=[{"issue": "未找到相关法规,请先上传法规文档"}],
            recommendations=["上传相关法规文档到知识库后重试"],
            sources=[],
        )

    sources = [{"content": c["content"][:200], "score": c["score"]} for c in top_chunks]

    # Numbered regulation excerpts (500 characters each) for the prompt.
    regulations_text = "\n\n".join(
        f"[{i+1}] {c['content'][:500]}" for i, c in enumerate(top_chunks)
    )
    prompt = COMPLIANCE_CHECK_PROMPT.format(
        content=req.query,
        regulations=regulations_text,
    )

    llm = get_llm(temperature=0.0)
    try:
        response = await llm.ainvoke([HumanMessage(content=prompt)])
    except Exception:
        # Do not run the keyword classifier over an error message: that would
        # turn an outage into a "medium" (or random) rating and expose the
        # exception text to the client.
        logger.exception("LLM 合规分析失败")
        return ComplianceCheckResponse(
            risk_level="unknown",
            risk_score=0,
            findings=[{"issue": "LLM 分析失败,请稍后重试"}],
            recommendations=["检查 LLM API 配置后重试"],
            sources=sources,
        )
    analysis = response.content

    # Keyword-based parsing of the LLM output (structured output would be
    # the production-grade choice).
    risk_level, risk_score = _classify_risk(analysis)

    return ComplianceCheckResponse(
        risk_level=risk_level,
        risk_score=risk_score,
        findings=[{"analysis": analysis}],
        recommendations=["请参考上述分析进行整改"],
        sources=sources,
    )
|
||||
114
services/compliance-backend/app/api/kb.py
Normal file
114
services/compliance-backend/app/api/kb.py
Normal file
@@ -0,0 +1,114 @@
|
||||
import uuid
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, Depends, UploadFile, File, Form, HTTPException, BackgroundTasks
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select
|
||||
|
||||
from ..core.deps import get_db
|
||||
from ..models.db import Workspace, File as FileRecord, Task
|
||||
from ..services.rag import hybrid_search, rerank, generate_answer
|
||||
from ..worker import process_file_task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/api/kb", tags=["知识库"])
|
||||
|
||||
UPLOAD_DIR = Path("/app/data/uploads")
|
||||
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
class WorkspaceCreate(BaseModel):
    """Request body for creating a knowledge-base workspace."""

    # Display name of the workspace.
    name: str
    # Optional free-text description.
    description: str = ""
    # Domain tag stored with the workspace.
    domain: str = "general"
|
||||
|
||||
|
||||
class QARequest(BaseModel):
    """Request body shared by the Q&A and retrieval endpoints."""

    # The user's question / search text.
    query: str
    # Workspace forwarded to the search call; None when not given.
    workspace_id: str | None = None
    # Number of chunks to return / feed to answer generation.
    top_k: int = 5
    # When False, the Q&A endpoint drops "sources" from its result.
    return_sources: bool = True
|
||||
|
||||
|
||||
@router.post("/workspaces")
async def create_workspace(req: WorkspaceCreate, db: AsyncSession = Depends(get_db)):
    """Add a workspace row and return its id, name and domain."""
    attributes = {
        "name": req.name,
        "description": req.description,
        "domain": req.domain,
    }
    workspace = Workspace(**attributes)
    db.add(workspace)
    # Flush before reading workspace.id for the response.
    await db.flush()
    created = {
        "id": str(workspace.id),
        "name": workspace.name,
        "domain": workspace.domain,
    }
    return created
|
||||
|
||||
|
||||
@router.post("/files/upload")
async def upload_file(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    workspace_id: str = Form(default=""),
    db: AsyncSession = Depends(get_db),
):
    """Store an uploaded file and queue its parse-and-vectorise task.

    Args:
        background_tasks: Unused; kept so the signature stays unchanged.
        file: The uploaded document.
        workspace_id: Target workspace UUID, or "" for none.
        db: Request-scoped database session.

    Returns:
        ``{"file_id", "task_id", "status": "processing"}``.

    Raises:
        HTTPException: 400 when ``workspace_id`` is not a valid UUID.
    """
    # Validate before anything is written, so a malformed id is a clean 400
    # instead of a 500 with an orphaned file on disk.
    try:
        workspace_uuid = uuid.UUID(workspace_id) if workspace_id else None
    except ValueError:
        raise HTTPException(status_code=400, detail="workspace_id 不是合法的 UUID") from None

    content = await file.read()
    file_uuid = uuid.uuid4()
    file_id = str(file_uuid)
    suffix = Path(file.filename or "doc").suffix
    stored_name = f"{file_id}{suffix}"
    save_path = UPLOAD_DIR / stored_name
    save_path.write_bytes(content)

    file_record = FileRecord(
        id=file_uuid,
        filename=stored_name,
        original_name=file.filename or "unknown",
        file_type=suffix.lstrip("."),
        file_size=len(content),
        storage_path=str(save_path),
        workspace_id=workspace_uuid,
        status="uploaded",
    )
    db.add(file_record)

    task = Task(
        task_type="parse_and_vectorize",
        status="pending",
        file_id=file_uuid,
        payload={"workspace_id": workspace_id},
    )
    db.add(task)
    await db.flush()
    task_id = str(task.id)

    # Commit before enqueueing: the worker is handed only ids and presumably
    # looks the rows up in its own session, where uncommitted rows would be
    # invisible if it starts before this request finishes.
    await db.commit()

    # Hand the job to Celery.
    celery_task = process_file_task.delay(file_id, task_id, workspace_id)
    task.celery_task_id = celery_task.id
    await db.flush()

    return {"file_id": file_id, "task_id": task_id, "status": "processing"}
|
||||
|
||||
|
||||
@router.get("/tasks/{task_id}")
async def get_task(task_id: str, db: AsyncSession = Depends(get_db)):
    """Return status, progress and error details of one task.

    Raises:
        HTTPException: 404 when the task does not exist, including when
            ``task_id`` is not a valid UUID and so cannot match any row.
    """
    try:
        task_uuid = uuid.UUID(task_id)
    except ValueError:
        # Previously a malformed id escaped as an unhandled ValueError (500).
        raise HTTPException(status_code=404, detail="任务不存在") from None

    result = await db.execute(select(Task).where(Task.id == task_uuid))
    task = result.scalar_one_or_none()
    if not task:
        raise HTTPException(status_code=404, detail="任务不存在")
    return {
        "task_id": str(task.id),
        "status": task.status,
        "progress": task.progress,
        "file_id": str(task.file_id) if task.file_id else None,
        "error_msg": task.error_msg,
        "completed_at": task.completed_at.isoformat() if task.completed_at else None,
    }
|
||||
|
||||
|
||||
@router.post("/qa")
async def qa(req: QARequest):
    """Answer a question: search, rerank, then generate from the best chunks."""
    # Retrieve twice as many candidates as needed; reranking trims to top_k.
    candidates = await hybrid_search(
        req.query,
        workspace_id=req.workspace_id,
        top_k=req.top_k * 2,
    )
    best_chunks = await rerank(req.query, candidates, top_k=req.top_k)
    answer = await generate_answer(req.query, best_chunks)
    if req.return_sources:
        return answer
    answer.pop("sources", None)
    return answer
|
||||
|
||||
|
||||
@router.post("/knowledge/retrieval")
async def retrieval(req: QARequest):
    """Return the raw search hits for a query, with no reranking or answer."""
    hits = await hybrid_search(
        req.query,
        workspace_id=req.workspace_id,
        top_k=req.top_k,
    )
    payload = {"chunks": hits, "total": len(hits)}
    return payload
|
||||
111
services/compliance-backend/app/api/regulation.py
Normal file
111
services/compliance-backend/app/api/regulation.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import uuid
|
||||
import logging
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select, desc
|
||||
|
||||
from ..core.deps import get_db
|
||||
from ..models.db import RegulationSource, RegulationUpdate
|
||||
from ..worker import fetch_regulation_source
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/api/regulation", tags=["法规监控"])
|
||||
|
||||
|
||||
class SourceCreate(BaseModel):
    """Request body for registering a regulation monitoring source."""

    # Display name of the source.
    name: str
    # Address stored for the source.
    url: str
    # Regulation domain the source belongs to.
    domain: str = "vehicle_safety"
    # Stored as-is; presumably seconds (86400 = one day) — confirm with the worker.
    fetch_interval: int = 86400
    # Free-form fetch options, stored as given.
    fetch_config: dict = {}
|
||||
|
||||
|
||||
class SubscribeRequest(BaseModel):
    """Request body for subscribing to regulation update notifications."""

    # Display name of the subscription.
    name: str
    # Delivery channel: email / webhook / feishu / dingtalk.
    channel: str
    # Channel-specific destination.
    target: str
    # Domains of interest.
    domains: list[str] = []
    # Minimum importance for a notification.
    importance_min: str = "normal"
|
||||
|
||||
|
||||
@router.post("/sources")
async def create_source(req: SourceCreate, db: AsyncSession = Depends(get_db)):
    """Add a monitoring source row and return its id, name, url and domain."""
    new_source = RegulationSource(
        name=req.name,
        url=req.url,
        domain=req.domain,
        fetch_interval=req.fetch_interval,
        fetch_config=req.fetch_config,
    )
    db.add(new_source)
    # Flush before reading new_source.id for the response.
    await db.flush()

    summary = {
        "id": str(new_source.id),
        "name": new_source.name,
        "url": new_source.url,
        "domain": new_source.domain,
    }
    # The status is reported as a constant; it is not read from the row.
    summary["status"] = "active"
    return summary
|
||||
|
||||
|
||||
@router.get("/sources")
async def list_sources(db: AsyncSession = Depends(get_db)):
    """List all active monitoring sources (id, name, url, domain)."""
    # .is_(True) is the SQLAlchemy way to test a boolean column; it avoids
    # the "== True" comparison that linters flag (E712).
    result = await db.execute(
        select(RegulationSource).where(RegulationSource.is_active.is_(True))
    )
    sources = result.scalars().all()
    return [{"id": str(s.id), "name": s.name, "url": s.url, "domain": s.domain} for s in sources]
|
||||
|
||||
|
||||
@router.post("/sources/{source_id}/fetch")
async def manual_fetch(source_id: str, db: AsyncSession = Depends(get_db)):
    """Manually queue a fetch of one monitoring source (for testing).

    Raises:
        HTTPException: 404 when the source does not exist, including when
            ``source_id`` is not a valid UUID and so cannot match any row.
    """
    try:
        source_uuid = uuid.UUID(source_id)
    except ValueError:
        # Previously a malformed id escaped as an unhandled ValueError (500).
        raise HTTPException(status_code=404, detail="监控源不存在") from None

    result = await db.execute(
        select(RegulationSource).where(RegulationSource.id == source_uuid)
    )
    source = result.scalar_one_or_none()
    if not source:
        raise HTTPException(status_code=404, detail="监控源不存在")

    task = fetch_regulation_source.delay(source_id)
    return {"task_id": task.id, "status": "queued", "source_id": source_id}
|
||||
|
||||
|
||||
@router.get("/updates")
async def get_updates(
    domain: str | None = None,
    limit: int = 20,
    offset: int = 0,
    db: AsyncSession = Depends(get_db),
):
    """Return regulation updates, newest first, one page at a time.

    Args:
        domain: Accepted but not applied yet (see the note below).
        limit: Page size; negative values are treated as 0.
        offset: Rows to skip; negative values are treated as 0.
        db: Request-scoped database session.
    """
    # NOTE(review): `domain` is currently ignored. Filtering needs a domain
    # column or a join to the source table, and which of these the model
    # offers is not visible from this module — confirm and add the filter.

    # Clamp so a negative value is not passed on to the database.
    limit = max(limit, 0)
    offset = max(offset, 0)

    query = select(RegulationUpdate).order_by(desc(RegulationUpdate.fetched_at))
    result = await db.execute(query.limit(limit).offset(offset))
    updates = result.scalars().all()
    return {
        "updates": [
            {
                "id": str(u.id),
                "title": u.title,
                "url": u.url,
                "change_type": u.change_type,
                "summary": u.summary,
                "importance": u.importance,
                "fetched_at": u.fetched_at.isoformat() if u.fetched_at else None,
            }
            for u in updates
        ]
    }
|
||||
|
||||
|
||||
@router.post("/subscribe")
async def subscribe(req: SubscribeRequest, db: AsyncSession = Depends(get_db)):
    """Acknowledge a subscription request.

    Simplified version: nothing is stored here. The response echoes the
    request together with a newly generated id; the original comment places
    the delivery logic in the push worker.
    """
    # The unused function-local import of Workspace was removed; `db` stays
    # in the signature so the endpoint's dependencies are unchanged.
    return {
        "id": str(uuid.uuid4()),
        "name": req.name,
        "channel": req.channel,
        "domains": req.domains,
        "status": "active",
    }
|
||||
0
services/compliance-backend/app/core/__init__.py
Normal file
0
services/compliance-backend/app/core/__init__.py
Normal file
37
services/compliance-backend/app/core/config.py
Normal file
37
services/compliance-backend/app/core/config.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application settings.

    Values are read from the environment and from a ``.env`` file; variables
    that match no field are ignored. The literals below are defaults only.
    """

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    # Application
    app_env: str = "development"
    log_level: str = "INFO"
    api_secret_key: str = "change_this_key"

    # Relational database and Redis
    database_url: str = "postgresql+asyncpg://compliance:compliance123@postgres:5432/compliance_db"
    redis_url: str = "redis://:redis123@redis:6379/0"

    # Milvus
    milvus_host: str = "milvus"
    milvus_port: int = 19530

    # Neo4j
    neo4j_uri: str = "bolt://neo4j:7687"
    neo4j_user: str = "neo4j"
    neo4j_password: str = "neo4j123"

    # AI services
    embedding_service_url: str = "http://embedding-service:8010"
    mcp_server_url: str = "http://mcp-server:8011"

    # LLM
    llm_provider: str = "deepseek"  # deepseek / qwen
    deepseek_api_key: str = ""
    deepseek_model: str = "deepseek-chat"
    dashscope_api_key: str = ""
    qwen_model: str = "qwen-plus"
|
||||
|
||||
|
||||
settings = Settings()
|
||||
54
services/compliance-backend/app/core/deps.py
Normal file
54
services/compliance-backend/app/core/deps.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from functools import lru_cache
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import httpx
|
||||
from neo4j import AsyncGraphDatabase
|
||||
from pymilvus import connections, Collection
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
|
||||
|
||||
from .config import settings
|
||||
|
||||
# ── PostgreSQL ──────────────────────────────────
|
||||
# Shared async engine: 10 pooled connections plus up to 20 overflow connections.
engine = create_async_engine(
    settings.database_url,
    pool_size=10,
    max_overflow=20,
)
# Session factory bound to the engine; objects are not expired on commit.
AsyncSessionLocal = async_sessionmaker(engine, expire_on_commit=False)
|
||||
|
||||
|
||||
async def get_db() -> AsyncGenerator[AsyncSession, None]:
    """Yield a database session and finish its transaction afterwards.

    The session is committed once the consumer resumes the generator without
    error. If the consumer or the commit raises an ``Exception``, the
    transaction is rolled back and the exception is re-raised.

    Yields:
        A session created from ``AsyncSessionLocal``.
    """
    async with AsyncSessionLocal() as db_session:
        try:
            yield db_session
            # Commit stays inside the try so a failing commit is rolled back too.
            await db_session.commit()
        except Exception:
            await db_session.rollback()
            raise
|
||||
|
||||
|
||||
# ── Milvus ──────────────────────────────────────
|
||||
def get_milvus_collection(name: str) -> Collection:
    """Connect to the configured Milvus server and return a collection handle.

    ``connections.connect`` is invoked on every call with the host and port
    from settings.

    Args:
        name: Name of the Milvus collection.

    Returns:
        A ``Collection`` object for ``name``.
    """
    milvus_endpoint = {"host": settings.milvus_host, "port": settings.milvus_port}
    connections.connect(**milvus_endpoint)
    return Collection(name)
|
||||
|
||||
|
||||
# ── Neo4j ───────────────────────────────────────
|
||||
# Process-wide Neo4j driver, created on first use.
_neo4j_driver = None


def get_neo4j():
    """Return the shared async Neo4j driver, creating it on the first call.

    The driver is built from the URI, user and password in settings and cached
    in the module-level ``_neo4j_driver``.
    """
    global _neo4j_driver
    if _neo4j_driver is not None:
        return _neo4j_driver

    credentials = (settings.neo4j_user, settings.neo4j_password)
    _neo4j_driver = AsyncGraphDatabase.driver(settings.neo4j_uri, auth=credentials)
    return _neo4j_driver
|
||||
|
||||
|
||||
# ── HTTP 客户端(复用连接池)────────────────────
|
||||
# Process-wide HTTP client, created on first use.
_http_client = None


def get_http_client() -> httpx.AsyncClient:
    """Return the shared ``httpx.AsyncClient``, creating it on the first call.

    The client uses a 120-second timeout and is cached in the module-level
    ``_http_client`` so later calls reuse the same instance.
    """
    global _http_client
    if _http_client is not None:
        return _http_client

    _http_client = httpx.AsyncClient(timeout=120.0)
    return _http_client
|
||||
56
services/compliance-backend/app/core/llm.py
Normal file
56
services/compliance-backend/app/core/llm.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from langchain_openai import ChatOpenAI
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
from .config import settings
|
||||
|
||||
|
||||
def get_llm(temperature: float = 0.1) -> ChatOpenAI:
    """Build a chat client for the configured LLM provider.

    Both supported providers (DeepSeek and Qwen) are accessed through
    ``ChatOpenAI`` with a provider-specific base URL.

    Args:
        temperature: Sampling temperature passed to the client.

    Returns:
        A ``ChatOpenAI`` instance with 3 retries and a 120-second timeout.

    Raises:
        ValueError: If ``settings.llm_provider`` is neither ``"deepseek"``
            nor ``"qwen"``.
    """
    provider = settings.llm_provider
    if provider == "deepseek":
        model_name = settings.deepseek_model
        api_key = settings.deepseek_api_key
        base_url = "https://api.deepseek.com/v1"
    elif provider == "qwen":
        model_name = settings.qwen_model
        api_key = settings.dashscope_api_key
        base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
    else:
        raise ValueError(f"不支持的 LLM 提供商:{settings.llm_provider}")

    return ChatOpenAI(
        model=model_name,
        api_key=api_key,
        base_url=base_url,
        temperature=temperature,
        max_retries=3,
        timeout=120,
    )
|
||||
|
||||
|
||||
RAG_SYSTEM_PROMPT = """你是一位专业的汽车行业合规专家,具备深厚的法规知识(GB标准、UN-ECE、ISO 45001、IATF 16949等)。
|
||||
|
||||
回答规则:
|
||||
1. 仅基于提供的参考文献回答,不添加不在文献中的信息
|
||||
2. 每个关键陈述必须标注来源(格式:[来源:文件名,第X页])
|
||||
3. 如果参考文献不足以回答问题,明确说明
|
||||
4. 使用专业但清晰的语言,适合工程师和法务人员阅读
|
||||
5. 对于数值要求(如绝缘电阻值、时间限制等),精确引用原文"""
|
||||
|
||||
|
||||
COMPLIANCE_CHECK_PROMPT = """你是一位专业的汽车合规审查专家。
|
||||
|
||||
请对以下内容进行合规性评估:
|
||||
|
||||
【待审查内容】
|
||||
{content}
|
||||
|
||||
【相关法规要求】
|
||||
{regulations}
|
||||
|
||||
请按以下格式输出:
|
||||
1. 整体风险等级:[low/medium/high/critical]
|
||||
2. 风险分数:[0-100]
|
||||
3. 发现的合规问题(逐条列出):
|
||||
- 问题描述
|
||||
- 违反的具体法规条款
|
||||
- 严重程度
|
||||
4. 整改建议(具体可操作)"""
|
||||
84
services/compliance-backend/app/main.py
Normal file
84
services/compliance-backend/app/main.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import logging
|
||||
import time
|
||||
|
||||
import structlog
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from prometheus_fastapi_instrumentator import Instrumentator
|
||||
|
||||
from .api import kb, compliance, regulation
|
||||
from .core.config import settings
|
||||
|
||||
# Structured logging: filter by the configured level, falling back to INFO
# when the configured name is not a logging attribute.
_log_level = getattr(logging, settings.log_level.upper(), logging.INFO)
structlog.configure(
    wrapper_class=structlog.make_filtering_bound_logger(_log_level),
)
logger = structlog.get_logger()

app = FastAPI(
    title="AI合规智能中枢 API",
    description="面向车企与工厂的全链路合规智能平台",
    version="0.1.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS: every origin is allowed in development, none otherwise.
_cors_origins = ["*"] if settings.app_env == "development" else []
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Prometheus metrics
Instrumentator().instrument(app).expose(app)

# Register the API routers in their original order.
for _api_module in (kb, compliance, regulation):
    app.include_router(_api_module.router)
|
||||
|
||||
|
||||
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log method, path, status code and latency of every HTTP request.

    Args:
        request: The incoming request.
        call_next: Callable that runs the rest of the middleware/route chain.

    Returns:
        The response produced by ``call_next``, unmodified.
    """
    # perf_counter is monotonic, so the measured latency cannot be skewed (or
    # turn negative) if the wall clock is adjusted while the request runs.
    started = time.perf_counter()
    response = await call_next(request)
    duration_ms = int((time.perf_counter() - started) * 1000)
    logger.info(
        "request",
        method=request.method,
        path=request.url.path,
        status=response.status_code,
        duration_ms=duration_ms,
    )
    return response
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Health check that also probes the dependent services.

    Each dependency's ``/health`` endpoint is requested with a 5-second
    timeout. A 200 reply is reported as ``"ok"``, any other status as
    ``"degraded"``, and any exception as ``"unavailable"``. The top-level
    ``status`` is always ``"ok"``.

    Returns:
        ``{"status": "ok", "services": {"embedding": ..., "mcp": ...}}``.
    """
    import httpx
    from .core.config import settings

    service_states = {}
    report = {"status": "ok", "services": service_states}

    # (label in the report, base URL of the service), probed in this order.
    probes = (
        ("embedding", settings.embedding_service_url),
        ("mcp", settings.mcp_server_url),
    )
    for label, base_url in probes:
        try:
            async with httpx.AsyncClient(timeout=5) as client:
                reply = await client.get(f"{base_url}/health")
                state = "ok" if reply.status_code == 200 else "degraded"
        except Exception:
            state = "unavailable"
        service_states[label] = state

    return report
|
||||
0
services/compliance-backend/app/models/__init__.py
Normal file
0
services/compliance-backend/app/models/__init__.py
Normal file
113
services/compliance-backend/app/models/db.py
Normal file
113
services/compliance-backend/app/models/db.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import Column, String, Integer, BigInteger, Boolean, Text, ARRAY, Numeric
|
||||
from sqlalchemy import DateTime, ForeignKey, func
|
||||
from sqlalchemy.dialects.postgresql import UUID, JSONB, INET
|
||||
from sqlalchemy.orm import DeclarativeBase, relationship
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class shared by the ORM models in this module."""
|
||||
|
||||
|
||||
class Workspace(Base):
    """A workspace that owns uploaded files (table ``workspaces``)."""

    __tablename__ = "workspaces"

    # Primary key, generated client-side as a UUID4.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)

    # Descriptive fields; only the name is mandatory.
    name = Column(String(255), nullable=False)
    description = Column(Text)
    domain = Column(String(100))
    created_by = Column(String(255))

    # Timestamps are filled by the database; updated_at is refreshed on UPDATE.
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
    )

    # Files belonging to this workspace (inverse of File.workspace).
    files = relationship("File", back_populates="workspace")
|
||||
|
||||
|
||||
class File(Base):
    """An uploaded document and its processing state (table ``files``)."""

    __tablename__ = "files"

    # Primary key, generated client-side as a UUID4.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning workspace; rows are removed when the workspace is deleted.
    workspace_id = Column(UUID(as_uuid=True), ForeignKey("workspaces.id", ondelete="CASCADE"))

    filename = Column(String(500), nullable=False)
    original_name = Column(String(500), nullable=False)
    file_type = Column(String(50))
    file_size = Column(BigInteger)
    storage_path = Column(Text)
    parsed_path = Column(Text)

    # Processing status; new rows start as "uploaded".
    status = Column(String(50), default="uploaded")
    error_msg = Column(Text)

    # ``metadata`` is a reserved attribute name in SQLAlchemy's Declarative
    # API, and using it makes the class definition raise. The attribute is
    # therefore exposed as ``metadata_`` while the database column keeps the
    # name "metadata". ``default=dict`` gives every row its own dict.
    metadata_ = Column("metadata", JSONB, default=dict)

    # Timestamps are filled by the database; updated_at is refreshed on UPDATE.
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    workspace = relationship("Workspace", back_populates="files")
    tasks = relationship("Task", back_populates="file")
|
||||
|
||||
|
||||
class Task(Base):
    """A background job and its progress (table ``tasks``)."""

    __tablename__ = "tasks"

    # Primary key, generated client-side as a UUID4.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    task_type = Column(String(100), nullable=False)
    # Lifecycle status; new rows start as "pending".
    status = Column(String(50), default="pending")
    # ``default=dict`` creates a fresh dict per row instead of sharing one
    # mutable ``{}`` object between all inserts.
    payload = Column(JSONB, default=dict)
    result = Column(JSONB)
    error_msg = Column(Text)
    # Integer progress value; new rows start at 0.
    progress = Column(Integer, default=0)
    file_id = Column(UUID(as_uuid=True), ForeignKey("files.id"))
    celery_task_id = Column(String(255))

    # Timestamps are filled by the database; updated_at is refreshed on UPDATE.
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())
    completed_at = Column(DateTime(timezone=True))

    file = relationship("File", back_populates="tasks")
|
||||
|
||||
|
||||
class ComplianceReport(Base):
    """Result of a compliance review of one file (table ``compliance_reports``)."""

    __tablename__ = "compliance_reports"

    # Primary key, generated client-side as a UUID4.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    file_id = Column(UUID(as_uuid=True), ForeignKey("files.id"))
    regulation_domains = Column(ARRAY(Text))
    overall_risk_level = Column(String(20))
    risk_score = Column(Numeric(5, 2))
    # ``default=list`` creates a fresh list per row instead of sharing one
    # mutable ``[]`` object between all inserts.
    findings = Column(JSONB, default=list)
    recommendations = Column(JSONB, default=list)
    report_markdown = Column(Text)
    llm_model = Column(String(100))
    created_at = Column(DateTime(timezone=True), server_default=func.now())
|
||||
|
||||
|
||||
class RegulationSource(Base):
    """A monitored regulation source (table ``regulation_sources``)."""

    __tablename__ = "regulation_sources"

    # Primary key, generated client-side as a UUID4.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    name = Column(String(255), nullable=False)
    url = Column(Text, nullable=False)
    source_type = Column(String(50), default="webpage")
    domain = Column(String(100))
    # NOTE(review): 86400 is presumably seconds (one day) — confirm the unit.
    fetch_interval = Column(Integer, default=86400)
    is_active = Column(Boolean, default=True)
    last_fetched_at = Column(DateTime(timezone=True))
    # Hash of the last fetched content, compared to detect changes.
    last_hash = Column(String(64))
    # ``default=dict`` creates a fresh dict per row instead of sharing one
    # mutable ``{}`` object between all inserts.
    fetch_config = Column(JSONB, default=dict)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
|
||||
|
||||
|
||||
class RegulationUpdate(Base):
    """A detected change of a regulation source (table ``regulation_updates``)."""

    __tablename__ = "regulation_updates"

    # Primary key, generated client-side as a UUID4.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Source whose content changed.
    source_id = Column(UUID(as_uuid=True), ForeignKey("regulation_sources.id"))

    # What changed
    title = Column(String(500))
    url = Column(Text)
    change_type = Column(String(50))
    summary = Column(Text)
    raw_content = Column(Text)
    diff_content = Column(Text)

    # Notification bookkeeping; new rows are un-notified with "normal" importance.
    is_notified = Column(Boolean, default=False)
    importance = Column(String(20), default="normal")

    # fetched_at is filled by the database; published_at is optional.
    fetched_at = Column(DateTime(timezone=True), server_default=func.now())
    published_at = Column(DateTime(timezone=True))
|
||||
21
services/compliance-backend/app/services/embed.py
Normal file
21
services/compliance-backend/app/services/embed.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import httpx
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
from ..core.config import settings
|
||||
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
async def embed_texts(texts: list[str], batch_size: int = 12) -> dict:
    """Call the embedding service and return its JSON reply.

    The call is attempted up to three times with exponential back-off
    (1-10 s) on any exception, including HTTP error statuses.

    Args:
        texts: Texts to embed.
        batch_size: Batch size forwarded to the service.

    Returns:
        The decoded JSON body of the ``/embed`` response.
    """
    endpoint = f"{settings.embedding_service_url}/embed"
    payload = {"texts": texts, "batch_size": batch_size}
    async with httpx.AsyncClient(timeout=120.0) as client:
        reply = await client.post(endpoint, json=payload)
        reply.raise_for_status()
        return reply.json()
|
||||
|
||||
|
||||
async def embed_single(text: str) -> list[float]:
    """Embed one text and return its dense vector.

    Args:
        text: The text to embed.

    Returns:
        The first entry of the ``"dense"`` list returned by ``embed_texts``.
    """
    embeddings = await embed_texts([text], batch_size=1)
    dense_vectors = embeddings["dense"]
    return dense_vectors[0]
|
||||
65
services/compliance-backend/app/services/graph.py
Normal file
65
services/compliance-backend/app/services/graph.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import logging
|
||||
from ..core.deps import get_neo4j
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def create_regulation_node(regulation: dict) -> str:
    """Create or update a regulation node in Neo4j.

    The node is merged on ``id``; title, domain, version and code are then
    overwritten, with missing keys stored as empty strings.

    Args:
        regulation: Mapping read for ``id``, ``title``, ``domain``,
            ``version`` and ``code``.

    Returns:
        The ``id`` returned by the query, or ``None`` if no record comes back.
    """
    cypher = """
            MERGE (r:Regulation {id: $id})
            SET r.title = $title,
                r.domain = $domain,
                r.version = $version,
                r.code = $code
            RETURN r.id as id
            """
    params = {
        "id": regulation.get("id"),
        "title": regulation.get("title", ""),
        "domain": regulation.get("domain", ""),
        "version": regulation.get("version", ""),
        "code": regulation.get("code", ""),
    }
    driver = get_neo4j()
    async with driver.session() as session:
        result = await session.run(cypher, **params)
        record = await result.single()
        if not record:
            return None
        return record["id"]
|
||||
|
||||
|
||||
async def create_clause_node(clause: dict, regulation_id: str) -> str:
    """Create or update a clause node and link it to its regulation.

    The regulation is looked up by ``regulation_id``; the clause is merged on
    ``id`` and connected with a ``CONTAINS`` relationship. Clause content is
    truncated to its first 2000 characters.

    Args:
        clause: Mapping read for ``id``, ``number`` and ``content``.
        regulation_id: ``id`` of the parent regulation node.

    Returns:
        The clause ``id`` returned by the query, or ``None`` if no record
        comes back (for instance when the regulation is not matched).
    """
    cypher = """
            MATCH (r:Regulation {id: $reg_id})
            MERGE (c:Clause {id: $id})
            SET c.number = $number,
                c.content = $content
            MERGE (r)-[:CONTAINS]->(c)
            RETURN c.id as id
            """
    params = {
        "reg_id": regulation_id,
        "id": clause.get("id"),
        "number": clause.get("number", ""),
        "content": clause.get("content", "")[:2000],
    }
    driver = get_neo4j()
    async with driver.session() as session:
        result = await session.run(cypher, **params)
        record = await result.single()
        if not record:
            return None
        return record["id"]
|
||||
|
||||
|
||||
async def search_related_regulations(domain: str, limit: int = 10) -> list[dict]:
    """List regulations stored under a given domain.

    Args:
        domain: Value matched against the ``domain`` property of the nodes.
        limit: Maximum number of regulations returned.

    Returns:
        One dict per regulation with ``id``, ``title``, ``code`` and ``version``.
    """
    cypher = """
            MATCH (r:Regulation {domain: $domain})
            RETURN r.id as id, r.title as title, r.code as code, r.version as version
            LIMIT $limit
            """
    driver = get_neo4j()
    async with driver.session() as session:
        result = await session.run(cypher, domain=domain, limit=limit)
        regulations = []
        async for record in result:
            regulations.append(dict(record))
        return regulations
|
||||
59
services/compliance-backend/app/services/monitor.py
Normal file
59
services/compliance-backend/app/services/monitor.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime, timezone
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def fetch_url(url: str, timeout: int = 30) -> str | None:
    """Download a URL and return its body text.

    Redirects are followed and a fixed User-Agent header is sent. Any
    exception, including an HTTP error status, is logged as a warning and
    turned into ``None``.

    Args:
        url: Address to fetch.
        timeout: Timeout passed to the HTTP client.

    Returns:
        The response text, or ``None`` on failure.
    """
    request_headers = {"User-Agent": "Mozilla/5.0 (compliance-monitor/1.0)"}
    try:
        async with httpx.AsyncClient(
            timeout=timeout,
            headers=request_headers,
            follow_redirects=True,
        ) as client:
            reply = await client.get(url)
            reply.raise_for_status()
            return reply.text
    except Exception as e:
        logger.warning(f"抓取 {url} 失败:{e}")
        return None
|
||||
|
||||
|
||||
def extract_text(html: str) -> str:
    """Extract the visible text from an HTML document.

    ``script``, ``style``, ``nav``, ``footer`` and ``header`` elements are
    removed before the text is collected.

    Args:
        html: Raw HTML markup.

    Returns:
        The remaining text, stripped and joined with newlines.
    """
    boilerplate_tags = ["script", "style", "nav", "footer", "header"]
    soup = BeautifulSoup(html, "html.parser")
    for element in soup(boilerplate_tags):
        element.decompose()
    return soup.get_text(separator="\n", strip=True)
|
||||
|
||||
|
||||
def compute_hash(content: str) -> str:
    """Return the hexadecimal MD5 digest of ``content`` encoded as UTF-8.

    The digest is used as a change fingerprint for fetched pages.
    """
    digest = hashlib.md5()
    digest.update(content.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
async def check_source_for_updates(source: dict) -> dict | None:
    """Check whether a monitored source has changed since the last fetch.

    The page at ``source["url"]`` is fetched, reduced to text and hashed; the
    hash is compared with ``source["last_hash"]``.

    Args:
        source: Mapping with ``url``, ``id`` and ``name``; ``last_hash`` is optional.

    Returns:
        ``None`` when the fetch fails, the page is empty or the hash is
        unchanged. Otherwise a dict with ``source_id``, ``raw_content``,
        ``new_hash`` and ``fetched_at`` (UTC, ISO 8601).
    """
    page = await fetch_url(source["url"])
    if not page:
        return None

    text = extract_text(page)
    new_hash = compute_hash(text)

    unchanged = source.get("last_hash") == new_hash
    if unchanged:
        logger.info(f"监控源 {source['name']} 无变化")
        return None

    fetched_at = datetime.now(timezone.utc).isoformat()
    return {
        "source_id": source["id"],
        "raw_content": text[:50000],  # keep at most the first 50,000 characters
        "new_hash": new_hash,
        "fetched_at": fetched_at,
    }
|
||||
43
services/compliance-backend/app/services/parse.py
Normal file
43
services/compliance-backend/app/services/parse.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import httpx
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
from ..core.config import settings
|
||||
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=30))
async def parse_document(file_content: bytes, filename: str) -> dict:
    """Send a document to the mcp-server parser and return its JSON reply.

    The call is attempted up to three times with exponential back-off
    (2-30 s) on any exception, including HTTP error statuses.

    Args:
        file_content: Raw bytes of the document.
        filename: File name sent along with the upload.

    Returns:
        The decoded JSON body of the ``/parse-document`` response.
    """
    endpoint = f"{settings.mcp_server_url}/parse-document"
    upload = {"file": (filename, file_content, "application/octet-stream")}
    async with httpx.AsyncClient(timeout=300.0) as client:
        reply = await client.post(endpoint, files=upload)
        reply.raise_for_status()
        return reply.json()
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 64) -> list[dict]:
    """Split text into overlapping chunks of roughly ``chunk_size`` tokens.

    Token counts are estimated as two characters per token. A chunk that does
    not reach the end of the text is cut at the last paragraph, line, sentence
    or word boundary found in its second half, when there is one.

    Args:
        text: The text to split.
        chunk_size: Target chunk size in estimated tokens.
        overlap: Overlap between consecutive chunks in estimated tokens.

    Returns:
        A list of dicts with ``idx`` (running index), ``content`` (stripped
        chunk text) and ``start`` / ``end`` (character offsets into ``text``).
        Whitespace-only chunks are omitted.
    """
    chars_per_chunk = chunk_size * 2  # roughly 2 characters per token for Chinese
    chars_overlap = overlap * 2
    separators = ("\n\n", "\n", "。", ".", " ")  # strongest boundary first
    text_len = len(text)

    chunks: list[dict] = []
    start = 0
    idx = 0

    while start < text_len:
        end = min(start + chars_per_chunk, text_len)

        # Prefer a natural boundary, but only one in the second half of the
        # window so chunks do not become too short.
        if end < text_len:
            min_cut = start + chars_per_chunk // 2
            for sep in separators:
                pos = text.rfind(sep, start, end)
                if pos > min_cut:
                    end = pos + len(sep)
                    break

        piece = text[start:end].strip()
        if piece:
            chunks.append({"idx": idx, "content": piece, "start": start, "end": end})
            idx += 1

        # Stop once the end of the text is covered. Without this, stepping
        # back by the overlap re-entered the tail and emitted a near-duplicate
        # chunk for every remaining character.
        if end >= text_len:
            break

        # Step back by the overlap, but always advance by at least one character.
        start = max(start + 1, end - chars_overlap)

    return chunks
|
||||
92
services/compliance-backend/app/services/rag.py
Normal file
92
services/compliance-backend/app/services/rag.py
Normal file
@@ -0,0 +1,92 @@
|
||||
import logging
|
||||
from langchain.schema import HumanMessage, SystemMessage
|
||||
from pymilvus import connections, Collection
|
||||
|
||||
from .embed import embed_single, embed_texts
|
||||
from ..core.llm import get_llm, RAG_SYSTEM_PROMPT
|
||||
from ..core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _get_collection(name: str) -> Collection:
    """Connect to the configured Milvus server and return a collection handle.

    Args:
        name: Name of the Milvus collection.

    Returns:
        A ``Collection`` object for ``name``.
    """
    milvus_endpoint = {"host": settings.milvus_host, "port": settings.milvus_port}
    connections.connect(**milvus_endpoint)
    return Collection(name)
|
||||
|
||||
|
||||
async def hybrid_search(
    query: str,
    collection_name: str = "regulation_chunks",
    top_k: int = 10,
    workspace_id: str | None = None,
) -> list[dict]:
    """Retrieve the chunks most similar to a query from Milvus.

    Despite the name, only dense-vector retrieval is performed (simplified
    research version without BM25 fusion).

    Args:
        query: Natural-language query; embedded with ``embed_single``.
        collection_name: Milvus collection to search.
        top_k: Maximum number of hits.
        workspace_id: When truthy, restricts hits to this workspace.

    Returns:
        One dict per hit with ``id``, ``content``, ``score``, ``file_id``,
        ``chunk_idx`` and ``metadata``.
    """
    query_vec = await embed_single(query)
    collection = _get_collection(collection_name)

    # NOTE(review): workspace_id is interpolated into the filter expression
    # unescaped — confirm callers only pass trusted identifiers.
    if workspace_id:
        filter_expr = f'workspace_id == "{workspace_id}"'
    else:
        filter_expr = None

    search_params = {"metric_type": "COSINE", "params": {"ef": 100}}
    # NOTE(review): this looks like a synchronous call inside a coroutine —
    # confirm it is acceptable to block the event loop here.
    results = collection.search(
        data=[query_vec],
        anns_field="dense_vec",
        param=search_params,
        limit=top_k,
        expr=filter_expr,
        output_fields=["content", "metadata", "file_id", "chunk_idx"],
    )

    return [
        {
            "id": hit.id,
            "content": hit.entity.get("content", ""),
            "score": float(hit.score),
            "file_id": hit.entity.get("file_id", ""),
            "chunk_idx": hit.entity.get("chunk_idx", 0),
            "metadata": hit.entity.get("metadata", {}),
        }
        for hits in results
        for hit in hits
    ]
|
||||
|
||||
|
||||
async def rerank(query: str, chunks: list[dict], top_k: int = 5) -> list[dict]:
    """Return the ``top_k`` chunks with the highest retrieval score.

    Simplified re-ranking: chunks are ordered by their existing ``score``;
    ``query`` is not used. The input list is left unmodified, and chunks with
    equal scores keep their relative order.

    Args:
        query: The user query (unused in this version).
        chunks: Retrieved chunks, each carrying a ``score`` key.
        top_k: Number of chunks to keep.

    Returns:
        A new list with at most ``top_k`` chunks, best score first.
    """
    ranked = list(chunks)
    ranked.sort(key=lambda item: item["score"], reverse=True)
    return ranked[:top_k]
|
||||
|
||||
|
||||
async def generate_answer(query: str, chunks: list[dict]) -> dict:
    """Ask the LLM for an answer grounded in the retrieved chunks.

    Each chunk is prefixed with a numbered source tag built from its metadata
    (file name and page). If the LLM call fails, the error is logged and the
    answer falls back to the error text plus the first 200 characters of the
    top chunk.

    Args:
        query: The user question.
        chunks: Retrieved chunks with ``content`` and optional ``metadata``,
            ``file_id``, ``chunk_idx`` and ``score``.

    Returns:
        ``{"answer": str, "sources": list[dict]}``. With no chunks, a fixed
        "nothing found" answer and an empty source list are returned.
    """
    if not chunks:
        return {"answer": "未找到相关法规内容,请上传相关法规文档后重试。", "sources": []}

    # Build the RAG context: one tagged section per chunk, numbered from 1.
    sections = []
    for i, chunk in enumerate(chunks, 1):
        meta = chunk.get("metadata", {})
        source_info = f"[来源 {i}:{meta.get('filename', '未知文件')},第 {meta.get('page', '?')} 页]"
        sections.append(f"{source_info}\n{chunk['content']}")
    context = "\n\n---\n\n".join(sections)

    user_prompt = f"参考文献:\n\n{context}\n\n问题:{query}\n\n请基于以上参考文献回答,并标注来源。"
    messages = [
        SystemMessage(content=RAG_SYSTEM_PROMPT),
        HumanMessage(content=user_prompt),
    ]
    llm = get_llm(temperature=0.1)

    try:
        response = await llm.ainvoke(messages)
        answer = response.content
    except Exception as e:
        logger.error(f"LLM 生成失败:{e}")
        answer = f"LLM 生成失败:{e}。检索到的相关内容:{chunks[0]['content'][:200]}..."

    # Source excerpts are capped at 300 characters each.
    sources = []
    for c in chunks:
        sources.append(
            {
                "content": c["content"][:300],
                "file_id": c.get("file_id", ""),
                "chunk_idx": c.get("chunk_idx", 0),
                "score": c.get("score", 0),
                "metadata": c.get("metadata", {}),
            }
        )
    return {"answer": answer, "sources": sources}
|
||||
212
services/compliance-backend/app/worker.py
Normal file
212
services/compliance-backend/app/worker.py
Normal file
@@ -0,0 +1,212 @@
|
||||
import uuid
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from celery import Celery
|
||||
from celery.schedules import crontab
|
||||
|
||||
from .core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Celery configuration: Redis is both the broker and the result backend.
celery_app = Celery(
    "compliance",
    broker=settings.redis_url,
    backend=settings.redis_url,
)

# Each task family is routed to its own queue.
_TASK_ROUTES = {
    "app.worker.process_file_task": {"queue": "parse"},
    "app.worker.fetch_regulation_source": {"queue": "monitor"},
    "app.worker.send_notifications": {"queue": "push"},
}

# Periodic jobs: all regulation monitors are triggered daily at 02:00.
_BEAT_SCHEDULE = {
    "daily-regulation-monitor": {
        "task": "app.worker.run_all_monitors",
        "schedule": crontab(hour=2, minute=0),
    },
}

celery_app.conf.update(
    task_serializer="json",
    accept_content=["json"],
    result_serializer="json",
    timezone="Asia/Shanghai",
    task_routes=_TASK_ROUTES,
    beat_schedule=_BEAT_SCHEDULE,
)
|
||||
|
||||
# ── 文件处理任务(解析 + 向量化)────────────────
|
||||
|
||||
@celery_app.task(name="app.worker.process_file_task", bind=True, max_retries=3)
def process_file_task(self, file_id: str, task_id: str, workspace_id: str):
    """Celery entry point: parse a document and store its vectors in Milvus.

    Runs the async ``_process_file`` pipeline to completion on a new event loop.

    Args:
        file_id: UUID string of the file row.
        task_id: UUID string of the task row tracking progress.
        workspace_id: Workspace identifier stored with each vector.
    """
    import asyncio

    pipeline = _process_file(file_id, task_id, workspace_id)
    asyncio.run(pipeline)
|
||||
|
||||
|
||||
async def _process_file(file_id: str, task_id: str, workspace_id: str):
    """Parse a stored file, chunk it, embed the chunks and insert them into Milvus.

    File and task rows are updated as the pipeline advances
    (parsing -> parsed -> vectorized, progress 10 -> 40 -> 100). On any error
    both rows are marked ``failed`` with the error text and the exception is
    re-raised.

    Args:
        file_id: UUID string of the ``File`` row to process.
        task_id: UUID string of the ``Task`` row tracking progress; a missing
            task is tolerated.
        workspace_id: Workspace identifier stored with every vector.

    Raises:
        ValueError: If the parser returns empty markdown.
        Exception: Any error from parsing, embedding or storage, after the
            failure has been recorded.
    """
    from pathlib import Path
    from sqlalchemy import select
    from .core.deps import AsyncSessionLocal, get_milvus_collection
    from .models.db import File, Task
    from .services.parse import parse_document, chunk_text
    from .services.embed import embed_texts

    async with AsyncSessionLocal() as db:
        # Look up the file row; without it there is nothing to do.
        result = await db.execute(select(File).where(File.id == uuid.UUID(file_id)))
        file_record = result.scalar_one_or_none()
        if not file_record:
            logger.error(f"文件 {file_id} 不存在")
            return

        task_result = await db.execute(select(Task).where(Task.id == uuid.UUID(task_id)))
        task = task_result.scalar_one_or_none()

        try:
            # Mark the job as started.
            file_record.status = "parsing"
            if task:
                task.status = "running"
                task.progress = 10
            await db.commit()

            # Step 1: parse the document into markdown.
            file_content = Path(file_record.storage_path).read_bytes()
            parse_result = await parse_document(file_content, file_record.original_name)
            markdown = parse_result.get("markdown", "")

            if not markdown.strip():
                raise ValueError("文档解析结果为空")

            file_record.status = "parsed"
            if task:
                task.progress = 40
            await db.commit()

            # Step 2: split into chunks.
            chunks = chunk_text(markdown, chunk_size=512, overlap=64)
            total_chunks = len(chunks)
            logger.info(f"文件 {file_id} 分割为 {len(chunks)} 块")

            # Step 3: embed and insert, batch by batch.
            batch_size = 16
            col = get_milvus_collection("regulation_chunks")

            for i in range(0, total_chunks, batch_size):
                batch = chunks[i:i + batch_size]
                texts = [c["content"] for c in batch]
                embed_result = await embed_texts(texts, batch_size=batch_size)
                dense_vecs = embed_result["dense"]

                # Column-ordered data: id, file, workspace, index, text, vector, metadata.
                entities = [
                    [f"{file_id}_{c['idx']}" for c in batch],
                    [file_id] * len(batch),
                    [workspace_id] * len(batch),
                    [c["idx"] for c in batch],
                    [c["content"] for c in batch],
                    dense_vecs,
                    [{"filename": file_record.original_name, "page": c.get("page", 0)} for c in batch],
                ]
                col.insert(entities)

                if task:
                    # Clamp to the real chunk count: the last batch is usually
                    # partial, and the unclamped value pushed progress past 100.
                    done = min(i + batch_size, total_chunks)
                    task.progress = 40 + int(60 * done / total_chunks)
                    await db.commit()

            col.flush()

            # Done.
            file_record.status = "vectorized"
            if task:
                task.status = "completed"
                task.progress = 100
                task.completed_at = datetime.now(timezone.utc)
            await db.commit()
            logger.info(f"文件 {file_id} 处理完成")

        except Exception as e:
            # Record the failure on both rows, then let the exception propagate.
            logger.error(f"文件 {file_id} 处理失败:{e}")
            file_record.status = "failed"
            file_record.error_msg = str(e)
            if task:
                task.status = "failed"
                task.error_msg = str(e)
            await db.commit()
            raise
|
||||
|
||||
|
||||
# ── 法规监控任务 ────────────────────────────────
|
||||
|
||||
@celery_app.task(name="app.worker.run_all_monitors")
def run_all_monitors():
    """Celery entry point: trigger a fetch for every active monitoring source.

    Runs the async ``_run_all_monitors`` to completion on a new event loop.
    """
    import asyncio

    dispatch = _run_all_monitors()
    asyncio.run(dispatch)
|
||||
|
||||
|
||||
async def _run_all_monitors():
    """Queue one ``fetch_regulation_source`` task per active regulation source.

    Every ``RegulationSource`` row with ``is_active`` true is loaded, and a
    fetch task is enqueued for each with the source id as a string.
    """
    from sqlalchemy import select
    from .core.deps import AsyncSessionLocal
    from .models.db import RegulationSource

    active_sources = select(RegulationSource).where(RegulationSource.is_active == True)

    async with AsyncSessionLocal() as db:
        rows = await db.execute(active_sources)
        for source in rows.scalars().all():
            fetch_regulation_source.delay(str(source.id))
            logger.info(f"触发监控源抓取:{source.name}")
|
||||
|
||||
|
||||
@celery_app.task(name="app.worker.fetch_regulation_source", bind=True, max_retries=2)
def fetch_regulation_source(self, source_id: str):
    """Celery entry point: fetch one regulation source and record any change.

    Runs the async ``_fetch_source`` to completion on a new event loop.

    Args:
        source_id: UUID string of the ``RegulationSource`` row.
    """
    import asyncio

    fetch = _fetch_source(source_id)
    asyncio.run(fetch)
|
||||
|
||||
|
||||
async def _fetch_source(source_id: str):
    """Fetch one regulation source and store an update row if it changed.

    The source's ``last_fetched_at`` is refreshed on every run. When
    ``check_source_for_updates`` reports new content, ``last_hash`` is updated
    and a ``RegulationUpdate`` row is added. An unknown ``source_id`` is
    silently ignored.

    Args:
        source_id: UUID string of the ``RegulationSource`` row.
    """
    from sqlalchemy import select
    from .core.deps import AsyncSessionLocal
    from .models.db import RegulationSource, RegulationUpdate
    from .services.monitor import check_source_for_updates

    async with AsyncSessionLocal() as db:
        result = await db.execute(
            select(RegulationSource).where(RegulationSource.id == uuid.UUID(source_id))
        )
        source = result.scalar_one_or_none()
        if not source:
            return

        # The monitor service works on a plain dict rather than the ORM object.
        source_dict = {
            "id": str(source.id),
            "name": source.name,
            "url": source.url,
            "last_hash": source.last_hash,
        }
        update_data = await check_source_for_updates(source_dict)

        if update_data:
            logger.info(f"检测到变更:{source.name}")
            source.last_hash = update_data["new_hash"]
            db.add(
                RegulationUpdate(
                    source_id=uuid.UUID(source_id),
                    change_type="updated",
                    raw_content=update_data["raw_content"][:50000],
                    importance="normal",
                )
            )

        # Both outcomes record the fetch time and commit once.
        source.last_fetched_at = datetime.now(timezone.utc)
        await db.commit()
|
||||
|
||||
|
||||
@celery_app.task(name="app.worker.send_notifications")
def send_notifications():
    """Placeholder push-notification task; it currently only logs a message."""
    logger.info("推送通知任务执行(待实现)")
|
||||
|
||||
|
||||
# 导出供 FastAPI 使用
|
||||
worker = celery_app
|
||||
29
services/compliance-backend/pyproject.toml
Normal file
29
services/compliance-backend/pyproject.toml
Normal file
@@ -0,0 +1,29 @@
|
||||
[project]
|
||||
name = "compliance-backend"
|
||||
version = "0.1.0"
|
||||
description = "AI合规智能中枢 — 业务后端"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"fastapi>=0.115",
|
||||
"uvicorn[standard]>=0.30",
|
||||
"pydantic>=2.7",
|
||||
"pydantic-settings>=2.4",
|
||||
"sqlalchemy[asyncio]>=2.0",
|
||||
"asyncpg>=0.29",
|
||||
"redis[asyncio]>=5.0",
|
||||
"celery[redis]>=5.4",
|
||||
"pymilvus>=2.4",
|
||||
"neo4j>=5.20",
|
||||
"langchain>=0.3",
|
||||
"langchain-openai>=0.2",
|
||||
"langchain-community>=0.3",
|
||||
"llama-index-core>=0.11",
|
||||
"httpx>=0.27",
|
||||
"python-multipart>=0.0.9",
|
||||
"python-jose[cryptography]>=3.3",
|
||||
"structlog>=24.0",
|
||||
"prometheus-fastapi-instrumentator>=7.0",
|
||||
"tenacity>=8.5",
|
||||
"beautifulsoup4>=4.12",
|
||||
"requests>=2.32",
|
||||
]
|
||||
24
services/embedding/Dockerfile
Normal file
24
services/embedding/Dockerfile
Normal file
@@ -0,0 +1,24 @@
|
||||
FROM python:3.12-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 系统依赖
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Python 依赖(先装,利用构建缓存)
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt \
|
||||
--index-url https://pypi.tuna.tsinghua.edu.cn/simple \
|
||||
--trusted-host pypi.tuna.tsinghua.edu.cn
|
||||
|
||||
COPY main.py .
|
||||
|
||||
# 健康检查
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
|
||||
CMD curl -f http://localhost:8010/health || exit 1
|
||||
|
||||
EXPOSE 8010
|
||||
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8010", "--workers", "1"]
|
||||
87
services/embedding/main.py
Normal file
87
services/embedding/main.py
Normal file
@@ -0,0 +1,87 @@
|
||||
import os
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Runtime configuration, each value overridable through an environment variable.
MODEL_NAME = os.getenv("MODEL_NAME", "BAAI/bge-m3")
MODEL_CACHE = os.getenv("HF_HOME", "/app/models")
DEVICE = os.getenv("DEVICE", "cpu")
MAX_BATCH = int(os.getenv("MAX_BATCH_SIZE", "16"))

# HF_ENDPOINT (HuggingFace mirror) is intentionally not touched here. The
# previous code copied the variable onto itself when it was set, which had no
# effect.
# NOTE(review): presumably the HuggingFace libraries read HF_ENDPOINT from the
# environment themselves — confirm.

# Loaded model instance; assigned by the lifespan handler at startup.
model = None
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the embedding model before the app starts serving.

    The model is stored in the module-level ``model``. Half precision is
    enabled whenever ``DEVICE`` is not ``"cpu"``. A load failure is logged and
    re-raised, so startup fails. A message is logged on shutdown.

    Args:
        app: The FastAPI application (unused).
    """
    global model
    logger.info(f"加载模型 {MODEL_NAME},设备:{DEVICE}")
    try:
        # Imported lazily so the dependency is only loaded at startup.
        from FlagEmbedding import BGEM3FlagModel

        half_precision = DEVICE != "cpu"
        model = BGEM3FlagModel(
            MODEL_NAME,
            use_fp16=half_precision,
            cache_dir=MODEL_CACHE,
        )
        logger.info("BGE-M3 模型加载完成")
    except Exception as e:
        logger.error(f"模型加载失败:{e}")
        raise
    yield
    logger.info("服务关闭")
|
||||
|
||||
|
||||
app = FastAPI(title="BGE-M3 嵌入服务", lifespan=lifespan)
|
||||
|
||||
|
||||
class EmbedRequest(BaseModel):
    """Request body of ``POST /embed``."""

    # Between 1 and 100 texts per request.
    texts: list[str] = Field(..., min_length=1, max_length=100)
    # Encoding batch size, from 1 up to MAX_BATCH.
    batch_size: int = Field(default=12, ge=1, le=MAX_BATCH)
    # Which vector kinds to compute and return.
    return_dense: bool = True
    return_sparse: bool = True
|
||||
|
||||
|
||||
class EmbedResponse(BaseModel):
    """Response body of ``POST /embed``."""

    # One dense vector per input text; None when dense output was not requested.
    dense: Optional[list[list[float]]] = None
    # One sparse weight mapping per input text; None when not requested.
    sparse: Optional[list[dict]] = None
    # Name of the model that produced the vectors.
    model: str
    # Number of texts that were embedded.
    count: int
|
||||
|
||||
|
||||
@app.post("/embed", response_model=EmbedResponse)
def embed(req: EmbedRequest) -> EmbedResponse:
    """Embed a batch of texts with the loaded model.

    Args:
        req: Texts, batch size and the requested vector kinds.

    Returns:
        Dense and/or sparse vectors, the model name and the number of texts.

    Raises:
        HTTPException: 503 if the model is not loaded, 400 for more than 100
            texts, 500 if encoding fails.
    """
    if model is None:
        raise HTTPException(status_code=503, detail="模型未就绪")
    if len(req.texts) > 100:
        raise HTTPException(status_code=400, detail="单次最多 100 条文本")

    try:
        output = model.encode(
            req.texts,
            batch_size=req.batch_size,
            return_dense=req.return_dense,
            return_sparse=req.return_sparse,
        )
        # Only read the outputs that were requested.
        dense = output["dense_vecs"].tolist() if req.return_dense else None
        if req.return_sparse:
            sparse = [dict(w) for w in output["lexical_weights"]]
        else:
            sparse = None
        return EmbedResponse(
            dense=dense,
            sparse=sparse,
            model=MODEL_NAME,
            count=len(req.texts),
        )
    except Exception as e:
        logger.error(f"嵌入生成失败:{e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.get("/health")
def health():
    """Report service status, model name, device and model readiness."""
    payload = {"status": "ok", "model": MODEL_NAME, "device": DEVICE}
    payload["ready"] = model is not None
    return payload
|
||||
10
services/embedding/requirements.txt
Normal file
10
services/embedding/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
fastapi>=0.115
uvicorn[standard]>=0.30
pydantic>=2.7
FlagEmbedding>=1.3
# CPU-only PyTorch wheels (smaller image). Index options are global options
# in a requirements file and are not applied when written on a requirement
# line, so the extra index is declared on its own line.
--extra-index-url https://download.pytorch.org/whl/cpu
torch>=2.3.0
transformers>=4.44
sentence-transformers>=3.0
huggingface-hub>=0.24
numpy>=1.26
|
||||
38
services/mcp-server/Dockerfile
Normal file
38
services/mcp-server/Dockerfile
Normal file
@@ -0,0 +1,38 @@
|
||||
FROM python:3.12-slim

WORKDIR /app

# System dependencies (MinerU needs libGL). libgl1 replaces the transitional
# package libgl1-mesa-glx, which is no longer installable on the Debian
# releases that current python:3.12-slim images are based on.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxrender1 \
    libxext6 \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt \
    --index-url https://pypi.tuna.tsinghua.edu.cn/simple \
    --trusted-host pypi.tuna.tsinghua.edu.cn

# Best-effort MinerU warm-up at build time (intended to speed up startup).
# The command is a single logical line: a quoted string cannot span several
# Dockerfile lines without continuations. A failure never breaks the build;
# the Python traceback is printed and the skip message is echoed instead.
RUN HF_ENDPOINT=https://hf-mirror.com python -c \
    "from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton; print('MinerU 模型下载完成')" \
    || echo "模型下载跳过(将在运行时下载)"

COPY main.py .

HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -f http://localhost:8011/health || exit 1

EXPOSE 8011

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8011", "--workers", "1"]
|
||||
136
services/mcp-server/main.py
Normal file
136
services/mcp-server/main.py
Normal file
@@ -0,0 +1,136 @@
|
||||
import os
|
||||
import tempfile
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, UploadFile, File, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Root logging at INFO level for the whole service.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Runtime settings; each one can be overridden through an environment variable.
DEVICE = os.environ.get("DEVICE", "cpu")
UPLOAD_DIR = Path(os.environ.get("UPLOAD_DIR", "/app/uploads"))
PARSED_DIR = Path(os.environ.get("PARSED_DIR", "/app/parsed"))

# Create both working directories at import time if they are missing.
for _work_dir in (UPLOAD_DIR, PARSED_DIR):
    _work_dir.mkdir(parents=True, exist_ok=True)

app = FastAPI(title="MinerU 文档解析服务")

# MIME type -> short file-type label.
SUPPORTED_TYPES = {
    "application/pdf": "pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "application/msword": "doc",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
}
|
||||
|
||||
|
||||
def parse_pdf_mineru(pdf_path: str) -> str:
    """Parse a PDF with MinerU, falling back to PyMuPDF on any failure.

    Args:
        pdf_path: Path of the PDF file on disk.

    Returns:
        The content produced by the MinerU pipeline (empty string when it
        yields nothing), or the PyMuPDF fallback result if MinerU raised.
    """
    try:
        # Lazy imports: a missing MinerU install takes the fallback path below.
        from magic_pdf.data.data_reader_writer import FileBasedDataWriter
        from magic_pdf.pipe.UnicodeFormulaPDFPipe import UnicodeFormulaPDFPipe

        with tempfile.TemporaryDirectory() as work_dir:
            pipeline = UnicodeFormulaPDFPipe(pdf_path, FileBasedDataWriter(work_dir))
            pipeline.pipe_classify()
            pipeline.pipe_analyze()
            pipeline.pipe_parse()
            rendered = pipeline.pipe_mk_uni_format(work_dir, drop_mode="none")
            if not rendered:
                return ""
            return rendered
    except Exception as e:
        logger.warning(f"MinerU 解析失败,降级到 PyMuPDF:{e}")
        return parse_pdf_pymupdf(pdf_path)
|
||||
|
||||
|
||||
def parse_pdf_pymupdf(pdf_path: str) -> str:
    """Fallback parser: extract plain text from a PDF with PyMuPDF.

    Each page with non-blank text becomes a section headed by its 1-based
    page number; sections are joined with blank lines.

    Args:
        pdf_path: Path of the PDF file on disk.

    Returns:
        The concatenated page sections, or a bracketed failure message when
        the document cannot be opened or read (this function never raises).
    """
    try:
        import fitz  # PyMuPDF

        pages = []
        # The context manager closes the document even if extraction raises.
        with fitz.open(pdf_path) as doc:
            for page_no, page in enumerate(doc, start=1):
                text = page.get_text()
                if text.strip():
                    pages.append(f"## 第 {page_no} 页\n\n{text}")
        return "\n\n".join(pages)
    except Exception as e:
        # Record the failure instead of only embedding it in the result.
        logging.getLogger(__name__).warning("PyMuPDF 解析失败:%s", e)
        return f"[解析失败:{e}]"
|
||||
|
||||
|
||||
def parse_docx(file_path: str) -> str:
    """Convert a Word document to Markdown-like text.

    Non-blank paragraphs are emitted first (headings become ``#`` prefixed
    lines), followed by every table with its cells joined by `` | ``.

    Args:
        file_path: Path of the Word file on disk.

    Returns:
        The converted text, or a bracketed failure message when the file
        cannot be read (this function never raises).
    """
    try:
        from docx import Document

        doc = Document(file_path)
        parts = []
        for para in doc.paragraphs:
            if not para.text.strip():
                continue
            # A style may be missing or unnamed; treat both as "no style".
            style = (para.style.name if para.style else "") or ""
            if "Heading" in style:
                level = style.replace("Heading ", "").strip()
                try:
                    # Markdown defines heading levels 1-6 only, so clamp
                    # out-of-range levels such as "Heading 7".
                    prefix = "#" * min(max(int(level), 1), 6)
                except ValueError:
                    # Non-numeric heading style names get a level-2 heading.
                    prefix = "##"
                parts.append(f"{prefix} {para.text}")
            else:
                parts.append(para.text)
        for table in doc.tables:
            rows = [
                " | ".join(cell.text.strip() for cell in row.cells)
                for row in table.rows
            ]
            if rows:
                parts.append("\n".join(rows))
        return "\n\n".join(parts)
    except Exception as e:
        return f"[Word 解析失败:{e}]"
|
||||
|
||||
|
||||
class ParseResponse(BaseModel):
    """Response body shared by the document parsing endpoints."""

    # Upload filename ("unknown" when the client supplied none).
    filename: str
    # Extracted document content.
    markdown: str
    # Count derived from markers in the extracted text; always at least 1.
    page_count: int
    # Label of the parser branch that handled the file.
    parser: str
|
||||
|
||||
|
||||
@app.post("/mineru-parse", response_model=ParseResponse)
async def mineru_parse(file: UploadFile = File(...)) -> ParseResponse:
    """Parse an uploaded PDF or Word document into Markdown.

    The upload is written to a temporary file, parsed according to its
    filename suffix and the temporary file is always removed afterwards.

    Args:
        file: The uploaded document; ``.pdf``, ``.docx`` and ``.doc``
            suffixes are accepted.

    Returns:
        The extracted text together with the filename, a derived page count
        and the label of the parser branch used.

    Raises:
        HTTPException: 415 when the filename suffix is not supported.
    """
    # Local import: fastapi is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    suffix = Path(file.filename or "doc.pdf").suffix.lower()
    if suffix == ".pdf":
        # NOTE: parse_pdf_mineru falls back to PyMuPDF internally, so the
        # "mineru" label is reported for that path as well.
        parse, parser = parse_pdf_mineru, "mineru"
    elif suffix in (".docx", ".doc"):
        parse, parser = parse_docx, "python-docx"
    else:
        # Reject unsupported types before reading or storing the upload.
        raise HTTPException(status_code=415, detail=f"不支持的文件类型:{suffix}")

    content = await file.read()

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            # Remember the path first so a failed write is still cleaned up.
            tmp_path = tmp.name
            tmp.write(content)
        # Parsing is synchronous and can be slow; run it in a worker thread
        # so the event loop keeps serving other requests such as /health.
        markdown = await run_in_threadpool(parse, tmp_path)
    finally:
        if tmp_path is not None:
            os.unlink(tmp_path)

    # PDFs are counted by page-heading markers, other files by blank-line
    # separators; the result is reported as at least 1.
    page_count = markdown.count("## 第") if suffix == ".pdf" else markdown.count("\n\n")
    return ParseResponse(
        filename=file.filename or "unknown",
        markdown=markdown,
        page_count=max(page_count, 1),
        parser=parser,
    )
|
||||
|
||||
|
||||
@app.post("/parse-document", response_model=ParseResponse)
async def parse_document(file: UploadFile = File(...)) -> ParseResponse:
    """Alias endpoint: hand the upload to ``mineru_parse`` unchanged."""
    parsed = await mineru_parse(file)
    return parsed
|
||||
|
||||
|
||||
@app.get("/health")
def health():
    """Report service status and the configured device."""
    payload = {"status": "ok"}
    payload["device"] = DEVICE
    return payload
|
||||
11
services/mcp-server/requirements.txt
Normal file
11
services/mcp-server/requirements.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
fastapi>=0.115
|
||||
uvicorn[standard]>=0.30
|
||||
pydantic>=2.7
|
||||
python-multipart>=0.0.9
|
||||
httpx>=0.27
|
||||
# MinerU 文档解析
|
||||
mineru[pipeline]>=1.0
|
||||
# Word/Excel 降级解析
|
||||
python-docx>=1.1
|
||||
openpyxl>=3.1
|
||||
PyMuPDF>=1.24 # PDF 降级解析
|
||||
Reference in New Issue
Block a user