first commit
This commit is contained in:
89
.env.example
Normal file
89
.env.example
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
# AI合规智能中枢 — 环境变量配置
|
||||||
|
# 复制本文件为 .env 并填写实际值
|
||||||
|
# cp .env.example .env
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
# LLM 云端 API 配置(至少填写一个)
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# LLM 提供商:deepseek 或 qwen
|
||||||
|
LLM_PROVIDER=deepseek
|
||||||
|
|
||||||
|
# DeepSeek API(推荐,约¥1/百万tokens)
|
||||||
|
# 申请地址:https://platform.deepseek.com
|
||||||
|
DEEPSEEK_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||||
|
# 可选模型:deepseek-chat(通用)、deepseek-reasoner(推理增强)
|
||||||
|
DEEPSEEK_MODEL=deepseek-chat
|
||||||
|
|
||||||
|
# 阿里云 DashScope / Qwen API(备用)
|
||||||
|
# 申请地址:https://dashscope.aliyuncs.com
|
||||||
|
DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||||
|
# 可选模型:qwen-plus、qwen-max、qwen-turbo
|
||||||
|
QWEN_MODEL=qwen-plus
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
# 数据库密码
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# PostgreSQL 密码(生产环境请使用强密码)
|
||||||
|
POSTGRES_PASSWORD=compliance_secure_2026
|
||||||
|
|
||||||
|
# Redis 密码
|
||||||
|
REDIS_PASSWORD=redis_secure_2026
|
||||||
|
|
||||||
|
# Neo4j 密码(至少 8 位;仅使用字母、数字和下划线,避免 / 等特殊字符)
|
||||||
|
NEO4J_PASSWORD=neo4j_secure_2026
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
# AI 模型配置
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# HuggingFace 镜像(国内加速,默认使用 hf-mirror.com)
|
||||||
|
HF_ENDPOINT=https://hf-mirror.com
|
||||||
|
|
||||||
|
# 嵌入服务设备:cpu 或 cuda(有 GPU 时改为 cuda)
|
||||||
|
EMBEDDING_DEVICE=cpu
|
||||||
|
|
||||||
|
# MinerU 解析设备:cpu 或 cuda
|
||||||
|
MCP_DEVICE=cpu
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
# 应用配置
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# 运行环境:development / production
|
||||||
|
APP_ENV=development
|
||||||
|
|
||||||
|
# 日志级别:DEBUG / INFO / WARNING / ERROR
|
||||||
|
LOG_LEVEL=INFO
|
||||||
|
|
||||||
|
# API 认证密钥(用于内部服务间调用)
|
||||||
|
API_SECRET_KEY=change_this_to_a_random_secret_key_32chars
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
# 监控配置(可选)
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Grafana 管理员密码
|
||||||
|
GRAFANA_PASSWORD=admin
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
# 外部推送配置(闭环③法规监控推送用)
|
||||||
|
# ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# 邮件推送(可选)
|
||||||
|
# SMTP_HOST=smtp.example.com
|
||||||
|
# SMTP_PORT=587
|
||||||
|
# SMTP_USER=your@email.com
|
||||||
|
# SMTP_PASSWORD=your_smtp_password
|
||||||
|
|
||||||
|
# Webhook 推送(可选,支持飞书/钉钉/企业微信)
|
||||||
|
# WEBHOOK_URL=https://open.feishu.cn/open-apis/bot/v2/hook/xxx
|
||||||
277
00_整体部署规划.md
Normal file
277
00_整体部署规划.md
Normal file
@@ -0,0 +1,277 @@
|
|||||||
|
# AI合规智能中枢 — 整体部署规划
|
||||||
|
|
||||||
|
> **版本:** 调研版 v1.0 | **日期:** 2026.04 | **团队:** T-Systems AI Regulations Team
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 一、项目背景
|
||||||
|
|
||||||
|
AI+合规智能中枢面向车企与工厂,是一个全链路合规智能平台。主要解决以下痛点:
|
||||||
|
|
||||||
|
| 痛点 | 说明 |
|
||||||
|
|------|------|
|
||||||
|
| 法规来源复杂 | GB、MIIT、UN-ECE、IATF 16949、ISO 45001 等多源并行 |
|
||||||
|
| 更新频率高 | 新能源、数据安全、碳排放法规频繁变动 |
|
||||||
|
| 跨语言要求 | 中英德法多语言法规并存 |
|
||||||
|
| 文档管理分散 | 内部文档与外部法规割裂,难以统一检索 |
|
||||||
|
| 被动识别隐患 | EHS 合规靠人工排查,效率低下 |
|
||||||
|
|
||||||
|
**调研目标:** 以最小资源投入(Docker Compose 单机)验证三条核心业务闭环的技术可行性。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 二、部署架构概览
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────────┐
|
||||||
|
│ 单台服务器 │
|
||||||
|
│ ┌──────────────┐ ┌──────────────────────────────────────┐ │
|
||||||
|
│ │ API 网关 │ │ Docker Compose │ │
|
||||||
|
│ │ Nginx :80 │───▶│ │ │
|
||||||
|
│ └──────────────┘ │ ┌──────────────────────────────┐ │ │
|
||||||
|
│ │ │ 业务服务层 │ │ │
|
||||||
|
│ │ │ compliance-backend :8000 │ │ │
|
||||||
|
│ │ │ celery-worker │ │ │
|
||||||
|
│ │ │ celery-beat │ │ │
|
||||||
|
│ │ └──────────┬───────────────────┘ │ │
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ ┌──────────▼───────────────────┐ │ │
|
||||||
|
│ │ │ AI 模型层 │ │ │
|
||||||
|
│ │ │ embedding-service :8010 │ │ │
|
||||||
|
│ │ │ mcp-server(MinerU) :8011 │ │ │
|
||||||
|
│ │ │ LLM → DeepSeek API (云端) │ │ │
|
||||||
|
│ │ └──────────┬───────────────────┘ │ │
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ ┌──────────▼───────────────────┐ │ │
|
||||||
|
│ │ │ 数据层 │ │ │
|
||||||
|
│ │ │ PostgreSQL :5432 │ │ │
|
||||||
|
│ │ │ Redis :6379 │ │ │
|
||||||
|
│ │ │ Milvus :19530 │ │ │
|
||||||
|
│ │ │ Neo4j :7474/:7687 │ │ │
|
||||||
|
│ │ │ MinIO (Milvus内置) │ │ │
|
||||||
|
│ │ └──────────────────────────────┘ │ │
|
||||||
|
│ └──────────────────────────────────────┘ │
|
||||||
|
└─────────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
┌─────────▼──────────┐
|
||||||
|
│ DeepSeek API │
|
||||||
|
│ (云端 LLM) │
|
||||||
|
└────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 三、原方案 vs 调研方案对比
|
||||||
|
|
||||||
|
| 维度 | 原方案(生产级)| 调研方案 | 降级理由 |
|
||||||
|
|------|--------------|---------|---------|
|
||||||
|
| 编排 | Kubernetes 1.36 + Helm | **Docker Compose** | 无需集群管理,`up -d` 一键启动 |
|
||||||
|
| LLM | vLLM + DeepSeek-V3(4×A100)| **DeepSeek/Qwen 云端 API** | 无 GPU 依赖,秒级就绪 |
|
||||||
|
| 嵌入模型 | BGE-M3 GPU 服务 | **BGE-M3 CPU 容器** | 调研数据量小,CPU 够用 |
|
||||||
|
| Milvus | 分布式集群 + MinIO | **Milvus Standalone**(配套 etcd + MinIO 容器)| 单机模式,etcd/MinIO 随 Compose 一并启动,无需独立集群部署 |
|
||||||
|
| 消息队列 | Kafka 3 节点 | **Redis + Celery**(复用已有 Redis)| 调研无需高吞吐,大幅简化 |
|
||||||
|
| 监控 | Prometheus + Grafana + ELK | **仅 Prometheus + Grafana**(可选)| 轻量,后期按需加 |
|
||||||
|
| 安全 | JWT + cert-manager + RBAC | **API Key 简单认证** | 调研期无需生产级安全 |
|
||||||
|
| CI/CD | GitLab CI 完整流水线 | **无**(手动部署)| 调研期直接 compose up |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 四、硬件最低要求
|
||||||
|
|
||||||
|
| 资源 | 最低配置 | 推荐配置 | 说明 |
|
||||||
|
|------|---------|---------|------|
|
||||||
|
| CPU | 8 核 | 16 核+ | BGE-M3 CPU 模式需要较多核心 |
|
||||||
|
| 内存 | 32 GB | 64 GB | Milvus + BGE-M3 + Neo4j 内存消耗较大 |
|
||||||
|
| 存储 | 200 GB SSD | 500 GB SSD | 含模型文件(约 5GB)+ 数据 |
|
||||||
|
| GPU | **无需** | 1× RTX 3090(24GB)| 有 GPU 可加速嵌入/MinerU |
|
||||||
|
| 网络 | 能访问 DeepSeek API | — | LLM 完全在云端 |
|
||||||
|
| OS | Ubuntu 22.04 LTS | — | 或 Windows 11 + WSL2 |
|
||||||
|
|
||||||
|
**各组件内存估算:**
|
||||||
|
|
||||||
|
| 服务 | 内存占用 |
|
||||||
|
|------|---------|
|
||||||
|
| PostgreSQL | ~1 GB |
|
||||||
|
| Redis | ~512 MB |
|
||||||
|
| Milvus(含 etcd/minio)| ~4 GB |
|
||||||
|
| Neo4j | ~2 GB |
|
||||||
|
| BGE-M3(CPU 模式)| ~6 GB |
|
||||||
|
| MinerU(CPU 模式)| ~4 GB |
|
||||||
|
| compliance-backend | ~1 GB |
|
||||||
|
| celery-worker × 1 | ~1 GB |
|
||||||
|
| **合计** | **~20 GB** |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 五、五阶段部署步骤(总览)
|
||||||
|
|
||||||
|
```
|
||||||
|
阶段一:宿主机环境准备
|
||||||
|
└─ 安装 Docker CE / Docker Desktop
|
||||||
|
└─ 配置 nvidia-container-toolkit(有 GPU 时)
|
||||||
|
└─ 创建项目目录,配置 .env
|
||||||
|
|
||||||
|
阶段二:基础中间件启动
|
||||||
|
└─ PostgreSQL + Redis(优先启动)
|
||||||
|
└─ etcd + MinIO(Milvus 依赖)
|
||||||
|
└─ Milvus Standalone(向量检索核心)
|
||||||
|
└─ Neo4j Community(知识图谱)
|
||||||
|
|
||||||
|
阶段三:AI 模型服务构建与启动
|
||||||
|
└─ 构建 embedding-service(BGE-M3 封装)
|
||||||
|
└─ 构建 mcp-server(MinerU 封装)
|
||||||
|
└─ 预下载模型(BGE-M3 ~2.5GB,MinerU ~2GB)
|
||||||
|
|
||||||
|
阶段四:业务微服务启动
|
||||||
|
└─ compliance-backend(FastAPI 主服务)
|
||||||
|
└─ celery-worker(异步任务处理)
|
||||||
|
└─ celery-beat(定时任务调度)
|
||||||
|
└─ nginx(API 网关)
|
||||||
|
|
||||||
|
阶段五:验证与闭环测试
|
||||||
|
└─ 健康检查(bash scripts/check_health.sh)
|
||||||
|
└─ 端到端冒烟测试(bash scripts/07_smoke_test.sh)
|
||||||
|
└─ 三条业务闭环验证
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 六、三条核心业务闭环
|
||||||
|
|
||||||
|
### 闭环①:法规入库 → 检索问答
|
||||||
|
|
||||||
|
```
|
||||||
|
用户上传 PDF
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
API Gateway(Nginx)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
kbmp-service(文件接收)
|
||||||
|
│ 异步投递
|
||||||
|
▼
|
||||||
|
Celery Worker
|
||||||
|
│
|
||||||
|
├─► parse-worker ──► mcp-server(MinerU 解析)
|
||||||
|
│ │ Markdown + 结构化文本
|
||||||
|
│ ▼
|
||||||
|
└─► vectorize-worker ──► embedding-service(BGE-M3)
|
||||||
|
│ 1024维向量
|
||||||
|
▼
|
||||||
|
Milvus(向量存储)+ PostgreSQL(元数据)
|
||||||
|
|
||||||
|
用户提问
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
BM25 关键词检索 + BGE-M3 向量检索(Milvus hybrid search)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
Cross-Encoder Reranker(精排 Top-K)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
DeepSeek API(引文锚定生成)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
返回答案(含原文引用 + 页码)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 闭环②:文档上传 → 合规审查
|
||||||
|
|
||||||
|
```
|
||||||
|
上传供应商/内部文档
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
MinerU 解析 → 条款级分割
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
法规域匹配(vehicle_safety / data_security / ehs)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
与法规库语义比对(向量相似度 + 关键字匹配)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
DeepSeek API 风险评分(条款级分析)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
生成 Markdown 审查报告(风险等级 + 整改建议)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 闭环③:法规监控 → 变更推送
|
||||||
|
|
||||||
|
```
|
||||||
|
Celery Beat 定时触发(每天)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
抓取监控源(国标委 / 工信部 / 应急管理部 / 生环部)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
内容 Hash 比对(检测变更)
|
||||||
|
│
|
||||||
|
▼ [有变更]
|
||||||
|
NLP Diff 分析(DeepSeek 提取新增/修订/废止条款)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
增量入库(MinerU 解析 → BGE-M3 → Milvus + PostgreSQL + Neo4j)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
差距分析(与企业现状比对)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
推送通知(Email / Webhook / 飞书 / 钉钉)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
记录变更日志 → 触发整改任务
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 七、技术选型决策依据
|
||||||
|
|
||||||
|
| 组件 | 选型 | 决策依据 |
|
||||||
|
|------|------|---------|
|
||||||
|
| 向量数据库 | Milvus 2.4 | 支持 Dense+Sparse 混合检索,BGE-M3 配套,生产可扩展 |
|
||||||
|
| 图数据库 | Neo4j 5.x | 法规实体关系建模成熟,APOC 插件丰富,Cypher 查询友好 |
|
||||||
|
| 嵌入模型 | BGE-M3 | 多语言(100+ 种语言,覆盖中英德法),支持 dense+sparse+multi-vector,8192 token 上下文 |
|
||||||
|
| LLM | DeepSeek API | 推理能力强,成本低(约¥1/百万 tokens),OpenAI 兼容 |
|
||||||
|
| 文档解析 | MinerU | GPU 最快 0.21s/页,支持 109 种语言 OCR,布局感知 |
|
||||||
|
| 任务队列 | Celery + Redis | 调研阶段够用,比 Kafka 轻量,Redis 可复用 |
|
||||||
|
| API 框架 | FastAPI | 异步性能好,OpenAPI 自动生成,Pydantic 数据验证 |
|
||||||
|
| 关系数据库 | PostgreSQL + pgvector | 元数据存储 + 备用向量检索,pgvector 镜像开箱即用 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 八、升级路径(调研 → 生产)
|
||||||
|
|
||||||
|
| 维度 | 升级内容 | 触发条件 |
|
||||||
|
|------|---------|---------|
|
||||||
|
| LLM | API → 本地 vLLM + DeepSeek-V3 | 数据安全要求/API成本超阈值 |
|
||||||
|
| Milvus | Standalone → 分布式集群 | 向量数据 > 1000 万条 |
|
||||||
|
| 消息队列 | Celery+Redis → Kafka | 并发任务 > 100/分钟 |
|
||||||
|
| 编排 | Docker Compose → Kubernetes | 多节点部署/弹性伸缩需求 |
|
||||||
|
| 安全 | API Key → JWT + RBAC | 对外提供服务/多租户 |
|
||||||
|
| 监控 | Grafana → Grafana + ELK | 日志量大/需要复杂分析 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 九、文件结构说明
|
||||||
|
|
||||||
|
```
|
||||||
|
Deployment/
|
||||||
|
├── 00_整体部署规划.md ← 本文档
|
||||||
|
├── 01_技术架构详解.md ← 六层架构 + 六大微服务详细说明
|
||||||
|
├── 02_组件安装指南.md ← 每个组件的详细安装步骤
|
||||||
|
├── 03_业务闭环说明.md ← 三条闭环的数据流和接口规范
|
||||||
|
├── README.md ← 快速启动指南
|
||||||
|
├── docker-compose.yml ← 全服务编排
|
||||||
|
├── .env.example ← 环境变量模板
|
||||||
|
├── scripts/ ← 安装与运维脚本(13 个)
|
||||||
|
├── services/ ← 服务源码
|
||||||
|
│ ├── embedding/ ← BGE-M3 嵌入服务
|
||||||
|
│ ├── mcp-server/ ← MinerU 文档解析服务
|
||||||
|
│ └── compliance-backend/ ← 核心业务后端
|
||||||
|
├── config/ ← Nginx、Prometheus 配置
|
||||||
|
├── init-sql/ ← PostgreSQL 初始化 SQL
|
||||||
|
├── data/ ← 运行时数据
|
||||||
|
├── logs/ ← 服务日志
|
||||||
|
└── models/ ← AI 模型缓存
|
||||||
|
```
|
||||||
263
01_技术架构详解.md
Normal file
263
01_技术架构详解.md
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
# AI合规智能中枢 — 技术架构详解
|
||||||
|
|
||||||
|
> 本文档对应架构文档:`01_分层次技术架构图.html` 和 `02_详细技术架构图.html`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 一、六层架构总览
|
||||||
|
|
||||||
|
```
|
||||||
|
┌──────────────────────────────────────────────────────────────────┐
|
||||||
|
│ L1 应用接入层:Web / Mobile / Bot / API Gateway / RBAC │
|
||||||
|
├──────────────────────────────────────────────────────────────────┤
|
||||||
|
│ L2 业务能力层:知识库问答 / 文档审查 / EHS / 法规监控 / 推荐 │
|
||||||
|
├──────────────────────────────────────────────────────────────────┤
|
||||||
|
│ L3 法规感知层:监控 → 感知 → 解析 → 图谱 → 分析 → 闭环 │
|
||||||
|
├──────────────────────────────────────────────────────────────────┤
|
||||||
|
│ L4 AI引擎层:RAG / LLM / 文档解析 / 知识图谱推理 / NLP │
|
||||||
|
├──────────────────────────────────────────────────────────────────┤
|
||||||
|
│ L5 数据知识层:Milvus / PostgreSQL / Neo4j / Redis / 知识库 │
|
||||||
|
├──────────────────────────────────────────────────────────────────┤
|
||||||
|
│ L6 基础设施层:安全治理 / 容器编排 / 运维观测 / CI/CD │
|
||||||
|
└──────────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 二、六大微服务详解
|
||||||
|
|
||||||
|
### 2.1 kbmp-service(知识库公开接口)
|
||||||
|
|
||||||
|
**职责:** 知识库的统一入口,处理文件上传、检索编排、任务投递。
|
||||||
|
|
||||||
|
**核心接口:**
|
||||||
|
|
||||||
|
| 方法 | 路径 | 功能 |
|
||||||
|
|------|------|------|
|
||||||
|
| POST | `/workspace/create` | 创建知识库工作空间 |
|
||||||
|
| POST | `/files/upload` | 上传文件(触发解析任务) |
|
||||||
|
| POST | `/files/parse` | 手动触发解析 |
|
||||||
|
| POST | `/knowledge/retrieval` | 混合检索(BM25 + 向量)|
|
||||||
|
| POST | `/chunks/recall` | 原始 Chunk 召回 |
|
||||||
|
| POST | `/qa` | 检索 + LLM 问答生成 |
|
||||||
|
|
||||||
|
**内部流程:**
|
||||||
|
```
|
||||||
|
文件上传 → 存储 data/uploads → 投递 Celery 任务(parse-queue)
|
||||||
|
→ parse-worker 调用 mcp-server 解析
|
||||||
|
→ vectorize-worker 调用 embedding-service 向量化
|
||||||
|
→ 写入 Milvus(向量)+ PostgreSQL(元数据)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2.2 mcp-server(文档解析服务)
|
||||||
|
|
||||||
|
**职责:** 将 PDF/Word/Excel 等文档转换为结构化 Markdown,供后续向量化。
|
||||||
|
|
||||||
|
**核心接口:**
|
||||||
|
|
||||||
|
| 方法 | 路径 | 功能 |
|
||||||
|
|------|------|------|
|
||||||
|
| POST | `/parse-document` | 通用解析(自动选择引擎)|
|
||||||
|
| POST | `/mineru-parse` | MinerU 精准解析 |
|
||||||
|
| GET | `/health` | 健康检查 |
|
||||||
|
|
||||||
|
**解析策略(降级链):**
|
||||||
|
```
|
||||||
|
1. 阿里云文档解析 API(云端高精度)→ [调研版暂不启用]
|
||||||
|
2. MinerU(本地,GPU/CPU 均支持)→ 主用
|
||||||
|
3. python-docx / PyMuPDF(纯文本降级)→ 兜底
|
||||||
|
```
|
||||||
|
|
||||||
|
**MinerU 特性:**
|
||||||
|
- GPU 最快:0.21 秒/页
|
||||||
|
- CPU 模式:约 3-5 秒/页(调研阶段可接受)
|
||||||
|
- 支持 109 种语言 OCR
|
||||||
|
- 布局感知:区分正文/标题/表格/图片/页眉页脚
|
||||||
|
- 输出格式:Markdown + JSON(含结构化元数据)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2.3 合规业务后端(compliance-backend)
|
||||||
|
|
||||||
|
**职责:** 核心业务逻辑,整合三条闭环的业务处理。
|
||||||
|
|
||||||
|
**核心接口:**
|
||||||
|
|
||||||
|
| 方法 | 路径 | 功能 |
|
||||||
|
|------|------|------|
|
||||||
|
| POST | `/compliance/upload` | 上传待审查文档 |
|
||||||
|
| POST | `/compliance/check` | 智能合规审查 |
|
||||||
|
| GET | `/compliance/report/{id}` | 获取审查报告 |
|
||||||
|
| POST | `/compliance/regulations/download` | 下载法规 |
|
||||||
|
| POST | `/compliance/regulations/update` | 更新法规版本 |
|
||||||
|
| POST | `/compliance/access-control` | 权限分级管理 |
|
||||||
|
| POST | `/compliance/subscribe` | 订阅变更推送 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2.4 法规感知引擎(Regulation Awareness Engine)
|
||||||
|
|
||||||
|
**职责:** 定时监控法规源,自动检测变更,触发增量更新。
|
||||||
|
|
||||||
|
**六步感知闭环:**
|
||||||
|
|
||||||
|
```
|
||||||
|
① 法规源监控
|
||||||
|
- 定时抓取:国家标准委、工信部、UN-ECE、EUR-Lex、碳交易平台
|
||||||
|
- 技术:requests + BeautifulSoup + Playwright(动态页面)
|
||||||
|
|
||||||
|
② 智能变更感知
|
||||||
|
- Hash 对比(快速过滤)
|
||||||
|
- NLP 版本 Diff(精确识别新增/修订/废止条款)
|
||||||
|
|
||||||
|
③ 自动解析入库
|
||||||
|
- MinerU 解析 → 条款级分割
|
||||||
|
- BGE-M3 向量化 → Milvus + PostgreSQL
|
||||||
|
|
||||||
|
④ 知识图谱同步
|
||||||
|
- Neo4j 更新:法规-条款-义务关系
|
||||||
|
- 影响分析:哪些企业文档受影响
|
||||||
|
|
||||||
|
⑤ 差距分析
|
||||||
|
- AI 比对企业现状 vs 新法规要求
|
||||||
|
- 生成差距报告
|
||||||
|
|
||||||
|
⑥ 推送与整改触发
|
||||||
|
- 按角色推送(研发/EHS/采购/法务)
|
||||||
|
- 自动生成整改任务
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2.5 AI 推理引擎(AI Inference Engine)
|
||||||
|
|
||||||
|
**职责:** 混合检索、精排、LLM 生成、知识图谱推理。
|
||||||
|
|
||||||
|
**混合检索流程:**
|
||||||
|
|
||||||
|
```
|
||||||
|
用户查询
|
||||||
|
│
|
||||||
|
├─► BGE-M3 向量化(Dense 1024维)
|
||||||
|
│ │
|
||||||
|
│ └─► Milvus ANN 检索(HNSW,Cosine 相似度)
|
||||||
|
│
|
||||||
|
├─► BM25 关键词检索(稀疏向量/倒排索引)
|
||||||
|
│
|
||||||
|
└─► 结果融合(RRF 排名融合)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
Cross-Encoder Reranker(精排 Top-K)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
LLM 生成(DeepSeek API)
|
||||||
|
- System Prompt:引文锚定要求
|
||||||
|
- 输出:答案 + 原文引用 + 来源文档 + 页码
|
||||||
|
```
|
||||||
|
|
||||||
|
**BGE-M3 三种向量输出:**
|
||||||
|
- **Dense Vector**(1024维):语义相似度,主要用于向量检索
|
||||||
|
- **Sparse Vector**(词汇权重):关键字匹配,作用类似 BM25(权重由模型学习得到)
|
||||||
|
- **Multi-Vector**(ColBERT 风格):精细粒度 token 级匹配
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2.6 Worker 集群
|
||||||
|
|
||||||
|
**职责:** 异步任务处理,解耦主服务压力。
|
||||||
|
|
||||||
|
**Worker 类型:**
|
||||||
|
|
||||||
|
| Worker | 队列 | 职责 |
|
||||||
|
|--------|------|------|
|
||||||
|
| parse-worker | `parse` | 调用 mcp-server 解析文档 |
|
||||||
|
| vectorize-worker | `vectorize` | BGE-M3 向量化 + Milvus 写入 |
|
||||||
|
| compliance-worker | `compliance` | 合规比对 + 风险评分 |
|
||||||
|
| monitor-worker | `monitor` | 法规源定时抓取 |
|
||||||
|
| push-worker | `push` | 推送通知(Email/Webhook)|
|
||||||
|
|
||||||
|
**调度配置(Celery Beat):**
|
||||||
|
```python
|
||||||
|
CELERY_BEAT_SCHEDULE = {
|
||||||
|
"regulation-monitor": {
|
||||||
|
"task": "app.worker.fetch_regulation_updates",
|
||||||
|
"schedule": crontab(hour=2, minute=0), # 每天凌晨2点
|
||||||
|
},
|
||||||
|
"push-notifications": {
|
||||||
|
"task": "app.worker.send_pending_notifications",
|
||||||
|
"schedule": crontab(minute="*/30"), # 每30分钟
|
||||||
|
},
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 三、数据模型
|
||||||
|
|
||||||
|
### 3.1 PostgreSQL 表结构
|
||||||
|
|
||||||
|
```
|
||||||
|
workspaces → 知识库工作空间
|
||||||
|
files → 上传文件记录(含解析状态)
|
||||||
|
tasks → 异步任务状态追踪
|
||||||
|
compliance_reports → 合规审查报告
|
||||||
|
regulation_sources → 法规监控源配置
|
||||||
|
regulation_updates → 法规变更记录
|
||||||
|
subscriptions → 推送订阅配置
|
||||||
|
audit_logs → 全链路审计日志
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.2 Milvus Collection 结构
|
||||||
|
|
||||||
|
```python
|
||||||
|
# regulation_chunks / doc_chunks / case_library 共用相同 Schema
|
||||||
|
fields = [
|
||||||
|
FieldSchema("id", VARCHAR, is_primary=True),
|
||||||
|
FieldSchema("file_id", VARCHAR), # 关联文件
|
||||||
|
FieldSchema("workspace_id", VARCHAR), # 所属工作空间
|
||||||
|
FieldSchema("chunk_idx", INT64), # 块序号
|
||||||
|
FieldSchema("content", VARCHAR(65535)), # 原文内容
|
||||||
|
FieldSchema("dense_vec", FLOAT_VECTOR(1024)), # BGE-M3 向量
|
||||||
|
FieldSchema("metadata", JSON), # 扩展元数据
|
||||||
|
]
|
||||||
|
# 索引:HNSW,Cosine 相似度,M=16,efConstruction=200
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.3 Neo4j 图模型
|
||||||
|
|
||||||
|
```cypher
|
||||||
|
// 节点类型
|
||||||
|
(:Regulation {id, title, code, version, domain, effective_date})
|
||||||
|
(:Clause {id, number, content, clause_type})
|
||||||
|
(:Obligation {id, description, obligation_type, subject})
|
||||||
|
(:Enterprise {id, name, industry})
|
||||||
|
(:RiskItem {id, description, severity, domain})
|
||||||
|
(:Domain {name, label})
|
||||||
|
|
||||||
|
// 关系类型
|
||||||
|
(Regulation)-[:CONTAINS]->(Clause)
|
||||||
|
(Clause)-[:REQUIRES]->(Obligation)
|
||||||
|
(Regulation)-[:SUPERSEDES]->(Regulation) // 版本替代
|
||||||
|
(Clause)-[:MAPS_TO]->(RiskItem)
|
||||||
|
(Enterprise)-[:SUBJECT_TO]->(Regulation)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 四、核心技术栈版本锁定
|
||||||
|
|
||||||
|
| 组件 | 版本 | Docker 镜像 |
|
||||||
|
|------|------|------------|
|
||||||
|
| PostgreSQL | 16 + pgvector | `pgvector/pgvector:pg16` |
|
||||||
|
| Redis | 7.x | `redis:7-alpine` |
|
||||||
|
| Milvus | 2.4.13 | `milvusdb/milvus:v2.4.13` |
|
||||||
|
| Neo4j | 5.20 Community | `neo4j:5.20-community` |
|
||||||
|
| BGE-M3 | 最新 | BAAI/bge-m3(HuggingFace)|
|
||||||
|
| MinerU | 1.x | opendatalab/MinerU(pip)|
|
||||||
|
| LangChain | 0.3+ | pip install "langchain>=0.3" |
|
||||||
|
| FastAPI | 0.115+ | pip install "fastapi>=0.115" |
|
||||||
|
| Celery | 5.4+ | pip install "celery[redis]>=5.4" |
|
||||||
|
| Python | 3.12 | python:3.12-slim(Docker)|
|
||||||
|
| Nginx | 1.25 | `nginx:1.25-alpine` |
|
||||||
569
02_组件安装指南.md
Normal file
569
02_组件安装指南.md
Normal file
@@ -0,0 +1,569 @@
|
|||||||
|
# AI合规智能中枢 — 组件安装指南
|
||||||
|
|
||||||
|
> 本文档提供每个组件的详细安装步骤、配置说明和验证方法。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 前置:Docker 环境安装
|
||||||
|
|
||||||
|
### Ubuntu 22.04 LTS
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. 更新包列表
|
||||||
|
sudo apt-get update
|
||||||
|
|
||||||
|
# 2. 安装依赖
|
||||||
|
sudo apt-get install -y ca-certificates curl gnupg lsb-release
|
||||||
|
|
||||||
|
# 3. 添加 Docker GPG 密钥
|
||||||
|
sudo install -m 0755 -d /etc/apt/keyrings
|
||||||
|
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \
|
||||||
|
sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
|
||||||
|
sudo chmod a+r /etc/apt/keyrings/docker.gpg
|
||||||
|
|
||||||
|
# 4. 添加 Docker 仓库
|
||||||
|
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \
|
||||||
|
https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | \
|
||||||
|
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
|
||||||
|
|
||||||
|
# 5. 安装 Docker CE
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y docker-ce docker-ce-cli containerd.io \
|
||||||
|
docker-buildx-plugin docker-compose-plugin
|
||||||
|
|
||||||
|
# 6. 加入 docker 组(免 sudo)
|
||||||
|
sudo usermod -aG docker $USER
|
||||||
|
newgrp docker
|
||||||
|
|
||||||
|
# 7. 验证
|
||||||
|
docker --version # Docker version 27.x.x
|
||||||
|
docker compose version # Docker Compose version v2.x.x
|
||||||
|
```
|
||||||
|
|
||||||
|
### Windows 11 + WSL2
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
# PowerShell(管理员)
|
||||||
|
|
||||||
|
# 1. 启用 WSL2
|
||||||
|
wsl --install -d Ubuntu-22.04
|
||||||
|
wsl --set-default-version 2
|
||||||
|
|
||||||
|
# 2. 安装 Docker Desktop(需重启)
|
||||||
|
winget install -e --id Docker.DockerDesktop
|
||||||
|
|
||||||
|
# 3. 重启后,Docker Desktop 设置:
|
||||||
|
# Settings → General → "Use WSL 2 based engine" ✓
|
||||||
|
# Settings → Resources → WSL Integration → Ubuntu-22.04 ✓
|
||||||
|
```
|
||||||
|
|
||||||
|
### GPU 支持(可选,有 NVIDIA GPU 时)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Ubuntu 安装 nvidia-container-toolkit
|
||||||
|
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
|
||||||
|
sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
|
||||||
|
|
||||||
|
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
|
||||||
|
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
|
||||||
|
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
|
||||||
|
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y nvidia-container-toolkit
|
||||||
|
sudo nvidia-ctk runtime configure --runtime=docker
|
||||||
|
sudo systemctl restart docker
|
||||||
|
|
||||||
|
# 验证
|
||||||
|
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 组件一:PostgreSQL 16 + pgvector
|
||||||
|
|
||||||
|
**用途:** 存储元数据(文件记录、任务状态、合规报告、法规变更)
|
||||||
|
|
||||||
|
### 配置参数
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# docker-compose.yml 中的关键配置
|
||||||
|
image: pgvector/pgvector:pg16 # 内置 pgvector 扩展
|
||||||
|
POSTGRES_USER: compliance
|
||||||
|
POSTGRES_PASSWORD: <your_password>
|
||||||
|
POSTGRES_DB: compliance_db
|
||||||
|
volumes:
|
||||||
|
- postgres_data:/var/lib/postgresql/data # 数据持久化
|
||||||
|
- ./init-sql:/docker-entrypoint-initdb.d # 自动执行初始化 SQL
|
||||||
|
ports:
|
||||||
|
- "5432:5432"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 启动与验证
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 启动
|
||||||
|
docker compose up -d postgres
|
||||||
|
|
||||||
|
# 等待健康(约10秒)
|
||||||
|
docker compose ps postgres
|
||||||
|
|
||||||
|
# 连接测试
|
||||||
|
docker compose exec postgres psql -U compliance -d compliance_db -c "\dt"
|
||||||
|
|
||||||
|
# 验证扩展
|
||||||
|
docker compose exec postgres psql -U compliance -d compliance_db \
|
||||||
|
-c "SELECT extname FROM pg_extension WHERE extname IN ('vector', 'uuid-ossp');"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 常用操作
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 查看所有表
|
||||||
|
docker compose exec postgres psql -U compliance -d compliance_db \
|
||||||
|
-c "\dt"
|
||||||
|
|
||||||
|
# 查询任务状态
|
||||||
|
docker compose exec postgres psql -U compliance -d compliance_db \
|
||||||
|
-c "SELECT id, task_type, status, created_at FROM tasks ORDER BY created_at DESC LIMIT 10;"
|
||||||
|
|
||||||
|
# 备份数据库
|
||||||
|
docker compose exec -T postgres pg_dump -U compliance compliance_db > backup_$(date +%Y%m%d).sql
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 组件二:Redis 7
|
||||||
|
|
||||||
|
**用途:** Celery 消息中间件、热数据缓存、分布式锁、会话存储
|
||||||
|
|
||||||
|
### 配置参数
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
image: redis:7-alpine
|
||||||
|
command: >  # allkeys-lru:内存满时淘汰最近最少使用的 key(注释不能写在折叠块内部,否则会被当作命令参数)
|
||||||
|
redis-server
|
||||||
|
--requirepass <your_password>
|
||||||
|
--maxmemory 2gb
|
||||||
|
--maxmemory-policy allkeys-lru
|
||||||
|
ports:
|
||||||
|
- "6379:6379"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 启动与验证
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 启动
|
||||||
|
docker compose up -d redis
|
||||||
|
|
||||||
|
# 连接测试
|
||||||
|
docker compose exec redis redis-cli -a <password> ping
|
||||||
|
# 应返回:PONG
|
||||||
|
|
||||||
|
# 查看 Celery 队列长度
|
||||||
|
docker compose exec redis redis-cli -a <password> llen celery
|
||||||
|
|
||||||
|
# 查看内存使用
|
||||||
|
docker compose exec redis redis-cli -a <password> info memory | grep used_memory_human
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 组件三:Milvus 2.4 Standalone
|
||||||
|
|
||||||
|
**用途:** 向量数据库,存储 BGE-M3 嵌入向量,支持混合检索
|
||||||
|
|
||||||
|
### 架构说明
|
||||||
|
|
||||||
|
Milvus Standalone 包含三个内部组件:
|
||||||
|
- **etcd**:元数据存储(Collection 定义、索引配置)
|
||||||
|
- **MinIO**:向量段文件存储
|
||||||
|
- **milvus**:查询/写入引擎
|
||||||
|
|
||||||
|
### 启动顺序(严格按顺序)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. 先启动 etcd
|
||||||
|
docker compose up -d etcd
|
||||||
|
sleep 10
|
||||||
|
|
||||||
|
# 2. 再启动 MinIO
|
||||||
|
docker compose up -d minio
|
||||||
|
sleep 10
|
||||||
|
|
||||||
|
# 3. 最后启动 Milvus(依赖前两者)
|
||||||
|
docker compose up -d milvus
|
||||||
|
# Milvus 冷启动约需 60 秒,请耐心等待
|
||||||
|
```
|
||||||
|
|
||||||
|
### 验证
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# HTTP 健康检查
|
||||||
|
curl http://localhost:9091/healthz
|
||||||
|
# 应返回:OK
|
||||||
|
|
||||||
|
# Python 连接测试
|
||||||
|
python3 -c "
|
||||||
|
from pymilvus import connections, utility
|
||||||
|
connections.connect(host='localhost', port='19530')
|
||||||
|
print('Collections:', utility.list_collections())
|
||||||
|
print('Milvus 连接成功')
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 创建 Collection(向量索引)
|
||||||
|
|
||||||
|
```python
|
||||||
|
from pymilvus import (connections, Collection, CollectionSchema,
|
||||||
|
FieldSchema, DataType, utility)
|
||||||
|
|
||||||
|
connections.connect(host='localhost', port='19530')
|
||||||
|
|
||||||
|
fields = [
|
||||||
|
FieldSchema('id', DataType.VARCHAR, is_primary=True, max_length=128),
|
||||||
|
FieldSchema('content', DataType.VARCHAR, max_length=65535),
|
||||||
|
FieldSchema('dense_vec', DataType.FLOAT_VECTOR, dim=1024), # BGE-M3
|
||||||
|
FieldSchema('metadata', DataType.JSON),
|
||||||
|
]
|
||||||
|
|
||||||
|
schema = CollectionSchema(fields, description='法规条款向量库')
|
||||||
|
col = Collection('regulation_chunks', schema)
|
||||||
|
|
||||||
|
# 创建 HNSW 索引(速度/精度平衡)
|
||||||
|
col.create_index('dense_vec', {
|
||||||
|
'metric_type': 'COSINE',
|
||||||
|
'index_type': 'HNSW',
|
||||||
|
'params': {'M': 16, 'efConstruction': 200}
|
||||||
|
})
|
||||||
|
col.load()
|
||||||
|
print('Collection 创建完成')
|
||||||
|
```
|
||||||
|
|
||||||
|
### 常用查询
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 向量相似度检索
|
||||||
|
results = col.search(
|
||||||
|
data=[query_vector], # 查询向量(1024维)
|
||||||
|
anns_field='dense_vec',
|
||||||
|
param={'metric_type': 'COSINE', 'params': {'ef': 100}},
|
||||||
|
limit=10,
|
||||||
|
output_fields=['content', 'metadata']
|
||||||
|
)
|
||||||
|
|
||||||
|
# 查看 Collection 统计
|
||||||
|
print(col.num_entities) # 向量总数
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 组件四:Neo4j 5 Community
|
||||||
|
|
||||||
|
**用途:** 知识图谱存储,法规-条款-义务实体关系
|
||||||
|
|
||||||
|
### 配置参数
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
image: neo4j:5.20-community
|
||||||
|
environment:
|
||||||
|
NEO4J_AUTH: neo4j/<your_password>
|
||||||
|
NEO4J_PLUGINS: '["apoc"]' # 必须安装 APOC 插件
|
||||||
|
NEO4J_server_memory_heap_max__size: 2G
|
||||||
|
ports:
|
||||||
|
- "7474:7474" # Browser UI
|
||||||
|
- "7687:7687" # Bolt 协议(应用连接用)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 启动与验证
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 启动
|
||||||
|
docker compose up -d neo4j
|
||||||
|
# 首次启动约需 60 秒(下载 APOC 插件)
|
||||||
|
|
||||||
|
# 浏览器访问:http://localhost:7474
|
||||||
|
# 用户名:neo4j,密码:见 .env 中 NEO4J_PASSWORD
|
||||||
|
|
||||||
|
# 命令行连接
|
||||||
|
docker compose exec neo4j cypher-shell -u neo4j -p <password>
|
||||||
|
```
|
||||||
|
|
||||||
|
### 常用 Cypher 查询
|
||||||
|
|
||||||
|
```cypher
|
||||||
|
// 查看所有节点类型
|
||||||
|
CALL apoc.meta.schema() YIELD value RETURN value;
|
||||||
|
|
||||||
|
// 创建法规节点
|
||||||
|
CREATE (r:Regulation {
|
||||||
|
id: 'GB18384-2020',
|
||||||
|
title: 'GB 18384-2020 电动汽车安全要求',
|
||||||
|
domain: 'vehicle_safety',
|
||||||
|
effective_date: date('2021-01-01'),
|
||||||
|
version: '2020'
|
||||||
|
});
|
||||||
|
|
||||||
|
// 法规-条款关系
|
||||||
|
MATCH (r:Regulation {id: 'GB18384-2020'})
|
||||||
|
CREATE (c:Clause {
|
||||||
|
id: 'GB18384-2020-2.1',
|
||||||
|
number: '2.1',
|
||||||
|
content: '绝缘电阻要求:直流电路绝缘电阻不得低于100Ω/V'
|
||||||
|
})
|
||||||
|
CREATE (r)-[:CONTAINS]->(c);
|
||||||
|
|
||||||
|
// 多跳查询:查找某法规所有义务
|
||||||
|
MATCH (r:Regulation {domain: 'vehicle_safety'})-[:CONTAINS]->(c)-[:REQUIRES]->(o)
|
||||||
|
RETURN r.title, c.number, o.description LIMIT 20;
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 组件五:BGE-M3 嵌入服务
|
||||||
|
|
||||||
|
**用途:** 将文本转换为 1024 维向量,支持多语言(含中英德法),支持 Dense+Sparse 混合检索
|
||||||
|
|
||||||
|
### 服务构建
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 构建镜像
|
||||||
|
docker compose build embedding-service
|
||||||
|
|
||||||
|
# 首次启动(会自动下载 BGE-M3 模型约 2.5GB)
|
||||||
|
docker compose up -d embedding-service
|
||||||
|
|
||||||
|
# 查看下载进度
|
||||||
|
docker compose logs -f embedding-service
|
||||||
|
```
|
||||||
|
|
||||||
|
### 模型预下载(推荐,避免启动超时)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 方法1:通过 hf-mirror.com 加速
|
||||||
|
bash scripts/download_models.sh
|
||||||
|
|
||||||
|
# 方法2:通过 ModelScope(国内最快)
|
||||||
|
pip install modelscope
|
||||||
|
python3 -c "
|
||||||
|
from modelscope import snapshot_download
|
||||||
|
snapshot_download('AI-ModelScope/bge-m3', cache_dir='./models/modelscope')
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
### API 使用
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 健康检查
|
||||||
|
curl http://localhost:8010/health
|
||||||
|
|
||||||
|
# 生成嵌入向量
|
||||||
|
curl -X POST http://localhost:8010/embed \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"texts": ["GB 18384 电动汽车碰撞安全", "vehicle crash safety requirements"],
|
||||||
|
"batch_size": 2
|
||||||
|
}'
|
||||||
|
# 返回:{"dense": [[...1024个浮点数...], [...]], "sparse": [{...词汇权重...}, {...}]}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 性能参考
|
||||||
|
|
||||||
|
| 模式 | 硬件 | 速度 |
|
||||||
|
|------|------|------|
|
||||||
|
| CPU | 16核,64GB RAM | 约 2-5 秒/批(batch=16)|
|
||||||
|
| GPU | RTX 3090 24GB | 约 0.2-0.5 秒/批(batch=32)|
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 组件六:MinerU 文档解析服务
|
||||||
|
|
||||||
|
**用途:** 将 PDF/Word/Excel 解析为 Markdown + 结构化 JSON
|
||||||
|
|
||||||
|
### 服务构建
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 构建镜像(首次约需 10-20 分钟,下载大量依赖)
|
||||||
|
docker compose build mcp-server
|
||||||
|
|
||||||
|
# 启动服务(首次会下载 MinerU 模型约 2GB)
|
||||||
|
docker compose up -d mcp-server
|
||||||
|
|
||||||
|
# 查看启动日志
|
||||||
|
docker compose logs -f mcp-server
|
||||||
|
```
|
||||||
|
|
||||||
|
### API 使用
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 解析 PDF
|
||||||
|
curl -X POST http://localhost:8011/mineru-parse \
|
||||||
|
-F "file=@/path/to/regulation.pdf"
|
||||||
|
# 返回:{"markdown": "# 法规标题\n\n## 第一章...", "filename": "regulation.pdf"}
|
||||||
|
|
||||||
|
# 解析 Word 文档
|
||||||
|
curl -X POST http://localhost:8011/parse-document \
|
||||||
|
-F "file=@/path/to/document.docx"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 性能参考
|
||||||
|
|
||||||
|
| 模式 | 速度 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| CPU | 3-5 秒/页 | 调研阶段可接受 |
|
||||||
|
| GPU(RTX 3090)| 0.21 秒/页 | 生产推荐 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 组件七:业务后端(compliance-backend)
|
||||||
|
|
||||||
|
**用途:** FastAPI 主服务,整合所有业务逻辑
|
||||||
|
|
||||||
|
### 关键依赖配置
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# .env 中必须设置
|
||||||
|
DEEPSEEK_API_KEY=sk-xxxx # DeepSeek API Key
|
||||||
|
LLM_PROVIDER=deepseek # 或 qwen
|
||||||
|
DATABASE_URL=postgresql+asyncpg://...
|
||||||
|
REDIS_URL=redis://:password@redis:6379/0
|
||||||
|
MILVUS_HOST=milvus
|
||||||
|
NEO4J_URI=bolt://neo4j:7687
|
||||||
|
EMBEDDING_SERVICE_URL=http://embedding-service:8010
|
||||||
|
MCP_SERVER_URL=http://mcp-server:8011
|
||||||
|
```
|
||||||
|
|
||||||
|
### 启动与验证
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 启动服务
|
||||||
|
docker compose up -d compliance-backend celery-worker celery-beat
|
||||||
|
|
||||||
|
# 验证 API 文档(open 为 macOS 命令;Linux 请改用 xdg-open,或直接在浏览器中访问该地址)
|
||||||
|
open http://localhost:8000/docs
|
||||||
|
|
||||||
|
# 查看健康状态(包含所有依赖)
|
||||||
|
curl http://localhost:8000/health
|
||||||
|
```
|
||||||
|
|
||||||
|
### Celery Worker 监控
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 查看 Worker 状态
|
||||||
|
docker compose exec celery-worker celery -A app.worker inspect active
|
||||||
|
|
||||||
|
# 查看队列积压
|
||||||
|
docker compose exec redis redis-cli -a <password> llen celery
|
||||||
|
|
||||||
|
# Worker 日志
|
||||||
|
docker compose logs -f celery-worker
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 组件八:Nginx API 网关
|
||||||
|
|
||||||
|
**用途:** 反向代理,统一路由,TLS 终止(生产)
|
||||||
|
|
||||||
|
### 配置说明(config/nginx.conf)
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
upstream compliance_backend {
|
||||||
|
server compliance-backend:8000;
|
||||||
|
}
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
client_max_body_size 100M; # 支持大 PDF 上传
|
||||||
|
proxy_read_timeout 300s; # LLM 推理超时设置
|
||||||
|
|
||||||
|
location /api/kb/ { proxy_pass http://compliance_backend; }
|
||||||
|
location /api/compliance/ { proxy_pass http://compliance_backend; }
|
||||||
|
location /api/regulation/ { proxy_pass http://compliance_backend; }
|
||||||
|
location /health { proxy_pass http://compliance_backend; }
|
||||||
|
location /docs { proxy_pass http://compliance_backend; }
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 启动与验证
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 启动
|
||||||
|
docker compose up -d nginx
|
||||||
|
|
||||||
|
# 测试路由
|
||||||
|
curl http://localhost/health
|
||||||
|
curl http://localhost/docs # 应返回 Swagger UI HTML
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 完整启动顺序
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 方式1:分步启动(推荐,含健康等待)
|
||||||
|
bash scripts/06_start_all.sh
|
||||||
|
|
||||||
|
# 方式2:手动分步
|
||||||
|
docker compose up -d postgres redis # 等30s
|
||||||
|
docker compose up -d etcd minio # 等30s
|
||||||
|
docker compose up -d milvus # 等60s
|
||||||
|
docker compose up -d neo4j # 等60s
|
||||||
|
docker compose build embedding-service mcp-server compliance-backend
|
||||||
|
docker compose up -d embedding-service mcp-server # 等120s(模型加载)
|
||||||
|
bash scripts/05_init_db.sh # 初始化数据库
|
||||||
|
docker compose up -d compliance-backend celery-worker celery-beat nginx
|
||||||
|
|
||||||
|
# 验证
|
||||||
|
bash scripts/check_health.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 常见问题
|
||||||
|
|
||||||
|
### Q: Milvus 启动失败
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 检查 etcd 和 minio 是否健康
|
||||||
|
docker compose ps etcd minio
|
||||||
|
|
||||||
|
# 查看 Milvus 日志
|
||||||
|
docker compose logs milvus | tail -50
|
||||||
|
|
||||||
|
# 常见原因:内存不足(Milvus 需要至少 4GB 可用内存)
|
||||||
|
free -h
|
||||||
|
```
|
||||||
|
|
||||||
|
### Q: BGE-M3 模型下载失败
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 使用镜像加速
|
||||||
|
export HF_ENDPOINT=https://hf-mirror.com
|
||||||
|
docker compose up -d embedding-service
|
||||||
|
|
||||||
|
# 或使用预下载脚本(该脚本在"模型预下载"一节中为 hf-mirror 方式;如需走 ModelScope,请参见该节的方法2)
|
||||||
|
bash scripts/download_models.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Q: DeepSeek API 连接超时
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 测试连通性
|
||||||
|
curl -X POST https://api.deepseek.com/v1/chat/completions \
|
||||||
|
-H "Authorization: Bearer $DEEPSEEK_API_KEY" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"model": "deepseek-chat", "messages": [{"role": "user", "content": "ping"}]}'
|
||||||
|
|
||||||
|
# 常见原因:API Key 未设置或网络问题
|
||||||
|
```
|
||||||
|
|
||||||
|
### Q: 内存不足
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 查看内存使用
|
||||||
|
docker stats --no-stream
|
||||||
|
|
||||||
|
# 临时解决:减少 BGE-M3 批大小(降低内存峰值)
|
||||||
|
# 编辑 .env,添加:
|
||||||
|
# EMBEDDING_BATCH_SIZE=4 (默认16)
|
||||||
|
```
|
||||||
536
03_业务闭环说明.md
Normal file
536
03_业务闭环说明.md
Normal file
@@ -0,0 +1,536 @@
|
|||||||
|
# AI合规智能中枢 — 三条业务闭环说明
|
||||||
|
|
||||||
|
> 本文档详细描述三条核心业务闭环的数据流、接口规范和验证方法。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 一、闭环①:法规入库 → 检索问答
|
||||||
|
|
||||||
|
### 1.1 业务场景
|
||||||
|
|
||||||
|
**触发场景:**
|
||||||
|
- 法务/研发人员上传新法规 PDF(如 GB 18384-2020、UN-ECE R155)
|
||||||
|
- 系统自动解析、分块、向量化,建立可检索知识库
|
||||||
|
- 用户用自然语言提问,系统返回精准答案并标注来源
|
||||||
|
|
||||||
|
**用户角色:** 车企研发、法务、合规管理员
|
||||||
|
|
||||||
|
### 1.2 数据流
|
||||||
|
|
||||||
|
```
|
||||||
|
[用户] 上传 PDF
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
POST /api/kb/files/upload
|
||||||
|
{workspace_id, file}
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
[kbmp-service]
|
||||||
|
- 存储文件 → data/uploads/{file_id}.pdf
|
||||||
|
- 写入 files 表(status: uploaded)
|
||||||
|
- 投递 Celery 任务 → parse-queue
|
||||||
|
- 返回 {task_id, file_id}
|
||||||
|
│
|
||||||
|
▼ 异步
|
||||||
|
[celery: parse-worker]
|
||||||
|
- 调用 POST http://mcp-server:8011/mineru-parse
|
||||||
|
- 获取 Markdown 文本
|
||||||
|
- 更新 files 表(status: parsed)
|
||||||
|
- 投递 vectorize-queue
|
||||||
|
│
|
||||||
|
▼ 异步
|
||||||
|
[celery: vectorize-worker]
|
||||||
|
- 文本分块(chunk_size=512,overlap=64)
|
||||||
|
- 调用 POST http://embedding-service:8010/embed
|
||||||
|
- 获取 1024维 Dense + Sparse 向量
|
||||||
|
- 写入 Milvus regulation_chunks
|
||||||
|
- 写入 PostgreSQL(chunk 元数据)
|
||||||
|
- 更新 files 表(status: vectorized)
|
||||||
|
- 更新 tasks 表(status: completed)
|
||||||
|
|
||||||
|
[用户] 提问
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
POST /api/kb/qa
|
||||||
|
{query, workspace_id, top_k=5}
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
[rag-service]
|
||||||
|
1. BGE-M3 向量化查询
|
||||||
|
2. Milvus Dense 向量检索(Cosine,top-20)
|
||||||
|
3. Milvus Sparse 向量检索(BM25 等效,top-20)
|
||||||
|
4. RRF 融合(Reciprocal Rank Fusion)
|
||||||
|
5. Cross-Encoder Reranker 精排(top-5)
|
||||||
|
6. 构建 RAG Prompt(含检索片段)
|
||||||
|
7. DeepSeek API 生成答案(引文锚定)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
返回:{answer, sources: [{content, file, page, score}], tokens_used}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.3 关键接口
|
||||||
|
|
||||||
|
```http
|
||||||
|
### 创建工作空间
|
||||||
|
POST /api/kb/workspaces
|
||||||
|
Content-Type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "汽车安全法规库",
|
||||||
|
"description": "GB、UN-ECE 系列法规",
|
||||||
|
"domain": "vehicle_safety"
|
||||||
|
}
|
||||||
|
|
||||||
|
### 响应
|
||||||
|
{
|
||||||
|
"id": "uuid-xxx",
|
||||||
|
"name": "汽车安全法规库",
|
||||||
|
"created_at": "2026-04-22T10:00:00Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```http
|
||||||
|
### 上传文件
|
||||||
|
POST /api/kb/files/upload
|
||||||
|
Content-Type: multipart/form-data
|
||||||
|
|
||||||
|
file: <binary>
|
||||||
|
workspace_id: uuid-xxx
|
||||||
|
|
||||||
|
### 响应
|
||||||
|
{
|
||||||
|
"file_id": "uuid-yyy",
|
||||||
|
"task_id": "uuid-zzz",
|
||||||
|
"filename": "GB18384-2020.pdf",
|
||||||
|
"status": "processing"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```http
|
||||||
|
### 查询任务状态
|
||||||
|
GET /api/kb/tasks/{task_id}
|
||||||
|
|
||||||
|
### 响应
|
||||||
|
{
|
||||||
|
"task_id": "uuid-zzz",
|
||||||
|
"status": "completed", // pending / running / completed / failed
|
||||||
|
"progress": 100,
|
||||||
|
"file_id": "uuid-yyy",
|
||||||
|
"completed_at": "2026-04-22T10:05:00Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```http
|
||||||
|
### 智能问答
|
||||||
|
POST /api/kb/qa
|
||||||
|
Content-Type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"query": "电动汽车碰撞后高压系统的断电时间要求是多少?",
|
||||||
|
"workspace_id": "uuid-xxx",
|
||||||
|
"top_k": 5,
|
||||||
|
"return_sources": true
|
||||||
|
}
|
||||||
|
|
||||||
|
### 响应
|
||||||
|
{
|
||||||
|
"answer": "根据 GB 18384-2020 第 2.2 条,碰撞后 5 秒内,高压系统电压应降至 60V 以下。[来源:GB18384-2020.pdf,第3页]",
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"content": "碰撞后5秒内,高压系统电压应降至60V以下。",
|
||||||
|
"file": "GB18384-2020.pdf",
|
||||||
|
"page": 3,
|
||||||
|
"chunk_idx": 12,
|
||||||
|
"score": 0.94
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"tokens_used": 1250
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.4 分块策略
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 推荐分块配置(调研阶段)
|
||||||
|
CHUNK_SIZE = 512 # 每块最大 token 数
|
||||||
|
CHUNK_OVERLAP = 64 # 块间重叠(保留上下文)
|
||||||
|
SEPARATOR = "\n\n" # 优先按段落分割
|
||||||
|
|
||||||
|
# 法规文档特殊处理
|
||||||
|
# - 识别条款编号(1.1, 2.3.1 等),保证条款完整性
|
||||||
|
# - 表格单独处理(不与正文混合)
|
||||||
|
# - 图片提取 alt text
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 二、闭环②:文档上传 → 合规审查
|
||||||
|
|
||||||
|
### 2.1 业务场景
|
||||||
|
|
||||||
|
**触发场景:**
|
||||||
|
- 采购/供应链人员上传供应商文件(技术规格书、合规声明等)
|
||||||
|
- 研发人员上传设计文档,检查是否符合最新法规
|
||||||
|
- EHS 工程师上传安全操作规程,验证 ISO 45001 合规性
|
||||||
|
|
||||||
|
**用户角色:** 采购、供应链、研发、EHS 工程师
|
||||||
|
|
||||||
|
### 2.2 数据流
|
||||||
|
|
||||||
|
```
|
||||||
|
[用户] 上传供应商文件
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
POST /api/compliance/upload
|
||||||
|
{file, regulation_domains}
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
[compliance-backend]
|
||||||
|
- MinerU 解析文档
|
||||||
|
- 条款级分割(识别条款结构)
|
||||||
|
- 法规域匹配(根据内容自动识别:vehicle_safety / data_security / ehs)
|
||||||
|
- 投递 compliance-queue
|
||||||
|
│
|
||||||
|
▼ 异步
|
||||||
|
[celery: compliance-worker]
|
||||||
|
1. 对每个条款,在 Milvus 中检索相关法规要求
|
||||||
|
2. DeepSeek API 评估合规性
|
||||||
|
Prompt: "对比以下供应商条款与法规要求,评估合规性..."
|
||||||
|
3. 生成风险评分(0-100)
|
||||||
|
4. 汇总生成 Markdown 报告
|
||||||
|
5. 存储 compliance_reports 表
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
[用户] 获取报告
|
||||||
|
GET /api/compliance/report/{id}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.3 关键接口
|
||||||
|
|
||||||
|
```http
|
||||||
|
### 上传并审查文档
|
||||||
|
POST /api/compliance/upload
|
||||||
|
Content-Type: multipart/form-data
|
||||||
|
|
||||||
|
file: <binary>
|
||||||
|
regulation_domains: ["vehicle_safety", "data_security"] # 可多选
|
||||||
|
|
||||||
|
### 响应
|
||||||
|
{
|
||||||
|
"report_id": "uuid-aaa",
|
||||||
|
"file_id": "uuid-bbb",
|
||||||
|
"status": "analyzing",
|
||||||
|
"estimated_time_seconds": 60
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```http
|
||||||
|
### 直接合规检查(文本输入)
|
||||||
|
POST /api/compliance/check
|
||||||
|
Content-Type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"query": "供应商声明:产品绝缘电阻为50Ω/V,满足行业标准",
|
||||||
|
"regulation_domains": ["vehicle_safety"],
|
||||||
|
"top_k": 3
|
||||||
|
}
|
||||||
|
|
||||||
|
### 响应
|
||||||
|
{
|
||||||
|
"risk_level": "high",
|
||||||
|
"risk_score": 78,
|
||||||
|
"findings": [
|
||||||
|
{
|
||||||
|
"clause": "GB 18384-2020 第2.1条",
|
||||||
|
"requirement": "直流电路绝缘电阻不得低于100Ω/V",
|
||||||
|
"actual": "供应商声明50Ω/V",
|
||||||
|
"gap": "不满足,差距50Ω/V",
|
||||||
|
"severity": "critical"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"recommendations": [
|
||||||
|
"要求供应商提升绝缘电阻至100Ω/V以上",
|
||||||
|
"提供经第三方认证的测试报告"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```http
|
||||||
|
### 获取完整审查报告
|
||||||
|
GET /api/compliance/report/{report_id}
|
||||||
|
|
||||||
|
### 响应
|
||||||
|
{
|
||||||
|
"report_id": "uuid-aaa",
|
||||||
|
"overall_risk_level": "high",
|
||||||
|
"risk_score": 78,
|
||||||
|
"findings": [...],
|
||||||
|
"recommendations": [...],
|
||||||
|
"report_markdown": "# 合规审查报告\n\n## 总体评估\n...",
|
||||||
|
"regulation_domains": ["vehicle_safety"],
|
||||||
|
"llm_model": "deepseek-chat",
|
||||||
|
"created_at": "2026-04-22T11:00:00Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.4 风险等级定义
|
||||||
|
|
||||||
|
| 风险等级 | 分数 | 说明 | 建议行动 |
|
||||||
|
|---------|------|------|---------|
|
||||||
|
| low | 0-30 | 基本合规,小幅优化 | 记录并监控 |
|
||||||
|
| medium | 31-60 | 部分不符合,需要整改 | 制定整改计划 |
|
||||||
|
| high | 61-80 | 重大不符合,需立即处理 | 暂停合作/紧急整改 |
|
||||||
|
| critical | 81-100 | 严重违规,可能造成法律风险 | 立即停止/上报管理层 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 三、闭环③:法规监控 → 变更推送
|
||||||
|
|
||||||
|
### 3.1 业务场景
|
||||||
|
|
||||||
|
**触发场景:**
|
||||||
|
- 国家发布新的新能源汽车数据安全法规
|
||||||
|
- 现有法规(如 GB 7258)进行修订
|
||||||
|
- 碳排放法规新增企业义务
|
||||||
|
|
||||||
|
系统自动检测变更,分析影响,推送给相关角色。
|
||||||
|
|
||||||
|
**用户角色:** 合规管理员、法务专员、EHS 工程师(订阅对应域)
|
||||||
|
|
||||||
|
### 3.2 数据流
|
||||||
|
|
||||||
|
```
|
||||||
|
[Celery Beat] 每天凌晨 2:00 触发
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
[celery: monitor-worker]
|
||||||
|
- 读取 regulation_sources 表(所有 is_active=True 的监控源)
|
||||||
|
- 对每个监控源:
|
||||||
|
a. HTTP 抓取页面内容
|
||||||
|
b. 计算 MD5 Hash
|
||||||
|
c. 与 last_hash 对比
|
||||||
|
d. 有变化 → 投递变更分析任务
|
||||||
|
│
|
||||||
|
▼ [有变更时]
|
||||||
|
[celery: compliance-worker]
|
||||||
|
- DeepSeek API 分析变更内容
|
||||||
|
- 提取新增/修订/废止条款
|
||||||
|
- 生成变更摘要
|
||||||
|
- 写入 regulation_updates 表
|
||||||
|
- 触发增量入库(重新向量化变更条款)
|
||||||
|
- 更新 Neo4j 知识图谱
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
[celery: push-worker]
|
||||||
|
- 读取 subscriptions 表
|
||||||
|
- 按域、重要性过滤
|
||||||
|
- 发送推送(Email / Webhook / 飞书)
|
||||||
|
- 标记 is_notified=True
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.3 关键接口
|
||||||
|
|
||||||
|
```http
|
||||||
|
### 配置监控源
|
||||||
|
POST /api/regulation/sources
|
||||||
|
Content-Type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "国家标准全文公开系统",
|
||||||
|
"url": "https://std.samr.gov.cn",
|
||||||
|
"domain": "vehicle_safety",
|
||||||
|
"fetch_interval": 86400,
|
||||||
|
"fetch_config": {
|
||||||
|
"css_selector": ".standard-list .item",
|
||||||
|
"title_selector": ".title",
|
||||||
|
"date_selector": ".date"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
### 响应
|
||||||
|
{
|
||||||
|
"id": "uuid-src1",
|
||||||
|
"name": "国家标准全文公开系统",
|
||||||
|
"status": "active",
|
||||||
|
"next_fetch_at": "2026-04-23T02:00:00Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```http
|
||||||
|
### 查看法规变更记录
|
||||||
|
GET /api/regulation/updates?domain=vehicle_safety&limit=10&offset=0
|
||||||
|
|
||||||
|
### 响应
|
||||||
|
{
|
||||||
|
"total": 25,
|
||||||
|
"updates": [
|
||||||
|
{
|
||||||
|
"id": "uuid-upd1",
|
||||||
|
"title": "GB 18384-2022 电动汽车安全要求(修订版)",
|
||||||
|
"url": "https://std.samr.gov.cn/xxxx",
|
||||||
|
"change_type": "revised",
|
||||||
|
"summary": "主要变更:碰撞断电时间由5秒缩短至3秒;新增涉水安全要求",
|
||||||
|
"importance": "high",
|
||||||
|
"fetched_at": "2026-04-22T02:00:00Z"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```http
|
||||||
|
### 手动触发法规源采集(测试用)
|
||||||
|
POST /api/regulation/sources/{source_id}/fetch
|
||||||
|
|
||||||
|
### 响应
|
||||||
|
{
|
||||||
|
"task_id": "uuid-task1",
|
||||||
|
"status": "queued",
|
||||||
|
"source_id": "uuid-src1"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```http
|
||||||
|
### 订阅变更推送
|
||||||
|
POST /api/regulation/subscribe
|
||||||
|
Content-Type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "EHS 工程师推送",
|
||||||
|
"channel": "webhook",
|
||||||
|
"target": "https://open.feishu.cn/open-apis/bot/v2/hook/xxxx",
|
||||||
|
"domains": ["ehs", "carbon"],
|
||||||
|
"importance_min": "normal"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.4 内置监控源列表
|
||||||
|
|
||||||
|
| 名称 | URL | 域 |
|
||||||
|
|------|-----|-----|
|
||||||
|
| 国家标准全文公开系统 | https://std.samr.gov.cn | vehicle_safety |
|
||||||
|
| 工信部政策法规 | https://www.miit.gov.cn/jgsj/fgs/zcfg | vehicle_safety |
|
||||||
|
| 应急管理部法规 | https://www.mem.gov.cn/gk/zcfg | ehs |
|
||||||
|
| 生态环境部法规 | https://www.mee.gov.cn/ywgz/fgbz/fl | carbon |
|
||||||
|
| 网信办法规 | https://www.cac.gov.cn/zcfg/index.htm | data_security |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 四、接口认证说明(调研版)
|
||||||
|
|
||||||
|
调研版使用简单 API Key 认证(在 `Authorization` 头传入):
|
||||||
|
|
||||||
|
```http
|
||||||
|
# 所有请求需要携带 API Key
|
||||||
|
Authorization: Bearer <API_SECRET_KEY>
|
||||||
|
```
|
||||||
|
|
||||||
|
> `API_SECRET_KEY` 在 `.env` 中配置,默认值仅供本地调研使用,生产环境必须更换。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 五、完整冒烟测试脚本
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# 完整三条闭环验证
|
||||||
|
API="http://localhost"
|
||||||
|
KEY="your_api_secret_key"
|
||||||
|
HEADER="-H 'Authorization: Bearer $KEY' -H 'Content-Type: application/json'"
|
||||||
|
|
||||||
|
# ── 闭环①测试 ────────────────────────────────
|
||||||
|
echo "=== 测试闭环①:法规入库 → 问答 ==="
|
||||||
|
|
||||||
|
# 1. 创建工作空间
|
||||||
|
WS=$(curl -sf -X POST $API/api/kb/workspaces \
|
||||||
|
-H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \
|
||||||
|
-d '{"name":"测试法规库","domain":"vehicle_safety"}')
|
||||||
|
WS_ID=$(echo $WS | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])")
|
||||||
|
echo "工作空间:$WS_ID"
|
||||||
|
|
||||||
|
# 2. 上传测试文件(此处使用 .txt 样例;如需验证 PDF 解析链路,请替换为实际 PDF 文件)
|
||||||
|
UPLOAD=$(curl -sf -X POST $API/api/kb/files/upload \
|
||||||
|
-H "Authorization: Bearer $KEY" \
|
||||||
|
-F "file=@data/uploads/test_regulation.txt" \
|
||||||
|
-F "workspace_id=$WS_ID")
|
||||||
|
TASK_ID=$(echo $UPLOAD | python3 -c "import sys,json; print(json.load(sys.stdin)['task_id'])")
|
||||||
|
echo "任务ID:$TASK_ID"
|
||||||
|
|
||||||
|
# 3. 等待处理(最多轮询 30 次 × 5 秒 ≈ 150 秒;仅在 status 为 completed 时提前退出,超时或任务 failed 时不会报错,需人工检查)
|
||||||
|
for i in {1..30}; do
|
||||||
|
STATUS=$(curl -sf $API/api/kb/tasks/$TASK_ID -H "Authorization: Bearer $KEY" | \
|
||||||
|
python3 -c "import sys,json; print(json.load(sys.stdin)['status'])")
|
||||||
|
[[ "$STATUS" == "completed" ]] && echo "处理完成" && break
|
||||||
|
sleep 5
|
||||||
|
done
|
||||||
|
|
||||||
|
# 4. 问答测试
|
||||||
|
QA=$(curl -sf -X POST $API/api/kb/qa \
|
||||||
|
-H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \
|
||||||
|
-d "{\"query\":\"碰撞后高压系统要求\",\"workspace_id\":\"$WS_ID\"}")
|
||||||
|
echo "问答结果:$(echo $QA | python3 -c "import sys,json; print(json.load(sys.stdin).get('answer','')[:100])")"
|
||||||
|
|
||||||
|
# ── 闭环②测试 ────────────────────────────────
|
||||||
|
echo ""
|
||||||
|
echo "=== 测试闭环②:合规审查 ==="
|
||||||
|
CHECK=$(curl -sf -X POST $API/api/compliance/check \
|
||||||
|
-H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \
|
||||||
|
-d '{"query":"绝缘电阻50Ω/V","regulation_domains":["vehicle_safety"]}')
|
||||||
|
echo "风险等级:$(echo $CHECK | python3 -c "import sys,json; print(json.load(sys.stdin).get('risk_level','unknown'))")"
|
||||||
|
|
||||||
|
# ── 闭环③测试 ────────────────────────────────
|
||||||
|
echo ""
|
||||||
|
echo "=== 测试闭环③:法规监控 ==="
|
||||||
|
SRC=$(curl -sf -X POST $API/api/regulation/sources \
|
||||||
|
-H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \
|
||||||
|
-d '{"name":"测试源","url":"https://std.samr.gov.cn","domain":"vehicle_safety"}')
|
||||||
|
echo "监控源:$(echo $SRC | python3 -c "import sys,json; print(json.load(sys.stdin).get('id','failed'))")"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 六、数据流示意图(完整版)
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────┐
|
||||||
|
│ 用户请求 │
|
||||||
|
│ Web / API / Mobile / Bot │
|
||||||
|
└──────────────┬──────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────┐
|
||||||
|
│ Nginx API Gateway │
|
||||||
|
│ 路由 / 限流 / 认证 │
|
||||||
|
└──────────────┬──────────────────┘
|
||||||
|
│
|
||||||
|
┌────────────────────┼────────────────────┐
|
||||||
|
│ │ │
|
||||||
|
▼ ▼ ▼
|
||||||
|
┌──────────────┐ ┌──────────────────┐ ┌────────────────┐
|
||||||
|
│ 知识库 │ │ 合规审查 │ │ 法规监控 │
|
||||||
|
│ /api/kb/* │ │ /api/compliance/* │ │/api/regulation/│
|
||||||
|
└──────┬───────┘ └────────┬─────────┘ └───────┬────────┘
|
||||||
|
│ │ │
|
||||||
|
└──────────┬──────────┘ │
|
||||||
|
│ │
|
||||||
|
▼ ▼
|
||||||
|
┌──────────────────┐ ┌──────────────────┐
|
||||||
|
│ compliance- │ │ Celery Beat │
|
||||||
|
│ backend │ │ 定时调度 │
|
||||||
|
└──────┬───────────┘ └────────┬─────────┘
|
||||||
|
│ │
|
||||||
|
┌──────────┼──────────┐ ┌──────────┼──────────┐
|
||||||
|
│ │ │ │ │ │
|
||||||
|
▼ ▼ ▼ ▼ ▼ ▼
|
||||||
|
parse-w vectorize-w compliance-w monitor-w push-w
|
||||||
|
│ │ │ │ │
|
||||||
|
▼ ▼ │ │ ▼
|
||||||
|
mcp-server embedding LLM API 网络抓取 通知推送
|
||||||
|
(MinerU) (BGE-M3) (DeepSeek) (requests) (Email/Bot)
|
||||||
|
│ │
|
||||||
|
└────┬─────┘
|
||||||
|
│
|
||||||
|
┌──────────┼──────────────┐
|
||||||
|
▼ ▼ ▼
|
||||||
|
PostgreSQL Milvus Neo4j
|
||||||
|
(元数据/报告) (向量检索) (知识图谱)
|
||||||
|
```
|
||||||
190
README.md
Normal file
190
README.md
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
# AI合规智能中枢 — 调研版部署指南
|
||||||
|
|
||||||
|
面向车企与工厂的全链路合规智能平台,Docker Compose 单机部署版本,用于验证三条业务闭环。
|
||||||
|
|
||||||
|
## 快速开始
|
||||||
|
|
||||||
|
### 前置要求
|
||||||
|
|
||||||
|
| 资源 | 最低 | 推荐 |
|
||||||
|
|------|------|------|
|
||||||
|
| CPU | 8核 | 16核+ |
|
||||||
|
| 内存 | 32 GB | 64 GB |
|
||||||
|
| 存储 | 200 GB SSD | 500 GB SSD |
|
||||||
|
| GPU | 无需 | 1× RTX 3090(加速嵌入)|
|
||||||
|
| OS | Ubuntu 22.04 LTS 或 Windows 11 + WSL2 | — |
|
||||||
|
|
||||||
|
### 1. 安装 Docker
|
||||||
|
|
||||||
|
**Ubuntu/Linux:**
|
||||||
|
```bash
|
||||||
|
bash scripts/00_install_docker_ubuntu.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
**Windows(PowerShell 管理员):**
|
||||||
|
```powershell
|
||||||
|
.\scripts\00_install_docker_windows.ps1
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. 配置环境变量
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
# 编辑 .env,至少填写:
|
||||||
|
# - DEEPSEEK_API_KEY(在 https://platform.deepseek.com 申请)
|
||||||
|
nano .env
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. 一键启动
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 拉取镜像(可选,加速首次启动)
|
||||||
|
bash scripts/02_pull_images.sh
|
||||||
|
|
||||||
|
# 分步启动(推荐,含健康等待)
|
||||||
|
bash scripts/06_start_all.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. 验证部署
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 检查所有服务状态
|
||||||
|
bash scripts/check_health.sh
|
||||||
|
|
||||||
|
# 运行端到端冒烟测试
|
||||||
|
bash scripts/07_smoke_test.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 服务访问地址
|
||||||
|
|
||||||
|
| 服务 | 地址 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| API 网关 | http://localhost | Nginx 入口,所有 API 请求入口 |
|
||||||
|
| 业务后端 | http://localhost:8000/docs | FastAPI Swagger UI |
|
||||||
|
| Neo4j 浏览器 | http://localhost:7474 | 知识图谱可视化 |
|
||||||
|
| Grafana | http://localhost:3000 | 监控面板(`--profile monitoring` 启动)|
|
||||||
|
| Milvus | localhost:19530 | 向量数据库 gRPC 端口 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 三条业务闭环
|
||||||
|
|
||||||
|
### 闭环①:法规入库 → 检索问答
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 上传法规PDF(若后端启用了 API Key 认证,以下各请求均需追加 -H "Authorization: Bearer <API_SECRET_KEY>")
|
||||||
|
curl -X POST http://localhost/api/kb/files/upload \
|
||||||
|
-F "file=@your_regulation.pdf" \
|
||||||
|
-F "workspace_id=auto-regulation"
|
||||||
|
|
||||||
|
# 查询任务状态
|
||||||
|
curl http://localhost/api/kb/tasks/{task_id}
|
||||||
|
|
||||||
|
# 检索问答
|
||||||
|
curl -X POST http://localhost/api/kb/qa \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"query": "GB 18384 电动汽车碰撞安全要求", "top_k": 5}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### 闭环②:文档上传 → 合规审查
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 上传供应商文件
|
||||||
|
curl -X POST http://localhost/api/compliance/upload \
|
||||||
|
-F "file=@supplier_document.pdf"
|
||||||
|
|
||||||
|
# 触发合规审查(注意:03_业务闭环说明.md 中该接口的请求体为 query / regulation_domains / top_k,与此处的 doc_id 不一致,请以 /docs 中的实际接口定义为准)
|
||||||
|
curl -X POST http://localhost/api/compliance/check \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"doc_id": "xxx", "regulation_domains": ["vehicle_safety", "data_security"]}'
|
||||||
|
|
||||||
|
# 获取审查报告
|
||||||
|
curl http://localhost/api/compliance/report/{id}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 闭环③:法规监控 → 变更推送
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 配置监控源
|
||||||
|
curl -X POST http://localhost/api/regulation/sources \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"url": "https://std.samr.gov.cn", "name": "国家标准全文公开"}'
|
||||||
|
|
||||||
|
# 查看变更记录
|
||||||
|
curl http://localhost/api/regulation/updates
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 目录结构
|
||||||
|
|
||||||
|
```
|
||||||
|
Depolyment/
|
||||||
|
├── README.md # 本文件
|
||||||
|
├── docker-compose.yml # 全服务编排
|
||||||
|
├── .env.example # 环境变量模板
|
||||||
|
├── scripts/ # 安装与运维脚本
|
||||||
|
├── services/
|
||||||
|
│ ├── embedding/ # BGE-M3 嵌入服务
|
||||||
|
│ ├── mcp-server/ # MinerU 文档解析服务
|
||||||
|
│ └── compliance-backend/ # 核心业务后端
|
||||||
|
├── config/ # Nginx、Prometheus 配置
|
||||||
|
├── init-sql/ # PostgreSQL 初始化 SQL
|
||||||
|
├── data/ # 运行时数据(上传文件、解析结果)
|
||||||
|
├── logs/ # 服务日志
|
||||||
|
└── models/ # AI 模型缓存(BGE-M3、MinerU)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 常用操作
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 查看所有服务状态
|
||||||
|
docker compose ps
|
||||||
|
|
||||||
|
# 查看某个服务日志
|
||||||
|
docker compose logs -f compliance-backend
|
||||||
|
|
||||||
|
# 重启某个服务
|
||||||
|
docker compose restart embedding-service
|
||||||
|
|
||||||
|
# 停止所有服务(保留数据)
|
||||||
|
docker compose stop
|
||||||
|
|
||||||
|
# 完全重置(删除所有数据,慎用)
|
||||||
|
bash scripts/reset_all.sh
|
||||||
|
|
||||||
|
# 启动监控面板
|
||||||
|
docker compose --profile monitoring up -d grafana
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## LLM 切换
|
||||||
|
|
||||||
|
默认使用 DeepSeek API,如需切换到 Qwen(阿里云):
|
||||||
|
|
||||||
|
编辑 `.env`:
|
||||||
|
```bash
|
||||||
|
LLM_PROVIDER=qwen
|
||||||
|
DASHSCOPE_API_KEY=your_key_here
|
||||||
|
QWEN_MODEL=qwen-plus
|
||||||
|
```
|
||||||
|
|
||||||
|
然后重启业务服务:
|
||||||
|
```bash
|
||||||
|
docker compose restart compliance-backend celery-worker
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 生产升级路径
|
||||||
|
|
||||||
|
调研验证通过后,升级要点:
|
||||||
|
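1. **LLM**:从 API 切换到本地 vLLM + DeepSeek-V3(注意:完整版 DeepSeek-V3 为 671B 参数的 MoE 模型,4×A100 的显存不足以加载,通常需要 8×H200 级别或多机多卡;4×A100 仅适用于蒸馏版或量化后的小模型,部署前请按所选模型核算显存)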
|
||||||
|
2. **Milvus**:从 Standalone 升级到分布式集群(加独立 MinIO)
|
||||||
|
3. **编排**:从 Docker Compose 迁移到 Kubernetes(服务配置文件可复用)
|
||||||
|
4. **安全**:启用完整 JWT/RBAC,添加 TLS 证书
|
||||||
63
config/nginx.conf
Normal file
63
config/nginx.conf
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
upstream compliance_backend {
|
||||||
|
server compliance-backend:8000;
|
||||||
|
keepalive 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
# 文件上传大小限制(法规PDF可能较大)
|
||||||
|
client_max_body_size 100M;
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
server_name _;
|
||||||
|
|
||||||
|
# 访问日志
|
||||||
|
access_log /var/log/nginx/access.log;
|
||||||
|
error_log /var/log/nginx/error.log;
|
||||||
|
|
||||||
|
# 超时配置(LLM推理可能较慢)
|
||||||
|
proxy_connect_timeout 60s;
|
||||||
|
proxy_send_timeout 300s;
|
||||||
|
proxy_read_timeout 300s;
|
||||||
|
|
||||||
|
# 通用代理头
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Connection "";
|
||||||
|
|
||||||
|
# ── 知识库接口 ─────────────────────────────
|
||||||
|
location /api/kb/ {
|
||||||
|
proxy_pass http://compliance_backend/api/kb/;
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── 合规审查接口 ───────────────────────────
|
||||||
|
location /api/compliance/ {
|
||||||
|
proxy_pass http://compliance_backend/api/compliance/;
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── 法规监控接口 ───────────────────────────
|
||||||
|
location /api/regulation/ {
|
||||||
|
proxy_pass http://compliance_backend/api/regulation/;
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── 健康检查 ───────────────────────────────
|
||||||
|
location /health {
|
||||||
|
proxy_pass http://compliance_backend/health;
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── API 文档(开发环境)────────────────────
|
||||||
|
location /docs {
|
||||||
|
proxy_pass http://compliance_backend/docs;
|
||||||
|
}
|
||||||
|
|
||||||
|
location /openapi.json {
|
||||||
|
proxy_pass http://compliance_backend/openapi.json;
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── 根路径 ─────────────────────────────────
|
||||||
|
location / {
|
||||||
|
proxy_pass http://compliance_backend/;
|
||||||
|
}
|
||||||
|
}
|
||||||
22
config/prometheus.yml
Normal file
22
config/prometheus.yml
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
global:
|
||||||
|
scrape_interval: 15s
|
||||||
|
evaluation_interval: 15s
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: "compliance-backend"
|
||||||
|
static_configs:
|
||||||
|
- targets: ["compliance-backend:8000"]
|
||||||
|
metrics_path: /metrics
|
||||||
|
|
||||||
|
- job_name: "milvus"
|
||||||
|
static_configs:
|
||||||
|
- targets: ["milvus:9091"]
|
||||||
|
metrics_path: /metrics
|
||||||
|
|
||||||
|
- job_name: "redis"
|
||||||
|
static_configs:
|
||||||
|
- targets: ["redis:6379"]
|
||||||
|
|
||||||
|
- job_name: "postgres"
|
||||||
|
static_configs:
|
||||||
|
- targets: ["postgres:5432"]
|
||||||
380
docker-compose.yml
Normal file
380
docker-compose.yml
Normal file
@@ -0,0 +1,380 @@
|
|||||||
|
version: "3.9"
|
||||||
|
|
||||||
|
networks:
|
||||||
|
compliance-net:
|
||||||
|
driver: bridge
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
postgres_data:
|
||||||
|
redis_data:
|
||||||
|
milvus_data:
|
||||||
|
minio_data:
|
||||||
|
neo4j_data:
|
||||||
|
neo4j_logs:
|
||||||
|
|
||||||
|
services:
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
# 基础数据层
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
|
||||||
|
postgres:
|
||||||
|
image: pgvector/pgvector:pg16
|
||||||
|
container_name: compliance-postgres
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
POSTGRES_USER: compliance
|
||||||
|
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-compliance123}
|
||||||
|
POSTGRES_DB: compliance_db
|
||||||
|
volumes:
|
||||||
|
- postgres_data:/var/lib/postgresql/data
|
||||||
|
- ./init-sql:/docker-entrypoint-initdb.d
|
||||||
|
ports:
|
||||||
|
- "5432:5432"
|
||||||
|
networks: [compliance-net]
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "pg_isready -U compliance -d compliance_db"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 10
|
||||||
|
|
||||||
|
redis:
|
||||||
|
image: redis:7-alpine
|
||||||
|
container_name: compliance-redis
|
||||||
|
restart: unless-stopped
|
||||||
|
command: >
|
||||||
|
redis-server
|
||||||
|
--requirepass ${REDIS_PASSWORD:-redis123}
|
||||||
|
--maxmemory 2gb
|
||||||
|
--maxmemory-policy allkeys-lru
|
||||||
|
volumes:
|
||||||
|
- redis_data:/data
|
||||||
|
ports:
|
||||||
|
- "6379:6379"
|
||||||
|
networks: [compliance-net]
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD:-redis123}", "ping"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 5
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
# Milvus 向量数据库(Standalone,含 etcd + minio)
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
|
||||||
|
etcd:
|
||||||
|
image: quay.io/coreos/etcd:v3.5.5
|
||||||
|
container_name: milvus-etcd
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
ETCD_AUTO_COMPACTION_MODE: revision
|
||||||
|
ETCD_AUTO_COMPACTION_RETENTION: "1000"
|
||||||
|
ETCD_QUOTA_BACKEND_BYTES: "4294967296"
|
||||||
|
ETCD_SNAPSHOT_COUNT: "50000"
|
||||||
|
volumes:
|
||||||
|
- ./volumes/etcd:/etcd  # dedicated etcd data directory; the milvus service mounts the milvus_data named volume at /var/lib/milvus, so the two stores must not share one volume root
|
||||||
|
command: >
|
||||||
|
etcd
|
||||||
|
-advertise-client-urls=http://127.0.0.1:2379
|
||||||
|
-listen-client-urls=http://0.0.0.0:2379
|
||||||
|
--data-dir=/etcd
|
||||||
|
networks: [compliance-net]
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "etcdctl", "endpoint", "health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 20s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
minio:
|
||||||
|
image: minio/minio:RELEASE.2023-03-13T19-46-17Z
|
||||||
|
container_name: milvus-minio
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
MINIO_ACCESS_KEY: minioadmin
|
||||||
|
MINIO_SECRET_KEY: minioadmin
|
||||||
|
volumes:
|
||||||
|
- minio_data:/minio_data
|
||||||
|
command: minio server /minio_data --console-address ":9001"
|
||||||
|
ports:
|
||||||
|
- "9001:9001" # MinIO 控制台(可选访问)
|
||||||
|
networks: [compliance-net]
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 20s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
milvus:
|
||||||
|
image: milvusdb/milvus:v2.4.13
|
||||||
|
container_name: compliance-milvus
|
||||||
|
restart: unless-stopped
|
||||||
|
command: ["milvus", "run", "standalone"]
|
||||||
|
environment:
|
||||||
|
ETCD_ENDPOINTS: etcd:2379
|
||||||
|
MINIO_ADDRESS: minio:9000
|
||||||
|
volumes:
|
||||||
|
- milvus_data:/var/lib/milvus
|
||||||
|
ports:
|
||||||
|
- "19530:19530" # gRPC API
|
||||||
|
- "9091:9091" # HTTP API
|
||||||
|
depends_on:
|
||||||
|
etcd:
|
||||||
|
condition: service_healthy
|
||||||
|
minio:
|
||||||
|
condition: service_healthy
|
||||||
|
networks: [compliance-net]
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 20s
|
||||||
|
retries: 10
|
||||||
|
start_period: 60s
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
# Neo4j 知识图谱数据库
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
|
||||||
|
neo4j:
|
||||||
|
image: neo4j:5.20-community
|
||||||
|
container_name: compliance-neo4j
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
NEO4J_AUTH: neo4j/${NEO4J_PASSWORD:-neo4j123}
|
||||||
|
NEO4J_PLUGINS: '["apoc"]'
|
||||||
|
NEO4J_dbms_security_procedures_unrestricted: apoc.*
|
||||||
|
NEO4J_server_memory_heap_initial__size: 512m  # Neo4j 5 setting name (server.memory.heap.initial_size); the dbms.memory.* form is deprecated
|
||||||
|
NEO4J_server_memory_heap_max__size: 2G  # Neo4j 5 setting name (server.memory.heap.max_size); the dbms.memory.* form is deprecated
|
||||||
|
NEO4J_server_memory_pagecache_size: 1G  # Neo4j 5 setting name (server.memory.pagecache.size); the dbms.memory.* form is deprecated
|
||||||
|
volumes:
|
||||||
|
- neo4j_data:/data
|
||||||
|
- neo4j_logs:/logs
|
||||||
|
ports:
|
||||||
|
- "7474:7474" # Browser UI
|
||||||
|
- "7687:7687" # Bolt 协议
|
||||||
|
networks: [compliance-net]
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "wget -q --spider http://localhost:7474 || exit 1"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 10
|
||||||
|
start_period: 60s
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
# AI 模型服务
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
|
||||||
|
embedding-service:
|
||||||
|
build:
|
||||||
|
context: ./services/embedding
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: compliance-embedding:latest
|
||||||
|
container_name: compliance-embedding
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
MODEL_NAME: BAAI/bge-m3
|
||||||
|
HF_ENDPOINT: ${HF_ENDPOINT:-https://hf-mirror.com}
|
||||||
|
DEVICE: ${EMBEDDING_DEVICE:-cpu}
|
||||||
|
MAX_BATCH_SIZE: "16"
|
||||||
|
volumes:
|
||||||
|
- ./models:/app/models
|
||||||
|
ports:
|
||||||
|
- "8010:8010"
|
||||||
|
networks: [compliance-net]
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
memory: 8G
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8010/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 5
|
||||||
|
start_period: 120s # 模型加载需要时间
|
||||||
|
|
||||||
|
mcp-server:
|
||||||
|
build:
|
||||||
|
context: ./services/mcp-server
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: compliance-mcp:latest
|
||||||
|
container_name: compliance-mcp
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
DEVICE: ${MCP_DEVICE:-cpu}
|
||||||
|
HF_ENDPOINT: ${HF_ENDPOINT:-https://hf-mirror.com}
|
||||||
|
volumes:
|
||||||
|
- ./models:/app/models
|
||||||
|
- ./data/uploads:/app/uploads
|
||||||
|
- ./data/parsed:/app/parsed
|
||||||
|
ports:
|
||||||
|
- "8011:8011"
|
||||||
|
networks: [compliance-net]
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
memory: 8G
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8011/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 5
|
||||||
|
start_period: 120s
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
# 业务服务层
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
|
||||||
|
compliance-backend:
|
||||||
|
build:
|
||||||
|
context: ./services/compliance-backend
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: compliance-backend:latest
|
||||||
|
container_name: compliance-backend
|
||||||
|
restart: unless-stopped
|
||||||
|
env_file: .env
|
||||||
|
environment:
|
||||||
|
DATABASE_URL: postgresql+asyncpg://compliance:${POSTGRES_PASSWORD:-compliance123}@postgres:5432/compliance_db
|
||||||
|
REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/0
|
||||||
|
MILVUS_HOST: milvus
|
||||||
|
MILVUS_PORT: "19530"
|
||||||
|
NEO4J_URI: bolt://neo4j:7687
|
||||||
|
NEO4J_USER: neo4j
|
||||||
|
NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123}
|
||||||
|
EMBEDDING_SERVICE_URL: http://embedding-service:8010
|
||||||
|
MCP_SERVER_URL: http://mcp-server:8011
|
||||||
|
LLM_PROVIDER: ${LLM_PROVIDER:-deepseek}
|
||||||
|
DEEPSEEK_API_KEY: ${DEEPSEEK_API_KEY:-}
|
||||||
|
DEEPSEEK_MODEL: ${DEEPSEEK_MODEL:-deepseek-chat}
|
||||||
|
DASHSCOPE_API_KEY: ${DASHSCOPE_API_KEY:-}
|
||||||
|
QWEN_MODEL: ${QWEN_MODEL:-qwen-plus}
|
||||||
|
LOG_LEVEL: ${LOG_LEVEL:-INFO}
|
||||||
|
APP_ENV: ${APP_ENV:-development}
|
||||||
|
volumes:
|
||||||
|
- ./data:/app/data
|
||||||
|
- ./logs:/app/logs
|
||||||
|
ports:
|
||||||
|
- "8000:8000"
|
||||||
|
depends_on:
|
||||||
|
postgres:
|
||||||
|
condition: service_healthy
|
||||||
|
redis:
|
||||||
|
condition: service_healthy
|
||||||
|
milvus:
|
||||||
|
condition: service_healthy
|
||||||
|
embedding-service:
|
||||||
|
condition: service_healthy
|
||||||
|
networks: [compliance-net]
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 5
|
||||||
|
start_period: 30s
|
||||||
|
|
||||||
|
celery-worker:
|
||||||
|
build:
|
||||||
|
context: ./services/compliance-backend
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: compliance-backend:latest
|
||||||
|
container_name: compliance-worker
|
||||||
|
restart: unless-stopped
|
||||||
|
command: >
|
||||||
|
celery -A app.worker worker
|
||||||
|
--loglevel=info
|
||||||
|
--concurrency=4
|
||||||
|
--queues=default,parse,vectorize,compliance,monitor,push
|
||||||
|
env_file: .env
|
||||||
|
environment:
|
||||||
|
DATABASE_URL: postgresql+asyncpg://compliance:${POSTGRES_PASSWORD:-compliance123}@postgres:5432/compliance_db
|
||||||
|
REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/0
|
||||||
|
MILVUS_HOST: milvus
|
||||||
|
MILVUS_PORT: "19530"
|
||||||
|
NEO4J_URI: bolt://neo4j:7687
|
||||||
|
NEO4J_USER: neo4j
|
||||||
|
NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123}
|
||||||
|
EMBEDDING_SERVICE_URL: http://embedding-service:8010
|
||||||
|
MCP_SERVER_URL: http://mcp-server:8011
|
||||||
|
LLM_PROVIDER: ${LLM_PROVIDER:-deepseek}
|
||||||
|
DEEPSEEK_API_KEY: ${DEEPSEEK_API_KEY:-}
|
||||||
|
DASHSCOPE_API_KEY: ${DASHSCOPE_API_KEY:-}
|
||||||
|
volumes:
|
||||||
|
- ./data:/app/data
|
||||||
|
- ./logs:/app/logs
|
||||||
|
depends_on:
|
||||||
|
redis:
|
||||||
|
condition: service_healthy
|
||||||
|
compliance-backend:
|
||||||
|
condition: service_healthy
|
||||||
|
networks: [compliance-net]
|
||||||
|
|
||||||
|
celery-beat:
|
||||||
|
build:
|
||||||
|
context: ./services/compliance-backend
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: compliance-backend:latest
|
||||||
|
container_name: compliance-beat
|
||||||
|
restart: unless-stopped
|
||||||
|
command: >
|
||||||
|
celery -A app.worker beat
|
||||||
|
--loglevel=info
|
||||||
|
--scheduler celery.beat.PersistentScheduler
|
||||||
|
env_file: .env
|
||||||
|
environment:
|
||||||
|
DATABASE_URL: postgresql+asyncpg://compliance:${POSTGRES_PASSWORD:-compliance123}@postgres:5432/compliance_db
|
||||||
|
REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/0
|
||||||
|
DEEPSEEK_API_KEY: ${DEEPSEEK_API_KEY:-}
|
||||||
|
volumes:
|
||||||
|
- ./data:/app/data
|
||||||
|
- ./logs:/app/logs
|
||||||
|
depends_on:
|
||||||
|
redis:
|
||||||
|
condition: service_healthy
|
||||||
|
networks: [compliance-net]
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
# API 网关
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
|
||||||
|
nginx:
|
||||||
|
image: nginx:1.25-alpine
|
||||||
|
container_name: compliance-nginx
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- ./config/nginx.conf:/etc/nginx/conf.d/default.conf:ro
|
||||||
|
ports:
|
||||||
|
- "80:80"
|
||||||
|
depends_on:
|
||||||
|
compliance-backend:
|
||||||
|
condition: service_healthy
|
||||||
|
networks: [compliance-net]
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "nginx", "-t"]
|
||||||
|
interval: 30s
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
# 监控(可选,--profile monitoring 启动)
|
||||||
|
# ═══════════════════════════════════════════════
|
||||||
|
|
||||||
|
grafana:
|
||||||
|
image: grafana/grafana:11.0.0
|
||||||
|
container_name: compliance-grafana
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD:-admin}
|
||||||
|
GF_USERS_ALLOW_SIGN_UP: "false"
|
||||||
|
volumes:
|
||||||
|
- ./config/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml:ro  # NOTE(review): this mounts the Prometheus scrape config (global/scrape_configs) into Grafana's datasource provisioning directory; Grafana presumably expects an apiVersion/datasources document here - confirm and supply a separate datasource file
|
||||||
|
ports:
|
||||||
|
- "3000:3000"
|
||||||
|
networks: [compliance-net]
|
||||||
|
profiles: [monitoring]
|
||||||
|
|
||||||
|
prometheus:
|
||||||
|
image: prom/prometheus:v2.51.0
|
||||||
|
container_name: compliance-prometheus
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||||
|
ports:
|
||||||
|
- "9090:9090"
|
||||||
|
networks: [compliance-net]
|
||||||
|
profiles: [monitoring]
|
||||||
192
init-sql/01_init_schema.sql
Normal file
192
init-sql/01_init_schema.sql
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
-- AI合规智能中枢 — PostgreSQL 初始化 Schema
|
||||||
|
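-- Execution: runs automatically only on the container's first start, when the PostgreSQL data directory is still empty (mounted at /docker-entrypoint-initdb.d)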
|
||||||
|
|
||||||
|
-- 启用扩展
|
||||||
|
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
||||||
|
CREATE EXTENSION IF NOT EXISTS vector; -- pgvector(pgvector/pgvector:pg16 镜像已内置)
|
||||||
|
CREATE EXTENSION IF NOT EXISTS pg_trgm; -- trigram similarity / fuzzy text matching (this is not the built-in full-text search)
|
||||||
|
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
-- 工作空间(知识库)
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
CREATE TABLE IF NOT EXISTS workspaces (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
name VARCHAR(255) NOT NULL,
|
||||||
|
description TEXT,
|
||||||
|
domain VARCHAR(100), -- vehicle_safety / data_security / ehs / carbon
|
||||||
|
created_by VARCHAR(255),
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
-- 文件记录
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
CREATE TABLE IF NOT EXISTS files (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
workspace_id UUID REFERENCES workspaces(id) ON DELETE CASCADE,
|
||||||
|
filename VARCHAR(500) NOT NULL,
|
||||||
|
original_name VARCHAR(500) NOT NULL,
|
||||||
|
file_type VARCHAR(50), -- pdf / docx / xlsx
|
||||||
|
file_size BIGINT,
|
||||||
|
storage_path TEXT, -- data/uploads/相对路径
|
||||||
|
parsed_path TEXT, -- data/parsed/相对路径
|
||||||
|
status VARCHAR(50) DEFAULT 'uploaded', -- uploaded/parsing/parsed/vectorized/failed
|
||||||
|
error_msg TEXT,
|
||||||
|
metadata JSONB DEFAULT '{}',
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_files_workspace ON files(workspace_id);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_files_status ON files(status);
|
||||||
|
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
-- 异步任务记录
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
CREATE TABLE IF NOT EXISTS tasks (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
task_type VARCHAR(100) NOT NULL, -- parse / vectorize / compliance_check / regulation_fetch
|
||||||
|
status VARCHAR(50) DEFAULT 'pending', -- pending/running/completed/failed
|
||||||
|
payload JSONB DEFAULT '{}',
|
||||||
|
result JSONB,
|
||||||
|
error_msg TEXT,
|
||||||
|
progress INTEGER DEFAULT 0, -- 0-100
|
||||||
|
file_id UUID REFERENCES files(id),
|
||||||
|
celery_task_id VARCHAR(255),
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
completed_at TIMESTAMPTZ
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_tasks_status ON tasks(status);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_tasks_type ON tasks(task_type);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_tasks_file ON tasks(file_id);
|
||||||
|
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
-- 合规审查报告
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
CREATE TABLE IF NOT EXISTS compliance_reports (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
file_id UUID REFERENCES files(id),
|
||||||
|
regulation_domains TEXT[], -- 适用法规域
|
||||||
|
overall_risk_level VARCHAR(20), -- high / medium / low
|
||||||
|
risk_score DECIMAL(5,2), -- 0-100
|
||||||
|
findings JSONB DEFAULT '[]', -- 问题列表
|
||||||
|
recommendations JSONB DEFAULT '[]', -- 整改建议
|
||||||
|
report_markdown TEXT, -- 完整报告(Markdown格式)
|
||||||
|
llm_model VARCHAR(100), -- 生成时使用的模型
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_reports_file ON compliance_reports(file_id);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_reports_risk ON compliance_reports(overall_risk_level);
|
||||||
|
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
-- 法规监控源
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
CREATE TABLE IF NOT EXISTS regulation_sources (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
name VARCHAR(255) NOT NULL,
|
||||||
|
url TEXT NOT NULL,
|
||||||
|
source_type VARCHAR(50) DEFAULT 'webpage', -- webpage / rss / api
|
||||||
|
domain VARCHAR(100), -- vehicle_safety / ehs 等
|
||||||
|
fetch_interval INTEGER DEFAULT 86400, -- 抓取间隔(秒),默认每天
|
||||||
|
is_active BOOLEAN DEFAULT TRUE,
|
||||||
|
last_fetched_at TIMESTAMPTZ,
|
||||||
|
last_hash VARCHAR(64), -- 内容hash,用于变更检测
|
||||||
|
fetch_config JSONB DEFAULT '{}', -- 抓取配置(CSS选择器等)
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_sources_active ON regulation_sources(is_active);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_sources_domain ON regulation_sources(domain);
|
||||||
|
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
-- 法规变更记录
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
CREATE TABLE IF NOT EXISTS regulation_updates (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
source_id UUID REFERENCES regulation_sources(id),
|
||||||
|
title VARCHAR(500),
|
||||||
|
url TEXT,
|
||||||
|
change_type VARCHAR(50), -- new / revised / revoked / notice
|
||||||
|
summary TEXT, -- AI生成的变更摘要
|
||||||
|
raw_content TEXT, -- 原始抓取内容
|
||||||
|
diff_content TEXT, -- 与上次内容的差异
|
||||||
|
is_notified BOOLEAN DEFAULT FALSE,
|
||||||
|
importance VARCHAR(20) DEFAULT 'normal', -- high / normal / low
|
||||||
|
fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
published_at TIMESTAMPTZ
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_updates_source ON regulation_updates(source_id);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_updates_notified ON regulation_updates(is_notified);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_updates_fetched ON regulation_updates(fetched_at DESC);
|
||||||
|
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
-- 推送订阅
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
CREATE TABLE IF NOT EXISTS subscriptions (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
name VARCHAR(255),
|
||||||
|
channel VARCHAR(50) NOT NULL, -- email / webhook / feishu / dingtalk
|
||||||
|
target TEXT NOT NULL, -- 邮件地址 或 Webhook URL
|
||||||
|
domains TEXT[], -- 订阅的法规域,为空则订阅全部
|
||||||
|
importance_min VARCHAR(20) DEFAULT 'normal',
|
||||||
|
is_active BOOLEAN DEFAULT TRUE,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
-- 全链路审计日志
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
CREATE TABLE IF NOT EXISTS audit_logs (
|
||||||
|
id BIGSERIAL PRIMARY KEY,
|
||||||
|
action VARCHAR(100) NOT NULL, -- upload / query / compliance_check / etc
|
||||||
|
resource VARCHAR(100),
|
||||||
|
resource_id UUID,
|
||||||
|
user_id VARCHAR(255),
|
||||||
|
ip_address INET,
|
||||||
|
request JSONB,
|
||||||
|
response JSONB,
|
||||||
|
duration_ms INTEGER,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_audit_action ON audit_logs(action);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_audit_created ON audit_logs(created_at DESC);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_audit_user ON audit_logs(user_id);
|
||||||
|
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
-- 更新时间自动维护
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
-- Trigger function: overwrites NEW.updated_at with NOW() and returns the modified row.
-- Takes no arguments; intended to be attached as a BEFORE UPDATE ... FOR EACH ROW trigger
-- on tables that have an updated_at column.
CREATE OR REPLACE FUNCTION update_updated_at_column()
|
||||||
|
RETURNS TRIGGER AS $$
|
||||||
|
BEGIN
|
||||||
|
NEW.updated_at = NOW();
|
||||||
|
RETURN NEW;
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
CREATE TRIGGER update_workspaces_updated_at
|
||||||
|
BEFORE UPDATE ON workspaces
|
||||||
|
FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
|
||||||
|
|
||||||
|
CREATE TRIGGER update_files_updated_at
|
||||||
|
BEFORE UPDATE ON files
|
||||||
|
FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
|
||||||
|
|
||||||
|
CREATE TRIGGER update_tasks_updated_at
|
||||||
|
BEFORE UPDATE ON tasks
|
||||||
|
FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
|
||||||
|
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
-- 初始数据:预置监控源
|
||||||
|
-- ══════════════════════════════════════════════════
|
||||||
|
INSERT INTO regulation_sources (name, url, domain, fetch_interval) VALUES
|
||||||
|
('国家标准全文公开系统', 'https://std.samr.gov.cn', 'vehicle_safety', 86400),
|
||||||
|
('工信部政策法规', 'https://www.miit.gov.cn/jgsj/fgs/zcfg/index.html', 'vehicle_safety', 86400),
|
||||||
|
('应急管理部政策法规', 'https://www.mem.gov.cn/gk/zcfg/', 'ehs', 86400),
|
||||||
|
('生态环境部政策法规', 'https://www.mee.gov.cn/ywgz/fgbz/fl/', 'carbon', 86400)
|
||||||
|
ON CONFLICT DO NOTHING; -- NOTE(review): regulation_sources declares no UNIQUE constraint other than the generated UUID primary key, so this clause never fires; re-running this INSERT would add duplicate rows
|
||||||
117
scripts/00_install_docker_ubuntu.sh
Normal file
117
scripts/00_install_docker_ubuntu.sh
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
# 00_install_docker_ubuntu.sh
|
||||||
|
# Ubuntu 22.04 LTS 安装 Docker CE + nvidia-container-toolkit
|
||||||
|
# 用法:bash scripts/00_install_docker_ubuntu.sh
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||||
|
# Print an informational message prefixed with a blue [INFO] tag.
info() { printf '%b\n' "${BLUE}[INFO]${NC} $*"; }
|
||||||
|
# Print a success message prefixed with a green [OK] tag.
ok() { printf '%b\n' "${GREEN}[OK]${NC} $*"; }
|
||||||
|
# Print a warning message prefixed with a yellow [WARN] tag.
warn() { printf '%b\n' "${YELLOW}[WARN]${NC} $*"; }
|
||||||
|
# Print an error message prefixed with a red [ERROR] tag to stderr (so it is not
# lost when stdout is piped or redirected), then abort the script with status 1.
error() { echo -e "${RED}[ERROR]${NC} $*" >&2; exit 1; }
|
||||||
|
|
||||||
|
# ── 检查 root 权限 ──────────────────────────────
|
||||||
|
if [[ $EUID -ne 0 ]]; then
|
||||||
|
error "请以 root 或 sudo 运行:sudo bash scripts/00_install_docker_ubuntu.sh"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── 检测 Ubuntu 版本 ────────────────────────────
|
||||||
|
. /etc/os-release
|
||||||
|
info "检测到 OS:$NAME $VERSION_ID"
|
||||||
|
if [[ "$ID" != "ubuntu" ]]; then
|
||||||
|
warn "非 Ubuntu 系统,脚本可能不适用。继续(y/n)?"
|
||||||
|
read -r ans; [[ "$ans" != "y" ]] && exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Step 1:换国内源(可选)──────────────────────
|
||||||
|
info "Step 1/5:配置 APT 源..."
|
||||||
|
if [[ "${USE_MIRROR:-false}" == "true" ]]; then
|
||||||
|
sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list
|
||||||
|
sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list
|
||||||
|
ok "已切换到阿里云镜像"
|
||||||
|
fi
|
||||||
|
apt-get update -qq
|
||||||
|
|
||||||
|
# ── Step 2:安装依赖 ────────────────────────────
|
||||||
|
info "Step 2/5:安装依赖包..."
|
||||||
|
apt-get install -y -qq \
|
||||||
|
ca-certificates \
|
||||||
|
curl \
|
||||||
|
gnupg \
|
||||||
|
lsb-release \
|
||||||
|
apt-transport-https
|
||||||
|
|
||||||
|
# ── Step 3:安装 Docker CE ──────────────────────
|
||||||
|
info "Step 3/5:安装 Docker CE..."
|
||||||
|
if command -v docker &>/dev/null; then
|
||||||
|
DOCKER_VER=$(docker --version)
|
||||||
|
warn "Docker 已安装:$DOCKER_VER"
|
||||||
|
warn "跳过 Docker 安装。如需重装,请先运行:apt-get remove docker docker-engine docker.io containerd"
|
||||||
|
else
|
||||||
|
# 添加 Docker 官方 GPG 密钥
|
||||||
|
install -m 0755 -d /etc/apt/keyrings
|
||||||
|
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \
|
||||||
|
gpg --dearmor --yes -o /etc/apt/keyrings/docker.gpg  # --yes: overwrite a keyring left by an earlier partial run instead of prompting
|
||||||
|
chmod a+r /etc/apt/keyrings/docker.gpg
|
||||||
|
|
||||||
|
# 添加 Docker 仓库
|
||||||
|
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \
|
||||||
|
https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | \
|
||||||
|
tee /etc/apt/sources.list.d/docker.list > /dev/null
|
||||||
|
|
||||||
|
apt-get update -qq
|
||||||
|
apt-get install -y -qq docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
|
||||||
|
|
||||||
|
# 启动并设置开机自启
|
||||||
|
systemctl enable docker
|
||||||
|
systemctl start docker
|
||||||
|
ok "Docker CE 安装完成"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 验证
|
||||||
|
docker --version
|
||||||
|
docker compose version
|
||||||
|
|
||||||
|
# ── Step 4:将当前用户加入 docker 组 ────────────
|
||||||
|
info "Step 4/5:配置 Docker 用户组..."
|
||||||
|
CURRENT_USER=${SUDO_USER:-$USER}
|
||||||
|
if [[ -n "$CURRENT_USER" && "$CURRENT_USER" != "root" ]]; then
|
||||||
|
usermod -aG docker "$CURRENT_USER"
|
||||||
|
ok "用户 $CURRENT_USER 已加入 docker 组(重新登录后生效)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Step 5:安装 nvidia-container-toolkit(可选)─
|
||||||
|
info "Step 5/5:检查 NVIDIA GPU..."
|
||||||
|
if command -v nvidia-smi &>/dev/null; then
|
||||||
|
info "检测到 NVIDIA GPU,安装 nvidia-container-toolkit..."
|
||||||
|
nvidia-smi --query-gpu=name --format=csv,noheader
|
||||||
|
|
||||||
|
# 添加 NVIDIA 仓库
|
||||||
|
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
|
||||||
|
gpg --dearmor --yes -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg  # --yes: re-running the script must overwrite the existing keyring instead of prompting
|
||||||
|
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
|
||||||
|
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
|
||||||
|
tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
|
||||||
|
|
||||||
|
apt-get update -qq
|
||||||
|
apt-get install -y -qq nvidia-container-toolkit
|
||||||
|
nvidia-ctk runtime configure --runtime=docker
|
||||||
|
systemctl restart docker
|
||||||
|
ok "nvidia-container-toolkit 安装完成"
|
||||||
|
else
|
||||||
|
warn "未检测到 NVIDIA GPU,跳过 nvidia-container-toolkit 安装"
|
||||||
|
warn "如有 GPU 请手动安装驱动后重新运行本脚本"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||||
|
echo -e "${GREEN} Docker 安装完成!${NC}"
|
||||||
|
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||||
|
echo ""
|
||||||
|
echo " Docker 版本:$(docker --version)"
|
||||||
|
echo " Compose 版本:$(docker compose version)"
|
||||||
|
echo ""
|
||||||
|
echo -e "${YELLOW} 注意:${NC}请重新登录以使 docker 组权限生效"
|
||||||
|
echo " 验证命令:docker run hello-world"
|
||||||
105
scripts/00_install_docker_windows.ps1
Normal file
105
scripts/00_install_docker_windows.ps1
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
# 00_install_docker_windows.ps1
|
||||||
|
# Windows 11 安装 Docker Desktop + WSL2 配置
|
||||||
|
# 用法:以管理员身份运行 PowerShell,执行:
|
||||||
|
# .\scripts\00_install_docker_windows.ps1
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
#Requires -RunAsAdministrator
|
||||||
|
|
||||||
|
$ErrorActionPreference = "Stop"
|
||||||
|
|
||||||
|
function Write-Info { Write-Host "[INFO] $args" -ForegroundColor Cyan }
|
||||||
|
function Write-Ok { Write-Host "[OK] $args" -ForegroundColor Green }
|
||||||
|
function Write-Warn { Write-Host "[WARN] $args" -ForegroundColor Yellow }
|
||||||
|
function Write-Err { Write-Host "[ERR] $args" -ForegroundColor Red; exit 1 }
|
||||||
|
|
||||||
|
Write-Info "============================================"
|
||||||
|
Write-Info "AI合规智能中枢 — Windows Docker 环境安装"
|
||||||
|
Write-Info "============================================"
|
||||||
|
|
||||||
|
# ── Step 1:启用 WSL2 ──────────────────────────
|
||||||
|
Write-Info "Step 1/4:检查并启用 WSL2..."
|
||||||
|
$wslFeature = Get-WindowsOptionalFeature -Online -FeatureName Microsoft-Windows-Subsystem-Linux
|
||||||
|
$vmFeature = Get-WindowsOptionalFeature -Online -FeatureName VirtualMachinePlatform
|
||||||
|
|
||||||
|
if ($wslFeature.State -ne "Enabled") {
|
||||||
|
Write-Info "启用 WSL 功能..."
|
||||||
|
Enable-WindowsOptionalFeature -Online -FeatureName Microsoft-Windows-Subsystem-Linux -NoRestart
|
||||||
|
}
|
||||||
|
if ($vmFeature.State -ne "Enabled") {
|
||||||
|
Write-Info "启用虚拟机平台..."
|
||||||
|
Enable-WindowsOptionalFeature -Online -FeatureName VirtualMachinePlatform -NoRestart
|
||||||
|
}
|
||||||
|
|
||||||
|
# 更新 WSL 内核
|
||||||
|
Write-Info "更新 WSL2 内核..."
|
||||||
|
wsl --update
|
||||||
|
wsl --set-default-version 2
|
||||||
|
Write-Ok "WSL2 配置完成"
|
||||||
|
|
||||||
|
# ── Step 2:安装 Ubuntu WSL 发行版 ─────────────
|
||||||
|
Write-Info "Step 2/4:检查 Ubuntu WSL..."
|
||||||
|
$wslList = wsl --list --quiet 2>$null
|
||||||
|
# wsl.exe output may contain NUL characters (UTF-16), and -notmatch on an array filters
# instead of returning a boolean; strip NULs and test whether any entry matches "Ubuntu".
if (-not (($wslList -replace "`0", "") -match "Ubuntu")) {
|
||||||
|
Write-Info "安装 Ubuntu 22.04..."
|
||||||
|
wsl --install -d Ubuntu-22.04
|
||||||
|
Write-Ok "Ubuntu 22.04 安装完成(首次运行需要设置用户名和密码)"
|
||||||
|
} else {
|
||||||
|
Write-Ok "Ubuntu WSL 已安装"
|
||||||
|
wsl --list --verbose
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Step 3:安装 Docker Desktop ────────────────
|
||||||
|
Write-Info "Step 3/4:检查 Docker Desktop..."
|
||||||
|
$dockerCmd = Get-Command docker -ErrorAction SilentlyContinue
|
||||||
|
if ($dockerCmd) {
|
||||||
|
Write-Ok "Docker 已安装:$(docker --version)"
|
||||||
|
} else {
|
||||||
|
# 尝试用 winget 安装
|
||||||
|
$winget = Get-Command winget -ErrorAction SilentlyContinue
|
||||||
|
if ($winget) {
|
||||||
|
Write-Info "通过 winget 安装 Docker Desktop..."
|
||||||
|
winget install -e --id Docker.DockerDesktop --accept-package-agreements --accept-source-agreements
|
||||||
|
Write-Ok "Docker Desktop 安装完成"
|
||||||
|
} else {
|
||||||
|
Write-Warn "未找到 winget,请手动安装 Docker Desktop:"
|
||||||
|
Write-Warn "下载地址:https://www.docker.com/products/docker-desktop/"
|
||||||
|
Write-Warn "安装时勾选:Use WSL 2 instead of Hyper-V"
|
||||||
|
Start-Process "https://www.docker.com/products/docker-desktop/"
|
||||||
|
Read-Host "安装完成后按 Enter 继续"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Step 4:配置 Docker Desktop WSL 集成 ───────
|
||||||
|
Write-Info "Step 4/4:提示 Docker Desktop 配置..."
|
||||||
|
Write-Warn ""
|
||||||
|
Write-Warn "请确认 Docker Desktop 已进行以下配置:"
|
||||||
|
Write-Warn " 1. Settings → General → 勾选 'Use WSL 2 based engine'"
|
||||||
|
Write-Warn " 2. Settings → Resources → WSL Integration → 开启 Ubuntu-22.04"
|
||||||
|
Write-Warn " 3. 如有 NVIDIA GPU:"
|
||||||
|
Write-Warn " Settings → General → 勾选 'Use GPU with WSL 2'"
|
||||||
|
Write-Warn ""
|
||||||
|
|
||||||
|
# ── 验证 ───────────────────────────────────────
|
||||||
|
Write-Info "验证安装..."
|
||||||
|
try {
|
||||||
|
$dockerVer = docker --version
|
||||||
|
$composeVer = docker compose version
|
||||||
|
Write-Ok "Docker: $dockerVer"
|
||||||
|
Write-Ok "Compose: $composeVer"
|
||||||
|
} catch {
|
||||||
|
Write-Warn "Docker 命令不可用,可能需要重启后再验证"
|
||||||
|
Write-Warn "重启后运行:docker run hello-world"
|
||||||
|
}
|
||||||
|
|
||||||
|
Write-Host ""
|
||||||
|
Write-Host "============================================" -ForegroundColor Green
|
||||||
|
Write-Host " 安装完成!" -ForegroundColor Green
|
||||||
|
Write-Host "============================================" -ForegroundColor Green
|
||||||
|
Write-Host ""
|
||||||
|
Write-Host "后续步骤(在 WSL2 Ubuntu 中执行):" -ForegroundColor Yellow
|
||||||
|
Write-Host " 1. 打开 Ubuntu WSL 终端"
|
||||||
|
Write-Host " 2. cd /mnt/c/Projects/AIProjects/AIRegulations/Depolyment"
|
||||||
|
Write-Host " 3. bash scripts/01_setup_project.sh"
|
||||||
|
Write-Host ""
|
||||||
|
Write-Host "如需重启系统请现在重启,然后继续操作。" -ForegroundColor Yellow
|
||||||
73
scripts/01_setup_project.sh
Normal file
73
scripts/01_setup_project.sh
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
# 01_setup_project.sh
|
||||||
|
# 初始化项目:创建目录、生成 .env 文件
|
||||||
|
# 用法:bash scripts/01_setup_project.sh
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||||
|
|
||||||
|
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||||
|
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||||
|
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||||
|
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||||
|
|
||||||
|
cd "$PROJECT_DIR"
|
||||||
|
info "项目目录:$PROJECT_DIR"
|
||||||
|
|
||||||
|
# ── 创建运行时目录 ──────────────────────────────
|
||||||
|
info "创建运行时目录..."
|
||||||
|
mkdir -p data/uploads data/parsed logs models
|
||||||
|
mkdir -p services/embedding services/mcp-server
|
||||||
|
mkdir -p services/compliance-backend/app/{core,api,services,models}
|
||||||
|
ok "目录结构创建完成"
|
||||||
|
|
||||||
|
# ── Create .env from the template on first run ──
# An existing .env is never overwritten, so re-running this script is safe.
if [[ ! -f ".env" ]]; then
  # Give an actionable message instead of a bare `cp` error when the template is absent.
  if [[ ! -f ".env.example" ]]; then
    warn "未找到 .env.example,无法生成 .env 文件"
    exit 1
  fi
  cp .env.example .env
  warn "已创建 .env 文件,请编辑并填写必要配置:"
  warn "  必填:DEEPSEEK_API_KEY(或 DASHSCOPE_API_KEY)"
  warn "  可选:修改各组件密码"
  echo ""
  echo -e "${YELLOW}是否现在编辑 .env 文件?(y/n)${NC}"
  # `read` returns non-zero on EOF (piped or closed stdin); under `set -e` that
  # would abort the whole script, so treat "no answer" as "n".
  ans=""
  read -r ans || true
  if [[ "$ans" == "y" ]]; then
    ${EDITOR:-nano} .env
  fi
else
  ok ".env 文件已存在,跳过复制"
fi
|
||||||
|
|
||||||
|
# ── Validate the key settings in .env ──
info "验证 .env 配置..."
# Best effort: a missing or unparsable .env must not abort the setup run.
source .env 2>/dev/null || true

# Succeeds only for a usable API key. The template ships "sk-xxxx…" placeholders,
# so a value that is empty or consists solely of "x" after the "sk-" prefix is
# treated as not configured.
has_real_key() {
  local key=${1:-}
  [[ -n "$key" && ! "$key" =~ ^sk-[xX]+$ ]]
}

if has_real_key "${DEEPSEEK_API_KEY:-}" || has_real_key "${DASHSCOPE_API_KEY:-}"; then
  ok "LLM API Key 已配置"
else
  warn "⚠️  未设置 LLM API Key!"
  warn "   请在 .env 中设置 DEEPSEEK_API_KEY 或 DASHSCOPE_API_KEY"
  warn "   DeepSeek 申请:https://platform.deepseek.com"
fi
|
||||||
|
|
||||||
|
# ── Verify the Docker toolchain ──
info "检查 Docker 环境..."
if ! command -v docker &>/dev/null; then
  warn "Docker 未安装,请先运行:bash scripts/00_install_docker_ubuntu.sh"
  exit 1
fi

# Query the Compose version once and reuse it. Without the explicit check a
# missing Compose plugin would terminate the script via `set -e` with no hint
# about what went wrong.
if ! compose_version=$(docker compose version 2>/dev/null); then
  warn "Docker Compose 插件不可用,请安装 docker-compose-plugin 或升级 Docker Desktop"
  exit 1
fi
ok "Docker Compose 可用:$compose_version"
|
||||||
|
|
||||||
|
# ── 显示下一步 ──────────────────────────────────
|
||||||
|
echo ""
|
||||||
|
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||||
|
echo -e "${GREEN} 项目初始化完成!${NC}"
|
||||||
|
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||||
|
echo ""
|
||||||
|
echo "下一步操作:"
|
||||||
|
echo " 1. 拉取镜像(可选,较慢):bash scripts/02_pull_images.sh"
|
||||||
|
echo " 2. 启动全部服务: bash scripts/06_start_all.sh"
|
||||||
|
echo " 3. 检查健康状态: bash scripts/check_health.sh"
|
||||||
46
scripts/02_pull_images.sh
Normal file
46
scripts/02_pull_images.sh
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
# 02_pull_images.sh
|
||||||
|
# 预拉取所有 Docker 镜像(离线/弱网环境准备)
|
||||||
|
# 用法:bash scripts/02_pull_images.sh
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||||
|
cd "$PROJECT_DIR"
|
||||||
|
|
||||||
|
BLUE='\033[0;34m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m'
|
||||||
|
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||||
|
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||||
|
|
||||||
|
# 所有基础镜像列表
|
||||||
|
IMAGES=(
|
||||||
|
"pgvector/pgvector:pg16"
|
||||||
|
"redis:7-alpine"
|
||||||
|
"quay.io/coreos/etcd:v3.5.5"
|
||||||
|
"minio/minio:RELEASE.2023-03-13T19-46-17Z"
|
||||||
|
"milvusdb/milvus:v2.4.13"
|
||||||
|
"neo4j:5.20-community"
|
||||||
|
"nginx:1.25-alpine"
|
||||||
|
"grafana/grafana:11.0.0"
|
||||||
|
"prom/prometheus:v2.51.0"
|
||||||
|
)
|
||||||
|
|
||||||
|
info "开始拉取 ${#IMAGES[@]} 个基础镜像..."
echo ""

# Keep going after a failed pull: on a weak network one transient error should
# not prevent the remaining images from being fetched. Failures are collected
# and reported at the end, and the script still exits non-zero if any occurred.
FAILED_IMAGES=()
for img in "${IMAGES[@]}"; do
  info "拉取:$img"
  if docker pull "$img"; then
    ok "完成:$img"
  else
    echo -e "${YELLOW}[WARN]${NC}  拉取失败:$img"
    FAILED_IMAGES+=("$img")
  fi
  echo ""
done

if [[ ${#FAILED_IMAGES[@]} -eq 0 ]]; then
  info "所有基础镜像拉取完成"
else
  echo -e "${YELLOW}[WARN]${NC}  ${#FAILED_IMAGES[@]} 个镜像拉取失败:${FAILED_IMAGES[*]}"
fi
echo ""
info "自定义服务镜像(embedding/mcp/backend)将在 build 时自动拉取基础层"
echo ""
# The accelerator hint is printed in every case: it matters most when pulls failed.
echo -e "${YELLOW}提示:如在国内网络环境下 quay.io 或 milvusdb 拉取慢,${NC}"
echo -e "${YELLOW}可配置 Docker 镜像加速器:/etc/docker/daemon.json${NC}"
echo ' {"registry-mirrors": ["https://docker.mirrors.ustc.edu.cn"]}'

# Preserve the "non-zero on failure" contract of the original fail-fast loop.
[[ ${#FAILED_IMAGES[@]} -eq 0 ]] || exit 1
|
||||||
93
scripts/03_start_infra.sh
Normal file
93
scripts/03_start_infra.sh
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
# 03_start_infra.sh
|
||||||
|
# 分步启动基础设施(含健康等待),顺序:
|
||||||
|
# PostgreSQL + Redis → etcd + MinIO → Milvus → Neo4j
|
||||||
|
# 用法:bash scripts/03_start_infra.sh
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||||
|
cd "$PROJECT_DIR"
|
||||||
|
|
||||||
|
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||||
|
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||||
|
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||||
|
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||||
|
error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
|
||||||
|
|
||||||
|
# Block until a Compose service reports the "healthy" health status.
#
# Arguments:
#   $1  Compose service name.
#   $2  Maximum time to wait, in seconds (default: 120).
# Returns:
#   0 once the service is healthy. On timeout it calls `error`, which prints
#   the message and terminates the script.
wait_healthy() {
  local service=$1
  local max_wait=${2:-120}
  local interval=5
  local elapsed=0
  local status

  info "等待 $service 健康就绪..."
  while [[ $elapsed -lt $max_wait ]]; do
    # Depending on the Compose version, `ps --format json` emits either one JSON
    # array or one JSON object per line. Accept both shapes; a parser that only
    # understands a single object reports "unknown" forever on the other form.
    status=$(docker compose ps --format json "$service" 2>/dev/null | \
      python3 -c "
import json
import sys

raw = sys.stdin.read().strip()
try:
    data = json.loads(raw)
except ValueError:
    data = [json.loads(line) for line in raw.splitlines() if line.strip()]
if isinstance(data, dict):
    data = [data]
health = data[0].get('Health') if data else ''
print(health or 'unknown')
" 2>/dev/null || echo "unknown")

    if [[ "$status" == "healthy" ]]; then
      # Finish the line of progress dots so the [OK] message starts on its own line.
      if [[ $elapsed -gt 0 ]]; then
        echo ""
      fi
      ok "$service 已就绪"
      return 0
    fi

    echo -n "."
    sleep $interval
    elapsed=$((elapsed + interval))
  done
  echo ""
  error "$service 等待超时(${max_wait}s),请检查:docker compose logs $service"
}
|
||||||
|
|
||||||
|
# Horizontal rule shared by the opening and closing banners.
rule="══════════════════════════════════════════"

for banner_line in "$rule" " 启动基础设施层" "$rule"; do
  info "$banner_line"
done

# Start a group of services with a single `up -d`, then wait for each one.
# Every argument is a "service:timeout_seconds" pair.
launch_tier() {
  local spec
  local -a services=()
  for spec in "$@"; do
    services+=("${spec%%:*}")
  done
  docker compose up -d "${services[@]}"
  for spec in "$@"; do
    wait_healthy "${spec%%:*}" "${spec##*:}"
  done
}

# ── Step 1: PostgreSQL + Redis ──
info "Step 1/4:启动 PostgreSQL 和 Redis..."
launch_tier postgres:90 redis:30
ok "数据层就绪"

# ── Step 2: etcd + MinIO (required by Milvus) ──
info "Step 2/4:启动 etcd 和 MinIO(Milvus 依赖)..."
launch_tier etcd:60 minio:60
ok "对象存储层就绪"

# ── Step 3: Milvus ──
# Kept inline because a notice is printed between `up` and the health wait.
info "Step 3/4:启动 Milvus(向量数据库)..."
docker compose up -d milvus
info "Milvus 初始化需要约 60 秒,请耐心等待..."
wait_healthy milvus 180
ok "Milvus 就绪"

# ── Step 4: Neo4j ──
info "Step 4/4:启动 Neo4j(知识图谱)..."
launch_tier neo4j:120
ok "Neo4j 就绪"

# ── Summary ──
echo
for banner_line in "$rule" " 基础设施启动完成!" "$rule"; do
  echo -e "${GREEN}${banner_line}${NC}"
done
cat <<'EOF'

 PostgreSQL : localhost:5432
 Redis : localhost:6379
 Milvus : localhost:19530 (gRPC), localhost:9091 (HTTP)
 Neo4j : localhost:7474 (Browser), localhost:7687 (Bolt)
 MinIO 控制台: localhost:9001 (admin/minioadmin)

下一步:bash scripts/04_build_services.sh
EOF
|
||||||
59
scripts/04_build_services.sh
Normal file
59
scripts/04_build_services.sh
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
# 04_build_services.sh
|
||||||
|
# 构建自定义服务 Docker 镜像
|
||||||
|
# embedding-service / mcp-server / compliance-backend
|
||||||
|
# 用法:bash scripts/04_build_services.sh
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||||
|
cd "$PROJECT_DIR"
|
||||||
|
|
||||||
|
BLUE='\033[0;34m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m'
|
||||||
|
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||||
|
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||||
|
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||||
|
|
||||||
|
# Horizontal rule shared by the opening and closing banners.
rule="══════════════════════════════════════════"

info "$rule"
info " 构建自定义服务镜像"
info "$rule"
warn "首次构建较慢(需下载 Python 依赖 + AI 模型)"
warn "BGE-M3 模型约 2.5GB,MinerU 模型约 2GB"
echo

# Build one Compose service and report the wall-clock build time.
#   $1  Compose service name (also used in the completion message).
#   $2  Label shown in the "building" message.
build_and_time() {
  local svc=$1
  local label=$2
  local began finished

  info "构建 ${label}..."
  began=$(date +%s)
  docker compose build "$svc"
  finished=$(date +%s)
  ok "${svc} 构建完成($(( finished - began ))s)"
  echo
}

# ── Embedding service, MinerU parsing service, business backend ──
build_and_time embedding-service "embedding-service(BGE-M3)"
build_and_time mcp-server "mcp-server(MinerU)"
build_and_time compliance-backend "compliance-backend"

# ── List the images that were built ──
info "已构建的镜像:"
# `|| true`: an empty grep result must not abort the script under `set -e`.
docker images | grep -E "compliance-(embedding|mcp|backend)" || true

echo
for banner_line in "$rule" " 所有服务镜像构建完成!" "$rule"; do
  echo -e "${GREEN}${banner_line}${NC}"
done
cat <<'EOF'

下一步:bash scripts/05_init_db.sh
EOF
|
||||||
124
scripts/05_init_db.sh
Normal file
124
scripts/05_init_db.sh
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
# 05_init_db.sh
|
||||||
|
# 初始化数据库:PostgreSQL Schema + Milvus Collections + Neo4j Constraints
|
||||||
|
# 用法:bash scripts/05_init_db.sh
|
||||||
|
# 前提:postgres / milvus / neo4j 已运行且健康
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||||
|
cd "$PROJECT_DIR"
|
||||||
|
|
||||||
|
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||||
|
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||||
|
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||||
|
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||||
|
error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
|
||||||
|
|
||||||
|
source .env 2>/dev/null || true
|
||||||
|
POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-compliance123}
|
||||||
|
NEO4J_PASSWORD=${NEO4J_PASSWORD:-neo4j123}
|
||||||
|
|
||||||
|
# ── Step 1: PostgreSQL schema ──
info "Step 1/3:初始化 PostgreSQL Schema..."
# Capture the status first, then match the literal "(healthy)" marker:
#  - a bare "healthy" pattern also matches "(unhealthy)";
#  - `grep -q` closing the pipe early can make the producer fail under pipefail.
pg_state=$(docker compose ps postgres 2>/dev/null || true)
if [[ "$pg_state" == *"(healthy)"* ]]; then
  # Best effort: the SQL is expected to be re-runnable, so a failure here only
  # produces a warning and the script continues.
  docker compose exec -T postgres psql \
    -U compliance -d compliance_db \
    -f /docker-entrypoint-initdb.d/01_init_schema.sql \
    2>&1 | tail -5 || warn "SQL 可能部分已存在(IF NOT EXISTS),这是正常的"
  ok "PostgreSQL Schema 初始化完成"
else
  error "PostgreSQL 未运行,请先执行:bash scripts/03_start_infra.sh"
fi
|
||||||
|
|
||||||
|
# ── Step 2: Milvus collections ──
info "Step 2/3:初始化 Milvus Collections..."
# Match the literal "(healthy)" marker; a bare "healthy" also matches "(unhealthy)".
milvus_state=$(docker compose ps milvus 2>/dev/null || true)
if [[ "$milvus_state" == *"(healthy)"* ]]; then
  # Runs inside a throw-away backend container (which provides pymilvus) and
  # creates each collection only if it does not exist yet.
  docker compose run --rm --no-deps compliance-backend \
    python3 -c "
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility

connections.connect(host='milvus', port='19530')
print('Milvus 连接成功')

def create_collection(name, description):
    if utility.has_collection(name):
        print(f'  Collection {name} 已存在,跳过')
        return

    fields = [
        FieldSchema(name='id', dtype=DataType.VARCHAR, is_primary=True, max_length=128),
        FieldSchema(name='file_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='workspace_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='chunk_idx', dtype=DataType.INT64),
        FieldSchema(name='content', dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name='dense_vec', dtype=DataType.FLOAT_VECTOR, dim=1024),  # BGE-M3 dense
        FieldSchema(name='metadata', dtype=DataType.JSON),
    ]
    schema = CollectionSchema(fields, description=description)
    col = Collection(name, schema)

    # Build the vector index (HNSW; chosen for the evaluation stage)
    index_params = {
        'metric_type': 'COSINE',
        'index_type': 'HNSW',
        'params': {'M': 16, 'efConstruction': 200}
    }
    col.create_index('dense_vec', index_params)
    col.load()
    print(f'  Collection {name} 创建完成')

create_collection('regulation_chunks', '法规条款向量库')
create_collection('doc_chunks', '企业文档向量库')
create_collection('case_library', '行业案例库')

print('Milvus 初始化完成')
" 2>&1
  ok "Milvus Collections 初始化完成"
else
  error "Milvus 未运行,请先执行:bash scripts/03_start_infra.sh"
fi
|
||||||
|
|
||||||
|
# ── Step 3: Neo4j constraints and indexes ──
info "Step 3/3:初始化 Neo4j 约束和索引..."
# Same gate as steps 1 and 2: stop with guidance if the service is not healthy,
# instead of letting `docker compose exec` fail with a raw error.
neo4j_state=$(docker compose ps neo4j 2>/dev/null || true)
if [[ "$neo4j_state" != *"(healthy)"* ]]; then
  error "Neo4j 未运行,请先执行:bash scripts/03_start_infra.sh"
fi
sleep 5  # Neo4j may still be warming up

# The statements below use IF NOT EXISTS / MERGE, so running them again is safe.
docker compose exec -T neo4j cypher-shell \
  -u neo4j -p "$NEO4J_PASSWORD" \
  --format plain <<'CYPHER'
// 节点约束(唯一性)
CREATE CONSTRAINT regulation_id IF NOT EXISTS
  FOR (r:Regulation) REQUIRE r.id IS UNIQUE;
CREATE CONSTRAINT clause_id IF NOT EXISTS
  FOR (c:Clause) REQUIRE c.id IS UNIQUE;
CREATE CONSTRAINT obligation_id IF NOT EXISTS
  FOR (o:Obligation) REQUIRE o.id IS UNIQUE;

// 全文索引(模糊查询)
CREATE FULLTEXT INDEX regulation_fulltext IF NOT EXISTS
  FOR (r:Regulation) ON EACH [r.title, r.code, r.domain];
CREATE FULLTEXT INDEX clause_fulltext IF NOT EXISTS
  FOR (c:Clause) ON EACH [c.content, c.title];

// 插入示例节点(验证连通性)
MERGE (d:Domain {name: 'vehicle_safety', label: '车辆安全法规'});
MERGE (d:Domain {name: 'data_security', label: '数据安全法规'});
MERGE (d:Domain {name: 'ehs', label: 'EHS安全法规'});
MERGE (d:Domain {name: 'carbon', label: '碳排放法规'});
RETURN '初始化完成' AS result;
CYPHER
ok "Neo4j 约束和索引初始化完成"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||||
|
echo -e "${GREEN} 数据库初始化完成!${NC}"
|
||||||
|
echo -e "${GREEN}══════════════════════════════════════════${NC}"
|
||||||
|
echo ""
|
||||||
|
echo " PostgreSQL: 所有表已创建"
|
||||||
|
echo " Milvus: regulation_chunks / doc_chunks / case_library"
|
||||||
|
echo " Neo4j: 约束 + 全文索引 + 基础域节点"
|
||||||
|
echo ""
|
||||||
|
echo "下一步:bash scripts/06_start_all.sh"
|
||||||
98
scripts/06_start_all.sh
Normal file
98
scripts/06_start_all.sh
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
# 06_start_all.sh
|
||||||
|
# 一键启动所有服务(完整流程)
|
||||||
|
# 用法:bash scripts/06_start_all.sh
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||||
|
cd "$PROJECT_DIR"
|
||||||
|
|
||||||
|
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||||
|
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||||
|
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||||
|
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||||
|
error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo -e "${BLUE}╔══════════════════════════════════════════╗${NC}"
|
||||||
|
echo -e "${BLUE}║ AI合规智能中枢 — 全服务启动 ║${NC}"
|
||||||
|
echo -e "${BLUE}╚══════════════════════════════════════════╝${NC}"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── 前置检查 ────────────────────────────────────
|
||||||
|
if [[ ! -f ".env" ]]; then
|
||||||
|
error ".env 文件不存在,请先运行:bash scripts/01_setup_project.sh"
|
||||||
|
fi
|
||||||
|
|
||||||
|
source .env 2>/dev/null || true
|
||||||
|
if [[ -z "${DEEPSEEK_API_KEY:-}" && -z "${DASHSCOPE_API_KEY:-}" ]]; then
|
||||||
|
warn "⚠️ 未设置 LLM API Key,LLM 功能将不可用"
|
||||||
|
warn "请在 .env 中设置 DEEPSEEK_API_KEY 或 DASHSCOPE_API_KEY"
|
||||||
|
echo ""
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Phase 1:基础设施 ────────────────────────────
|
||||||
|
info "Phase 1/4:启动基础设施..."
|
||||||
|
bash "$SCRIPT_DIR/03_start_infra.sh"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── Phase 2: build the service images ──
info "Phase 2/4:构建服务镜像(首次较慢)..."
# Filter the noisy build log without hiding a failed build: `|| true` is scoped
# to grep only (no matching line is not an error), so with pipefail the
# pipeline still fails when `docker compose build` itself fails.
if ! docker compose build embedding-service mcp-server compliance-backend 2>&1 | \
    { grep -E "(Step|Successfully|=>|ERROR)" || true; }; then
  error "镜像构建失败,请运行 bash scripts/04_build_services.sh 查看完整日志"
fi
ok "镜像构建完成"
echo ""
|
||||||
|
|
||||||
|
# ── Phase 3:初始化数据库 ────────────────────────
|
||||||
|
info "Phase 3/4:初始化数据库..."
|
||||||
|
bash "$SCRIPT_DIR/05_init_db.sh"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── Phase 4: start the model services and the business services ──

# Poll an HTTP endpoint until curl succeeds or the attempts run out.
#   $1  Display name used in messages.
#   $2  URL to request.
#   $3  Maximum number of attempts.
#   $4  Seconds to sleep between attempts.
# Returns 0 once the endpoint answers, 1 if it never did.
wait_http() {
  local name=$1 url=$2 attempts=$3 delay=$4
  local n
  for (( n = 1; n <= attempts; n++ )); do
    # --max-time bounds each probe so a hung connection cannot stall the loop.
    if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
      # Finish the line of progress dots before printing the result.
      if (( n > 1 )); then
        echo ""
      fi
      ok "$name 就绪"
      return 0
    fi
    echo -n "."
    sleep "$delay"
  done
  echo ""
  return 1
}

info "Phase 4/4:启动 AI 模型服务和业务服务..."
docker compose up -d embedding-service mcp-server
info "等待 AI 模型加载(BGE-M3/MinerU 约需 2-3 分钟)..."
sleep 30

# Wait for the embedding service. A timeout is reported explicitly instead of
# being silently ignored; start-up continues as before (best effort).
wait_http "embedding-service" "http://localhost:8010/health" 20 10 || \
  warn "embedding-service 在等待时间内未就绪,请检查:docker compose logs embedding-service"

docker compose up -d compliance-backend celery-worker celery-beat nginx
info "等待业务服务启动..."
sleep 15

wait_http "compliance-backend" "http://localhost:8000/health" 12 5 || \
  warn "compliance-backend 在等待时间内未就绪,请检查:docker compose logs compliance-backend"
|
||||||
|
|
||||||
|
# ── 最终状态 ────────────────────────────────────
|
||||||
|
echo ""
|
||||||
|
echo -e "${GREEN}╔══════════════════════════════════════════╗${NC}"
|
||||||
|
echo -e "${GREEN}║ 所有服务启动完成! ║${NC}"
|
||||||
|
echo -e "${GREEN}╚══════════════════════════════════════════╝${NC}"
|
||||||
|
echo ""
|
||||||
|
docker compose ps --format "table {{.Service}}\t{{.Status}}\t{{.Ports}}"
|
||||||
|
echo ""
|
||||||
|
echo -e "${BLUE}访问地址:${NC}"
|
||||||
|
echo " API 网关 : http://localhost"
|
||||||
|
echo " API 文档 : http://localhost/docs"
|
||||||
|
echo " Neo4j 浏览器 : http://localhost:7474"
|
||||||
|
echo " MinIO 控制台 : http://localhost:9001"
|
||||||
|
echo ""
|
||||||
|
echo -e "${YELLOW}运行冒烟测试:${NC}"
|
||||||
|
echo " bash scripts/07_smoke_test.sh"
|
||||||
183
scripts/07_smoke_test.sh
Normal file
183
scripts/07_smoke_test.sh
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
# 07_smoke_test.sh
|
||||||
|
# 端到端冒烟测试:验证三条业务闭环
|
||||||
|
# 用法:bash scripts/07_smoke_test.sh
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||||
|
cd "$PROJECT_DIR"
|
||||||
|
|
||||||
|
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||||
|
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||||
|
ok() { echo -e "${GREEN}[✓]${NC} $*"; }
|
||||||
|
fail() { echo -e "${RED}[✗]${NC} $*"; FAILED=$((FAILED+1)); }
|
||||||
|
warn() { echo -e "${YELLOW}[~]${NC} $*"; }
|
||||||
|
|
||||||
|
FAILED=0
|
||||||
|
API_BASE="http://localhost"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo -e "${BLUE}══════════════════════════════════════════${NC}"
|
||||||
|
echo -e "${BLUE} AI合规智能中枢 端到端冒烟测试${NC}"
|
||||||
|
echo -e "${BLUE}══════════════════════════════════════════${NC}"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── 基础健康检查 ────────────────────────────────
|
||||||
|
info "=== 基础设施健康检查 ==="
|
||||||
|
|
||||||
|
# Probe one HTTP endpoint and record the outcome.
#   $1  Display name of the service.
#   $2  URL to request.
# Calls `ok` when curl succeeds (with -f, HTTP error statuses count as failure);
# otherwise calls `fail`, which increments the FAILED counter.
check_service() {
  local name=$1
  local url=$2
  # Bound each probe so an endpoint that accepts the connection but never
  # answers cannot stall the whole smoke test.
  if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
    ok "$name"
  else
    fail "$name($url 不可达)"
  fi
}
|
||||||
|
|
||||||
|
check_service "API 网关 (Nginx)" "http://localhost/health"
|
||||||
|
check_service "业务后端 (FastAPI)" "http://localhost:8000/health"
|
||||||
|
check_service "嵌入服务 (BGE-M3)" "http://localhost:8010/health"
|
||||||
|
check_service "解析服务 (MinerU)" "http://localhost:8011/health"
|
||||||
|
check_service "Milvus HTTP" "http://localhost:9091/healthz"
|
||||||
|
check_service "Neo4j Browser" "http://localhost:7474"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── 嵌入服务测试 ────────────────────────────────
|
||||||
|
info "=== 嵌入服务测试 ==="
|
||||||
|
EMBED_RESP=$(curl -sf -X POST http://localhost:8010/embed \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"texts": ["GB 18384 电动汽车碰撞安全要求"], "batch_size": 1}' 2>/dev/null || echo "{}")
|
||||||
|
|
||||||
|
if echo "$EMBED_RESP" | python3 -c "import sys,json; d=json.load(sys.stdin); assert len(d.get('dense',[])[0])==1024" 2>/dev/null; then
|
||||||
|
ok "BGE-M3 嵌入:返回 1024 维向量"
|
||||||
|
else
|
||||||
|
fail "BGE-M3 嵌入失败,响应:${EMBED_RESP:0:200}"
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── 创建测试 PDF ────────────────────────────────
|
||||||
|
info "=== 创建测试文档 ==="
|
||||||
|
TEST_PDF="$PROJECT_DIR/data/uploads/test_regulation.txt"
|
||||||
|
cat > "$TEST_PDF" << 'EOF'
|
||||||
|
GB 18384-2020 电动汽车安全要求
|
||||||
|
|
||||||
|
第一章 总则
|
||||||
|
本标准规定了电动汽车的安全要求,适用于M1类纯电动汽车。
|
||||||
|
|
||||||
|
第二章 电气安全
|
||||||
|
2.1 绝缘电阻要求
|
||||||
|
直流电路绝缘电阻不得低于100Ω/V。
|
||||||
|
2.2 碰撞安全
|
||||||
|
车辆碰撞后,高压电系统应自动断电。
|
||||||
|
碰撞后5秒内,高压系统电压应降至60V以下。
|
||||||
|
|
||||||
|
第三章 防水要求
|
||||||
|
高压系统防护等级应达到IP67。
|
||||||
|
EOF
|
||||||
|
ok "测试文档创建:$TEST_PDF"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── Loop ①: file upload → vectorization → Q&A ──
info "=== 闭环①:法规入库 → 检索问答 ==="

# Create a workspace. Any curl or JSON failure collapses to an empty ID.
WORKSPACE_RESP=$(curl -sf -X POST "$API_BASE/api/kb/workspaces" \
  -H "Content-Type: application/json" \
  -d '{"name": "测试法规库", "domain": "vehicle_safety"}' 2>/dev/null || echo "{}")
WS_ID=$(echo "$WORKSPACE_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")

if [[ -n "$WS_ID" ]]; then
  ok "工作空间创建:$WS_ID"
else
  # The upload below is still attempted with a placeholder ID, so the message
  # states that rather than claiming the upload test is skipped.
  warn "工作空间创建失败(可能接口未完全实现),改用占位 ID 继续上传测试"
  WS_ID="test-workspace"
fi

# Upload the test document
UPLOAD_RESP=$(curl -sf -X POST "$API_BASE/api/kb/files/upload" \
  -F "file=@$TEST_PDF" \
  -F "workspace_id=$WS_ID" 2>/dev/null || echo "{}")
TASK_ID=$(echo "$UPLOAD_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('task_id',''))" 2>/dev/null || echo "")

if [[ -n "$TASK_ID" ]]; then
  ok "文件上传任务已创建:$TASK_ID"

  # Poll the task status (24 attempts × 5 s = at most 120 s)
  info "等待向量化完成..."
  TASK_STATUS="unknown"
  for i in {1..24}; do
    TASK_STATUS=$(curl -sf "$API_BASE/api/kb/tasks/$TASK_ID" 2>/dev/null | \
      python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" 2>/dev/null || echo "unknown")
    if [[ "$TASK_STATUS" == "completed" ]]; then
      ok "向量化完成(${i}×5s)"
      break
    elif [[ "$TASK_STATUS" == "failed" ]]; then
      fail "向量化失败"
      break
    fi
    echo -n "."
    sleep 5
  done
  echo ""
  # Report a timeout explicitly; otherwise the loop ends without any verdict.
  if [[ "$TASK_STATUS" != "completed" && "$TASK_STATUS" != "failed" ]]; then
    warn "向量化在 120 秒内未完成(当前状态:$TASK_STATUS)"
  fi

  # Retrieval Q&A
  QA_RESP=$(curl -sf -X POST "$API_BASE/api/kb/qa" \
    -H "Content-Type: application/json" \
    -d "{\"query\": \"碰撞后高压系统电压要求\", \"workspace_id\": \"$WS_ID\", \"top_k\": 3}" 2>/dev/null || echo "{}")
  ANSWER=$(echo "$QA_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('answer','')[:100])" 2>/dev/null || echo "")

  if [[ -n "$ANSWER" ]]; then
    ok "问答成功:${ANSWER}..."
  else
    warn "问答返回空(LLM API 可能未配置或响应缓慢)"
  fi
else
  warn "文件上传失败(接口可能未实现)"
fi
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── 闭环②:合规审查 ────────────────────────────
|
||||||
|
info "=== 闭环②:文档上传 → 合规审查 ==="
|
||||||
|
|
||||||
|
CHECK_RESP=$(curl -sf -X POST "$API_BASE/api/compliance/check" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"query": "供应商文件是否符合GB 18384碰撞安全要求", "domains": ["vehicle_safety"]}' 2>/dev/null || echo "{}")
|
||||||
|
RISK=$(echo "$CHECK_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('risk_level','unknown'))" 2>/dev/null || echo "unknown")
|
||||||
|
|
||||||
|
if [[ "$RISK" != "unknown" && -n "$RISK" ]]; then
|
||||||
|
ok "合规审查完成,风险等级:$RISK"
|
||||||
|
else
|
||||||
|
warn "合规审查接口返回空(功能可能未完全实现)"
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── 闭环③:法规监控 ────────────────────────────
|
||||||
|
info "=== 闭环③:法规监控源配置 ==="
|
||||||
|
|
||||||
|
SOURCE_RESP=$(curl -sf -X POST "$API_BASE/api/regulation/sources" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"name": "测试监控源", "url": "https://std.samr.gov.cn", "domain": "vehicle_safety"}' 2>/dev/null || echo "{}")
|
||||||
|
SOURCE_ID=$(echo "$SOURCE_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
|
||||||
|
|
||||||
|
if [[ -n "$SOURCE_ID" ]]; then
|
||||||
|
ok "监控源配置成功:$SOURCE_ID"
|
||||||
|
else
|
||||||
|
warn "监控源配置返回空(功能可能未完全实现)"
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── Summary ──
echo ""
echo -e "${BLUE}══════════════════════════════════════════${NC}"
if [[ $FAILED -eq 0 ]]; then
  echo -e "${GREEN}  全部检查通过!${NC}"
else
  echo -e "${YELLOW}  完成,${FAILED} 项失败${NC}(部分功能可能尚未实现)"
fi
echo -e "${BLUE}══════════════════════════════════════════${NC}"
echo ""
echo "查看服务日志:"
echo "  docker compose logs -f compliance-backend"
echo "  docker compose logs -f celery-worker"

# Reflect the result in the exit status so callers (CI, wrapper scripts) can
# detect a failed smoke test. Only checks reported through `fail` count;
# warnings do not.
[[ $FAILED -eq 0 ]] || exit 1
|
||||||
66
scripts/check_health.sh
Normal file
66
scripts/check_health.sh
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
# check_health.sh
|
||||||
|
# 检查所有服务的健康状态和资源使用
|
||||||
|
# 用法:bash scripts/check_health.sh
|
||||||
|
# ══════════════════════════════════════════════════
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||||
|
cd "$PROJECT_DIR"
|
||||||
|
|
||||||
|
GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo -e "${BLUE}══════════════════════════════════════════${NC}"
|
||||||
|
echo -e "${BLUE} 服务健康检查报告${NC}"
|
||||||
|
echo -e "${BLUE}══════════════════════════════════════════${NC}"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Docker 服务状态
|
||||||
|
echo -e "${BLUE}【Docker Compose 服务状态】${NC}"
|
||||||
|
docker compose ps --format "table {{.Service}}\t{{.Status}}\t{{.Ports}}"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# HTTP 端点检查
|
||||||
|
echo -e "${BLUE}【HTTP 健康端点】${NC}"
|
||||||
|
# Probe one HTTP endpoint (5-second limit) and print a coloured status line.
#   $1  Display name of the service.
#   $2  URL to request.
check_http() {
  local name=$1
  local url=$2
  # Assume failure; switch the tag only when curl succeeds.
  local tag="${RED}[FAIL]${NC}"

  if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
    tag="${GREEN}[OK]${NC}"
  fi
  echo -e " ${tag} $name ($url)"
}
|
||||||
|
|
||||||
|
check_http "API 网关" "http://localhost/health"
|
||||||
|
check_http "业务后端" "http://localhost:8000/health"
|
||||||
|
check_http "嵌入服务" "http://localhost:8010/health"
|
||||||
|
check_http "解析服务" "http://localhost:8011/health"
|
||||||
|
check_http "Milvus" "http://localhost:9091/healthz"
|
||||||
|
check_http "Neo4j" "http://localhost:7474"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# 资源使用
|
||||||
|
echo -e "${BLUE}【容器资源使用】${NC}"
|
||||||
|
docker stats --no-stream --format \
|
||||||
|
"table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" \
|
||||||
|
2>/dev/null | head -15
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# 磁盘使用
|
||||||
|
echo -e "${BLUE}【磁盘使用】${NC}"
|
||||||
|
df -h . | tail -1 | awk '{print " 项目目录:已用 "$3",可用 "$4"(" $5 " 使用率)"}'
|
||||||
|
docker system df 2>/dev/null | head -6
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# LLM configuration check
echo -e "${BLUE}【LLM API 配置】${NC}"
# Best effort: a missing or unparsable .env must not stop the report.
source .env 2>/dev/null || true

# Succeeds only for a usable API key. The template ships "sk-xxxx…" placeholders,
# so a value that is empty or consists solely of "x" after the "sk-" prefix is
# treated as not configured.
is_real_key() {
  local key=${1:-}
  [[ -n "$key" && ! "$key" =~ ^sk-[xX]+$ ]]
}

if is_real_key "${DEEPSEEK_API_KEY:-}"; then
  echo -e " ${GREEN}[OK]${NC} DeepSeek API Key 已配置"
elif is_real_key "${DASHSCOPE_API_KEY:-}"; then
  echo -e " ${GREEN}[OK]${NC} DashScope (Qwen) API Key 已配置"
else
  echo -e " ${YELLOW}[WARN]${NC} 未配置 LLM API Key(LLM 功能不可用)"
fi
echo ""
|
||||||
91
scripts/download_models.sh
Normal file
91
scripts/download_models.sh
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
#!/usr/bin/env bash
# ══════════════════════════════════════════════════
#  download_models.sh
#  Pre-download the AI models into ./models (speeds up container start-up).
#  Supports a HuggingFace mirror for networks inside mainland China.
#  Usage: bash scripts/download_models.sh
# ══════════════════════════════════════════════════
set -euo pipefail

# Resolve the project root from this script's own location so the script
# behaves the same regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_DIR"

# ANSI colour codes and small logging helpers.
BLUE='\033[0;34m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m'
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
ok()   { echo -e "${GREEN}[OK]${NC}   $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }

MODELS_DIR="$PROJECT_DIR/models"
mkdir -p "$MODELS_DIR"

# Mirror configuration: keep a caller-provided HF_ENDPOINT, otherwise fall
# back to hf-mirror.com. HF_HOME points the HuggingFace cache at ./models.
export HF_ENDPOINT="${HF_ENDPOINT:-https://hf-mirror.com}"
export HF_HOME="$MODELS_DIR"
info "HuggingFace 镜像:$HF_ENDPOINT"
info "模型保存路径:$MODELS_DIR"
echo ""
|
||||||
|
|
||||||
|
# ── Method 1: download through huggingface_hub ────────────
# download_hf REPO [MODELSCOPE_NAME]
#   Downloads REPO into $MODELS_DIR with huggingface_hub.snapshot_download.
#   If that fails, falls back to download_modelscope using MODELSCOPE_NAME
#   (defaults to REPO).
download_hf() {
    local repo=$1
    local local_name=${2:-$1}
    info "下载 $repo..."
    # The repo id and cache directory are handed to Python through argv and
    # the mirror settings through the environment, instead of being spliced
    # into the Python source text: a quote or backslash in a path can then
    # no longer break (or inject into) the inline program.
    if HF_ENDPOINT="$HF_ENDPOINT" HF_HOME="$MODELS_DIR" python3 -c '
import sys
from huggingface_hub import snapshot_download

snapshot_download(repo_id=sys.argv[1], cache_dir=sys.argv[2])
print("下载完成")
' "$repo" "$MODELS_DIR" 2>&1; then
        ok "$repo 下载成功"
    else
        warn "$repo HuggingFace 下载失败,尝试 ModelScope..."
        download_modelscope "$repo" "$local_name"
    fi
}
|
||||||
|
|
||||||
|
# ── Method 2: download through ModelScope (fallback) ──────
# download_modelscope HF_NAME [MODELSCOPE_NAME]
#   Downloads the model into $MODELS_DIR/modelscope. MODELSCOPE_NAME
#   defaults to HF_NAME. Never aborts the script: on any failure it only
#   emits a warning.
download_modelscope() {
    local ms_name=${2:-$1}
    # The inline program exits non-zero when ModelScope is missing or the
    # download fails. Previously every error was caught and only printed, so
    # the interpreter always exited 0 and the warning below could never fire.
    # Arguments travel via argv rather than being interpolated into the code.
    python3 -c '
import sys

try:
    from modelscope import snapshot_download
except ImportError:
    print("ModelScope 未安装,跳过")
    sys.exit(1)

try:
    snapshot_download(model_id=sys.argv[1], cache_dir=sys.argv[2])
except Exception as e:
    print(f"ModelScope 下载失败: {e}")
    sys.exit(1)

print("ModelScope 下载完成")
' "$ms_name" "${MODELS_DIR}/modelscope" 2>&1 || warn "ModelScope 下载也失败,模型将在容器启动时自动下载"
}
|
||||||
|
|
||||||
|
# ── Check the Python environment ────────────────────────────
# If huggingface_hub cannot be imported, try to install it (together with
# modelscope). A failed install is only warned about, not fatal.
if ! python3 -c "import huggingface_hub" 2>/dev/null; then
    warn "未安装 huggingface_hub,尝试安装..."
    pip3 install -q huggingface_hub modelscope 2>/dev/null || \
        warn "安装失败,模型将在容器首次启动时下载"
fi
|
||||||
|
|
||||||
|
# ── Model download list ────────────────────────────────
info "=== 下载 BGE-M3 嵌入模型(约 2.5GB)==="
download_hf "BAAI/bge-m3" "BAAI/bge-m3"
echo ""

info "=== 下载 BGE-Reranker 精排模型(约 1.1GB)==="
download_hf "BAAI/bge-reranker-v2-m3" "BAAI/bge-reranker-v2-m3"
echo ""

# MinerU models are downloaded by a script inside the container
# (they depend on the magic-pdf configuration).
info "=== MinerU 模型说明 ==="
warn "MinerU 模型(约 2GB)将在 mcp-server 容器首次启动时自动下载"
warn "如需预下载,请在 mcp-server 容器内运行:mineru-models-download"
echo ""

# NOTE(review): this banner is printed unconditionally — it does not check
# whether the downloads above actually succeeded.
echo -e "${GREEN}══════════════════════════════════════════${NC}"
echo -e "${GREEN}  模型下载完成!${NC}"
echo -e "${GREEN}══════════════════════════════════════════${NC}"
echo ""
echo "已下载到:$MODELS_DIR"
# Best-effort size report; never fails the script.
du -sh "$MODELS_DIR" 2>/dev/null || true
|
||||||
37
scripts/reset_all.sh
Normal file
37
scripts/reset_all.sh
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
#!/usr/bin/env bash
# ══════════════════════════════════════════════════
#  reset_all.sh
#  DANGER: stops every service and deletes ALL data. Use with care.
#  Usage: bash scripts/reset_all.sh
# ══════════════════════════════════════════════════
# Abort on the first failing command, on unset variables and on pipeline
# errors. Without this, a failed `cd` (or a failed `docker compose down`)
# would still fall through to the `rm -rf` below.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_DIR"

RED='\033[0;31m'; YELLOW='\033[1;33m'; NC='\033[0m'

echo ""
echo -e "${RED}╔══════════════════════════════════════════╗${NC}"
echo -e "${RED}║  ⚠️  警告:此操作将删除所有数据!           ║${NC}"
echo -e "${RED}║  包括:PostgreSQL / Milvus / Neo4j 数据   ║${NC}"
echo -e "${RED}║  以及所有上传的文件和日志                   ║${NC}"
echo -e "${RED}╚══════════════════════════════════════════╝${NC}"
echo ""
echo -e "${YELLOW}确认要重置所有数据吗?(输入 'yes' 确认,其他取消)${NC}"
# EOF on stdin (e.g. a non-interactive run) counts as "not confirmed".
read -r CONFIRM || CONFIRM=""

if [[ "$CONFIRM" != "yes" ]]; then
    echo "已取消"
    exit 0
fi

echo ""
echo "停止所有服务..."
docker compose down --volumes --remove-orphans

echo "清理数据目录..."
# Absolute paths anchored at PROJECT_DIR; `:?` aborts if the variable is
# ever empty so the globs can never expand relative to `/`.
rm -rf -- "${PROJECT_DIR:?}"/data/uploads/* "${PROJECT_DIR:?}"/data/parsed/* "${PROJECT_DIR:?}"/logs/*
echo "✓ 数据目录已清空(保留目录结构)"

echo ""
echo -e "${YELLOW}重置完成。重新启动:bash scripts/06_start_all.sh${NC}"
|
||||||
24
services/compliance-backend/Dockerfile
Normal file
24
services/compliance-backend/Dockerfile
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# Image for the compliance backend API.
FROM python:3.12-slim

WORKDIR /app

# curl is used by the HEALTHCHECK below; the apt lists are removed in the
# same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Use uv to speed up dependency installation.
RUN pip install uv --no-cache-dir

# The dependency manifest is copied before the application code so this
# layer is only rebuilt when pyproject.toml changes.
COPY pyproject.toml .
RUN uv pip install --system --no-cache -r pyproject.toml \
    --index-url https://pypi.tuna.tsinghua.edu.cn/simple \
    --trusted-host pypi.tuna.tsinghua.edu.cn

COPY app/ ./app/

# The container is reported healthy when GET /health answers with a 2xx/3xx.
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=5 \
    CMD curl -f http://localhost:8000/health || exit 1

EXPOSE 8000

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
|
||||||
0
services/compliance-backend/app/__init__.py
Normal file
0
services/compliance-backend/app/__init__.py
Normal file
0
services/compliance-backend/app/api/__init__.py
Normal file
0
services/compliance-backend/app/api/__init__.py
Normal file
95
services/compliance-backend/app/api/compliance.py
Normal file
95
services/compliance-backend/app/api/compliance.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
import uuid
|
||||||
|
import logging
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from langchain.schema import HumanMessage, SystemMessage
|
||||||
|
|
||||||
|
from ..core.llm import get_llm, COMPLIANCE_CHECK_PROMPT
|
||||||
|
from ..services.rag import hybrid_search
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
router = APIRouter(prefix="/api/compliance", tags=["合规审查"])
|
||||||
|
|
||||||
|
|
||||||
|
class ComplianceCheckRequest(BaseModel):
    """Request body of ``POST /api/compliance/check``."""

    # Text to be reviewed for compliance.
    query: str
    # Regulation domains the caller is interested in.
    regulation_domains: list[str] = ["vehicle_safety"]
    # Maximum number of regulation chunks used as context.
    top_k: int = 5
|
||||||
|
|
||||||
|
|
||||||
|
class ComplianceCheckResponse(BaseModel):
    """Response body of ``POST /api/compliance/check``."""

    # Overall risk label assigned to the reviewed content.
    risk_level: str
    # Numeric risk score accompanying ``risk_level``.
    risk_score: float
    # Individual findings (free-form dictionaries).
    findings: list[dict]
    # Remediation advice.
    recommendations: list[str]
    # Regulation excerpts the assessment was based on.
    sources: list[dict]
|
||||||
|
|
||||||
|
|
||||||
|
# Default score reported for each risk level when the LLM states no usable score.
_RISK_DEFAULT_SCORES = {"critical": 90.0, "high": 70.0, "medium": 50.0, "low": 20.0}


def _parse_risk(analysis: str) -> tuple[str, float]:
    """Extract ``(risk_level, risk_score)`` from the LLM's free-text answer.

    The answer is expected to follow the format requested by the prompt
    ("风险等级:<level>" / "风险分数:<0-100>"). Those two fields are read
    first; only when the level line is missing does the function fall back
    to keyword spotting.
    """
    import re

    lowered = analysis.lower()

    # `(?![a-z/])` rejects an echoed template such as "[low/medium/high/critical]".
    declared = re.search(
        r"风险等级[\s*]*[::][\s*]*\[?\s*(critical|high|medium|low)(?![a-z/])",
        lowered,
    )
    if declared:
        level = declared.group(1)
    else:
        # The requested output format itself contains the heading "严重程度",
        # so that phrase must not count as evidence of a critical risk.
        without_heading = analysis.replace("严重程度", "")
        if "critical" in lowered or "严重" in without_heading:
            level = "critical"
        elif "high" in lowered or "高风险" in analysis:
            level = "high"
        elif "low" in lowered or "低风险" in analysis:
            level = "low"
        else:
            level = "medium"

    score = _RISK_DEFAULT_SCORES[level]
    # The trailing look-ahead rejects an echoed "[0-100]" and over-long numbers.
    stated = re.search(
        r"风险分数[\s*]*[::][\s*]*\[?\s*(\d{1,3}(?:\.\d+)?)(?!\s*[-\d.])",
        lowered,
    )
    if stated:
        value = float(stated.group(1))
        if 0.0 <= value <= 100.0:
            score = value
    return level, score


@router.post("/check", response_model=ComplianceCheckResponse)
async def check_compliance(req: ComplianceCheckRequest):
    """Assess the submitted content against the regulation knowledge base.

    Relevant regulation chunks are retrieved, passed to the LLM together
    with the content, and the LLM's answer is turned into a risk level and
    score.

    Returns a response with ``risk_level="unknown"`` and score 0 when no
    regulation chunk is found or when the LLM call fails; the failure text
    is still returned in ``findings``.
    """
    # NOTE(review): the search call takes no domain argument, so issuing it
    # once per requested domain only repeated the identical query. It is
    # issued once here; restore a per-domain loop if hybrid_search gains a
    # domain filter.
    all_chunks = []
    if req.regulation_domains:
        all_chunks = list(
            await hybrid_search(
                req.query,
                collection_name="regulation_chunks",
                top_k=req.top_k,
            )
        )

    # De-duplicate by chunk id, best score first.
    seen = set()
    unique_chunks = []
    for c in sorted(all_chunks, key=lambda x: x["score"], reverse=True):
        if c["id"] not in seen:
            seen.add(c["id"])
            unique_chunks.append(c)
    top_chunks = unique_chunks[:req.top_k]

    if not top_chunks:
        return ComplianceCheckResponse(
            risk_level="unknown",
            risk_score=0,
            findings=[{"issue": "未找到相关法规,请先上传法规文档"}],
            recommendations=["上传相关法规文档到知识库后重试"],
            sources=[],
        )

    # Build the regulation context (each chunk capped at 500 characters).
    regulations_text = "\n\n".join(
        f"[{i+1}] {c['content'][:500]}" for i, c in enumerate(top_chunks)
    )

    prompt = COMPLIANCE_CHECK_PROMPT.format(
        content=req.query,
        regulations=regulations_text,
    )

    llm = get_llm(temperature=0.0)
    llm_failed = False
    try:
        response = await llm.ainvoke([HumanMessage(content=prompt)])
        content = response.content
        analysis = content if isinstance(content, str) else str(content)
    except Exception as e:
        logger.exception("LLM 合规分析失败:%s", e)
        analysis = f"LLM 分析失败:{e}"
        llm_failed = True

    if llm_failed:
        # Do not derive a risk rating from an error message.
        risk_level, risk_score = "unknown", 0.0
    else:
        risk_level, risk_score = _parse_risk(analysis)

    return ComplianceCheckResponse(
        risk_level=risk_level,
        risk_score=risk_score,
        findings=[{"analysis": analysis}],
        recommendations=["请参考上述分析进行整改"],
        sources=[{"content": c["content"][:200], "score": c["score"]} for c in top_chunks],
    )
|
||||||
114
services/compliance-backend/app/api/kb.py
Normal file
114
services/compliance-backend/app/api/kb.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
import uuid
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, UploadFile, File, Form, HTTPException, BackgroundTasks
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from sqlalchemy import select
|
||||||
|
|
||||||
|
from ..core.deps import get_db
|
||||||
|
from ..models.db import Workspace, File as FileRecord, Task
|
||||||
|
from ..services.rag import hybrid_search, rerank, generate_answer
|
||||||
|
from ..worker import process_file_task
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
router = APIRouter(prefix="/api/kb", tags=["知识库"])
|
||||||
|
|
||||||
|
UPLOAD_DIR = Path("/app/data/uploads")
|
||||||
|
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
class WorkspaceCreate(BaseModel):
    """Request body of ``POST /api/kb/workspaces``."""

    # Display name of the workspace (required).
    name: str
    # Optional free-text description.
    description: str = ""
    # Domain label stored with the workspace.
    domain: str = "general"
|
||||||
|
|
||||||
|
|
||||||
|
class QARequest(BaseModel):
    """Request body shared by the ``/qa`` and ``/knowledge/retrieval`` endpoints."""

    # Question or search text.
    query: str
    # Optional workspace id forwarded to the search.
    workspace_id: str | None = None
    # Number of chunks to return / to ground the answer on.
    top_k: int = 5
    # Only read by ``/qa``: when False the "sources" key is dropped.
    return_sources: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/workspaces")
async def create_workspace(req: WorkspaceCreate, db: AsyncSession = Depends(get_db)):
    """Create a workspace and return its id, name and domain."""
    workspace = Workspace(
        name=req.name,
        description=req.description,
        domain=req.domain,
    )
    db.add(workspace)
    # Flush inside the request's transaction so the INSERT is issued before
    # the response is built.
    await db.flush()
    return {
        "id": str(workspace.id),
        "name": workspace.name,
        "domain": workspace.domain,
    }
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/files/upload")
async def upload_file(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    workspace_id: str = Form(default=""),
    db: AsyncSession = Depends(get_db),
):
    """Store an uploaded file, record it, and queue it for parsing/vectorizing.

    Returns the new ``file_id``, the ``task_id`` to poll, and a status of
    ``"processing"``.

    Raises:
        HTTPException(400): ``workspace_id`` is not a valid UUID.
        HTTPException(503): the task could not be handed to the Celery broker.
    """
    # Validate before touching the disk so a bad id neither produces a 500
    # nor leaves an orphaned file behind.
    try:
        workspace_uuid = uuid.UUID(workspace_id) if workspace_id else None
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="workspace_id 格式无效") from exc

    content = await file.read()
    file_uuid = uuid.uuid4()
    file_id = str(file_uuid)
    suffix = Path(file.filename or "doc").suffix
    save_path = UPLOAD_DIR / f"{file_id}{suffix}"
    save_path.write_bytes(content)

    file_record = FileRecord(
        id=file_uuid,
        filename=f"{file_id}{suffix}",
        original_name=file.filename or "unknown",
        file_type=suffix.lstrip("."),
        file_size=len(content),
        storage_path=str(save_path),
        workspace_id=workspace_uuid,
        status="uploaded",
    )
    db.add(file_record)

    task = Task(
        task_type="parse_and_vectorize",
        status="pending",
        file_id=file_uuid,
        payload={"workspace_id": workspace_id},
    )
    db.add(task)
    await db.flush()
    task_id = str(task.id)

    # Commit BEFORE dispatching: the worker runs in another process and may
    # pick the job up immediately, so the file/task rows must already be
    # visible to it. (Previously the commit only happened after the handler
    # returned.)
    await db.commit()

    try:
        celery_task = process_file_task.delay(file_id, task_id, workspace_id)
    except Exception as exc:
        # The rows are already committed; mark the task failed so it does
        # not stay "pending" forever.
        logger.exception("Celery 任务投递失败")
        task.status = "failed"
        task.error_msg = f"任务投递失败:{exc}"
        await db.commit()
        raise HTTPException(status_code=503, detail="任务队列不可用,请稍后重试") from exc

    task.celery_task_id = celery_task.id
    await db.flush()

    return {"file_id": file_id, "task_id": task_id, "status": "processing"}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/tasks/{task_id}")
async def get_task(task_id: str, db: AsyncSession = Depends(get_db)):
    """Return the status and progress of a processing task.

    Raises:
        HTTPException(404): the id is malformed or no such task exists.
    """
    # A malformed id cannot match any row; answer 404 instead of letting the
    # ValueError surface as a 500.
    try:
        task_uuid = uuid.UUID(task_id)
    except ValueError as exc:
        raise HTTPException(status_code=404, detail="任务不存在") from exc

    result = await db.execute(select(Task).where(Task.id == task_uuid))
    task = result.scalar_one_or_none()
    if not task:
        raise HTTPException(status_code=404, detail="任务不存在")
    return {
        "task_id": str(task.id),
        "status": task.status,
        "progress": task.progress,
        "file_id": str(task.file_id) if task.file_id else None,
        "error_msg": task.error_msg,
        "completed_at": task.completed_at.isoformat() if task.completed_at else None,
    }
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/qa")
async def qa(req: QARequest):
    """Answer a question: search, rerank, then generate an answer.

    Twice ``top_k`` candidates are retrieved and the reranker narrows them
    down to ``top_k``. The "sources" key is removed from the result when the
    caller set ``return_sources`` to False.
    """
    candidates = await hybrid_search(
        req.query,
        workspace_id=req.workspace_id,
        top_k=req.top_k * 2,
    )
    best = await rerank(req.query, candidates, top_k=req.top_k)
    answer = await generate_answer(req.query, best)
    if req.return_sources:
        return answer
    answer.pop("sources", None)
    return answer
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/knowledge/retrieval")
async def retrieval(req: QARequest):
    """Return the raw search hits for a query, without reranking or generation."""
    hits = await hybrid_search(
        req.query,
        workspace_id=req.workspace_id,
        top_k=req.top_k,
    )
    return {"chunks": hits, "total": len(hits)}
|
||||||
111
services/compliance-backend/app/api/regulation.py
Normal file
111
services/compliance-backend/app/api/regulation.py
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
import uuid
|
||||||
|
import logging
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from sqlalchemy import select, desc
|
||||||
|
|
||||||
|
from ..core.deps import get_db
|
||||||
|
from ..models.db import RegulationSource, RegulationUpdate
|
||||||
|
from ..worker import fetch_regulation_source
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
router = APIRouter(prefix="/api/regulation", tags=["法规监控"])
|
||||||
|
|
||||||
|
|
||||||
|
class SourceCreate(BaseModel):
    """Request body of ``POST /api/regulation/sources``."""

    # Display name of the monitored source.
    name: str
    # Address to fetch.
    url: str
    # Regulation domain label stored with the source.
    domain: str = "vehicle_safety"
    # Fetch interval; the default 86400 suggests seconds (one day) — TODO confirm unit.
    fetch_interval: int = 86400
    # Free-form fetch options, stored as given.
    fetch_config: dict = {}
|
||||||
|
|
||||||
|
|
||||||
|
class SubscribeRequest(BaseModel):
    """Request body of ``POST /api/regulation/subscribe``."""

    # Subscription name.
    name: str
    # Delivery channel: email / webhook / feishu / dingtalk
    channel: str
    # Channel-specific destination.
    target: str
    # Domains to subscribe to.
    domains: list[str] = []
    # Minimum importance of updates to deliver.
    importance_min: str = "normal"
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sources")
async def create_source(req: SourceCreate, db: AsyncSession = Depends(get_db)):
    """Register a regulation source to monitor and return its summary.

    The returned "status" is always the literal "active".
    """
    new_source = RegulationSource(
        name=req.name,
        url=req.url,
        domain=req.domain,
        fetch_interval=req.fetch_interval,
        fetch_config=req.fetch_config,
    )
    db.add(new_source)
    await db.flush()

    summary = {
        "id": str(new_source.id),
        "name": new_source.name,
        "url": new_source.url,
        "domain": new_source.domain,
        "status": "active",
    }
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sources")
async def list_sources(db: AsyncSession = Depends(get_db)):
    """List every regulation source whose ``is_active`` flag is true."""
    # `== True` is intentional: it builds a SQL comparison, not a Python one.
    stmt = select(RegulationSource).where(RegulationSource.is_active == True)  # noqa: E712
    rows = (await db.execute(stmt)).scalars().all()

    listing = []
    for row in rows:
        listing.append(
            {"id": str(row.id), "name": row.name, "url": row.url, "domain": row.domain}
        )
    return listing
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sources/{source_id}/fetch")
async def manual_fetch(source_id: str, db: AsyncSession = Depends(get_db)):
    """Manually queue a fetch for one monitored source (intended for testing).

    Raises:
        HTTPException(404): the id is malformed or no such source exists.
    """
    # A malformed id cannot match any row; answer 404 instead of letting the
    # ValueError surface as a 500.
    try:
        source_uuid = uuid.UUID(source_id)
    except ValueError as exc:
        raise HTTPException(status_code=404, detail="监控源不存在") from exc

    result = await db.execute(
        select(RegulationSource).where(RegulationSource.id == source_uuid)
    )
    source = result.scalar_one_or_none()
    if not source:
        raise HTTPException(status_code=404, detail="监控源不存在")

    task = fetch_regulation_source.delay(source_id)
    return {"task_id": task.id, "status": "queued", "source_id": source_id}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/updates")
async def get_updates(
    domain: str | None = None,
    limit: int = 20,
    offset: int = 0,
    db: AsyncSession = Depends(get_db),
):
    """Return regulation updates, newest fetch first.

    Args:
        domain: when given, only updates whose source belongs to this domain
            are returned.
        limit: page size.
        offset: number of rows to skip.
    """
    query = select(RegulationUpdate).order_by(desc(RegulationUpdate.fetched_at))
    if domain:
        # The domain lives on the source row, so filter through a join.
        # (The parameter used to be accepted but silently ignored.)
        query = query.join(
            RegulationSource, RegulationUpdate.source_id == RegulationSource.id
        ).where(RegulationSource.domain == domain)

    result = await db.execute(query.limit(limit).offset(offset))
    updates = result.scalars().all()
    return {
        "updates": [
            {
                "id": str(u.id),
                "title": u.title,
                "url": u.url,
                "change_type": u.change_type,
                "summary": u.summary,
                "importance": u.importance,
                "fetched_at": u.fetched_at.isoformat() if u.fetched_at else None,
            }
            for u in updates
        ]
    }
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/subscribe")
async def subscribe(req: SubscribeRequest, db: AsyncSession = Depends(get_db)):
    """Acknowledge a subscription request.

    Simplified placeholder: the request is echoed back with a freshly
    generated id and status "active".
    """
    # NOTE(review): nothing is written to the database here — `db` is unused
    # and the returned id is not stored anywhere. The original comment says
    # the push logic lives in a separate push-worker; confirm where
    # subscriptions are meant to be persisted.
    return {
        "id": str(uuid.uuid4()),
        "name": req.name,
        "channel": req.channel,
        "domains": req.domains,
        "status": "active",
    }
|
||||||
0
services/compliance-backend/app/core/__init__.py
Normal file
0
services/compliance-backend/app/core/__init__.py
Normal file
37
services/compliance-backend/app/core/config.py
Normal file
37
services/compliance-backend/app/core/config.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
    """Application configuration.

    Values are loaded by pydantic-settings, with ``.env`` configured as the
    env file; variables not declared here are ignored. The literals below
    are the fallbacks used when nothing overrides them.
    """

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    # --- Application ---
    app_env: str = "development"
    log_level: str = "INFO"
    api_secret_key: str = "change_this_key"

    # --- Databases ---
    database_url: str = "postgresql+asyncpg://compliance:compliance123@postgres:5432/compliance_db"
    redis_url: str = "redis://:redis123@redis:6379/0"

    # --- Milvus ---
    milvus_host: str = "milvus"
    milvus_port: int = 19530

    # --- Neo4j ---
    neo4j_uri: str = "bolt://neo4j:7687"
    neo4j_user: str = "neo4j"
    neo4j_password: str = "neo4j123"

    # --- AI services ---
    embedding_service_url: str = "http://embedding-service:8010"
    mcp_server_url: str = "http://mcp-server:8011"

    # --- LLM ---
    llm_provider: str = "deepseek"  # deepseek / qwen
    deepseek_api_key: str = ""
    deepseek_model: str = "deepseek-chat"
    dashscope_api_key: str = ""
    qwen_model: str = "qwen-plus"


# Module-level singleton imported by the rest of the application.
settings = Settings()
|
||||||
54
services/compliance-backend/app/core/deps.py
Normal file
54
services/compliance-backend/app/core/deps.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
from functools import lru_cache
|
||||||
|
from typing import AsyncGenerator
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from neo4j import AsyncGraphDatabase
|
||||||
|
from pymilvus import connections, Collection
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
|
||||||
|
|
||||||
|
from .config import settings
|
||||||
|
|
||||||
|
# ── PostgreSQL ──────────────────────────────────
|
||||||
|
engine = create_async_engine(settings.database_url, pool_size=10, max_overflow=20)
|
||||||
|
AsyncSessionLocal = async_sessionmaker(engine, expire_on_commit=False)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_db() -> AsyncGenerator[AsyncSession, None]:
    """Yield a database session for one unit of work.

    The session is committed once the consumer finishes normally; if an
    exception is raised (by the consumer or by the commit) the session is
    rolled back and the exception re-raised.
    """
    async with AsyncSessionLocal() as db_session:
        try:
            yield db_session
            await db_session.commit()
        except Exception:
            await db_session.rollback()
            raise
|
||||||
|
|
||||||
|
|
||||||
|
# ── Milvus ──────────────────────────────────────
|
||||||
|
def get_milvus_collection(name: str) -> Collection:
    """Connect to the configured Milvus server and return the named collection."""
    connections.connect(
        host=settings.milvus_host,
        port=settings.milvus_port,
    )
    collection = Collection(name)
    return collection
|
||||||
|
|
||||||
|
|
||||||
|
# ── Neo4j ───────────────────────────────────────
|
||||||
|
# Lazily created, process-wide Neo4j driver (see get_neo4j).
_neo4j_driver = None


def get_neo4j():
    """Return the shared async Neo4j driver, creating it on first use."""
    global _neo4j_driver
    if _neo4j_driver is not None:
        return _neo4j_driver

    credentials = (settings.neo4j_user, settings.neo4j_password)
    _neo4j_driver = AsyncGraphDatabase.driver(settings.neo4j_uri, auth=credentials)
    return _neo4j_driver
|
||||||
|
|
||||||
|
|
||||||
|
# ── HTTP 客户端(复用连接池)────────────────────
|
||||||
|
# Lazily created, process-wide HTTP client (see get_http_client).
_http_client = None


def get_http_client() -> httpx.AsyncClient:
    """Return the shared ``httpx.AsyncClient`` (120 s timeout), creating it on first use."""
    global _http_client
    if _http_client is not None:
        return _http_client

    _http_client = httpx.AsyncClient(timeout=120.0)
    return _http_client
|
||||||
56
services/compliance-backend/app/core/llm.py
Normal file
56
services/compliance-backend/app/core/llm.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
from langchain_openai import ChatOpenAI
|
||||||
|
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||||
|
from .config import settings
|
||||||
|
|
||||||
|
|
||||||
|
def get_llm(temperature: float = 0.1) -> ChatOpenAI:
    """Build the chat client for the configured LLM provider.

    Both supported providers (DeepSeek and Qwen) are reached through an
    OpenAI-compatible endpoint, so only model, key and base URL differ.

    Raises:
        ValueError: ``settings.llm_provider`` is neither "deepseek" nor "qwen".
    """
    provider = settings.llm_provider
    if provider == "deepseek":
        model_name = settings.deepseek_model
        key = settings.deepseek_api_key
        endpoint = "https://api.deepseek.com/v1"
    elif provider == "qwen":
        model_name = settings.qwen_model
        key = settings.dashscope_api_key
        endpoint = "https://dashscope.aliyuncs.com/compatible-mode/v1"
    else:
        raise ValueError(f"不支持的 LLM 提供商:{settings.llm_provider}")

    return ChatOpenAI(
        model=model_name,
        api_key=key,
        base_url=endpoint,
        temperature=temperature,
        max_retries=3,
        timeout=120,
    )
|
||||||
|
|
||||||
|
|
||||||
|
RAG_SYSTEM_PROMPT = """你是一位专业的汽车行业合规专家,具备深厚的法规知识(GB标准、UN-ECE、ISO 45001、IATF 16949等)。
|
||||||
|
|
||||||
|
回答规则:
|
||||||
|
1. 仅基于提供的参考文献回答,不添加不在文献中的信息
|
||||||
|
2. 每个关键陈述必须标注来源(格式:[来源:文件名,第X页])
|
||||||
|
3. 如果参考文献不足以回答问题,明确说明
|
||||||
|
4. 使用专业但清晰的语言,适合工程师和法务人员阅读
|
||||||
|
5. 对于数值要求(如绝缘电阻值、时间限制等),精确引用原文"""
|
||||||
|
|
||||||
|
|
||||||
|
COMPLIANCE_CHECK_PROMPT = """你是一位专业的汽车合规审查专家。
|
||||||
|
|
||||||
|
请对以下内容进行合规性评估:
|
||||||
|
|
||||||
|
【待审查内容】
|
||||||
|
{content}
|
||||||
|
|
||||||
|
【相关法规要求】
|
||||||
|
{regulations}
|
||||||
|
|
||||||
|
请按以下格式输出:
|
||||||
|
1. 整体风险等级:[low/medium/high/critical]
|
||||||
|
2. 风险分数:[0-100]
|
||||||
|
3. 发现的合规问题(逐条列出):
|
||||||
|
- 问题描述
|
||||||
|
- 违反的具体法规条款
|
||||||
|
- 严重程度
|
||||||
|
4. 整改建议(具体可操作)"""
|
||||||
84
services/compliance-backend/app/main.py
Normal file
84
services/compliance-backend/app/main.py
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
import logging
|
||||||
|
import time
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
from fastapi import FastAPI, Request
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
from prometheus_fastapi_instrumentator import Instrumentator
|
||||||
|
|
||||||
|
from .api import kb, compliance, regulation
|
||||||
|
from .core.config import settings
|
||||||
|
|
||||||
|
# Structured logging: filter by the configured level, falling back to INFO
# when the configured name is not a known logging level.
structlog.configure(
    wrapper_class=structlog.make_filtering_bound_logger(
        getattr(logging, settings.log_level.upper(), logging.INFO)
    )
)
logger = structlog.get_logger()

app = FastAPI(
    title="AI合规智能中枢 API",
    description="面向车企与工厂的全链路合规智能平台",
    version="0.1.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS: every origin is allowed in the development environment; in any
# other environment the allow-list is empty.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"] if settings.app_env == "development" else [],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Prometheus metrics
Instrumentator().instrument(app).expose(app)

# Register the API routers
app.include_router(kb.router)
app.include_router(compliance.router)
app.include_router(regulation.router)
|
||||||
|
|
||||||
|
|
||||||
|
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log method, path, status code and duration (ms) of every request."""
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or turn negative) when the wall clock is adjusted mid-request.
    start = time.perf_counter()
    response = await call_next(request)
    duration_ms = int((time.perf_counter() - start) * 1000)
    logger.info(
        "request",
        method=request.method,
        path=request.url.path,
        status=response.status_code,
        duration_ms=duration_ms,
    )
    return response
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
async def health():
    """Health check that also probes the dependent services.

    Each dependency is reported as "ok" (HTTP 200), "degraded" (any other
    status) or "unavailable" (the request raised). The top-level "status"
    is always "ok".
    """
    import httpx

    async def _probe(base_url: str) -> str:
        # One short-lived client per probe, 5 second timeout.
        try:
            async with httpx.AsyncClient(timeout=5) as client:
                reply = await client.get(f"{base_url}/health")
            return "ok" if reply.status_code == 200 else "degraded"
        except Exception:
            return "unavailable"

    report = {"status": "ok", "services": {}}
    # Embedding service first, then the MCP server (probed one after another).
    report["services"]["embedding"] = await _probe(settings.embedding_service_url)
    report["services"]["mcp"] = await _probe(settings.mcp_server_url)
    return report
|
||||||
0
services/compliance-backend/app/models/__init__.py
Normal file
0
services/compliance-backend/app/models/__init__.py
Normal file
113
services/compliance-backend/app/models/db.py
Normal file
113
services/compliance-backend/app/models/db.py
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
import uuid
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from sqlalchemy import Column, String, Integer, BigInteger, Boolean, Text, ARRAY, Numeric
|
||||||
|
from sqlalchemy import DateTime, ForeignKey, func
|
||||||
|
from sqlalchemy.dialects.postgresql import UUID, JSONB, INET
|
||||||
|
from sqlalchemy.orm import DeclarativeBase, relationship
|
||||||
|
|
||||||
|
|
||||||
|
class Base(DeclarativeBase):
    """Declarative base class shared by every ORM model in this module."""
|
||||||
|
|
||||||
|
|
||||||
|
class Workspace(Base):
    """ORM model for the ``workspaces`` table; a workspace owns files."""

    __tablename__ = "workspaces"

    # Primary key, generated client-side.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    name = Column(String(255), nullable=False)
    description = Column(Text)
    domain = Column(String(100))
    created_by = Column(String(255))
    # Timestamps are filled in by the database server.
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    # Files belonging to this workspace.
    files = relationship("File", back_populates="workspace")
|
||||||
|
|
||||||
|
|
||||||
|
class File(Base):
    """ORM model for the ``files`` table: one uploaded document."""

    __tablename__ = "files"

    # Primary key, generated client-side.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning workspace; file rows are removed with it (ON DELETE CASCADE).
    workspace_id = Column(UUID(as_uuid=True), ForeignKey("workspaces.id", ondelete="CASCADE"))
    filename = Column(String(500), nullable=False)
    original_name = Column(String(500), nullable=False)
    file_type = Column(String(50))
    file_size = Column(BigInteger)
    storage_path = Column(Text)
    parsed_path = Column(Text)
    status = Column(String(50), default="uploaded")
    error_msg = Column(Text)
    # `metadata` is a reserved attribute name on declarative classes (it
    # holds the table registry), so defining a column attribute with that
    # name makes the class definition raise. The database column keeps the
    # name "metadata"; only the Python attribute is renamed.
    # `default=dict` creates a fresh dict per row instead of sharing one
    # mutable `{}` between all instances.
    file_metadata = Column("metadata", JSONB, default=dict)
    # Timestamps are filled in by the database server.
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    workspace = relationship("Workspace", back_populates="files")
    tasks = relationship("Task", back_populates="file")
|
||||||
|
|
||||||
|
|
||||||
|
class Task(Base):
    """ORM model for the ``tasks`` table: one background processing job."""

    __tablename__ = "tasks"

    # Primary key, generated client-side.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    task_type = Column(String(100), nullable=False)
    status = Column(String(50), default="pending")
    # `default=dict` builds a new dict per row; a literal `{}` would be one
    # shared object that any in-place mutation leaks into later rows.
    payload = Column(JSONB, default=dict)
    result = Column(JSONB)
    error_msg = Column(Text)
    progress = Column(Integer, default=0)
    # File this task works on, if any.
    file_id = Column(UUID(as_uuid=True), ForeignKey("files.id"))
    celery_task_id = Column(String(255))
    # Timestamps are filled in by the database server.
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())
    completed_at = Column(DateTime(timezone=True))

    file = relationship("File", back_populates="tasks")
|
||||||
|
|
||||||
|
|
||||||
|
class ComplianceReport(Base):
    """ORM model for the ``compliance_reports`` table."""

    __tablename__ = "compliance_reports"

    # Primary key, generated client-side.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # File the report was produced for, if any.
    file_id = Column(UUID(as_uuid=True), ForeignKey("files.id"))
    regulation_domains = Column(ARRAY(Text))
    overall_risk_level = Column(String(20))
    risk_score = Column(Numeric(5, 2))
    # `default=list` builds a new list per row; a literal `[]` would be one
    # shared object that any in-place mutation leaks into later rows.
    findings = Column(JSONB, default=list)
    recommendations = Column(JSONB, default=list)
    report_markdown = Column(Text)
    llm_model = Column(String(100))
    created_at = Column(DateTime(timezone=True), server_default=func.now())
|
||||||
|
|
||||||
|
|
||||||
|
class RegulationSource(Base):
    """ORM model for the ``regulation_sources`` table: a monitored source."""

    __tablename__ = "regulation_sources"

    # Primary key, generated client-side.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    name = Column(String(255), nullable=False)
    url = Column(Text, nullable=False)
    source_type = Column(String(50), default="webpage")
    domain = Column(String(100))
    fetch_interval = Column(Integer, default=86400)
    is_active = Column(Boolean, default=True)
    last_fetched_at = Column(DateTime(timezone=True))
    # 64 characters — presumably a hex digest of the last fetched content; TODO confirm.
    last_hash = Column(String(64))
    # `default=dict` builds a new dict per row; a literal `{}` would be one
    # shared object that any in-place mutation leaks into later rows.
    fetch_config = Column(JSONB, default=dict)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
|
||||||
|
|
||||||
|
|
||||||
|
class RegulationUpdate(Base):
    """ORM model for the ``regulation_updates`` table: one detected change."""

    __tablename__ = "regulation_updates"

    # Primary key, generated client-side.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Source the update was fetched from.
    source_id = Column(UUID(as_uuid=True), ForeignKey("regulation_sources.id"))
    title = Column(String(500))
    url = Column(Text)
    change_type = Column(String(50))
    summary = Column(Text)
    raw_content = Column(Text)
    diff_content = Column(Text)
    is_notified = Column(Boolean, default=False)
    importance = Column(String(20), default="normal")
    # Set by the database server when the row is inserted.
    fetched_at = Column(DateTime(timezone=True), server_default=func.now())
    published_at = Column(DateTime(timezone=True))
|
||||||
21
services/compliance-backend/app/services/embed.py
Normal file
21
services/compliance-backend/app/services/embed.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
import httpx
|
||||||
|
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||||
|
from ..core.config import settings
|
||||||
|
|
||||||
|
|
||||||
|
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
async def embed_texts(texts: list[str], batch_size: int = 12) -> dict:
    """POST ``texts`` to the embedding service and return the decoded JSON reply.

    The request goes to ``{settings.embedding_service_url}/embed`` with a
    120-second timeout. A non-2xx status raises through ``raise_for_status``.
    The ``retry`` decorator is configured to stop after three attempts, with
    exponential waits between 1 and 10 seconds.

    Args:
        texts: Strings to embed.
        batch_size: Forwarded unchanged in the request body.

    Returns:
        The parsed JSON body of the response.
    """
    endpoint = f"{settings.embedding_service_url}/embed"
    payload = {"texts": texts, "batch_size": batch_size}
    async with httpx.AsyncClient(timeout=120.0) as http:
        response = await http.post(endpoint, json=payload)
        response.raise_for_status()
        return response.json()
|
||||||
|
|
||||||
|
|
||||||
|
async def embed_single(text: str) -> list[float]:
    """Embed one string and return its dense vector.

    Delegates to ``embed_texts`` with a one-element list and ``batch_size=1``,
    then returns the first entry under the ``"dense"`` key of the reply.
    """
    payload = await embed_texts([text], batch_size=1)
    dense_vectors = payload["dense"]
    return dense_vectors[0]
|
||||||
65
services/compliance-backend/app/services/graph.py
Normal file
65
services/compliance-backend/app/services/graph.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
import logging
|
||||||
|
from ..core.deps import get_neo4j
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def create_regulation_node(regulation: dict) -> str:
    """Create or update a ``Regulation`` node in Neo4j.

    The node is merged on ``regulation["id"]`` and its ``title``, ``domain``,
    ``version`` and ``code`` properties are overwritten; each of those falls
    back to an empty string when missing from ``regulation``.

    Returns:
        The node's ``id`` as returned by the query, or ``None`` when the
        result contains no record.
    """
    cypher = """
        MERGE (r:Regulation {id: $id})
        SET r.title = $title,
            r.domain = $domain,
            r.version = $version,
            r.code = $code
        RETURN r.id as id
        """
    async with get_neo4j().session() as session:
        cursor = await session.run(
            cypher,
            id=regulation.get("id"),
            title=regulation.get("title", ""),
            domain=regulation.get("domain", ""),
            version=regulation.get("version", ""),
            code=regulation.get("code", ""),
        )
        row = await cursor.single()
        if row:
            return row["id"]
        return None
|
||||||
|
|
||||||
|
|
||||||
|
async def create_clause_node(clause: dict, regulation_id: str) -> str | None:
    """Create or update a ``Clause`` node and attach it to its regulation.

    The query first matches the ``Regulation`` node with ``regulation_id``;
    the clause is merged on ``clause["id"]`` and linked through a
    ``CONTAINS`` relationship. Stored content is capped at 2000 characters.

    Args:
        clause: Mapping with ``id`` and optionally ``number`` and ``content``.
        regulation_id: ``id`` property of the owning ``Regulation`` node.

    Returns:
        The clause ``id`` returned by the query, or ``None`` when the query
        produced no record.
    """
    # ``get("content", "")`` still yields None when the key is present with a
    # null value, and slicing None raises TypeError; normalise before slicing.
    content = clause.get("content") or ""
    driver = get_neo4j()
    async with driver.session() as session:
        result = await session.run(
            """
            MATCH (r:Regulation {id: $reg_id})
            MERGE (c:Clause {id: $id})
            SET c.number = $number,
                c.content = $content
            MERGE (r)-[:CONTAINS]->(c)
            RETURN c.id as id
            """,
            reg_id=regulation_id,
            id=clause.get("id"),
            number=clause.get("number", ""),
            content=content[:2000],
        )
        record = await result.single()
        return record["id"] if record else None
|
||||||
|
|
||||||
|
|
||||||
|
async def search_related_regulations(domain: str, limit: int = 10) -> list[dict]:
    """Return up to ``limit`` ``Regulation`` nodes whose ``domain`` equals ``domain``.

    Each element is a dict with the keys ``id``, ``title``, ``code`` and
    ``version`` taken from the matching node.
    """
    cypher = """
        MATCH (r:Regulation {domain: $domain})
        RETURN r.id as id, r.title as title, r.code as code, r.version as version
        LIMIT $limit
        """
    rows: list[dict] = []
    async with get_neo4j().session() as session:
        cursor = await session.run(cypher, domain=domain, limit=limit)
        async for row in cursor:
            rows.append(dict(row))
    return rows
|
||||||
59
services/compliance-backend/app/services/monitor.py
Normal file
59
services/compliance-backend/app/services/monitor.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import httpx
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_url(url: str, timeout: int = 30) -> str | None:
    """Download ``url`` and return the response text, or ``None`` on any failure.

    Redirects are followed and a fixed ``User-Agent`` header is sent. Every
    exception, including a non-2xx status raised by ``raise_for_status``, is
    logged as a warning and turned into ``None``.
    """
    client_options = {
        "timeout": timeout,
        "headers": {"User-Agent": "Mozilla/5.0 (compliance-monitor/1.0)"},
        "follow_redirects": True,
    }
    try:
        async with httpx.AsyncClient(**client_options) as http:
            reply = await http.get(url)
            reply.raise_for_status()
            return reply.text
    except Exception as e:
        logger.warning(f"抓取 {url} 失败:{e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text(html: str) -> str:
    """Return the text of ``html`` with script/style/nav/footer/header removed.

    Text fragments are stripped and joined with newlines.
    """
    unwanted = ["script", "style", "nav", "footer", "header"]
    soup = BeautifulSoup(html, "html.parser")
    for element in soup(unwanted):
        element.decompose()
    plain = soup.get_text(separator="\n", strip=True)
    return plain
|
||||||
|
|
||||||
|
|
||||||
|
def compute_hash(content: str) -> str:
    """Return the hexadecimal MD5 digest of ``content`` encoded as UTF-8.

    The digest is used as a change fingerprint, not as a security control, so
    the hash is created with ``usedforsecurity=False``. The digest value is
    unchanged; the flag only keeps the call usable on builds that restrict
    MD5 for security purposes.
    """
    return hashlib.md5(content.encode("utf-8"), usedforsecurity=False).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
async def check_source_for_updates(source: dict) -> dict | None:
    """Fetch a monitored source and report whether its text changed.

    Args:
        source: Mapping that provides ``url``, ``id`` and ``name`` and may
            provide ``last_hash``.

    Returns:
        ``None`` when the fetch returned nothing or when the hash of the
        extracted text equals ``source["last_hash"]``. Otherwise a dict with
        ``source_id``, ``raw_content`` (first 50,000 characters of the
        extracted text), ``new_hash`` and ``fetched_at`` (current UTC time in
        ISO format).
    """
    page = await fetch_url(source["url"])
    if not page:
        return None

    body = extract_text(page)
    digest = compute_hash(body)

    unchanged = source.get("last_hash") == digest
    if unchanged:
        logger.info(f"监控源 {source['name']} 无变化")
        return None

    return {
        "source_id": source["id"],
        "raw_content": body[:50000],  # keep at most 50,000 characters
        "new_hash": digest,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
    }
|
||||||
43
services/compliance-backend/app/services/parse.py
Normal file
43
services/compliance-backend/app/services/parse.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import httpx
|
||||||
|
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||||
|
from ..core.config import settings
|
||||||
|
|
||||||
|
|
||||||
|
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=30))
async def parse_document(file_content: bytes, filename: str) -> dict:
    """Upload a document to the parsing service and return its JSON reply.

    The bytes are sent as a multipart ``file`` field to
    ``{settings.mcp_server_url}/parse-document`` with a 300-second timeout.
    A non-2xx status raises through ``raise_for_status``. The ``retry``
    decorator is configured to stop after three attempts, with exponential
    waits between 2 and 30 seconds.
    """
    endpoint = f"{settings.mcp_server_url}/parse-document"
    upload = {"file": (filename, file_content, "application/octet-stream")}
    async with httpx.AsyncClient(timeout=300.0) as http:
        response = await http.post(endpoint, files=upload)
        response.raise_for_status()
        return response.json()
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 64) -> list[dict]:
    """Split ``text`` into overlapping chunks sized by an estimated token count.

    Sizes are estimated in characters at two characters per token. When a
    chunk would end before the end of the text, the cut is moved back to the
    last separator (blank line, newline, full stop, period or space, tried in
    that order) found in the second half of the window.

    Args:
        text: Text to split.
        chunk_size: Target chunk size in estimated tokens.
        overlap: Overlap between consecutive chunks in estimated tokens.

    Returns:
        A list of dicts with ``idx`` (running index), ``content`` (stripped
        chunk text) and ``start``/``end`` (character offsets into ``text``).
        Chunks that are empty after stripping are omitted.
    """
    chars_per_chunk = chunk_size * 2  # roughly 2 characters per token for Chinese
    chars_overlap = overlap * 2
    text_len = len(text)
    chunks: list[dict] = []
    start = 0

    while start < text_len:
        end = min(start + chars_per_chunk, text_len)
        # Prefer to cut at a paragraph/sentence boundary rather than mid-text.
        if end < text_len:
            for sep in ("\n\n", "\n", "。", ".", " "):
                pos = text.rfind(sep, start, end)
                if pos > start + chars_per_chunk // 2:
                    end = pos + len(sep)
                    break

        piece = text[start:end].strip()
        if piece:
            chunks.append(
                {"idx": len(chunks), "content": piece, "start": start, "end": end}
            )

        # Once a chunk reaches the end of the text we are done. Stepping back
        # by the overlap here would re-emit ever-shorter suffixes of the tail.
        if end >= text_len:
            break

        # Always advance by at least one character so the loop terminates.
        start = max(start + 1, end - chars_overlap)

    return chunks
|
||||||
92
services/compliance-backend/app/services/rag.py
Normal file
92
services/compliance-backend/app/services/rag.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
import logging
|
||||||
|
from langchain.schema import HumanMessage, SystemMessage
|
||||||
|
from pymilvus import connections, Collection
|
||||||
|
|
||||||
|
from .embed import embed_single, embed_texts
|
||||||
|
from ..core.llm import get_llm, RAG_SYSTEM_PROMPT
|
||||||
|
from ..core.config import settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_collection(name: str) -> Collection:
    """Connect to Milvus using the configured host/port and return ``name``'s collection."""
    connections.connect(host=settings.milvus_host, port=settings.milvus_port)
    collection = Collection(name)
    return collection
|
||||||
|
|
||||||
|
|
||||||
|
async def hybrid_search(
    query: str,
    collection_name: str = "regulation_chunks",
    top_k: int = 10,
    workspace_id: str | None = None,
) -> list[dict]:
    """Embed ``query`` and run a dense-vector search against a Milvus collection.

    Only the dense vector is searched (field ``dense_vec``, cosine metric);
    no sparse/BM25 fusion is performed here.

    Args:
        query: Natural-language query to embed.
        collection_name: Milvus collection to search.
        top_k: Maximum number of hits to request.
        workspace_id: When truthy, restricts hits to rows whose
            ``workspace_id`` field equals this value.

    Returns:
        One dict per hit with ``id``, ``content``, ``score``, ``file_id``,
        ``chunk_idx`` and ``metadata``.
    """
    query_vec = await embed_single(query)

    col = _get_collection(collection_name)

    expr = None
    if workspace_id:
        # The value is embedded in a filter expression, so escape backslashes
        # and double quotes; otherwise a crafted id could close the string
        # literal and widen the filter to other workspaces.
        escaped = str(workspace_id).replace("\\", "\\\\").replace('"', '\\"')
        expr = f'workspace_id == "{escaped}"'

    results = col.search(
        data=[query_vec],
        anns_field="dense_vec",
        param={"metric_type": "COSINE", "params": {"ef": 100}},
        limit=top_k,
        expr=expr,
        output_fields=["content", "metadata", "file_id", "chunk_idx"],
    )

    chunks = []
    for hits in results:
        for hit in hits:
            chunks.append({
                "id": hit.id,
                "content": hit.entity.get("content", ""),
                "score": float(hit.score),
                "file_id": hit.entity.get("file_id", ""),
                "chunk_idx": hit.entity.get("chunk_idx", 0),
                "metadata": hit.entity.get("metadata", {}),
            })
    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
async def rerank(query: str, chunks: list[dict], top_k: int = 5) -> list[dict]:
    """Return the ``top_k`` chunks with the highest ``score``, best first.

    This is a plain sort on the existing ``score`` value; ``query`` is not
    used. The input list is not modified.
    """
    def by_score(chunk: dict):
        return chunk["score"]

    ranked = sorted(chunks, key=by_score, reverse=True)
    return ranked[:top_k]
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_answer(query: str, chunks: list[dict]) -> dict:
    """Ask the LLM to answer ``query`` from the retrieved ``chunks``.

    Each chunk is rendered with a numbered source header (file name and page
    taken from its ``metadata``) and the sections are joined into one context
    that is sent together with ``RAG_SYSTEM_PROMPT``.

    Returns:
        ``{"answer": ..., "sources": [...]}``. With no chunks a fixed
        "nothing found" answer and an empty source list are returned without
        calling the LLM. If the LLM call raises, the answer is an error
        message that includes the first 200 characters of the first chunk.
        Each source carries the first 300 characters of the chunk content
        plus ``file_id``, ``chunk_idx``, ``score`` and ``metadata``.
    """
    if not chunks:
        return {"answer": "未找到相关法规内容,请上传相关法规文档后重试。", "sources": []}

    # Build the context block handed to the model.
    sections = []
    for position, item in enumerate(chunks, 1):
        info = item.get("metadata", {})
        header = f"[来源 {position}:{info.get('filename', '未知文件')},第 {info.get('page', '?')} 页]"
        sections.append(f"{header}\n{item['content']}")

    context = "\n\n---\n\n".join(sections)
    user_prompt = f"参考文献:\n\n{context}\n\n问题:{query}\n\n请基于以上参考文献回答,并标注来源。"

    llm = get_llm(temperature=0.1)
    messages = [SystemMessage(content=RAG_SYSTEM_PROMPT), HumanMessage(content=user_prompt)]

    try:
        reply = await llm.ainvoke(messages)
        answer = reply.content
    except Exception as e:
        logger.error(f"LLM 生成失败:{e}")
        answer = f"LLM 生成失败:{e}。检索到的相关内容:{chunks[0]['content'][:200]}..."

    sources = []
    for item in chunks:
        sources.append(
            {
                "content": item["content"][:300],
                "file_id": item.get("file_id", ""),
                "chunk_idx": item.get("chunk_idx", 0),
                "score": item.get("score", 0),
                "metadata": item.get("metadata", {}),
            }
        )
    return {"answer": answer, "sources": sources}
|
||||||
212
services/compliance-backend/app/worker.py
Normal file
212
services/compliance-backend/app/worker.py
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
import uuid
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from celery import Celery
|
||||||
|
from celery.schedules import crontab
|
||||||
|
|
||||||
|
from .core.config import settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Celery 配置
|
||||||
|
# Redis (``settings.redis_url``) serves as both broker and result backend.
celery_app = Celery("compliance", broker=settings.redis_url, backend=settings.redis_url)

celery_app.conf.update(
    # JSON only, for both task messages and results.
    task_serializer="json",
    accept_content=["json"],
    result_serializer="json",
    timezone="Asia/Shanghai",
    # Each listed task is routed to its own queue.
    task_routes={
        "app.worker.process_file_task": {"queue": "parse"},
        "app.worker.fetch_regulation_source": {"queue": "monitor"},
        "app.worker.send_notifications": {"queue": "push"},
    },
    # Beat entry: run the monitor fan-out task at 02:00.
    beat_schedule={
        "daily-regulation-monitor": {
            "task": "app.worker.run_all_monitors",
            "schedule": crontab(hour=2, minute=0),
        },
    },
)
|
||||||
|
|
||||||
|
# ── 文件处理任务(解析 + 向量化)────────────────
|
||||||
|
|
||||||
|
@celery_app.task(name="app.worker.process_file_task", bind=True, max_retries=3)
def process_file_task(self, file_id: str, task_id: str, workspace_id: str):
    """Celery entry point: parse a document and store its vectors in Milvus.

    Runs the ``_process_file`` coroutine to completion on a fresh event loop.
    ``self`` is available because the task is bound, but it is not used here.
    """
    import asyncio

    job = _process_file(file_id, task_id, workspace_id)
    asyncio.run(job)
|
||||||
|
|
||||||
|
|
||||||
|
async def _process_file(file_id: str, task_id: str, workspace_id: str):
    """Parse one uploaded file, chunk it, embed the chunks and insert them into Milvus.

    Status and progress are written to the ``File`` row and, when it exists,
    the ``Task`` row as the pipeline advances: 10 at start, 40 after parsing,
    40-100 while embedding, 100 on completion.

    Args:
        file_id: UUID string of the ``File`` row.
        task_id: UUID string of the ``Task`` row tracking this job.
        workspace_id: Stored with every inserted chunk.

    Raises:
        Exception: Any failure is recorded on the rows (status ``failed`` and
            the error message) and then re-raised.
    """
    from pathlib import Path
    from sqlalchemy import select
    from .core.deps import AsyncSessionLocal, get_milvus_collection
    from .models.db import File, Task
    from .services.parse import parse_document, chunk_text
    from .services.embed import embed_texts

    async with AsyncSessionLocal() as db:
        # Look up the file record; nothing to do if it is gone.
        result = await db.execute(select(File).where(File.id == uuid.UUID(file_id)))
        file_record = result.scalar_one_or_none()
        if not file_record:
            logger.error(f"文件 {file_id} 不存在")
            return

        task_result = await db.execute(select(Task).where(Task.id == uuid.UUID(task_id)))
        task = task_result.scalar_one_or_none()

        try:
            # Mark the job as started.
            file_record.status = "parsing"
            if task:
                task.status = "running"
                task.progress = 10
            await db.commit()

            # Step 1: parse the document into Markdown.
            file_content = Path(file_record.storage_path).read_bytes()
            parse_result = await parse_document(file_content, file_record.original_name)
            markdown = parse_result.get("markdown", "")

            if not markdown.strip():
                raise ValueError("文档解析结果为空")

            file_record.status = "parsed"
            if task:
                task.progress = 40
            await db.commit()

            # Step 2: split into chunks.
            chunks = chunk_text(markdown, chunk_size=512, overlap=64)
            total = len(chunks)
            logger.info(f"文件 {file_id} 分割为 {len(chunks)} 块")

            # Step 3: embed and insert, one batch at a time.
            batch_size = 16
            col = get_milvus_collection("regulation_chunks")

            for i in range(0, total, batch_size):
                batch = chunks[i:i + batch_size]
                texts = [c["content"] for c in batch]
                embed_result = await embed_texts(texts, batch_size=batch_size)
                dense_vecs = embed_result["dense"]

                # Column-wise payload, in the order the original code used.
                entities = [
                    [f"{file_id}_{c['idx']}" for c in batch],
                    [file_id] * len(batch),
                    [workspace_id] * len(batch),
                    [c["idx"] for c in batch],
                    [c["content"] for c in batch],
                    dense_vecs,
                    [{"filename": file_record.original_name, "page": c.get("page", 0)} for c in batch],
                ]
                col.insert(entities)

                if task:
                    # Count the chunks actually processed. Using
                    # ``i + batch_size`` overshoots on a short final batch and
                    # pushed progress above 100.
                    done = i + len(batch)
                    task.progress = 40 + int(60 * done / total)
                    await db.commit()

            col.flush()

            # Done.
            file_record.status = "vectorized"
            if task:
                task.status = "completed"
                task.progress = 100
                task.completed_at = datetime.now(timezone.utc)
            await db.commit()
            logger.info(f"文件 {file_id} 处理完成")

        except Exception as e:
            logger.error(f"文件 {file_id} 处理失败:{e}")
            file_record.status = "failed"
            file_record.error_msg = str(e)
            if task:
                task.status = "failed"
                task.error_msg = str(e)
            await db.commit()
            raise
|
||||||
|
|
||||||
|
|
||||||
|
# ── 法规监控任务 ────────────────────────────────
|
||||||
|
|
||||||
|
@celery_app.task(name="app.worker.run_all_monitors")
def run_all_monitors():
    """Celery entry point that fans out one fetch task per active source.

    Runs the ``_run_all_monitors`` coroutine to completion on a fresh event loop.
    """
    import asyncio

    job = _run_all_monitors()
    asyncio.run(job)
|
||||||
|
|
||||||
|
|
||||||
|
async def _run_all_monitors():
    """Queue ``fetch_regulation_source`` for every source with ``is_active`` set.

    One log line is written per queued source.
    """
    from sqlalchemy import select
    from .core.deps import AsyncSessionLocal
    from .models.db import RegulationSource

    active_sources = select(RegulationSource).where(
        RegulationSource.is_active == True  # noqa: E712 -- SQL expression, not a Python comparison
    )
    async with AsyncSessionLocal() as db:
        rows = await db.execute(active_sources)
        for source in rows.scalars().all():
            fetch_regulation_source.delay(str(source.id))
            logger.info(f"触发监控源抓取:{source.name}")
|
||||||
|
|
||||||
|
|
||||||
|
@celery_app.task(name="app.worker.fetch_regulation_source", bind=True, max_retries=2)
def fetch_regulation_source(self, source_id: str):
    """Celery entry point that checks a single monitored source for changes.

    Runs the ``_fetch_source`` coroutine to completion on a fresh event loop.
    ``self`` is available because the task is bound, but it is not used here.
    """
    import asyncio

    job = _fetch_source(source_id)
    asyncio.run(job)
|
||||||
|
|
||||||
|
|
||||||
|
async def _fetch_source(source_id: str):
    """Check one monitored source and record a ``RegulationUpdate`` if it changed.

    ``last_fetched_at`` is refreshed on every run. When
    ``check_source_for_updates`` reports new content, ``last_hash`` is updated
    and a ``RegulationUpdate`` row (change type ``updated``, importance
    ``normal``) is added. Returns silently when the source row does not exist.

    Args:
        source_id: UUID string of the ``RegulationSource`` row.
    """
    from sqlalchemy import select
    from .core.deps import AsyncSessionLocal
    from .models.db import RegulationSource, RegulationUpdate
    from .services.monitor import check_source_for_updates

    source_uuid = uuid.UUID(source_id)

    async with AsyncSessionLocal() as db:
        result = await db.execute(
            select(RegulationSource).where(RegulationSource.id == source_uuid)
        )
        source = result.scalar_one_or_none()
        if not source:
            return

        source_dict = {
            "id": str(source.id),
            "name": source.name,
            "url": source.url,
            "last_hash": source.last_hash,
        }
        update_data = await check_source_for_updates(source_dict)

        # Both outcomes stamp the fetch time and commit, so do it once here
        # instead of repeating it in each branch.
        source.last_fetched_at = datetime.now(timezone.utc)

        if update_data:
            logger.info(f"检测到变更:{source.name}")
            source.last_hash = update_data["new_hash"]
            db.add(
                RegulationUpdate(
                    source_id=source_uuid,
                    change_type="updated",
                    raw_content=update_data["raw_content"][:50000],
                    importance="normal",
                )
            )

        await db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
@celery_app.task(name="app.worker.send_notifications")
def send_notifications():
    """Placeholder notification task; it currently only writes a log line."""
    logger.info("推送通知任务执行(待实现)")
|
||||||
|
|
||||||
|
|
||||||
|
# 导出供 FastAPI 使用
|
||||||
|
worker = celery_app
|
||||||
29
services/compliance-backend/pyproject.toml
Normal file
29
services/compliance-backend/pyproject.toml
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
[project]
|
||||||
|
name = "compliance-backend"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "AI合规智能中枢 — 业务后端"
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
dependencies = [
|
||||||
|
"fastapi>=0.115",
|
||||||
|
"uvicorn[standard]>=0.30",
|
||||||
|
"pydantic>=2.7",
|
||||||
|
"pydantic-settings>=2.4",
|
||||||
|
"sqlalchemy[asyncio]>=2.0",
|
||||||
|
"asyncpg>=0.29",
|
||||||
|
"redis[asyncio]>=5.0",
|
||||||
|
"celery[redis]>=5.4",
|
||||||
|
"pymilvus>=2.4",
|
||||||
|
"neo4j>=5.20",
|
||||||
|
"langchain>=0.3",
|
||||||
|
"langchain-openai>=0.2",
|
||||||
|
"langchain-community>=0.3",
|
||||||
|
"llama-index-core>=0.11",
|
||||||
|
"httpx>=0.27",
|
||||||
|
"python-multipart>=0.0.9",
|
||||||
|
"python-jose[cryptography]>=3.3",
|
||||||
|
"structlog>=24.0",
|
||||||
|
"prometheus-fastapi-instrumentator>=7.0",
|
||||||
|
"tenacity>=8.5",
|
||||||
|
"beautifulsoup4>=4.12",
|
||||||
|
"requests>=2.32",
|
||||||
|
]
|
||||||
24
services/embedding/Dockerfile
Normal file
24
services/embedding/Dockerfile
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
FROM python:3.12-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# 系统依赖
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
curl \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Python 依赖(先装,利用构建缓存)
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt \
|
||||||
|
--index-url https://pypi.tuna.tsinghua.edu.cn/simple \
|
||||||
|
--trusted-host pypi.tuna.tsinghua.edu.cn
|
||||||
|
|
||||||
|
COPY main.py .
|
||||||
|
|
||||||
|
# 健康检查
|
||||||
|
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
|
||||||
|
CMD curl -f http://localhost:8010/health || exit 1
|
||||||
|
|
||||||
|
EXPOSE 8010
|
||||||
|
|
||||||
|
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8010", "--workers", "1"]
|
||||||
87
services/embedding/main.py
Normal file
87
services/embedding/main.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MODEL_NAME = os.getenv("MODEL_NAME", "BAAI/bge-m3")
|
||||||
|
MODEL_CACHE = os.getenv("HF_HOME", "/app/models")
|
||||||
|
DEVICE = os.getenv("DEVICE", "cpu")
|
||||||
|
MAX_BATCH = int(os.getenv("MAX_BATCH_SIZE", "16"))
|
||||||
|
|
||||||
|
# 设置 HuggingFace 镜像
|
||||||
|
if os.getenv("HF_ENDPOINT"):
|
||||||
|
os.environ["HF_ENDPOINT"] = os.getenv("HF_ENDPOINT")
|
||||||
|
|
||||||
|
model = None
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load the embedding model before serving requests.

    The loaded model is stored in the module-level ``model``. fp16 is enabled
    whenever ``DEVICE`` is anything other than ``"cpu"``. A load failure is
    logged and re-raised. A log line is written on shutdown.
    """
    global model
    logger.info(f"加载模型 {MODEL_NAME},设备:{DEVICE}")
    try:
        from FlagEmbedding import BGEM3FlagModel

        half_precision = DEVICE != "cpu"
        model = BGEM3FlagModel(
            MODEL_NAME,
            use_fp16=half_precision,
            cache_dir=MODEL_CACHE,
        )
        logger.info("BGE-M3 模型加载完成")
    except Exception as e:
        logger.error(f"模型加载失败:{e}")
        raise
    yield
    logger.info("服务关闭")
|
||||||
|
|
||||||
|
|
||||||
|
app = FastAPI(title="BGE-M3 嵌入服务", lifespan=lifespan)
|
||||||
|
|
||||||
|
|
||||||
|
class EmbedRequest(BaseModel):
    """Request body of ``POST /embed``."""

    # Between 1 and 100 strings per request.
    texts: list[str] = Field(..., min_length=1, max_length=100)
    # Encoder batch size; bounded above by MAX_BATCH.
    batch_size: int = Field(default=12, ge=1, le=MAX_BATCH)
    # Which vector kinds to include in the response.
    return_dense: bool = True
    return_sparse: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
class EmbedResponse(BaseModel):
    """Response body of ``POST /embed``."""

    # One dense vector per input text, or None when not requested.
    dense: Optional[list[list[float]]] = None
    # One lexical-weight mapping per input text, or None when not requested.
    sparse: Optional[list[dict]] = None
    # Name of the model that produced the vectors.
    model: str
    # Number of input texts.
    count: int
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/embed", response_model=EmbedResponse)
def embed(req: EmbedRequest) -> EmbedResponse:
    """Encode the request texts and return dense and/or sparse vectors.

    Raises:
        HTTPException: 503 when the model has not been loaded, 400 when more
            than 100 texts are supplied, 500 when encoding fails.
    """
    if model is None:
        raise HTTPException(status_code=503, detail="模型未就绪")
    if len(req.texts) > 100:
        raise HTTPException(status_code=400, detail="单次最多 100 条文本")

    try:
        output = model.encode(
            req.texts,
            batch_size=req.batch_size,
            return_dense=req.return_dense,
            return_sparse=req.return_sparse,
        )

        dense = output["dense_vecs"].tolist() if req.return_dense else None

        sparse = None
        if req.return_sparse:
            # The weights come straight from the encoder and may be NumPy
            # scalars, which the JSON response cannot serialise. Convert
            # every entry to a plain str -> float pair.
            sparse = [
                {str(token): float(weight) for token, weight in weights.items()}
                for weights in output["lexical_weights"]
            ]

        return EmbedResponse(
            dense=dense,
            sparse=sparse,
            model=MODEL_NAME,
            count=len(req.texts),
        )
    except Exception as e:
        logger.error(f"嵌入生成失败:{e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def health():
    """Report service status, the configured model and device, and whether the model is loaded."""
    ready = model is not None
    return {"status": "ok", "model": MODEL_NAME, "device": DEVICE, "ready": ready}
|
||||||
10
services/embedding/requirements.txt
Normal file
10
services/embedding/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
fastapi>=0.115
|
||||||
|
uvicorn[standard]>=0.30
|
||||||
|
pydantic>=2.7
|
||||||
|
FlagEmbedding>=1.3
|
||||||
|
# CPU build of PyTorch (keeps the image small).
# Index options must be on their own line: pip does not apply an
# --index-url written after a requirement on the same line. The CPU wheel
# index is added as an extra index so the remaining packages still resolve
# from the primary index.
--extra-index-url https://download.pytorch.org/whl/cpu
torch>=2.3.0
|
||||||
|
transformers>=4.44
|
||||||
|
sentence-transformers>=3.0
|
||||||
|
huggingface-hub>=0.24
|
||||||
|
numpy>=1.26
|
||||||
38
services/mcp-server/Dockerfile
Normal file
38
services/mcp-server/Dockerfile
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
FROM python:3.12-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# 系统依赖(MinerU 需要 libGL)
|
||||||
|
# libgl1 replaces libgl1-mesa-glx, which is no longer packaged in the Debian
# releases that current python:3.12-slim images are based on.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxrender1 \
    libxext6 \
    && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt \
|
||||||
|
--index-url https://pypi.tuna.tsinghua.edu.cn/simple \
|
||||||
|
--trusted-host pypi.tuna.tsinghua.edu.cn
|
||||||
|
|
||||||
|
# 预下载 MinerU 模型(构建时执行,加速启动)
|
||||||
|
# A Dockerfile instruction only spans several lines through "\" continuations,
# so the probe is written as a single python -c command. A failure (for
# example a missing module) is tolerated and reported; it does not fail the build.
RUN HF_ENDPOINT=https://hf-mirror.com python -c \
    "from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton; print('MinerU 模型下载完成')" \
    || echo "模型下载跳过(将在运行时下载)"
|
||||||
|
|
||||||
|
COPY main.py .
|
||||||
|
|
||||||
|
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
|
||||||
|
CMD curl -f http://localhost:8011/health || exit 1
|
||||||
|
|
||||||
|
EXPOSE 8011
|
||||||
|
|
||||||
|
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8011", "--workers", "1"]
|
||||||
136
services/mcp-server/main.py
Normal file
136
services/mcp-server/main.py
Normal file
@@ -0,0 +1,136 @@
|
|||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from fastapi import FastAPI, UploadFile, File, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DEVICE = os.getenv("DEVICE", "cpu")
|
||||||
|
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", "/app/uploads"))
|
||||||
|
PARSED_DIR = Path(os.getenv("PARSED_DIR", "/app/parsed"))
|
||||||
|
|
||||||
|
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
PARSED_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
app = FastAPI(title="MinerU 文档解析服务")
|
||||||
|
|
||||||
|
SUPPORTED_TYPES = {
|
||||||
|
"application/pdf": "pdf",
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
||||||
|
"application/msword": "doc",
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_pdf_mineru(pdf_path: str) -> str:
    """Convert a PDF to Markdown with MinerU, falling back to PyMuPDF on any error.

    The MinerU pipeline writes into a temporary directory that is removed
    afterwards. Any exception, including a failed import, is logged as a
    warning and the result of ``parse_pdf_pymupdf`` is returned instead.
    """
    try:
        from magic_pdf.data.data_reader_writer import FileBasedDataWriter
        from magic_pdf.pipe.UnicodeFormulaPDFPipe import UnicodeFormulaPDFPipe

        with tempfile.TemporaryDirectory() as workdir:
            pipeline = UnicodeFormulaPDFPipe(pdf_path, FileBasedDataWriter(workdir))
            pipeline.pipe_classify()
            pipeline.pipe_analyze()
            pipeline.pipe_parse()
            rendered = pipeline.pipe_mk_uni_format(workdir, drop_mode="none")
            return rendered or ""
    except Exception as e:
        logger.warning(f"MinerU 解析失败,降级到 PyMuPDF:{e}")
        return parse_pdf_pymupdf(pdf_path)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_pdf_pymupdf(pdf_path: str) -> str:
    """Fallback PDF parser: extract plain text page by page with PyMuPDF.

    Every page with non-blank text becomes a section headed by its 1-based
    page number. Sections are joined with blank lines.

    Returns:
        The assembled text, or a bracketed failure message when anything goes
        wrong (including PyMuPDF not being installed).
    """
    try:
        import fitz  # PyMuPDF

        pages = []
        # Open through the context manager so the document handle is closed
        # even if a page fails to extract.
        with fitz.open(pdf_path) as doc:
            for number, page in enumerate(doc, start=1):
                text = page.get_text()
                if text.strip():
                    pages.append(f"## 第 {number} 页\n\n{text}")
        return "\n\n".join(pages)
    except Exception as e:
        return f"[解析失败:{e}]"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_docx(file_path: str) -> str:
    """Convert a Word document to Markdown-like text with python-docx.

    Non-blank paragraphs are emitted in order. A paragraph whose style name
    contains ``Heading`` gets one ``#`` per heading level, or ``##`` when the
    level cannot be read as an integer. All tables follow the paragraphs,
    one line per row with cells joined by `` | ``.

    Returns:
        The assembled text, or a bracketed failure message when anything goes
        wrong (including python-docx not being installed).
    """
    try:
        from docx import Document

        document = Document(file_path)
        blocks = []

        for paragraph in document.paragraphs:
            if not paragraph.text.strip():
                continue
            style_name = paragraph.style.name if paragraph.style else ""
            if "Heading" not in style_name:
                blocks.append(paragraph.text)
                continue
            depth = style_name.replace("Heading ", "").strip()
            try:
                marker = "#" * int(depth)
            except ValueError:
                marker = "##"
            blocks.append(f"{marker} {paragraph.text}")

        for table in document.tables:
            lines = [
                " | ".join(cell.text.strip() for cell in row.cells)
                for row in table.rows
            ]
            if lines:
                blocks.append("\n".join(lines))

        return "\n\n".join(blocks)
    except Exception as e:
        return f"[Word 解析失败:{e}]"
|
||||||
|
|
||||||
|
|
||||||
|
class ParseResponse(BaseModel):
    """Response body of the document-parsing endpoints."""

    # Name of the uploaded file.
    filename: str
    # Extracted document text.
    markdown: str
    # Page count reported by the endpoint.
    page_count: int
    # Label of the parser selected for the file.
    parser: str
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/mineru-parse", response_model=ParseResponse)
async def mineru_parse(file: UploadFile = File(...)) -> ParseResponse:
    """Parse an uploaded PDF or Word file and return its text.

    The upload is written to a temporary file that is always deleted
    afterwards. The parser is chosen by file suffix: ``.pdf`` goes to
    ``parse_pdf_mineru``, ``.docx``/``.doc`` to ``parse_docx``. A missing
    file name is treated as ``doc.pdf``.

    Raises:
        HTTPException: 415 for any other suffix.
    """
    payload = await file.read()
    suffix = Path(file.filename or "doc.pdf").suffix.lower()

    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        handle.write(payload)
        tmp_path = handle.name

    try:
        is_pdf = suffix == ".pdf"
        if is_pdf:
            markdown = parse_pdf_mineru(tmp_path)
            parser = "mineru"
        elif suffix in (".docx", ".doc"):
            markdown = parse_docx(tmp_path)
            parser = "python-docx"
        else:
            raise HTTPException(status_code=415, detail=f"不支持的文件类型:{suffix}")

        # Page count is estimated from the text: page headings for PDFs,
        # blank-line separators otherwise; never reported below 1.
        if is_pdf:
            page_count = markdown.count("## 第")
        else:
            page_count = markdown.count("\n\n")

        return ParseResponse(
            filename=file.filename or "unknown",
            markdown=markdown,
            page_count=max(page_count, 1),
            parser=parser,
        )
    finally:
        os.unlink(tmp_path)
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/parse-document", response_model=ParseResponse)
async def parse_document(file: UploadFile = File(...)) -> ParseResponse:
    """Second route for the same parser; forwards the upload to ``mineru_parse``."""
    parsed = await mineru_parse(file)
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def health():
    """Report service status and the configured device."""
    status = {"status": "ok", "device": DEVICE}
    return status
|
||||||
11
services/mcp-server/requirements.txt
Normal file
11
services/mcp-server/requirements.txt
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
fastapi>=0.115
|
||||||
|
uvicorn[standard]>=0.30
|
||||||
|
pydantic>=2.7
|
||||||
|
python-multipart>=0.0.9
|
||||||
|
httpx>=0.27
|
||||||
|
# MinerU 文档解析
|
||||||
|
mineru[pipeline]>=1.0
|
||||||
|
# Word/Excel 降级解析
|
||||||
|
python-docx>=1.1
|
||||||
|
openpyxl>=3.1
|
||||||
|
PyMuPDF>=1.24 # PDF 降级解析
|
||||||
Reference in New Issue
Block a user