commit 448e078d998b2e1ee71c9a7188c134d76422afb5 Author: wangwei Date: Thu Apr 23 09:58:47 2026 +0800 first commit diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..df982e7 --- /dev/null +++ b/.env.example @@ -0,0 +1,89 @@ +# ══════════════════════════════════════════════════ +# AI合规智能中枢 — 环境变量配置 +# 复制本文件为 .env 并填写实际值 +# cp .env.example .env +# ══════════════════════════════════════════════════ + + +# ────────────────────────────────────────────────── +# LLM 云端 API 配置(至少填写一个) +# ────────────────────────────────────────────────── + +# LLM 提供商:deepseek 或 qwen +LLM_PROVIDER=deepseek + +# DeepSeek API(推荐,约¥1/百万tokens) +# 申请地址:https://platform.deepseek.com +DEEPSEEK_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +# 可选模型:deepseek-chat(通用)、deepseek-reasoner(推理增强) +DEEPSEEK_MODEL=deepseek-chat + +# 阿里云 DashScope / Qwen API(备用) +# 申请地址:https://dashscope.aliyuncs.com +DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +# 可选模型:qwen-plus、qwen-max、qwen-turbo +QWEN_MODEL=qwen-plus + + +# ────────────────────────────────────────────────── +# 数据库密码 +# ────────────────────────────────────────────────── + +# PostgreSQL 密码(生产环境请使用强密码) +POSTGRES_PASSWORD=compliance_secure_2026 + +# Redis 密码 +REDIS_PASSWORD=redis_secure_2026 + +# Neo4j 密码(不能包含特殊字符) +NEO4J_PASSWORD=neo4j_secure_2026 + + +# ────────────────────────────────────────────────── +# AI 模型配置 +# ────────────────────────────────────────────────── + +# HuggingFace 镜像(国内加速,默认使用 hf-mirror.com) +HF_ENDPOINT=https://hf-mirror.com + +# 嵌入服务设备:cpu 或 cuda(有 GPU 时改为 cuda) +EMBEDDING_DEVICE=cpu + +# MinerU 解析设备:cpu 或 cuda +MCP_DEVICE=cpu + + +# ────────────────────────────────────────────────── +# 应用配置 +# ────────────────────────────────────────────────── + +# 运行环境:development / production +APP_ENV=development + +# 日志级别:DEBUG / INFO / WARNING / ERROR +LOG_LEVEL=INFO + +# API 认证密钥(用于内部服务间调用) +API_SECRET_KEY=change_this_to_a_random_secret_key_32chars + + +# ────────────────────────────────────────────────── +# 
监控配置(可选) +# ────────────────────────────────────────────────── + +# Grafana 管理员密码 +GRAFANA_PASSWORD=admin + + +# ────────────────────────────────────────────────── +# 外部推送配置(闭环③法规监控推送用) +# ────────────────────────────────────────────────── + +# 邮件推送(可选) +# SMTP_HOST=smtp.example.com +# SMTP_PORT=587 +# SMTP_USER=your@email.com +# SMTP_PASSWORD=your_smtp_password + +# Webhook 推送(可选,支持飞书/钉钉/企业微信) +# WEBHOOK_URL=https://hooks.slack.com/services/xxx diff --git a/00_整体部署规划.md b/00_整体部署规划.md new file mode 100644 index 0000000..a361f92 --- /dev/null +++ b/00_整体部署规划.md @@ -0,0 +1,277 @@ +# AI合规智能中枢 — 整体部署规划 + +> **版本:** 调研版 v1.0 | **日期:** 2026.04 | **团队:** T-Systems AI Regulations Team + +--- + +## 一、项目背景 + +AI+合规智能中枢面向车企与工厂,是一个全链路合规智能平台。主要解决以下痛点: + +| 痛点 | 说明 | +|------|------| +| 法规来源复杂 | GB、MIIT、UN-ECE、IATF 16949、ISO 45001 等多源并行 | +| 更新频率高 | 新能源、数据安全、碳排放法规频繁变动 | +| 跨语言要求 | 中英德法多语言法规并存 | +| 文档管理分散 | 内部文档与外部法规割裂,难以统一检索 | +| 被动识别隐患 | EHS 合规靠人工排查,效率低下 | + +**调研目标:** 以最小资源投入(Docker Compose 单机)验证三条核心业务闭环的技术可行性。 + +--- + +## 二、部署架构概览 + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ 单台服务器 │ +│ ┌──────────────┐ ┌──────────────────────────────────────┐ │ +│ │ API 网关 │ │ Docker Compose │ │ +│ │ Nginx :80 │───▶│ │ │ +│ └──────────────┘ │ ┌──────────────────────────────┐ │ │ +│ │ │ 业务服务层 │ │ │ +│ │ │ compliance-backend :8000 │ │ │ +│ │ │ celery-worker │ │ │ +│ │ │ celery-beat │ │ │ +│ │ └──────────┬───────────────────┘ │ │ +│ │ │ │ │ +│ │ ┌──────────▼───────────────────┐ │ │ +│ │ │ AI 模型层 │ │ │ +│ │ │ embedding-service :8010 │ │ │ +│ │ │ mcp-server(MinerU) :8011 │ │ │ +│ │ │ LLM → DeepSeek API (云端) │ │ │ +│ │ └──────────┬───────────────────┘ │ │ +│ │ │ │ │ +│ │ ┌──────────▼───────────────────┐ │ │ +│ │ │ 数据层 │ │ │ +│ │ │ PostgreSQL :5432 │ │ │ +│ │ │ Redis :6379 │ │ │ +│ │ │ Milvus :19530 │ │ │ +│ │ │ Neo4j :7474/:7687 │ │ │ +│ │ │ MinIO (Milvus内置) │ │ │ +│ │ └──────────────────────────────┘ │ │ +│ └──────────────────────────────────────┘ │ 
+└─────────────────────────────────────────────────────────────────┘ + │ + ┌─────────▼──────────┐ + │ DeepSeek API │ + │ (云端 LLM) │ + └────────────────────┘ +``` + +--- + +## 三、原方案 vs 调研方案对比 + +| 维度 | 原方案(生产级)| 调研方案 | 降级理由 | +|------|--------------|---------|---------| +| 编排 | Kubernetes 1.36 + Helm | **Docker Compose** | 无需集群管理,`up -d` 一键启动 | +| LLM | vLLM + DeepSeek-V3(4×A100)| **DeepSeek/Qwen 云端 API** | 无 GPU 依赖,秒级就绪 | +| 嵌入模型 | BGE-M3 GPU 服务 | **BGE-M3 CPU 容器** | 调研数据量小,CPU 够用 | +| Milvus | 分布式集群 + MinIO | **Milvus Standalone**(含内置 MinIO)| 单容器,省去 MinIO 独立部署 | +| 消息队列 | Kafka 3 节点 | **Redis + Celery**(复用已有 Redis)| 调研无需高吞吐,大幅简化 | +| 监控 | Prometheus + Grafana + ELK | **仅 Prometheus + Grafana**(可选)| 轻量,后期按需加 | +| 安全 | JWT + cert-manager + RBAC | **API Key 简单认证** | 调研期无需生产级安全 | +| CI/CD | GitLab CI 完整流水线 | **无**(手动部署)| 调研期直接 compose up | + +--- + +## 四、硬件最低要求 + +| 资源 | 最低配置 | 推荐配置 | 说明 | +|------|---------|---------|------| +| CPU | 8 核 | 16 核+ | BGE-M3 CPU 模式需要较多核心 | +| 内存 | 32 GB | 64 GB | Milvus + BGE-M3 + Neo4j 内存消耗较大 | +| 存储 | 200 GB SSD | 500 GB SSD | 含模型文件(约 5GB)+ 数据 | +| GPU | **无需** | 1× RTX 3090(24GB)| 有 GPU 可加速嵌入/MinerU | +| 网络 | 能访问 DeepSeek API | — | LLM 完全在云端 | +| OS | Ubuntu 22.04 LTS | — | 或 Windows 11 + WSL2 | + +**各组件内存估算:** + +| 服务 | 内存占用 | +|------|---------| +| PostgreSQL | ~1 GB | +| Redis | ~512 MB | +| Milvus(含 etcd/minio)| ~4 GB | +| Neo4j | ~2 GB | +| BGE-M3(CPU 模式)| ~6 GB | +| MinerU(CPU 模式)| ~4 GB | +| compliance-backend | ~1 GB | +| celery-worker × 1 | ~1 GB | +| **合计** | **~20 GB** | + +--- + +## 五、五阶段部署步骤(总览) + +``` +阶段一:宿主机环境准备 + └─ 安装 Docker CE / Docker Desktop + └─ 配置 nvidia-container-toolkit(有 GPU 时) + └─ 创建项目目录,配置 .env + +阶段二:基础中间件启动 + └─ PostgreSQL + Redis(优先启动) + └─ etcd + MinIO(Milvus 依赖) + └─ Milvus Standalone(向量检索核心) + └─ Neo4j Community(知识图谱) + +阶段三:AI 模型服务构建与启动 + └─ 构建 embedding-service(BGE-M3 封装) + └─ 构建 mcp-server(MinerU 封装) + └─ 预下载模型(BGE-M3 ~2.5GB,MinerU ~2GB) + +阶段四:业务微服务启动 + └─ compliance-backend(FastAPI 主服务) + └─ 
celery-worker(异步任务处理) + └─ celery-beat(定时任务调度) + └─ nginx(API 网关) + +阶段五:验证与闭环测试 + └─ 健康检查(bash scripts/check_health.sh) + └─ 端到端冒烟测试(bash scripts/07_smoke_test.sh) + └─ 三条业务闭环验证 +``` + +--- + +## 六、三条核心业务闭环 + +### 闭环①:法规入库 → 检索问答 + +``` +用户上传 PDF + │ + ▼ +API Gateway(Nginx) + │ + ▼ +kbmp-service(文件接收) + │ 异步投递 + ▼ +Celery Worker + │ + ├─► parse-worker ──► mcp-server(MinerU 解析) + │ │ Markdown + 结构化文本 + │ ▼ + └─► vectorize-worker ──► embedding-service(BGE-M3) + │ 1024维向量 + ▼ + Milvus(向量存储)+ PostgreSQL(元数据) + +用户提问 + │ + ▼ +BM25 关键词检索 + BGE-M3 向量检索(Milvus hybrid search) + │ + ▼ +Cross-Encoder Reranker(精排 Top-K) + │ + ▼ +DeepSeek API(引文锚定生成) + │ + ▼ +返回答案(含原文引用 + 页码) +``` + +### 闭环②:文档上传 → 合规审查 + +``` +上传供应商/内部文档 + │ + ▼ +MinerU 解析 → 条款级分割 + │ + ▼ +法规域匹配(vehicle_safety / data_security / ehs) + │ + ▼ +与法规库语义比对(向量相似度 + 关键字匹配) + │ + ▼ +DeepSeek API 风险评分(条款级分析) + │ + ▼ +生成 Markdown 审查报告(风险等级 + 整改建议) +``` + +### 闭环③:法规监控 → 变更推送 + +``` +Celery Beat 定时触发(每天) + │ + ▼ +抓取监控源(国标委 / 工信部 / 应急管理部 / 生环部) + │ + ▼ +内容 Hash 比对(检测变更) + │ + ▼ [有变更] +NLP Diff 分析(DeepSeek 提取新增/修订/废止条款) + │ + ▼ +增量入库(MinerU 解析 → BGE-M3 → Milvus + PostgreSQL + Neo4j) + │ + ▼ +差距分析(与企业现状比对) + │ + ▼ +推送通知(Email / Webhook / 飞书 / 钉钉) + │ + ▼ +记录变更日志 → 触发整改任务 +``` + +--- + +## 七、技术选型决策依据 + +| 组件 | 选型 | 决策依据 | +|------|------|---------| +| 向量数据库 | Milvus 2.4 | 支持 Dense+Sparse 混合检索,BGE-M3 配套,生产可扩展 | +| 图数据库 | Neo4j 5.x | 法规实体关系建模成熟,APOC 插件丰富,Cypher 查询友好 | +| 嵌入模型 | BGE-M3 | 中英文双语,支持 dense+sparse+multi-vector,8192 token 上下文 | +| LLM | DeepSeek API | 推理能力强,成本低(约¥1/百万 tokens),OpenAI 兼容 | +| 文档解析 | MinerU | GPU 最快 0.21s/页,支持 109 种语言 OCR,布局感知 | +| 任务队列 | Celery + Redis | 调研阶段够用,比 Kafka 轻量,Redis 可复用 | +| API 框架 | FastAPI | 异步性能好,OpenAPI 自动生成,Pydantic 数据验证 | +| 关系数据库 | PostgreSQL + pgvector | 元数据存储 + 备用向量检索,pgvector 镜像开箱即用 | + +--- + +## 八、升级路径(调研 → 生产) + +| 维度 | 升级内容 | 触发条件 | +|------|---------|---------| +| LLM | API → 本地 vLLM + DeepSeek-V3 | 数据安全要求/API成本超阈值 | +| Milvus | Standalone → 分布式集群 | 向量数据 > 1000 万条 | +| 消息队列 | 
Celery+Redis → Kafka | 并发任务 > 100/分钟 | +| 编排 | Docker Compose → Kubernetes | 多节点部署/弹性伸缩需求 | +| 安全 | API Key → JWT + RBAC | 对外提供服务/多租户 | +| 监控 | Grafana → Grafana + ELK | 日志量大/需要复杂分析 | + +--- + +## 九、文件结构说明 + +``` +Depolyment/ +├── 00_整体部署规划.md ← 本文档 +├── 01_技术架构详解.md ← 六层架构 + 六大微服务详细说明 +├── 02_组件安装指南.md ← 每个组件的详细安装步骤 +├── 03_业务闭环说明.md ← 三条闭环的数据流和接口规范 +├── README.md ← 快速启动指南 +├── docker-compose.yml ← 全服务编排 +├── .env.example ← 环境变量模板 +├── scripts/ ← 安装与运维脚本(13 个) +├── services/ ← 服务源码 +│ ├── embedding/ ← BGE-M3 嵌入服务 +│ ├── mcp-server/ ← MinerU 文档解析服务 +│ └── compliance-backend/ ← 核心业务后端 +├── config/ ← Nginx、Prometheus 配置 +├── init-sql/ ← PostgreSQL 初始化 SQL +├── data/ ← 运行时数据 +├── logs/ ← 服务日志 +└── models/ ← AI 模型缓存 +``` diff --git a/01_技术架构详解.md b/01_技术架构详解.md new file mode 100644 index 0000000..d4cf267 --- /dev/null +++ b/01_技术架构详解.md @@ -0,0 +1,263 @@ +# AI合规智能中枢 — 技术架构详解 + +> 本文档对应架构文档:`01_分层次技术架构图.html` 和 `02_详细技术架构图.html` + +--- + +## 一、六层架构总览 + +``` +┌──────────────────────────────────────────────────────────────────┐ +│ L1 应用接入层:Web / Mobile / Bot / API Gateway / RBAC │ +├──────────────────────────────────────────────────────────────────┤ +│ L2 业务能力层:知识库问答 / 文档审查 / EHS / 法规监控 / 推荐 │ +├──────────────────────────────────────────────────────────────────┤ +│ L3 法规感知层:监控 → 感知 → 解析 → 图谱 → 分析 → 闭环 │ +├──────────────────────────────────────────────────────────────────┤ +│ L4 AI引擎层:RAG / LLM / 文档解析 / 知识图谱推理 / NLP │ +├──────────────────────────────────────────────────────────────────┤ +│ L5 数据知识层:Milvus / PostgreSQL / Neo4j / Redis / 知识库 │ +├──────────────────────────────────────────────────────────────────┤ +│ L6 基础设施层:安全治理 / 容器编排 / 运维观测 / CI/CD │ +└──────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 二、六大微服务详解 + +### 2.1 kbmp-service(知识库公开接口) + +**职责:** 知识库的统一入口,处理文件上传、检索编排、任务投递。 + +**核心接口:** + +| 方法 | 路径 | 功能 | +|------|------|------| +| POST | `/workspace/create` | 创建知识库工作空间 | +| POST | `/files/upload` | 上传文件(触发解析任务) | +| POST | 
`/files/parse` | 手动触发解析 | +| POST | `/knowledge/retrieval` | 混合检索(BM25 + 向量)| +| POST | `/chunks/recall` | 原始 Chunk 召回 | +| POST | `/qa` | 检索 + LLM 问答生成 | + +**内部流程:** +``` +文件上传 → 存储 data/uploads → 投递 Celery 任务(parse-queue) + → parse-worker 调用 mcp-server 解析 + → vectorize-worker 调用 embedding-service 向量化 + → 写入 Milvus(向量)+ PostgreSQL(元数据) +``` + +--- + +### 2.2 mcp-server(文档解析服务) + +**职责:** 将 PDF/Word/Excel 等文档转换为结构化 Markdown,供后续向量化。 + +**核心接口:** + +| 方法 | 路径 | 功能 | +|------|------|------| +| POST | `/parse-document` | 通用解析(自动选择引擎)| +| POST | `/mineru-parse` | MinerU 精准解析 | +| GET | `/health` | 健康检查 | + +**解析策略(降级链):** +``` +1. 阿里云文档解析 API(云端高精度)→ [调研版暂不启用] +2. MinerU(本地,GPU/CPU 均支持)→ 主用 +3. python-docx / PyMuPDF(纯文本降级)→ 兜底 +``` + +**MinerU 特性:** +- GPU 最快:0.21 秒/页 +- CPU 模式:约 3-5 秒/页(调研阶段可接受) +- 支持 109 种语言 OCR +- 布局感知:区分正文/标题/表格/图片/页眉页脚 +- 输出格式:Markdown + JSON(含结构化元数据) + +--- + +### 2.3 合规业务后端(compliance-backend) + +**职责:** 核心业务逻辑,整合三条闭环的业务处理。 + +**核心接口:** + +| 方法 | 路径 | 功能 | +|------|------|------| +| POST | `/compliance/upload` | 上传待审查文档 | +| POST | `/compliance/check` | 智能合规审查 | +| GET | `/compliance/report/{id}` | 获取审查报告 | +| POST | `/compliance/regulations/download` | 下载法规 | +| POST | `/compliance/regulations/update` | 更新法规版本 | +| POST | `/compliance/access-control` | 权限分级管理 | +| POST | `/compliance/subscribe` | 订阅变更推送 | + +--- + +### 2.4 法规感知引擎(Regulation Awareness Engine) + +**职责:** 定时监控法规源,自动检测变更,触发增量更新。 + +**六步感知闭环:** + +``` +① 法规源监控 + - 定时抓取:国家标准委、工信部、UN-ECE、EUR-Lex、碳交易平台 + - 技术:requests + BeautifulSoup + Playwright(动态页面) + +② 智能变更感知 + - Hash 对比(快速过滤) + - NLP 版本 Diff(精确识别新增/修订/废止条款) + +③ 自动解析入库 + - MinerU 解析 → 条款级分割 + - BGE-M3 向量化 → Milvus + PostgreSQL + +④ 知识图谱同步 + - Neo4j 更新:法规-条款-义务关系 + - 影响分析:哪些企业文档受影响 + +⑤ 差距分析 + - AI 比对企业现状 vs 新法规要求 + - 生成差距报告 + +⑥ 推送与整改触发 + - 按角色推送(研发/EHS/采购/法务) + - 自动生成整改任务 +``` + +--- + +### 2.5 AI 推理引擎(AI Inference Engine) + +**职责:** 混合检索、精排、LLM 生成、知识图谱推理。 + +**混合检索流程:** + +``` +用户查询 + │ + ├─► BGE-M3 向量化(Dense 1024维) + │ │ + │ 
└─► Milvus ANN 检索(HNSW,Cosine 相似度) + │ + ├─► BM25 关键词检索(稀疏向量/倒排索引) + │ + └─► 结果融合(RRF 排名融合) + │ + ▼ + Cross-Encoder Reranker(精排 Top-K) + │ + ▼ + LLM 生成(DeepSeek API) + - System Prompt:引文锚定要求 + - 输出:答案 + 原文引用 + 来源文档 + 页码 +``` + +**BGE-M3 三种向量输出:** +- **Dense Vector**(1024维):语义相似度,主要用于向量检索 +- **Sparse Vector**(词汇权重):关键字匹配,等效 BM25 +- **Multi-Vector**(ColBERT 风格):精细粒度 token 级匹配 + +--- + +### 2.6 Worker 集群 + +**职责:** 异步任务处理,解耦主服务压力。 + +**Worker 类型:** + +| Worker | 队列 | 职责 | +|--------|------|------| +| parse-worker | `parse` | 调用 mcp-server 解析文档 | +| vectorize-worker | `vectorize` | BGE-M3 向量化 + Milvus 写入 | +| compliance-worker | `compliance` | 合规比对 + 风险评分 | +| monitor-worker | `monitor` | 法规源定时抓取 | +| push-worker | `push` | 推送通知(Email/Webhook)| + +**调度配置(Celery Beat):** +```python +CELERY_BEAT_SCHEDULE = { + "regulation-monitor": { + "task": "app.worker.fetch_regulation_updates", + "schedule": crontab(hour=2, minute=0), # 每天凌晨2点 + }, + "push-notifications": { + "task": "app.worker.send_pending_notifications", + "schedule": crontab(minute="*/30"), # 每30分钟 + }, +} +``` + +--- + +## 三、数据模型 + +### 3.1 PostgreSQL 表结构 + +``` +workspaces → 知识库工作空间 +files → 上传文件记录(含解析状态) +tasks → 异步任务状态追踪 +compliance_reports → 合规审查报告 +regulation_sources → 法规监控源配置 +regulation_updates → 法规变更记录 +subscriptions → 推送订阅配置 +audit_logs → 全链路审计日志 +``` + +### 3.2 Milvus Collection 结构 + +```python +# regulation_chunks / doc_chunks / case_library 共用相同 Schema +fields = [ + FieldSchema("id", VARCHAR, primary_key=True), + FieldSchema("file_id", VARCHAR), # 关联文件 + FieldSchema("workspace_id", VARCHAR), # 所属工作空间 + FieldSchema("chunk_idx", INT64), # 块序号 + FieldSchema("content", VARCHAR(65535)), # 原文内容 + FieldSchema("dense_vec", FLOAT_VECTOR(1024)), # BGE-M3 向量 + FieldSchema("metadata", JSON), # 扩展元数据 +] +# 索引:HNSW,Cosine 相似度,M=16,efConstruction=200 +``` + +### 3.3 Neo4j 图模型 + +```cypher +// 节点类型 +(:Regulation {id, title, code, version, domain, effective_date}) +(:Clause {id, number, content, clause_type}) 
+(:Obligation {id, description, obligation_type, subject}) +(:Enterprise {id, name, industry}) +(:RiskItem {id, description, severity, domain}) +(:Domain {name, label}) + +// 关系类型 +(Regulation)-[:CONTAINS]->(Clause) +(Clause)-[:REQUIRES]->(Obligation) +(Regulation)-[:SUPERSEDES]->(Regulation) // 版本替代 +(Clause)-[:MAPS_TO]->(RiskItem) +(Enterprise)-[:SUBJECT_TO]->(Regulation) +``` + +--- + +## 四、核心技术栈版本锁定 + +| 组件 | 版本 | Docker 镜像 | +|------|------|------------| +| PostgreSQL | 16 + pgvector | `pgvector/pgvector:pg16` | +| Redis | 7.x | `redis:7-alpine` | +| Milvus | 2.4.13 | `milvusdb/milvus:v2.4.13` | +| Neo4j | 5.20 Community | `neo4j:5.20-community` | +| BGE-M3 | 最新 | BAAI/bge-m3(HuggingFace)| +| MinerU | 1.x | opendatalab/MinerU(pip)| +| LangChain | 0.3+ | pip install langchain>=0.3 | +| FastAPI | 0.115+ | pip install fastapi>=0.115 | +| Celery | 5.4+ | pip install celery[redis]>=5.4 | +| Python | 3.12 | python:3.12-slim(Docker)| +| Nginx | 1.25 | `nginx:1.25-alpine` | diff --git a/02_组件安装指南.md b/02_组件安装指南.md new file mode 100644 index 0000000..05594a6 --- /dev/null +++ b/02_组件安装指南.md @@ -0,0 +1,569 @@ +# AI合规智能中枢 — 组件安装指南 + +> 本文档提供每个组件的详细安装步骤、配置说明和验证方法。 + +--- + +## 前置:Docker 环境安装 + +### Ubuntu 22.04 LTS + +```bash +# 1. 更新包列表 +sudo apt-get update + +# 2. 安装依赖 +sudo apt-get install -y ca-certificates curl gnupg lsb-release + +# 3. 添加 Docker GPG 密钥 +sudo install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \ + sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg +sudo chmod a+r /etc/apt/keyrings/docker.gpg + +# 4. 添加 Docker 仓库 +echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \ + https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + +# 5. 安装 Docker CE +sudo apt-get update +sudo apt-get install -y docker-ce docker-ce-cli containerd.io \ + docker-buildx-plugin docker-compose-plugin + +# 6. 
加入 docker 组(免 sudo) +sudo usermod -aG docker $USER +newgrp docker + +# 7. 验证 +docker --version # Docker version 27.x.x +docker compose version # Docker Compose version v2.x.x +``` + +### Windows 11 + WSL2 + +```powershell +# PowerShell(管理员) + +# 1. 启用 WSL2 +wsl --install -d Ubuntu-22.04 +wsl --set-default-version 2 + +# 2. 安装 Docker Desktop(需重启) +winget install -e --id Docker.DockerDesktop + +# 3. 重启后,Docker Desktop 设置: +# Settings → General → "Use WSL 2 based engine" ✓ +# Settings → Resources → WSL Integration → Ubuntu-22.04 ✓ +``` + +### GPU 支持(可选,有 NVIDIA GPU 时) + +```bash +# Ubuntu 安装 nvidia-container-toolkit +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ + sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + +curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + +sudo apt-get update +sudo apt-get install -y nvidia-container-toolkit +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker + +# 验证 +docker run --rm --gpus all nvidia/cuda:12.4-base nvidia-smi +``` + +--- + +## 组件一:PostgreSQL 16 + pgvector + +**用途:** 存储元数据(文件记录、任务状态、合规报告、法规变更) + +### 配置参数 + +```yaml +# docker-compose.yml 中的关键配置 +image: pgvector/pgvector:pg16 # 内置 pgvector 扩展 +POSTGRES_USER: compliance +POSTGRES_PASSWORD: +POSTGRES_DB: compliance_db +volumes: + - postgres_data:/var/lib/postgresql/data # 数据持久化 + - ./init-sql:/docker-entrypoint-initdb.d # 自动执行初始化 SQL +ports: + - "5432:5432" +``` + +### 启动与验证 + +```bash +# 启动 +docker compose up -d postgres + +# 等待健康(约10秒) +docker compose ps postgres + +# 连接测试 +docker compose exec postgres psql -U compliance -d compliance_db -c "\dt" + +# 验证扩展 +docker compose exec postgres psql -U compliance -d compliance_db \ + -c "SELECT extname FROM pg_extension WHERE 
extname IN ('vector', 'uuid-ossp');" +``` + +### 常用操作 + +```bash +# 查看所有表 +docker compose exec postgres psql -U compliance -d compliance_db \ + -c "\dt" + +# 查询任务状态 +docker compose exec postgres psql -U compliance -d compliance_db \ + -c "SELECT id, task_type, status, created_at FROM tasks ORDER BY created_at DESC LIMIT 10;" + +# 备份数据库 +docker compose exec postgres pg_dump -U compliance compliance_db > backup_$(date +%Y%m%d).sql +``` + +--- + +## 组件二:Redis 7 + +**用途:** Celery 消息中间件、热数据缓存、分布式锁、会话存储 + +### 配置参数 + +```yaml +image: redis:7-alpine +command: > + redis-server + --requirepass + --maxmemory 2gb + --maxmemory-policy allkeys-lru # 内存满时淘汰最近最少使用的 key +ports: + - "6379:6379" +``` + +### 启动与验证 + +```bash +# 启动 +docker compose up -d redis + +# 连接测试 +docker compose exec redis redis-cli -a ping +# 应返回:PONG + +# 查看 Celery 队列长度 +docker compose exec redis redis-cli -a llen celery + +# 查看内存使用 +docker compose exec redis redis-cli -a info memory | grep used_memory_human +``` + +--- + +## 组件三:Milvus 2.4 Standalone + +**用途:** 向量数据库,存储 BGE-M3 嵌入向量,支持混合检索 + +### 架构说明 + +Milvus Standalone 包含三个内部组件: +- **etcd**:元数据存储(Collection 定义、索引配置) +- **MinIO**:向量段文件存储 +- **milvus**:查询/写入引擎 + +### 启动顺序(严格按顺序) + +```bash +# 1. 先启动 etcd +docker compose up -d etcd +sleep 10 + +# 2. 再启动 MinIO +docker compose up -d minio +sleep 10 + +# 3. 
最后启动 Milvus(依赖前两者) +docker compose up -d milvus +# Milvus 冷启动约需 60 秒,请耐心等待 +``` + +### 验证 + +```bash +# HTTP 健康检查 +curl http://localhost:9091/healthz +# 应返回:{"status":"ok"} + +# Python 连接测试 +python3 -c " +from pymilvus import connections, utility +connections.connect(host='localhost', port='19530') +print('Collections:', utility.list_collections()) +print('Milvus 连接成功') +" +``` + +### 创建 Collection(向量索引) + +```python +from pymilvus import (connections, Collection, CollectionSchema, + FieldSchema, DataType, utility) + +connections.connect(host='localhost', port='19530') + +fields = [ + FieldSchema('id', DataType.VARCHAR, is_primary=True, max_length=128), + FieldSchema('content', DataType.VARCHAR, max_length=65535), + FieldSchema('dense_vec', DataType.FLOAT_VECTOR, dim=1024), # BGE-M3 + FieldSchema('metadata', DataType.JSON), +] + +schema = CollectionSchema(fields, description='法规条款向量库') +col = Collection('regulation_chunks', schema) + +# 创建 HNSW 索引(速度/精度平衡) +col.create_index('dense_vec', { + 'metric_type': 'COSINE', + 'index_type': 'HNSW', + 'params': {'M': 16, 'efConstruction': 200} +}) +col.load() +print('Collection 创建完成') +``` + +### 常用查询 + +```python +# 向量相似度检索 +results = col.search( + data=[query_vector], # 查询向量(1024维) + anns_field='dense_vec', + param={'metric_type': 'COSINE', 'params': {'ef': 100}}, + limit=10, + output_fields=['content', 'metadata'] +) + +# 查看 Collection 统计 +print(col.num_entities) # 向量总数 +``` + +--- + +## 组件四:Neo4j 5 Community + +**用途:** 知识图谱存储,法规-条款-义务实体关系 + +### 配置参数 + +```yaml +image: neo4j:5.20-community +environment: + NEO4J_AUTH: neo4j/ + NEO4J_PLUGINS: '["apoc"]' # 必须安装 APOC 插件 + NEO4J_dbms_memory_heap_max__size: 2G +ports: + - "7474:7474" # Browser UI + - "7687:7687" # Bolt 协议(应用连接用) +``` + +### 启动与验证 + +```bash +# 启动 +docker compose up -d neo4j +# 首次启动约需 60 秒(下载 APOC 插件) + +# 浏览器访问:http://localhost:7474 +# 用户名:neo4j,密码:见 .env 中 NEO4J_PASSWORD + +# 命令行连接 +docker compose exec neo4j cypher-shell -u neo4j -p +``` + +### 常用 Cypher 查询 + 
+```cypher +// 查看所有节点类型 +CALL apoc.meta.schema() YIELD value RETURN value; + +// 创建法规节点 +CREATE (r:Regulation { + id: 'GB18384-2020', + title: 'GB 18384-2020 电动汽车安全要求', + domain: 'vehicle_safety', + effective_date: date('2021-01-01'), + version: '2020' +}); + +// 法规-条款关系 +MATCH (r:Regulation {id: 'GB18384-2020'}) +CREATE (c:Clause { + id: 'GB18384-2020-2.1', + number: '2.1', + content: '绝缘电阻要求:直流电路绝缘电阻不得低于100Ω/V' +}) +CREATE (r)-[:CONTAINS]->(c); + +// 多跳查询:查找某法规所有义务 +MATCH (r:Regulation {domain: 'vehicle_safety'})-[:CONTAINS]->(c)-[:REQUIRES]->(o) +RETURN r.title, c.number, o.description LIMIT 20; +``` + +--- + +## 组件五:BGE-M3 嵌入服务 + +**用途:** 将文本转换为 1024 维向量,支持中英双语,支持 Dense+Sparse 混合检索 + +### 服务构建 + +```bash +# 构建镜像 +docker compose build embedding-service + +# 首次启动(会自动下载 BGE-M3 模型约 2.5GB) +docker compose up -d embedding-service + +# 查看下载进度 +docker compose logs -f embedding-service +``` + +### 模型预下载(推荐,避免启动超时) + +```bash +# 方法1:通过 hf-mirror.com 加速 +bash scripts/download_models.sh + +# 方法2:通过 ModelScope(国内最快) +pip install modelscope +python3 -c " +from modelscope import snapshot_download +snapshot_download('AI-ModelScope/bge-m3', cache_dir='./models/modelscope') +" +``` + +### API 使用 + +```bash +# 健康检查 +curl http://localhost:8010/health + +# 生成嵌入向量 +curl -X POST http://localhost:8010/embed \ + -H "Content-Type: application/json" \ + -d '{ + "texts": ["GB 18384 电动汽车碰撞安全", "vehicle crash safety requirements"], + "batch_size": 2 + }' +# 返回:{"dense": [[...1024个浮点数...], [...]], "sparse": [{...词汇权重...}, {...}]} +``` + +### 性能参考 + +| 模式 | 硬件 | 速度 | +|------|------|------| +| CPU | 16核,64GB RAM | 约 2-5 秒/批(batch=16)| +| GPU | RTX 3090 24GB | 约 0.2-0.5 秒/批(batch=32)| + +--- + +## 组件六:MinerU 文档解析服务 + +**用途:** 将 PDF/Word/Excel 解析为 Markdown + 结构化 JSON + +### 服务构建 + +```bash +# 构建镜像(首次约需 10-20 分钟,下载大量依赖) +docker compose build mcp-server + +# 启动服务(首次会下载 MinerU 模型约 2GB) +docker compose up -d mcp-server + +# 查看启动日志 +docker compose logs -f mcp-server +``` + +### API 使用 + +```bash +# 
解析 PDF +curl -X POST http://localhost:8011/mineru-parse \ + -F "file=@/path/to/regulation.pdf" +# 返回:{"markdown": "# 法规标题\n\n## 第一章...", "filename": "regulation.pdf"} + +# 解析 Word 文档 +curl -X POST http://localhost:8011/parse-document \ + -F "file=@/path/to/document.docx" +``` + +### 性能参考 + +| 模式 | 速度 | 说明 | +|------|------|------| +| CPU | 3-5 秒/页 | 调研阶段可接受 | +| GPU(RTX 3090)| 0.21 秒/页 | 生产推荐 | + +--- + +## 组件七:业务后端(compliance-backend) + +**用途:** FastAPI 主服务,整合所有业务逻辑 + +### 关键依赖配置 + +```bash +# .env 中必须设置 +DEEPSEEK_API_KEY=sk-xxxx # DeepSeek API Key +LLM_PROVIDER=deepseek # 或 qwen +DATABASE_URL=postgresql+asyncpg://... +REDIS_URL=redis://:password@redis:6379/0 +MILVUS_HOST=milvus +NEO4J_URI=bolt://neo4j:7687 +EMBEDDING_SERVICE_URL=http://embedding-service:8010 +MCP_SERVER_URL=http://mcp-server:8011 +``` + +### 启动与验证 + +```bash +# 启动服务 +docker compose up -d compliance-backend celery-worker celery-beat + +# 验证 API 文档 +open http://localhost:8000/docs + +# 查看健康状态(包含所有依赖) +curl http://localhost:8000/health +``` + +### Celery Worker 监控 + +```bash +# 查看 Worker 状态 +docker compose exec celery-worker celery -A app.worker inspect active + +# 查看队列积压 +docker compose exec redis redis-cli -a llen celery + +# Worker 日志 +docker compose logs -f celery-worker +``` + +--- + +## 组件八:Nginx API 网关 + +**用途:** 反向代理,统一路由,TLS 终止(生产) + +### 配置说明(config/nginx.conf) + +```nginx +upstream compliance_backend { + server compliance-backend:8000; +} + +server { + listen 80; + client_max_body_size 100M; # 支持大 PDF 上传 + proxy_read_timeout 300s; # LLM 推理超时设置 + + location /api/kb/ { proxy_pass http://compliance_backend; } + location /api/compliance/ { proxy_pass http://compliance_backend; } + location /api/regulation/ { proxy_pass http://compliance_backend; } + location /health { proxy_pass http://compliance_backend; } + location /docs { proxy_pass http://compliance_backend; } +} +``` + +### 启动与验证 + +```bash +# 启动 +docker compose up -d nginx + +# 测试路由 +curl http://localhost/health +curl 
http://localhost/docs # 应返回 Swagger UI HTML +``` + +--- + +## 完整启动顺序 + +```bash +# 方式1:分步启动(推荐,含健康等待) +bash scripts/06_start_all.sh + +# 方式2:手动分步 +docker compose up -d postgres redis # 等30s +docker compose up -d etcd minio # 等30s +docker compose up -d milvus # 等60s +docker compose up -d neo4j # 等60s +docker compose build embedding-service mcp-server compliance-backend +docker compose up -d embedding-service mcp-server # 等120s(模型加载) +bash scripts/05_init_db.sh # 初始化数据库 +docker compose up -d compliance-backend celery-worker celery-beat nginx + +# 验证 +bash scripts/check_health.sh +``` + +--- + +## 常见问题 + +### Q: Milvus 启动失败 + +```bash +# 检查 etcd 和 minio 是否健康 +docker compose ps etcd minio + +# 查看 Milvus 日志 +docker compose logs milvus | tail -50 + +# 常见原因:内存不足(Milvus 需要至少 4GB 可用内存) +free -h +``` + +### Q: BGE-M3 模型下载失败 + +```bash +# 使用镜像加速 +export HF_ENDPOINT=https://hf-mirror.com +docker compose up -d embedding-service + +# 或使用 ModelScope +bash scripts/download_models.sh +``` + +### Q: DeepSeek API 连接超时 + +```bash +# 测试连通性 +curl -X POST https://api.deepseek.com/v1/chat/completions \ + -H "Authorization: Bearer $DEEPSEEK_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "deepseek-chat", "messages": [{"role": "user", "content": "ping"}]}' + +# 常见原因:API Key 未设置或网络问题 +``` + +### Q: 内存不足 + +```bash +# 查看内存使用 +docker stats --no-stream + +# 临时解决:减少 BGE-M3 批大小(降低内存峰值) +# 编辑 .env,添加: +# EMBEDDING_BATCH_SIZE=4 (默认16) +``` diff --git a/03_业务闭环说明.md b/03_业务闭环说明.md new file mode 100644 index 0000000..8ff13a1 --- /dev/null +++ b/03_业务闭环说明.md @@ -0,0 +1,536 @@ +# AI合规智能中枢 — 三条业务闭环说明 + +> 本文档详细描述三条核心业务闭环的数据流、接口规范和验证方法。 + +--- + +## 一、闭环①:法规入库 → 检索问答 + +### 1.1 业务场景 + +**触发场景:** +- 法务/研发人员上传新法规 PDF(如 GB 18384-2020、UN-ECE R155) +- 系统自动解析、分块、向量化,建立可检索知识库 +- 用户用自然语言提问,系统返回精准答案并标注来源 + +**用户角色:** 车企研发、法务、合规管理员 + +### 1.2 数据流 + +``` +[用户] 上传 PDF + │ + ▼ +POST /api/kb/files/upload +{workspace_id, file} + │ + ▼ +[kbmp-service] + - 存储文件 → data/uploads/{file_id}.pdf + - 写入 files 
表(status: uploaded) + - 投递 Celery 任务 → parse-queue + - 返回 {task_id, file_id} + │ + ▼ 异步 +[celery: parse-worker] + - 调用 POST http://mcp-server:8011/mineru-parse + - 获取 Markdown 文本 + - 更新 files 表(status: parsed) + - 投递 vectorize-queue + │ + ▼ 异步 +[celery: vectorize-worker] + - 文本分块(chunk_size=512,overlap=64) + - 调用 POST http://embedding-service:8010/embed + - 获取 1024维 Dense + Sparse 向量 + - 写入 Milvus regulation_chunks + - 写入 PostgreSQL(chunk 元数据) + - 更新 files 表(status: vectorized) + - 更新 tasks 表(status: completed) + +[用户] 提问 + │ + ▼ +POST /api/kb/qa +{query, workspace_id, top_k=5} + │ + ▼ +[rag-service] + 1. BGE-M3 向量化查询 + 2. Milvus Dense 向量检索(Cosine,top-20) + 3. Milvus Sparse 向量检索(BM25 等效,top-20) + 4. RRF 融合(Reciprocal Rank Fusion) + 5. Cross-Encoder Reranker 精排(top-5) + 6. 构建 RAG Prompt(含检索片段) + 7. DeepSeek API 生成答案(引文锚定) + │ + ▼ +返回:{answer, sources: [{content, file, page, score}], tokens_used} +``` + +### 1.3 关键接口 + +```http +### 创建工作空间 +POST /api/kb/workspaces +Content-Type: application/json + +{ + "name": "汽车安全法规库", + "description": "GB、UN-ECE 系列法规", + "domain": "vehicle_safety" +} + +### 响应 +{ + "id": "uuid-xxx", + "name": "汽车安全法规库", + "created_at": "2026-04-22T10:00:00Z" +} +``` + +```http +### 上传文件 +POST /api/kb/files/upload +Content-Type: multipart/form-data + +file: +workspace_id: uuid-xxx + +### 响应 +{ + "file_id": "uuid-yyy", + "task_id": "uuid-zzz", + "filename": "GB18384-2020.pdf", + "status": "processing" +} +``` + +```http +### 查询任务状态 +GET /api/kb/tasks/{task_id} + +### 响应 +{ + "task_id": "uuid-zzz", + "status": "completed", // pending / running / completed / failed + "progress": 100, + "file_id": "uuid-yyy", + "completed_at": "2026-04-22T10:05:00Z" +} +``` + +```http +### 智能问答 +POST /api/kb/qa +Content-Type: application/json + +{ + "query": "电动汽车碰撞后高压系统的断电时间要求是多少?", + "workspace_id": "uuid-xxx", + "top_k": 5, + "return_sources": true +} + +### 响应 +{ + "answer": "根据 GB 18384-2020 第 2.2 条,碰撞后 5 秒内,高压系统电压应降至 60V 以下。[来源:GB18384-2020.pdf,第3页]", + 
"sources": [ + { + "content": "碰撞后5秒内,高压系统电压应降至60V以下。", + "file": "GB18384-2020.pdf", + "page": 3, + "chunk_idx": 12, + "score": 0.94 + } + ], + "tokens_used": 1250 +} +``` + +### 1.4 分块策略 + +```python +# 推荐分块配置(调研阶段) +CHUNK_SIZE = 512 # 每块最大 token 数 +CHUNK_OVERLAP = 64 # 块间重叠(保留上下文) +SEPARATOR = "\n\n" # 优先按段落分割 + +# 法规文档特殊处理 +# - 识别条款编号(1.1, 2.3.1 等),保证条款完整性 +# - 表格单独处理(不与正文混合) +# - 图片提取 alt text +``` + +--- + +## 二、闭环②:文档上传 → 合规审查 + +### 2.1 业务场景 + +**触发场景:** +- 采购/供应链人员上传供应商文件(技术规格书、合规声明等) +- 研发人员上传设计文档,检查是否符合最新法规 +- EHS 工程师上传安全操作规程,验证 ISO 45001 合规性 + +**用户角色:** 采购、供应链、研发、EHS 工程师 + +### 2.2 数据流 + +``` +[用户] 上传供应商文件 + │ + ▼ +POST /api/compliance/upload +{file, regulation_domains} + │ + ▼ +[compliance-backend] + - MinerU 解析文档 + - 条款级分割(识别条款结构) + - 法规域匹配(根据内容自动识别:vehicle_safety / data_security / ehs) + - 投递 compliance-queue + │ + ▼ 异步 +[celery: compliance-worker] + 1. 对每个条款,在 Milvus 中检索相关法规要求 + 2. DeepSeek API 评估合规性 + Prompt: "对比以下供应商条款与法规要求,评估合规性..." + 3. 生成风险评分(0-100) + 4. 汇总生成 Markdown 报告 + 5. 
存储 compliance_reports 表 + │ + ▼ +[用户] 获取报告 +GET /api/compliance/report/{id} +``` + +### 2.3 关键接口 + +```http +### 上传并审查文档 +POST /api/compliance/upload +Content-Type: multipart/form-data + +file: +regulation_domains: ["vehicle_safety", "data_security"] # 可多选 + +### 响应 +{ + "report_id": "uuid-aaa", + "file_id": "uuid-bbb", + "status": "analyzing", + "estimated_time_seconds": 60 +} +``` + +```http +### 直接合规检查(文本输入) +POST /api/compliance/check +Content-Type: application/json + +{ + "query": "供应商声明:产品绝缘电阻为50Ω/V,满足行业标准", + "regulation_domains": ["vehicle_safety"], + "top_k": 3 +} + +### 响应 +{ + "risk_level": "high", + "risk_score": 78, + "findings": [ + { + "clause": "GB 18384-2020 第2.1条", + "requirement": "直流电路绝缘电阻不得低于100Ω/V", + "actual": "供应商声明50Ω/V", + "gap": "不满足,差距50Ω/V", + "severity": "critical" + } + ], + "recommendations": [ + "要求供应商提升绝缘电阻至100Ω/V以上", + "提供经第三方认证的测试报告" + ] +} +``` + +```http +### 获取完整审查报告 +GET /api/compliance/report/{report_id} + +### 响应 +{ + "report_id": "uuid-aaa", + "overall_risk_level": "high", + "risk_score": 78, + "findings": [...], + "recommendations": [...], + "report_markdown": "# 合规审查报告\n\n## 总体评估\n...", + "regulation_domains": ["vehicle_safety"], + "llm_model": "deepseek-chat", + "created_at": "2026-04-22T11:00:00Z" +} +``` + +### 2.4 风险等级定义 + +| 风险等级 | 分数 | 说明 | 建议行动 | +|---------|------|------|---------| +| low | 0-30 | 基本合规,小幅优化 | 记录并监控 | +| medium | 31-60 | 部分不符合,需要整改 | 制定整改计划 | +| high | 61-80 | 重大不符合,需立即处理 | 暂停合作/紧急整改 | +| critical | 81-100 | 严重违规,可能造成法律风险 | 立即停止/上报管理层 | + +--- + +## 三、闭环③:法规监控 → 变更推送 + +### 3.1 业务场景 + +**触发场景:** +- 国家发布新的新能源汽车数据安全法规 +- 现有法规(如 GB 7258)进行修订 +- 碳排放法规新增企业义务 + +系统自动检测变更,分析影响,推送给相关角色。 + +**用户角色:** 合规管理员、法务专员、EHS 工程师(订阅对应域) + +### 3.2 数据流 + +``` +[Celery Beat] 每天凌晨 2:00 触发 + │ + ▼ +[celery: monitor-worker] + - 读取 regulation_sources 表(所有 is_active=True 的监控源) + - 对每个监控源: + a. HTTP 抓取页面内容 + b. 计算 MD5 Hash + c. 与 last_hash 对比 + d. 
有变化 → 投递变更分析任务 + │ + ▼ [有变更时] +[celery: compliance-worker] + - DeepSeek API 分析变更内容 + - 提取新增/修订/废止条款 + - 生成变更摘要 + - 写入 regulation_updates 表 + - 触发增量入库(重新向量化变更条款) + - 更新 Neo4j 知识图谱 + │ + ▼ +[celery: push-worker] + - 读取 subscriptions 表 + - 按域、重要性过滤 + - 发送推送(Email / Webhook / 飞书) + - 标记 is_notified=True +``` + +### 3.3 关键接口 + +```http +### 配置监控源 +POST /api/regulation/sources +Content-Type: application/json + +{ + "name": "国家标准全文公开系统", + "url": "https://std.samr.gov.cn", + "domain": "vehicle_safety", + "fetch_interval": 86400, + "fetch_config": { + "css_selector": ".standard-list .item", + "title_selector": ".title", + "date_selector": ".date" + } +} + +### 响应 +{ + "id": "uuid-src1", + "name": "国家标准全文公开系统", + "status": "active", + "next_fetch_at": "2026-04-23T02:00:00Z" +} +``` + +```http +### 查看法规变更记录 +GET /api/regulation/updates?domain=vehicle_safety&limit=10&offset=0 + +### 响应 +{ + "total": 25, + "updates": [ + { + "id": "uuid-upd1", + "title": "GB 18384-2022 电动汽车安全要求(修订版)", + "url": "https://std.samr.gov.cn/xxxx", + "change_type": "revised", + "summary": "主要变更:碰撞断电时间由5秒缩短至3秒;新增涉水安全要求", + "importance": "high", + "fetched_at": "2026-04-22T02:00:00Z" + } + ] +} +``` + +```http +### 手动触发法规源采集(测试用) +POST /api/regulation/sources/{source_id}/fetch + +### 响应 +{ + "task_id": "uuid-task1", + "status": "queued", + "source_id": "uuid-src1" +} +``` + +```http +### 订阅变更推送 +POST /api/regulation/subscribe +Content-Type: application/json + +{ + "name": "EHS 工程师推送", + "channel": "webhook", + "target": "https://open.feishu.cn/open-apis/bot/v2/hook/xxxx", + "domains": ["ehs", "carbon"], + "importance_min": "normal" +} +``` + +### 3.4 内置监控源列表 + +| 名称 | URL | 域 | +|------|-----|-----| +| 国家标准全文公开系统 | https://std.samr.gov.cn | vehicle_safety | +| 工信部政策法规 | https://www.miit.gov.cn/jgsj/fgs/zcfg | vehicle_safety | +| 应急管理部法规 | https://www.mem.gov.cn/gk/zcfg | ehs | +| 生态环境部法规 | https://www.mee.gov.cn/ywgz/fgbz/fl | carbon | +| 网信办法规 | https://www.cac.gov.cn/zcfg/index.htm | data_security | 
+ +--- + +## 四、接口认证说明(调研版) + +调研版使用简单 API Key 认证(在 `Authorization` 头传入): + +```http +# 所有请求需要携带 API Key +Authorization: Bearer <API_SECRET_KEY> +``` + +> `API_SECRET_KEY` 在 `.env` 中配置,默认值仅供本地调研使用,生产环境必须更换。 + +--- + +## 五、完整冒烟测试脚本 + +```bash +#!/usr/bin/env bash +# 完整三条闭环验证 +API="http://localhost" +KEY="your_api_secret_key" +HEADER="-H 'Authorization: Bearer $KEY' -H 'Content-Type: application/json'" + +# ── 闭环①测试 ──────────────────────────────── +echo "=== 测试闭环①:法规入库 → 问答 ===" + +# 1. 创建工作空间 +WS=$(curl -sf -X POST $API/api/kb/workspaces \ + -H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \ + -d '{"name":"测试法规库","domain":"vehicle_safety"}') +WS_ID=$(echo $WS | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])") +echo "工作空间:$WS_ID" + +# 2. 上传测试文件 +UPLOAD=$(curl -sf -X POST $API/api/kb/files/upload \ + -H "Authorization: Bearer $KEY" \ + -F "file=@data/uploads/test_regulation.txt" \ + -F "workspace_id=$WS_ID") +TASK_ID=$(echo $UPLOAD | python3 -c "import sys,json; print(json.load(sys.stdin)['task_id'])") +echo "任务ID:$TASK_ID" + +# 3. 等待处理 +for i in {1..30}; do + STATUS=$(curl -sf $API/api/kb/tasks/$TASK_ID -H "Authorization: Bearer $KEY" | \ + python3 -c "import sys,json; print(json.load(sys.stdin)['status'])") + [[ "$STATUS" == "completed" ]] && echo "处理完成" && break + sleep 5 +done + +# 4. 
问答测试 +QA=$(curl -sf -X POST $API/api/kb/qa \ + -H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \ + -d "{\"query\":\"碰撞后高压系统要求\",\"workspace_id\":\"$WS_ID\"}") +echo "问答结果:$(echo $QA | python3 -c "import sys,json; print(json.load(sys.stdin).get('answer','')[:100])")" + +# ── 闭环②测试 ──────────────────────────────── +echo "" +echo "=== 测试闭环②:合规审查 ===" +CHECK=$(curl -sf -X POST $API/api/compliance/check \ + -H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \ + -d '{"query":"绝缘电阻50Ω/V","regulation_domains":["vehicle_safety"]}') +echo "风险等级:$(echo $CHECK | python3 -c "import sys,json; print(json.load(sys.stdin).get('risk_level','unknown'))")" + +# ── 闭环③测试 ──────────────────────────────── +echo "" +echo "=== 测试闭环③:法规监控 ===" +SRC=$(curl -sf -X POST $API/api/regulation/sources \ + -H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \ + -d '{"name":"测试源","url":"https://std.samr.gov.cn","domain":"vehicle_safety"}') +echo "监控源:$(echo $SRC | python3 -c "import sys,json; print(json.load(sys.stdin).get('id','failed'))")" +``` + +--- + +## 六、数据流示意图(完整版) + +``` + ┌─────────────────────────────────┐ + │ 用户请求 │ + │ Web / API / Mobile / Bot │ + └──────────────┬──────────────────┘ + │ + ▼ + ┌─────────────────────────────────┐ + │ Nginx API Gateway │ + │ 路由 / 限流 / 认证 │ + └──────────────┬──────────────────┘ + │ + ┌────────────────────┼────────────────────┐ + │ │ │ + ▼ ▼ ▼ + ┌──────────────┐ ┌──────────────────┐ ┌────────────────┐ + │ 知识库 │ │ 合规审查 │ │ 法规监控 │ + │ /api/kb/* │ │ /api/compliance/* │ │/api/regulation/│ + └──────┬───────┘ └────────┬─────────┘ └───────┬────────┘ + │ │ │ + └──────────┬──────────┘ │ + │ │ + ▼ ▼ + ┌──────────────────┐ ┌──────────────────┐ + │ compliance- │ │ Celery Beat │ + │ backend │ │ 定时调度 │ + └──────┬───────────┘ └────────┬─────────┘ + │ │ + ┌──────────┼──────────┐ ┌──────────┼──────────┐ + │ │ │ │ │ │ + ▼ ▼ ▼ ▼ ▼ ▼ + parse-w vectorize-w compliance-w monitor-w push-w + │ │ │ │ │ + ▼ ▼ │ │ ▼ + mcp-server 
embedding LLM API 网络抓取 通知推送 + (MinerU) (BGE-M3) (DeepSeek) (requests) (Email/Bot) + │ │ + └────┬─────┘ + │ + ┌──────────┼──────────────┐ + ▼ ▼ ▼ + PostgreSQL Milvus Neo4j + (元数据/报告) (向量检索) (知识图谱) +``` diff --git a/README.md b/README.md new file mode 100644 index 0000000..61f829a --- /dev/null +++ b/README.md @@ -0,0 +1,190 @@ +# AI合规智能中枢 — 调研版部署指南 + +面向车企与工厂的全链路合规智能平台,Docker Compose 单机部署版本,用于验证三条业务闭环。 + +## 快速开始 + +### 前置要求 + +| 资源 | 最低 | 推荐 | +|------|------|------| +| CPU | 8核 | 16核+ | +| 内存 | 32 GB | 64 GB | +| 存储 | 200 GB SSD | 500 GB SSD | +| GPU | 无需 | 1× RTX 3090(加速嵌入)| +| OS | Ubuntu 22.04 LTS 或 Windows 11 + WSL2 | — | + +### 1. 安装 Docker + +**Ubuntu/Linux:** +```bash +bash scripts/00_install_docker_ubuntu.sh +``` + +**Windows(PowerShell 管理员):** +```powershell +.\scripts\00_install_docker_windows.ps1 +``` + +### 2. 配置环境变量 + +```bash +cp .env.example .env +# 编辑 .env,至少填写: +# - DEEPSEEK_API_KEY(在 https://platform.deepseek.com 申请) +nano .env +``` + +### 3. 一键启动 + +```bash +# 拉取镜像(可选,加速首次启动) +bash scripts/02_pull_images.sh + +# 分步启动(推荐,含健康等待) +bash scripts/06_start_all.sh +``` + +### 4. 
验证部署 + +```bash +# 检查所有服务状态 +bash scripts/check_health.sh + +# 运行端到端冒烟测试 +bash scripts/07_smoke_test.sh +``` + +--- + +## 服务访问地址 + +| 服务 | 地址 | 说明 | +|------|------|------| +| API 网关 | http://localhost | Nginx 入口,所有 API 请求入口 | +| 业务后端 | http://localhost:8000/docs | FastAPI Swagger UI | +| Neo4j 浏览器 | http://localhost:7474 | 知识图谱可视化 | +| Grafana | http://localhost:3000 | 监控面板(`--profile monitoring` 启动)| +| Milvus | localhost:19530 | 向量数据库 gRPC 端口 | + +--- + +## 三条业务闭环 + +### 闭环①:法规入库 → 检索问答 + +```bash +# 上传法规PDF +curl -X POST http://localhost/api/kb/files/upload \ + -F "file=@your_regulation.pdf" \ + -F "workspace_id=auto-regulation" + +# 查询任务状态 +curl http://localhost/api/kb/tasks/{task_id} + +# 检索问答 +curl -X POST http://localhost/api/kb/qa \ + -H "Content-Type: application/json" \ + -d '{"query": "GB 18384 电动汽车碰撞安全要求", "top_k": 5}' +``` + +### 闭环②:文档上传 → 合规审查 + +```bash +# 上传供应商文件 +curl -X POST http://localhost/api/compliance/upload \ + -F "file=@supplier_document.pdf" + +# 触发合规审查 +curl -X POST http://localhost/api/compliance/check \ + -H "Content-Type: application/json" \ + -d '{"doc_id": "xxx", "regulation_domains": ["vehicle_safety", "data_security"]}' + +# 获取审查报告 +curl http://localhost/api/compliance/report/{id} +``` + +### 闭环③:法规监控 → 变更推送 + +```bash +# 配置监控源 +curl -X POST http://localhost/api/regulation/sources \ + -H "Content-Type: application/json" \ + -d '{"url": "https://std.samr.gov.cn", "name": "国家标准全文公开"}' + +# 查看变更记录 +curl http://localhost/api/regulation/updates +``` + +--- + +## 目录结构 + +``` +Depolyment/ +├── README.md # 本文件 +├── docker-compose.yml # 全服务编排 +├── .env.example # 环境变量模板 +├── scripts/ # 安装与运维脚本 +├── services/ +│ ├── embedding/ # BGE-M3 嵌入服务 +│ ├── mcp-server/ # MinerU 文档解析服务 +│ └── compliance-backend/ # 核心业务后端 +├── config/ # Nginx、Prometheus 配置 +├── init-sql/ # PostgreSQL 初始化 SQL +├── data/ # 运行时数据(上传文件、解析结果) +├── logs/ # 服务日志 +└── models/ # AI 模型缓存(BGE-M3、MinerU) +``` + +--- + +## 常用操作 + +```bash +# 查看所有服务状态 +docker compose ps + +# 
查看某个服务日志 +docker compose logs -f compliance-backend + +# 重启某个服务 +docker compose restart embedding-service + +# 停止所有服务(保留数据) +docker compose stop + +# 完全重置(删除所有数据,慎用) +bash scripts/reset_all.sh + +# 启动监控面板 +docker compose --profile monitoring up -d grafana +``` + +--- + +## LLM 切换 + +默认使用 DeepSeek API,如需切换到 Qwen(阿里云): + +编辑 `.env`: +```bash +LLM_PROVIDER=qwen +DASHSCOPE_API_KEY=your_key_here +QWEN_MODEL=qwen-plus +``` + +然后重启业务服务: +```bash +docker compose restart compliance-backend celery-worker +``` + +--- + +## 生产升级路径 + +调研验证通过后,升级要点: +1. **LLM**:从 API 切换到本地 vLLM + DeepSeek-V3(需要 4×A100) +2. **Milvus**:从 Standalone 升级到分布式集群(加独立 MinIO) +3. **编排**:从 Docker Compose 迁移到 Kubernetes(服务配置文件可复用) +4. **安全**:启用完整 JWT/RBAC,添加 TLS 证书 diff --git a/config/nginx.conf b/config/nginx.conf new file mode 100644 index 0000000..c7c58c5 --- /dev/null +++ b/config/nginx.conf @@ -0,0 +1,63 @@ +upstream compliance_backend { + server compliance-backend:8000; + keepalive 32; +} + +# 文件上传大小限制(法规PDF可能较大) +client_max_body_size 100M; + +server { + listen 80; + server_name _; + + # 访问日志 + access_log /var/log/nginx/access.log; + error_log /var/log/nginx/error.log; + + # 超时配置(LLM推理可能较慢) + proxy_connect_timeout 60s; + proxy_send_timeout 300s; + proxy_read_timeout 300s; + + # 通用代理头 + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_http_version 1.1; + proxy_set_header Connection ""; + + # ── 知识库接口 ───────────────────────────── + location /api/kb/ { + proxy_pass http://compliance_backend/api/kb/; + } + + # ── 合规审查接口 ─────────────────────────── + location /api/compliance/ { + proxy_pass http://compliance_backend/api/compliance/; + } + + # ── 法规监控接口 ─────────────────────────── + location /api/regulation/ { + proxy_pass http://compliance_backend/api/regulation/; + } + + # ── 健康检查 ─────────────────────────────── + location /health { + proxy_pass 
http://compliance_backend/health; + } + + # ── API 文档(开发环境)──────────────────── + location /docs { + proxy_pass http://compliance_backend/docs; + } + + location /openapi.json { + proxy_pass http://compliance_backend/openapi.json; + } + + # ── 根路径 ───────────────────────────────── + location / { + proxy_pass http://compliance_backend/; + } +} diff --git a/config/prometheus.yml b/config/prometheus.yml new file mode 100644 index 0000000..ddb11ff --- /dev/null +++ b/config/prometheus.yml @@ -0,0 +1,22 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: "compliance-backend" + static_configs: + - targets: ["compliance-backend:8000"] + metrics_path: /metrics + + - job_name: "milvus" + static_configs: + - targets: ["milvus:9091"] + metrics_path: /metrics + + - job_name: "redis" + static_configs: + - targets: ["redis:6379"] + + - job_name: "postgres" + static_configs: + - targets: ["postgres:5432"] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..87a6bfc --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,380 @@ +version: "3.9" + +networks: + compliance-net: + driver: bridge + +volumes: + postgres_data: + redis_data: + milvus_data: + minio_data: + neo4j_data: + neo4j_logs: + +services: + + # ═══════════════════════════════════════════════ + # 基础数据层 + # ═══════════════════════════════════════════════ + + postgres: + image: pgvector/pgvector:pg16 + container_name: compliance-postgres + restart: unless-stopped + environment: + POSTGRES_USER: compliance + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-compliance123} + POSTGRES_DB: compliance_db + volumes: + - postgres_data:/var/lib/postgresql/data + - ./init-sql:/docker-entrypoint-initdb.d + ports: + - "5432:5432" + networks: [compliance-net] + healthcheck: + test: ["CMD-SHELL", "pg_isready -U compliance -d compliance_db"] + interval: 10s + timeout: 5s + retries: 10 + + redis: + image: redis:7-alpine + container_name: compliance-redis + restart: 
unless-stopped + command: > + redis-server + --requirepass ${REDIS_PASSWORD:-redis123} + --maxmemory 2gb + --maxmemory-policy allkeys-lru + volumes: + - redis_data:/data + ports: + - "6379:6379" + networks: [compliance-net] + healthcheck: + test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD:-redis123}", "ping"] + interval: 10s + timeout: 5s + retries: 5 + + # ═══════════════════════════════════════════════ + # Milvus 向量数据库(Standalone,含 etcd + minio) + # ═══════════════════════════════════════════════ + + etcd: + image: quay.io/coreos/etcd:v3.5.5 + container_name: milvus-etcd + restart: unless-stopped + environment: + ETCD_AUTO_COMPACTION_MODE: revision + ETCD_AUTO_COMPACTION_RETENTION: "1000" + ETCD_QUOTA_BACKEND_BYTES: "4294967296" + ETCD_SNAPSHOT_COUNT: "50000" + volumes: + - milvus_data:/etcd + command: > + etcd + -advertise-client-urls=http://127.0.0.1:2379 + -listen-client-urls=http://0.0.0.0:2379 + --data-dir=/etcd + networks: [compliance-net] + healthcheck: + test: ["CMD", "etcdctl", "endpoint", "health"] + interval: 30s + timeout: 20s + retries: 3 + + minio: + image: minio/minio:RELEASE.2023-03-13T19-46-17Z + container_name: milvus-minio + restart: unless-stopped + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + volumes: + - minio_data:/minio_data + command: minio server /minio_data --console-address ":9001" + ports: + - "9001:9001" # MinIO 控制台(可选访问) + networks: [compliance-net] + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + milvus: + image: milvusdb/milvus:v2.4.13 + container_name: compliance-milvus + restart: unless-stopped + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - milvus_data:/var/lib/milvus + ports: + - "19530:19530" # gRPC API + - "9091:9091" # HTTP API + depends_on: + etcd: + condition: service_healthy + minio: + condition: service_healthy + 
networks: [compliance-net] + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] + interval: 30s + timeout: 20s + retries: 10 + start_period: 60s + + # ═══════════════════════════════════════════════ + # Neo4j 知识图谱数据库 + # ═══════════════════════════════════════════════ + + neo4j: + image: neo4j:5.20-community + container_name: compliance-neo4j + restart: unless-stopped + environment: + NEO4J_AUTH: neo4j/${NEO4J_PASSWORD:-neo4j123} + NEO4J_PLUGINS: '["apoc"]' + NEO4J_dbms_security_procedures_unrestricted: apoc.* + NEO4J_dbms_memory_heap_initial__size: 512m + NEO4J_dbms_memory_heap_max__size: 2G + NEO4J_dbms_memory_pagecache_size: 1G + volumes: + - neo4j_data:/data + - neo4j_logs:/logs + ports: + - "7474:7474" # Browser UI + - "7687:7687" # Bolt 协议 + networks: [compliance-net] + healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://localhost:7474 || exit 1"] + interval: 30s + timeout: 10s + retries: 10 + start_period: 60s + + # ═══════════════════════════════════════════════ + # AI 模型服务 + # ═══════════════════════════════════════════════ + + embedding-service: + build: + context: ./services/embedding + dockerfile: Dockerfile + image: compliance-embedding:latest + container_name: compliance-embedding + restart: unless-stopped + environment: + MODEL_NAME: BAAI/bge-m3 + HF_ENDPOINT: ${HF_ENDPOINT:-https://hf-mirror.com} + DEVICE: ${EMBEDDING_DEVICE:-cpu} + MAX_BATCH_SIZE: "16" + volumes: + - ./models:/app/models + ports: + - "8010:8010" + networks: [compliance-net] + deploy: + resources: + limits: + memory: 8G + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8010/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 120s # 模型加载需要时间 + + mcp-server: + build: + context: ./services/mcp-server + dockerfile: Dockerfile + image: compliance-mcp:latest + container_name: compliance-mcp + restart: unless-stopped + environment: + DEVICE: ${MCP_DEVICE:-cpu} + HF_ENDPOINT: ${HF_ENDPOINT:-https://hf-mirror.com} + volumes: + - 
./models:/app/models + - ./data/uploads:/app/uploads + - ./data/parsed:/app/parsed + ports: + - "8011:8011" + networks: [compliance-net] + deploy: + resources: + limits: + memory: 8G + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8011/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 120s + + # ═══════════════════════════════════════════════ + # 业务服务层 + # ═══════════════════════════════════════════════ + + compliance-backend: + build: + context: ./services/compliance-backend + dockerfile: Dockerfile + image: compliance-backend:latest + container_name: compliance-backend + restart: unless-stopped + env_file: .env + environment: + DATABASE_URL: postgresql+asyncpg://compliance:${POSTGRES_PASSWORD:-compliance123}@postgres:5432/compliance_db + REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/0 + MILVUS_HOST: milvus + MILVUS_PORT: "19530" + NEO4J_URI: bolt://neo4j:7687 + NEO4J_USER: neo4j + NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123} + EMBEDDING_SERVICE_URL: http://embedding-service:8010 + MCP_SERVER_URL: http://mcp-server:8011 + LLM_PROVIDER: ${LLM_PROVIDER:-deepseek} + DEEPSEEK_API_KEY: ${DEEPSEEK_API_KEY:-} + DEEPSEEK_MODEL: ${DEEPSEEK_MODEL:-deepseek-chat} + DASHSCOPE_API_KEY: ${DASHSCOPE_API_KEY:-} + QWEN_MODEL: ${QWEN_MODEL:-qwen-plus} + LOG_LEVEL: ${LOG_LEVEL:-INFO} + APP_ENV: ${APP_ENV:-development} + volumes: + - ./data:/app/data + - ./logs:/app/logs + ports: + - "8000:8000" + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + milvus: + condition: service_healthy + embedding-service: + condition: service_healthy + networks: [compliance-net] + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + + celery-worker: + build: + context: ./services/compliance-backend + dockerfile: Dockerfile + image: compliance-backend:latest + container_name: compliance-worker + restart: unless-stopped + command: 
> + celery -A app.worker worker + --loglevel=info + --concurrency=4 + --queues=default,parse,vectorize,compliance,monitor,push + env_file: .env + environment: + DATABASE_URL: postgresql+asyncpg://compliance:${POSTGRES_PASSWORD:-compliance123}@postgres:5432/compliance_db + REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/0 + MILVUS_HOST: milvus + MILVUS_PORT: "19530" + NEO4J_URI: bolt://neo4j:7687 + NEO4J_USER: neo4j + NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123} + EMBEDDING_SERVICE_URL: http://embedding-service:8010 + MCP_SERVER_URL: http://mcp-server:8011 + LLM_PROVIDER: ${LLM_PROVIDER:-deepseek} + DEEPSEEK_API_KEY: ${DEEPSEEK_API_KEY:-} + DASHSCOPE_API_KEY: ${DASHSCOPE_API_KEY:-} + volumes: + - ./data:/app/data + - ./logs:/app/logs + depends_on: + redis: + condition: service_healthy + compliance-backend: + condition: service_healthy + networks: [compliance-net] + + celery-beat: + build: + context: ./services/compliance-backend + dockerfile: Dockerfile + image: compliance-backend:latest + container_name: compliance-beat + restart: unless-stopped + command: > + celery -A app.worker beat + --loglevel=info + --scheduler celery.beat.PersistentScheduler + env_file: .env + environment: + DATABASE_URL: postgresql+asyncpg://compliance:${POSTGRES_PASSWORD:-compliance123}@postgres:5432/compliance_db + REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/0 + DEEPSEEK_API_KEY: ${DEEPSEEK_API_KEY:-} + volumes: + - ./data:/app/data + - ./logs:/app/logs + depends_on: + redis: + condition: service_healthy + networks: [compliance-net] + + # ═══════════════════════════════════════════════ + # API 网关 + # ═══════════════════════════════════════════════ + + nginx: + image: nginx:1.25-alpine + container_name: compliance-nginx + restart: unless-stopped + volumes: + - ./config/nginx.conf:/etc/nginx/conf.d/default.conf:ro + ports: + - "80:80" + depends_on: + compliance-backend: + condition: service_healthy + networks: [compliance-net] + healthcheck: + test: ["CMD", "nginx", 
"-t"] + interval: 30s + + # ═══════════════════════════════════════════════ + # 监控(可选,--profile monitoring 启动) + # ═══════════════════════════════════════════════ + + grafana: + image: grafana/grafana:11.0.0 + container_name: compliance-grafana + restart: unless-stopped + environment: + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD:-admin} + GF_USERS_ALLOW_SIGN_UP: "false" + volumes: + - ./config/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml:ro + ports: + - "3000:3000" + networks: [compliance-net] + profiles: [monitoring] + + prometheus: + image: prom/prometheus:v2.51.0 + container_name: compliance-prometheus + restart: unless-stopped + volumes: + - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro + ports: + - "9090:9090" + networks: [compliance-net] + profiles: [monitoring] diff --git a/init-sql/01_init_schema.sql b/init-sql/01_init_schema.sql new file mode 100644 index 0000000..cac0aa9 --- /dev/null +++ b/init-sql/01_init_schema.sql @@ -0,0 +1,192 @@ +-- AI合规智能中枢 — PostgreSQL 初始化 Schema +-- 执行时机:容器首次启动时自动执行 + +-- 启用扩展 +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; +CREATE EXTENSION IF NOT EXISTS vector; -- pgvector(pgvector/pgvector:pg16 镜像已内置) +CREATE EXTENSION IF NOT EXISTS pg_trgm; -- 全文检索支持 + +-- ══════════════════════════════════════════════════ +-- 工作空间(知识库) +-- ══════════════════════════════════════════════════ +CREATE TABLE IF NOT EXISTS workspaces ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + name VARCHAR(255) NOT NULL, + description TEXT, + domain VARCHAR(100), -- vehicle_safety / data_security / ehs / carbon + created_by VARCHAR(255), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +-- ══════════════════════════════════════════════════ +-- 文件记录 +-- ══════════════════════════════════════════════════ +CREATE TABLE IF NOT EXISTS files ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + workspace_id UUID REFERENCES workspaces(id) ON DELETE CASCADE, + filename 
VARCHAR(500) NOT NULL, + original_name VARCHAR(500) NOT NULL, + file_type VARCHAR(50), -- pdf / docx / xlsx + file_size BIGINT, + storage_path TEXT, -- data/uploads/相对路径 + parsed_path TEXT, -- data/parsed/相对路径 + status VARCHAR(50) DEFAULT 'uploaded', -- uploaded/parsing/parsed/vectorized/failed + error_msg TEXT, + metadata JSONB DEFAULT '{}', + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_files_workspace ON files(workspace_id); +CREATE INDEX IF NOT EXISTS idx_files_status ON files(status); + +-- ══════════════════════════════════════════════════ +-- 异步任务记录 +-- ══════════════════════════════════════════════════ +CREATE TABLE IF NOT EXISTS tasks ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + task_type VARCHAR(100) NOT NULL, -- parse / vectorize / compliance_check / regulation_fetch + status VARCHAR(50) DEFAULT 'pending', -- pending/running/completed/failed + payload JSONB DEFAULT '{}', + result JSONB, + error_msg TEXT, + progress INTEGER DEFAULT 0, -- 0-100 + file_id UUID REFERENCES files(id), + celery_task_id VARCHAR(255), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + completed_at TIMESTAMPTZ +); + +CREATE INDEX IF NOT EXISTS idx_tasks_status ON tasks(status); +CREATE INDEX IF NOT EXISTS idx_tasks_type ON tasks(task_type); +CREATE INDEX IF NOT EXISTS idx_tasks_file ON tasks(file_id); + +-- ══════════════════════════════════════════════════ +-- 合规审查报告 +-- ══════════════════════════════════════════════════ +CREATE TABLE IF NOT EXISTS compliance_reports ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + file_id UUID REFERENCES files(id), + regulation_domains TEXT[], -- 适用法规域 + overall_risk_level VARCHAR(20), -- high / medium / low + risk_score DECIMAL(5,2), -- 0-100 + findings JSONB DEFAULT '[]', -- 问题列表 + recommendations JSONB DEFAULT '[]', -- 整改建议 + report_markdown TEXT, -- 完整报告(Markdown格式) + llm_model VARCHAR(100), 
-- 生成时使用的模型 + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_reports_file ON compliance_reports(file_id); +CREATE INDEX IF NOT EXISTS idx_reports_risk ON compliance_reports(overall_risk_level); + +-- ══════════════════════════════════════════════════ +-- 法规监控源 +-- ══════════════════════════════════════════════════ +CREATE TABLE IF NOT EXISTS regulation_sources ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + name VARCHAR(255) NOT NULL, + url TEXT NOT NULL, + source_type VARCHAR(50) DEFAULT 'webpage', -- webpage / rss / api + domain VARCHAR(100), -- vehicle_safety / ehs 等 + fetch_interval INTEGER DEFAULT 86400, -- 抓取间隔(秒),默认每天 + is_active BOOLEAN DEFAULT TRUE, + last_fetched_at TIMESTAMPTZ, + last_hash VARCHAR(64), -- 内容hash,用于变更检测 + fetch_config JSONB DEFAULT '{}', -- 抓取配置(CSS选择器等) + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_sources_active ON regulation_sources(is_active); +CREATE INDEX IF NOT EXISTS idx_sources_domain ON regulation_sources(domain); + +-- ══════════════════════════════════════════════════ +-- 法规变更记录 +-- ══════════════════════════════════════════════════ +CREATE TABLE IF NOT EXISTS regulation_updates ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + source_id UUID REFERENCES regulation_sources(id), + title VARCHAR(500), + url TEXT, + change_type VARCHAR(50), -- new / revised / revoked / notice + summary TEXT, -- AI生成的变更摘要 + raw_content TEXT, -- 原始抓取内容 + diff_content TEXT, -- 与上次内容的差异 + is_notified BOOLEAN DEFAULT FALSE, + importance VARCHAR(20) DEFAULT 'normal', -- high / normal / low + fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + published_at TIMESTAMPTZ +); + +CREATE INDEX IF NOT EXISTS idx_updates_source ON regulation_updates(source_id); +CREATE INDEX IF NOT EXISTS idx_updates_notified ON regulation_updates(is_notified); +CREATE INDEX IF NOT EXISTS idx_updates_fetched ON regulation_updates(fetched_at DESC); + +-- 
══════════════════════════════════════════════════ +-- 推送订阅 +-- ══════════════════════════════════════════════════ +CREATE TABLE IF NOT EXISTS subscriptions ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + name VARCHAR(255), + channel VARCHAR(50) NOT NULL, -- email / webhook / feishu / dingtalk + target TEXT NOT NULL, -- 邮件地址 或 Webhook URL + domains TEXT[], -- 订阅的法规域,为空则订阅全部 + importance_min VARCHAR(20) DEFAULT 'normal', + is_active BOOLEAN DEFAULT TRUE, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +-- ══════════════════════════════════════════════════ +-- 全链路审计日志 +-- ══════════════════════════════════════════════════ +CREATE TABLE IF NOT EXISTS audit_logs ( + id BIGSERIAL PRIMARY KEY, + action VARCHAR(100) NOT NULL, -- upload / query / compliance_check / etc + resource VARCHAR(100), + resource_id UUID, + user_id VARCHAR(255), + ip_address INET, + request JSONB, + response JSONB, + duration_ms INTEGER, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_audit_action ON audit_logs(action); +CREATE INDEX IF NOT EXISTS idx_audit_created ON audit_logs(created_at DESC); +CREATE INDEX IF NOT EXISTS idx_audit_user ON audit_logs(user_id); + +-- ══════════════════════════════════════════════════ +-- 更新时间自动维护 +-- ══════════════════════════════════════════════════ +CREATE OR REPLACE FUNCTION update_updated_at_column() +RETURNS TRIGGER AS $$ +BEGIN + NEW.updated_at = NOW(); + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +CREATE TRIGGER update_workspaces_updated_at + BEFORE UPDATE ON workspaces + FOR EACH ROW EXECUTE FUNCTION update_updated_at_column(); + +CREATE TRIGGER update_files_updated_at + BEFORE UPDATE ON files + FOR EACH ROW EXECUTE FUNCTION update_updated_at_column(); + +CREATE TRIGGER update_tasks_updated_at + BEFORE UPDATE ON tasks + FOR EACH ROW EXECUTE FUNCTION update_updated_at_column(); + +-- ══════════════════════════════════════════════════ +-- 初始数据:预置监控源 +-- ══════════════════════════════════════════════════ 
+INSERT INTO regulation_sources (name, url, domain, fetch_interval) VALUES + ('国家标准全文公开系统', 'https://std.samr.gov.cn', 'vehicle_safety', 86400), + ('工信部政策法规', 'https://www.miit.gov.cn/jgsj/fgs/zcfg/index.html', 'vehicle_safety', 86400), + ('应急管理部政策法规', 'https://www.mem.gov.cn/gk/zcfg/', 'ehs', 86400), + ('生态环境部政策法规', 'https://www.mee.gov.cn/ywgz/fgbz/fl/', 'carbon', 86400) +ON CONFLICT DO NOTHING; diff --git a/scripts/00_install_docker_ubuntu.sh b/scripts/00_install_docker_ubuntu.sh new file mode 100644 index 0000000..316d04f --- /dev/null +++ b/scripts/00_install_docker_ubuntu.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# ══════════════════════════════════════════════════ +# 00_install_docker_ubuntu.sh +# Ubuntu 22.04 LTS 安装 Docker CE + nvidia-container-toolkit +# 用法:bash scripts/00_install_docker_ubuntu.sh +# ══════════════════════════════════════════════════ +set -euo pipefail + +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m' +info() { echo -e "${BLUE}[INFO]${NC} $*"; } +ok() { echo -e "${GREEN}[OK]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; } + +# ── 检查 root 权限 ────────────────────────────── +if [[ $EUID -ne 0 ]]; then + error "请以 root 或 sudo 运行:sudo bash scripts/00_install_docker_ubuntu.sh" +fi + +# ── 检测 Ubuntu 版本 ──────────────────────────── +. /etc/os-release +info "检测到 OS:$NAME $VERSION_ID" +if [[ "$ID" != "ubuntu" ]]; then + warn "非 Ubuntu 系统,脚本可能不适用。继续(y/n)?" + read -r ans; [[ "$ans" != "y" ]] && exit 0 +fi + +# ── Step 1:换国内源(可选)────────────────────── +info "Step 1/5:配置 APT 源..." +if [[ "${USE_MIRROR:-false}" == "true" ]]; then + sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list + sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list + ok "已切换到阿里云镜像" +fi +apt-get update -qq + +# ── Step 2:安装依赖 ──────────────────────────── +info "Step 2/5:安装依赖包..." 
+apt-get install -y -qq \ + ca-certificates \ + curl \ + gnupg \ + lsb-release \ + apt-transport-https + +# ── Step 3:安装 Docker CE ────────────────────── +info "Step 3/5:安装 Docker CE..." +if command -v docker &>/dev/null; then + DOCKER_VER=$(docker --version) + warn "Docker 已安装:$DOCKER_VER" + warn "跳过 Docker 安装。如需重装,请先运行:apt-get remove docker docker-engine docker.io containerd" +else + # 添加 Docker 官方 GPG 密钥 + install -m 0755 -d /etc/apt/keyrings + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \ + gpg --dearmor -o /etc/apt/keyrings/docker.gpg + chmod a+r /etc/apt/keyrings/docker.gpg + + # 添加 Docker 仓库 + echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \ + https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null + + apt-get update -qq + apt-get install -y -qq docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + + # 启动并设置开机自启 + systemctl enable docker + systemctl start docker + ok "Docker CE 安装完成" +fi + +# 验证 +docker --version +docker compose version + +# ── Step 4:将当前用户加入 docker 组 ──────────── +info "Step 4/5:配置 Docker 用户组..." +CURRENT_USER=${SUDO_USER:-$USER} +if [[ -n "$CURRENT_USER" && "$CURRENT_USER" != "root" ]]; then + usermod -aG docker "$CURRENT_USER" + ok "用户 $CURRENT_USER 已加入 docker 组(重新登录后生效)" +fi + +# ── Step 5:安装 nvidia-container-toolkit(可选)─ +info "Step 5/5:检查 NVIDIA GPU..." +if command -v nvidia-smi &>/dev/null; then + info "检测到 NVIDIA GPU,安装 nvidia-container-toolkit..." 
+ nvidia-smi --query-gpu=name --format=csv,noheader + + # 添加 NVIDIA 仓库 + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ + gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + + apt-get update -qq + apt-get install -y -qq nvidia-container-toolkit + nvidia-ctk runtime configure --runtime=docker + systemctl restart docker + ok "nvidia-container-toolkit 安装完成" +else + warn "未检测到 NVIDIA GPU,跳过 nvidia-container-toolkit 安装" + warn "如有 GPU 请手动安装驱动后重新运行本脚本" +fi + +echo "" +echo -e "${GREEN}══════════════════════════════════════════${NC}" +echo -e "${GREEN} Docker 安装完成!${NC}" +echo -e "${GREEN}══════════════════════════════════════════${NC}" +echo "" +echo " Docker 版本:$(docker --version)" +echo " Compose 版本:$(docker compose version)" +echo "" +echo -e "${YELLOW} 注意:${NC}请重新登录以使 docker 组权限生效" +echo " 验证命令:docker run hello-world" diff --git a/scripts/00_install_docker_windows.ps1 b/scripts/00_install_docker_windows.ps1 new file mode 100644 index 0000000..5ff9bfb --- /dev/null +++ b/scripts/00_install_docker_windows.ps1 @@ -0,0 +1,105 @@ +# ══════════════════════════════════════════════════ +# 00_install_docker_windows.ps1 +# Windows 11 安装 Docker Desktop + WSL2 配置 +# 用法:以管理员身份运行 PowerShell,执行: +# .\scripts\00_install_docker_windows.ps1 +# ══════════════════════════════════════════════════ +#Requires -RunAsAdministrator + +$ErrorActionPreference = "Stop" + +function Write-Info { Write-Host "[INFO] $args" -ForegroundColor Cyan } +function Write-Ok { Write-Host "[OK] $args" -ForegroundColor Green } +function Write-Warn { Write-Host "[WARN] $args" -ForegroundColor Yellow } +function Write-Err { Write-Host "[ERR] $args" -ForegroundColor Red; exit 1 } + +Write-Info 
"============================================" +Write-Info "AI合规智能中枢 — Windows Docker 环境安装" +Write-Info "============================================" + +# ── Step 1:启用 WSL2 ────────────────────────── +Write-Info "Step 1/4:检查并启用 WSL2..." +$wslFeature = Get-WindowsOptionalFeature -Online -FeatureName Microsoft-Windows-Subsystem-Linux +$vmFeature = Get-WindowsOptionalFeature -Online -FeatureName VirtualMachinePlatform + +if ($wslFeature.State -ne "Enabled") { + Write-Info "启用 WSL 功能..." + Enable-WindowsOptionalFeature -Online -FeatureName Microsoft-Windows-Subsystem-Linux -NoRestart +} +if ($vmFeature.State -ne "Enabled") { + Write-Info "启用虚拟机平台..." + Enable-WindowsOptionalFeature -Online -FeatureName VirtualMachinePlatform -NoRestart +} + +# 更新 WSL 内核 +Write-Info "更新 WSL2 内核..." +wsl --update +wsl --set-default-version 2 +Write-Ok "WSL2 配置完成" + +# ── Step 2:安装 Ubuntu WSL 发行版 ───────────── +Write-Info "Step 2/4:检查 Ubuntu WSL..." +$wslList = wsl --list --quiet 2>$null +if ($wslList -notmatch "Ubuntu") { + Write-Info "安装 Ubuntu 22.04..." + wsl --install -d Ubuntu-22.04 + Write-Ok "Ubuntu 22.04 安装完成(首次运行需要设置用户名和密码)" +} else { + Write-Ok "Ubuntu WSL 已安装" + wsl --list --verbose +} + +# ── Step 3:安装 Docker Desktop ──────────────── +Write-Info "Step 3/4:检查 Docker Desktop..." +$dockerCmd = Get-Command docker -ErrorAction SilentlyContinue +if ($dockerCmd) { + Write-Ok "Docker 已安装:$(docker --version)" +} else { + # 尝试用 winget 安装 + $winget = Get-Command winget -ErrorAction SilentlyContinue + if ($winget) { + Write-Info "通过 winget 安装 Docker Desktop..." 
+ winget install -e --id Docker.DockerDesktop --accept-package-agreements --accept-source-agreements + Write-Ok "Docker Desktop 安装完成" + } else { + Write-Warn "未找到 winget,请手动安装 Docker Desktop:" + Write-Warn "下载地址:https://www.docker.com/products/docker-desktop/" + Write-Warn "安装时勾选:Use WSL 2 instead of Hyper-V" + Start-Process "https://www.docker.com/products/docker-desktop/" + Read-Host "安装完成后按 Enter 继续" + } +} + +# ── Step 4:配置 Docker Desktop WSL 集成 ─────── +Write-Info "Step 4/4:提示 Docker Desktop 配置..." +Write-Warn "" +Write-Warn "请确认 Docker Desktop 已进行以下配置:" +Write-Warn " 1. Settings → General → 勾选 'Use WSL 2 based engine'" +Write-Warn " 2. Settings → Resources → WSL Integration → 开启 Ubuntu-22.04" +Write-Warn " 3. 如有 NVIDIA GPU:" +Write-Warn " Settings → General → 勾选 'Use GPU with WSL 2'" +Write-Warn "" + +# ── 验证 ─────────────────────────────────────── +Write-Info "验证安装..." +try { + $dockerVer = docker --version + $composeVer = docker compose version + Write-Ok "Docker: $dockerVer" + Write-Ok "Compose: $composeVer" +} catch { + Write-Warn "Docker 命令不可用,可能需要重启后再验证" + Write-Warn "重启后运行:docker run hello-world" +} + +Write-Host "" +Write-Host "============================================" -ForegroundColor Green +Write-Host " 安装完成!" -ForegroundColor Green +Write-Host "============================================" -ForegroundColor Green +Write-Host "" +Write-Host "后续步骤(在 WSL2 Ubuntu 中执行):" -ForegroundColor Yellow +Write-Host " 1. 打开 Ubuntu WSL 终端" +Write-Host " 2. cd /mnt/c/Projects/AIProjects/AIRegulations/Depolyment" +Write-Host " 3. 
bash scripts/01_setup_project.sh" +Write-Host "" +Write-Host "如需重启系统请现在重启,然后继续操作。" -ForegroundColor Yellow diff --git a/scripts/01_setup_project.sh b/scripts/01_setup_project.sh new file mode 100644 index 0000000..5b903e1 --- /dev/null +++ b/scripts/01_setup_project.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +# ══════════════════════════════════════════════════ +# 01_setup_project.sh +# 初始化项目:创建目录、生成 .env 文件 +# 用法:bash scripts/01_setup_project.sh +# ══════════════════════════════════════════════════ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" + +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m' +info() { echo -e "${BLUE}[INFO]${NC} $*"; } +ok() { echo -e "${GREEN}[OK]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } + +cd "$PROJECT_DIR" +info "项目目录:$PROJECT_DIR" + +# ── 创建运行时目录 ────────────────────────────── +info "创建运行时目录..." +mkdir -p data/uploads data/parsed logs models +mkdir -p services/embedding services/mcp-server +mkdir -p services/compliance-backend/app/{core,api,services,models} +ok "目录结构创建完成" + +# ── 复制 .env 文件 ────────────────────────────── +if [[ ! -f ".env" ]]; then + cp .env.example .env + warn "已创建 .env 文件,请编辑并填写必要配置:" + warn " 必填:DEEPSEEK_API_KEY(或 DASHSCOPE_API_KEY)" + warn " 可选:修改各组件密码" + echo "" + echo -e "${YELLOW}是否现在编辑 .env 文件?(y/n)${NC}" + read -r ans + if [[ "$ans" == "y" ]]; then + ${EDITOR:-nano} .env + fi +else + ok ".env 文件已存在,跳过复制" +fi + +# ── 验证 .env 关键字段 ────────────────────────── +info "验证 .env 配置..." +source .env 2>/dev/null || true + +if [[ -z "${DEEPSEEK_API_KEY:-}" && -z "${DASHSCOPE_API_KEY:-}" ]]; then + warn "⚠️ 未设置 LLM API Key!" + warn " 请在 .env 中设置 DEEPSEEK_API_KEY 或 DASHSCOPE_API_KEY" + warn " DeepSeek 申请:https://platform.deepseek.com" +else + ok "LLM API Key 已配置" +fi + +# ── 验证 Docker ───────────────────────────────── +info "检查 Docker 环境..." +if ! 
command -v docker &>/dev/null; then + warn "Docker 未安装,请先运行:bash scripts/00_install_docker_ubuntu.sh" + exit 1 +fi +docker compose version > /dev/null +ok "Docker Compose 可用:$(docker compose version)" + +# ── 显示下一步 ────────────────────────────────── +echo "" +echo -e "${GREEN}══════════════════════════════════════════${NC}" +echo -e "${GREEN} 项目初始化完成!${NC}" +echo -e "${GREEN}══════════════════════════════════════════${NC}" +echo "" +echo "下一步操作:" +echo " 1. 拉取镜像(可选,较慢):bash scripts/02_pull_images.sh" +echo " 2. 启动全部服务: bash scripts/06_start_all.sh" +echo " 3. 检查健康状态: bash scripts/check_health.sh" diff --git a/scripts/02_pull_images.sh b/scripts/02_pull_images.sh new file mode 100644 index 0000000..bd7660c --- /dev/null +++ b/scripts/02_pull_images.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# ══════════════════════════════════════════════════ +# 02_pull_images.sh +# 预拉取所有 Docker 镜像(离线/弱网环境准备) +# 用法:bash scripts/02_pull_images.sh +# ══════════════════════════════════════════════════ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +cd "$PROJECT_DIR" + +BLUE='\033[0;34m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m' +info() { echo -e "${BLUE}[INFO]${NC} $*"; } +ok() { echo -e "${GREEN}[OK]${NC} $*"; } + +# 所有基础镜像列表 +IMAGES=( + "pgvector/pgvector:pg16" + "redis:7-alpine" + "quay.io/coreos/etcd:v3.5.5" + "minio/minio:RELEASE.2023-03-13T19-46-17Z" + "milvusdb/milvus:v2.4.13" + "neo4j:5.20-community" + "nginx:1.25-alpine" + "grafana/grafana:11.0.0" + "prom/prometheus:v2.51.0" +) + +info "开始拉取 ${#IMAGES[@]} 个基础镜像..." 
+echo ""
+
+for img in "${IMAGES[@]}"; do
+    info "拉取:$img"
+    docker pull "$img"
+    ok "完成:$img"
+    echo ""
+done
+
+info "所有基础镜像拉取完成"
+echo ""
+info "自定义服务镜像(embedding/mcp/backend)将在 build 时自动拉取基础层"
+echo ""
+echo -e "${YELLOW}提示:如在国内网络环境下 quay.io 或 milvusdb 拉取慢,${NC}"
+echo -e "${YELLOW}可配置 Docker 镜像加速器:/etc/docker/daemon.json${NC}"
+echo ' {"registry-mirrors": ["https://docker.mirrors.ustc.edu.cn"]}'
diff --git a/scripts/03_start_infra.sh b/scripts/03_start_infra.sh
new file mode 100644
index 0000000..3339af8
--- /dev/null
+++ b/scripts/03_start_infra.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+# ══════════════════════════════════════════════════
+# 03_start_infra.sh
+# 分步启动基础设施(含健康等待),顺序:
+# PostgreSQL + Redis → etcd + MinIO → Milvus → Neo4j
+# 用法:bash scripts/03_start_infra.sh
+# ══════════════════════════════════════════════════
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
+cd "$PROJECT_DIR"
+
+RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
+info() { echo -e "${BLUE}[INFO]${NC} $*"; }
+ok() { echo -e "${GREEN}[OK]${NC} $*"; }
+warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
+error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
+
+# Block until the container of a compose service reports health status "healthy".
+#   $1  compose service name
+#   $2  timeout in seconds (default: 120)
+# Returns 0 once healthy; on timeout calls error(), which exits the script.
+wait_healthy() {
+    local service=$1
+    local max_wait=${2:-120}
+    local interval=5
+    local elapsed=0
+    local cid status
+
+    info "等待 $service 健康就绪..."
+    while [[ $elapsed -lt $max_wait ]]; do
+        # Ask `docker inspect` for the health status instead of feeding
+        # `docker compose ps --format json` to json.load(): depending on the
+        # Compose version that output is a JSON array or one object per line,
+        # and on anything but a single object the old parser fell back to
+        # "unknown" until the timeout fired.
+        cid=$(docker compose ps -q "$service" 2>/dev/null | head -n 1 || true)
+        status="unknown"
+        if [[ -n "$cid" ]]; then
+            status=$(docker inspect --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' "$cid" 2>/dev/null || echo "unknown")
+        fi
+
+        if [[ "$status" == "healthy" ]]; then
+            ok "$service 已就绪"
+            return 0
+        fi
+
+        echo -n "."
+        sleep "$interval"
+        elapsed=$((elapsed + interval))
+    done
+    echo ""
+    error "$service 等待超时(${max_wait}s),请检查:docker compose logs $service"
+}
+
+info "══════════════════════════════════════════"
+info " 启动基础设施层"
+info "══════════════════════════════════════════"
+
+# ── Step 1:PostgreSQL + Redis ──────────────────
+info "Step 1/4:启动 PostgreSQL 和 Redis..."
+docker compose up -d postgres redis
+
+wait_healthy postgres 90
+wait_healthy redis 30
+ok "数据层就绪"
+
+# ── Step 2:etcd + MinIO(Milvus 依赖)─────────
+info "Step 2/4:启动 etcd 和 MinIO(Milvus 依赖)..."
+docker compose up -d etcd minio
+
+wait_healthy etcd 60
+wait_healthy minio 60
+ok "对象存储层就绪"
+
+# ── Step 3:Milvus ──────────────────────────────
+info "Step 3/4:启动 Milvus(向量数据库)..."
+docker compose up -d milvus
+
+info "Milvus 初始化需要约 60 秒,请耐心等待..."
+wait_healthy milvus 180
+ok "Milvus 就绪"
+
+# ── Step 4:Neo4j ───────────────────────────────
+info "Step 4/4:启动 Neo4j(知识图谱)..."
+docker compose up -d neo4j
+
+wait_healthy neo4j 120
+ok "Neo4j 就绪"
+
+# ── 汇总 ────────────────────────────────────────
+echo ""
+echo -e "${GREEN}══════════════════════════════════════════${NC}"
+echo -e "${GREEN} 基础设施启动完成!${NC}"
+echo -e "${GREEN}══════════════════════════════════════════${NC}"
+echo ""
+echo " PostgreSQL : localhost:5432"
+echo " Redis : localhost:6379"
+echo " Milvus : localhost:19530 (gRPC), localhost:9091 (HTTP)"
+echo " Neo4j : localhost:7474 (Browser), localhost:7687 (Bolt)"
+echo " MinIO 控制台: localhost:9001 (admin/minioadmin)"
+echo ""
+echo "下一步:bash scripts/04_build_services.sh"
diff --git a/scripts/04_build_services.sh b/scripts/04_build_services.sh
new file mode 100644
index 0000000..c25593e
--- /dev/null
+++ b/scripts/04_build_services.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# ══════════════════════════════════════════════════
+# 04_build_services.sh
+# 构建自定义服务 Docker 镜像
+# embedding-service / mcp-server / compliance-backend
+# 用法:bash scripts/04_build_services.sh
+#
══════════════════════════════════════════════════ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +cd "$PROJECT_DIR" + +BLUE='\033[0;34m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m' +info() { echo -e "${BLUE}[INFO]${NC} $*"; } +ok() { echo -e "${GREEN}[OK]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } + +info "══════════════════════════════════════════" +info " 构建自定义服务镜像" +info "══════════════════════════════════════════" +warn "首次构建较慢(需下载 Python 依赖 + AI 模型)" +warn "BGE-M3 模型约 2.5GB,MinerU 模型约 2GB" +echo "" + +# ── 构建嵌入服务 ──────────────────────────────── +info "构建 embedding-service(BGE-M3)..." +START=$(date +%s) +docker compose build embedding-service +END=$(date +%s) +ok "embedding-service 构建完成($(( END - START ))s)" +echo "" + +# ── 构建 MinerU 解析服务 ──────────────────────── +info "构建 mcp-server(MinerU)..." +START=$(date +%s) +docker compose build mcp-server +END=$(date +%s) +ok "mcp-server 构建完成($(( END - START ))s)" +echo "" + +# ── 构建业务后端 ──────────────────────────────── +info "构建 compliance-backend..." 
+START=$(date +%s) +docker compose build compliance-backend +END=$(date +%s) +ok "compliance-backend 构建完成($(( END - START ))s)" +echo "" + +# ── 列出构建的镜像 ────────────────────────────── +info "已构建的镜像:" +docker images | grep -E "compliance-(embedding|mcp|backend)" || true + +echo "" +echo -e "${GREEN}══════════════════════════════════════════${NC}" +echo -e "${GREEN} 所有服务镜像构建完成!${NC}" +echo -e "${GREEN}══════════════════════════════════════════${NC}" +echo "" +echo "下一步:bash scripts/05_init_db.sh" diff --git a/scripts/05_init_db.sh b/scripts/05_init_db.sh new file mode 100644 index 0000000..73539b3 --- /dev/null +++ b/scripts/05_init_db.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# ══════════════════════════════════════════════════ +# 05_init_db.sh +# 初始化数据库:PostgreSQL Schema + Milvus Collections + Neo4j Constraints +# 用法:bash scripts/05_init_db.sh +# 前提:postgres / milvus / neo4j 已运行且健康 +# ══════════════════════════════════════════════════ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +cd "$PROJECT_DIR" + +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m' +info() { echo -e "${BLUE}[INFO]${NC} $*"; } +ok() { echo -e "${GREEN}[OK]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; } + +source .env 2>/dev/null || true +POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-compliance123} +NEO4J_PASSWORD=${NEO4J_PASSWORD:-neo4j123} + +# ── Step 1:PostgreSQL Schema ─────────────────── +info "Step 1/3:初始化 PostgreSQL Schema..." 
+if docker compose ps postgres | grep -q "(healthy)"; then  # match "(healthy)" exactly; bare "healthy" also matched "(unhealthy)"
+    docker compose exec -T postgres psql \
+        -U compliance -d compliance_db \
+        -f /docker-entrypoint-initdb.d/01_init_schema.sql \
+        2>&1 | tail -5 || warn "SQL 可能部分已存在(IF NOT EXISTS),这是正常的"
+    ok "PostgreSQL Schema 初始化完成"
+else
+    error "PostgreSQL 未运行,请先执行:bash scripts/03_start_infra.sh"
+fi
+
+# ── Step 2:Milvus Collections ──────────────────
+info "Step 2/3:初始化 Milvus Collections..."
+if docker compose ps milvus | grep -q "(healthy)"; then  # same gate as above: an "(unhealthy)" Milvus must not pass
+    docker compose run --rm --no-deps compliance-backend \
+        python3 -c "
+import asyncio
+from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility
+
+connections.connect(host='milvus', port='19530')
+print('Milvus 连接成功')
+
+def create_collection(name, description):
+    if utility.has_collection(name):
+        print(f' Collection {name} 已存在,跳过')
+        return
+
+    fields = [
+        FieldSchema(name='id', dtype=DataType.VARCHAR, is_primary=True, max_length=128),
+        FieldSchema(name='file_id', dtype=DataType.VARCHAR, max_length=128),
+        FieldSchema(name='workspace_id', dtype=DataType.VARCHAR, max_length=128),
+        FieldSchema(name='chunk_idx', dtype=DataType.INT64),
+        FieldSchema(name='content', dtype=DataType.VARCHAR, max_length=65535),
+        FieldSchema(name='dense_vec', dtype=DataType.FLOAT_VECTOR, dim=1024), # BGE-M3 dense
+        FieldSchema(name='metadata', dtype=DataType.JSON),
+    ]
+    schema = CollectionSchema(fields, description=description)
+    col = Collection(name, schema)
+
+    # 创建向量索引(HNSW,适合调研阶段)
+    index_params = {
+        'metric_type': 'COSINE',
+        'index_type': 'HNSW',
+        'params': {'M': 16, 'efConstruction': 200}
+    }
+    col.create_index('dense_vec', index_params)
+    col.load()
+    print(f' Collection {name} 创建完成')
+
+create_collection('regulation_chunks', '法规条款向量库')
+create_collection('doc_chunks', '企业文档向量库')
+create_collection('case_library', '行业案例库')
+
+print('Milvus 初始化完成')
+" 2>&1
+    ok "Milvus Collections 初始化完成"
+else
+    error "Milvus 未运行,请先执行:bash
scripts/03_start_infra.sh" +fi + +# ── Step 3:Neo4j 约束和索引 ──────────────────── +info "Step 3/3:初始化 Neo4j 约束和索引..." +sleep 5 # Neo4j 可能还在预热 + +docker compose exec -T neo4j cypher-shell \ + -u neo4j -p "$NEO4J_PASSWORD" \ + --format plain <<'CYPHER' +// 节点约束(唯一性) +CREATE CONSTRAINT regulation_id IF NOT EXISTS + FOR (r:Regulation) REQUIRE r.id IS UNIQUE; +CREATE CONSTRAINT clause_id IF NOT EXISTS + FOR (c:Clause) REQUIRE c.id IS UNIQUE; +CREATE CONSTRAINT obligation_id IF NOT EXISTS + FOR (o:Obligation) REQUIRE o.id IS UNIQUE; + +// 全文索引(模糊查询) +CREATE FULLTEXT INDEX regulation_fulltext IF NOT EXISTS + FOR (r:Regulation) ON EACH [r.title, r.code, r.domain]; +CREATE FULLTEXT INDEX clause_fulltext IF NOT EXISTS + FOR (c:Clause) ON EACH [c.content, c.title]; + +// 插入示例节点(验证连通性) +MERGE (d:Domain {name: 'vehicle_safety', label: '车辆安全法规'}); +MERGE (d:Domain {name: 'data_security', label: '数据安全法规'}); +MERGE (d:Domain {name: 'ehs', label: 'EHS安全法规'}); +MERGE (d:Domain {name: 'carbon', label: '碳排放法规'}); +RETURN '初始化完成' AS result; +CYPHER + ok "Neo4j 约束和索引初始化完成" + +echo "" +echo -e "${GREEN}══════════════════════════════════════════${NC}" +echo -e "${GREEN} 数据库初始化完成!${NC}" +echo -e "${GREEN}══════════════════════════════════════════${NC}" +echo "" +echo " PostgreSQL: 所有表已创建" +echo " Milvus: regulation_chunks / doc_chunks / case_library" +echo " Neo4j: 约束 + 全文索引 + 基础域节点" +echo "" +echo "下一步:bash scripts/06_start_all.sh" diff --git a/scripts/06_start_all.sh b/scripts/06_start_all.sh new file mode 100644 index 0000000..1066c08 --- /dev/null +++ b/scripts/06_start_all.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash +# ══════════════════════════════════════════════════ +# 06_start_all.sh +# 一键启动所有服务(完整流程) +# 用法:bash scripts/06_start_all.sh +# ══════════════════════════════════════════════════ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +cd "$PROJECT_DIR" + +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; 
BLUE='\033[0;34m'; NC='\033[0m'
+info() { echo -e "${BLUE}[INFO]${NC} $*"; }
+ok() { echo -e "${GREEN}[OK]${NC} $*"; }
+warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
+error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
+
+echo ""
+echo -e "${BLUE}╔══════════════════════════════════════════╗${NC}"
+echo -e "${BLUE}║ AI合规智能中枢 — 全服务启动 ║${NC}"
+echo -e "${BLUE}╚══════════════════════════════════════════╝${NC}"
+echo ""
+
+# ── 前置检查 ────────────────────────────────────
+if [[ ! -f ".env" ]]; then
+    error ".env 文件不存在,请先运行:bash scripts/01_setup_project.sh"
+fi
+
+source .env 2>/dev/null || true
+if [[ -z "${DEEPSEEK_API_KEY:-}" && -z "${DASHSCOPE_API_KEY:-}" ]]; then
+    warn "⚠️ 未设置 LLM API Key,LLM 功能将不可用"
+    warn "请在 .env 中设置 DEEPSEEK_API_KEY 或 DASHSCOPE_API_KEY"
+    echo ""
+fi
+
+# ── Phase 1:基础设施 ────────────────────────────
+info "Phase 1/4:启动基础设施..."
+bash "$SCRIPT_DIR/03_start_infra.sh"
+echo ""
+
+# ── Phase 2:构建服务镜像 ────────────────────────
+info "Phase 2/4:构建服务镜像(首次较慢)..."
+docker compose build embedding-service mcp-server compliance-backend 2>&1 | \
+    { grep -E "(Step|Successfully|=>|ERROR)" || true; }  # tolerate only grep's "no match"; a failed build now aborts via pipefail instead of being reported as done
+ok "镜像构建完成"
+echo ""
+
+# ── Phase 3:初始化数据库 ────────────────────────
+info "Phase 3/4:初始化数据库..."
+bash "$SCRIPT_DIR/05_init_db.sh"
+echo ""
+
+# ── Phase 4:启动所有服务 ────────────────────────
+info "Phase 4/4:启动 AI 模型服务和业务服务..."
+docker compose up -d embedding-service mcp-server
+info "等待 AI 模型加载(BGE-M3/MinerU 约需 2-3 分钟)..."
+sleep 30
+
+# 等待嵌入服务就绪
+for i in {1..20}; do
+    if curl -sf http://localhost:8010/health > /dev/null 2>&1; then
+        ok "embedding-service 就绪"
+        break
+    fi
+    echo -n "."
+    sleep 10
+done  # NOTE(review): the loop falls through silently when the service never answered; startup continues regardless
+
+docker compose up -d compliance-backend celery-worker celery-beat nginx
+info "等待业务服务启动..."
+sleep 15
+
+for i in {1..12}; do
+    if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
+        ok "compliance-backend 就绪"
+        break
+    fi
+    echo -n "."
+ sleep 5 +done + +# ── 最终状态 ──────────────────────────────────── +echo "" +echo -e "${GREEN}╔══════════════════════════════════════════╗${NC}" +echo -e "${GREEN}║ 所有服务启动完成! ║${NC}" +echo -e "${GREEN}╚══════════════════════════════════════════╝${NC}" +echo "" +docker compose ps --format "table {{.Service}}\t{{.Status}}\t{{.Ports}}" +echo "" +echo -e "${BLUE}访问地址:${NC}" +echo " API 网关 : http://localhost" +echo " API 文档 : http://localhost/docs" +echo " Neo4j 浏览器 : http://localhost:7474" +echo " MinIO 控制台 : http://localhost:9001" +echo "" +echo -e "${YELLOW}运行冒烟测试:${NC}" +echo " bash scripts/07_smoke_test.sh" diff --git a/scripts/07_smoke_test.sh b/scripts/07_smoke_test.sh new file mode 100644 index 0000000..52a15a4 --- /dev/null +++ b/scripts/07_smoke_test.sh @@ -0,0 +1,183 @@ +#!/usr/bin/env bash +# ══════════════════════════════════════════════════ +# 07_smoke_test.sh +# 端到端冒烟测试:验证三条业务闭环 +# 用法:bash scripts/07_smoke_test.sh +# ══════════════════════════════════════════════════ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +cd "$PROJECT_DIR" + +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m' +info() { echo -e "${BLUE}[INFO]${NC} $*"; } +ok() { echo -e "${GREEN}[✓]${NC} $*"; } +fail() { echo -e "${RED}[✗]${NC} $*"; FAILED=$((FAILED+1)); } +warn() { echo -e "${YELLOW}[~]${NC} $*"; } + +FAILED=0 +API_BASE="http://localhost" + +echo "" +echo -e "${BLUE}══════════════════════════════════════════${NC}" +echo -e "${BLUE} AI合规智能中枢 端到端冒烟测试${NC}" +echo -e "${BLUE}══════════════════════════════════════════${NC}" +echo "" + +# ── 基础健康检查 ──────────────────────────────── +info "=== 基础设施健康检查 ===" + +check_service() { + local name=$1; local url=$2 + if curl -sf "$url" > /dev/null 2>&1; then + ok "$name" + else + fail "$name($url 不可达)" + fi +} + +check_service "API 网关 (Nginx)" "http://localhost/health" +check_service "业务后端 (FastAPI)" "http://localhost:8000/health" 
+check_service "嵌入服务 (BGE-M3)" "http://localhost:8010/health" +check_service "解析服务 (MinerU)" "http://localhost:8011/health" +check_service "Milvus HTTP" "http://localhost:9091/healthz" +check_service "Neo4j Browser" "http://localhost:7474" +echo "" + +# ── 嵌入服务测试 ──────────────────────────────── +info "=== 嵌入服务测试 ===" +EMBED_RESP=$(curl -sf -X POST http://localhost:8010/embed \ + -H "Content-Type: application/json" \ + -d '{"texts": ["GB 18384 电动汽车碰撞安全要求"], "batch_size": 1}' 2>/dev/null || echo "{}") + +if echo "$EMBED_RESP" | python3 -c "import sys,json; d=json.load(sys.stdin); assert len(d.get('dense',[])[0])==1024" 2>/dev/null; then + ok "BGE-M3 嵌入:返回 1024 维向量" +else + fail "BGE-M3 嵌入失败,响应:${EMBED_RESP:0:200}" +fi +echo "" + +# ── 创建测试 PDF ──────────────────────────────── +info "=== 创建测试文档 ===" +TEST_PDF="$PROJECT_DIR/data/uploads/test_regulation.txt" +cat > "$TEST_PDF" << 'EOF' +GB 18384-2020 电动汽车安全要求 + +第一章 总则 +本标准规定了电动汽车的安全要求,适用于M1类纯电动汽车。 + +第二章 电气安全 +2.1 绝缘电阻要求 + 直流电路绝缘电阻不得低于100Ω/V。 +2.2 碰撞安全 + 车辆碰撞后,高压电系统应自动断电。 + 碰撞后5秒内,高压系统电压应降至60V以下。 + +第三章 防水要求 +高压系统防护等级应达到IP67。 +EOF +ok "测试文档创建:$TEST_PDF" +echo "" + +# ── 闭环①:文件上传 → 向量化 → 问答 ─────────── +info "=== 闭环①:法规入库 → 检索问答 ===" + +# 创建工作空间 +WORKSPACE_RESP=$(curl -sf -X POST "$API_BASE/api/kb/workspaces" \ + -H "Content-Type: application/json" \ + -d '{"name": "测试法规库", "domain": "vehicle_safety"}' 2>/dev/null || echo "{}") +WS_ID=$(echo "$WORKSPACE_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "") + +if [[ -n "$WS_ID" ]]; then + ok "工作空间创建:$WS_ID" +else + warn "工作空间创建失败(可能接口未完全实现),跳过后续上传测试" + WS_ID="test-workspace" +fi + +# 上传文件 +UPLOAD_RESP=$(curl -sf -X POST "$API_BASE/api/kb/files/upload" \ + -F "file=@$TEST_PDF" \ + -F "workspace_id=$WS_ID" 2>/dev/null || echo "{}") +TASK_ID=$(echo "$UPLOAD_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('task_id',''))" 2>/dev/null || echo "") + +if [[ -n "$TASK_ID" ]]; then + ok "文件上传任务已创建:$TASK_ID" + + 
# 轮询任务状态(最多等待120秒) + info "等待向量化完成..." + for i in {1..24}; do + TASK_STATUS=$(curl -sf "$API_BASE/api/kb/tasks/$TASK_ID" 2>/dev/null | \ + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" 2>/dev/null || echo "unknown") + if [[ "$TASK_STATUS" == "completed" ]]; then + ok "向量化完成(${i}×5s)" + break + elif [[ "$TASK_STATUS" == "failed" ]]; then + fail "向量化失败" + break + fi + echo -n "." + sleep 5 + done + echo "" + + # 检索问答 + QA_RESP=$(curl -sf -X POST "$API_BASE/api/kb/qa" \ + -H "Content-Type: application/json" \ + -d "{\"query\": \"碰撞后高压系统电压要求\", \"workspace_id\": \"$WS_ID\", \"top_k\": 3}" 2>/dev/null || echo "{}") + ANSWER=$(echo "$QA_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('answer','')[:100])" 2>/dev/null || echo "") + + if [[ -n "$ANSWER" ]]; then + ok "问答成功:${ANSWER}..." + else + warn "问答返回空(LLM API 可能未配置或响应缓慢)" + fi +else + warn "文件上传失败(接口可能未实现)" +fi +echo "" + +# ── 闭环②:合规审查 ──────────────────────────── +info "=== 闭环②:文档上传 → 合规审查 ===" + +CHECK_RESP=$(curl -sf -X POST "$API_BASE/api/compliance/check" \ + -H "Content-Type: application/json" \ + -d '{"query": "供应商文件是否符合GB 18384碰撞安全要求", "domains": ["vehicle_safety"]}' 2>/dev/null || echo "{}") +RISK=$(echo "$CHECK_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('risk_level','unknown'))" 2>/dev/null || echo "unknown") + +if [[ "$RISK" != "unknown" && -n "$RISK" ]]; then + ok "合规审查完成,风险等级:$RISK" +else + warn "合规审查接口返回空(功能可能未完全实现)" +fi +echo "" + +# ── 闭环③:法规监控 ──────────────────────────── +info "=== 闭环③:法规监控源配置 ===" + +SOURCE_RESP=$(curl -sf -X POST "$API_BASE/api/regulation/sources" \ + -H "Content-Type: application/json" \ + -d '{"name": "测试监控源", "url": "https://std.samr.gov.cn", "domain": "vehicle_safety"}' 2>/dev/null || echo "{}") +SOURCE_ID=$(echo "$SOURCE_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "") + +if [[ -n "$SOURCE_ID" ]]; then + ok "监控源配置成功:$SOURCE_ID" +else + warn 
"监控源配置返回空(功能可能未完全实现)" +fi +echo "" + +# ── 汇总 ──────────────────────────────────────── +echo "" +echo -e "${BLUE}══════════════════════════════════════════${NC}" +if [[ $FAILED -eq 0 ]]; then + echo -e "${GREEN} 全部检查通过!${NC}" +else + echo -e "${YELLOW} 完成,${FAILED} 项失败${NC}(部分功能可能尚未实现)" +fi +echo -e "${BLUE}══════════════════════════════════════════${NC}" +echo "" +echo "查看服务日志:" +echo " docker compose logs -f compliance-backend" +echo " docker compose logs -f celery-worker" diff --git a/scripts/check_health.sh b/scripts/check_health.sh new file mode 100644 index 0000000..2842216 --- /dev/null +++ b/scripts/check_health.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# ══════════════════════════════════════════════════ +# check_health.sh +# 检查所有服务的健康状态和资源使用 +# 用法:bash scripts/check_health.sh +# ══════════════════════════════════════════════════ +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +cd "$PROJECT_DIR" + +GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m' + +echo "" +echo -e "${BLUE}══════════════════════════════════════════${NC}" +echo -e "${BLUE} 服务健康检查报告${NC}" +echo -e "${BLUE}══════════════════════════════════════════${NC}" +echo "" + +# Docker 服务状态 +echo -e "${BLUE}【Docker Compose 服务状态】${NC}" +docker compose ps --format "table {{.Service}}\t{{.Status}}\t{{.Ports}}" +echo "" + +# HTTP 端点检查 +echo -e "${BLUE}【HTTP 健康端点】${NC}" +check_http() { + local name=$1; local url=$2 + if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then + echo -e " ${GREEN}[OK]${NC} $name ($url)" + else + echo -e " ${RED}[FAIL]${NC} $name ($url)" + fi +} + +check_http "API 网关" "http://localhost/health" +check_http "业务后端" "http://localhost:8000/health" +check_http "嵌入服务" "http://localhost:8010/health" +check_http "解析服务" "http://localhost:8011/health" +check_http "Milvus" "http://localhost:9091/healthz" +check_http "Neo4j" "http://localhost:7474" +echo "" + +# 资源使用 +echo -e "${BLUE}【容器资源使用】${NC}" 
+docker stats --no-stream --format \ + "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" \ + 2>/dev/null | head -15 +echo "" + +# 磁盘使用 +echo -e "${BLUE}【磁盘使用】${NC}" +df -h . | tail -1 | awk '{print " 项目目录:已用 "$3",可用 "$4"(" $5 " 使用率)"}' +docker system df 2>/dev/null | head -6 +echo "" + +# LLM 配置检查 +echo -e "${BLUE}【LLM API 配置】${NC}" +source .env 2>/dev/null || true +if [[ -n "${DEEPSEEK_API_KEY:-}" ]]; then + echo -e " ${GREEN}[OK]${NC} DeepSeek API Key 已配置" +elif [[ -n "${DASHSCOPE_API_KEY:-}" ]]; then + echo -e " ${GREEN}[OK]${NC} DashScope (Qwen) API Key 已配置" +else + echo -e " ${YELLOW}[WARN]${NC} 未配置 LLM API Key(LLM 功能不可用)" +fi +echo "" diff --git a/scripts/download_models.sh b/scripts/download_models.sh new file mode 100644 index 0000000..95c04dc --- /dev/null +++ b/scripts/download_models.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# ══════════════════════════════════════════════════ +# download_models.sh +# 预下载 AI 模型到 ./models 目录(加速容器启动) +# 支持 HuggingFace 镜像加速(国内网络) +# 用法:bash scripts/download_models.sh +# ══════════════════════════════════════════════════ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +cd "$PROJECT_DIR" + +BLUE='\033[0;34m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m' +info() { echo -e "${BLUE}[INFO]${NC} $*"; } +ok() { echo -e "${GREEN}[OK]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } + +MODELS_DIR="$PROJECT_DIR/models" +mkdir -p "$MODELS_DIR" + +# 设置镜像加速 +export HF_ENDPOINT="${HF_ENDPOINT:-https://hf-mirror.com}" +export HF_HOME="$MODELS_DIR" +info "HuggingFace 镜像:$HF_ENDPOINT" +info "模型保存路径:$MODELS_DIR" +echo "" + +# ── 方法1:通过 huggingface_hub 下载 ──────────── +download_hf() { + local repo=$1; local local_name=$2 + info "下载 $repo..." 
+ if python3 -c " +import os +os.environ['HF_ENDPOINT'] = '${HF_ENDPOINT}' +os.environ['HF_HOME'] = '${MODELS_DIR}' +from huggingface_hub import snapshot_download +snapshot_download(repo_id='$repo', cache_dir='${MODELS_DIR}') +print('下载完成') +" 2>&1; then + ok "$repo 下载成功" + else + warn "$repo HuggingFace 下载失败,尝试 ModelScope..." + download_modelscope "$repo" "$local_name" + fi +} + +# ── 方法2:通过 ModelScope 下载(备用)────────── +download_modelscope() { + local hf_name=$1 + local ms_name=${2:-$1} + python3 -c " +try: + from modelscope import snapshot_download + snapshot_download(model_id='$ms_name', cache_dir='${MODELS_DIR}/modelscope') + print('ModelScope 下载完成') +except ImportError: + print('ModelScope 未安装,跳过') +except Exception as e: + print(f'ModelScope 下载失败: {e}') +" 2>&1 || warn "ModelScope 下载也失败,模型将在容器启动时自动下载" +} + +# ── 检查 Python 环境 ──────────────────────────── +if ! python3 -c "import huggingface_hub" 2>/dev/null; then + warn "未安装 huggingface_hub,尝试安装..." + pip3 install -q huggingface_hub modelscope 2>/dev/null || \ + warn "安装失败,模型将在容器首次启动时下载" +fi + +# ── 下载模型列表 ──────────────────────────────── +info "=== 下载 BGE-M3 嵌入模型(约 2.5GB)===" +download_hf "BAAI/bge-m3" "BAAI/bge-m3" +echo "" + +info "=== 下载 BGE-Reranker 精排模型(约 1.1GB)===" +download_hf "BAAI/bge-reranker-v2-m3" "BAAI/bge-reranker-v2-m3" +echo "" + +# MinerU 模型通过容器内脚本下载(依赖 magic-pdf 配置) +info "=== MinerU 模型说明 ===" +warn "MinerU 模型(约 2GB)将在 mcp-server 容器首次启动时自动下载" +warn "如需预下载,请在 mcp-server 容器内运行:mineru-models-download" +echo "" + +echo -e "${GREEN}══════════════════════════════════════════${NC}" +echo -e "${GREEN} 模型下载完成!${NC}" +echo -e "${GREEN}══════════════════════════════════════════${NC}" +echo "" +echo "已下载到:$MODELS_DIR" +du -sh "$MODELS_DIR" 2>/dev/null || true diff --git a/scripts/reset_all.sh b/scripts/reset_all.sh new file mode 100644 index 0000000..1f2c337 --- /dev/null +++ b/scripts/reset_all.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# ══════════════════════════════════════════════════ +# 
reset_all.sh
+# ⚠️ 危险操作:停止所有服务并删除所有数据(慎用!)
+# 用法:bash scripts/reset_all.sh
+# ══════════════════════════════════════════════════
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
+cd "$PROJECT_DIR" || exit 1  # no `set -e` in this script: never reach `down --volumes` / `rm -rf` after a failed cd
+
+RED='\033[0;31m'; YELLOW='\033[1;33m'; NC='\033[0m'
+
+echo ""
+echo -e "${RED}╔══════════════════════════════════════════╗${NC}"
+echo -e "${RED}║ ⚠️ 警告:此操作将删除所有数据! ║${NC}"
+echo -e "${RED}║ 包括:PostgreSQL / Milvus / Neo4j 数据 ║${NC}"
+echo -e "${RED}║ 以及所有上传的文件和日志 ║${NC}"
+echo -e "${RED}╚══════════════════════════════════════════╝${NC}"
+echo ""
+echo -e "${YELLOW}确认要重置所有数据吗?(输入 'yes' 确认,其他取消)${NC}"
+read -r CONFIRM
+
+if [[ "$CONFIRM" != "yes" ]]; then
+    echo "已取消"
+    exit 0
+fi
+
+echo ""
+echo "停止所有服务..."
+docker compose down --volumes --remove-orphans
+
+echo "清理数据目录..."
+rm -rf data/uploads/* data/parsed/* logs/*
+echo "✓ 数据目录已清空(保留目录结构)"
+
+echo ""
+echo -e "${YELLOW}重置完成。重新启动:bash scripts/06_start_all.sh${NC}"
diff --git a/services/compliance-backend/Dockerfile b/services/compliance-backend/Dockerfile
new file mode 100644
index 0000000..aab2c45
--- /dev/null
+++ b/services/compliance-backend/Dockerfile
@@ -0,0 +1,24 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# 使用 uv 加速依赖安装
+RUN pip install uv --no-cache-dir
+
+COPY pyproject.toml .
def _parse_risk(analysis: str) -> tuple[str, float]:
    """Extract ``(risk_level, risk_score)`` from the LLM's free-text analysis.

    The level declared after "风险等级" is preferred. When it is missing, a
    keyword heuristic is used and ``("medium", 50.0)`` is the final default.
    """
    import re

    default_scores = {"low": 20.0, "medium": 50.0, "high": 70.0, "critical": 90.0}

    # The lookahead rejects an echoed template such as "[low/medium/high/critical]".
    level_match = re.search(
        r"风险等级[^A-Za-z\n]{0,10}(low|medium|high|critical)(?![A-Za-z/])",
        analysis,
        re.IGNORECASE,
    )
    if level_match:
        level = level_match.group(1).lower()
        score_match = re.search(r"风险分数[^0-9\n]{0,10}(\d+(?:\.\d+)?)", analysis)
        score = float(score_match.group(1)) if score_match else default_scores[level]
        return level, min(max(score, 0.0), 100.0)

    # Keyword fallback. The prompt asks the model to list "严重程度" for every
    # finding, so that heading must not count as evidence of a critical risk.
    lowered = analysis.lower()
    without_heading = analysis.replace("严重程度", "")
    if "critical" in lowered or "严重" in without_heading:
        return "critical", 90.0
    if "high" in lowered or "高风险" in analysis:
        return "high", 70.0
    if "low" in lowered or "低风险" in analysis:
        return "low", 20.0
    return "medium", 50.0


@router.post("/check", response_model=ComplianceCheckResponse)
async def check_compliance(req: ComplianceCheckRequest):
    """Assess the submitted content against the regulation knowledge base.

    Retrieves the most relevant regulation chunks, asks the LLM for an
    analysis and converts it into a risk level and score. When nothing is
    retrieved, or the LLM call fails, the response carries
    ``risk_level="unknown"`` instead of a made-up rating.
    """
    # hybrid_search takes no domain argument, so searching once per domain
    # only repeated the same query. One search is issued, and only when at
    # least one domain was requested (an empty list still retrieves nothing).
    all_chunks = []
    if req.regulation_domains:
        all_chunks = await hybrid_search(
            req.query,
            collection_name="regulation_chunks",
            top_k=req.top_k,
        )

    # De-duplicate by chunk id, best score first.
    seen = set()
    unique_chunks = []
    for c in sorted(all_chunks, key=lambda x: x["score"], reverse=True):
        if c["id"] not in seen:
            seen.add(c["id"])
            unique_chunks.append(c)
    top_chunks = unique_chunks[:req.top_k]

    if not top_chunks:
        return ComplianceCheckResponse(
            risk_level="unknown",
            risk_score=0,
            findings=[{"issue": "未找到相关法规,请先上传法规文档"}],
            recommendations=["上传相关法规文档到知识库后重试"],
            sources=[],
        )

    sources = [{"content": c["content"][:200], "score": c["score"]} for c in top_chunks]

    # Build the regulation context; each chunk is capped at 500 characters.
    regulations_text = "\n\n".join(
        f"[{i+1}] {c['content'][:500]}" for i, c in enumerate(top_chunks)
    )

    prompt = COMPLIANCE_CHECK_PROMPT.format(
        content=req.query,
        regulations=regulations_text,
    )

    llm = get_llm(temperature=0.0)
    try:
        response = await llm.ainvoke([HumanMessage(content=prompt)])
    except Exception as e:
        # Top-level boundary: report the failure explicitly rather than
        # classifying the error text as if it were an analysis.
        logger.error(f"LLM 合规分析失败:{e}")
        return ComplianceCheckResponse(
            risk_level="unknown",
            risk_score=0,
            findings=[{"analysis": f"LLM 分析失败:{e}"}],
            recommendations=["LLM 服务暂不可用,请稍后重试"],
            sources=sources,
        )

    analysis = response.content
    risk_level, risk_score = _parse_risk(analysis)

    return ComplianceCheckResponse(
        risk_level=risk_level,
        risk_score=risk_score,
        findings=[{"analysis": analysis}],
        recommendations=["请参考上述分析进行整改"],
        sources=sources,
    )
@router.post("/files/upload")
async def upload_file(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    workspace_id: str = Form(default=""),
    db: AsyncSession = Depends(get_db),
):
    """Store an uploaded file and queue it for parsing and vectorization.

    Args:
        background_tasks: Unused; kept so the endpoint signature is unchanged.
        file: The uploaded document.
        workspace_id: Optional workspace UUID; an empty string means none.
        db: Request-scoped database session.

    Returns:
        A dict with ``file_id``, ``task_id`` and ``status="processing"``.

    Raises:
        HTTPException: 400 when ``workspace_id`` is not a valid UUID.
    """
    # Validate before anything is written to disk or to the database.
    workspace_uuid = None
    if workspace_id:
        try:
            workspace_uuid = uuid.UUID(workspace_id)
        except ValueError:
            raise HTTPException(status_code=400, detail="workspace_id 不是合法的 UUID") from None

    content = await file.read()
    file_uuid = uuid.uuid4()
    file_id = str(file_uuid)
    suffix = Path(file.filename or "doc").suffix
    save_path = UPLOAD_DIR / f"{file_id}{suffix}"
    save_path.write_bytes(content)

    file_record = FileRecord(
        id=file_uuid,
        filename=f"{file_id}{suffix}",
        original_name=file.filename or "unknown",
        file_type=suffix.lstrip("."),
        file_size=len(content),
        storage_path=str(save_path),
        workspace_id=workspace_uuid,
        status="uploaded",
    )
    db.add(file_record)

    task = Task(
        task_type="parse_and_vectorize",
        status="pending",
        file_id=file_uuid,
        payload={"workspace_id": workspace_id},
    )
    db.add(task)

    # Commit before enqueueing: the Celery worker uses its own session and
    # would not see rows that were only flushed in this transaction.
    await db.commit()

    # Dispatch the Celery task, then record its id on the task row.
    celery_task = process_file_task.delay(file_id, str(task.id), workspace_id)
    task.celery_task_id = celery_task.id
    await db.flush()

    return {"file_id": file_id, "task_id": str(task.id), "status": "processing"}
@router.get("/updates")
async def get_updates(
    domain: str | None = None,
    limit: int = 20,
    offset: int = 0,
    db: AsyncSession = Depends(get_db),
):
    """List regulation updates, newest first.

    Args:
        domain: When given, only updates whose monitoring source belongs to
            this domain are returned.
        limit: Maximum number of rows to return.
        offset: Number of rows to skip.
        db: Request-scoped database session.

    Returns:
        ``{"updates": [...]}`` with one dict per update.
    """
    query = select(RegulationUpdate)
    if domain:
        # The domain is stored on the source row, not on the update itself.
        query = query.join(
            RegulationSource, RegulationUpdate.source_id == RegulationSource.id
        ).where(RegulationSource.domain == domain)
    query = query.order_by(desc(RegulationUpdate.fetched_at)).limit(limit).offset(offset)

    result = await db.execute(query)
    updates = result.scalars().all()
    return {
        "updates": [
            {
                "id": str(u.id),
                "title": u.title,
                "url": u.url,
                "change_type": u.change_type,
                "summary": u.summary,
                "importance": u.importance,
                "fetched_at": u.fetched_at.isoformat() if u.fetched_at else None,
            }
            for u in updates
        ]
    }
def get_llm(temperature: float = 0.1) -> ChatOpenAI:
    """Build a chat client for the configured provider.

    Both supported providers (DeepSeek and Qwen) are reached through
    ``ChatOpenAI`` with a provider-specific ``base_url``.

    Raises:
        ValueError: If ``settings.llm_provider`` is neither "deepseek" nor "qwen".
    """
    provider = settings.llm_provider

    if provider == "deepseek":
        model_name = settings.deepseek_model
        key = settings.deepseek_api_key
        endpoint = "https://api.deepseek.com/v1"
    elif provider == "qwen":
        model_name = settings.qwen_model
        key = settings.dashscope_api_key
        endpoint = "https://dashscope.aliyuncs.com/compatible-mode/v1"
    else:
        raise ValueError(f"不支持的 LLM 提供商:{settings.llm_provider}")

    # Retry and timeout settings are the same for every provider.
    return ChatOpenAI(
        model=model_name,
        api_key=key,
        base_url=endpoint,
        temperature=temperature,
        max_retries=3,
        timeout=120,
    )
@app.get("/health")
async def health():
    """Health check that also probes the embedding service and the MCP server.

    Each dependency is reported as "ok" (HTTP 200), "degraded" (any other
    status code) or "unavailable" (the request raised). The top-level
    ``status`` is always "ok": it only says this process is answering.
    """
    import asyncio

    import httpx
    from .core.config import settings

    async def _probe(client: httpx.AsyncClient, base_url: str) -> str:
        """Return the health label for one dependency."""
        try:
            r = await client.get(f"{base_url}/health")
        except Exception:
            # Best effort: a failing dependency must not fail this endpoint.
            return "unavailable"
        return "ok" if r.status_code == 200 else "degraded"

    # Probe concurrently. Two sequential 5 s timeouts add up to the 10 s
    # timeout of the image's HEALTHCHECK, which would mark this container
    # unhealthy whenever both dependencies are down.
    async with httpx.AsyncClient(timeout=5) as client:
        embedding_state, mcp_state = await asyncio.gather(
            _probe(client, settings.embedding_service_url),
            _probe(client, settings.mcp_server_url),
        )

    return {
        "status": "ok",
        "services": {"embedding": embedding_state, "mcp": mcp_state},
    }
class File(Base):
    """ORM model for an uploaded document (table ``files``)."""

    __tablename__ = "files"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    workspace_id = Column(UUID(as_uuid=True), ForeignKey("workspaces.id", ondelete="CASCADE"))
    filename = Column(String(500), nullable=False)
    original_name = Column(String(500), nullable=False)
    file_type = Column(String(50))
    file_size = Column(BigInteger)
    storage_path = Column(Text)
    parsed_path = Column(Text)
    status = Column(String(50), default="uploaded")
    error_msg = Column(Text)
    # `metadata` is reserved on Declarative classes (it holds the MetaData
    # registry), so defining it as a column raises at class creation. The
    # attribute is renamed while the database column keeps the name "metadata".
    # `default=dict` gives every row its own dict instead of one shared object.
    file_metadata = Column("metadata", JSONB, default=dict)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    workspace = relationship("Workspace", back_populates="files")
    tasks = relationship("Task", back_populates="file")
class ComplianceReport(Base):
    """ORM model for a stored compliance assessment (table ``compliance_reports``)."""

    __tablename__ = "compliance_reports"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    file_id = Column(UUID(as_uuid=True), ForeignKey("files.id"))
    regulation_domains = Column(ARRAY(Text))
    overall_risk_level = Column(String(20))
    risk_score = Column(Numeric(5, 2))
    # Callable defaults: each new row gets a fresh list rather than sharing
    # one module-level list object across all instances.
    findings = Column(JSONB, default=list)
    recommendations = Column(JSONB, default=list)
    report_markdown = Column(Text)
    llm_model = Column(String(100))
    created_at = Column(DateTime(timezone=True), server_default=func.now())
async def create_regulation_node(regulation: dict) -> str:
    """Upsert a Regulation node in Neo4j and return its id.

    The node is merged on ``id``; title, domain, version and code are
    overwritten, with missing keys stored as empty strings. Returns ``None``
    when the query yields no record.
    """
    params = {
        "id": regulation.get("id"),
        "title": regulation.get("title", ""),
        "domain": regulation.get("domain", ""),
        "version": regulation.get("version", ""),
        "code": regulation.get("code", ""),
    }
    driver = get_neo4j()
    async with driver.session() as session:
        result = await session.run(
            """
            MERGE (r:Regulation {id: $id})
            SET r.title = $title,
                r.domain = $domain,
                r.version = $version,
                r.code = $code
            RETURN r.id as id
            """,
            **params,
        )
        record = await result.single()
        if record:
            return record["id"]
        return None
async def search_related_regulations(domain: str, limit: int = 10) -> list[dict]:
    """Return up to ``limit`` Regulation nodes whose ``domain`` matches.

    Each item is a dict with the keys ``id``, ``title``, ``code`` and ``version``.
    """
    rows: list[dict] = []
    driver = get_neo4j()
    async with driver.session() as session:
        result = await session.run(
            """
            MATCH (r:Regulation {domain: $domain})
            RETURN r.id as id, r.title as title, r.code as code, r.version as version
            LIMIT $limit
            """,
            domain=domain,
            limit=limit,
        )
        # Consume the cursor while the session is still open.
        async for record in result:
            rows.append(dict(record))
    return rows
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 64) -> list[dict]:
    """Split text into overlapping chunks sized by an approximate token count.

    Tokens are estimated as two characters each. A chunk that does not reach
    the end of the text is cut at the last paragraph, line, sentence or space
    boundary found in its second half, when there is one.

    Args:
        text: The text to split.
        chunk_size: Target chunk size in estimated tokens.
        overlap: Overlap between consecutive chunks in estimated tokens.

    Returns:
        A list of dicts with ``idx`` (running index), ``content`` (stripped
        chunk text) and ``start``/``end`` (character offsets into ``text``).
        Whitespace-only chunks are dropped.
    """
    chars_per_chunk = chunk_size * 2  # roughly 2 characters per token for Chinese
    chars_overlap = overlap * 2
    separators = ("\n\n", "\n", "。", ".", " ")
    text_len = len(text)
    chunks = []
    start = 0

    while start < text_len:
        end = min(start + chars_per_chunk, text_len)
        # Prefer cutting at a natural boundary, but only in the second half
        # of the window so chunks do not become too small.
        if end < text_len:
            for sep in separators:
                pos = text.rfind(sep, start, end)
                if pos > start + chars_per_chunk // 2:
                    end = pos + len(sep)
                    break

        piece = text[start:end].strip()
        if piece:
            chunks.append({"idx": len(chunks), "content": piece, "start": start, "end": end})

        # Stop once the end of the text is covered. Stepping back by the
        # overlap here would re-emit the tail one character at a time.
        if end >= text_len:
            break

        # max() guarantees forward progress even if overlap >= chunk size.
        start = max(start + 1, end - chars_overlap)

    return chunks
async def generate_answer(query: str, chunks: list[dict]) -> dict:
    """Ask the LLM for an answer grounded in the retrieved chunks.

    Returns a dict with ``answer`` and ``sources``. With no chunks, a fixed
    "nothing found" answer is returned without calling the LLM. If the LLM
    call raises, the answer holds the error text followed by the first 200
    characters of the top chunk.
    """
    if not chunks:
        return {"answer": "未找到相关法规内容,请上传相关法规文档后重试。", "sources": []}

    def _format_reference(position: int, item: dict) -> str:
        """Render one chunk as a numbered, source-tagged reference."""
        meta = item.get("metadata", {})
        source_info = f"[来源 {position}:{meta.get('filename', '未知文件')},第 {meta.get('page', '?')} 页]"
        return f"{source_info}\n{item['content']}"

    # Build the RAG context; references are numbered from 1.
    context = "\n\n---\n\n".join(
        _format_reference(position, item) for position, item in enumerate(chunks, 1)
    )
    user_prompt = f"参考文献:\n\n{context}\n\n问题:{query}\n\n请基于以上参考文献回答,并标注来源。"

    llm = get_llm(temperature=0.1)
    messages = [SystemMessage(content=RAG_SYSTEM_PROMPT), HumanMessage(content=user_prompt)]

    try:
        response = await llm.ainvoke(messages)
        answer = response.content
    except Exception as e:
        logger.error(f"LLM 生成失败:{e}")
        answer = f"LLM 生成失败:{e}。检索到的相关内容:{chunks[0]['content'][:200]}..."

    sources = []
    for item in chunks:
        sources.append(
            {
                "content": item["content"][:300],
                "file_id": item.get("file_id", ""),
                "chunk_idx": item.get("chunk_idx", 0),
                "score": item.get("score", 0),
                "metadata": item.get("metadata", {}),
            }
        )
    return {"answer": answer, "sources": sources}
async def _process_file(file_id: str, task_id: str, workspace_id: str):
    """Parse an uploaded file, chunk it, embed the chunks and insert them into Milvus.

    File and task rows are updated as the work advances (progress 10, 40,
    40-100 during embedding, then 100). On any failure both rows are marked
    "failed" with the error text and the exception is re-raised.

    Args:
        file_id: UUID string of the ``File`` row to process.
        task_id: UUID string of the tracking ``Task`` row; it may not exist.
        workspace_id: Workspace id stored with every chunk.
    """
    from pathlib import Path
    from sqlalchemy import select
    from .core.deps import AsyncSessionLocal, get_milvus_collection
    from .models.db import File, Task
    from .services.parse import parse_document, chunk_text
    from .services.embed import embed_texts

    async with AsyncSessionLocal() as db:
        # Look up the file record.
        result = await db.execute(select(File).where(File.id == uuid.UUID(file_id)))
        file_record = result.scalar_one_or_none()
        if not file_record:
            logger.error(f"文件 {file_id} 不存在")
            return

        task_result = await db.execute(select(Task).where(Task.id == uuid.UUID(task_id)))
        task = task_result.scalar_one_or_none()

        try:
            # Mark the work as started.
            file_record.status = "parsing"
            if task:
                task.status = "running"
                task.progress = 10
            await db.commit()

            # Step 1: parse the document.
            file_content = Path(file_record.storage_path).read_bytes()
            parse_result = await parse_document(file_content, file_record.original_name)
            markdown = parse_result.get("markdown", "")

            if not markdown.strip():
                raise ValueError("文档解析结果为空")

            file_record.status = "parsed"
            if task:
                task.progress = 40
            await db.commit()

            # Step 2: split into chunks.
            chunks = chunk_text(markdown, chunk_size=512, overlap=64)
            logger.info(f"文件 {file_id} 分割为 {len(chunks)} 块")

            # Step 3: embed and insert, batch by batch.
            batch_size = 16
            col = get_milvus_collection("regulation_chunks")
            total = len(chunks)

            for i in range(0, total, batch_size):
                batch = chunks[i:i + batch_size]
                texts = [c["content"] for c in batch]
                embed_result = await embed_texts(texts, batch_size=batch_size)
                dense_vecs = embed_result["dense"]

                # Column-ordered field values for the insert.
                entities = [
                    [f"{file_id}_{c['idx']}" for c in batch],
                    [file_id] * len(batch),
                    [workspace_id] * len(batch),
                    [c["idx"] for c in batch],
                    [c["content"] for c in batch],
                    dense_vecs,
                    [{"filename": file_record.original_name, "page": c.get("page", 0)} for c in batch],
                ]
                col.insert(entities)

                if task:
                    # Count chunks actually processed: the last batch is
                    # usually short, and i + batch_size would push the
                    # reported progress past 100.
                    done = min(i + batch_size, total)
                    task.progress = 40 + int(60 * done / total)
                    await db.commit()

            col.flush()

            # Done.
            file_record.status = "vectorized"
            if task:
                task.status = "completed"
                task.progress = 100
                task.completed_at = datetime.now(timezone.utc)
            await db.commit()
            logger.info(f"文件 {file_id} 处理完成")

        except Exception as e:
            logger.error(f"文件 {file_id} 处理失败:{e}")
            file_record.status = "failed"
            file_record.error_msg = str(e)
            if task:
                task.status = "failed"
                task.error_msg = str(e)
            await db.commit()
            raise
@celery_app.task(name="app.worker.send_notifications")
def send_notifications():
    """Placeholder for the notification push task; for now it only writes a log line."""
    message = "推送通知任务执行(待实现)"
    logger.info(message)
"beautifulsoup4>=4.12", + "requests>=2.32", +] diff --git a/services/embedding/Dockerfile b/services/embedding/Dockerfile new file mode 100644 index 0000000..92e2e99 --- /dev/null +++ b/services/embedding/Dockerfile @@ -0,0 +1,24 @@ +FROM python:3.12-slim + +WORKDIR /app + +# 系统依赖 +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Python 依赖(先装,利用构建缓存) +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt \ + --index-url https://pypi.tuna.tsinghua.edu.cn/simple \ + --trusted-host pypi.tuna.tsinghua.edu.cn + +COPY main.py . + +# 健康检查 +HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \ + CMD curl -f http://localhost:8010/health || exit 1 + +EXPOSE 8010 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8010", "--workers", "1"] diff --git a/services/embedding/main.py b/services/embedding/main.py new file mode 100644 index 0000000..43de86a --- /dev/null +++ b/services/embedding/main.py @@ -0,0 +1,87 @@ +import os +import logging +from contextlib import asynccontextmanager +from typing import Optional + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +MODEL_NAME = os.getenv("MODEL_NAME", "BAAI/bge-m3") +MODEL_CACHE = os.getenv("HF_HOME", "/app/models") +DEVICE = os.getenv("DEVICE", "cpu") +MAX_BATCH = int(os.getenv("MAX_BATCH_SIZE", "16")) + +# 设置 HuggingFace 镜像 +if os.getenv("HF_ENDPOINT"): + os.environ["HF_ENDPOINT"] = os.getenv("HF_ENDPOINT") + +model = None + + +@asynccontextmanager +async def lifespan(app: FastAPI): + global model + logger.info(f"加载模型 {MODEL_NAME},设备:{DEVICE}") + try: + from FlagEmbedding import BGEM3FlagModel + model = BGEM3FlagModel( + MODEL_NAME, + use_fp16=(DEVICE != "cpu"), + cache_dir=MODEL_CACHE, + ) + logger.info("BGE-M3 模型加载完成") + except Exception as e: + logger.error(f"模型加载失败:{e}") + raise + yield + 
@app.post("/embed", response_model=EmbedResponse)
def embed(req: EmbedRequest) -> EmbedResponse:
    """Encode the request texts and return dense and/or sparse embeddings.

    Args:
        req: Texts to encode, the encoder batch size, and flags selecting which
            representations to return.

    Returns:
        An ``EmbedResponse`` carrying the requested vectors, the model name and
        the number of input texts. Representations that were not requested are
        ``None``.

    Raises:
        HTTPException: 503 while the model is not loaded, 400 when more than 100
            texts are sent, 500 when encoding fails.
    """
    if model is None:
        raise HTTPException(status_code=503, detail="模型未就绪")
    # Kept as a second line of defence; the request model already caps the list at 100.
    if len(req.texts) > 100:
        raise HTTPException(status_code=400, detail="单次最多 100 条文本")

    try:
        output = model.encode(
            req.texts,
            batch_size=req.batch_size,
            return_dense=req.return_dense,
            return_sparse=req.return_sparse,
        )

        dense = output["dense_vecs"].tolist() if req.return_dense else None

        sparse = None
        if req.return_sparse:
            # Coerce every weight to a built-in float. The encoder appears to hand
            # back NumPy scalars here, which the JSON response serializer may
            # reject — TODO confirm against the installed FlagEmbedding version.
            sparse = [
                {token: float(weight) for token, weight in dict(weights).items()}
                for weights in output["lexical_weights"]
            ]

        return EmbedResponse(
            dense=dense,
            sparse=sparse,
            model=MODEL_NAME,
            count=len(req.texts),
        )
    except Exception as e:
        logger.error(f"嵌入生成失败:{e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Settings read from the environment, with container-path defaults.
DEVICE = os.getenv("DEVICE", "cpu")  # in the code visible here, only reported by /health
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", "/app/uploads"))
PARSED_DIR = Path(os.getenv("PARSED_DIR", "/app/parsed"))

# Both directories are created at import time, so importing this module touches the filesystem.
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
PARSED_DIR.mkdir(parents=True, exist_ok=True)

app = FastAPI(title="MinerU 文档解析服务")

# MIME type -> short format name.
# NOTE(review): not referenced by the handlers visible in this file, which
# dispatch on the filename suffix instead — confirm whether it is still needed.
SUPPORTED_TYPES = {
    "application/pdf": "pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "application/msword": "doc",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
}
@app.post("/mineru-parse", response_model=ParseResponse)
async def mineru_parse(file: UploadFile = File(...)) -> ParseResponse:
    """Parse an uploaded PDF or Word document into markdown.

    The parser is chosen from the filename suffix (a missing filename is treated
    as ``doc.pdf``). The upload is written to a temporary file, parsed in a
    worker thread, and the temporary file is always removed afterwards.

    Args:
        file: The uploaded document.

    Returns:
        A ``ParseResponse`` with the markdown text, a page-count estimate
        (never below 1) and the name of the selected parser.

    Raises:
        HTTPException: 415 when the suffix is not ``.pdf``, ``.docx`` or ``.doc``.
    """
    import asyncio  # local import: only this handler needs it

    suffix = Path(file.filename or "doc.pdf").suffix.lower()

    # Pick the parser first so unsupported uploads are rejected before any disk I/O.
    if suffix == ".pdf":
        # parse_pdf_mineru falls back to PyMuPDF on failure, so this label names
        # the parser that was attempted, not necessarily the one that succeeded.
        parse_fn, parser = parse_pdf_mineru, "mineru"
    elif suffix in (".docx", ".doc"):
        parse_fn, parser = parse_docx, "python-docx"
    else:
        raise HTTPException(status_code=415, detail=f"不支持的文件类型:{suffix}")

    content = await file.read()

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            # Remember the path before writing so a failed write is still cleaned up.
            tmp_path = tmp.name
            tmp.write(content)

        # The parsers are synchronous and can run for a long time; calling them
        # directly would block the event loop, /health included.
        markdown = await asyncio.to_thread(parse_fn, tmp_path)

        # Heuristic: "## 第" page headings are only written by the PyMuPDF
        # fallback; for other output this counts paragraph breaks instead.
        page_count = markdown.count("## 第") if suffix == ".pdf" else markdown.count("\n\n")
        return ParseResponse(
            filename=file.filename or "unknown",
            markdown=markdown,
            page_count=max(page_count, 1),
            parser=parser,
        )
    finally:
        if tmp_path is not None:
            os.unlink(tmp_path)