first commit
This commit is contained in:
569
02_组件安装指南.md
Normal file
569
02_组件安装指南.md
Normal file
@@ -0,0 +1,569 @@
|
||||
# AI合规智能中枢 — 组件安装指南
|
||||
|
||||
> 本文档提供每个组件的详细安装步骤、配置说明和验证方法。
|
||||
|
||||
---
|
||||
|
||||
## 前置:Docker 环境安装
|
||||
|
||||
### Ubuntu 22.04 LTS
|
||||
|
||||
```bash
|
||||
# 1. 更新包列表
|
||||
sudo apt-get update
|
||||
|
||||
# 2. 安装依赖
|
||||
sudo apt-get install -y ca-certificates curl gnupg lsb-release
|
||||
|
||||
# 3. 添加 Docker GPG 密钥
|
||||
sudo install -m 0755 -d /etc/apt/keyrings
|
||||
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \
|
||||
sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
|
||||
sudo chmod a+r /etc/apt/keyrings/docker.gpg
|
||||
|
||||
# 4. 添加 Docker 仓库
|
||||
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \
|
||||
https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | \
|
||||
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
|
||||
|
||||
# 5. 安装 Docker CE
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y docker-ce docker-ce-cli containerd.io \
|
||||
docker-buildx-plugin docker-compose-plugin
|
||||
|
||||
# 6. 加入 docker 组(免 sudo)
|
||||
sudo usermod -aG docker $USER
|
||||
newgrp docker
|
||||
|
||||
# 7. 验证
|
||||
docker --version # Docker version 27.x.x
|
||||
docker compose version # Docker Compose version v2.x.x
|
||||
```
|
||||
|
||||
### Windows 11 + WSL2
|
||||
|
||||
```powershell
|
||||
# PowerShell(管理员)
|
||||
|
||||
# 1. 启用 WSL2
|
||||
wsl --install -d Ubuntu-22.04
|
||||
wsl --set-default-version 2
|
||||
|
||||
# 2. 安装 Docker Desktop(需重启)
|
||||
winget install -e --id Docker.DockerDesktop
|
||||
|
||||
# 3. 重启后,Docker Desktop 设置:
|
||||
# Settings → General → "Use WSL 2 based engine" ✓
|
||||
# Settings → Resources → WSL Integration → Ubuntu-22.04 ✓
|
||||
```
|
||||
|
||||
### GPU 支持(可选,有 NVIDIA GPU 时)
|
||||
|
||||
```bash
|
||||
# Ubuntu 安装 nvidia-container-toolkit
|
||||
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
|
||||
sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
|
||||
|
||||
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
|
||||
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
|
||||
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
|
||||
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y nvidia-container-toolkit
|
||||
sudo nvidia-ctk runtime configure --runtime=docker
|
||||
sudo systemctl restart docker
|
||||
|
||||
# 验证
|
||||
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件一:PostgreSQL 16 + pgvector
|
||||
|
||||
**用途:** 存储元数据(文件记录、任务状态、合规报告、法规变更)
|
||||
|
||||
### 配置参数
|
||||
|
||||
```yaml
|
||||
# docker-compose.yml 中的关键配置
|
||||
image: pgvector/pgvector:pg16 # 内置 pgvector 扩展
|
||||
POSTGRES_USER: compliance
|
||||
POSTGRES_PASSWORD: <your_password>
|
||||
POSTGRES_DB: compliance_db
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data # 数据持久化
|
||||
- ./init-sql:/docker-entrypoint-initdb.d # 自动执行初始化 SQL
|
||||
ports:
|
||||
- "5432:5432"
|
||||
```
|
||||
|
||||
### 启动与验证
|
||||
|
||||
```bash
|
||||
# 启动
|
||||
docker compose up -d postgres
|
||||
|
||||
# 等待健康(约10秒)
|
||||
docker compose ps postgres
|
||||
|
||||
# 连接测试
|
||||
docker compose exec postgres psql -U compliance -d compliance_db -c "\dt"
|
||||
|
||||
# 验证扩展
|
||||
docker compose exec postgres psql -U compliance -d compliance_db \
|
||||
-c "SELECT extname FROM pg_extension WHERE extname IN ('vector', 'uuid-ossp');"
|
||||
```
|
||||
|
||||
### 常用操作
|
||||
|
||||
```bash
|
||||
# 查看所有表
|
||||
docker compose exec postgres psql -U compliance -d compliance_db \
|
||||
-c "\dt"
|
||||
|
||||
# 查询任务状态
|
||||
docker compose exec postgres psql -U compliance -d compliance_db \
|
||||
-c "SELECT id, task_type, status, created_at FROM tasks ORDER BY created_at DESC LIMIT 10;"
|
||||
|
||||
# 备份数据库
|
||||
# 注意:重定向输出时必须加 -T(禁用伪终端),否则备份文件可能被 TTY 换行符污染
docker compose exec -T postgres pg_dump -U compliance compliance_db > backup_$(date +%Y%m%d).sql
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件二:Redis 7
|
||||
|
||||
**用途:** Celery 消息中间件、热数据缓存、分布式锁、会话存储
|
||||
|
||||
### 配置参数
|
||||
|
||||
```yaml
|
||||
image: redis:7-alpine
|
||||
# allkeys-lru:内存满时淘汰最近最少使用的 key(注释必须写在折叠块 `>` 之外,否则会被当作 redis-server 的参数)
command: >
|
||||
redis-server
|
||||
--requirepass <your_password>
|
||||
--maxmemory 2gb
|
||||
--maxmemory-policy allkeys-lru
|
||||
ports:
|
||||
- "6379:6379"
|
||||
```
|
||||
|
||||
### 启动与验证
|
||||
|
||||
```bash
|
||||
# 启动
|
||||
docker compose up -d redis
|
||||
|
||||
# 连接测试
|
||||
docker compose exec redis redis-cli -a <password> ping
|
||||
# 应返回:PONG
|
||||
|
||||
# 查看 Celery 队列长度
|
||||
docker compose exec redis redis-cli -a <password> llen celery
|
||||
|
||||
# 查看内存使用
|
||||
docker compose exec redis redis-cli -a <password> info memory | grep used_memory_human
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件三:Milvus 2.4 Standalone
|
||||
|
||||
**用途:** 向量数据库,存储 BGE-M3 嵌入向量,支持混合检索
|
||||
|
||||
### 架构说明
|
||||
|
||||
Milvus Standalone 部署由三个容器组成(etcd 与 MinIO 是 Milvus 的外部依赖):
|
||||
- **etcd**:元数据存储(Collection 定义、索引配置)
|
||||
- **MinIO**:向量段文件存储
|
||||
- **milvus**:查询/写入引擎
|
||||
|
||||
### 启动顺序(严格按顺序)
|
||||
|
||||
```bash
|
||||
# 1. 先启动 etcd
|
||||
docker compose up -d etcd
|
||||
sleep 10
|
||||
|
||||
# 2. 再启动 MinIO
|
||||
docker compose up -d minio
|
||||
sleep 10
|
||||
|
||||
# 3. 最后启动 Milvus(依赖前两者)
|
||||
docker compose up -d milvus
|
||||
# Milvus 冷启动约需 60 秒,请耐心等待
|
||||
```
|
||||
|
||||
### 验证
|
||||
|
||||
```bash
|
||||
# HTTP 健康检查
|
||||
curl http://localhost:9091/healthz
|
||||
# 应返回:OK(HTTP 200)
|
||||
|
||||
# Python 连接测试
|
||||
python3 -c "
|
||||
from pymilvus import connections, utility
|
||||
connections.connect(host='localhost', port='19530')
|
||||
print('Collections:', utility.list_collections())
|
||||
print('Milvus 连接成功')
|
||||
"
|
||||
```
|
||||
|
||||
### 创建 Collection(向量索引)
|
||||
|
||||
```python
|
||||
from pymilvus import (connections, Collection, CollectionSchema,
|
||||
FieldSchema, DataType, utility)
|
||||
|
||||
connections.connect(host='localhost', port='19530')
|
||||
|
||||
fields = [
|
||||
FieldSchema('id', DataType.VARCHAR, is_primary=True, max_length=128),
|
||||
FieldSchema('content', DataType.VARCHAR, max_length=65535),
|
||||
FieldSchema('dense_vec', DataType.FLOAT_VECTOR, dim=1024), # BGE-M3
|
||||
FieldSchema('metadata', DataType.JSON),
|
||||
]
|
||||
|
||||
schema = CollectionSchema(fields, description='法规条款向量库')
|
||||
col = Collection('regulation_chunks', schema)
|
||||
|
||||
# 创建 HNSW 索引(速度/精度平衡)
|
||||
col.create_index('dense_vec', {
|
||||
'metric_type': 'COSINE',
|
||||
'index_type': 'HNSW',
|
||||
'params': {'M': 16, 'efConstruction': 200}
|
||||
})
|
||||
col.load()
|
||||
print('Collection 创建完成')
|
||||
```
|
||||
|
||||
### 常用查询
|
||||
|
||||
```python
|
||||
# 向量相似度检索
|
||||
results = col.search(
|
||||
data=[query_vector], # 查询向量(1024维)
|
||||
anns_field='dense_vec',
|
||||
param={'metric_type': 'COSINE', 'params': {'ef': 100}},
|
||||
limit=10,
|
||||
output_fields=['content', 'metadata']
|
||||
)
|
||||
|
||||
# 查看 Collection 统计
|
||||
print(col.num_entities) # 向量总数
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件四:Neo4j 5 Community
|
||||
|
||||
**用途:** 知识图谱存储,法规-条款-义务实体关系
|
||||
|
||||
### 配置参数
|
||||
|
||||
```yaml
|
||||
image: neo4j:5.20-community
|
||||
environment:
|
||||
NEO4J_AUTH: neo4j/<your_password>
|
||||
NEO4J_PLUGINS: '["apoc"]' # 必须安装 APOC 插件
|
||||
NEO4J_server_memory_heap_max__size: 2G # Neo4j 5 起配置项更名为 server.memory.heap.max_size
|
||||
ports:
|
||||
- "7474:7474" # Browser UI
|
||||
- "7687:7687" # Bolt 协议(应用连接用)
|
||||
```
|
||||
|
||||
### 启动与验证
|
||||
|
||||
```bash
|
||||
# 启动
|
||||
docker compose up -d neo4j
|
||||
# 首次启动约需 60 秒(下载 APOC 插件)
|
||||
|
||||
# 浏览器访问:http://localhost:7474
|
||||
# 用户名:neo4j,密码:见 .env 中 NEO4J_PASSWORD
|
||||
|
||||
# 命令行连接
|
||||
docker compose exec neo4j cypher-shell -u neo4j -p <password>
|
||||
```
|
||||
|
||||
### 常用 Cypher 查询
|
||||
|
||||
```cypher
|
||||
// 查看所有节点类型
|
||||
CALL apoc.meta.schema() YIELD value RETURN value;
|
||||
|
||||
// 创建法规节点
|
||||
CREATE (r:Regulation {
|
||||
id: 'GB18384-2020',
|
||||
title: 'GB 18384-2020 电动汽车安全要求',
|
||||
domain: 'vehicle_safety',
|
||||
effective_date: date('2021-01-01'),
|
||||
version: '2020'
|
||||
});
|
||||
|
||||
// 法规-条款关系
|
||||
MATCH (r:Regulation {id: 'GB18384-2020'})
|
||||
CREATE (c:Clause {
|
||||
id: 'GB18384-2020-2.1',
|
||||
number: '2.1',
|
||||
content: '绝缘电阻要求:直流电路绝缘电阻不得低于100Ω/V'
|
||||
})
|
||||
CREATE (r)-[:CONTAINS]->(c);
|
||||
|
||||
// 多跳查询:查找某法规所有义务
|
||||
MATCH (r:Regulation {domain: 'vehicle_safety'})-[:CONTAINS]->(c)-[:REQUIRES]->(o)
|
||||
RETURN r.title, c.number, o.description LIMIT 20;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件五:BGE-M3 嵌入服务
|
||||
|
||||
**用途:** 将文本转换为 1024 维向量,支持中英双语,支持 Dense+Sparse 混合检索
|
||||
|
||||
### 服务构建
|
||||
|
||||
```bash
|
||||
# 构建镜像
|
||||
docker compose build embedding-service
|
||||
|
||||
# 首次启动(会自动下载 BGE-M3 模型约 2.5GB)
|
||||
docker compose up -d embedding-service
|
||||
|
||||
# 查看下载进度
|
||||
docker compose logs -f embedding-service
|
||||
```
|
||||
|
||||
### 模型预下载(推荐,避免启动超时)
|
||||
|
||||
```bash
|
||||
# 方法1:通过 hf-mirror.com 加速
|
||||
bash scripts/download_models.sh
|
||||
|
||||
# 方法2:通过 ModelScope(国内最快)
|
||||
pip install modelscope
|
||||
python3 -c "
|
||||
from modelscope import snapshot_download
|
||||
snapshot_download('AI-ModelScope/bge-m3', cache_dir='./models/modelscope')
|
||||
"
|
||||
```
|
||||
|
||||
### API 使用
|
||||
|
||||
```bash
|
||||
# 健康检查
|
||||
curl http://localhost:8010/health
|
||||
|
||||
# 生成嵌入向量
|
||||
curl -X POST http://localhost:8010/embed \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"texts": ["GB 18384 电动汽车碰撞安全", "vehicle crash safety requirements"],
|
||||
"batch_size": 2
|
||||
}'
|
||||
# 返回:{"dense": [[...1024个浮点数...], [...]], "sparse": [{...词汇权重...}, {...}]}
|
||||
```
|
||||
|
||||
### 性能参考
|
||||
|
||||
| 模式 | 硬件 | 速度 |
|
||||
|------|------|------|
|
||||
| CPU | 16核,64GB RAM | 约 2-5 秒/批(batch=16)|
|
||||
| GPU | RTX 3090 24GB | 约 0.2-0.5 秒/批(batch=32)|
|
||||
|
||||
---
|
||||
|
||||
## 组件六:MinerU 文档解析服务
|
||||
|
||||
**用途:** 将 PDF/Word/Excel 解析为 Markdown + 结构化 JSON
|
||||
|
||||
### 服务构建
|
||||
|
||||
```bash
|
||||
# 构建镜像(首次约需 10-20 分钟,下载大量依赖)
|
||||
docker compose build mcp-server
|
||||
|
||||
# 启动服务(首次会下载 MinerU 模型约 2GB)
|
||||
docker compose up -d mcp-server
|
||||
|
||||
# 查看启动日志
|
||||
docker compose logs -f mcp-server
|
||||
```
|
||||
|
||||
### API 使用
|
||||
|
||||
```bash
|
||||
# 解析 PDF
|
||||
curl -X POST http://localhost:8011/mineru-parse \
|
||||
-F "file=@/path/to/regulation.pdf"
|
||||
# 返回:{"markdown": "# 法规标题\n\n## 第一章...", "filename": "regulation.pdf"}
|
||||
|
||||
# 解析 Word 文档
|
||||
curl -X POST http://localhost:8011/parse-document \
|
||||
-F "file=@/path/to/document.docx"
|
||||
```
|
||||
|
||||
### 性能参考
|
||||
|
||||
| 模式 | 速度 | 说明 |
|
||||
|------|------|------|
|
||||
| CPU | 3-5 秒/页 | 调研阶段可接受 |
|
||||
| GPU(RTX 3090)| 0.21 秒/页 | 生产推荐 |
|
||||
|
||||
---
|
||||
|
||||
## 组件七:业务后端(compliance-backend)
|
||||
|
||||
**用途:** FastAPI 主服务,整合所有业务逻辑
|
||||
|
||||
### 关键依赖配置
|
||||
|
||||
```bash
|
||||
# .env 中必须设置
|
||||
DEEPSEEK_API_KEY=sk-xxxx # DeepSeek API Key
|
||||
LLM_PROVIDER=deepseek # 或 qwen
|
||||
DATABASE_URL=postgresql+asyncpg://...
|
||||
REDIS_URL=redis://:password@redis:6379/0
|
||||
MILVUS_HOST=milvus
|
||||
NEO4J_URI=bolt://neo4j:7687
|
||||
EMBEDDING_SERVICE_URL=http://embedding-service:8010
|
||||
MCP_SERVER_URL=http://mcp-server:8011
|
||||
```
|
||||
|
||||
### 启动与验证
|
||||
|
||||
```bash
|
||||
# 启动服务
|
||||
docker compose up -d compliance-backend celery-worker celery-beat
|
||||
|
||||
# 验证 API 文档
|
||||
xdg-open http://localhost:8000/docs # macOS 使用 open;无桌面环境时请直接在浏览器中访问该地址
|
||||
|
||||
# 查看健康状态(包含所有依赖)
|
||||
curl http://localhost:8000/health
|
||||
```
|
||||
|
||||
### Celery Worker 监控
|
||||
|
||||
```bash
|
||||
# 查看 Worker 状态
|
||||
docker compose exec celery-worker celery -A app.worker inspect active
|
||||
|
||||
# 查看队列积压
|
||||
docker compose exec redis redis-cli -a <password> llen celery
|
||||
|
||||
# Worker 日志
|
||||
docker compose logs -f celery-worker
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 组件八:Nginx API 网关
|
||||
|
||||
**用途:** 反向代理,统一路由,TLS 终止(生产)
|
||||
|
||||
### 配置说明(config/nginx.conf)
|
||||
|
||||
```nginx
|
||||
upstream compliance_backend {
|
||||
server compliance-backend:8000;
|
||||
}
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
client_max_body_size 100M; # 支持大 PDF 上传
|
||||
proxy_read_timeout 300s; # LLM 推理超时设置
|
||||
|
||||
location /api/kb/ { proxy_pass http://compliance_backend; }
|
||||
location /api/compliance/ { proxy_pass http://compliance_backend; }
|
||||
location /api/regulation/ { proxy_pass http://compliance_backend; }
|
||||
location /health { proxy_pass http://compliance_backend; }
|
||||
location /docs { proxy_pass http://compliance_backend; }
location /openapi.json { proxy_pass http://compliance_backend; } # Swagger UI 页面需要加载该接口描述文件
|
||||
}
|
||||
```
|
||||
|
||||
### 启动与验证
|
||||
|
||||
```bash
|
||||
# 启动
|
||||
docker compose up -d nginx
|
||||
|
||||
# 测试路由
|
||||
curl http://localhost/health
|
||||
curl http://localhost/docs # 应返回 Swagger UI HTML
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 完整启动顺序
|
||||
|
||||
```bash
|
||||
# 方式1:分步启动(推荐,含健康等待)
|
||||
bash scripts/06_start_all.sh
|
||||
|
||||
# 方式2:手动分步
|
||||
docker compose up -d postgres redis # 等30s
|
||||
docker compose up -d etcd minio # 等30s
|
||||
docker compose up -d milvus # 等60s
|
||||
docker compose up -d neo4j # 等60s
|
||||
docker compose build embedding-service mcp-server compliance-backend
|
||||
docker compose up -d embedding-service mcp-server # 等120s(模型加载)
|
||||
bash scripts/05_init_db.sh # 初始化数据库
|
||||
docker compose up -d compliance-backend celery-worker celery-beat nginx
|
||||
|
||||
# 验证
|
||||
bash scripts/check_health.sh
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 常见问题
|
||||
|
||||
### Q: Milvus 启动失败
|
||||
|
||||
```bash
|
||||
# 检查 etcd 和 minio 是否健康
|
||||
docker compose ps etcd minio
|
||||
|
||||
# 查看 Milvus 日志
|
||||
docker compose logs milvus | tail -50
|
||||
|
||||
# 常见原因:内存不足(Milvus 需要至少 4GB 可用内存)
|
||||
free -h
|
||||
```
|
||||
|
||||
### Q: BGE-M3 模型下载失败
|
||||
|
||||
```bash
|
||||
# 使用镜像加速
|
||||
export HF_ENDPOINT=https://hf-mirror.com
|
||||
docker compose up -d embedding-service
|
||||
|
||||
# 或使用预下载脚本(经 hf-mirror.com 加速);ModelScope 下载方式见上文「模型预下载」方法2
|
||||
bash scripts/download_models.sh
|
||||
```
|
||||
|
||||
### Q: DeepSeek API 连接超时
|
||||
|
||||
```bash
|
||||
# 测试连通性
|
||||
curl -X POST https://api.deepseek.com/v1/chat/completions \
|
||||
-H "Authorization: Bearer $DEEPSEEK_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model": "deepseek-chat", "messages": [{"role": "user", "content": "ping"}]}'
|
||||
|
||||
# 常见原因:API Key 未设置或网络问题
|
||||
```
|
||||
|
||||
### Q: 内存不足
|
||||
|
||||
```bash
|
||||
# 查看内存使用
|
||||
docker stats --no-stream
|
||||
|
||||
# 临时解决:减少 BGE-M3 批大小(降低内存峰值)
|
||||
# 编辑 .env,添加:
|
||||
# EMBEDDING_BATCH_SIZE=4 (默认16)
|
||||
```
|
||||
Reference in New Issue
Block a user