feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
ash66
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions

View File

@@ -9,12 +9,12 @@ DEBUG=false
# ===== Milvus向量数据库配置 =====
MILVUS_HOST=localhost
MILVUS_PORT=19530
MILVUS_COLLECTION=regulations_dense_1536
MILVUS_COLLECTION=regulations_dense_1024_v1
MILVUS_DB_NAME=default
# ===== 嵌入模型配置 =====
EMBEDDING_MODEL=text-embedding-v3
EMBEDDING_DIM=1536
EMBEDDING_DIM=1024
EMBEDDING_API_KEY=your_embedding_api_key_here
EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1
EMBEDDING_TIMEOUT_SECONDS=120
@@ -44,11 +44,20 @@ CHUNK_SIZE=512
CHUNK_OVERLAP=50
MAX_FILE_SIZE_MB=100
DOCUMENT_METADATA_PATH=backend/data/documents.json
PARSER_BACKEND=aliyun
CHUNK_BACKEND=aliyun
# ===== 阿里云文档解析 =====
ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
ALIYUN_PARSE_POLL_INTERVAL_SECONDS=5
ALIYUN_PARSE_TIMEOUT_SECONDS=900
ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
ALIYUN_LLM_ENHANCEMENT=true
ALIYUN_ENHANCEMENT_MODE=VLM
DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
PARSER_FAILURE_MODE=fail
# ===== API服务配置 =====
API_HOST=0.0.0.0
@@ -73,7 +82,7 @@ DEEPSEEK_BASE_URL=http://6.86.80.4:30080/v1
# Qwen系列: qwen3.5-plus, qwen3-plus, qwen-max, qwen-turbo, qwen-long
# Qwen VL系列: qwen3-vl-plus, qwen-vl-max
# DeepSeek系列: deepseek-v4-flash, deepseek-v3.2, deepseek-v3, deepseek-chat, deepseek-coder
QWEN_MODEL=qwen3.5-plus
QWEN_MODEL=qwen3.6-plus
QWEN_VL_MODEL=qwen3-vl-plus
DEEPSEEK_MODEL=deepseek-v4-flash