feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated the RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions
--- a/.env.example
+++ b/.env.example
@@ -9,12 +9,12 @@ DEBUG=false
 # ===== Milvus向量数据库配置 =====
 MILVUS_HOST=localhost
 MILVUS_PORT=19530
-MILVUS_COLLECTION=regulations_dense_1536
+MILVUS_COLLECTION=regulations_dense_1024_v1
 MILVUS_DB_NAME=default

 # ===== 嵌入模型配置 =====
 EMBEDDING_MODEL=text-embedding-v3
-EMBEDDING_DIM=1536
+EMBEDDING_DIM=1024
 EMBEDDING_API_KEY=your_embedding_api_key_here
 EMBEDDING_BASE_URL=http://6.86.80.4:30080/v1
 EMBEDDING_TIMEOUT_SECONDS=120
@@ -44,11 +44,20 @@ CHUNK_SIZE=512
 CHUNK_OVERLAP=50
 MAX_FILE_SIZE_MB=100
 DOCUMENT_METADATA_PATH=backend/data/documents.json
+PARSER_BACKEND=aliyun
+CHUNK_BACKEND=aliyun

 # ===== 阿里云文档解析 =====
 ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
 ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
 ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
+ALIYUN_PARSE_POLL_INTERVAL_SECONDS=5
+ALIYUN_PARSE_TIMEOUT_SECONDS=900
+ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
+ALIYUN_LLM_ENHANCEMENT=true
+ALIYUN_ENHANCEMENT_MODE=VLM
+DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
+PARSER_FAILURE_MODE=fail

 # ===== API服务配置 =====
 API_HOST=0.0.0.0
@@ -73,7 +82,7 @@ DEEPSEEK_BASE_URL=http://6.86.80.4:30080/v1
 # Qwen系列: qwen3.5-plus, qwen3-plus, qwen-max, qwen-turbo, qwen-long
 # Qwen VL系列: qwen3-vl-plus, qwen-vl-max
 # DeepSeek系列: deepseek-v4-flash, deepseek-v3.2, deepseek-v3, deepseek-chat, deepseek-coder
-QWEN_MODEL=qwen3.5-plus
+QWEN_MODEL=qwen3.6-plus
 QWEN_VL_MODEL=qwen3-vl-plus
 DEEPSEEK_MODEL=deepseek-v4-flash