feat: Migrate document parsing to Aliyun and update embedding configurations
- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
@@ -1,9 +1,9 @@
|
||||
"""Configure backend settings for settings."""
|
||||
"""Configure backend settings for the backend application."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
from pydantic import Field
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
from functools import lru_cache
|
||||
# Keep configuration setup explicit so runtime behavior is easy to reason about.
|
||||
|
||||
@@ -33,18 +33,25 @@ class Settings(BaseSettings):
|
||||
# Keep configuration setup explicit so runtime behavior is easy to reason about.
|
||||
milvus_host: str = Field(default="localhost", description="Milvus服务地址")
|
||||
milvus_port: int = Field(default=19530, description="Milvus服务端口")
|
||||
milvus_collection: str = Field(default="regulations_dense_1536", description="法规向量集合名称")
|
||||
milvus_collection: str = Field(default="regulations_dense_1024_v1", description="法规向量集合名称")
|
||||
milvus_db_name: str = Field(default="default", description="Milvus数据库名称")
|
||||
|
||||
# Keep configuration setup explicit so runtime behavior is easy to reason about.
|
||||
embedding_model: str = Field(default="text-embedding-v3", description="嵌入模型名称")
|
||||
embedding_dim: int = Field(default=1536, description="嵌入向量维度")
|
||||
embedding_dim: int = Field(default=1024, description="嵌入向量维度")
|
||||
embedding_api_key: str = Field(default="", description="Embedding API密钥")
|
||||
embedding_base_url: str = Field(default="http://6.86.80.4:30080/v1", description="Embedding API地址")
|
||||
embedding_timeout_seconds: int = Field(default=120, description="Embedding API超时时间(秒)")
|
||||
alibaba_access_key_id: str = Field(default="", description="阿里云文档解析 Access Key ID")
|
||||
alibaba_access_key_secret: str = Field(default="", description="阿里云文档解析 Access Key Secret")
|
||||
alibaba_endpoint: str = Field(default="docmind-api.cn-hangzhou.aliyuncs.com", description="阿里云文档解析 endpoint")
|
||||
aliyun_parse_poll_interval_seconds: int = Field(default=5, description="阿里云文档解析轮询间隔(秒)")
|
||||
aliyun_parse_timeout_seconds: int = Field(default=900, description="阿里云文档解析超时时间(秒)")
|
||||
aliyun_parse_layout_step_size: int = Field(default=50, description="阿里云文档解析分页步长")
|
||||
aliyun_llm_enhancement: bool = Field(default=True, description="是否启用阿里云解析增强")
|
||||
aliyun_enhancement_mode: str = Field(default="VLM", description="阿里云解析增强模式")
|
||||
document_parse_artifact_prefix: str = Field(default="artifacts", description="解析产物对象前缀")
|
||||
parser_failure_mode: str = Field(default="fail", description="解析失败策略")
|
||||
|
||||
# Keep configuration setup explicit so runtime behavior is easy to reason about.
|
||||
minio_endpoint: str = Field(default="localhost:9000", description="MinIO服务地址")
|
||||
@@ -71,8 +78,8 @@ class Settings(BaseSettings):
|
||||
chunk_overlap: int = Field(default=50, description="分块重叠大小")
|
||||
max_file_size_mb: int = Field(default=100, description="最大文件大小(MB)")
|
||||
document_metadata_path: str = Field(default="backend/data/documents.json", description="文档元数据存储路径")
|
||||
parser_backend: str = Field(default="local", description="解析后端(local/aliyun)")
|
||||
chunk_backend: str = Field(default="local", description="分块后端(local/aliyun)")
|
||||
parser_backend: str = Field(default="aliyun", description="解析后端(local/aliyun)")
|
||||
chunk_backend: str = Field(default="aliyun", description="分块后端(local/aliyun)")
|
||||
|
||||
# Keep configuration setup explicit so runtime behavior is easy to reason about.
|
||||
api_host: str = Field(default="0.0.0.0", description="API服务地址")
|
||||
|
||||
Reference in New Issue
Block a user