Files
AIRegulation-DocAnalysis/backend/app/config/settings.py
ash66 30c7bda389 Refactor document handling and update Milvus collection settings
- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the sqlalchemy dependency.
- Modified test cases to align with new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
2026-05-26 20:21:31 +08:00

136 lines
8.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Configure backend settings for the backend application."""
from pathlib import Path
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
from functools import lru_cache
# Keep configuration setup explicit so runtime behavior is easy to reason about.
ROOT_DIR = Path(__file__).resolve().parents[3]
ROOT_ENV_FILES = (
ROOT_DIR / ".env",
ROOT_DIR / ".env.development",
)
class Settings(BaseSettings):
    """Typed configuration for the backend application.

    Every attribute is a pydantic field with a literal default. Values are
    read from the dotenv files listed in ``ROOT_ENV_FILES`` (decoded as
    UTF-8); keys that do not match a declared field are ignored because
    ``extra="ignore"`` is set.
    """

    model_config = SettingsConfigDict(
        env_file=tuple(str(env_file) for env_file in ROOT_ENV_FILES),
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # --- Application metadata ---
    app_name: str = Field(default="AI Regulations Demo", description="Application name")
    app_version: str = Field(default="0.1.0", description="应用版本")
    debug: bool = Field(default=False, description="调试模式")

    # --- Milvus connection ---
    milvus_host: str = Field(default="6.86.80.8", description="Milvus服务地址")
    milvus_port: int = Field(default=19530, description="Milvus服务端口")
    milvus_collection: str = Field(default="regulations_dense_1024_v2", description="法规向量集合名称")
    milvus_db_name: str = Field(default="default", description="Milvus数据库名称")

    # --- Embedding service ---
    embedding_model: str = Field(default="text-embedding-v3", description="嵌入模型名称")
    embedding_dim: int = Field(default=1024, description="嵌入向量维度")
    embedding_api_key: str = Field(default="", description="Embedding API密钥")
    embedding_base_url: str = Field(default="http://6.86.80.4:30080/v1", description="Embedding API地址")
    embedding_timeout_seconds: int = Field(default=120, description="Embedding API超时时间(秒)")

    # --- Alibaba Cloud document parsing ---
    alibaba_access_key_id: str = Field(default="", description="阿里云文档解析 Access Key ID")
    alibaba_access_key_secret: str = Field(default="", description="阿里云文档解析 Access Key Secret")
    alibaba_endpoint: str = Field(default="docmind-api.cn-hangzhou.aliyuncs.com", description="阿里云文档解析 endpoint")
    aliyun_parse_poll_interval_seconds: int = Field(default=5, description="阿里云文档解析轮询间隔(秒)")
    aliyun_parse_timeout_seconds: int = Field(default=900, description="阿里云文档解析超时时间(秒)")
    aliyun_parse_layout_step_size: int = Field(default=50, description="阿里云文档解析分页步长")
    aliyun_llm_enhancement: bool = Field(default=True, description="是否启用阿里云解析增强")
    aliyun_enhancement_mode: str = Field(default="VLM", description="阿里云解析增强模式")
    document_parse_artifact_prefix: str = Field(default="artifacts", description="解析产物对象前缀")
    parser_failure_mode: str = Field(default="fail", description="解析失败策略")

    # --- MinIO object storage ---
    # NOTE(review): the access/secret key defaults are hard-coded credentials;
    # override them through the environment or a dotenv file outside local dev.
    minio_endpoint: str = Field(default="6.86.80.8:9000", description="MinIO服务地址")
    minio_access_key: str = Field(default="minioadmin", description="MinIO访问密钥")
    minio_secret_key: str = Field(default="minioadmin123", description="MinIO秘密密钥")
    minio_bucket: str = Field(default="upload-files", description="文档存储桶名称")
    minio_secure: bool = Field(default=False, description="是否使用HTTPS")

    # --- Redis ---
    redis_host: str = Field(default="6.86.80.8", description="Redis服务地址")
    redis_port: int = Field(default=6379, description="Redis服务端口")
    redis_password: str = Field(default="", description="Redis密码")
    redis_db: int = Field(default=0, description="Redis数据库编号")

    # --- PostgreSQL ---
    # NOTE(review): the user/password defaults are hard-coded credentials;
    # override them through the environment or a dotenv file outside local dev.
    postgres_host: str = Field(default="6.86.80.8", description="PostgreSQL服务地址")
    postgres_port: int = Field(default=5432, description="PostgreSQL服务端口")
    postgres_user: str = Field(default="compliance", description="PostgreSQL用户名")
    postgres_password: str = Field(default="compliance123", description="PostgreSQL密码")
    postgres_db: str = Field(default="compliance_db", description="PostgreSQL数据库名称")

    # --- Document processing: chunking, metadata files, backend selection ---
    chunk_size: int = Field(default=512, description="分块大小(字符数)")
    chunk_overlap: int = Field(default=50, description="分块重叠大小")
    max_file_size_mb: int = Field(default=100, description="最大文件大小(MB)")
    # NOTE(review): both metadata paths are relative strings — presumably
    # resolved against the process working directory; confirm at the call sites.
    document_metadata_path: str = Field(default="backend/data/documents.json", description="文档元数据存储路径")
    document_processing_metadata_path: str = Field(default="backend/data/document_processing.json", description="文档处理历史存储路径")
    parser_backend: str = Field(default="aliyun", description="解析后端(local/aliyun)")
    chunk_backend: str = Field(default="aliyun", description="分块后端(local/aliyun)")
    document_repository_backend: str = Field(default="json", description="文档元数据存储后端 (json/postgres)")

    # --- API server ---
    # NOTE(review): "0.0.0.0" listens on every network interface.
    api_host: str = Field(default="0.0.0.0", description="API服务地址")
    api_port: int = Field(default=8000, description="API服务端口")

    # --- LLM selection and generation parameters ---
    llm_provider: str = Field(default="deepseek", description="LLM提供商 (deepseek/qwen/qwen_vl)")
    llm_model: str = Field(default="deepseek-v4-flash", description="LLM模型名称")
    llm_max_tokens: int = Field(default=4096, description="LLM最大输出token数")
    llm_temperature: float = Field(default=0.7, description="LLM温度参数")

    # --- DeepSeek provider ---
    deepseek_api_key: str = Field(default="", description="DeepSeek API密钥")
    deepseek_base_url: str = Field(default="http://6.86.80.4:30080/v1", description="DeepSeek API地址")
    deepseek_model: str = Field(default="deepseek-v4-flash", description="DeepSeek模型")

    # --- Qwen provider ---
    qwen_api_key: str = Field(default="", description="Qwen API密钥")
    qwen_base_url: str = Field(default="http://6.86.80.4:30080/v1", description="Qwen API地址")
    qwen_model: str = Field(default="qwen3.5-flash", description="Qwen文本模型")
    qwen_vl_model: str = Field(default="qwen3-vl-plus", description="Qwen视觉模型")

    # --- RAG retrieval and reranking ---
    rag_top_k: int = Field(default=5, description="检索召回数量")
    # Description previously lacked its opening parenthesis; restored here.
    rag_retrieval_top_k: int = Field(default=20, description="精排前召回候选数量(reranker 启用时生效)")
    rag_max_context_tokens: int = Field(default=2000, description="RAG最大上下文token数")
    rag_summary_max_tokens: int = Field(default=10240, description="文档摘要最大token数")
    reranker_enabled: bool = Field(default=False, description="是否启用 Cross-Encoder 精排")
    reranker_base_url: str = Field(default="", description="Reranker API 地址")
    reranker_model: str = Field(default="BAAI/bge-reranker-v2-m3", description="Reranker 模型名称")
    reranker_api_key: str = Field(default="", description="Reranker API 密钥")
    reranker_top_k: int = Field(default=5, description="精排后保留的最终结果数量")

    # --- Milvus index parameters ---
    milvus_index_type: str = Field(default="IVF_FLAT", description="Milvus索引类型")
    milvus_nlist: int = Field(default=128, description="Milvus nlist参数")
    milvus_nprobe: int = Field(default=16, description="Milvus nprobe参数")

    # --- Session management ---
    session_max_sessions: int = Field(default=100, description="最大会话数量")
    session_timeout_minutes: int = Field(default=30, description="会话超时时间(分钟)")
@lru_cache
def get_settings() -> Settings:
    """Build the application settings once and reuse them afterwards.

    Returns:
        A ``Settings`` instance. Because the function takes no arguments and
        is wrapped in ``lru_cache``, later calls return the same object that
        the first call created.
    """
    cached_settings = Settings()
    return cached_settings
# Module-level settings object, created at import time through the cached
# get_settings() factory, so it is the same instance that later
# get_settings() calls return.
settings = get_settings()