feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
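- Updated the migration config verification (verify_migration_config) to assert the embedding model, the 1024 embedding dimension, the regulations_dense_1024_v1 Milvus collection, and the aliyun parser and chunk backends.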
This commit is contained in:
ash66
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions

View File

@@ -64,11 +64,16 @@ def verify_migration_config() -> bool:
try:
assert settings.embedding_model == "text-embedding-v3"
assert settings.embedding_dim == 1536
assert settings.milvus_collection == "regulations_dense_1536"
assert settings.embedding_dim == 1024
assert settings.milvus_collection == "regulations_dense_1024_v1"
assert settings.parser_backend == "aliyun"
assert settings.chunk_backend == "aliyun"
logger.info(f"embedding_model={settings.embedding_model}")
logger.info(f"embedding_base_url={settings.embedding_base_url}")
logger.info(f"embedding_dim={settings.embedding_dim}")
logger.info(f"milvus_collection={settings.milvus_collection}")
logger.info(f"parser_backend={settings.parser_backend}")
logger.info(f"chunk_backend={settings.chunk_backend}")
logger.success("migration config ok")
return True
except Exception as exc: