feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to pass an empty raw_layouts list and to add artifact_prefix (read from settings.document_parse_artifact_prefix) to the parsed document's metadata.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
ash66
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions

View File

@@ -4,6 +4,7 @@ from __future__ import annotations
from pathlib import Path
from app.config.settings import settings
from app.domain.documents import DocumentParser, ParsedDocument
from app.services.parser.docx_parser import parse_docx_to_markdown
from app.services.parser.pdf_parser import parse_pdf_to_markdown
@@ -34,5 +35,10 @@ class LocalDocumentParser(DocumentParser):
vector_chunks=[],
parser_name=self.parser_name,
raw_text=markdown_text,
metadata={"source": "local_parser", "file_suffix": suffix},
raw_layouts=[],
metadata={
"source": "local_parser",
"file_suffix": suffix,
"artifact_prefix": settings.document_parse_artifact_prefix,
},
)