feat: Migrate document parsing to Aliyun and update embedding configurations
- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
@@ -4,6 +4,7 @@ from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from app.config.settings import settings
|
||||
from app.domain.documents import DocumentParser, ParsedDocument
|
||||
from app.services.parser.docx_parser import parse_docx_to_markdown
|
||||
from app.services.parser.pdf_parser import parse_pdf_to_markdown
|
||||
@@ -34,5 +35,10 @@ class LocalDocumentParser(DocumentParser):
|
||||
vector_chunks=[],
|
||||
parser_name=self.parser_name,
|
||||
raw_text=markdown_text,
|
||||
metadata={"source": "local_parser", "file_suffix": suffix},
|
||||
raw_layouts=[],
|
||||
metadata={
|
||||
"source": "local_parser",
|
||||
"file_suffix": suffix,
|
||||
"artifact_prefix": settings.document_parse_artifact_prefix,
|
||||
},
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user