feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
ash66
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions

View File

@@ -10,6 +10,8 @@ from app.config.settings import settings
from app.domain.retrieval import EmbeddingProvider
# Keep adapter behavior explicit so integration details remain easy to audit.
# Maximum number of texts sent in a single embedding request; the batching loop
# in this module slices its input in steps of this size.
EMBEDDING_BATCH_SIZE = 8
class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
@@ -27,6 +29,18 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
self.timeout = settings.embedding_timeout_seconds
self.dimension = settings.embedding_dim
def _raise_for_status(self, response: httpx.Response, *, batch_size: int) -> None:
    """Check the response status and re-raise failures with request context.

    Delegates to ``response.raise_for_status()``. When that raises
    ``httpx.HTTPStatusError``, a new ``httpx.HTTPStatusError`` is raised in its
    place (chained to the original) whose message carries the model name, the
    batch size, the status code, the request URL and a preview of the body.

    Args:
        response: The HTTP response to validate.
        batch_size: Number of texts in the request; used only in the message.

    Raises:
        httpx.HTTPStatusError: If ``response.raise_for_status()`` raises it.
    """
    try:
        response.raise_for_status()
    except httpx.HTTPStatusError as original_error:
        # Only the first 500 characters of the body are echoed into the message.
        body_snippet = response.text[:500].strip()
        message = (
            f"Embedding request failed for model={self.model}, batch_size={batch_size}, "
            f"status={response.status_code}, url={response.request.url}, response={body_snippet}"
        )
        enriched_error = httpx.HTTPStatusError(
            message,
            request=original_error.request,
            response=original_error.response,
        )
        raise enriched_error from original_error
def _request(self, texts: list[str]) -> list[list[float]]:
"""Handle request for this module for the Open A I Compatible Embedding Provider instance."""
if not self.api_key:
@@ -40,7 +54,7 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
json={"model": self.model, "input": texts},
timeout=self.timeout,
)
response.raise_for_status()
self._raise_for_status(response, batch_size=len(texts))
data = response.json()
vectors = [item["embedding"] for item in sorted(data.get("data", []), key=lambda item: item["index"])]
if any(len(vector) != self.dimension for vector in vectors):
@@ -51,7 +65,12 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
"""Embed texts for the Open A I Compatible Embedding Provider instance."""
if not texts:
return []
return self._request(texts)
vectors: list[list[float]] = []
# Batch requests conservatively because some gateways reject larger embedding payloads.
for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
batch = texts[start:start + EMBEDDING_BATCH_SIZE]
vectors.extend(self._request(batch))
return vectors
def embed_query(self, text: str) -> list[float]:
"""Embed query for the Open A I Compatible Embedding Provider instance."""