feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions
--- a/backend/app/infrastructure/embedding/openai_compatible_embedding_provider.py
+++ b/backend/app/infrastructure/embedding/openai_compatible_embedding_provider.py
@@ -10,6 +10,8 @@ from app.config.settings import settings
 from app.domain.retrieval import EmbeddingProvider
 # Keep adapter behavior explicit so integration details remain easy to audit.

+EMBEDDING_BATCH_SIZE = 8
+


 class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
@@ -27,6 +29,18 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
        self.timeout = settings.embedding_timeout_seconds
        self.dimension = settings.embedding_dim

+    def _raise_for_status(self, response: httpx.Response, *, batch_size: int) -> None:
+        """Raise a detailed error so upstream gateway failures are easier to diagnose."""
+        try:
+            response.raise_for_status()
+        except httpx.HTTPStatusError as exc:
+            response_preview = response.text[:500].strip()
+            detail = (
+                f"Embedding request failed for model={self.model}, batch_size={batch_size}, "
+                f"status={response.status_code}, url={response.request.url}, response={response_preview}"
+            )
+            raise httpx.HTTPStatusError(detail, request=exc.request, response=exc.response) from exc
+
    def _request(self, texts: list[str]) -> list[list[float]]:
        """Handle request for this module for the Open A I Compatible Embedding Provider instance."""
        if not self.api_key:
@@ -40,7 +54,7 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
            json={"model": self.model, "input": texts},
            timeout=self.timeout,
        )
-        response.raise_for_status()
+        self._raise_for_status(response, batch_size=len(texts))
        data = response.json()
        vectors = [item["embedding"] for item in sorted(data.get("data", []), key=lambda item: item["index"])]
        if any(len(vector) != self.dimension for vector in vectors):
@@ -51,7 +65,12 @@ class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
        """Embed texts for the Open A I Compatible Embedding Provider instance."""
        if not texts:
            return []
-        return self._request(texts)
+        vectors: list[list[float]] = []
+        # Batch requests conservatively because some gateways reject larger embedding payloads.
+        for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
+            batch = texts[start:start + EMBEDDING_BATCH_SIZE]
+            vectors.extend(self._request(batch))
+        return vectors

    def embed_query(self, text: str) -> list[float]:
        """Embed query for the Open A I Compatible Embedding Provider instance."""