From 1df4010accf2e4ae1785e4dd777f4290bbd12846 Mon Sep 17 00:00:00 2001
From: wangwei <Wei.Wang@t-systems.com>
Date: Fri, 26 Jun 2026 20:34:01 +0800
Subject: [PATCH] fix(llm): resolve score runtime config from saved profiles

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 rag_eval/metrics/factory.py           | 39 +++++++++++++++++++--
 tests/webapp/test_llm_profiles_api.py | 11 ++++--
 tests/webapp/test_profile_manager.py  | 49 +++++++++++++++++++++++++++
 webapp/api/llm_profiles.py            | 13 +++++++
 webapp/services/inline_scorer.py      | 11 +++++-
 5 files changed, 117 insertions(+), 6 deletions(-)

diff --git a/rag_eval/metrics/factory.py b/rag_eval/metrics/factory.py
index 4c2cbcd..df5683a 100644
--- a/rag_eval/metrics/factory.py
+++ b/rag_eval/metrics/factory.py
@@ -27,13 +27,48 @@ from ragas.metrics.collections import (
 from .pipeline import MetricPipeline
 
 
+def _resolve_openai_client_kwargs(
+    judge_model: str,
+    settings: EvaluationSettings,
+) -> dict[str, Any]:
+    """Return AsyncOpenAI kwargs, preferring a matching LLM Profile over .env settings.
+
+    Lookup order:
+      1. First LLM Profile whose model name equals judge_model (exact match)
+      2. EvaluationSettings (.env) when nothing matches or the lookup fails
+    """
+    import logging  # function-scope import: only used to report a failed lookup
+
+    try:
+        # Lazy import to avoid circular dependency (webapp -> rag_eval is one-way).
+        from webapp.services.profile_manager import profile_manager
+        for profile in profile_manager.list_all():
+            if profile.model == judge_model:
+                base_url = (profile.base_url or "").strip()
+                kwargs: dict[str, Any] = {
+                    "api_key": profile.api_key or "sk-placeholder",
+                    "timeout": float(profile.timeout_seconds or 30),
+                }
+                return {**kwargs, "base_url": base_url} if base_url else kwargs
+    except ImportError:
+        pass  # webapp package not importable: only the .env settings are available
+    except Exception:  # noqa: BLE001 - best-effort lookup must never break scoring
+        logging.getLogger(__name__).warning("LLM Profile lookup failed", exc_info=True)
+    return settings.openai_client_kwargs
+
+
 def build_models(
     judge_model: str,
     embedding_model: str,
     settings: EvaluationSettings,
 ) -> tuple[Any, Any]:
-    """Create the LLM and embedding clients required by the selected RAGAS metrics."""
-    client = AsyncOpenAI(**settings.openai_client_kwargs)
+    """Create the LLM and embedding clients required by the selected RAGAS metrics.
+
+    Connection settings come from the stored LLM Profile matching judge_model, else
+    from .env settings; the embedding client shares that same AsyncOpenAI client.
+    """
+    client_kwargs = _resolve_openai_client_kwargs(judge_model, settings)
+    client = AsyncOpenAI(**client_kwargs)
     llm = llm_factory(judge_model, client=client)
     embeddings = embedding_factory(provider="openai", model=embedding_model, client=client)
     return llm, embeddings
diff --git a/tests/webapp/test_llm_profiles_api.py b/tests/webapp/test_llm_profiles_api.py
index 6903ec3..d131298 100644
--- a/tests/webapp/test_llm_profiles_api.py
+++ b/tests/webapp/test_llm_profiles_api.py
@@ -1,6 +1,7 @@
 """Integration tests for /api/llm-profiles endpoints."""
 import pytest
 from fastapi.testclient import TestClient
+from unittest.mock import patch
 
 
 @pytest.fixture()
@@ -41,19 +42,23 @@ def test_update_profile(client):
     pid = client.post("/api/llm-profiles", json=body).json()["profile_id"]
 
     upd = {"name": "New", "model": "m2", "base_url": "http://x/v1", "api_key": "k", "timeout_seconds": 60}
-    resp = client.put(f"/api/llm-profiles/{pid}", json=upd)
+    with patch("webapp.services.inline_scorer.inline_scorer.invalidate_cache") as invalidate:
+        resp = client.put(f"/api/llm-profiles/{pid}", json=upd)
     assert resp.status_code == 200
     assert resp.json()["name"] == "New"
     assert resp.json()["timeout_seconds"] == 60
+    invalidate.assert_called_once()
 
 
 def test_delete_profile(client):
     body = {"name": "Del", "model": "m", "base_url": "http://x/v1", "api_key": "k"}
     pid = client.post("/api/llm-profiles", json=body).json()["profile_id"]
-    resp = client.delete(f"/api/llm-profiles/{pid}")
+    with patch("webapp.services.inline_scorer.inline_scorer.invalidate_cache") as invalidate:
+        resp = client.delete(f"/api/llm-profiles/{pid}")
     assert resp.status_code == 200
     assert resp.json()["deleted"] is True
     assert len(client.get("/api/llm-profiles").json()["profiles"]) == 0
+    invalidate.assert_called_once()
 
 
 def test_update_nonexistent(client):
@@ -185,7 +190,7 @@ def test_apply_doc_weights_patches_yaml(tmp_path):
 # ---------------------------------------------------------------------------
 # Connectivity test endpoint tests
 # ---------------------------------------------------------------------------
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock
 
 
 def test_probe_connectivity_success(client):
diff --git a/tests/webapp/test_profile_manager.py b/tests/webapp/test_profile_manager.py
index a592823..9dc8655 100644
--- a/tests/webapp/test_profile_manager.py
+++ b/tests/webapp/test_profile_manager.py
@@ -98,3 +98,52 @@ def test_get_nonexistent(tmp_path):
 def test_delete_nonexistent(tmp_path):
     mgr = _make_manager(tmp_path)
     assert mgr.delete("does-not-exist") is False
+
+
+def test_resolve_openai_client_kwargs_prefers_matching_profile(tmp_path, monkeypatch):
+    """Metric runtime should prefer the saved LLM Profile over .env defaults."""
+    from rag_eval.metrics.factory import _resolve_openai_client_kwargs
+    from rag_eval.settings import EvaluationSettings
+    import webapp.services.profile_manager as pm_mod
+
+    mgr = _make_manager(tmp_path)
+    mgr.create(
+        name="Judge",
+        model="gpt-5.5",
+        base_url="http://profile-host:13000",  # placeholder host, never contacted
+        api_key="sk-profile",
+        timeout_seconds=300,
+    )
+    monkeypatch.setattr(pm_mod, "profile_manager", mgr)
+
+    settings = EvaluationSettings(
+        OPENAI_API_KEY="sk-env",
+        OPENAI_BASE_URL="http://env-base/v1",
+        OPENAI_TIMEOUT_SECONDS=30,
+    )
+
+    kwargs = _resolve_openai_client_kwargs("gpt-5.5", settings)
+    assert kwargs["api_key"] == "sk-profile"
+    assert kwargs["base_url"] == "http://profile-host:13000"
+    assert kwargs["timeout"] == 300.0
+
+
+def test_resolve_openai_client_kwargs_falls_back_to_env(tmp_path, monkeypatch):
+    """Without a matching saved profile, the .env-derived kwargs are returned."""
+    from rag_eval.metrics.factory import _resolve_openai_client_kwargs
+    from rag_eval.settings import EvaluationSettings
+    import webapp.services.profile_manager as pm_mod
+
+    # No profile is created here, so the lookup is expected to find no match.
+    monkeypatch.setattr(pm_mod, "profile_manager", _make_manager(tmp_path))
+    env_settings = EvaluationSettings(
+        OPENAI_API_KEY="sk-env",
+        OPENAI_BASE_URL="http://env-base/v1",
+        OPENAI_TIMEOUT_SECONDS=45,
+    )
+
+    resolved = _resolve_openai_client_kwargs("gpt-5", env_settings)
+
+    assert resolved["api_key"] == "sk-env"
+    assert resolved["base_url"] == "http://env-base/v1"
+    assert resolved["timeout"] == 45.0
diff --git a/webapp/api/llm_profiles.py b/webapp/api/llm_profiles.py
index 02fff3c..4e92846 100644
--- a/webapp/api/llm_profiles.py
+++ b/webapp/api/llm_profiles.py
@@ -148,6 +148,13 @@ def update_profile(profile_id: str, request: CreateProfileRequest) -> LLMProfile
     if updated is None:
         logger.warning("[update_profile] not found  id=%s", profile_id)
         raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
+    # Best-effort: drop cached scorer clients so the next request uses the new settings.
+    try:
+        from webapp.services.inline_scorer import inline_scorer
+        inline_scorer.invalidate_cache()
+        logger.info("[update_profile] scorer cache invalidated  id=%s", profile_id)
+    except Exception:  # noqa: BLE001 - the update already succeeded; log, don't fail
+        logger.warning("[update_profile] scorer cache invalidation failed  id=%s", profile_id, exc_info=True)
     logger.info("[update_profile] updated  id=%s", profile_id)
     return updated
 
@@ -160,6 +167,12 @@ def delete_profile(profile_id: str) -> dict:
     if not deleted:
         logger.warning("[delete_profile] not found  id=%s", profile_id)
         raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
+    # Best-effort: drop cached scorer clients in case the deleted profile was in use.
+    try:
+        from webapp.services.inline_scorer import inline_scorer
+        inline_scorer.invalidate_cache()
+    except Exception:  # noqa: BLE001 - the delete already succeeded; log, don't fail
+        logger.warning("[delete_profile] scorer cache invalidation failed  id=%s", profile_id, exc_info=True)
     logger.info("[delete_profile] deleted  id=%s", profile_id)
     return {"deleted": True}
 
diff --git a/webapp/services/inline_scorer.py b/webapp/services/inline_scorer.py
index 9d0d5ed..0ee843f 100644
--- a/webapp/services/inline_scorer.py
+++ b/webapp/services/inline_scorer.py
@@ -54,13 +54,22 @@ class InlineScorer:
         self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}
         self._lock = threading.Lock()
 
+    def invalidate_cache(self) -> None:
+        """Clear the model cache so the next _get_models() call rebuilds clients from current profiles."""
+        with self._lock:
+            self._model_cache.clear()
+
     def _get_models(
         self,
         judge_model: str,
         embedding_model: str,
         settings: EvaluationSettings,
     ) -> tuple[Any, Any]:
-        """Return cached LLM/embedding clients, building them on first use."""
+        """Return cached LLM/embedding clients, building them on first use.
+
+        Cache is keyed by (judge_model, embedding_model). Call invalidate_cache()
+        after updating an LLM Profile to force a fresh client on the next request.
+        """
         cache_key = (judge_model, embedding_model)
         with self._lock:
             if cache_key not in self._model_cache: