update
This commit is contained in:
@@ -8,10 +8,12 @@ OPENAI_BASE_URL=http://6.86.80.4:30080/v1
|
|||||||
OPENAI_TIMEOUT_SECONDS=180
|
OPENAI_TIMEOUT_SECONDS=180
|
||||||
|
|
||||||
# 默认评测模型(可在场景 YAML 或 Web 控制台 LLM 配置中覆盖)
|
# 默认评测模型(可在场景 YAML 或 Web 控制台 LLM 配置中覆盖)
|
||||||
# RAGAS_JUDGE_MODEL 需支持 max_tokens + json_object(gpt-5、gpt-4.1、gpt-4o 等)
|
# RAGAS_JUDGE_MODEL 需支持 OpenAI 兼容 chat.completions + 结构化 JSON 输出
|
||||||
# 注意:gpt-5.4/5.5/5.2 系列不支持 max_tokens,与 RAGAS 0.4.3 不兼容
|
# RAGAS_LLM_MAX_TOKENS 控制 Judge 评分链路的 completion budget;faithfulness 等
|
||||||
|
# 结构化指标在 GPT-5 系列上通常需要 4096 或更高,避免 IncompleteOutputException
|
||||||
RAGAS_JUDGE_MODEL=gpt-5
|
RAGAS_JUDGE_MODEL=gpt-5
|
||||||
RAGAS_EMBEDDING_MODEL=text-embedding-3-small
|
RAGAS_EMBEDDING_MODEL=text-embedding-3-small
|
||||||
|
RAGAS_LLM_MAX_TOKENS=4096
|
||||||
|
|
||||||
# 评估并发控制(启用 7 个指标时建议 RAGAS_METRIC_TIMEOUT_SECONDS=300)
|
# 评估并发控制(启用 7 个指标时建议 RAGAS_METRIC_TIMEOUT_SECONDS=300)
|
||||||
BATCH_SIZE=8
|
BATCH_SIZE=8
|
||||||
|
|||||||
@@ -69,7 +69,13 @@ def build_models(
|
|||||||
"""
|
"""
|
||||||
client_kwargs = _resolve_openai_client_kwargs(judge_model, settings)
|
client_kwargs = _resolve_openai_client_kwargs(judge_model, settings)
|
||||||
client = AsyncOpenAI(**client_kwargs)
|
client = AsyncOpenAI(**client_kwargs)
|
||||||
llm = llm_factory(judge_model, client=client)
|
# RAGAS structured-output judge calls can be truncated by the upstream default
|
||||||
|
# 1024 completion budget, especially for faithfulness and GPT-5 family models.
|
||||||
|
llm = llm_factory(
|
||||||
|
judge_model,
|
||||||
|
client=client,
|
||||||
|
max_tokens=max(1, int(settings.ragas_llm_max_tokens)),
|
||||||
|
)
|
||||||
embeddings = embedding_factory(provider="openai", model=embedding_model, client=client)
|
embeddings = embedding_factory(provider="openai", model=embedding_model, client=client)
|
||||||
return llm, embeddings
|
return llm, embeddings
|
||||||
|
|
||||||
|
|||||||
@@ -26,6 +26,11 @@ class EvaluationSettings(BaseSettings):
|
|||||||
default="text-embedding-3-small",
|
default="text-embedding-3-small",
|
||||||
alias="RAGAS_EMBEDDING_MODEL",
|
alias="RAGAS_EMBEDDING_MODEL",
|
||||||
)
|
)
|
||||||
|
ragas_llm_max_tokens: int = Field(
|
||||||
|
default=4096,
|
||||||
|
alias="RAGAS_LLM_MAX_TOKENS",
|
||||||
|
gt=0,
|
||||||
|
)
|
||||||
openai_timeout_seconds: float = Field(default=30.0, alias="OPENAI_TIMEOUT_SECONDS")
|
openai_timeout_seconds: float = Field(default=30.0, alias="OPENAI_TIMEOUT_SECONDS")
|
||||||
ragas_metric_timeout_seconds: float = Field(default=45.0, alias="RAGAS_METRIC_TIMEOUT_SECONDS")
|
ragas_metric_timeout_seconds: float = Field(default=45.0, alias="RAGAS_METRIC_TIMEOUT_SECONDS")
|
||||||
batch_size: int = Field(default=8, alias="BATCH_SIZE")
|
batch_size: int = Field(default=8, alias="BATCH_SIZE")
|
||||||
|
|||||||
68
tests/test_metric_presenter.py
Normal file
68
tests/test_metric_presenter.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||||
|
|
||||||
|
|
||||||
|
def _run_node(script: str) -> str:
    """Execute a short Node.js script and return its stripped stdout.

    Args:
        script: JavaScript source passed to ``node -e``.

    Returns:
        The script's standard output with surrounding whitespace removed.

    Raises:
        AssertionError: If Node.js exits with a non-zero status. The message
            carries the captured stderr so the JavaScript error shows up in
            the test report.
    """
    try:
        completed = subprocess.run(
            ["node", "-e", script],
            cwd=REPO_ROOT,
            capture_output=True,
            text=True,
            encoding="utf-8",
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        # CalledProcessError's own message omits the captured stderr, which is
        # where Node prints the actual JavaScript failure.
        raise AssertionError(
            f"node exited with status {exc.returncode}:\n{exc.stderr}"
        ) from exc
    return completed.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def test_metric_presenter_applies_thresholds_and_noise_direction() -> None:
    """MetricPresenter should centralize thresholds and inverse noise semantics.

    Loads ``metric_presenter.js`` into a Node ``vm`` sandbox that exposes a
    stub ``window`` object, calls the helper for a handful of scores, and
    compares the JSON it prints against the expected classes and colors.
    """
    import json  # local import keeps this test self-contained

    metric_js = (REPO_ROOT / "webapp" / "static" / "js" / "metric_presenter.js").as_posix()
    # json.dumps produces a valid JavaScript string literal even when the
    # checkout path contains quotes, backslashes or non-ASCII characters.
    metric_js_literal = json.dumps(metric_js)
    script = f"""
    const fs = require("fs");
    const vm = require("vm");
    const code = fs.readFileSync({metric_js_literal}, "utf8");
    const sandbox = {{ window: {{}}, console }};
    vm.runInNewContext(code, sandbox);
    const p = sandbox.window.MetricPresenter;
    const result = {{
      faith085: p.scoreClass("faithfulness", 0.85),
      faith070: p.scoreClass("faithfulness", 0.70),
      faith064: p.scoreClass("faithfulness", 0.64),
      noise010: p.scoreClass("noise_sensitivity", 0.10),
      noise030: p.scoreClass("noise_sensitivity", 0.30),
      noise050: p.scoreClass("noise_sensitivity", 0.50),
      desc: p.describeMetric("faithfulness"),
      noiseDesc: p.describeMetric("noise_sensitivity"),
      noiseBin: p.binColor("noise_sensitivity", 0.0),
      faithBin: p.binColor("faithfulness", 0.8)
    }};
    console.log(JSON.stringify(result));
    """
    # Parse the output instead of substring-matching it, so a failure names
    # the offending key and the check does not depend on JSON text layout.
    result = json.loads(_run_node(script))

    expected = {
        "faith085": "good",
        "faith070": "warn",
        "faith064": "bad",
        "noise010": "good",
        "noise030": "warn",
        "noise050": "bad",
        "noiseBin": "#16a34a",
        "faithBin": "#16a34a",
    }
    for key, value in expected.items():
        assert result.get(key) == value, key

    # Descriptions only need to be present as strings; their wording is free.
    assert isinstance(result.get("desc"), str)
    assert isinstance(result.get("noiseDesc"), str)
|
||||||
|
|
||||||
|
|
||||||
|
def test_report_and_index_load_metric_presenter_helper() -> None:
    """The page and its scripts should reference the shared MetricPresenter helper.

    Checks that ``index.html`` loads ``js/metric_presenter.js`` and that
    ``report.js`` and ``app.js`` call into ``MetricPresenter``.
    """
    static_dir = REPO_ROOT / "webapp" / "static"
    # (file to inspect, text that must appear in it)
    expectations = (
        (static_dir / "index.html", "js/metric_presenter.js"),
        (static_dir / "js" / "report.js", "MetricPresenter.describeMetric"),
        (static_dir / "js" / "app.js", "MetricPresenter.scoreClass"),
    )
    contents = [path.read_text(encoding="utf-8") for path, _ in expectations]

    for text, (_, needle) in zip(contents, expectations):
        assert needle in text
|
||||||
@@ -88,3 +88,30 @@ def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path)
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert _infer_metrics_from_scores(run_dir) == ["faithfulness"]
|
assert _infer_metrics_from_scores(run_dir) == ["faithfulness"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_report_ranks_noise_sensitivity_with_lower_values_as_better(tmp_path: Path) -> None:
    """Lowest-sample review should treat higher noise sensitivity as worse.

    Writes a three-row ``scores.csv`` containing only ``noise_sensitivity``
    and expects the report's lowest samples to be ordered from the highest
    raw value to the lowest.
    """
    run_dir = tmp_path / "run"
    run_dir.mkdir(parents=True, exist_ok=True)

    csv_rows = (
        "sample_id,question,noise_sensitivity",
        "s-good,q1,0.10",
        "s-warn,q2,0.30",
        "s-bad,q3,0.90",
    )
    artifacts = (
        ("scores.csv", "\n".join(csv_rows)),
        ("summary.md", "summary"),
        ("optimization_advice.md", ""),
    )
    for file_name, content in artifacts:
        (run_dir / file_name).write_text(content, encoding="utf-8")

    report = build_report(run_dir, ["noise_sensitivity"])

    worst_first = [sample.sample_id for sample in report.lowest_samples[:3]]
    assert worst_first == ["s-bad", "s-warn", "s-good"]
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
from unittest.mock import sentinel
|
||||||
|
|
||||||
from webapp.models import LLMProfile, ProfileApplyRequest, ProfileApplyResponse
|
from webapp.models import LLMProfile, ProfileApplyRequest, ProfileApplyResponse
|
||||||
|
|
||||||
def test_llm_profile_defaults():
|
def test_llm_profile_defaults():
|
||||||
@@ -147,3 +149,57 @@ def test_resolve_openai_client_kwargs_falls_back_to_env(tmp_path, monkeypatch):
|
|||||||
assert kwargs["api_key"] == "sk-env"
|
assert kwargs["api_key"] == "sk-env"
|
||||||
assert kwargs["base_url"] == "http://env-base/v1"
|
assert kwargs["base_url"] == "http://env-base/v1"
|
||||||
assert kwargs["timeout"] == 45.0
|
assert kwargs["timeout"] == 45.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_models_uses_high_default_max_tokens_for_structured_judge(monkeypatch):
    """Structured RAGAS judge calls should use a larger completion budget by default.

    Replaces the OpenAI client and both factories with stubs, then checks that
    ``build_models`` forwards ``max_tokens=4096`` to ``llm_factory`` when the
    settings object is built with no overrides.
    """
    import rag_eval.metrics.factory as factory
    from rag_eval.settings import EvaluationSettings

    # The assertion below is about the built-in default, so an ambient
    # RAGAS_LLM_MAX_TOKENS from the developer/CI shell must not leak in.
    # NOTE(review): if EvaluationSettings also reads a .env file, that source
    # is not neutralised here - confirm against the settings config.
    monkeypatch.delenv("RAGAS_LLM_MAX_TOKENS", raising=False)

    captured: dict[str, object] = {}

    def fake_llm_factory(model, client=None, **kwargs):
        # Record everything build_models passes so it can be asserted on.
        captured["model"] = model
        captured["client"] = client
        captured["kwargs"] = kwargs
        return sentinel.llm

    monkeypatch.setattr(factory, "AsyncOpenAI", lambda **kwargs: sentinel.client)
    monkeypatch.setattr(factory, "llm_factory", fake_llm_factory)
    monkeypatch.setattr(factory, "embedding_factory", lambda **kwargs: sentinel.embeddings)

    llm, embeddings = factory.build_models(
        "gpt-5",
        "text-embedding-3-small",
        EvaluationSettings(),
    )

    assert llm is sentinel.llm
    assert embeddings is sentinel.embeddings
    assert captured["model"] == "gpt-5"
    assert captured["client"] is sentinel.client
    assert captured["kwargs"] == {"max_tokens": 4096}
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_models_allows_env_override_for_judge_max_tokens(monkeypatch):
    """Operators should be able to raise the judge completion budget via settings.

    Builds the settings with ``RAGAS_LLM_MAX_TOKENS=8192`` and checks that the
    value reaches the stubbed ``llm_factory`` as ``max_tokens``.
    """
    import rag_eval.metrics.factory as factory
    from rag_eval.settings import EvaluationSettings

    captured: dict[str, object] = {}

    def fake_llm_factory(model, client=None, **kwargs):
        # Only the extra keyword arguments matter for this test.
        captured["kwargs"] = kwargs
        return sentinel.llm

    def fake_client(**kwargs):
        return sentinel.client

    def fake_embedding_factory(**kwargs):
        return sentinel.embeddings

    stubs = {
        "AsyncOpenAI": fake_client,
        "llm_factory": fake_llm_factory,
        "embedding_factory": fake_embedding_factory,
    }
    for attribute, stub in stubs.items():
        monkeypatch.setattr(factory, attribute, stub)

    settings = EvaluationSettings(RAGAS_LLM_MAX_TOKENS=8192)
    factory.build_models("gpt-5", "text-embedding-3-small", settings)

    assert captured["kwargs"] == {"max_tokens": 8192}
|
||||||
|
|||||||
@@ -44,6 +44,41 @@ logger = logging.getLogger("webapp.api.session_score_jobs")
|
|||||||
status_code=202,
|
status_code=202,
|
||||||
response_model=SessionScoreJobResponse,
|
response_model=SessionScoreJobResponse,
|
||||||
summary="提交 Session 异步评分(多样本批量聚合)",
|
summary="提交 Session 异步评分(多样本批量聚合)",
|
||||||
|
description=(
|
||||||
|
"**用途**\n"
|
||||||
|
"- 适合 Dify 循环节点、批量问答评测、同一对话多轮累计评分。\n"
|
||||||
|
"- 相同 `session_id` 的多次调用不会生成多个独立报告,而是持续追加到同一个 session 报告。\n\n"
|
||||||
|
"**请求字段说明**\n"
|
||||||
|
"- `session_id`:会话唯一标识,同一会话必须保持一致。\n"
|
||||||
|
"- `question` / `answer`:本次待评分的问答对。\n"
|
||||||
|
"- `contexts`:检索片段拼接字符串,按 `context_separator` 拆分。\n"
|
||||||
|
"- `ground_truth`:标准答案,可选;缺失时会自动跳过依赖它的指标。\n"
|
||||||
|
"- `metrics`:本次需要计算的指标列表。\n"
|
||||||
|
"- `judge_model` / `embedding_model`:可选;为空时回退到系统默认配置。\n\n"
|
||||||
|
"**处理行为**\n"
|
||||||
|
"1. 服务端立即返回 `202 Accepted`,并生成本次调用的 `job_id`。\n"
|
||||||
|
"2. 系统根据 `session_id` 计算固定 `run_id`,格式为 `session-<sanitized-session_id>`。\n"
|
||||||
|
"3. 本次评分完成后,会向该 session 的 `scores.csv` 追加一行样本数据。\n"
|
||||||
|
"4. 系统会基于当前 session 的全量样本重写 `summary.md`,并重新生成 `optimization_advice.md`。\n"
|
||||||
|
"5. 报告可在「运行列表」中按 `run_id` 查看;同一 session 的后续调用会持续增量更新该报告。\n\n"
|
||||||
|
"**后续查询接口**\n"
|
||||||
|
"- `GET /api/score/session/jobs/{job_id}`:查询本次调用状态与得分。\n"
|
||||||
|
"- `GET /api/score/sessions/{session_id}`:查询整个 session 的累计调用次数、指标均值、所有作业记录。\n"
|
||||||
|
"- `GET /api/runs/{run_id}`:查看完整评估报告内容。\n\n"
|
||||||
|
"**典型请求示例**\n"
|
||||||
|
"```json\n"
|
||||||
|
"{\n"
|
||||||
|
" \"session_id\": \"dify-session-001\",\n"
|
||||||
|
" \"question\": \"单源CT与双源CT在球管配置上有何本质区别?\",\n"
|
||||||
|
" \"answer\": \"单源CT只有一套球管-探测器系统,双源CT有两套独立的球管-探测器系统。\",\n"
|
||||||
|
" \"contexts\": \"双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管\",\n"
|
||||||
|
" \"context_separator\": \" |||| \",\n"
|
||||||
|
" \"metrics\": [\"answer_relevancy\", \"faithfulness\"],\n"
|
||||||
|
" \"judge_model\": \"gpt-5.5\",\n"
|
||||||
|
" \"embedding_model\": \"text-embedding-3-small\"\n"
|
||||||
|
"}\n"
|
||||||
|
"```"
|
||||||
|
),
|
||||||
responses={
|
responses={
|
||||||
202: {
|
202: {
|
||||||
"description": (
|
"description": (
|
||||||
|
|||||||
@@ -542,6 +542,26 @@ class SessionScoreRequest(ScoreRequest):
|
|||||||
Each call adds a new sample row to the session's scores.csv.
|
Each call adds a new sample row to the session's scores.csv.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
model_config = ConfigDict(
|
||||||
|
json_schema_extra={
|
||||||
|
"examples": [
|
||||||
|
{
|
||||||
|
"summary": "Dify 会话批量评分",
|
||||||
|
"value": {
|
||||||
|
"session_id": "dify-session-001",
|
||||||
|
"question": "单源CT与双源CT在球管配置上有何本质区别?",
|
||||||
|
"answer": "单源CT只有一套球管-探测器系统,双源CT有两套独立的球管-探测器系统。",
|
||||||
|
"contexts": "双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管",
|
||||||
|
"context_separator": " |||| ",
|
||||||
|
"metrics": ["answer_relevancy", "faithfulness"],
|
||||||
|
"judge_model": "gpt-5.5",
|
||||||
|
"embedding_model": "text-embedding-3-small",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
session_id: str = Field(
|
session_id: str = Field(
|
||||||
description=(
|
description=(
|
||||||
"会话唯一标识符。相同 session_id 的多次调用合并为同一报告,"
|
"会话唯一标识符。相同 session_id 的多次调用合并为同一报告,"
|
||||||
|
|||||||
@@ -75,8 +75,12 @@ OPENAPI_TAGS = [
|
|||||||
"在「运行列表」页查看。\n\n"
|
"在「运行列表」页查看。\n\n"
|
||||||
"**Session 批量评分 API** — `POST /api/score/session_async`\n\n"
|
"**Session 批量评分 API** — `POST /api/score/session_async`\n\n"
|
||||||
"适合 Dify 循环节点批量评估:同一 `session_id` 的多次调用合并为一个报告,"
|
"适合 Dify 循环节点批量评估:同一 `session_id` 的多次调用合并为一个报告,"
|
||||||
"每次调用新增一个样本行,指标均值和优化建议增量更新。\n"
|
"每次调用新增一个样本行,指标均值和优化建议增量更新。\n\n"
|
||||||
"通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态。\n\n"
|
"**Session 模式调用流程**\n"
|
||||||
|
"1. `POST /api/score/session_async` 提交一条问答评分请求。\n"
|
||||||
|
"2. 用 `GET /api/score/session/jobs/{job_id}` 轮询单次调用状态。\n"
|
||||||
|
"3. 用 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态。\n"
|
||||||
|
"4. 用 `GET /api/runs/{run_id}` 或在「运行列表」中查看完整报告。\n\n"
|
||||||
"通过 `GET /api/score/jobs` 列出所有异步评分记录,"
|
"通过 `GET /api/score/jobs` 列出所有异步评分记录,"
|
||||||
"`GET /api/score/jobs/{job_id}` 查询单个任务状态。\n\n"
|
"`GET /api/score/jobs/{job_id}` 查询单个任务状态。\n\n"
|
||||||
"**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
|
"**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
|
||||||
|
|||||||
@@ -37,6 +37,9 @@ GROUPING_FIELDS = ("difficulty", "question_type", "language")
|
|||||||
# How many lowest-scoring samples to surface for manual review.
|
# How many lowest-scoring samples to surface for manual review.
|
||||||
LOWEST_SAMPLE_COUNT = 10
|
LOWEST_SAMPLE_COUNT = 10
|
||||||
|
|
||||||
|
# Metrics whose lower raw value means stronger performance.
|
||||||
|
LOWER_IS_BETTER_METRICS = {"noise_sensitivity"}
|
||||||
|
|
||||||
|
|
||||||
def _round_or_none(value: float | None) -> float | None:
|
def _round_or_none(value: float | None) -> float | None:
|
||||||
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
|
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
|
||||||
@@ -105,7 +108,7 @@ def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupS
|
|||||||
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
|
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
|
||||||
"""Average a single sample's available metric scores for ranking."""
|
"""Average a single sample's available metric scores for ranking."""
|
||||||
values = [
|
values = [
|
||||||
float(row[metric])
|
(1.0 - float(row[metric])) if metric in LOWER_IS_BETTER_METRICS else float(row[metric])
|
||||||
for metric in metrics
|
for metric in metrics
|
||||||
if metric in row and pd.notna(row[metric])
|
if metric in row and pd.notna(row[metric])
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -199,6 +199,7 @@ code {
|
|||||||
.metric-value.bad { color: var(--bad); }
|
.metric-value.bad { color: var(--bad); }
|
||||||
.metric-value.na { color: var(--slate-light); }
|
.metric-value.na { color: var(--slate-light); }
|
||||||
.metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; }
|
.metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; }
|
||||||
|
.metric-desc { font-size: 12px; color: #64748b; margin-top: 6px; line-height: 1.45; }
|
||||||
|
|
||||||
.report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
|
.report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
|
||||||
.report-half { margin-bottom: 0; }
|
.report-half { margin-bottom: 0; }
|
||||||
|
|||||||
@@ -267,6 +267,7 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script src="/static/js/api.js"></script>
|
<script src="/static/js/api.js"></script>
|
||||||
|
<script src="/static/js/metric_presenter.js"></script>
|
||||||
<script src="/static/js/report.js"></script>
|
<script src="/static/js/report.js"></script>
|
||||||
<script src="/static/js/profiles.js"></script>
|
<script src="/static/js/profiles.js"></script>
|
||||||
<script src="/static/js/runner.js"></script>
|
<script src="/static/js/runner.js"></script>
|
||||||
|
|||||||
@@ -147,7 +147,7 @@ const App = {
|
|||||||
const chips = (run.metrics || [])
|
const chips = (run.metrics || [])
|
||||||
.map((m) => {
|
.map((m) => {
|
||||||
const val = run.metric_means ? run.metric_means[m] : null;
|
const val = run.metric_means ? run.metric_means[m] : null;
|
||||||
const cls = App.scoreClass(val);
|
const cls = App.scoreClass(m, val);
|
||||||
const text = val === null || val === undefined ? "n/a" : val.toFixed(2);
|
const text = val === null || val === undefined ? "n/a" : val.toFixed(2);
|
||||||
return `<span class="metric-chip" title="${App.escape(m)}">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
|
return `<span class="metric-chip" title="${App.escape(m)}">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
|
||||||
})
|
})
|
||||||
@@ -174,11 +174,8 @@ const App = {
|
|||||||
if (btn) btn.disabled = false;
|
if (btn) btn.disabled = false;
|
||||||
},
|
},
|
||||||
|
|
||||||
scoreClass(value) {
|
scoreClass(metricName, value) {
|
||||||
if (value === null || value === undefined) return "na";
|
return MetricPresenter.scoreClass(metricName, value);
|
||||||
if (value >= 0.8) return "good";
|
|
||||||
if (value >= 0.65) return "warn";
|
|
||||||
return "bad";
|
|
||||||
},
|
},
|
||||||
|
|
||||||
shortMetric(name) {
|
shortMetric(name) {
|
||||||
|
|||||||
77
webapp/static/js/metric_presenter.js
Normal file
77
webapp/static/js/metric_presenter.js
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
// metric_presenter.js — 统一维护指标语义(高分好 / 低分好)、颜色阈值与简要说明。
|
||||||
|
|
||||||
|
(function attachMetricPresenter(globalObj) {
|
||||||
|
const METRIC_META = {
|
||||||
|
faithfulness: {
|
||||||
|
direction: "higher_better",
|
||||||
|
description: "回答是否被检索内容直接支持,越高越可靠。",
|
||||||
|
},
|
||||||
|
answer_relevancy: {
|
||||||
|
direction: "higher_better",
|
||||||
|
description: "回答与问题是否紧密相关,越高越切题。",
|
||||||
|
},
|
||||||
|
context_recall: {
|
||||||
|
direction: "higher_better",
|
||||||
|
description: "检索片段覆盖标准答案关键信息的程度,越高越完整。",
|
||||||
|
},
|
||||||
|
context_precision: {
|
||||||
|
direction: "higher_better",
|
||||||
|
description: "检索片段中有效信息的占比,越高越精准。",
|
||||||
|
},
|
||||||
|
noise_sensitivity: {
|
||||||
|
direction: "lower_better",
|
||||||
|
description: "对噪声上下文的敏感程度,越低说明抗干扰能力越强。",
|
||||||
|
},
|
||||||
|
factual_correctness: {
|
||||||
|
direction: "higher_better",
|
||||||
|
description: "回答与标准答案在事实层面的吻合程度,越高越准确。",
|
||||||
|
},
|
||||||
|
semantic_similarity: {
|
||||||
|
direction: "higher_better",
|
||||||
|
description: "回答与标准答案在语义上的相似程度,越高越接近。",
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns true only when the metric has a METRIC_META entry whose direction
// is "lower_better"; unknown metrics are treated as higher-is-better.
function isLowerBetter(metricName) {
  const meta = METRIC_META[metricName];
  if (meta === undefined || meta === null) return false;
  return meta.direction === "lower_better";
}
|
||||||
|
|
||||||
|
// Maps a metric score to a CSS class name: "good", "warn", "bad", or "na"
// when the value is null, undefined, or not convertible to a number.
// Lower-is-better metrics use "at most" cut-offs (0.15 / 0.35); all other
// metrics use "at least" cut-offs (0.85 / 0.65).
function scoreClass(metricName, value) {
  if (value === null || value === undefined) return "na";
  const score = Number(value);
  if (Number.isNaN(score)) return "na";

  if (isLowerBetter(metricName)) {
    return score <= 0.15 ? "good" : score <= 0.35 ? "warn" : "bad";
  }
  return score >= 0.85 ? "good" : score >= 0.65 ? "warn" : "bad";
}
|
||||||
|
|
||||||
|
// Returns the short description registered for the metric in METRIC_META,
// or a generic fallback sentence when the metric is unknown or has no
// description.
function describeMetric(metricName) {
  const meta = METRIC_META[metricName];
  const text = meta === undefined || meta === null ? undefined : meta.description;
  if (text) return text;
  return "该指标用于衡量当前问答样本的评估表现。";
}
|
||||||
|
|
||||||
|
// Picks the bar color for a histogram bin from its lower edge. The palette
// runs green -> red in 0.2-wide steps; for lower-is-better metrics the
// lowest bins are green, for every other metric the highest bins are green.
// A lower edge that matches no cut-off (including NaN) gets the last, red
// color.
function binColor(metricName, lower) {
  const edge = Number(lower);
  const palette = ["#16a34a", "#84cc16", "#eab308", "#f97316", "#dc2626"];

  let index;
  if (isLowerBetter(metricName)) {
    index = [0.2, 0.4, 0.6, 0.8].findIndex((cut) => edge < cut);
  } else {
    index = [0.8, 0.6, 0.4, 0.2].findIndex((cut) => edge >= cut);
  }
  return palette[index === -1 ? palette.length - 1 : index];
}
|
||||||
|
|
||||||
|
globalObj.MetricPresenter = {
|
||||||
|
scoreClass,
|
||||||
|
describeMetric,
|
||||||
|
binColor,
|
||||||
|
};
|
||||||
|
})(window);
|
||||||
@@ -117,13 +117,15 @@ const Report = {
|
|||||||
const metrics = report.metrics && report.metrics.length ? report.metrics : summary.metrics;
|
const metrics = report.metrics && report.metrics.length ? report.metrics : summary.metrics;
|
||||||
metrics.forEach((metric) => {
|
metrics.forEach((metric) => {
|
||||||
const value = report.metric_means ? report.metric_means[metric] : null;
|
const value = report.metric_means ? report.metric_means[metric] : null;
|
||||||
const cls = App.scoreClass(value);
|
const cls = App.scoreClass(metric, value);
|
||||||
const text = value === null || value === undefined ? "n/a" : value.toFixed(2);
|
const text = value === null || value === undefined ? "n/a" : value.toFixed(2);
|
||||||
|
const description = MetricPresenter.describeMetric(metric);
|
||||||
const card = document.createElement("div");
|
const card = document.createElement("div");
|
||||||
card.className = "metric-card";
|
card.className = "metric-card";
|
||||||
card.innerHTML = `
|
card.innerHTML = `
|
||||||
<div class="metric-value ${cls}">${text}</div>
|
<div class="metric-value ${cls}">${text}</div>
|
||||||
<div class="metric-name">${App.escape(metric)}</div>
|
<div class="metric-name">${App.escape(metric)}</div>
|
||||||
|
<div class="metric-desc">${App.escape(description)}</div>
|
||||||
`;
|
`;
|
||||||
wrap.appendChild(card);
|
wrap.appendChild(card);
|
||||||
});
|
});
|
||||||
@@ -168,17 +170,13 @@ const Report = {
|
|||||||
const bins = distributions[metric] || [];
|
const bins = distributions[metric] || [];
|
||||||
const labels = bins.map((b) => b.label);
|
const labels = bins.map((b) => b.label);
|
||||||
const counts = bins.map((b) => b.count);
|
const counts = bins.map((b) => b.count);
|
||||||
const colors = bins.map((b) => Report._binColor(b.lower));
|
const colors = bins.map((b) => Report._binColor(metric, b.lower));
|
||||||
Report._drawDistChart(labels, counts, colors);
|
Report._drawDistChart(labels, counts, colors);
|
||||||
},
|
},
|
||||||
|
|
||||||
// 低分箱偏红、高分箱偏绿,直观暴露长尾。
|
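// Worse bins lean red and better bins lean green; MetricPresenter flips the palette for lower-is-better metrics so the long tail stays visible.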
|
||||||
_binColor(lower) {
|
_binColor(metric, lower) {
|
||||||
if (lower >= 0.8) return "#16a34a";
|
return MetricPresenter.binColor(metric, lower);
|
||||||
if (lower >= 0.6) return "#84cc16";
|
|
||||||
if (lower >= 0.4) return "#eab308";
|
|
||||||
if (lower >= 0.2) return "#f97316";
|
|
||||||
return "#dc2626";
|
|
||||||
},
|
},
|
||||||
|
|
||||||
// 实际绘制 Chart.js 柱状图。
|
// 实际绘制 Chart.js 柱状图。
|
||||||
@@ -247,7 +245,7 @@ const Report = {
|
|||||||
body += `<tr><td>${App.escape(stat.key)}</td><td>${stat.count}</td>`;
|
body += `<tr><td>${App.escape(stat.key)}</td><td>${stat.count}</td>`;
|
||||||
metrics.forEach((m) => {
|
metrics.forEach((m) => {
|
||||||
const v = stat.means ? stat.means[m] : null;
|
const v = stat.means ? stat.means[m] : null;
|
||||||
const cls = App.scoreClass(v);
|
const cls = App.scoreClass(m, v);
|
||||||
const text = v === null || v === undefined ? "—" : v.toFixed(2);
|
const text = v === null || v === undefined ? "—" : v.toFixed(2);
|
||||||
body += `<td class="${cls}">${text}</td>`;
|
body += `<td class="${cls}">${text}</td>`;
|
||||||
});
|
});
|
||||||
@@ -271,7 +269,7 @@ const Report = {
|
|||||||
const scoreBadges = metrics
|
const scoreBadges = metrics
|
||||||
.map((m) => {
|
.map((m) => {
|
||||||
const v = sample.metrics ? sample.metrics[m] : null;
|
const v = sample.metrics ? sample.metrics[m] : null;
|
||||||
const cls = App.scoreClass(v);
|
const cls = App.scoreClass(m, v);
|
||||||
const text = v === null || v === undefined ? "—" : v.toFixed(2);
|
const text = v === null || v === undefined ? "—" : v.toFixed(2);
|
||||||
return `<span class="score-badge ${cls}" title="${App.escape(m)}">${text}</span>`;
|
return `<span class="score-badge ${cls}" title="${App.escape(m)}">${text}</span>`;
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ const ScoreJobs = {
|
|||||||
if (job.status === "completed") {
|
if (job.status === "completed") {
|
||||||
scoreHtml = Object.entries(job.scores || {})
|
scoreHtml = Object.entries(job.scores || {})
|
||||||
.map(([k, v]) => {
|
.map(([k, v]) => {
|
||||||
const cls = App.scoreClass(v);
|
const cls = App.scoreClass(k, v);
|
||||||
const text = v === null || v === undefined ? "n/a" : Number(v).toFixed(3);
|
const text = v === null || v === undefined ? "n/a" : Number(v).toFixed(3);
|
||||||
return `<span class="metric-chip" title="${App.escape(k)}">${App.escape(App.shortMetric(k))} <b class="${cls}">${text}</b></span>`;
|
return `<span class="metric-chip" title="${App.escape(k)}">${App.escape(App.shortMetric(k))} <b class="${cls}">${text}</b></span>`;
|
||||||
})
|
})
|
||||||
|
|||||||
Reference in New Issue
Block a user