siemens_ragas/webapp/services/report_builder.py

"""Aggregate a run's per-sample scores into the report payload for the UI.

Aggregation reads the standard scores.csv produced by the reporting layer, the
metric list resolved by run_reader, and the metric/document weights read from
the run snapshot. The output mirrors the report detail page: metric means,
per-metric distribution histograms, grouped means by difficulty /
question_type / language, and the lowest-scoring samples for review.
"""

from __future__ import annotations

import math
from pathlib import Path

import pandas as pd

from rag_eval.metrics.weights import (
    compute_overall_weighted_score_mean,
    weighted_metric_means as _weighted_metric_means,
)
from webapp.services.run_reader import _read_weights_from_snapshot
from webapp.services.text_utils import parse_contexts
from webapp.models import (
    DistributionBin,
    GroupStat,
    ReportData,
    SampleScore,
)
from webapp.services import run_reader


# Number of equal-width buckets used for metric score histograms.
DISTRIBUTION_BIN_COUNT = 5

# Metadata columns that we group samples by when present in the data.
GROUPING_FIELDS = ("difficulty", "question_type", "language")

# How many lowest-scoring samples to surface for manual review.
LOWEST_SAMPLE_COUNT = 10


def _round_or_none(value: float | None) -> float | None:
    """Round a float to four places, mapping NaN/None to None for clean JSON."""
    if value is None:
        return None
    if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
        return None
    return round(float(value), 4)


def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
    """Bucket one metric's scores into fixed-width [0,1] histogram bins.

    Args:
        frame: Per-sample scores; ``metric`` is looked up as a column.
        metric: Name of the metric column to bucket.

    Returns:
        ``DISTRIBUTION_BIN_COUNT`` bins in ascending order, or an empty list
        when the column is absent. Cells that do not parse as numbers are
        dropped, and values outside [0, 1] fall into no bin.
    """
    bins: list[DistributionBin] = []
    if metric not in frame.columns:
        return bins

    series = pd.to_numeric(frame[metric], errors="coerce").dropna()
    last_index = DISTRIBUTION_BIN_COUNT - 1
    for index in range(DISTRIBUTION_BIN_COUNT):
        # Derive edges by division instead of ``index * width``: with five
        # bins, 3 * 0.2 evaluates to 0.6000000000000001, which pushed a score
        # of exactly 0.6 into the "0.4–0.6" bin. Division is correctly rounded,
        # so the edge equals the float a score of 0.6 is stored as.
        lower = index / DISTRIBUTION_BIN_COUNT
        upper = (index + 1) / DISTRIBUTION_BIN_COUNT
        # Include the right edge in the final bin so 1.0 is counted.
        if index == last_index:
            mask = (series >= lower) & (series <= upper)
        else:
            mask = (series >= lower) & (series < upper)
        bins.append(
            DistributionBin(
                label=f"{lower:.1f}–{upper:.1f}",
                lower=round(lower, 2),
                upper=round(upper, 2),
                count=int(mask.sum()),
            )
        )
    return bins


def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupStat]]:
    """Compute per-group metric means for each available grouping field.

    Args:
        frame: Per-sample scores, optionally carrying ``GROUPING_FIELDS``
            columns.
        metrics: Metric column names to average within each group.

    Returns:
        A mapping from grouping field to its ``GroupStat`` entries sorted by
        key. Fields that are missing or hold only blank/NaN values are
        omitted. Means are plain (unweighted) arithmetic means of the cells
        that parse as numbers.
    """
    groupings: dict[str, list[GroupStat]] = {}
    for field in GROUPING_FIELDS:
        if field not in frame.columns:
            continue
        # Normalize keys once so values differing only by surrounding
        # whitespace land in the same group instead of producing duplicate
        # entries with identical display keys.
        keys = frame[field].astype(str).str.strip()
        # Blank cells and NaN (rendered as "nan" by astype(str)) carry no
        # grouping information; skip the field entirely when nothing is left
        # so the UI does not render noise.
        usable = ~keys.isin(["", "nan"])
        if not usable.any():
            continue

        stats: list[GroupStat] = []
        # Group by a plain array so the grouping is positional and does not
        # depend on index alignment.
        for key, group in frame[usable].groupby(keys[usable].to_numpy()):
            means = {
                # Coerce like _distribution does: a non-numeric cell becomes
                # NaN and is ignored by mean() rather than raising.
                metric: _round_or_none(
                    pd.to_numeric(group[metric], errors="coerce").mean()
                )
                for metric in metrics
                if metric in group.columns
            }
            stats.append(GroupStat(key=str(key), count=int(len(group)), means=means))
        if stats:
            stats.sort(key=lambda item: item.key)
            groupings[field] = stats
    return groupings


def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
    """Average a single sample's available metric scores for ranking."""
    values = [
        float(row[metric])
        for metric in metrics
        if metric in row and pd.notna(row[metric])
    ]
    if not values:
        return None
    return sum(values) / len(values)


def _cell_text(row: pd.Series, column: str) -> str:
    """Safely read a string cell, returning '' for missing or NaN values."""
    if column not in row or pd.isna(row[column]):
        return ""
    return str(row[column]).strip()


def _numeric_scores(row: pd.Series, metrics: list[str]) -> dict[str, float]:
    """Collect the metric cells of one row that convert to a non-NaN float."""
    scores: dict[str, float] = {}
    for metric in metrics:
        if metric not in row or pd.isna(row[metric]):
            continue
        try:
            score = float(row[metric])
        except (TypeError, ValueError):
            # Non-numeric cell (for example an error marker string); leave it
            # out instead of failing the whole report.
            continue
        if not math.isnan(score):
            scores[metric] = score
    return scores


def _lowest_samples(frame: pd.DataFrame, metrics: list[str]) -> list[SampleScore]:
    """Select and shape the lowest-scoring samples for the review table.

    Args:
        frame: Per-sample scores and metadata, one row per sample.
        metrics: Metric column names used for ranking and for the per-sample
            metric map.

    Returns:
        At most ``LOWEST_SAMPLE_COUNT`` samples ordered by ascending mean
        score. Samples with no usable score come first; ties keep the frame's
        row order.
    """
    if frame.empty:
        return []

    ranked: list[tuple[float, float | None, pd.Series, dict[str, float]]] = []
    for _, row in frame.iterrows():
        scores = _numeric_scores(row, metrics)
        # Rank on the cells that actually hold numbers; passing only those
        # metrics keeps the averaging rule in _sample_mean.
        mean_score = _sample_mean(row, [m for m in metrics if m in scores])
        # Samples without any score rank first: in ascending order the -inf
        # sentinel precedes every real mean, so they are surfaced for review.
        sort_key = mean_score if mean_score is not None else -math.inf
        ranked.append((sort_key, mean_score, row, scores))

    # list.sort is stable, so equal means keep their original row order.
    ranked.sort(key=lambda item: item[0])

    # Build the (comparatively expensive) SampleScore models only for the rows
    # that are actually returned.
    return [
        SampleScore(
            sample_id=_cell_text(row, "sample_id") or "—",
            question=_cell_text(row, "question"),
            contexts=parse_contexts(row["contexts"]) if "contexts" in row else [],
            answer=_cell_text(row, "answer"),
            ground_truth=_cell_text(row, "ground_truth"),
            language=_cell_text(row, "language"),
            difficulty=_cell_text(row, "difficulty"),
            question_type=_cell_text(row, "question_type"),
            metrics={
                metric: _round_or_none(score) for metric, score in scores.items()
            },
            mean_score=_round_or_none(mean_score),
            error=_cell_text(row, "error"),
        )
        for _, mean_score, row, scores in ranked[:LOWEST_SAMPLE_COUNT]
    ]


def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
    """Assemble the aggregated report payload for a single run directory.

    Args:
        run_dir: Directory of the run whose artifacts are read.
        metrics: Metric names to report on.

    Returns:
        A ``ReportData`` carrying the markdown texts and weights in every
        case. When the scores frame is empty or no metrics are given, each
        metric mean is ``None`` and no aggregates are computed.
    """
    scores = run_reader.read_scores_frame(run_dir)
    summary_text = run_reader.read_summary_markdown(run_dir)
    advice_text = run_reader.read_advice_markdown(run_dir)
    metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)

    # Fields populated identically for empty and non-empty runs.
    shared = {
        "metrics": metrics,
        "summary_markdown": summary_text,
        "advice_markdown": advice_text,
        "metric_weights": metric_weights,
        "doc_weights": doc_weights,
    }

    if scores.empty or not metrics:
        return ReportData(metric_means=dict.fromkeys(metrics), **shared)

    records = scores.to_dict(orient="records")

    # Weighted per-metric means; the original note states this degrades to the
    # arithmetic mean when the weights are empty.
    weighted = _weighted_metric_means(records, metrics, doc_weights)
    metric_means = {}
    for name, value in weighted.items():
        metric_means[name] = _round_or_none(value)

    overall = compute_overall_weighted_score_mean(
        records, metric_weights, doc_weights
    )

    # Histograms only for metrics that actually have a column in the frame.
    histograms = {}
    for name in metrics:
        if name in scores.columns:
            histograms[name] = _distribution(scores, name)

    grouped = _groupings(scores, metrics)
    worst = _lowest_samples(scores, metrics)

    return ReportData(
        metric_means=metric_means,
        distributions=histograms,
        groupings=grouped,
        lowest_samples=worst,
        weighted_score_mean=_round_or_none(overall),
        **shared,
    )
-												Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>

											
										
										
											2026-06-15 15:53:57 +08:00
+								"""Aggregate a run's per-sample scores into the report payload for the UI.
 								All aggregation reads only the standard scores.csv produced by the reporting
 								layer, plus the metric list resolved by run_reader. The output mirrors the
 								report detail page: metric means, per-metric distribution histograms, grouped
 								means by difficulty / question_type, and the lowest-scoring samples for review.
 								"""
 								from __future__ import annotations
 								import math
 								from pathlib import Path
 								import pandas as pd
-												feat: report_builder uses weighted means; ReportData gains weighted_score_mean

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-18 17:16:09 +08:00
+								from rag_eval.metrics.weights import (
 								    compute_overall_weighted_score_mean,
 								    weighted_metric_means as _weighted_metric_means,
 								)
 								from webapp.services.run_reader import _read_weights_from_snapshot
-												Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>

											
										
										
											2026-06-15 15:53:57 +08:00
+								from webapp.services.text_utils import parse_contexts
 								from webapp.models import (
 								    DistributionBin,
 								    GroupStat,
 								    ReportData,
 								    SampleScore,
 								)
 								from webapp.services import run_reader
 								# Number of equal-width buckets used for metric score histograms.
 								DISTRIBUTION_BIN_COUNT = 5
 								# Metadata columns that we group samples by when present in the data.
 								GROUPING_FIELDS = ("difficulty", "question_type", "language")
 								# How many lowest-scoring samples to surface for manual review.
 								LOWEST_SAMPLE_COUNT = 10
 								def _round_or_none(value: float | None) -> float | None:
 								    """Round a float to four places, mapping NaN/None to None for clean JSON."""
 								    if value is None:
 								        return None
 								    if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
 								        return None
 								    return round(float(value), 4)
 								def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
 								    """Bucket one metric's scores into fixed-width [0,1] histogram bins."""
 								    bins: list[DistributionBin] = []
 								    if metric not in frame.columns:
 								        return bins
 								    series = pd.to_numeric(frame[metric], errors="coerce").dropna()
 								    width = 1.0 / DISTRIBUTION_BIN_COUNT
 								    for index in range(DISTRIBUTION_BIN_COUNT):
 								        lower = index * width
 								        upper = (index + 1) * width
 								        # Include the right edge in the final bin so 1.0 is counted.
 								        if index == DISTRIBUTION_BIN_COUNT - 1:
 								            mask = (series >= lower) & (series <= upper)
 								        else:
 								            mask = (series >= lower) & (series < upper)
 								        bins.append(
 								            DistributionBin(
 								                label=f"{lower:.1f}–{upper:.1f}",
 								                lower=round(lower, 2),
 								                upper=round(upper, 2),
 								                count=int(mask.sum()),
 								            )
 								        )
 								    return bins
 								def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupStat]]:
 								    """Compute per-group metric means for each available grouping field."""
 								    groupings: dict[str, list[GroupStat]] = {}
 								    for field in GROUPING_FIELDS:
 								        if field not in frame.columns:
 								            continue
 								        # Skip fields that are entirely empty so the UI does not render noise.
 								        non_empty = frame[field].astype(str).str.strip().replace("nan", "")
 								        if non_empty.eq("").all():
 								            continue
 								        stats: list[GroupStat] = []
 								        for key, group in frame.groupby(frame[field].astype(str)):
 								            key_text = str(key).strip()
 								            if not key_text or key_text == "nan":
 								                continue
 								            means = {
 								                metric: _round_or_none(group[metric].mean(numeric_only=True))
 								                for metric in metrics
 								                if metric in group.columns
 								            }
 								            stats.append(GroupStat(key=key_text, count=int(len(group)), means=means))
 								        if stats:
 								            stats.sort(key=lambda item: item.key)
 								            groupings[field] = stats
 								    return groupings
 								def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
 								    """Average a single sample's available metric scores for ranking."""
 								    values = [
 								        float(row[metric])
 								        for metric in metrics
 								        if metric in row and pd.notna(row[metric])
 								    ]
 								    if not values:
 								        return None
 								    return sum(values) / len(values)
 								def _cell_text(row: pd.Series, column: str) -> str:
 								    """Safely read a string cell, returning '' for missing or NaN values."""
 								    if column not in row or pd.isna(row[column]):
 								        return ""
 								    return str(row[column]).strip()
 								def _lowest_samples(frame: pd.DataFrame, metrics: list[str]) -> list[SampleScore]:
 								    """Select and shape the lowest-scoring samples for the review table."""
 								    if frame.empty:
 								        return []
 								    enriched: list[tuple[float, SampleScore]] = []
 								    for _, row in frame.iterrows():
 								        mean_score = _sample_mean(row, metrics)
 								        sample = SampleScore(
 								            sample_id=_cell_text(row, "sample_id") or "—",
 								            question=_cell_text(row, "question"),
 								            contexts=parse_contexts(row["contexts"]) if "contexts" in row else [],
 								            answer=_cell_text(row, "answer"),
 								            ground_truth=_cell_text(row, "ground_truth"),
 								            language=_cell_text(row, "language"),
 								            difficulty=_cell_text(row, "difficulty"),
 								            question_type=_cell_text(row, "question_type"),
 								            metrics={
 								                metric: _round_or_none(float(row[metric]))
 								                for metric in metrics
 								                if metric in row and pd.notna(row[metric])
 								            },
 								            mean_score=_round_or_none(mean_score),
 								            error=_cell_text(row, "error"),
 								        )
 								        # Samples without any score sort last (treated as worst for review).
 								        sort_key = mean_score if mean_score is not None else -1.0
 								        enriched.append((sort_key, sample))
 								    enriched.sort(key=lambda item: item[0])
 								    return [sample for _, sample in enriched[:LOWEST_SAMPLE_COUNT]]
 								def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
 								    """Build the full aggregated report payload for one run directory."""
 								    frame = run_reader.read_scores_frame(run_dir)
 								    summary_markdown = run_reader.read_summary_markdown(run_dir)
-												fix(advisor): fix LLM API call, wire advice_markdown to webapp, update .env.example timeouts

- llm_analyzer.py: use llm.langchain_llm.ainvoke() (correct RAGAS 0.4.3 API)
- webapp/models.py: add advice_markdown field to ReportData
- webapp/services/run_reader.py: add read_advice_markdown() reading optimization_advice.md
- webapp/services/report_builder.py: pass advice_markdown into ReportData
- .env.example: OPENAI_TIMEOUT_SECONDS 30→180, RAGAS_METRIC_TIMEOUT_SECONDS 45→300

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2026-06-16 17:12:32 +08:00
+								    advice_markdown = run_reader.read_advice_markdown(run_dir)
-												feat: report_builder uses weighted means; ReportData gains weighted_score_mean

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-18 17:16:09 +08:00
+								    metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)
-												Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>

											
										
										
											2026-06-15 15:53:57 +08:00
 								    if frame.empty or not metrics:
 								        return ReportData(
 								            metrics=metrics,
 								            metric_means={metric: None for metric in metrics},
 								            summary_markdown=summary_markdown,
-												fix(advisor): fix LLM API call, wire advice_markdown to webapp, update .env.example timeouts

- llm_analyzer.py: use llm.langchain_llm.ainvoke() (correct RAGAS 0.4.3 API)
- webapp/models.py: add advice_markdown field to ReportData
- webapp/services/run_reader.py: add read_advice_markdown() reading optimization_advice.md
- webapp/services/report_builder.py: pass advice_markdown into ReportData
- .env.example: OPENAI_TIMEOUT_SECONDS 30→180, RAGAS_METRIC_TIMEOUT_SECONDS 45→300

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2026-06-16 17:12:32 +08:00
+								            advice_markdown=advice_markdown,
-												feat: report_builder uses weighted means; ReportData gains weighted_score_mean

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-18 17:16:09 +08:00
+								            metric_weights=metric_weights,
 								            doc_weights=doc_weights,
-												Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>

											
										
										
											2026-06-15 15:53:57 +08:00
+								        )
-												feat: report_builder uses weighted means; ReportData gains weighted_score_mean

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-18 17:16:09 +08:00
+								    score_rows_list = frame.to_dict(orient="records")
 								    # Use weighted metric means (degrades to arithmetic mean when weights are empty).
 								    w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
 								    rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}
 								    overall_ws = compute_overall_weighted_score_mean(
 								        score_rows_list, metric_weights, doc_weights
 								    )
-												Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>

											
										
										
											2026-06-15 15:53:57 +08:00
+								    distributions = {
 								        metric: _distribution(frame, metric)
 								        for metric in metrics
 								        if metric in frame.columns
 								    }
 								    return ReportData(
 								        metrics=metrics,
-												feat: report_builder uses weighted means; ReportData gains weighted_score_mean

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-18 17:16:09 +08:00
+								        metric_means=rounded_means,
-												Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>

											
										
										
											2026-06-15 15:53:57 +08:00
+								        distributions=distributions,
 								        groupings=_groupings(frame, metrics),
 								        lowest_samples=_lowest_samples(frame, metrics),
 								        summary_markdown=summary_markdown,
-												fix(advisor): fix LLM API call, wire advice_markdown to webapp, update .env.example timeouts

- llm_analyzer.py: use llm.langchain_llm.ainvoke() (correct RAGAS 0.4.3 API)
- webapp/models.py: add advice_markdown field to ReportData
- webapp/services/run_reader.py: add read_advice_markdown() reading optimization_advice.md
- webapp/services/report_builder.py: pass advice_markdown into ReportData
- .env.example: OPENAI_TIMEOUT_SECONDS 30→180, RAGAS_METRIC_TIMEOUT_SECONDS 45→300

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2026-06-16 17:12:32 +08:00
+								        advice_markdown=advice_markdown,
-												feat: report_builder uses weighted means; ReportData gains weighted_score_mean

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-18 17:16:09 +08:00
+								        weighted_score_mean=_round_or_none(overall_ws),
 								        metric_weights=metric_weights,
 								        doc_weights=doc_weights,
-												Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>

											
										
										
											2026-06-15 15:53:57 +08:00
+								    )