79 lines
2.8 KiB
Python
79 lines
2.8 KiB
Python
"""Utilities for converting draft online datasets into offline smoke-test datasets."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import pandas as pd
|
|
|
|
from rag_eval.shared.utils import ensure_directory
|
|
|
|
|
|
def _load_jsonl(path: Path) -> list[dict[str, Any]]:
|
|
"""Load a JSONL file into a list of dictionaries."""
|
|
rows: list[dict[str, Any]] = []
|
|
with path.open("r", encoding="utf-8") as handle:
|
|
for line in handle:
|
|
text = line.strip()
|
|
if not text:
|
|
continue
|
|
rows.append(json.loads(text))
|
|
return rows
|
|
|
|
|
|
# Column order of the generated CSV. Columns not derived inside
# build_offline_smoke_dataset are copied from the draft row unchanged.
_OFFLINE_SMOKE_COLUMNS = (
    "sample_id",
    "question",
    "contexts",
    "answer",
    "ground_truth",
    "scenario",
    "language",
    "retrieval_config",
    "doc_id",
    "doc_name",
    "section_path",
    "page_start",
    "page_end",
    "source_chunk_ids",
    "question_type",
    "difficulty",
    "review_status",
    "review_notes",
)


def _clean_text(value: Any) -> str:
    """Return *value* as stripped text, mapping missing values to ``""``."""
    # pandas yields NaN for empty CSV cells and JSON null decodes to None;
    # str() would turn those into the literal text "nan" / "None".
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value).strip()


def _parse_chunk_ids(raw: Any) -> list[str]:
    """Normalise a ``source_chunk_ids`` cell into a list of string chunk ids."""
    if isinstance(raw, str):
        if not raw.strip():
            return []
        raw = json.loads(raw)
    # A JSON-encoded scalar is a single id, not a sequence of characters.
    if isinstance(raw, (str, int)) and not isinstance(raw, bool):
        raw = [raw]
    if not isinstance(raw, list):
        return []
    # The chunk lookup is keyed by str(chunk_id), so compare as strings.
    return [str(chunk_id) for chunk_id in raw]


def build_offline_smoke_dataset(
    *,
    draft_dataset_path: Path,
    source_chunks_path: Path,
    output_path: Path,
) -> Path:
    """Derive an offline-evaluable dataset from a draft dataset and its chunks.

    Each draft row is copied to the output with three derived fields: the
    ground truth is reused as the answer, ``contexts`` becomes a JSON array
    of the non-empty texts of the chunks named in ``source_chunk_ids``, and
    ``retrieval_config`` is set to a fixed marker value.

    Args:
        draft_dataset_path: CSV file holding the draft dataset.
        source_chunks_path: JSONL file of chunk records; each record is
            indexed by its ``chunk_id`` and contributes its ``text``.
        output_path: Destination CSV file. Its parent directory is passed to
            ``ensure_directory`` before writing.

    Returns:
        ``output_path``, after the CSV has been written.

    Raises:
        json.JSONDecodeError: If a ``source_chunk_ids`` cell is a non-blank
            string that is not valid JSON.
        KeyError: If a chunk record has no ``chunk_id`` field.
    """
    draft_frame = pd.read_csv(draft_dataset_path)
    chunk_lookup = {
        str(chunk["chunk_id"]): chunk for chunk in _load_jsonl(source_chunks_path)
    }

    output_rows: list[dict[str, Any]] = []
    # Plain per-row dicts keep each column's own dtype, which row-wise Series
    # iteration does not guarantee.
    for record in draft_frame.to_dict(orient="records"):
        contexts: list[str] = []
        for chunk_id in _parse_chunk_ids(record.get("source_chunk_ids")):
            chunk = chunk_lookup.get(chunk_id)
            if chunk is None:
                continue  # unknown ids are skipped
            text = _clean_text(chunk.get("text"))
            if text:
                contexts.append(text)

        ground_truth = _clean_text(record.get("ground_truth", ""))
        derived = {
            "contexts": json.dumps(contexts, ensure_ascii=False),
            "answer": ground_truth,
            "ground_truth": ground_truth,
            "retrieval_config": "offline-smoke-from-pdf-build",
        }
        output_rows.append(
            {
                column: derived[column] if column in derived else record.get(column, "")
                for column in _OFFLINE_SMOKE_COLUMNS
            }
        )

    ensure_directory(output_path.parent)
    # Explicit columns keep the header row even when the draft has no rows.
    pd.DataFrame(output_rows, columns=list(_OFFLINE_SMOKE_COLUMNS)).to_csv(
        output_path, index=False
    )
    return output_path
|