first commit
This commit is contained in:
78
rag_eval/dataset_builder/offline_converter.py
Normal file
78
rag_eval/dataset_builder/offline_converter.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""Utilities for converting draft online datasets into offline smoke-test datasets."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from rag_eval.shared.utils import ensure_directory
|
||||
|
||||
|
||||
def _load_jsonl(path: Path) -> list[dict[str, Any]]:
|
||||
"""Load a JSONL file into a list of dictionaries."""
|
||||
rows: list[dict[str, Any]] = []
|
||||
with path.open("r", encoding="utf-8") as handle:
|
||||
for line in handle:
|
||||
text = line.strip()
|
||||
if not text:
|
||||
continue
|
||||
rows.append(json.loads(text))
|
||||
return rows
|
||||
|
||||
|
||||
def build_offline_smoke_dataset(
|
||||
*,
|
||||
draft_dataset_path: Path,
|
||||
source_chunks_path: Path,
|
||||
output_path: Path,
|
||||
) -> Path:
|
||||
"""Derive an offline-evaluable dataset by reusing ground truth as answer and chunk text as contexts."""
|
||||
draft_frame = pd.read_csv(draft_dataset_path)
|
||||
chunk_rows = _load_jsonl(source_chunks_path)
|
||||
chunk_lookup = {str(row["chunk_id"]): row for row in chunk_rows}
|
||||
|
||||
output_rows: list[dict[str, Any]] = []
|
||||
for _, row in draft_frame.iterrows():
|
||||
chunk_ids = row.get("source_chunk_ids")
|
||||
if isinstance(chunk_ids, str):
|
||||
parsed_chunk_ids = json.loads(chunk_ids)
|
||||
elif isinstance(chunk_ids, list):
|
||||
parsed_chunk_ids = chunk_ids
|
||||
else:
|
||||
parsed_chunk_ids = []
|
||||
|
||||
contexts = [
|
||||
str(chunk_lookup[chunk_id]["text"]).strip()
|
||||
for chunk_id in parsed_chunk_ids
|
||||
if chunk_id in chunk_lookup and str(chunk_lookup[chunk_id]["text"]).strip()
|
||||
]
|
||||
ground_truth = str(row.get("ground_truth", "")).strip()
|
||||
output_rows.append(
|
||||
{
|
||||
"sample_id": row.get("sample_id", ""),
|
||||
"question": row.get("question", ""),
|
||||
"contexts": json.dumps(contexts, ensure_ascii=False),
|
||||
"answer": ground_truth,
|
||||
"ground_truth": ground_truth,
|
||||
"scenario": row.get("scenario", ""),
|
||||
"language": row.get("language", ""),
|
||||
"retrieval_config": "offline-smoke-from-pdf-build",
|
||||
"doc_id": row.get("doc_id", ""),
|
||||
"doc_name": row.get("doc_name", ""),
|
||||
"section_path": row.get("section_path", ""),
|
||||
"page_start": row.get("page_start", ""),
|
||||
"page_end": row.get("page_end", ""),
|
||||
"source_chunk_ids": row.get("source_chunk_ids", ""),
|
||||
"question_type": row.get("question_type", ""),
|
||||
"difficulty": row.get("difficulty", ""),
|
||||
"review_status": row.get("review_status", ""),
|
||||
"review_notes": row.get("review_notes", ""),
|
||||
}
|
||||
)
|
||||
|
||||
ensure_directory(output_path.parent)
|
||||
pd.DataFrame(output_rows).to_csv(output_path, index=False)
|
||||
return output_path
|
||||
Reference in New Issue
Block a user