Source code for flexrag.datasets.retrieval_dataset
import json
import os
from dataclasses import field
from typing import Optional
from flexrag.utils import configure, data
from flexrag.utils.dataclasses import Context
from .dataset import MappingDataset
[docs]
@configure
class MTEBDatasetConfig:
    """Configuration for loading `MTEB <https://huggingface.co/mteb>`_ Retrieval Dataset.

    Indexing an ``MTEBDataset`` built from this configuration (its
    ``__getitem__`` method) returns `IREvalData` objects.

    ``MTEBDataset`` reads the following files below ``data_path``:

    * ``qrels/<subset>.jsonl`` -- the relevance judgements,
    * ``queries.jsonl`` -- the queries,
    * ``corpus.jsonl`` -- the documents (only read when ``load_corpus`` is True).

    For example, to load the NQ dataset, you can download the test set by running the following command:

        >>> git lfs install
        >>> git clone https://huggingface.co/datasets/mteb/nq nq

    Then you can use the following code to load the dataset:

        >>> config = MTEBDatasetConfig(
        ...     data_path="nq",
        ...     subset="test",
        ...     load_corpus=False,
        ... )
        >>> dataset = MTEBDataset(config)

    :param data_path: Path to the data directory. Required.
    :type data_path: str
    :param subset: Subset of the dataset to load; selects the qrels file
        ``qrels/<subset>.jsonl``. Required.
    :type subset: str
    :param encoding: Encoding of the data files. Default is 'utf-8'.
    :type encoding: str
    :param load_corpus: Whether to load the corpus data. Default is False.
    :type load_corpus: bool
    """

    data_path: str
    subset: str
    encoding: str = "utf-8"
    load_corpus: bool = False
[docs]
@data
class IREvalData:
    """The dataclass for Information Retrieval evaluation data.

    One instance pairs a question with the contexts that are judged
    relevant to it.

    :param question: The question for evaluation. Required.
    :type question: str
    :param contexts: The contexts related to the question. Default: None.
    :type contexts: Optional[list[Context]]
    :param meta_data: The metadata of the evaluation data. Default: {}.
    :type meta_data: dict
    """

    question: str
    contexts: Optional[list[Context]] = None
    # default_factory is used so the default is built by calling ``dict``
    # rather than by sharing one dict literal.
    meta_data: dict = field(default_factory=dict)
[docs]
class MTEBDataset(MappingDataset[IREvalData]):
    """Dataset for loading MTEB Retrieval Dataset.

    The relevance judgements in ``qrels/<subset>.jsonl`` are grouped by
    query: every distinct ``query-id`` yields one `IREvalData` item whose
    ``contexts`` list holds one `Context` per judgement, in file order.
    Items are stored in the order in which their query first appears in the
    qrels file.

    :param config: The configuration describing where and how to load the data.
    :type config: MTEBDatasetConfig
    :raises KeyError: If a judgement refers to a query id that is missing from
        ``queries.jsonl``, or (when the corpus is loaded) to a corpus id that
        is missing from ``corpus.jsonl``.
    """

    def __init__(self, config: MTEBDatasetConfig) -> None:
        qrels = self._read_jsonl(
            os.path.join(config.data_path, "qrels", f"{config.subset}.jsonl"),
            config.encoding,
        )
        queries = {
            query["_id"]: query
            for query in self._read_jsonl(
                os.path.join(config.data_path, "queries.jsonl"),
                config.encoding,
            )
        }
        # The corpus is optional; it is only read when explicitly requested.
        corpus: Optional[dict] = None
        if config.load_corpus:
            corpus = {
                doc["_id"]: doc
                for doc in self._read_jsonl(
                    os.path.join(config.data_path, "corpus.jsonl"),
                    config.encoding,
                )
            }

        # merge qrels, queries, and corpus into IREvalData
        # dataset_map: query id -> position of its item in self.dataset
        dataset_map: dict[str, int] = {}
        self.dataset: list[IREvalData] = []
        for qrel in qrels:
            # construct the context
            corpus_id = qrel["corpus-id"]
            context = Context(context_id=corpus_id)
            if corpus is not None:
                context.data = corpus[corpus_id]
            if "score" in qrel:  # relevance level of the context
                context.meta_data["score"] = int(qrel["score"])

            query_id = qrel["query-id"]
            query = queries[query_id]["text"]
            index = dataset_map.get(query_id)
            if index is None:
                # first judgement for this query: create its item
                dataset_map[query_id] = len(self.dataset)
                self.dataset.append(
                    IREvalData(
                        question=query,
                        contexts=[context],
                        meta_data={"query-id": query_id},
                    )
                )
            else:
                # further judgement for a known query: extend its contexts
                self.dataset[index].contexts.append(context)

    @staticmethod
    def _read_jsonl(path: str, encoding: str) -> list[dict]:
        """Parse every non-blank line of the JSON Lines file at ``path``.

        The file is opened in a ``with`` block so that the handle is closed
        as soon as parsing finishes, including when a line fails to parse.
        Whitespace-only lines (e.g. a trailing empty line) are skipped.
        """
        with open(path, "r", encoding=encoding) as file:
            return [json.loads(line) for line in file if line.strip()]

    def __len__(self) -> int:
        """Return the number of distinct queries that have judgements."""
        return len(self.dataset)

    def __getitem__(self, index: int) -> IREvalData:
        """Return the evaluation item stored at position ``index``."""
        return self.dataset[index]