Source code for flexrag.datasets.retrieval_dataset

import json
import os
from dataclasses import field
from typing import Optional

from flexrag.utils import configure, data
from flexrag.utils.dataclasses import Context

from .dataset import MappingDataset


@configure
class MTEBDatasetConfig:
    """Options for reading an `MTEB <https://huggingface.co/mteb>`_ retrieval dataset from disk.

    A dataset built from this configuration yields ``IREvalData`` items
    from its ``__getitem__`` method.

    As an example, the test split of NQ can be fetched with:

    .. code-block:: bash

        git lfs install
        git clone https://huggingface.co/datasets/mteb/nq nq

    and then loaded with:

        >>> config = MTEBDatasetConfig(
        ...     data_path="nq",
        ...     subset="test",
        ...     load_corpus=False,
        ... )
        >>> dataset = MTEBDataset(config)

    :param data_path: Directory that holds the dataset files. Required.
    :type data_path: str
    :param subset: Name of the subset (split) to load. Required.
    :type subset: str
    :param encoding: Text encoding used when reading the data files.
        Default is 'utf-8'.
    :type encoding: str
    :param load_corpus: Whether the corpus file should be loaded as well.
        Default is False.
    :type load_corpus: bool
    """

    # Required settings (no defaults).
    data_path: str
    subset: str
    # Optional settings.
    encoding: str = "utf-8"
    load_corpus: bool = False
@data
class IREvalData:
    """A single evaluation sample for Information Retrieval.

    :param question: The question (query text) to evaluate. Required.
    :type question: str
    :param contexts: The contexts associated with the question.
        Default: None.
    :type contexts: Optional[list[Context]]
    :param meta_data: Extra information attached to the sample.
        Default: an empty dict.
    :type meta_data: dict
    """

    question: str
    contexts: Optional[list[Context]] = None
    # A factory is used so the default dict is not a shared mutable object.
    meta_data: dict = field(default_factory=dict)
class MTEBDataset(MappingDataset[IREvalData]):
    """Dataset for loading MTEB Retrieval Dataset.

    The following JSON Lines files are read relative to ``config.data_path``:

    * ``qrels/<subset>.jsonl`` -- relevance judgements; each record provides
      ``query-id`` and ``corpus-id`` and may provide ``score``.
    * ``queries.jsonl`` -- queries; each record provides ``_id`` and ``text``.
    * ``corpus.jsonl`` -- documents keyed by ``_id``; only read when
      ``config.load_corpus`` is true.

    Judgements that share a ``query-id`` are merged into one ``IREvalData``
    item whose ``contexts`` list holds one ``Context`` per judgement. Items
    are ordered by the first appearance of their query in the qrels file.
    """

    def __init__(self, config: MTEBDatasetConfig) -> None:
        """Read the dataset files and group the judgements by query.

        :param config: Location, subset and encoding of the dataset.
        :type config: MTEBDatasetConfig
        :raises KeyError: If a judgement refers to a query id (or, when the
            corpus is loaded, a corpus id) that is missing from the
            corresponding file, or if a record lacks an expected field.
        """
        qrels: list[dict] = self._load_jsonl(
            os.path.join(config.data_path, "qrels", f"{config.subset}.jsonl"),
            config.encoding,
        )
        queries = {
            query["_id"]: query
            for query in self._load_jsonl(
                os.path.join(config.data_path, "queries.jsonl"),
                config.encoding,
            )
        }
        corpus = None
        if config.load_corpus:
            corpus = {
                doc["_id"]: doc
                for doc in self._load_jsonl(
                    os.path.join(config.data_path, "corpus.jsonl"),
                    config.encoding,
                )
            }

        # merge qrels, queries, and corpus into IREvalData
        # dataset_map: query id -> position of its item in self.dataset
        dataset_map: dict[str, int] = {}
        self.dataset: list[IREvalData] = []
        for qrel in qrels:
            # construct the context
            corpus_id = qrel["corpus-id"]
            context = Context(context_id=corpus_id)
            if corpus is not None:
                context.data = corpus[corpus_id]
            if "score" in qrel:
                # relevance level of the context
                context.meta_data["score"] = int(qrel["score"])

            query_id = qrel["query-id"]
            query = queries[query_id]["text"]
            if query_id not in dataset_map:
                # first judgement for this query: create its item
                dataset_map[query_id] = len(self.dataset)
                self.dataset.append(
                    IREvalData(
                        question=query,
                        contexts=[context],
                        meta_data={"query-id": query_id},
                    )
                )
            else:
                # further judgement: extend the existing item
                self.dataset[dataset_map[query_id]].contexts.append(context)

    @staticmethod
    def _load_jsonl(path: str, encoding: str) -> list[dict]:
        """Parse a JSON Lines file into a list of records.

        The file is opened in a ``with`` block so the handle is closed as
        soon as reading finishes, even if a line fails to parse.
        Whitespace-only lines are skipped.

        :param path: Path of the ``.jsonl`` file.
        :type path: str
        :param encoding: Text encoding of the file.
        :type encoding: str
        :return: One decoded JSON value per non-blank line, in file order.
        :rtype: list[dict]
        """
        with open(path, "r", encoding=encoding) as jsonl_file:
            return [json.loads(line) for line in jsonl_file if line.strip()]

    def __len__(self) -> int:
        """Return the number of distinct queries in the dataset."""
        return len(self.dataset)

    def __getitem__(self, index: int) -> IREvalData:
        """Return the evaluation item stored at position ``index``."""
        return self.dataset[index]