# Source code for flexrag.metrics.evaluator

from typing import Any

from flexrag.utils import LOGGER_MANAGER, configure
from flexrag.utils.dataclasses import RetrievedContext

from .metrics_base import METRICS, MetricsBase

logger = LOGGER_MANAGER.get_logger("flexrag.metrics")
MetricConfig = METRICS.make_config(allow_multiple=True)



[docs]
@configure
class EvaluatorConfig(MetricConfig):
    """Configuration for :class:`Evaluator`.

    Extends ``MetricConfig`` (built by ``METRICS.make_config(allow_multiple=True)``),
    so the metric selection fields come from the base class.

    :param round: Number of decimal places used when the evaluator logs
        scores as percentages. Defaults to 2.
    :type round: int
    """

    round: int = 2




[docs]
class Evaluator:
    """Run every configured metric over one batch of inputs.

    The metrics are loaded once at construction time and are all called with
    the same keyword arguments on each :meth:`evaluate` call.

    :param cfg: The evaluator configuration. ``cfg.metrics_type`` supplies the
        metric names and ``cfg.round`` the number of decimals used for logging.
    :type cfg: EvaluatorConfig
    """

    def __init__(self, cfg: EvaluatorConfig) -> None:
        # Names and loaded metrics are paired positionally.
        # NOTE(review): assumes METRICS.load(cfg) yields one metric per entry
        # of cfg.metrics_type, in the same order -- confirm against the registry.
        self.metrics: dict[str, MetricsBase] = dict(
            zip(cfg.metrics_type, METRICS.load(cfg))
        )
        self.round = cfg.round

    def evaluate(
        self,
        *,
        questions: list[str] | None = None,
        responses: list[str] | None = None,
        golden_responses: list[list[str]] | None = None,
        retrieved_contexts: list[list[str | RetrievedContext]] | None = None,
        golden_contexts: list[list[str]] | None = None,
        log: bool = True,
    ) -> tuple[dict[str, float], dict[str, Any]]:
        """Evaluate the generated responses against the ground truth responses.

        Every configured metric receives all five data arguments (including
        those left as ``None``). The score dictionaries returned by the
        metrics are merged into a single dictionary, so a score name produced
        by a later metric overwrites the same name from an earlier one. The
        per-metric details are stored under the metric's name.

        :param questions: A list of questions. Defaults to None.
        :param responses: A list of responses. Defaults to None.
        :param golden_responses: A list of golden responses. Defaults to None.
        :param retrieved_contexts: A list of retrieved contexts. Defaults to None.
        :param golden_contexts: A list of golden contexts. Defaults to None.
        :param log: Whether to log the evaluation results. Defaults to True.
        :type questions: list[str], optional
        :type responses: list[str], optional
        :type golden_responses: list[list[str]], optional
        :type retrieved_contexts: list[list[str | RetrievedContext]], optional
        :type golden_contexts: list[list[str]], optional
        :type log: bool, optional
        :raises AssertionError: If fewer than two of the data arguments are
            provided, or if the provided arguments differ in length.
        :return: The evaluation results and the evaluation details.
        :rtype: tuple[dict[str, float], dict[str, Any]]
        """
        # check the input arguments
        provided = [
            arg
            for arg in (
                questions,
                responses,
                golden_responses,
                retrieved_contexts,
                golden_contexts,
            )
            if arg is not None
        ]
        # The check has always required more than one argument; the message
        # now says so.
        assert len(provided) > 1, "At least two arguments must be provided."
        assert all(
            len(arg) == len(provided[0]) for arg in provided
        ), "All arguments must have the same length."

        # evaluate
        evaluation_results: dict[str, float] = {}
        evaluation_details: dict[str, Any] = {}
        for metric_name, metric in self.metrics.items():
            scores, details = metric(
                questions=questions,
                responses=responses,
                golden_responses=golden_responses,
                retrieved_contexts=retrieved_contexts,
                golden_contexts=golden_contexts,
            )
            if log:
                for score_name, score in scores.items():
                    logger.info(f"{score_name}: {score*100:.{self.round}f}%")
            evaluation_results.update(scores)
            # Only the key of the details dict is stringified (to keep it JSON
            # serializable); the metric itself is taken from items(), so a
            # non-str key no longer breaks the lookup.
            evaluation_details[str(metric_name)] = details
        return evaluation_results, evaluation_details