Source code for flexrag.metrics.evaluator
from flexrag.utils import LOGGER_MANAGER, configure
from flexrag.utils.dataclasses import RetrievedContext
from .metrics_base import METRICS, MetricsBase
# Logger used by this module, obtained under the "flexrag.metrics" name.
logger = LOGGER_MANAGER.get_logger("flexrag.metrics")
# Base configuration class generated from the METRICS registry.
# NOTE(review): ``allow_multiple=True`` presumably lets one config select
# several metrics at once (Evaluator iterates ``cfg.metrics_type``) --
# confirm against ``make_config``.
MetricConfig = METRICS.make_config(allow_multiple=True)
[docs]
@configure
class EvaluatorConfig(MetricConfig):
    """Configuration for :class:`Evaluator`.

    Extends ``MetricConfig`` with a single display option.

    :param round: Number of decimal places used when the evaluator logs a
        score as a percentage. Defaults to 2.
    :type round: int
    """

    # Decimal places for logged percentages; read once in Evaluator.__init__.
    round: int = 2
[docs]
class Evaluator:
    """Run every configured metric over one batch of inputs.

    The evaluator holds a mapping from metric name to metric callable and
    forwards the same keyword arguments to each of them.
    """

    def __init__(self, cfg: EvaluatorConfig) -> None:
        """Build the metric table from the configuration.

        :param cfg: The evaluator configuration. ``cfg.metrics_type`` supplies
            the metric names and ``cfg.round`` the logging precision.
        :type cfg: EvaluatorConfig
        """
        # Names and loaded metrics are paired by position.
        self.metrics: dict[str, MetricsBase] = dict(
            zip(cfg.metrics_type, METRICS.load(cfg))
        )
        self.round = cfg.round

    def evaluate(
        self,
        *,
        questions: list[str] | None = None,
        responses: list[str] | None = None,
        golden_responses: list[list[str]] | None = None,
        retrieved_contexts: list[list[str | RetrievedContext]] | None = None,
        golden_contexts: list[list[str]] | None = None,
        log: bool = True,
    ):
        """Evaluate the generated responses against the ground truth responses.

        Every configured metric is called with all five inputs as keyword
        arguments and must return a ``(scores, details)`` pair. The score
        dicts are merged into one result dict (a later metric overwrites an
        earlier one on a key clash); the details are stored per metric name.

        :param questions: A list of questions. Defaults to None.
        :param responses: A list of responses. Defaults to None.
        :param golden_responses: A list of golden responses. Defaults to None.
        :param retrieved_contexts: A list of retrieved contexts. Defaults to None.
        :param golden_contexts: A list of golden contexts. Defaults to None.
        :param log: Whether to log the evaluation results. Defaults to True.
        :type questions: list[str], optional
        :type responses: list[str], optional
        :type golden_responses: list[list[str]], optional
        :type retrieved_contexts: list[list[str | RetrievedContext]], optional
        :type golden_contexts: list[list[str]], optional
        :type log: bool, optional
        :raises AssertionError: If fewer than two inputs are provided, or the
            provided inputs differ in length.
        :return: The evaluation results and the evaluation details.
        :rtype: tuple[dict[str, float], dict[str, Any]]
        """
        # check the input arguments
        provided = [
            arg
            for arg in (
                questions,
                responses,
                golden_responses,
                retrieved_contexts,
                golden_contexts,
            )
            if arg is not None
        ]
        # NOTE(review): the message used to say "At least one" while the check
        # has always required two inputs; the message now matches the check.
        # Confirm that two is the intended minimum.
        assert len(provided) > 1, "At least two arguments must be provided."
        expected_len = len(provided[0])
        assert all(
            len(arg) == expected_len for arg in provided
        ), "All arguments must have the same length."

        # evaluate
        evaluation_results = {}
        evaluation_details = {}
        # Iterate over items so the metric is fetched with its real key; only
        # the key written to the details dict is stringified.
        for name, metric in self.metrics.items():
            r, r_detail = metric(
                questions=questions,
                responses=responses,
                golden_responses=golden_responses,
                retrieved_contexts=retrieved_contexts,
                golden_contexts=golden_contexts,
            )
            if log:
                for score_name, score in r.items():
                    # Scores are logged as percentages with ``self.round`` decimals.
                    logger.info(f"{score_name}: {score*100:.{self.round}f}%")
            evaluation_results.update(r)
            evaluation_details[str(name)] = r_detail  # make json serializable
        return evaluation_results, evaluation_details