import sys
from typing import Callable, Optional, Sequence, TypeVar, Union

import nltk
import numpy as np
from fuzzywuzzy import fuzz
from rouge import Rouge

# increase recursion depth to ensure ROUGE can be calculated for long sentences
if sys.getrecursionlimit() < 10_000:
    sys.setrecursionlimit(10_000)


def bleu(gold: list[str], pred: list[str]) -> float:
    """Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.

    :param gold: list of gold tokens
    :param pred: list of predicted tokens
    :return: BLEU score
    """
    if len(pred) == 0 or len(gold) == 0:
        return 0.0
    return 100.0 * nltk.translate.bleu_score.sentence_bleu(
        [gold],
        pred,
        smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method2,
        auto_reweigh=True,
    )


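# Illustrative note on bleu() (not from the original source): it operates on token
# lists, so callers tokenize first, e.g. bleu(gold_sentence.split(), pred_sentence.split());
# identical token lists score 100.0 and an empty prediction scores 0.0.

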
def batch_bleu(golds: list[list[str]], preds: list[list[str]]) -> list[float]:
    """Calculate BLEU score for a batch of sentences.

    :param golds: list of gold sentences
    :param preds: list of predicted sentences
    :return: list of BLEU scores
    """
    if len(golds) != len(preds):
        raise ValueError('golds and preds must have the same length')
    return [bleu(gold, pred) for gold, pred in zip(golds, preds)]


def corpus_bleu(golds: list[list[str]], preds: list[list[str]]) -> float:
    """Calculate corpus-level BLEU score for a batch of sentences.

    :param golds: list of gold sentences
    :param preds: list of predicted sentences
    :return: corpus-level BLEU score
    """
    if len(golds) != len(preds):
        raise ValueError('golds and preds must have the same length')
    return 100.0 * nltk.translate.bleu_score.corpus_bleu(
        [[gold] for gold in golds],
        preds,
        smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method2,
        auto_reweigh=True,
    )


def edit_sim(
    gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
) -> float:
    """Calculate char-level edit similarity, in the range of 0~100.

    :param gold: gold sentence or list of gold tokens
    :param pred: predicted sentence or list of predicted tokens
    :param sep: separator between tokens
    :return: char-level edit similarity
    """
    if len(pred) == 0 or len(gold) == 0:
        return 0.0
    if isinstance(gold, list):
        gold = sep.join(gold)
    if isinstance(pred, list):
        pred = sep.join(pred)
    return fuzz.ratio(gold, pred)


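# Illustrative note on edit_sim() (not from the original source): fuzzywuzzy's
# fuzz.ratio returns an integer similarity in 0..100 based on edit distance, so
# edit_sim('abc', 'abc') == 100; token lists are joined with `sep` before
# comparison, e.g. edit_sim(['a', 'b'], ['a', 'c']) compares 'a b' against 'a c'.

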
def batch_edit_sim(
    golds: list[Union[str, list[str]]],
    preds: list[Union[str, list[str]]],
    sep: str = ' ',
) -> list[float]:
    """Calculate char-level edit similarity for a batch of sentences.

    :param golds: list of gold sentences
    :param preds: list of predicted sentences
    :param sep: separator between tokens
    :return: list of char-level edit similarities
    """
    if len(golds) != len(preds):
        raise ValueError('golds and preds must have the same length')
    return [edit_sim(gold, pred, sep) for gold, pred in zip(golds, preds)]


T = TypeVar('T')


def exact_match(gold: T, pred: T) -> float:
    """Calculate exact match accuracy, in the range of {0, 100}.

    :param gold: gold sentence or list of gold tokens
    :param pred: predicted sentence or list of predicted tokens
    :return: exact match accuracy
    """
    if len(pred) == 0 or len(gold) == 0:
        return 0.0
    return 100.0 if gold == pred else 0.0


def batch_exact_match(golds: list[T], preds: list[T]) -> list[float]:
    """Calculate exact match accuracy for a batch of sentences.

    :param golds: list of gold sentences
    :param preds: list of predicted sentences
    :return: list of exact match accuracies
    """
    if len(golds) != len(preds):
        raise ValueError('golds and preds must have the same length')
    return [exact_match(gold, pred) for gold, pred in zip(golds, preds)]


def rouge_l(
    gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
) -> dict[str, float]:
    """Calculate ROUGE-L F1, precision, and recall scores, in the range of 0~100.

    :param gold: gold sentence or list of gold tokens
    :param pred: predicted sentence or list of predicted tokens
    :param sep: separator between tokens
    :return: {"p": precision, "r": recall, "f": F1}
    """
    if len(pred) == 0 or len(gold) == 0:
        return {'p': 0.0, 'r': 0.0, 'f': 0.0}
    if isinstance(gold, list):
        gold = sep.join(gold)
    if isinstance(pred, list):
        pred = sep.join(pred)
    try:
        rouge = Rouge()
        scores = rouge.get_scores(hyps=pred, refs=gold, avg=True)
        return {x: scores['rouge-l'][x] * 100.0 for x in ['p', 'r', 'f']}
    except ValueError:
        return {'p': 0.0, 'r': 0.0, 'f': 0.0}


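# Illustrative note on rouge_l() (not from the original source): the rouge package
# raises ValueError for inputs it cannot score (e.g. a whitespace-only hypothesis
# that slips past the length check), which rouge_l maps to all-zero scores.

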
def batch_rouge_l(
    golds: list[Union[str, list[str]]],
    preds: list[Union[str, list[str]]],
    sep: str = ' ',
) -> dict[str, list[float]]:
    """Calculate ROUGE-L F1, precision, and recall scores for a batch of sentences.

    :param golds: list of gold sentences
    :param preds: list of predicted sentences
    :param sep: separator between tokens
    :return: {"p": list of precisions, "r": list of recalls, "f": list of F1 scores}
    """
    if len(golds) != len(preds):
        raise ValueError('golds and preds must have the same length')
    scores = [rouge_l(gold, pred, sep) for gold, pred in zip(golds, preds)]
    return {x: [score[x] for score in scores] for x in ['p', 'r', 'f']}


def accuracy(
    gold: list[str],
    pred: list[str],
    ignore: Optional[Sequence[str]] = None,
) -> float:
    """Calculate token-level accuracy, in the range of 0~100.

    If gold and pred are not the same length, the longer one is truncated.

    :param gold: list of gold tokens
    :param pred: list of predicted tokens
    :param ignore: list of (gold) tokens to ignore
    :return: accuracy
    """
    if len(pred) == 0 or len(gold) == 0:
        return 0.0
    if ignore is None:
        ignore = []
    i = 0
    total = 0
    match = 0
    while i < len(gold) and i < len(pred):
        if gold[i] in ignore:
            i += 1
            continue
        total += 1
        if gold[i] == pred[i]:
            match += 1
        i += 1

    if total == 0:
        return 0.0
    return 100.0 * match / total


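# Illustrative example for accuracy() (not from the original source): tokens are
# compared position by position and gold tokens in `ignore` are skipped, so
#   accuracy(['a', '<pad>', 'b'], ['a', 'x', 'c'], ignore=['<pad>'])
# compares positions 0 and 2 only (one match out of two) and returns 50.0;
# the '<pad>' token is a hypothetical placeholder.

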
def batch_accuracy(
    golds: list[list[str]],
    preds: list[list[str]],
    ignore: Optional[Sequence[str]] = None,
) -> list[float]:
    """Calculate token-level accuracy for a batch of sentences.

    :param golds: list of gold sentences
    :param preds: list of predicted sentences
    :param ignore: list of (gold) tokens to ignore
    :return: list of accuracies
    """
    if len(golds) != len(preds):
        raise ValueError('golds and preds must have the same length')
    return [accuracy(gold, pred, ignore) for gold, pred in zip(golds, preds)]


def first_match_to_topk(
    first_match_list: list[int], k_values: list[int]
) -> dict[int, list[float]]:
    """Calculate top-k accuracy with the first match ranks (1-indexed).

    :param first_match_list: first match ranks (1-indexed)
    :param k_values: k values to consider
    :return: a mapping from k to top-k accuracies (ranging from 0~100)
    """
    return {k: [100.0 if x <= k else 0.0 for x in first_match_list] for k in k_values}


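# Illustrative example for first_match_to_topk() (not from the original source):
#   first_match_to_topk([1, 3, 2], [1, 3])
# returns {1: [100.0, 0.0, 0.0], 3: [100.0, 100.0, 100.0]}; averaging each list
# gives the usual top-1 and top-3 accuracies.

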
def pass_at_k(n: int, c: int, k: int) -> float:
    """Sample pass@k metric according to the Codex paper, but in the range of 0~100.

    :param n: total number of samples
    :param c: number of correct samples
    :param k: k in pass@$k$
    :return: pass@k score
    """
    if n < k or (n - c) < k:
        # fallback to the (1 - (1-p)^k) formula
        return (1 - (1 - (c / n)) ** k) * 100
    else:
        return (1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)).item()) * 100


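# Illustrative example for pass_at_k() (not from the original source): the main
# branch is the Codex unbiased estimator 1 - C(n-c, k) / C(n, k) in product form,
# so pass_at_k(n=10, c=4, k=1) = (1 - 6/10) * 100 = 40.0; when n < k or n - c < k,
# the code instead falls back to the simpler (1 - (1 - c/n)^k) formula.

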
def self_bleu(samples: list[list[str]]) -> float:
    """Calculate self-BLEU among the samples.

    :param samples: the chosen m samples
    :return: self-BLEU
    """
    if len(samples) == 0:
        return 100.0

    scores = []
    for i in range(len(samples)):
        scores.append(
            100.0
            * nltk.translate.bleu_score.sentence_bleu(
                [samples[j] for j in range(len(samples)) if j != i],
                samples[i],
                smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method2,
                auto_reweigh=True,
            )
        )
    return np.mean(scores).item()


def self_edit_distance(samples: list[Union[str, list[str]]], sep=' ') -> float:
    """Calculate self-edit-distance among the samples.

    :param samples: the chosen m samples
    :param sep: the separator between tokens
    :return: self-edit-distance
    """
    if len(samples) == 0:
        return 0.0

    scores = []
    for i in range(len(samples)):
        sample_i = samples[i]
        if not isinstance(sample_i, str):
            sample_i = sep.join(sample_i)
        for j in range(len(samples)):
            if i == j:
                continue
            sample_j = samples[j]
            if not isinstance(sample_j, str):
                sample_j = sep.join(sample_j)

            scores.append(100 - fuzz.ratio(sample_i, sample_j))
    return np.mean(scores).item()


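# Illustrative note (not from the original source): self_bleu and self_edit_distance
# are typically read as diversity measures over the m samples; higher self-BLEU
# (and lower self-edit-distance) means the samples are more similar to one another,
# i.e. less diverse.

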
QUALITY_METRICS: dict[str, Callable[[list[str], list[str]], float]] = {
    'bleu': bleu,
    'xmatch': exact_match,
    'edit-sim': edit_sim,
    'rouge-f': lambda g, p: rouge_l(g, p)['f'],
    'rouge-p': lambda g, p: rouge_l(g, p)['p'],
    'rouge-r': lambda g, p: rouge_l(g, p)['r'],
}
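

# A minimal usage sketch (illustrative, not part of the original module): every
# QUALITY_METRICS entry takes a gold token list and a predicted token list and
# returns a score in 0~100. The example tokens below are hypothetical.
if __name__ == '__main__':
    _gold = ['return', 'x', '+', 'y']
    _pred = ['return', 'x', '+', 'z']
    for _name, _metric in QUALITY_METRICS.items():
        print(f'{_name}: {_metric(_gold, _pred):.1f}')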