Skip to content

BLEU

BLEU score metric for evaluating text similarity in RAG systems.

BLEU dataclass

Bases: Metric

BLEU score metric for comparing generated and reference answers.

This implementation uses the sacrebleu.corpus_bleu method to compute a BLEU score for each answer-reference pair.

Attributes:

Name Type Description
name str

The name of the metric.

Source code in ragbot\evaluation\metrics\bleu.py
(lines 10–53)
@dataclass
class BLEU(Metric):
    """BLEU score metric for comparing generated and reference answers.

    This implementation uses the `sacrebleu.corpus_bleu` method to compute
    a BLEU score for each answer-reference pair.

    Attributes:
        name (str): The name of the metric.
    """

    name: str = field(default="bleu", repr=True)
    # Columns the evaluation sample must provide for this metric.
    _required_columns: Set[str] = field(
        default_factory=lambda: {"answer", "reference_answer"}
    )

    def __post_init__(self):
        """Initialize BLEU metric and ensure `sacrebleu` is available.

        Raises:
            ImportError: If `sacrebleu` is not installed.
        """
        try:
            from sacrebleu import corpus_bleu
        except ImportError as e:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                f"{e.name} is required. Please install it with `pip install {e.name}`"
            ) from e
        self.corpus_bleu = corpus_bleu

    def score(self, sample: Sample, **kwargs: Any) -> float:
        """Compute BLEU score for a given sample.

        Args:
            sample (Sample): A sample containing `answer` and `reference_answer`.
            **kwargs: Optional keyword arguments (unused here).

        Returns:
            float: BLEU score as a float between 0 and 1.
        """
        reference, answer = sample.reference_answer, sample.answer
        # `corpus_bleu(hypotheses, ref_streams)` requires every reference
        # stream to contain exactly one entry per hypothesis.  Splitting both
        # strings on ". " and wrapping each reference sentence in its own
        # stream (the previous approach) breaks whenever the two texts do not
        # split into the same number of sentences.  Scoring the full strings
        # as a single hypothesis/reference pair is always well-formed.
        score = self.corpus_bleu([answer], [[reference]]).score / 100
        return score

__post_init__()

Initialize BLEU metric and ensure sacrebleu is available.

Source code in ragbot\evaluation\metrics\bleu.py
(lines 26–34)
def __post_init__(self):
    """Initialize BLEU metric and ensure `sacrebleu` is available.

    Raises:
        ImportError: If `sacrebleu` is not installed.
    """
    try:
        from sacrebleu import corpus_bleu
    except ImportError as e:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            f"{e.name} is required. Please install it with `pip install {e.name}`"
        ) from e
    self.corpus_bleu = corpus_bleu

score(sample, **kwargs)

Compute BLEU score for a given sample.

Parameters:

Name Type Description Default
sample Sample

A sample containing answer and reference_answer.

required
**kwargs Any

Optional keyword arguments (unused here).

{}

Returns:

Name Type Description
float float

BLEU score as a float between 0 and 1.

Source code in ragbot\evaluation\metrics\bleu.py
(lines 36–53)
def score(self, sample: Sample, **kwargs: Any) -> float:
    """Compute BLEU score for a given sample.

    Args:
        sample (Sample): A sample containing `answer` and `reference_answer`.
        **kwargs: Optional keyword arguments (unused here).

    Returns:
        float: BLEU score as a float between 0 and 1.
    """
    reference, answer = sample.reference_answer, sample.answer
    # `corpus_bleu(hypotheses, ref_streams)` requires every reference
    # stream to contain exactly one entry per hypothesis.  Splitting both
    # strings on ". " and wrapping each reference sentence in its own
    # stream (the previous approach) breaks whenever the two texts do not
    # split into the same number of sentences.  Scoring the full strings
    # as a single hypothesis/reference pair is always well-formed.
    score = self.corpus_bleu([answer], [[reference]]).score / 100
    return score