valor_lite.text_generation.Context dataclass

Contextual ground truth and prediction.

Attributes:

groundtruth : list[str]
    The definitive context.
prediction : list[str]
    Any retrieved context from a retrieval-augmented-generation (RAG) pipeline.

Examples:

>>> context = Context(
...     groundtruth=[...],
...     prediction=[...],
... )
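
A more concrete sketch, with illustrative passage strings that are not part of the library:

>>> context = Context(
...     groundtruth=["George Washington was born on February 22, 1732."],
...     prediction=[
...         "George Washington was born on February 22, 1732.",
...         "Washington was the first president of the United States.",
...     ],
... )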

Source code in valor_lite/text_generation/annotation.py
@dataclass
class Context:
    """
    Contextual ground truth and prediction.

    Attributes
    ----------
    groundtruth : list[str]
        The definitive context.
    prediction : list[str]
        Any retrieved context from a retrieval-augmented-generation (RAG) pipeline.

    Examples
    --------
    >>> context = Context(
    ...     groundtruth=[...],
    ...     prediction=[...],
    ... )
    """

    groundtruth: list[str] = field(default_factory=list)
    prediction: list[str] = field(default_factory=list)

valor_lite.text_generation.QueryResponse dataclass

Text generation data structure containing ground truths and predictions.

Attributes:

query : str
    The user query.
response : str
    The language model's response.
context : Context
    Any context provided to the model.

Examples:

>>> query = QueryResponse(
...     query='When was George Washington born?',
...     response="February 22, 1732",
...     context=Context(
...         groundtruth=["02/22/1732"],
...         prediction=["02/22/1732"],
...     ),
... )
Source code in valor_lite/text_generation/annotation.py
@dataclass
class QueryResponse:
    """
    Text generation data structure containing ground truths and predictions.

    Attributes
    ----------
    query : str
        The user query.
    response : str
        The language model's response.
    context : Context
        Any context provided to the model.

    Examples
    --------
    >>> query = QueryResponse(
    ...     query='When was George Washington born?',
    ...     response="February 22, 1732",
    ...     context=Context(
    ...         groundtruth=["02/22/1732"],
    ...         prediction=["02/22/1732"],
    ...     ),
    ... )
    """

    query: str
    response: str
    context: Context | None = field(default=None)

valor_lite.text_generation.Evaluator

Parent class for all LLM clients.

Attributes:

client : ClientWrapper, optional
    An optional client used to compute LLM-guided metrics.
retries : int
    The number of times to retry the API call if it fails. Defaults to 0, indicating that the call will not be retried.
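
Example:

A hedged end-to-end sketch: build an evaluator from the OpenAI client, wrap a query/response pair, and compute an LLM-guided metric. The model name and strings below are placeholders, and the OPENAI_API_KEY environment variable is assumed to be set.

>>> from valor_lite.text_generation import Context, Evaluator, QueryResponse
>>> evaluator = Evaluator.openai(model_name="gpt-3.5-turbo")
>>> response = QueryResponse(
...     query="When was George Washington born?",
...     response="February 22, 1732",
...     context=Context(
...         groundtruth=["02/22/1732"],
...         prediction=["02/22/1732"],
...     ),
... )
>>> metric = evaluator.compute_answer_correctness(response)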

Source code in valor_lite/text_generation/manager.py
class Evaluator:
    """
    Parent class for all LLM clients.

    Attributes
    ----------
    client : ClientWrapper, optional
        An optional client used to compute LLM-guided metrics.
    retries : int
        The number of times to retry the API call if it fails. Defaults to 0, indicating
        that the call will not be retried.
    """

    def __init__(
        self,
        client: ClientWrapper | None = None,
        retries: int = 0,
        default_system_prompt: str = "You are a helpful assistant.",
    ):
        """
        Creates an instance of a generic LLM client.

        Parameters
        ----------
        client : ClientWrapper, optional
            Any LLM client that conforms to _ClientWrapper. Required for LLM-guided metrics.
        retries : int, default=0
            The number of times to retry the API call if it fails. Defaults to 0, indicating
            that the call will not be retried.
        default_system_prompt : str, default="You are a helpful assistant."
            The default system prompt that is given to the evaluating LLM.
        """

        self.client = client
        self.retries = retries
        self.default_system_prompt = default_system_prompt

    @classmethod
    def openai(
        cls,
        model_name: str = "gpt-3.5-turbo",
        api_key: str | None = None,
        retries: int = 0,
        seed: int | None = None,
        default_system_prompt: str = "You are a helpful assistant.",
    ):
        """
        Create an evaluator using OpenAI's client.

        Parameters
        ----------
        model_name : str, default="gpt-3.5-turbo"
            The model to use. Defaults to "gpt-3.5-turbo".
        api_key : str, optional
            The OpenAI API key to use. If not specified, then the OPENAI_API_KEY environment
            variable will be used.
        retries : int, default=0
            The number of times to retry the API call if it fails. Defaults to 0, indicating
            that the call will not be retried. For example, if self.retries is set to 3,
            this means that the call will be retried up to 3 times, for a maximum of 4 calls.
        seed : int, optional
            An optional seed can be provided to GPT to get deterministic results.
        default_system_prompt : str, default="You are a helpful assistant."
            The default system prompt that is given to the evaluating LLM.
        """
        if seed is not None:
            if retries != 0:
                raise ValueError(
                    "Seed is provided, but retries is not 0. Retries should be 0 when seed is provided."
                )
        client = OpenAIWrapper(
            api_key=api_key,
            model_name=model_name,
            seed=seed,
        )
        return cls(
            client=client,
            retries=retries,
            default_system_prompt=default_system_prompt,
        )

    @classmethod
    def mistral(
        cls,
        model_name: str = "mistral-small-latest",
        api_key: str | None = None,
        retries: int = 0,
        default_system_prompt: str = "You are a helpful assistant.",
    ):
        """
        Create an evaluator using the Mistral API.

        Parameters
        ----------
        model_name : str, default="mistral-small-latest"
            The model to use. Defaults to "mistral-small-latest".
        api_key : str, optional
            The Mistral API key to use. If not specified, then the MISTRAL_API_KEY environment
            variable will be used.
        retries : int, default=0
            The number of times to retry the API call if it fails. Defaults to 0, indicating
            that the call will not be retried. For example, if self.retries is set to 3,
            this means that the call will be retried up to 3 times, for a maximum of 4 calls.
        default_system_prompt : str, default="You are a helpful assistant."
            The default system prompt that is given to the evaluating LLM.
        """
        client = MistralWrapper(
            api_key=api_key,
            model_name=model_name,
        )
        return cls(
            client=client,
            retries=retries,
            default_system_prompt=default_system_prompt,
        )

    @llm_guided_metric
    def compute_answer_correctness(
        self,
        response: QueryResponse,
    ) -> Metric:
        """
        Compute answer correctness. Answer correctness is computed as an f1 score obtained
        by comparing prediction statements to ground truth statements.

        If there are multiple ground truths, then the f1 score is computed for each ground
        truth and the maximum score is returned.

        This metric was adapted from RAGAS. We follow a similar prompting strategy and
        computation; however, we do not take a weighted sum with an embedding-based
        answer similarity score.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.

        Returns
        -------
        Metric
            The answer correctness score between 0 and 1. Higher values indicate that the
            answer is more correct. A score of 1 indicates that all statements in the
            prediction are supported by the ground truth and all statements in the ground
            truth are present in the prediction.
        """
        if not response.context:
            raise ValueError("The answer correctness metric requires context.")

        result = calculate_answer_correctness(
            client=self.client,  # type: ignore - wrapper handles None case
            system_prompt=self.default_system_prompt,
            query=response.query,
            response=response.response,
            groundtruths=response.context.groundtruth,
        )
        return Metric.answer_correctness(
            value=result,
            model_name=self.client.model_name,  # type: ignore - wrapper handles None case
            retries=self.retries,
        )

    @llm_guided_metric
    def compute_answer_relevance(self, response: QueryResponse) -> Metric:
        """
        Compute answer relevance, the proportion of the model response that is
        relevant to the query, for a single piece of text.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.

        Returns
        -------
        Metric
            The answer relevance score between 0 and 1. A score of 1 indicates that all
            statements are relevant to the query.
        """
        result = calculate_answer_relevance(
            client=self.client,  # type: ignore - wrapper handles None case
            system_prompt=self.default_system_prompt,
            query=response.query,
            response=response.response,
        )
        return Metric.answer_relevance(
            value=result,
            model_name=self.client.model_name,  # type: ignore - wrapper handles None case
            retries=self.retries,
        )

    @llm_guided_metric
    def compute_bias(
        self,
        response: QueryResponse,
    ) -> Metric:
        """
        Compute bias, the proportion of model opinions that are biased.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.

        Returns
        -------
        Metric
            The bias score between 0 and 1. A score of 1 indicates that all opinions in
            the text are biased.
        """
        result = calculate_bias(
            client=self.client,  # type: ignore - wrapper handles None case
            system_prompt=self.default_system_prompt,
            response=response.response,
        )
        return Metric.bias(
            value=result,
            model_name=self.client.model_name,  # type: ignore - wrapper handles None case
            retries=self.retries,
        )

    @llm_guided_metric
    def compute_context_precision(
        self,
        response: QueryResponse,
    ) -> Metric:
        """
        Compute context precision, a score for evaluating the retrieval
        mechanism of a RAG model.

        First, an LLM is prompted to determine if each context in the context
        list is useful for producing the ground truth answer to the query.

        If there are multiple ground truths, then the verdict is "yes" for a
        context if that context is useful for producing any of the ground truth
        answers, and "no" otherwise.

        Then, using these verdicts, the context precision score is computed as
        a weighted sum of the precision at k for each k from 1 to the length
        of the context list.

        Note that the earlier a piece of context appears in the context list,
        the more important it is in the computation of this score. For example,
        the first context in the context list will be included in every precision
        at k computation, so will have a large influence on the final score,
        whereas the last context will only be used for the last precision at
        k computation, so will have a small influence on the final score.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.

        Returns
        -------
        Metric
            The context precision score between 0 and 1. A higher score indicates
            better context precision.
        """
        if not response.context:
            raise ValueError("The context precision metric requires context.")

        result = calculate_context_precision(
            client=self.client,  # type: ignore - wrapper handles None case
            system_prompt=self.default_system_prompt,
            query=response.query,
            predicted_context=response.context.prediction,
            groundtruth_context=response.context.groundtruth,
        )
        return Metric.context_precision(
            value=result,
            model_name=self.client.model_name,  # type: ignore - wrapper handles None case
            retries=self.retries,
        )

    @llm_guided_metric
    def compute_context_recall(
        self,
        response: QueryResponse,
    ) -> Metric:
        """
        Compute context recall, a score for evaluating the retrieval mechanism of a RAG model.

        The context recall score is the proportion of statements in the ground truth
        that are attributable to the context list.

        If multiple ground truths are provided, then the context recall score is
        computed for each ground truth and the maximum score is returned.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.

        Returns
        -------
        Metric
            The context recall score between 0 and 1. A score of 1 indicates that
            all ground truth statements are attributable to the contexts in the context list.
        """
        if not response.context:
            raise ValueError("The context recall metric requires context.")

        result = calculate_context_recall(
            client=self.client,  # type: ignore - wrapper handles None case
            system_prompt=self.default_system_prompt,
            predicted_context=response.context.prediction,
            groundtruth_context=response.context.groundtruth,
        )
        return Metric.context_recall(
            value=result,
            model_name=self.client.model_name,  # type: ignore - wrapper handles None case
            retries=self.retries,
        )

    @llm_guided_metric
    def compute_context_relevance(
        self,
        response: QueryResponse,
    ) -> Metric:
        """
        Compute context relevance, the proportion of contexts in the context list
        that are relevant to the query.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.

        Returns
        -------
        Metric
            The context relevance score between 0 and 1. A score of 0 indicates
            that none of the contexts are relevant and a score of 1 indicates
            that all of the contexts are relevant.
        """
        if not response.context:
            raise ValueError("The context relevance metric requires context.")

        result = calculate_context_relevance(
            client=self.client,  # type: ignore - wrapper handles None case
            system_prompt=self.default_system_prompt,
            query=response.query,
            context=response.context.prediction,
        )
        return Metric.context_relevance(
            value=result,
            model_name=self.client.model_name,  # type: ignore - wrapper handles None case
            retries=self.retries,
        )

    @llm_guided_metric
    def compute_faithfulness(
        self,
        response: QueryResponse,
    ) -> Metric:
        """
        Compute the faithfulness score. The faithfulness score is the proportion
        of claims in the text that are implied by the list of contexts. Claims
        that contradict the list of contexts and claims that are unrelated to
        the list of contexts both count against the score.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.

        Returns
        -------
        Metric
            The faithfulness score between 0 and 1. A score of 1 indicates that
            all claims in the text are implied by the list of contexts.
        """

        if not response.context:
            raise ValueError("The faithfulness metric requires context.")

        result = calculate_faithfulness(
            client=self.client,  # type: ignore - wrapper handles None case
            system_prompt=self.default_system_prompt,
            response=response.response,
            context=response.context.prediction,
        )
        return Metric.faithfulness(
            value=result,
            model_name=self.client.model_name,  # type: ignore - wrapper handles None case
            retries=self.retries,
        )

    @llm_guided_metric
    def compute_hallucination(
        self,
        response: QueryResponse,
    ) -> Metric:
        """
        Compute the hallucination score, the proportion of contexts in the context
        list that are contradicted by the text.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.

        Returns
        -------
        Metric
            The hallucination score between 0 and 1. A score of 1 indicates that
            all contexts are contradicted by the text.
        """

        if not response.context:
            raise ValueError("The hallucination metric requires context.")

        result = calculate_hallucination(
            client=self.client,  # type: ignore - wrapper handles None case
            system_prompt=self.default_system_prompt,
            response=response.response,
            context=response.context.prediction,
        )
        return Metric.hallucination(
            value=result,
            model_name=self.client.model_name,  # type: ignore - wrapper handles None case
            retries=self.retries,
        )

    @llm_guided_metric
    def compute_summary_coherence(
        self,
        response: QueryResponse,
    ) -> Metric:
        """
        Compute summary coherence, the collective quality of a summary.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.

        Returns
        -------
        Metric
            The summary coherence score between 1 and 5. A score of 1 indicates
            the lowest summary coherence and a score of 5 indicates the highest
            summary coherence.
        """
        result = calculate_summary_coherence(
            client=self.client,  # type: ignore - wrapper handles None case
            system_prompt=self.default_system_prompt,
            text=response.query,
            summary=response.response,
        )
        return Metric.summary_coherence(
            value=result,
            model_name=self.client.model_name,  # type: ignore - wrapper handles None case
            retries=self.retries,
        )

    @llm_guided_metric
    def compute_toxicity(
        self,
        response: QueryResponse,
    ) -> Metric:
        """
        Compute toxicity, the proportion of opinions that are toxic.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.

        Returns
        -------
        Metric
            The toxicity score between 0 and 1. A score of 1 indicates that all
            opinions in the text are toxic.
        """
        result = calculate_toxicity(
            client=self.client,  # type: ignore - wrapper handles None case
            system_prompt=self.default_system_prompt,
            response=response.response,
        )
        return Metric.toxicity(
            value=result,
            model_name=self.client.model_name,  # type: ignore - wrapper handles None case
            retries=self.retries,
        )

    @staticmethod
    def compute_rouge(
        response: QueryResponse,
        rouge_types: list[str] = [
            "rouge1",
            "rouge2",
            "rougeL",
            "rougeLsum",
        ],
        use_stemmer: bool = False,
    ) -> list[Metric]:
        """
        Calculate ROUGE scores for a model response given some set of references.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.
        rouge_types : list[str], optional
            A list of rouge types to calculate.
            Defaults to ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'].
        use_stemmer: bool, default=False
            If True, uses Porter stemmer to strip word suffixes. Defaults to False.

        Returns
        -------
        list[Metric]
        """

        if not response.context:
            raise ValueError("ROUGE metrics require context.")

        results = calculate_rouge_scores(
            prediction=response.response,
            references=response.context.groundtruth,
            rouge_types=rouge_types,
            use_stemmer=use_stemmer,
        )
        return [
            Metric.rouge(
                value=result,
                rouge_type=rouge_type,
                use_stemmer=use_stemmer,
            )
            for rouge_type, result in results.items()
        ]

    @staticmethod
    def compute_sentence_bleu(
        response: QueryResponse,
        weights: list[float] = [0.25, 0.25, 0.25, 0.25],
    ) -> Metric:
        """
        Calculate the sentence BLEU score for a model response given a set of references.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.
        weights: list[float], default=[0.25, 0.25, 0.25, 0.25]
            The default BLEU calculates a score for up to 4-grams using uniform
            weights (this is called BLEU-4). To evaluate your translations with
            higher/lower order ngrams, use customized weights. Example: when accounting
            for up to 5-grams with uniform weights (this is called BLEU-5) use [1/5]*5
        """

        if not response.context:
            raise ValueError("The sentence BLEU metric requires context.")

        result = calculate_sentence_bleu(
            prediction=response.response,
            references=response.context.groundtruth,
            weights=weights,
        )
        return Metric.bleu(
            value=result,
            weights=weights,
        )

    def compute_all(
        self,
        response: QueryResponse,
        bleu_weights: list[float] = [0.25, 0.25, 0.25, 0.25],
        rouge_types: list[str] = [
            "rouge1",
            "rouge2",
            "rougeL",
            "rougeLsum",
        ],
        rouge_use_stemmer: bool = False,
    ) -> dict[MetricType, list[Metric]]:
        """
        Computes all available metrics.

        Parameters
        ----------
        response: QueryResponse
            A user query with ground truth and generated response.
        bleu_weights: list[float], default=[0.25, 0.25, 0.25, 0.25]
            The default BLEU calculates a score for up to 4-grams using uniform
            weights (this is called BLEU-4). To evaluate your translations with
            higher/lower order ngrams, use customized weights. Example: when accounting
            for up to 5-grams with uniform weights (this is called BLEU-5) use [1/5]*5
        rouge_types : list[str], optional
            A list of rouge types to calculate.
            Defaults to ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'].
        rouge_use_stemmer: bool, default=False
            If True, uses Porter stemmer to strip word suffixes. Defaults to False.
        """
        results = dict()
        results[MetricType.AnswerCorrectness] = [
            self.compute_answer_correctness(response)
        ]
        results[MetricType.AnswerRelevance] = [
            self.compute_answer_relevance(response)
        ]
        results[MetricType.Bias] = [self.compute_bias(response)]
        results[MetricType.ContextPrecision] = [
            self.compute_context_precision(response)
        ]
        results[MetricType.ContextRecall] = [
            self.compute_context_recall(response)
        ]
        results[MetricType.ContextRelevance] = [
            self.compute_context_relevance(response)
        ]
        results[MetricType.Faithfulness] = [
            self.compute_faithfulness(response)
        ]
        results[MetricType.Hallucination] = [
            self.compute_hallucination(response)
        ]
        results[MetricType.SummaryCoherence] = [
            self.compute_summary_coherence(response)
        ]
        results[MetricType.Toxicity] = [self.compute_toxicity(response)]
        results[MetricType.ROUGE] = self.compute_rouge(
            response=response,
            rouge_types=rouge_types,
            use_stemmer=rouge_use_stemmer,
        )
        results[MetricType.BLEU] = [
            self.compute_sentence_bleu(response=response, weights=bleu_weights)
        ]
        return results

__init__(client=None, retries=0, default_system_prompt='You are a helpful assistant.')

Creates an instance of a generic LLM client.

Parameters:

client : ClientWrapper, optional
    Any LLM client that conforms to _ClientWrapper. Required for LLM-guided metrics.
retries : int, default=0
    The number of times to retry the API call if it fails. Defaults to 0, indicating that the call will not be retried.
default_system_prompt : str, default="You are a helpful assistant."
    The default system prompt that is given to the evaluating LLM.
Source code in valor_lite/text_generation/manager.py
def __init__(
    self,
    client: ClientWrapper | None = None,
    retries: int = 0,
    default_system_prompt: str = "You are a helpful assistant.",
):
    """
    Creates an instance of a generic LLM client.

    Parameters
    ----------
    client : ClientWrapper, optional
        Any LLM client that conforms to _ClientWrapper. Required for LLM-guided metrics.
    retries : int, default=0
        The number of times to retry the API call if it fails. Defaults to 0, indicating
        that the call will not be retried.
    default_system_prompt : str, default="You are a helpful assistant."
        The default system prompt that is given to the evaluating LLM.
    """

    self.client = client
    self.retries = retries
    self.default_system_prompt = default_system_prompt

compute_all(response, bleu_weights=[0.25, 0.25, 0.25, 0.25], rouge_types=['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], rouge_use_stemmer=False)

Computes all available metrics.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.
bleu_weights : list[float], default=[0.25, 0.25, 0.25, 0.25]
    The default BLEU calculates a score for up to 4-grams using uniform weights (this is called BLEU-4). To evaluate with higher or lower order n-grams, pass customized weights; for example, to account for up to 5-grams with uniform weights (BLEU-5), use [1/5]*5.
rouge_types : list[str], default=['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
    A list of ROUGE types to calculate.
rouge_use_stemmer : bool, default=False
    If True, uses the Porter stemmer to strip word suffixes.
Source code in valor_lite/text_generation/manager.py
def compute_all(
    self,
    response: QueryResponse,
    bleu_weights: list[float] = [0.25, 0.25, 0.25, 0.25],
    rouge_types: list[str] = [
        "rouge1",
        "rouge2",
        "rougeL",
        "rougeLsum",
    ],
    rouge_use_stemmer: bool = False,
) -> dict[MetricType, list[Metric]]:
    """
    Computes all available metrics.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.
    bleu_weights: list[float], default=[0.25, 0.25, 0.25, 0.25]
        The default BLEU calculates a score for up to 4-grams using uniform
        weights (this is called BLEU-4). To evaluate your translations with
        higher/lower order ngrams, use customized weights. Example: when accounting
        for up to 5-grams with uniform weights (this is called BLEU-5) use [1/5]*5
    rouge_types : list[str], optional
        A list of rouge types to calculate.
        Defaults to ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'].
    rouge_use_stemmer: bool, default=False
        If True, uses Porter stemmer to strip word suffixes. Defaults to False.
    """
    results = dict()
    results[MetricType.AnswerCorrectness] = [
        self.compute_answer_correctness(response)
    ]
    results[MetricType.AnswerRelevance] = [
        self.compute_answer_relevance(response)
    ]
    results[MetricType.Bias] = [self.compute_bias(response)]
    results[MetricType.ContextPrecision] = [
        self.compute_context_precision(response)
    ]
    results[MetricType.ContextRecall] = [
        self.compute_context_recall(response)
    ]
    results[MetricType.ContextRelevance] = [
        self.compute_context_relevance(response)
    ]
    results[MetricType.Faithfulness] = [
        self.compute_faithfulness(response)
    ]
    results[MetricType.Hallucination] = [
        self.compute_hallucination(response)
    ]
    results[MetricType.SummaryCoherence] = [
        self.compute_summary_coherence(response)
    ]
    results[MetricType.Toxicity] = [self.compute_toxicity(response)]
    results[MetricType.ROUGE] = self.compute_rouge(
        response=response,
        rouge_types=rouge_types,
        use_stemmer=rouge_use_stemmer,
    )
    results[MetricType.BLEU] = [
        self.compute_sentence_bleu(response=response, weights=bleu_weights)
    ]
    return results
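
For example, a sketch of running the full suite and walking the results dictionary (keys are MetricType members, values are lists of Metric objects). This assumes the evaluator was built with a client and that the QueryResponse carries a Context, since the suite includes both LLM-guided and reference-based metrics.

>>> results = evaluator.compute_all(response)
>>> for metric_type, metrics in results.items():
...     for metric in metrics:
...         print(metric_type, metric)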

compute_answer_correctness(response)

Compute answer correctness. Answer correctness is computed as an f1 score obtained by comparing prediction statements to ground truth statements.

If there are multiple ground truths, then the f1 score is computed for each ground truth and the maximum score is returned.

This metric was adapted from RAGAS. We follow a similar prompting strategy and computation; however, we do not take a weighted sum with an embedding-based answer similarity score.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.

Returns:

Metric
    The answer correctness score between 0 and 1. Higher values indicate that the answer is more correct. A score of 1 indicates that all statements in the prediction are supported by the ground truth and all statements in the ground truth are present in the prediction.

Source code in valor_lite/text_generation/manager.py
@llm_guided_metric
def compute_answer_correctness(
    self,
    response: QueryResponse,
) -> Metric:
    """
    Compute answer correctness. Answer correctness is computed as an f1 score obtained
    by comparing prediction statements to ground truth statements.

    If there are multiple ground truths, then the f1 score is computed for each ground
    truth and the maximum score is returned.

    This metric was adapted from RAGAS. We follow a similar prompting strategy and
    computation; however, we do not take a weighted sum with an embedding-based
    answer similarity score.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.

    Returns
    -------
    Metric
        The answer correctness score between 0 and 1. Higher values indicate that the
        answer is more correct. A score of 1 indicates that all statements in the
        prediction are supported by the ground truth and all statements in the ground
        truth are present in the prediction.
    """
    if not response.context:
        raise ValueError("The answer correctness metric requires context.")

    result = calculate_answer_correctness(
        client=self.client,  # type: ignore - wrapper handles None case
        system_prompt=self.default_system_prompt,
        query=response.query,
        response=response.response,
        groundtruths=response.context.groundtruth,
    )
    return Metric.answer_correctness(
        value=result,
        model_name=self.client.model_name,  # type: ignore - wrapper handles None case
        retries=self.retries,
    )
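
To make the F1 framing concrete, here is an illustrative helper (not part of the library) that scores one ground truth, assuming the LLM has already labeled prediction statements as supported (TP) or unsupported (FP) and ground-truth statements missing from the prediction as FN; valor_lite's internal computation may differ in detail.

def answer_correctness_f1(tp: int, fp: int, fn: int) -> float:
    # F1 over statement counts; with multiple ground truths,
    # the final score is the maximum F1 across ground truths.
    if tp == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)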

compute_answer_relevance(response)

Compute answer relevance, the proportion of the model response that is relevant to the query, for a single piece of text.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.

Returns:

Metric
    The answer relevance score between 0 and 1. A score of 1 indicates that all statements are relevant to the query.

Source code in valor_lite/text_generation/manager.py
@llm_guided_metric
def compute_answer_relevance(self, response: QueryResponse) -> Metric:
    """
    Compute answer relevance, the proportion of the model response that is
    relevant to the query, for a single piece of text.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.

    Returns
    -------
    Metric
        The answer relevance score between 0 and 1. A score of 1 indicates that all
        statements are relevant to the query.
    """
    result = calculate_answer_relevance(
        client=self.client,  # type: ignore - wrapper handles None case
        system_prompt=self.default_system_prompt,
        query=response.query,
        response=response.response,
    )
    return Metric.answer_relevance(
        value=result,
        model_name=self.client.model_name,  # type: ignore - wrapper handles None case
        retries=self.retries,
    )

compute_bias(response)

Compute bias, the proportion of model opinions that are biased.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.

Returns:

Metric
    The bias score between 0 and 1. A score of 1 indicates that all opinions in the text are biased.

Source code in valor_lite/text_generation/manager.py
@llm_guided_metric
def compute_bias(
    self,
    response: QueryResponse,
) -> Metric:
    """
    Compute bias, the proportion of model opinions that are biased.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.

    Returns
    -------
    Metric
        The bias score between 0 and 1. A score of 1 indicates that all opinions in
        the text are biased.
    """
    result = calculate_bias(
        client=self.client,  # type: ignore - wrapper handles None case
        system_prompt=self.default_system_prompt,
        response=response.response,
    )
    return Metric.bias(
        value=result,
        model_name=self.client.model_name,  # type: ignore - wrapper handles None case
        retries=self.retries,
    )

compute_context_precision(response)

Compute context precision, a score for evaluating the retrieval mechanism of a RAG model.

First, an LLM is prompted to determine if each context in the context list is useful for producing the ground truth answer to the query.

If there are multiple ground truths, then the verdict is "yes" for a context if that context is useful for producing any of the ground truth answers, and "no" otherwise.

Then, using these verdicts, the context precision score is computed as a weighted sum of the precision at k for each k from 1 to the length of the context list.

Note that the earlier a piece of context appears in the context list, the more important it is in the computation of this score. For example, the first context in the context list will be included in every precision at k computation, so will have a large influence on the final score, whereas the last context will only be used for the last precision at k computation, so will have a small influence on the final score.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.

Returns:

Metric
    The context precision score between 0 and 1. A higher score indicates better context precision.

Source code in valor_lite/text_generation/manager.py
@llm_guided_metric
def compute_context_precision(
    self,
    response: QueryResponse,
) -> Metric:
    """
    Compute context precision, a score for evaluating the retrieval
    mechanism of a RAG model.

    First, an LLM is prompted to determine if each context in the context
    list is useful for producing the ground truth answer to the query.

    If there are multiple ground truths, then the verdict is "yes" for a
    context if that context is useful for producing any of the ground truth
    answers, and "no" otherwise.

    Then, using these verdicts, the context precision score is computed as
    a weighted sum of the precision at k for each k from 1 to the length
    of the context list.

    Note that the earlier a piece of context appears in the context list,
    the more important it is in the computation of this score. For example,
    the first context in the context list will be included in every precision
    at k computation, so will have a large influence on the final score,
    whereas the last context will only be used for the last precision at
    k computation, so will have a small influence on the final score.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.

    Returns
    -------
    Metric
        The context precision score between 0 and 1. A higher score indicates
        better context precision.
    """
    if not response.context:
        raise ValueError("The context precision metric requires context.")

    result = calculate_context_precision(
        client=self.client,  # type: ignore - wrapper handles None case
        system_prompt=self.default_system_prompt,
        query=response.query,
        predicted_context=response.context.prediction,
        groundtruth_context=response.context.groundtruth,
    )
    return Metric.context_precision(
        value=result,
        model_name=self.client.model_name,  # type: ignore - wrapper handles None case
        retries=self.retries,
    )
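
As a rough sketch of the weighted precision-at-k computation, here is the RAGAS-style formulation given per-context verdicts (True if the k-th retrieved context was judged useful); this helper is illustrative, and valor_lite's exact normalization may differ.

def context_precision(verdicts: list[bool]) -> float:
    # Sum precision@k over the positions with a "yes" verdict,
    # then average over the number of useful contexts.
    if not any(verdicts):
        return 0.0
    score = 0.0
    hits = 0
    for k, useful in enumerate(verdicts, start=1):
        if useful:
            hits += 1
            score += hits / k  # precision@k at a useful position
    return score / hits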

compute_context_recall(response)

Compute context recall, a score for evaluating the retrieval mechanism of a RAG model.

The context recall score is the proportion of statements in the ground truth that are attributable to the context list.

If multiple ground truths are provided, then the context recall score is computed for each ground truth and the maximum score is returned.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.

Returns:

Metric
    The context recall score between 0 and 1. A score of 1 indicates that all ground truth statements are attributable to the contexts in the context list.

Source code in valor_lite/text_generation/manager.py
@llm_guided_metric
def compute_context_recall(
    self,
    response: QueryResponse,
) -> Metric:
    """
    Compute context recall, a score for evaluating the retrieval mechanism of a RAG model.

    The context recall score is the proportion of statements in the ground truth
    that are attributable to the context list.

    If multiple ground truths are provided, then the context recall score is
    computed for each ground truth and the maximum score is returned.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.

    Returns
    -------
    Metric
        The context recall score between 0 and 1. A score of 1 indicates that
        all ground truth statements are attributable to the contexts in the context list.
    """
    if not response.context:
        raise ValueError("The context recall metric requires context.")

    result = calculate_context_recall(
        client=self.client,  # type: ignore - wrapper handles None case
        system_prompt=self.default_system_prompt,
        predicted_context=response.context.prediction,
        groundtruth_context=response.context.groundtruth,
    )
    return Metric.context_recall(
        value=result,
        model_name=self.client.model_name,  # type: ignore - wrapper handles None case
        retries=self.retries,
    )

compute_context_relevance(response)

Compute context relevance, the proportion of contexts in the context list that are relevant to the query.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.

Returns:

Metric
    The context relevance score between 0 and 1. A score of 0 indicates that none of the contexts are relevant and a score of 1 indicates that all of the contexts are relevant.

Source code in valor_lite/text_generation/manager.py
@llm_guided_metric
def compute_context_relevance(
    self,
    response: QueryResponse,
) -> Metric:
    """
    Compute context relevance, the proportion of contexts in the context list
    that are relevant to the query.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.

    Returns
    -------
    Metric
        The context relevance score between 0 and 1. A score of 0 indicates
        that none of the contexts are relevant and a score of 1 indicates
        that all of the contexts are relevant.
    """
    if not response.context:
        raise ValueError("The context relevance metric requires context.")

    result = calculate_context_relevance(
        client=self.client,  # type: ignore - wrapper handles None case
        system_prompt=self.default_system_prompt,
        query=response.query,
        context=response.context.prediction,
    )
    return Metric.context_relevance(
        value=result,
        model_name=self.client.model_name,  # type: ignore - wrapper handles None case
        retries=self.retries,
    )

compute_faithfulness(response)

Compute the faithfulness score. The faithfulness score is the proportion of claims in the text that are implied by the list of contexts. Claims that contradict the list of contexts and claims that are unrelated to the list of contexts both count against the score.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.

Returns:

Metric
    The faithfulness score between 0 and 1. A score of 1 indicates that all claims in the text are implied by the list of contexts.

Source code in valor_lite/text_generation/manager.py
@llm_guided_metric
def compute_faithfulness(
    self,
    response: QueryResponse,
) -> Metric:
    """
    Compute the faithfulness score. The faithfulness score is the proportion
    of claims in the text that are implied by the list of contexts. Claims
    that contradict the list of contexts and claims that are unrelated to
    the list of contexts both count against the score.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.

    Returns
    -------
    Metric
        The faithfulness score between 0 and 1. A score of 1 indicates that
        all claims in the text are implied by the list of contexts.
    """

    if not response.context:
        raise ValueError("The faithfulness metric requires context.")

    result = calculate_faithfulness(
        client=self.client,  # type: ignore - wrapper handles None case
        system_prompt=self.default_system_prompt,
        response=response.response,
        context=response.context.prediction,
    )
    return Metric.faithfulness(
        value=result,
        model_name=self.client.model_name,  # type: ignore - wrapper handles None case
        retries=self.retries,
    )
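
Schematically, once claims have been extracted from the response and each has been judged against the retrieved contexts, the score reduces to a simple proportion; the helper below is illustrative and not the library's internal code (the zero-claim behavior is an assumption).

def faithfulness_score(implied_claims: int, total_claims: int) -> float:
    # Fraction of claims implied by the contexts; contradicted and
    # unrelated claims both count against the score.
    return implied_claims / total_claims if total_claims else 0.0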

compute_hallucination(response)

Compute the hallucination score, the proportion of contexts in the context list that are contradicted by the text.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.

Returns:

Metric
    The hallucination score between 0 and 1. A score of 1 indicates that all contexts are contradicted by the text.

Source code in valor_lite/text_generation/manager.py
@llm_guided_metric
def compute_hallucination(
    self,
    response: QueryResponse,
) -> Metric:
    """
    Compute the hallucination score, the proportion of contexts in the context
    list that are contradicted by the text.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.

    Returns
    -------
    Metric
        The hallucination score between 0 and 1. A score of 1 indicates that
        all contexts are contradicted by the text.
    """

    if not response.context:
        raise ValueError("The hallucination metric requires context.")

    result = calculate_hallucination(
        client=self.client,  # type: ignore - wrapper handles None case
        system_prompt=self.default_system_prompt,
        response=response.response,
        context=response.context.prediction,
    )
    return Metric.hallucination(
        value=result,
        model_name=self.client.model_name,  # type: ignore - wrapper handles None case
        retries=self.retries,
    )

compute_rouge(response, rouge_types=['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=False) staticmethod

Calculate ROUGE scores for a model response given some set of references.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.
rouge_types : list[str], default=['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
    A list of ROUGE types to calculate.
use_stemmer : bool, default=False
    If True, uses the Porter stemmer to strip word suffixes.

Returns:

list[Metric]
    One ROUGE metric per requested ROUGE type.
Source code in valor_lite/text_generation/manager.py
@staticmethod
def compute_rouge(
    response: QueryResponse,
    rouge_types: list[str] = [
        "rouge1",
        "rouge2",
        "rougeL",
        "rougeLsum",
    ],
    use_stemmer: bool = False,
) -> list[Metric]:
    """
    Calculate ROUGE scores for a model response given some set of references.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.
    rouge_types : list[str], optional
        A list of rouge types to calculate.
        Defaults to ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'].
    use_stemmer: bool, default=False
        If True, uses Porter stemmer to strip word suffixes. Defaults to False.

    Returns
    -------
    list[Metric]
    """

    if not response.context:
        raise ValueError("ROUGE metrics require context.")

    results = calculate_rouge_scores(
        prediction=response.response,
        references=response.context.groundtruth,
        rouge_types=rouge_types,
        use_stemmer=use_stemmer,
    )
    return [
        Metric.rouge(
            value=result,
            rouge_type=rouge_type,
            use_stemmer=use_stemmer,
        )
        for rouge_type, result in results.items()
    ]
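
Because ROUGE is reference-based rather than LLM-guided, it can be computed without configuring a client. A sketch, reusing the QueryResponse object from the earlier example and requesting a subset of ROUGE types:

>>> rouge_metrics = Evaluator.compute_rouge(response, rouge_types=["rouge1", "rougeL"])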

compute_sentence_bleu(response, weights=[0.25, 0.25, 0.25, 0.25]) staticmethod

Calculate the sentence BLEU score for a model response given a set of references.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.
weights : list[float], default=[0.25, 0.25, 0.25, 0.25]
    The default BLEU calculates a score for up to 4-grams using uniform weights (this is called BLEU-4). To evaluate with higher or lower order n-grams, pass customized weights; for example, to account for up to 5-grams with uniform weights (BLEU-5), use [1/5]*5.
Source code in valor_lite/text_generation/manager.py
@staticmethod
def compute_sentence_bleu(
    response: QueryResponse,
    weights: list[float] = [0.25, 0.25, 0.25, 0.25],
) -> Metric:
    """
    Calculate the sentence BLEU score for a model response given a set of references.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.
    weights: list[float], default=[0.25, 0.25, 0.25, 0.25]
        The default BLEU calculates a score for up to 4-grams using uniform
        weights (this is called BLEU-4). To evaluate your translations with
        higher/lower order ngrams, use customized weights. Example: when accounting
        for up to 5-grams with uniform weights (this is called BLEU-5) use [1/5]*5
    """

    if not response.context:
        raise ValueError("The sentence BLEU metric requires context.")

    result = calculate_sentence_bleu(
        prediction=response.response,
        references=response.context.groundtruth,
        weights=weights,
    )
    return Metric.bleu(
        value=result,
        weights=weights,
    )
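
For example, to score with uniform 5-gram weights (BLEU-5) as described above, pass five equal weights; this reuses the QueryResponse object from the earlier example.

>>> bleu_5 = Evaluator.compute_sentence_bleu(response, weights=[1 / 5] * 5)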

compute_summary_coherence(response)

Compute summary coherence, the collective quality of a summary.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.

Returns:

Metric
    The summary coherence score between 1 and 5. A score of 1 indicates the lowest summary coherence and a score of 5 indicates the highest summary coherence.

Source code in valor_lite/text_generation/manager.py
@llm_guided_metric
def compute_summary_coherence(
    self,
    response: QueryResponse,
) -> Metric:
    """
    Compute summary coherence, the collective quality of a summary.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.

    Returns
    -------
    Metric
        The summary coherence score between 1 and 5. A score of 1 indicates
        the lowest summary coherence and a score of 5 indicates the highest
        summary coherence.
    """
    result = calculate_summary_coherence(
        client=self.client,  # type: ignore - wrapper handles None case
        system_prompt=self.default_system_prompt,
        text=response.query,
        summary=response.response,
    )
    return Metric.summary_coherence(
        value=result,
        model_name=self.client.model_name,  # type: ignore - wrapper handles None case
        retries=self.retries,
    )
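
Note that for this metric the QueryResponse fields are repurposed: the text to be summarized is passed as query and the candidate summary as response. A sketch with placeholder strings:

>>> summary_request = QueryResponse(
...     query="<full text to be summarized>",
...     response="<model-generated summary>",
... )
>>> coherence = evaluator.compute_summary_coherence(summary_request)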

compute_toxicity(response)

Compute toxicity, the proportion of opinions that are toxic.

Parameters:

response : QueryResponse
    A user query with ground truth and generated response.

Returns:

Metric
    The toxicity score between 0 and 1. A score of 1 indicates that all opinions in the text are toxic.

Source code in valor_lite/text_generation/manager.py
@llm_guided_metric
def compute_toxicity(
    self,
    response: QueryResponse,
) -> Metric:
    """
    Compute toxicity, the proportion of opinions that are toxic.

    Parameters
    ----------
    response: QueryResponse
        A user query with ground truth and generated response.

    Returns
    -------
    Metric
        The toxicity score between 0 and 1. A score of 1 indicates that all
        opinions in the text are toxic.
    """
    result = calculate_toxicity(
        client=self.client,  # type: ignore - wrapper handles None case
        system_prompt=self.default_system_prompt,
        response=response.response,
    )
    return Metric.toxicity(
        value=result,
        model_name=self.client.model_name,  # type: ignore - wrapper handles None case
        retries=self.retries,
    )

mistral(model_name='mistral-small-latest', api_key=None, retries=0, default_system_prompt='You are a helpful assistant.') classmethod

Create an evaluator using the Mistral API.

Parameters:

model_name : str, default="mistral-small-latest"
    The model to use.
api_key : str, optional
    The Mistral API key to use. If not specified, the MISTRAL_API_KEY environment variable will be used.
retries : int, default=0
    The number of times to retry the API call if it fails. Defaults to 0, indicating that the call will not be retried. For example, if retries is set to 3, the call will be retried up to 3 times, for a maximum of 4 calls.
default_system_prompt : str, default="You are a helpful assistant."
    The default system prompt that is given to the evaluating LLM.
Source code in valor_lite/text_generation/manager.py
@classmethod
def mistral(
    cls,
    model_name: str = "mistral-small-latest",
    api_key: str | None = None,
    retries: int = 0,
    default_system_prompt: str = "You are a helpful assistant.",
):
    """
    Create an evaluator using the Mistral API.

    Parameters
    ----------
    model_name : str, default="mistral-small-latest"
        The model to use. Defaults to "mistral-small-latest".
    api_key : str, optional
        The Mistral API key to use. If not specified, then the MISTRAL_API_KEY environment
        variable will be used.
    retries : int, default=0
        The number of times to retry the API call if it fails. Defaults to 0, indicating
        that the call will not be retried. For example, if self.retries is set to 3,
        this means that the call will be retried up to 3 times, for a maximum of 4 calls.
    default_system_prompt : str, default="You are a helpful assistant."
        The default system prompt that is given to the evaluating LLM.
    """
    client = MistralWrapper(
        api_key=api_key,
        model_name=model_name,
    )
    return cls(
        client=client,
        retries=retries,
        default_system_prompt=default_system_prompt,
    )

openai(model_name='gpt-3.5-turbo', api_key=None, retries=0, seed=None, default_system_prompt='You are a helpful assistant.') classmethod

Create an evaluator using OpenAI's client.

Parameters:

model_name : str, default="gpt-3.5-turbo"
    The model to use.
api_key : str, optional
    The OpenAI API key to use. If not specified, the OPENAI_API_KEY environment variable will be used.
retries : int, default=0
    The number of times to retry the API call if it fails. Defaults to 0, indicating that the call will not be retried. For example, if retries is set to 3, the call will be retried up to 3 times, for a maximum of 4 calls.
seed : int, optional
    An optional seed can be provided to GPT to get deterministic results.
default_system_prompt : str, default="You are a helpful assistant."
    The default system prompt that is given to the evaluating LLM.
Source code in valor_lite/text_generation/manager.py
@classmethod
def openai(
    cls,
    model_name: str = "gpt-3.5-turbo",
    api_key: str | None = None,
    retries: int = 0,
    seed: int | None = None,
    default_system_prompt: str = "You are a helpful assistant.",
):
    """
    Create an evaluator using OpenAI's client.

    Parameters
    ----------
    model_name : str, default="gpt-3.5-turbo"
        The model to use. Defaults to "gpt-3.5-turbo".
    api_key : str, optional
        The OpenAI API key to use. If not specified, then the OPENAI_API_KEY environment
        variable will be used.
    retries : int, default=0
        The number of times to retry the API call if it fails. Defaults to 0, indicating
        that the call will not be retried. For example, if self.retries is set to 3,
        this means that the call will be retried up to 3 times, for a maximum of 4 calls.
    seed : int, optional
        An optional seed can be provided to GPT to get deterministic results.
    default_system_prompt : str, default="You are a helpful assistant."
        The default system prompt that is given to the evaluating LLM.
    """
    if seed is not None:
        if retries != 0:
            raise ValueError(
                "Seed is provided, but retries is not 0. Retries should be 0 when seed is provided."
            )
    client = OpenAIWrapper(
        api_key=api_key,
        model_name=model_name,
        seed=seed,
    )
    return cls(
        client=client,
        retries=retries,
        default_system_prompt=default_system_prompt,
    )
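
For instance, a deterministic configuration pins a seed, which requires retries=0 per the check above; the seed value is a placeholder.

>>> evaluator = Evaluator.openai(seed=42, retries=0)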