+import functools
import itertools
import logging

import evaluate as hf_evaluate
from dataclasses import dataclass
from typing import Optional, List, Dict, Any
-
+import numpy as np
+from ray.data import Dataset

from fmeval import util
from fmeval.constants import (
from fmeval.model_runners.composers.composers import PromptComposer
from fmeval.model_runners.model_runner import ModelRunner
from fmeval.perf_util import timed_block
+from fmeval.eval_algorithms.util import get_bert_score
+from fmeval.constants import BERTSCORE_DEFAULT_MODEL
+from fmeval.eval_algorithms.helper_models.helper_model import BertscoreHelperModel, BertscoreHelperModelTypes

logger = logging.getLogger(__name__)

}

WER_SCORE = "word_error_rate"
+BERT_SCORE_DISSIMILARITY = "bertscore_dissimilarity"


@dataclass(frozen=True)
@@ -67,6 +73,9 @@ class GeneralSemanticRobustnessConfig(EvalAlgorithmConfig):

    :param perturbation_type: perturbation type for generating perturbed inputs
    :param num_perturbations: Number of perturbed inputs to be generated for robustness evaluation
+    :param num_baseline_samples: Only used for non-deterministic models. Number of times we generate
+        the model output with the same input to compute the "baseline" change in model output. We
+        compute differences between all pairs of outputs, i.e. between comb(num_baseline_samples, 2) pairs.
    :param butter_finger_perturbation_prob: The probability that a given character will be perturbed. Used for
        butter_finger perturbation_type
    :param random_uppercase_corrupt_proportion: Fraction of characters to be changed to uppercase. Used for
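+        With the default num_baseline_samples of 4, for example, comb(4, 2) = 6 output pairs are compared.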
@@ -75,34 +84,63 @@ class GeneralSemanticRobustnessConfig(EvalAlgorithmConfig):
        whitespace_add_remove perturbation_type
    :param whitespace_add_prob: Given a non-whitespace, add a whitespace before it with this probability. Used for
        whitespace_add_remove perturbation_type
+    :param model_type_for_bertscore: Model to use for computing the BERTScore.
    """

    perturbation_type: str = BUTTER_FINGER
    num_perturbations: int = 5
+    num_baseline_samples: int = 4
    butter_finger_perturbation_prob: float = 0.1
    random_uppercase_corrupt_proportion: float = 0.1
    whitespace_remove_prob: float = 0.1
    whitespace_add_prob: float = 0.05
+    model_type_for_bertscore: str = BERTSCORE_DEFAULT_MODEL

    def __post_init__(self):
        if self.perturbation_type not in PERTURBATION_TYPE_TO_HELPER_CLASS.keys():
            raise EvalAlgorithmClientError(
                f"Invalid perturbation type '{self.perturbation_type} requested, please "
                f"choose from acceptable values: {PERTURBATION_TYPE_TO_HELPER_CLASS.keys()}"
            )
+        if not BertscoreHelperModelTypes.model_is_allowed(self.model_type_for_bertscore):
+            raise EvalAlgorithmClientError(
+                f"Invalid model_type_for_bertscore: {self.model_type_for_bertscore} requested in "
+                f"GeneralSemanticRobustnessConfig, please choose from acceptable values: {BertscoreHelperModelTypes.model_list()}."
+            )
+        if self.num_baseline_samples < 2:
+            raise EvalAlgorithmClientError(
+                f"Invalid num_baseline_samples: {self.num_baseline_samples} in GeneralSemanticRobustnessConfig. "
+                f"The value should be at least 2."
+            )


class GeneralSemanticRobustness(EvalAlgorithmInterface):
    """
-    Semantic Robustness Eval algorithm for General task LLMs
+    Semantic Robustness Eval algorithm for General task LLMs.

    This evaluation measures how much the model output changes as a result of semantic preserving
    perturbations. Given the input, e.g., "A quick brown fox jumps over the lazy dog", the
    evaluation creates a perturbation that preserves the semantic meaning of the input e.g.,
    whitespace perturbation that changes the input text to "A q uick bro wn fox ju mps overthe lazy
    dog". The evaluation then measures how much the model output changes when prompted with the
-    original vs. perturbed input. The output difference is measured using Word Error Rate (WER).
-    https://huggingface.co/spaces/evaluate-metric/wer
+    original vs. perturbed input.
+
+    The output difference is measured using two metrics: the Word Error Rate
+    (https://huggingface.co/spaces/evaluate-metric/wer) and the BERTScore Dissimilarity, which is
+    1 - BERTScore (https://huggingface.co/spaces/evaluate-metric/bertscore), between the original
+    and the perturbed outputs. Word Error Rate measures syntactic differences, that is, changes in
+    the words, whereas BERTScore Dissimilarity measures semantic differences. Semantic differences
+    account for cases where the precise words in the output change but the meaning is the same, e.g.,
+    consider the outputs "it is pouring down today" vs. "it is very rainy today".
+
+    Note: When the model generation strategy is non-deterministic (e.g., with non-zero temperature),
+    the output can change even when the input stays the same. In such scenarios, reporting the
+    differences (using Word Error Rate or BERTScore Dissimilarity) between the model outputs on the
+    original and perturbed inputs can show artificially low robustness, since the model output changes
+    even without a change in the input. This evaluation therefore normalizes the robustness scores to
+    account for the baseline non-determinism. Specifically, if d is a score (Word Error Rate or
+    BERTScore Dissimilarity), the evaluation reports max(0, d - d_base), where d_base measures the
+    differences between model outputs generated from the same input.
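+
+    For illustration, suppose the average Word Error Rate between the original and perturbed outputs
+    is 0.3 and the baseline Word Error Rate between outputs generated from the same input is 0.1;
+    the reported score is then max(0, 0.3 - 0.1) = 0.2.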
    """

    def __init__(self, eval_algorithm_config: GeneralSemanticRobustnessConfig = GeneralSemanticRobustnessConfig()):
@@ -126,6 +164,48 @@ def __init__(self, eval_algorithm_config: GeneralSemanticRobustnessConfig = Gene
            self._eval_algorithm_config.whitespace_remove_prob, self._eval_algorithm_config.whitespace_add_prob
        )

+        self._bertscore_helper_model = BertscoreHelperModel.remote(
+            model_type=self._eval_algorithm_config.model_type_for_bertscore
+        )
+
+    def _compute_baseline_scores(
+        self, model: ModelRunner, original_prompt: str, original_model_output
+    ) -> Dict[str, float]:
+        """
+        Private method for computing baseline scores. The baseline scores are needed when the model
+        output is non-deterministic; they measure the change in the model output when the input is
+        kept the same. See the class documentation for how the baseline scores are computed and used.
+
+        :param model: An instance of ModelRunner which is the model under evaluation
+        :param original_prompt: The input prompt to the model. Assumes that the input is already
+            embedded into the prompt template.
+        :param original_model_output: The output of the model on the original input prompt.
+
+        :return: A dict containing the score name to baseline score value mapping.
+        """
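+        # Query the model (num_baseline_samples - 1) more times with the same prompt and include the
+        # original output, giving num_baseline_samples outputs to compare pairwise below.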
+        model_outputs = [
+            model.predict(original_prompt)[0] for _ in range(self._eval_algorithm_config.num_baseline_samples - 1)
+        ]
+        model_outputs.append(original_model_output)
+        all_pairs = itertools.combinations(model_outputs, 2)
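+        # Unzip the comb(num_baseline_samples, 2) pairs into two parallel sequences for scoring.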
+        first_output, second_output = zip(*all_pairs)
+        baselines = dict()
+
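+        # Baseline BERTScore Dissimilarity: 1 minus the mean BERTScore over all output pairs.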
+        baselines[BERT_SCORE_DISSIMILARITY] = 1 - np.mean(
+            list(
+                map(
+                    functools.partial(get_bert_score, helper_model=self._bertscore_helper_model),
+                    first_output,
+                    second_output,
+                )
+            )
+        )
+
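+        # Baseline Word Error Rate computed over the same output pairs.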
+        wer = hf_evaluate.load("wer")
+        baselines[WER_SCORE] = wer.compute(predictions=first_output, references=second_output)
+
+        return baselines
+
    def evaluate_sample(
        self,
        model_input: str,
@@ -151,9 +231,9 @@ def evaluate_sample(
        original_prompt = prompt_composer.compose(model_input)
        original_model_output = model_output if model_output else model.predict(original_prompt)[0]

-        if self._is_model_deterministic is None:
-            if model.predict(original_prompt)[0] != original_model_output:
-                raise EvalAlgorithmClientError("For evaluating semantic robustness, the model should be deterministic.")
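+        # If determinism was not established at the dataset level, probe it for this sample by
+        # re-querying the model with the same prompt and comparing outputs.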
+        is_model_deterministic = self._is_model_deterministic
+        if is_model_deterministic is None:
+            is_model_deterministic = model.predict(original_prompt)[0] == original_model_output

        perturbation = PERTURBATION_TYPE_TO_HELPER_CLASS[self._eval_algorithm_config.perturbation_type]()
        perturbed_inputs = perturbation.perturb(
@@ -164,18 +244,31 @@ def evaluate_sample(
        perturbed_input_prompts = [prompt_composer.compose(perturbed_input) for perturbed_input in perturbed_inputs]
        perturbed_input_outputs = [model.predict(prompt)[0] for prompt in perturbed_input_prompts]

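+        # Mean BERTScore Dissimilarity between the original output and each perturbed output.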
+        bert_score_dissimilarity_value = 1 - np.mean(
+            list(
+                map(
+                    functools.partial(get_bert_score, helper_model=self._bertscore_helper_model),
+                    itertools.repeat(original_model_output, len(perturbed_input_outputs)),
+                    perturbed_input_outputs,
+                )
+            )
+        )
        wer = hf_evaluate.load("wer")
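+        # Word Error Rate of the perturbed outputs against the original output as the reference.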
+        wer_value = wer.compute(
+            predictions=perturbed_input_outputs,
+            references=list(itertools.repeat(original_model_output, self._eval_algorithm_config.num_perturbations)),
+        )

-        return [
-            EvalScore(
-                name=WER_SCORE,
-                value=wer.compute(
-                    predictions=perturbed_input_outputs,
-                    references=list(
-                        itertools.repeat(original_model_output, self._eval_algorithm_config.num_perturbations)
-                    ),
-                ),
+        if not is_model_deterministic:  # Compute the baseline differences in the model outputs for the same input
+            baselines = self._compute_baseline_scores(model, original_prompt, original_model_output)
+            bert_score_dissimilarity_value = max(
+                0, bert_score_dissimilarity_value - baselines[BERT_SCORE_DISSIMILARITY]
            )
+            wer_value = max(0, wer_value - baselines[WER_SCORE])
+
+        return [
+            EvalScore(name=BERT_SCORE_DISSIMILARITY, value=bert_score_dissimilarity_value),
+            EvalScore(name=WER_SCORE, value=wer_value),
        ]

    def evaluate(
@@ -223,33 +316,18 @@ def evaluate(
            )

            self._is_model_deterministic = verify_model_determinism(model, dataset, DatasetColumns.PROMPT.value.name)
-            if not self._is_model_deterministic:
-                raise EvalAlgorithmClientError("For evaluating semantic robustness, the model should be deterministic.")
            dataset = generate_model_predict_response_for_dataset(
                model=model,
                data=dataset,
                model_input_column_name=DatasetColumns.PROMPT.value.name,
                model_output_column_name=DatasetColumns.MODEL_OUTPUT.value.name,
            )
-            with timed_block(f"Computing score and aggregation on dataset {dataset_config.dataset_name}", logger):
-
-                def _generate_general_semantic_robustness_score(
-                    row: Dict[str, Any]
-                ) -> Dict[str, Any]:  # pragma: no cover
-                    """
-                    Map function generating the scores for every input record in input dataset
-                    """
-                    row[WER_SCORE] = self.evaluate_sample(
-                        model_input=row[DatasetColumns.MODEL_INPUT.value.name],
-                        model=model,
-                        model_output=row[DatasetColumns.MODEL_OUTPUT.value.name],
-                        prompt_template=dataset_prompt_template,
-                    )[0].value
-                    return row
+            with timed_block(f"Computing score and aggregation on dataset {dataset_config.dataset_name}", logger):

-                dataset = dataset.map(_generate_general_semantic_robustness_score).materialize()
-
-                dataset_scores, category_scores = aggregate_evaluation_scores(dataset, [WER_SCORE], agg_method=MEAN)
+                dataset = self.__add_scores_to_dataset(dataset, model, dataset_prompt_template)
+                dataset_scores, category_scores = aggregate_evaluation_scores(
+                    dataset, [BERT_SCORE_DISSIMILARITY, WER_SCORE], agg_method=MEAN
+                )
                eval_outputs.append(
                    EvalOutput(
                        eval_name=self.eval_name,
@@ -268,7 +346,7 @@ def _generate_general_semantic_robustness_score(
            if save:
                save_dataset(
                    dataset=dataset,
-                    score_names=[WER_SCORE],
+                    score_names=[BERT_SCORE_DISSIMILARITY, WER_SCORE],
                    path=generate_output_dataset_path(
                        path_to_parent_dir=self._eval_results_path,
                        eval_name=self.eval_name,
@@ -277,3 +355,30 @@ def _generate_general_semantic_robustness_score(
                )

        return eval_outputs
+
+    def __add_scores_to_dataset(self, dataset: Dataset, model: ModelRunner, prompt_template: str):
+        """
+        Private method to encapsulate logic around getting scores for every row in the dataset.
+
+        :param dataset: ray Dataset to be used for eval scores generation
+        :param model: An instance of ModelRunner which is the model under evaluation
+        :param prompt_template: Prompt template used to compose the prompts passed to the model
+        :returns: ray Dataset with score columns
+        """
+
+        def _generate_general_semantic_robustness_score(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover
+            """
+            Map function generating the scores for every input record in input dataset
+            """
+            scores = self.evaluate_sample(
+                model_input=row[DatasetColumns.MODEL_INPUT.value.name],
+                model=model,
+                model_output=row[DatasetColumns.MODEL_OUTPUT.value.name],
+                prompt_template=prompt_template,
+            )
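+            # evaluate_sample returns scores in the order [BERT_SCORE_DISSIMILARITY, WER_SCORE].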
+            row[BERT_SCORE_DISSIMILARITY] = scores[0].value
+            row[WER_SCORE] = scores[1].value
+
+            return row
+
+        return dataset.map(_generate_general_semantic_robustness_score).materialize()  # pragma: no cover