From 344cd5d07e50df6ad2562ece9a3ecb39d529b8db Mon Sep 17 00:00:00 2001 From: alfekka Date: Mon, 21 Jul 2025 17:21:29 +0300 Subject: [PATCH] refactor metrics --- requirements.txt | 4 +- src/atgen/metrics/base_metric.py | 53 ++ src/atgen/metrics/classic_metrics/__init__.py | 1 + .../classic_metrics/abstractiveness.py | 82 +++ src/atgen/metrics/classic_metrics/bleu.py | 56 ++ .../metrics/classic_metrics/exact_match.py | 31 + .../classic_metrics/exact_match_math.py | 26 + src/atgen/metrics/classic_metrics/rouge.py | 27 + .../metrics/classic_metrics/sacrebleu.py | 37 + .../metrics/classic_metrics/word_length.py | 36 + src/atgen/metrics/compute_metrics.py | 369 +++------- src/atgen/metrics/deep_eval/__init__.py | 0 .../metrics/deep_eval/deepeval_metrics.py | 183 +++++ src/atgen/metrics/deep_eval/evaluationllm.py | 99 +++ .../deepeval_supported_models_and_metrics.py | 64 -- src/atgen/metrics/factory.py | 40 ++ src/atgen/metrics/metrics.py | 639 ------------------ src/atgen/metrics/model_based/__init__.py | 0 src/atgen/metrics/model_based/alignscore.py | 60 ++ .../bartscore.py} | 46 +- src/atgen/metrics/registry.py | 22 + 21 files changed, 880 insertions(+), 995 deletions(-) create mode 100644 src/atgen/metrics/base_metric.py create mode 100644 src/atgen/metrics/classic_metrics/__init__.py create mode 100644 src/atgen/metrics/classic_metrics/abstractiveness.py create mode 100644 src/atgen/metrics/classic_metrics/bleu.py create mode 100644 src/atgen/metrics/classic_metrics/exact_match.py create mode 100644 src/atgen/metrics/classic_metrics/exact_match_math.py create mode 100644 src/atgen/metrics/classic_metrics/rouge.py create mode 100644 src/atgen/metrics/classic_metrics/sacrebleu.py create mode 100644 src/atgen/metrics/classic_metrics/word_length.py create mode 100644 src/atgen/metrics/deep_eval/__init__.py create mode 100644 src/atgen/metrics/deep_eval/deepeval_metrics.py create mode 100644 src/atgen/metrics/deep_eval/evaluationllm.py delete mode 100644 src/atgen/metrics/deepeval_supported_models_and_metrics.py create mode 100644 src/atgen/metrics/factory.py delete mode 100644 src/atgen/metrics/metrics.py create mode 100644 src/atgen/metrics/model_based/__init__.py create mode 100644 src/atgen/metrics/model_based/alignscore.py rename src/atgen/metrics/{bart_score.py => model_based/bartscore.py} (70%) create mode 100644 src/atgen/metrics/registry.py diff --git a/requirements.txt b/requirements.txt index 216563c..5d87b05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,9 +28,9 @@ spacy==3.7.5 streamlit==1.37.0 streamlit-authenticator==0.4.2 tabulate==0.9.0 -transformers==4.52.4 +transformers trl==0.15.2 torchmetrics==1.4.1 -unsloth==2025.3.17 +unsloth vllm==0.8.1 xlrd==1.2.0 diff --git a/src/atgen/metrics/base_metric.py b/src/atgen/metrics/base_metric.py new file mode 100644 index 0000000..a31bf7b --- /dev/null +++ b/src/atgen/metrics/base_metric.py @@ -0,0 +1,53 @@ +import string +from pydantic import BaseModel +from typing import List, Optional +import re +from abc import ABC, abstractmethod + + + +class MetricConfig(BaseModel): + aggregate: bool = True + + +class BaseMetric(ABC): + def __init__(self, config: MetricConfig): + self.config = config + + @abstractmethod + def compute(self, predictions: List[str], references: List[str]) -> float: + raise NotImplementedError + + def _preprocess_text(self, text: str, + do_lowercase: bool = True, + do_remove_punctuation: bool = True, + do_remove_extra_spaces: bool = True, + do_remove_stopwords: bool = False, + stopwords: Optional[list[str]] = 
None) -> str: + # Convert to lowercase + if do_lowercase: + text = text.lower() + + # Remove punctuation + if do_remove_punctuation: + # Keep hyphens within words, remove other punctuation + text = re.sub(r'(? 0: + ngrams_intersection = set(summary_ngrams).intersection(set(text_ngrams)) + if use_modified: + word_is_part_of_ngram_copied = [ + any((x in ngram for ngram in ngrams_intersection)) for x in summary + ] + return 1 - sum(word_is_part_of_ngram_copied) / len( + word_is_part_of_ngram_copied + ) + else: + return sum([x not in ngrams_intersection for x in summary_ngrams]) / len( + summary_ngrams + ) + return np.nan + + + def compute(self, predictions: List[str], references: List[str], sources: List[str], **kwargs) -> float: + stemmer = porter.PorterStemmer() + tokenized_preds = [tokenize.tokenize(x, stemmer) for x in predictions] + tokenized_texts = [tokenize.tokenize(x, stemmer) for x in sources] + if references is not None: + tokenized_refs = [tokenize.tokenize(x, stemmer) for x in references] + else: + tokenized_refs = tokenized_preds + + result = {} + for use_modified in [False, True]: + for n in range(1, 5): + pred_ngram_overlaps = [] + label_ngram_overlaps = [] + for pred, label, text in zip( + tokenized_preds, tokenized_refs, tokenized_texts + ): + pred_pair_ngram_overlap = self._calculate_ngram_overlap( + pred, text, n, use_modified + ) + pred_ngram_overlaps.append(pred_pair_ngram_overlap) + if references is not None: + label_pair_ngram_overlap = self._calculate_ngram_overlap( + label, text, n, use_modified + ) + label_ngram_overlaps.append(label_pair_ngram_overlap) + key = f"ngram_overlap_{n}" if use_modified else f"novel_ngrams_{n}" + + pred_ngram_overlaps = np.array(pred_ngram_overlaps) + cond_abs = ~np.isnan(pred_ngram_overlaps) + result[key + "_abs"] = pred_ngram_overlaps[cond_abs] + + if references is not None: + label_ngram_overlaps = np.array(label_ngram_overlaps) + cond_rel = cond_abs & ~np.isnan(label_ngram_overlaps) + result[key + "_rel"] = ( + pred_ngram_overlaps[cond_rel] / label_ngram_overlaps[cond_rel] + ) + + if self.config.aggregate: + for key, value in result.items(): + result[key] = np.mean(value) + + return {"abstractiveness": result} \ No newline at end of file diff --git a/src/atgen/metrics/classic_metrics/bleu.py b/src/atgen/metrics/classic_metrics/bleu.py new file mode 100644 index 0000000..72afd1c --- /dev/null +++ b/src/atgen/metrics/classic_metrics/bleu.py @@ -0,0 +1,56 @@ +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional +from nltk import ngrams +from nltk.stem import porter +from nltk.tokenize import word_tokenize, sent_tokenize +from nltk.translate.bleu_score import corpus_bleu +import numpy as np + + +class BleuConfig(MetricConfig): + pass + + +class Bleu(BaseMetric): + def __init__(self, config: BleuConfig): + super().__init__(config) + + + def _smoothing_function(self, p_n, references, hypothesis, hyp_len): + smoothed_p_n = [] + for i, p_i in enumerate(p_n, start=1): + # Smoothing is not applied for unigrams + if i > 1: + # If hypothesis length is lower than the current order, its value equals (0 + 1) / (0 + 1) = 0 + if hyp_len < i: + assert p_i.denominator == 1 + smoothed_p_n.append(1) + # Otherwise apply smoothing + else: + smoothed_p_i = (p_i.numerator + 1) / (p_i.denominator + 1) + smoothed_p_n.append(smoothed_p_i) + else: + smoothed_p_n.append(p_i) + return smoothed_p_n + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: 
+ scores = [] + for pred, ref in zip(predictions, references): + if isinstance(ref, str): + ref_list = [ref] + else: + ref_list = ref + + tok_ref = [[word_tokenize(r) for r in ref_list]] + tok_pred = [word_tokenize(pred)] + + try: + bleu_score = corpus_bleu(tok_ref, tok_pred, smoothing_function=self._smoothing_function) + scores.append(bleu_score) + except (KeyError, ZeroDivisionError): + scores.append(0.0) + + if self.config.aggregate: + return {"bleu": float(np.mean(scores))} + else: + return {"bleu": scores} \ No newline at end of file diff --git a/src/atgen/metrics/classic_metrics/exact_match.py b/src/atgen/metrics/classic_metrics/exact_match.py new file mode 100644 index 0000000..d73a9e8 --- /dev/null +++ b/src/atgen/metrics/classic_metrics/exact_match.py @@ -0,0 +1,31 @@ +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional +import numpy as np + +class ExactMatchConfig(MetricConfig): + aggregate: bool = True + + +class ExactMatch(BaseMetric): + def __init__(self, config: ExactMatchConfig): + super().__init__(config) + + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + + if isinstance(references[0], list): + scores = np.array( + [ + any(self._preprocess_text(pred) == self._preprocess_text(one_ref) for one_ref in ref) + for pred, ref in zip(predictions, references) + ] + ) + else: + scores = np.array( + [self._preprocess_text(pred) == self._preprocess_text(ref) for pred, ref in zip(predictions, references)] + ) + + if self.config.aggregate: + return {"exact_match": float(np.mean(scores))} + else: + return {"exact_match": scores} \ No newline at end of file diff --git a/src/atgen/metrics/classic_metrics/exact_match_math.py b/src/atgen/metrics/classic_metrics/exact_match_math.py new file mode 100644 index 0000000..9025854 --- /dev/null +++ b/src/atgen/metrics/classic_metrics/exact_match_math.py @@ -0,0 +1,26 @@ +from multiprocessing import reduction +from typing import Literal, Optional +from omegaconf import DictConfig +import numpy as np + +from atgen.metrics.classic_metrics.base_metric import BaseMetric, BaseMetricConfig + + +class ExactMatchMathConfig(BaseMetricConfig): + aggregate: bool = True + +class ExactMatchMath(BaseMetric): + def __init__(self, config: DictConfig): + super().__init__(config) + + def compute(self, generated_texts: list[str], reference_texts: list[str], original_texts: list[str], task: Literal["summarization", "open-qa", "multi-choice-qa", "translation", "math"]) -> float: + scores = np.array( + [ + pred.split("#### ")[-1].lower() == ref.split("#### ")[-1].lower() + for pred, ref in zip(generated_texts, reference_texts) + ] + ) + if self.config.aggregate: + return {"exact_match_math": float(np.mean(scores))} + else: + return {"exact_match_math": scores} diff --git a/src/atgen/metrics/classic_metrics/rouge.py b/src/atgen/metrics/classic_metrics/rouge.py new file mode 100644 index 0000000..b13b23a --- /dev/null +++ b/src/atgen/metrics/classic_metrics/rouge.py @@ -0,0 +1,27 @@ +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional +from evaluate import load +import numpy as np + + +class RougeConfig(MetricConfig): + use_stemmer: bool = True + + + +class Rouge(BaseMetric): + def __init__(self, config: RougeConfig): + super().__init__(config) + self.rouge = load("rouge") + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + 
rouge_scores = self.rouge.compute( + predictions=predictions, + references=references, + use_stemmer=self.config.use_stemmer, + ) + + if self.config.aggregate: + return {k: float(np.mean(v)) for k, v in rouge_scores.items()} + else: + return rouge_scores \ No newline at end of file diff --git a/src/atgen/metrics/classic_metrics/sacrebleu.py b/src/atgen/metrics/classic_metrics/sacrebleu.py new file mode 100644 index 0000000..f4f30ee --- /dev/null +++ b/src/atgen/metrics/classic_metrics/sacrebleu.py @@ -0,0 +1,37 @@ +from time import time +from typing import List, Optional +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from evaluate import load +import numpy as np + + + +sacrebleu = load("sacrebleu") + + +class SacrebleuConfig(MetricConfig): + pass + + +class Sacrebleu(BaseMetric): + def __init__(self, config: SacrebleuConfig): + super().__init__(config) + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + if not isinstance(references[0], list): + sacrebleu_references = [[ref] for ref in references] + sacrebleu_result = sacrebleu.compute( + predictions=predictions, references=sacrebleu_references + ) + return float(sacrebleu_result.pop("score")) + else: + sacrebleu_scores = [] + for pred, ref in zip(predictions, references): + sacrebleu_result = sacrebleu.compute( + predictions=[pred], references=[ref] + ) + sacrebleu_scores.append(sacrebleu_result.pop("score")) + if self.config.aggregate: + return {"sacrebleu": float(np.mean(sacrebleu_scores))} + else: + return {"sacrebleu": sacrebleu_scores} \ No newline at end of file diff --git a/src/atgen/metrics/classic_metrics/word_length.py b/src/atgen/metrics/classic_metrics/word_length.py new file mode 100644 index 0000000..c790d02 --- /dev/null +++ b/src/atgen/metrics/classic_metrics/word_length.py @@ -0,0 +1,36 @@ +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional +import numpy as np + + +class WordLengthConfig(MetricConfig): + pass + + +class WordLength(BaseMetric): + def __init__(self, config: WordLengthConfig): + super().__init__(config) + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + # Calculate generated text lengths + gen_word_lengths = np.array([len(text.split()) for text in predictions]) + + # Calculate reference text lengths + if isinstance(references[0], list): + ref_word_lengths = np.array( + [ + np.mean([len(text.split()) for text in ref]) + for ref in references + ] + ) + else: + ref_word_lengths = np.array([len(ref.split()) for ref in references]) + + # Avoid division by zero + ref_word_lengths_safe = np.where(ref_word_lengths > 0, ref_word_lengths, 1) + relative_lengths = gen_word_lengths / ref_word_lengths_safe + + if self.config.aggregate: + return {"word_length": float(np.mean(relative_lengths))} + else: + return {"word_length": relative_lengths} \ No newline at end of file diff --git a/src/atgen/metrics/compute_metrics.py b/src/atgen/metrics/compute_metrics.py index 2864372..e28a41f 100644 --- a/src/atgen/metrics/compute_metrics.py +++ b/src/atgen/metrics/compute_metrics.py @@ -1,305 +1,96 @@ -import string -from time import time import logging -from typing import Literal, Optional +from time import time +from typing import List, Dict, Literal, Optional + from omegaconf import DictConfig -import re -import numpy as np -from evaluate import load -from tqdm import tqdm +# The factory is now the single entry point 
to get a metric runner. +# We'll assume it's located in atgen/metrics/factory.py +from atgen.metrics.factory import MetricFactory -from .metrics import ( - pair_bleu, - calculate_bart_score, - calculate_alignscore, - calculate_deepeval_metrics, - is_bart_score_available, - is_alignscore_available, -) -from .deepeval_supported_models_and_metrics import API_MODELS, DEEPEVAL_METRICS +log = logging.getLogger(__name__) +# This mapping replaces the large if/elif/else block. It's declarative and easy to modify. +TASK_TO_DEFAULT_METRICS = { + "summarization": ["exact_match", "sacrebleu", "bleu", "rouge", "word_length"], + "open-qa": ["exact_match"], + "multi-choice-qa": ["exact_match"], + "translation": ["exact_match", "sacrebleu", "bleu", "word_length"], + "math": ["exact_match_math"], +} -log = logging.getLogger() +# Define which metrics require the 'original_texts' (sources) input. +# This avoids passing it to metrics that don't need it. +METRICS_REQUIRING_SOURCE = {"bartscore", "alignscore", "deepeval"} def compute_metrics( - generated_texts, - reference_texts, - original_texts, + generated_texts: List[str], + reference_texts: Optional[List[str]], + original_texts: Optional[List[str]], task: Literal["summarization", "open-qa", "multi-choice-qa", "translation", "math"], config: DictConfig, - cache_dir: str = "cache", -) -> dict[str, float]: - """ - Compute various metrics for generated texts. - - Args: - generated_texts: List of generated texts to evaluate - reference_texts: List of reference texts (ground truth) or list of lists of reference texts - original_texts: List of source texts - task: Task type (summarization, open-qa, multi-choice-qa, translation) - config: Configuration for evaluation - - additional_metrics: List of additional metrics to use. 
Options include: - - "bartscore": BARTScore metrics - - "alignscore": AlignScore metrics - - DeepEval metrics (requires API key): - - "deepeval_answer_relevance": Evaluates how well the output answers the input - - "deepeval_faithfulness": Evaluates factual consistency with the input - - "deepeval_summarization": Evaluates summarization quality - - "deepeval_prompt_alignment": Evaluates alignment with the expected output - - provider: API key for the provider - - api_key: Model identifier to use - - model: Provider name (openai, anthropic, openrouter, or custom) - - base_url: API base URL (if None, uses default for the provider) - - deepeval_threshold: Threshold for DeepEval metrics (default: 0.5) - - deepeval_include_reason: Include reason for evaluation score (default: False) - - deepeval_strict_mode: Enforce binary metric score (default: False) - - deepeval_async_mode: Enable concurrent execution (default: True) - - deepeval_verbose_mode: Print intermediate steps (default: False) - - deepeval_truths_extraction_limit: Maximum number of factual truths to extract (default: None) - Returns: - Dictionary with metric scores - - Note: - The OpenRouterLLM class is also available for direct use with DeepEval metrics: - - ```python - from atgen.metrics import OpenRouterLLM - from deepeval.metrics import AnswerRelevanceMetric - - llm = OpenRouterLLM( - api_key="your_openrouter_api_key", - model="openai/gpt-4o-2024-11-20" - ) - - metric = AnswerRelevanceMetric(model=llm) - ``` - """ - if task == "multi-choice-qa": - metrics_to_calculate = ["exact_match"] + list(config.additional_metrics) - elif task == "open-qa": - metrics_to_calculate = ["exact_match"] + list(config.additional_metrics) - elif task == "summarization": - metrics_to_calculate = ["exact_match", "sacrebleu", "bleu", "rouge", "word_length"] + list(config.additional_metrics) - elif task == "translation": - metrics_to_calculate = ["exact_match", "sacrebleu", "bleu", "word_length"] + list(config.additional_metrics) - elif task == "math": - metrics_to_calculate = ["exact_match_math"] + list(config.additional_metrics) - else: - raise NotImplementedError(f"Task {task} not implemented") - - if "sacrebleu" in metrics_to_calculate: - sacrebleu = load("sacrebleu", cache_dir=cache_dir) - if "rouge" in metrics_to_calculate: - rouge = load("rouge", cache_dir=cache_dir) - - result = {} - if "word_length" in metrics_to_calculate: - result["word_length_gen"] = np.array( - [len(text.split()) for text in generated_texts] - ) - + cache_dir: Optional[str] = None, +) -> Dict[str, float]: + + if task not in TASK_TO_DEFAULT_METRICS: + raise NotImplementedError(f"Task '{task}' is not implemented in TASK_TO_DEFAULT_METRICS.") + + # 1. Determine the full list of metrics to run + base_metrics = TASK_TO_DEFAULT_METRICS.get(task, []) + additional_metrics = list(config.get("additional_metrics", [])) + # Use a set to handle duplicates, then sort for predictable execution order + metrics_to_calculate = sorted(list(set(base_metrics + additional_metrics))) + + if not metrics_to_calculate: + log.warning("No metrics specified for calculation. Returning empty results.") + return {} + + # 2. 
Instantiate the factory that will build our metric runners + metric_factory = MetricFactory(config, cache_dir=cache_dir) + + final_results = {} time_dict = {} - # Metrics that use both the generated texts and the original texts and - # those that do not require reference texts - if "bartscore" in metrics_to_calculate and is_bart_score_available: - log.info("Calculating BARTScore scores...") - start_time = time() - result.update( - calculate_bart_score( - preds=generated_texts, - texts=original_texts, - refs=reference_texts, - batch_size=4, - cache_dir=cache_dir, - ) - ) - time_dict["time_bartscore"] = time() - start_time - # Metrics that use both the generated texts and the reference texts - if reference_texts is not None: - # Exact match - if "exact_match" in metrics_to_calculate: - if isinstance(reference_texts[0], list): - result["exact_match"] = np.array( - [ - any(_preprocess_text(pred) == _preprocess_text(one_ref) for one_ref in ref) - for pred, ref in zip(generated_texts, reference_texts) - ] - ) - else: - result["exact_match"] = np.array( - [_preprocess_text(pred) == _preprocess_text(ref) for pred, ref in zip(generated_texts, reference_texts)] - ) - if "exact_match_math" in metrics_to_calculate: - # result["exact_match_math"] = np.array( - # [ - # pred.split("Answer: ")[-1].lower() == ref.lower() - # for pred, ref in zip(generated_texts, reference_texts) - # ] - # ) - result["exact_match_math"] = np.array( - [ - pred.split("#### ")[-1].lower() == ref.split("#### ")[-1].lower() - for pred, ref in zip(generated_texts, reference_texts) - ] - ) - if "bleu" in metrics_to_calculate: - # BLEU - start_time = time() - result["bleu"] = np.array( - [ - pair_bleu(references=ref, prediction=pred) - for pred, ref in tqdm(zip(generated_texts, reference_texts)) - ] - ) - time_dict["time_bleu"] = time() - start_time - if "rouge" in metrics_to_calculate: - # ROUGE - start_time = time() - result.update( - rouge.compute( - predictions=generated_texts, - references=reference_texts, - use_stemmer=True, - ) - ) - time_dict["time_rouge"] = time() - start_time - if "sacrebleu" in metrics_to_calculate: - # Sacrebleu - start_time = time() - if not isinstance(reference_texts[0], list): - sacrebleu_references = [[ref] for ref in reference_texts] - sacrebleu_result = sacrebleu.compute( - predictions=generated_texts, references=sacrebleu_references - ) - result["sacrebleu"] = sacrebleu_result.pop("score") - else: - sacrebleu_scores = [] - for pred, ref in zip(generated_texts, reference_texts): - sacrebleu_result = sacrebleu.compute( - predictions=[pred], references=[ref] - ) - sacrebleu_scores.append(sacrebleu_result.pop("score")) - result["sacrebleu"] = sacrebleu_scores - - time_dict["time_sacrebleu"] = time() - start_time - if "word_length" in metrics_to_calculate: - # Lengths - if isinstance(reference_texts[0], list): - ref_word_lengths = np.array( - [ - np.mean([len(text.split()) for text in ref]) - for ref in reference_texts - ] - ) - else: - ref_word_lengths = np.array([len(ref.split()) for ref in reference_texts]) - # Avoid division by zero - ref_word_lengths_safe = np.where(ref_word_lengths > 0, ref_word_lengths, 1) - result["word_length_rel"] = result["word_length_gen"] / ref_word_lengths_safe + log.info(f"Starting evaluation for task '{task}' with metrics: {', '.join(metrics_to_calculate)}") - # AlignScore - if "alignscore" in metrics_to_calculate and is_alignscore_available: - log.info("Calculating AlignScore scores...") + # 3. 
Loop through metrics, delegate calculation, and collect results + for metric_name in metrics_to_calculate: + try: + log.info(f"--> Calculating metric: {metric_name}") start_time = time() - alignscores = calculate_alignscore( - generated_texts, reference_texts, original_texts - ) - if alignscores is not None: - result.update(alignscores) - time_dict["time_alignscore"] = time() - start_time - - # DeepEval metrics - deepeval_metrics_to_calculate = [ - metric for metric in DEEPEVAL_METRICS if metric in config.additional_metrics - ] - - if deepeval_metrics_to_calculate: - if isinstance(reference_texts[0], list): - log.error("DeepEval does not support multiple references. Skipping...") - else: - # Validate OpenRouter model - only warn if not in predefined list, but still use it - provider = config["provider"] - if config.model not in API_MODELS.get(provider): - log.warning( - f"Using custom model: {config.model}. " - + ( - f"Available models: {API_MODELS[provider]}" - if provider in API_MODELS - else "" - ) - ) - log.info( - f"Calculating DeepEval metrics: {', '.join(deepeval_metrics_to_calculate)}..." - ) - start_time = time() - result.update( - calculate_deepeval_metrics( - predictions=generated_texts, - references=reference_texts, - original_texts=original_texts, - metrics_to_calculate=deepeval_metrics_to_calculate, - base_url=config.base_url, - api_key=config.api_key, - model=config.model, - threshold=config.deepeval_threshold, - include_reason=config.deepeval_include_reason, - strict_mode=config.deepeval_strict_mode, - async_mode=config.deepeval_async_mode, - verbose_mode=config.deepeval_verbose_mode, - truths_extraction_limit=config.deepeval_truths_extraction_limit, - ) - ) - time_dict["time_deepeval"] = time() - start_time - - for key, value in result.items(): - if isinstance(value, np.ndarray): - result[key] = float(np.mean(value)) - elif isinstance(value, (int, float)): - # Ensure numerical values are converted to float - result[key] = float(value) - # Make sure non-numerical values that aren't reasons are preserved - elif not key.endswith("_reasons") and not "_reason" in key.lower(): - continue - - # Filter out reason fields from the final aggregated results - more robust filtering - result = { - key: value - for key, value in sorted(result.items()) - if not key.endswith("_reasons") - and not "_reason" in key.lower() - and isinstance(value, (int, float)) # Ensure we only keep numerical metrics - } - - return result -def _preprocess_text(text: str, do_lowercase: bool = True, do_remove_punctuation: bool = True, do_remove_extra_spaces: bool = True, do_remove_stopwords: bool = False, stopwords: Optional[list[str]] = None) -> str: - # Convert to lowercase - if do_lowercase: - text = text.lower() - - # Remove punctuation - if do_remove_punctuation: - # Keep hyphens within words, remove other punctuation - text = re.sub(r'(? 
float: + results = {} + + metrics = [] + metric_name_mapping = {} # Maps metric class name to the deepeval metric name + + # Dictionary to store test cases for each metric + metric_test_cases = {} + + if "deepeval_answer_relevance" in self.config.metrics_to_calculate: + metric = AnswerRelevancyMetric( + threshold=self.config.threshold, + model=self.llm, + include_reason=self.config.include_reason, + strict_mode=self.config.strict_mode, + async_mode=self.config.async_mode, + ) + metrics.append(metric) + metric_name_mapping[metric.__class__.__name__] = "deepeval_answer_relevance" + + # Create specific test cases for AnswerRelevancy metric + answer_relevance_test_cases = [] + for i, (pred, src) in enumerate(zip(predictions, original_texts)): + test_case = LLMTestCase( + input=src, + actual_output=pred, + ) + answer_relevance_test_cases.append(test_case) + metric_test_cases[metric.__class__.__name__] = answer_relevance_test_cases + + if "deepeval_faithfulness" in self.config.metrics_to_calculate: + metric = FaithfulnessMetric( + threshold=self.config.threshold, + model=self.llm, + include_reason=self.config.include_reason, + strict_mode=self.config.strict_mode, + async_mode=self.config.async_mode, + truths_extraction_limit=self.config.truths_extraction_limit, + ) + metrics.append(metric) + metric_name_mapping[metric.__class__.__name__] = "deepeval_faithfulness" + + # Create specific test cases for Faithfulness metric + faithfulness_test_cases = [] + for i, (pred, src) in enumerate(zip(predictions, original_texts)): + test_case = LLMTestCase( + input=src, + actual_output=pred, + retrieval_context=[src], + ) + faithfulness_test_cases.append(test_case) + metric_test_cases[metric.__class__.__name__] = faithfulness_test_cases + + if "deepeval_summarization" in self.config.metrics_to_calculate: + metric = SummarizationMetric( + threshold=self.config.threshold, + model=self.llm, + include_reason=self.config.include_reason, + strict_mode=self.config.strict_mode, + async_mode=self.config.async_mode, + ) + metrics.append(metric) + metric_name_mapping[metric.__class__.__name__] = "deepeval_summarization" + + # Create specific test cases for Summarization metric + summarization_test_cases = [] + for i, (pred, src) in enumerate(zip(predictions, original_texts)): + test_case = LLMTestCase( + input=src, + actual_output=pred, + ) + summarization_test_cases.append(test_case) + metric_test_cases[metric.__class__.__name__] = summarization_test_cases + + if "deepeval_prompt_alignment" in self.config.metrics_to_calculate: + metric = PromptAlignmentMetric( + threshold=self.config.threshold, + model=self.llm, + prompt_instructions=["Do what you are told to do in the prompt"], + include_reason=self.config.include_reason, + strict_mode=self.config.strict_mode, + async_mode=self.config.async_mode, + ) + metrics.append(metric) + metric_name_mapping[metric.__class__.__name__] = "deepeval_prompt_alignment" + + # Create specific test cases for PromptAlignment metric + prompt_alignment_test_cases = [] + for i, (pred, ref, src) in enumerate( + zip(predictions, references, original_texts) + ): + test_case = LLMTestCase( + input=src, + actual_output=pred, + expected_output=ref, + ) + prompt_alignment_test_cases.append(test_case) + metric_test_cases[metric.__class__.__name__] = prompt_alignment_test_cases + + for metric in metrics: + metric_class_name = metric.__class__.__name__ + test_cases = metric_test_cases.get(metric_class_name, []) + + if test_cases: + original_stdout = sys.stdout + if not self.config.verbose_mode: + 
sys.stdout = open(os.devnull, "w") + + try: + # Run evaluation for this specific metric + evaluation_results = evaluate( + test_cases=test_cases, + metrics=[metric], + run_async=self.config.async_mode, + ) + + deepeval_metric_name = metric_name_mapping.get(metric_class_name) + scores = [] + reasons = [] + + # Process results for this metric + for result in evaluation_results.test_results: + scores.append(1 if result.success else 0) + + # Calculate average score + if scores: + results[deepeval_metric_name] = np.mean(scores) + + finally: + # Restore stdout + if not self.config.verbose_mode: + sys.stdout.close() + sys.stdout = original_stdout + print("================================================") + print("Results:") + print(results) + print("================================================") + + return {"deepeval": results} + diff --git a/src/atgen/metrics/deep_eval/evaluationllm.py b/src/atgen/metrics/deep_eval/evaluationllm.py new file mode 100644 index 0000000..88d0a41 --- /dev/null +++ b/src/atgen/metrics/deep_eval/evaluationllm.py @@ -0,0 +1,99 @@ +from deepeval import evaluate +from deepeval.models.base_model import DeepEvalBaseLLM +from deepeval.test_case import LLMTestCase +from deepeval.metrics import ( + AnswerRelevancyMetric, + FaithfulnessMetric, + SummarizationMetric, + PromptAlignmentMetric, +) +from openai import OpenAI, AsyncOpenAI + + +class EvaluationLLM(DeepEvalBaseLLM): + """ + Custom Evaluation LLM implementation for DeepEval. + + This class implements the DeepEvalBaseLLM interface to allow using + custom models with DeepEval metrics. + """ + + def __init__( + self, + api_key=None, + model="openai/gpt-4o-2024-11-20", + base_url="https://openrouter.ai/api/v1", + ): + """ + Initialize the Evaluation LLM. + + Args: + api_key: Evaluation API key + model: Model identifier (e.g., "openai/gpt-4o-2024-11-20") + base_url: Evaluation API base URL + """ + self.api_key = api_key + + self.model_name = model + self.base_url = base_url + self.client = None + self.async_client = None + self.OpenAI = OpenAI + self.AsyncOpenAI = AsyncOpenAI + + def load_model(self): + """Load and return the client.""" + if self.client is None: + self.client = self.OpenAI( + base_url=self.base_url, + api_key=self.api_key, + ) + return self.client + + def load_async_model(self): + """Load and return the async client.""" + if self.async_client is None: + self.async_client = self.AsyncOpenAI( + base_url=self.base_url, + api_key=self.api_key, + ) + return self.async_client + + def generate(self, prompt: str) -> str: + """ + Generate a response from the evaluation model. + + Args: + prompt: The prompt to send to the model + + Returns: + The model's response as a string + """ + client = self.load_model() + response = client.chat.completions.create( + model=self.model_name, + messages=[{"role": "user", "content": prompt}], + ) + return response.choices[0].message.content + + async def a_generate(self, prompt: str) -> str: + """ + Asynchronously generate a response from the evaluation model. 
+ + Args: + prompt: The prompt to send to the model + + Returns: + The model's response as a string + """ + # Use the async client for async operations + client = self.load_async_model() + response = await client.chat.completions.create( + model=self.model_name, + messages=[{"role": "user", "content": prompt}], + ) + return response.choices[0].message.content + + def get_model_name(self): + """Return the name of the model.""" + return f"EvaluationLLM: {self.model_name}" diff --git a/src/atgen/metrics/deepeval_supported_models_and_metrics.py b/src/atgen/metrics/deepeval_supported_models_and_metrics.py deleted file mode 100644 index 40b3a1b..0000000 --- a/src/atgen/metrics/deepeval_supported_models_and_metrics.py +++ /dev/null @@ -1,64 +0,0 @@ -from typing import Literal - -# Available DeepEval metrics -DEEPEVAL_METRICS = [ - "deepeval_answer_relevance", - "deepeval_faithfulness", - "deepeval_summarization", - "deepeval_prompt_alignment", -] - -# Available API models by provider -# TODO: somehow update periodically -API_MODELS = { - "openai": ["gpt-4o", "gpt-4o-mini", "gpt-4.5", "o1-preview"], - "anthropic": [ - "claude-3.7-sonnet:thinking", - "claude-3.7-sonnet", - "claude-3-5-sonnet", - "claude-3-opus", - "claude-3-sonnet", - "claude-3-haiku", - ], - "openrouter": [ - "openai/gpt-4o-2024-11-20", - "google/gemini-2.0-flash-001", - "anthropic/claude-3.5-sonnet", - "openai/gpt-4o-mini", - "mistralai/mistral-nemo", - "meta-llama/llama-3.1-70b-instruct", - ], -} - - -def get_available_models( - provider: Literal["openai", "anthropic", "openrouter"] = "openai", -): - """ - Get a list of all available models for a specific provider. - - Args: - provider: The provider to get models for Literal["openai", "anthropic", "openrouter"] - - Returns: - List of available models for the specified provider - """ - provider = provider.lower() - if provider in API_MODELS: - return API_MODELS[provider] - return [] - - -def get_available_metrics(): - """ - Get a list of all available metrics. - - Returns: - List of available metrics - """ - basic_metrics = [ - "bartscore", - "alignscore", - ] - - return basic_metrics + DEEPEVAL_METRICS diff --git a/src/atgen/metrics/factory.py b/src/atgen/metrics/factory.py new file mode 100644 index 0000000..d68a7a9 --- /dev/null +++ b/src/atgen/metrics/factory.py @@ -0,0 +1,40 @@ +import logging +from omegaconf import DictConfig, OmegaConf +from .registry import METRICS_REGISTRY +from .base_metric import BaseMetric +from typing import Optional +log = logging.getLogger(__name__) + +class MetricFactory: + def __init__(self, config: DictConfig, cache_dir: Optional[str] = None): + """ + Initializes the factory with the global configuration. + + Args: + config: The main OmegaConf DictConfig object. + """ + self.config = config + self.cache_dir = cache_dir + def get_metric(self, metric_name: str) -> BaseMetric: + """ + Instantiates and returns a metric runner based on its name. + + It automatically creates the specific config for the metric + by extracting relevant parameters from the main config object. + + Args: + metric_name: The name of the metric to instantiate. + + Returns: + An instance of a class derived from BaseMetric. 
+ """ + if metric_name not in METRICS_REGISTRY: + if "deepeval" in metric_name: + metric_name = "deepeval" + else: + raise ValueError(f"Metric '{metric_name}' not found in registry.") + + MetricClass, ConfigClass = METRICS_REGISTRY[metric_name] + metric_config = ConfigClass() + + return MetricClass(config=metric_config) diff --git a/src/atgen/metrics/metrics.py b/src/atgen/metrics/metrics.py deleted file mode 100644 index 52f9504..0000000 --- a/src/atgen/metrics/metrics.py +++ /dev/null @@ -1,639 +0,0 @@ -from math import ceil -import os -import sys -from openai import OpenAI, AsyncOpenAI -from typing import Union -from urllib.request import urlretrieve -import logging -from pathlib import Path -import nltk -import numpy as np -import torch -import torch.nn.functional as F -from datasets import Dataset -from nltk import ngrams -from nltk.stem import porter -from nltk.tokenize import word_tokenize, sent_tokenize -from nltk.translate.bleu_score import corpus_bleu -from rouge_score import tokenize -from torch.utils.data import DataLoader -from transformers import ( - AutoModelForSequenceClassification, - AutoTokenizer, - AutoModel, - DataCollatorWithPadding, -) - -log = logging.getLogger(__name__) - -try: - from .bart_score import BARTScorer - - is_bart_score_available = True -except ImportError: - log.warning( - "BARTScorer not found, please install it (see `install.sh`). Skipping the BARTScore metric." - ) - is_bart_score_available = False - -try: - from alignscore import AlignScore - - is_alignscore_available = True -except ImportError: - log.warning( - "AlignScore not found, please install it (see `install.sh`). Skipping the AlignScore metric." - ) - is_alignscore_available = False - -from deepeval import evaluate -from deepeval.models.base_model import DeepEvalBaseLLM -from deepeval.test_case import LLMTestCase -from deepeval.metrics import ( - AnswerRelevancyMetric, - FaithfulnessMetric, - SummarizationMetric, - PromptAlignmentMetric, -) - - -ALIGNSCORE_CHECKPOINT_PATH = os.getenv( - "ALIGNSCORE_CHECKPOINT_PATH", - # Going up 3 levels from metrics.py: src/atgen/metrics -> repository root - os.path.join( - Path(__file__).parents[3], - "cache/AlignScore-base.ckpt", - ), -) - - -def decode(eval_preds, tokenizer): - predictions, labels, *inputs = eval_preds - predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id) - decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) - # Replace -100 in the labels as we can't decode them. - labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - - decoded_preds = [pred.strip() for pred in decoded_preds] - decoded_labels = [label.strip() for label in decoded_labels] - - if len(inputs) > 0: - input_ids = inputs[0] - input_ids = np.where(input_ids != -100, input_ids, tokenizer.pad_token_id) - decoded_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True) - decoded_texts = [text.strip() for text in decoded_texts] - return decoded_preds, decoded_labels, decoded_texts - - return decoded_preds, decoded_labels - - -def smoothing_function(p_n, references, hypothesis, hyp_len): - """ - Smooth-BLEU (BLEUS) as proposed in the paper: - Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic - evaluation metrics for machine translation. COLING 2004. 
- """ - smoothed_p_n = [] - for i, p_i in enumerate(p_n, start=1): - # Smoothing is not applied for unigrams - if i > 1: - # If hypothesis length is lower than the current order, its value equals (0 + 1) / (0 + 1) = 0 - if hyp_len < i: - assert p_i.denominator == 1 - smoothed_p_n.append(1) - # Otherwise apply smoothing - else: - smoothed_p_i = (p_i.numerator + 1) / (p_i.denominator + 1) - smoothed_p_n.append(smoothed_p_i) - else: - smoothed_p_n.append(p_i) - return smoothed_p_n - - -def pair_bleu(references: list[str] | str, prediction: str): - """ - Compute the bleu score between two given texts. - A smoothing function is used to avoid zero scores when - there are no common higher order n-grams between the - texts. - """ - if isinstance(references, str): - tok_ref = [[word_tokenize(references)]] - else: - tok_ref = [[word_tokenize(ref) for ref in references]] - tok_pred = [word_tokenize(prediction)] - try: - return corpus_bleu(tok_ref, tok_pred, smoothing_function=smoothing_function) - except (KeyError, ZeroDivisionError): - return 0.0 - - -def calculate_bart_score( - preds, - refs=None, - texts=None, - scorer=None, - batch_size=4, - aggregate=True, - cache_dir: str = "cache", -): - if not is_bart_score_available: - return None - if scorer is None: - scorer = BARTScorer(cache_dir=cache_dir) - scores = {} - if texts is not None: - scores["BARTScore-sh"] = np.array( - scorer.score(texts, preds, batch_size=batch_size) - ) - if refs is not None: - # scores["BARTScore-rh"] = np.array(scorer.score(refs, preds, batch_size=batch_size)) - if isinstance(refs[0], list): - scores_hr = [] - for ref, pred in zip(refs, preds): - inst_pred = [pred for _ in range(len(ref))] - # Take a maximum within the observation similar to ROUGE - inst_score_hr = max(scorer.score(inst_pred, ref, batch_size=batch_size)) - scores_hr.append(inst_score_hr) - scores["BARTScore-hr"] = np.array(scores_hr) - else: - scores["BARTScore-hr"] = np.array( - scorer.score(preds, refs, batch_size=batch_size) - ) - # scores["BARTScore-fa"] = (scores["BARTScore-rh"] + scores["BARTScore-hr"]) / 2 - - if aggregate: - scores = {key: np.mean(value) for key, value in scores.items()} - return scores - - -def calculate_abstractiveness_scores( - predictions, texts, references=None, aggregate: bool = True -): - stemmer = porter.PorterStemmer() - tokenized_preds = [tokenize.tokenize(x, stemmer) for x in predictions] - tokenized_texts = [tokenize.tokenize(x, stemmer) for x in texts] - if references is not None: - tokenized_refs = [tokenize.tokenize(x, stemmer) for x in references] - else: - tokenized_refs = tokenized_preds - - result = {} - for use_modified in [False, True]: - for n in range(1, 5): - pred_ngram_overlaps = [] - label_ngram_overlaps = [] - for pred, label, text in zip( - tokenized_preds, tokenized_refs, tokenized_texts - ): - pred_pair_ngram_overlap = calculate_ngram_overlap( - pred, text, n, use_modified - ) - pred_ngram_overlaps.append(pred_pair_ngram_overlap) - if references is not None: - label_pair_ngram_overlap = calculate_ngram_overlap( - label, text, n, use_modified - ) - label_ngram_overlaps.append(label_pair_ngram_overlap) - key = f"ngram_overlap_{n}" if use_modified else f"novel_ngrams_{n}" - - pred_ngram_overlaps = np.array(pred_ngram_overlaps) - cond_abs = ~np.isnan(pred_ngram_overlaps) - result[key + "_abs"] = pred_ngram_overlaps[cond_abs] - - if references is not None: - label_ngram_overlaps = np.array(label_ngram_overlaps) - cond_rel = cond_abs & ~np.isnan(label_ngram_overlaps) - result[key + "_rel"] = ( - 
pred_ngram_overlaps[cond_rel] / label_ngram_overlaps[cond_rel] - ) - - if aggregate: - for key, value in result.items(): - result[key] = np.mean(value) - - return result - - -def calculate_ngram_overlap(summary, text, n=1, use_modified=True): - summary_ngrams = list(ngrams(summary, n)) - text_ngrams = list(ngrams(text, n)) - - if len(summary_ngrams) > 0: - ngrams_intersection = set(summary_ngrams).intersection(set(text_ngrams)) - if use_modified: - word_is_part_of_ngram_copied = [ - any((x in ngram for ngram in ngrams_intersection)) for x in summary - ] - return 1 - sum(word_is_part_of_ngram_copied) / len( - word_is_part_of_ngram_copied - ) - else: - return sum([x not in ngrams_intersection for x in summary_ngrams]) / len( - summary_ngrams - ) - return np.nan - - -class SentBert: - def __init__( - self, - checkpoint: str = "sentence-transformers/all-mpnet-base-v2", - device: str = "cuda", - cache_dir: str = "cache", - ): - self.tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir) - self.model = AutoModel.from_pretrained(checkpoint, cache_dir=cache_dir).to( - device - ) - self.device = device - - def __call__( - self, source_texts: list[str], ref_texts: list[str], batch_size: int = 32 - ) -> np.ndarray: - assert len(source_texts) == len(ref_texts) - # Make batch_size an even number - if batch_size % 2 == 0: - batch_size -= 1 - half_batch_size = batch_size // 2 - n_texts = len(source_texts) - scores = np.empty(n_texts, dtype=np.float32) - start = 0 - end = 0 - - while end < n_texts: - end += half_batch_size - batch_idx = slice(start, end) - # Tokenize sentences - encoded_input = self.tokenizer( - source_texts[batch_idx] + ref_texts[batch_idx], - padding=True, - truncation=True, - return_tensors="pt", - ) - encoded_input = { - key: value.to(self.device) for key, value in encoded_input.items() - } - # Calculate the probability of belonging to the positive class - model_output = self.model(**encoded_input) - # Perform pooling - sent_embs = self.mean_pooling(model_output, encoded_input["attention_mask"]) - # Normalize embeddings - sent_embs = F.normalize(sent_embs, p=2, dim=1) - n_source_embs = len(sent_embs) // 2 - scores[batch_idx] = ( - (sent_embs[:n_source_embs] * sent_embs[n_source_embs:]) - .sum(-1) - .cpu() - .detach() - .numpy() - ) - start = end - - return scores - - @staticmethod - def mean_pooling(model_output, attention_mask): - """ - Mean Pooling - Take attention mask into account for correct averaging - """ - token_embeddings = model_output[ - 0 - ] # First element of model_output contains all token embeddings - input_mask_expanded = ( - attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - ) - return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( - input_mask_expanded.sum(1), min=1e-9 - ) - - -def calculate_alignscore( - predictions: list[str], - references: Union[list[str], list[list[str]]], - original_texts: list[str], - batch_size: int = 32, - device: str = "cuda", - cache_dir: str = "cache", -): - if not is_alignscore_available: - return None - if isinstance(references[0], list): - log.error("AlignScore does not support multiple references. 
Skipping...") - return None - if not os.path.exists(ALIGNSCORE_CHECKPOINT_PATH): - urlretrieve( - "https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-base.ckpt", - ALIGNSCORE_CHECKPOINT_PATH, - ) - - scorer = AlignScore( - model="roberta-base", - batch_size=batch_size, - device=device, - ckpt_path=ALIGNSCORE_CHECKPOINT_PATH, - evaluation_mode="nli_sp", - ) - # Fix: alignscore outputs an error if a text is empty, so we need to add some content to such texts - original_texts = [text if text else " " for text in original_texts] - predictions = [text if text else " " for text in predictions] - references = [text if text else " " for text in references] - - scores_ref = scorer.score(contexts=original_texts, claims=predictions) - if isinstance(references[0], list): - scores_baseline = [] - for orig_text, refs in zip(original_texts, references): - inst_baseline_scores = scorer.score( - contexts=[orig_text] * len(refs), claims=refs - ) - scores_baseline.append(max(inst_baseline_scores)) - else: - scores_baseline = scorer.score(contexts=original_texts, claims=references) - scores_rel = np.array(scores_ref) / np.array(scores_baseline) - return {"alignscore": scores_ref, "alignscore_rel": scores_rel} - - -class EvaluationLLM(DeepEvalBaseLLM): - """ - Custom Evaluation LLM implementation for DeepEval. - - This class implements the DeepEvalBaseLLM interface to allow using - custom models with DeepEval metrics. - """ - - def __init__( - self, - api_key=None, - model="openai/gpt-4o-2024-11-20", - base_url="https://openrouter.ai/api/v1", - ): - """ - Initialize the Evaluation LLM. - - Args: - api_key: Evaluation API key - model: Model identifier (e.g., "openai/gpt-4o-2024-11-20") - base_url: Evaluation API base URL - """ - self.api_key = api_key - - self.model_name = model - self.base_url = base_url - self.client = None - self.async_client = None - self.OpenAI = OpenAI - self.AsyncOpenAI = AsyncOpenAI - - def load_model(self): - """Load and return the client.""" - if self.client is None: - self.client = self.OpenAI( - base_url=self.base_url, - api_key=self.api_key, - ) - return self.client - - def load_async_model(self): - """Load and return the async client.""" - if self.async_client is None: - self.async_client = self.AsyncOpenAI( - base_url=self.base_url, - api_key=self.api_key, - ) - return self.async_client - - def generate(self, prompt: str) -> str: - """ - Generate a response from the evaluation model. - - Args: - prompt: The prompt to send to the model - - Returns: - The model's response as a string - """ - client = self.load_model() - response = client.chat.completions.create( - model=self.model_name, - messages=[{"role": "user", "content": prompt}], - ) - return response.choices[0].message.content - - async def a_generate(self, prompt: str) -> str: - """ - Asynchronously generate a response from the evaluation model. 
- - Args: - prompt: The prompt to send to the model - - Returns: - The model's response as a string - """ - # Use the async client for async operations - client = self.load_async_model() - response = await client.chat.completions.create( - model=self.model_name, - messages=[{"role": "user", "content": prompt}], - ) - return response.choices[0].message.content - - def get_model_name(self): - """Return the name of the model.""" - return f"EvaluationLLM: {self.model_name}" - - -def calculate_deepeval_metrics( - predictions, - references, - original_texts, - metrics_to_calculate=None, - base_url: str = "https://openrouter.ai/api/v1", - api_key: str = None, - model="openai/gpt-4o-2024-11-20", - threshold=0.5, - include_reason=False, - strict_mode=False, - async_mode=True, - verbose_mode=False, - truths_extraction_limit=None, -): - """ - Calculate DeepEval metrics using EvaluationLLM. - - Args: - predictions: list of generated texts - references: list of reference texts - original_texts: list of source texts - metrics_to_calculate: list of metrics to calculate. Options: - ["deepeval_answer_relevance", "deepeval_faithfulness", "deepeval_summarization", "deepeval_prompt_alignment"] - api_key: Evaluation API key - base_url: Evaluation API base URL - model: Evaluation model to use - threshold: Threshold for metrics (default: 0.5) - include_reason: Include reason for evaluation score (default: False) - strict_mode: Enforce binary metric score (1 for perfection, 0 otherwise) (default: False) - async_mode: Enable concurrent execution (default: True) - verbose_mode: Print intermediate steps (default: False) - truths_extraction_limit: Maximum number of factual truths to extract (default: None) - - Returns: - dictionary with metric scores - """ - - if not metrics_to_calculate: - metrics_to_calculate = [ - "deepeval_answer_relevance", - "deepeval_faithfulness", - "deepeval_summarization", - "deepeval_prompt_alignment", - ] - - # Create EvaluationLLM instance - llm = EvaluationLLM( - base_url=base_url, - api_key=api_key, - model=model, - ) - - results = {} - - # Create metrics based on selected options - metrics = [] - metric_name_mapping = {} # Maps metric class name to the deepeval metric name - - # Dictionary to store test cases for each metric - metric_test_cases = {} - - if "deepeval_answer_relevance" in metrics_to_calculate: - metric = AnswerRelevancyMetric( - threshold=threshold, - model=llm, - include_reason=include_reason, - strict_mode=strict_mode, - async_mode=async_mode, - ) - metrics.append(metric) - metric_name_mapping[metric.__class__.__name__] = "deepeval_answer_relevance" - - # Create specific test cases for AnswerRelevancy metric - answer_relevance_test_cases = [] - for i, (pred, src) in enumerate(zip(predictions, original_texts)): - test_case = LLMTestCase( - input=src, - actual_output=pred, - ) - answer_relevance_test_cases.append(test_case) - metric_test_cases[metric.__class__.__name__] = answer_relevance_test_cases - - if "deepeval_faithfulness" in metrics_to_calculate: - metric = FaithfulnessMetric( - threshold=threshold, - model=llm, - include_reason=include_reason, - strict_mode=strict_mode, - async_mode=async_mode, - truths_extraction_limit=truths_extraction_limit, - ) - metrics.append(metric) - metric_name_mapping[metric.__class__.__name__] = "deepeval_faithfulness" - - # Create specific test cases for Faithfulness metric - faithfulness_test_cases = [] - for i, (pred, src) in enumerate(zip(predictions, original_texts)): - test_case = LLMTestCase( - input=src, - actual_output=pred, 
- retrieval_context=[src], - ) - faithfulness_test_cases.append(test_case) - metric_test_cases[metric.__class__.__name__] = faithfulness_test_cases - - if "deepeval_summarization" in metrics_to_calculate: - metric = SummarizationMetric( - threshold=threshold, - model=llm, - include_reason=include_reason, - strict_mode=strict_mode, - async_mode=async_mode, - ) - metrics.append(metric) - metric_name_mapping[metric.__class__.__name__] = "deepeval_summarization" - - # Create specific test cases for Summarization metric - summarization_test_cases = [] - for i, (pred, src) in enumerate(zip(predictions, original_texts)): - test_case = LLMTestCase( - input=src, - actual_output=pred, - ) - summarization_test_cases.append(test_case) - metric_test_cases[metric.__class__.__name__] = summarization_test_cases - - if "deepeval_prompt_alignment" in metrics_to_calculate: - metric = PromptAlignmentMetric( - threshold=threshold, - model=llm, - prompt_instructions=["Do what you are told to do in the prompt"], - include_reason=include_reason, - strict_mode=strict_mode, - async_mode=async_mode, - ) - metrics.append(metric) - metric_name_mapping[metric.__class__.__name__] = "deepeval_prompt_alignment" - - # Create specific test cases for PromptAlignment metric - prompt_alignment_test_cases = [] - for i, (pred, ref, src) in enumerate( - zip(predictions, references, original_texts) - ): - test_case = LLMTestCase( - input=src, - actual_output=pred, - expected_output=ref, - ) - prompt_alignment_test_cases.append(test_case) - metric_test_cases[metric.__class__.__name__] = prompt_alignment_test_cases - - # Run evaluation for each metric separately - for metric in metrics: - metric_class_name = metric.__class__.__name__ - test_cases = metric_test_cases.get(metric_class_name, []) - - if test_cases: - # Disable printing to console during evaluation if not verbose - original_stdout = sys.stdout - if not verbose_mode: - sys.stdout = open(os.devnull, "w") - - try: - # Run evaluation for this specific metric - evaluation_results = evaluate( - test_cases=test_cases, - metrics=[metric], - run_async=async_mode, - ) - - deepeval_metric_name = metric_name_mapping.get(metric_class_name) - scores = [] - reasons = [] - - # Process results for this metric - for result in evaluation_results.test_results: - scores.append(1 if result.success else 0) - - # Calculate average score - if scores: - results[deepeval_metric_name] = np.mean(scores) - - finally: - # Restore stdout - if not verbose_mode: - sys.stdout.close() - sys.stdout = original_stdout - print("================================================") - print("Results:") - print(results) - print("================================================") - - return results diff --git a/src/atgen/metrics/model_based/__init__.py b/src/atgen/metrics/model_based/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/atgen/metrics/model_based/alignscore.py b/src/atgen/metrics/model_based/alignscore.py new file mode 100644 index 0000000..353e8b2 --- /dev/null +++ b/src/atgen/metrics/model_based/alignscore.py @@ -0,0 +1,60 @@ +from logging import log +from multiprocessing import reduction +import os +from pathlib import Path +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional +from urllib.request import urlretrieve +import numpy as np +from alignscore import AlignScore as AlignScoreModel + +class AlignScoreConfig(MetricConfig): + batch_size: int = 32 + device: str = "cuda" + cache_dir: str = "cache" + + +class AlignScore(BaseMetric): 
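+    """Model-based factual-consistency metric built on AlignScore.
+
+    The AlignScore-base checkpoint is downloaded on first use unless
+    ALIGNSCORE_CHECKPOINT_PATH already points at a local copy; compute() scores
+    predictions against the source texts and against a reference baseline,
+    which yields the relative "alignscore_rel" value.
+    """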
+ def __init__(self, config: AlignScoreConfig): + super().__init__(config) + self.ALIGNSCORE_CHECKPOINT_PATH = os.getenv( + "ALIGNSCORE_CHECKPOINT_PATH", '../../cache/AlignScore-base.ckpt', + ) + + + if not os.path.exists(self.ALIGNSCORE_CHECKPOINT_PATH): + os.makedirs(os.path.dirname(self.ALIGNSCORE_CHECKPOINT_PATH), exist_ok=True) + urlretrieve( + "https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-base.ckpt", + self.ALIGNSCORE_CHECKPOINT_PATH, + ) + + self.scorer = AlignScoreModel( + model="roberta-base", + batch_size=self.config.batch_size, + device=self.config.device, + ckpt_path=self.ALIGNSCORE_CHECKPOINT_PATH, + evaluation_mode="nli_sp", + ) + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + original_texts = [text if text else " " for text in sources] + predictions = [text if text else " " for text in predictions] + references = [text if text else " " for text in references] + + scores_ref = self.scorer.score(contexts=original_texts, claims=predictions) + if isinstance(references[0], list): + scores_baseline = [] + for orig_text, refs in zip(original_texts, references): + inst_baseline_scores = self.scorer.score( + contexts=[orig_text] * len(refs), claims=refs + ) + scores_baseline.append(max(inst_baseline_scores)) + else: + scores_baseline = self.scorer.score(contexts=original_texts, claims=references) + scores_rel = np.array(scores_ref) / np.array(scores_baseline) + + if self.config.aggregate: + return {"alignscore": float(np.mean(scores_ref)), "alignscore_rel": float(np.mean(scores_rel))} + else: + return {"alignscore": scores_ref, "alignscore_rel": scores_rel} diff --git a/src/atgen/metrics/bart_score.py b/src/atgen/metrics/model_based/bartscore.py similarity index 70% rename from src/atgen/metrics/bart_score.py rename to src/atgen/metrics/model_based/bartscore.py index 6133693..77a4ce1 100644 --- a/src/atgen/metrics/bart_score.py +++ b/src/atgen/metrics/model_based/bartscore.py @@ -1,4 +1,6 @@ -# %% +from math import prod +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional import traceback from typing import List from tqdm import tqdm @@ -116,3 +118,45 @@ def test(self, batch_size=3): tgt_list = ["That's stupid.", "What's the problem?", "He is trustworthy."] print(self.score(src_list, tgt_list, batch_size)) + + +class BartScoreConfig(MetricConfig): + device: str = "cuda" + max_length: int = 1024 + checkpoint: str = "facebook/bart-large-cnn" + cache_dir: str = "cache" + batch_size: int = 4 + +class BartScore(BaseMetric): + def __init__(self, config: BartScoreConfig): + super().__init__(config) + self.scorer = BARTScorer( + device=self.config.device, + max_length=self.config.max_length, + checkpoint=self.config.checkpoint, + cache_dir=self.config.cache_dir, + ) + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + scores = {} + if references is not None: + scores["BARTScore-sh"] = np.array( + self.scorer.score(references, predictions, batch_size=self.config.batch_size) + ) + if references is not None: + if isinstance(references[0], list): + scores_hr = [] + for ref, pred in zip(references, predictions): + inst_pred = [pred for _ in range(len(ref))] + # Take a maximum within the observation similar to ROUGE + inst_score_hr = max(self.scorer.score(inst_pred, ref, batch_size=self.config.batch_size)) + scores_hr.append(inst_score_hr) + scores["BARTScore-hr"] = 
np.array(scores_hr) + else: + scores["BARTScore-hr"] = np.array( + self.scorer.score(predictions, references, batch_size=self.config.batch_size) + ) + + if self.config.aggregate: + scores = {key: np.mean(value) for key, value in scores.items()} + return scores diff --git a/src/atgen/metrics/registry.py b/src/atgen/metrics/registry.py new file mode 100644 index 0000000..2735eda --- /dev/null +++ b/src/atgen/metrics/registry.py @@ -0,0 +1,22 @@ +from atgen.metrics.classic_metrics.bleu import Bleu, BleuConfig +from atgen.metrics.classic_metrics.rouge import Rouge, RougeConfig +from atgen.metrics.classic_metrics.sacrebleu import Sacrebleu, SacrebleuConfig +from atgen.metrics.classic_metrics.exact_match import ExactMatch, ExactMatchConfig +from atgen.metrics.classic_metrics.word_length import WordLength, WordLengthConfig +from atgen.metrics.model_based.bartscore import BartScore, BartScoreConfig +from atgen.metrics.model_based.alignscore import AlignScore, AlignScoreConfig +from atgen.metrics.deep_eval.deepeval_metrics import Deepeval, DeepevalConfig + + +METRICS_REGISTRY = { + "bleu": (Bleu, BleuConfig), + "rouge": (Rouge, RougeConfig), + "rouge1": (Rouge, RougeConfig), + "sacrebleu": (Sacrebleu, SacrebleuConfig), + "exact_match": (ExactMatch, ExactMatchConfig), + "word_length": (WordLength, WordLengthConfig), + "bartscore": (BartScore, BartScoreConfig), + "alignscore": (AlignScore, AlignScoreConfig), + "deepeval": (Deepeval, DeepevalConfig), + +} \ No newline at end of file
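For reviewers, a minimal usage sketch of the refactored entry points introduced by this patch (`compute_metrics`, `MetricFactory`, and the registry-backed `BaseMetric.compute` interface). It is illustrative only: the config contents and example texts below are assumptions, not part of the patch.

```python
# Illustrative sketch only: exercises the factory/registry path added in this patch.
# The config keys and example strings are assumptions chosen for the demo.
from omegaconf import OmegaConf

from atgen.metrics.compute_metrics import compute_metrics
from atgen.metrics.factory import MetricFactory

config = OmegaConf.create({"additional_metrics": []})

predictions = ["The cat sat on the mat."]
references = ["A cat was sitting on the mat."]
sources = ["My cat spends most of the day sitting on the mat in the hallway."]

# High-level path: task defaults for "summarization" (exact_match, sacrebleu,
# bleu, rouge, word_length) plus anything listed in config.additional_metrics.
results = compute_metrics(
    generated_texts=predictions,
    reference_texts=references,
    original_texts=sources,
    task="summarization",
    config=config,
)
print(results)

# Single-metric path: build one runner through the factory and call it directly.
factory = MetricFactory(config)
rouge = factory.get_metric("rouge")
print(rouge.compute(predictions=predictions, references=references))
```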