From 344cd5d07e50df6ad2562ece9a3ecb39d529b8db Mon Sep 17 00:00:00 2001 From: alfekka Date: Mon, 21 Jul 2025 17:21:29 +0300 Subject: [PATCH] refactor metrics --- requirements.txt | 4 +- src/atgen/metrics/base_metric.py | 53 ++ src/atgen/metrics/classic_metrics/__init__.py | 1 + .../classic_metrics/abstractiveness.py | 82 +++ src/atgen/metrics/classic_metrics/bleu.py | 56 ++ .../metrics/classic_metrics/exact_match.py | 31 + .../classic_metrics/exact_match_math.py | 26 + src/atgen/metrics/classic_metrics/rouge.py | 27 + .../metrics/classic_metrics/sacrebleu.py | 37 + .../metrics/classic_metrics/word_length.py | 36 + src/atgen/metrics/compute_metrics.py | 369 +++------- src/atgen/metrics/deep_eval/__init__.py | 0 .../metrics/deep_eval/deepeval_metrics.py | 183 +++++ src/atgen/metrics/deep_eval/evaluationllm.py | 99 +++ .../deepeval_supported_models_and_metrics.py | 64 -- src/atgen/metrics/factory.py | 40 ++ src/atgen/metrics/metrics.py | 639 ------------------ src/atgen/metrics/model_based/__init__.py | 0 src/atgen/metrics/model_based/alignscore.py | 60 ++ .../bartscore.py} | 46 +- src/atgen/metrics/registry.py | 22 + 21 files changed, 880 insertions(+), 995 deletions(-) create mode 100644 src/atgen/metrics/base_metric.py create mode 100644 src/atgen/metrics/classic_metrics/__init__.py create mode 100644 src/atgen/metrics/classic_metrics/abstractiveness.py create mode 100644 src/atgen/metrics/classic_metrics/bleu.py create mode 100644 src/atgen/metrics/classic_metrics/exact_match.py create mode 100644 src/atgen/metrics/classic_metrics/exact_match_math.py create mode 100644 src/atgen/metrics/classic_metrics/rouge.py create mode 100644 src/atgen/metrics/classic_metrics/sacrebleu.py create mode 100644 src/atgen/metrics/classic_metrics/word_length.py create mode 100644 src/atgen/metrics/deep_eval/__init__.py create mode 100644 src/atgen/metrics/deep_eval/deepeval_metrics.py create mode 100644 src/atgen/metrics/deep_eval/evaluationllm.py delete mode 100644 src/atgen/metrics/deepeval_supported_models_and_metrics.py create mode 100644 src/atgen/metrics/factory.py delete mode 100644 src/atgen/metrics/metrics.py create mode 100644 src/atgen/metrics/model_based/__init__.py create mode 100644 src/atgen/metrics/model_based/alignscore.py rename src/atgen/metrics/{bart_score.py => model_based/bartscore.py} (70%) create mode 100644 src/atgen/metrics/registry.py diff --git a/requirements.txt b/requirements.txt index 216563c..5d87b05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,9 +28,9 @@ spacy==3.7.5 streamlit==1.37.0 streamlit-authenticator==0.4.2 tabulate==0.9.0 -transformers==4.52.4 +transformers trl==0.15.2 torchmetrics==1.4.1 -unsloth==2025.3.17 +unsloth vllm==0.8.1 xlrd==1.2.0 diff --git a/src/atgen/metrics/base_metric.py b/src/atgen/metrics/base_metric.py new file mode 100644 index 0000000..a31bf7b --- /dev/null +++ b/src/atgen/metrics/base_metric.py @@ -0,0 +1,53 @@ +import string +from pydantic import BaseModel +from typing import List, Optional +import re +from abc import ABC, abstractmethod + + + +class MetricConfig(BaseModel): + aggregate: bool = True + + +class BaseMetric(ABC): + def __init__(self, config: MetricConfig): + self.config = config + + @abstractmethod + def compute(self, predictions: List[str], references: List[str]) -> float: + raise NotImplementedError + + def _preprocess_text(self, text: str, + do_lowercase: bool = True, + do_remove_punctuation: bool = True, + do_remove_extra_spaces: bool = True, + do_remove_stopwords: bool = False, + stopwords: Optional[list[str]] = 
None) -> str: + # Convert to lowercase + if do_lowercase: + text = text.lower() + + # Remove punctuation + if do_remove_punctuation: + # Keep hyphens within words, remove other punctuation + text = re.sub(r'(? 0: + ngrams_intersection = set(summary_ngrams).intersection(set(text_ngrams)) + if use_modified: + word_is_part_of_ngram_copied = [ + any((x in ngram for ngram in ngrams_intersection)) for x in summary + ] + return 1 - sum(word_is_part_of_ngram_copied) / len( + word_is_part_of_ngram_copied + ) + else: + return sum([x not in ngrams_intersection for x in summary_ngrams]) / len( + summary_ngrams + ) + return np.nan + + + def compute(self, predictions: List[str], references: List[str], sources: List[str], **kwargs) -> float: + stemmer = porter.PorterStemmer() + tokenized_preds = [tokenize.tokenize(x, stemmer) for x in predictions] + tokenized_texts = [tokenize.tokenize(x, stemmer) for x in sources] + if references is not None: + tokenized_refs = [tokenize.tokenize(x, stemmer) for x in references] + else: + tokenized_refs = tokenized_preds + + result = {} + for use_modified in [False, True]: + for n in range(1, 5): + pred_ngram_overlaps = [] + label_ngram_overlaps = [] + for pred, label, text in zip( + tokenized_preds, tokenized_refs, tokenized_texts + ): + pred_pair_ngram_overlap = self._calculate_ngram_overlap( + pred, text, n, use_modified + ) + pred_ngram_overlaps.append(pred_pair_ngram_overlap) + if references is not None: + label_pair_ngram_overlap = self._calculate_ngram_overlap( + label, text, n, use_modified + ) + label_ngram_overlaps.append(label_pair_ngram_overlap) + key = f"ngram_overlap_{n}" if use_modified else f"novel_ngrams_{n}" + + pred_ngram_overlaps = np.array(pred_ngram_overlaps) + cond_abs = ~np.isnan(pred_ngram_overlaps) + result[key + "_abs"] = pred_ngram_overlaps[cond_abs] + + if references is not None: + label_ngram_overlaps = np.array(label_ngram_overlaps) + cond_rel = cond_abs & ~np.isnan(label_ngram_overlaps) + result[key + "_rel"] = ( + pred_ngram_overlaps[cond_rel] / label_ngram_overlaps[cond_rel] + ) + + if self.config.aggregate: + for key, value in result.items(): + result[key] = np.mean(value) + + return {"abstractiveness": result} \ No newline at end of file diff --git a/src/atgen/metrics/classic_metrics/bleu.py b/src/atgen/metrics/classic_metrics/bleu.py new file mode 100644 index 0000000..72afd1c --- /dev/null +++ b/src/atgen/metrics/classic_metrics/bleu.py @@ -0,0 +1,56 @@ +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional +from nltk import ngrams +from nltk.stem import porter +from nltk.tokenize import word_tokenize, sent_tokenize +from nltk.translate.bleu_score import corpus_bleu +import numpy as np + + +class BleuConfig(MetricConfig): + pass + + +class Bleu(BaseMetric): + def __init__(self, config: BleuConfig): + super().__init__(config) + + + def _smoothing_function(self, p_n, references, hypothesis, hyp_len): + smoothed_p_n = [] + for i, p_i in enumerate(p_n, start=1): + # Smoothing is not applied for unigrams + if i > 1: + # If hypothesis length is lower than the current order, its value equals (0 + 1) / (0 + 1) = 0 + if hyp_len < i: + assert p_i.denominator == 1 + smoothed_p_n.append(1) + # Otherwise apply smoothing + else: + smoothed_p_i = (p_i.numerator + 1) / (p_i.denominator + 1) + smoothed_p_n.append(smoothed_p_i) + else: + smoothed_p_n.append(p_i) + return smoothed_p_n + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: 
+ scores = [] + for pred, ref in zip(predictions, references): + if isinstance(ref, str): + ref_list = [ref] + else: + ref_list = ref + + tok_ref = [[word_tokenize(r) for r in ref_list]] + tok_pred = [word_tokenize(pred)] + + try: + bleu_score = corpus_bleu(tok_ref, tok_pred, smoothing_function=self._smoothing_function) + scores.append(bleu_score) + except (KeyError, ZeroDivisionError): + scores.append(0.0) + + if self.config.aggregate: + return {"bleu": float(np.mean(scores))} + else: + return {"bleu": scores} \ No newline at end of file diff --git a/src/atgen/metrics/classic_metrics/exact_match.py b/src/atgen/metrics/classic_metrics/exact_match.py new file mode 100644 index 0000000..d73a9e8 --- /dev/null +++ b/src/atgen/metrics/classic_metrics/exact_match.py @@ -0,0 +1,31 @@ +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional +import numpy as np + +class ExactMatchConfig(MetricConfig): + aggregate: bool = True + + +class ExactMatch(BaseMetric): + def __init__(self, config: ExactMatchConfig): + super().__init__(config) + + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + + if isinstance(references[0], list): + scores = np.array( + [ + any(self._preprocess_text(pred) == self._preprocess_text(one_ref) for one_ref in ref) + for pred, ref in zip(predictions, references) + ] + ) + else: + scores = np.array( + [self._preprocess_text(pred) == self._preprocess_text(ref) for pred, ref in zip(predictions, references)] + ) + + if self.config.aggregate: + return {"exact_match": float(np.mean(scores))} + else: + return {"exact_match": scores} \ No newline at end of file diff --git a/src/atgen/metrics/classic_metrics/exact_match_math.py b/src/atgen/metrics/classic_metrics/exact_match_math.py new file mode 100644 index 0000000..9025854 --- /dev/null +++ b/src/atgen/metrics/classic_metrics/exact_match_math.py @@ -0,0 +1,26 @@ +from multiprocessing import reduction +from typing import Literal, Optional +from omegaconf import DictConfig +import numpy as np + +from atgen.metrics.classic_metrics.base_metric import BaseMetric, BaseMetricConfig + + +class ExactMatchMathConfig(BaseMetricConfig): + aggregate: bool = True + +class ExactMatchMath(BaseMetric): + def __init__(self, config: DictConfig): + super().__init__(config) + + def compute(self, generated_texts: list[str], reference_texts: list[str], original_texts: list[str], task: Literal["summarization", "open-qa", "multi-choice-qa", "translation", "math"]) -> float: + scores = np.array( + [ + pred.split("#### ")[-1].lower() == ref.split("#### ")[-1].lower() + for pred, ref in zip(generated_texts, reference_texts) + ] + ) + if self.config.aggregate: + return {"exact_match_math": float(np.mean(scores))} + else: + return {"exact_match_math": scores} diff --git a/src/atgen/metrics/classic_metrics/rouge.py b/src/atgen/metrics/classic_metrics/rouge.py new file mode 100644 index 0000000..b13b23a --- /dev/null +++ b/src/atgen/metrics/classic_metrics/rouge.py @@ -0,0 +1,27 @@ +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional +from evaluate import load +import numpy as np + + +class RougeConfig(MetricConfig): + use_stemmer: bool = True + + + +class Rouge(BaseMetric): + def __init__(self, config: RougeConfig): + super().__init__(config) + self.rouge = load("rouge") + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + 
rouge_scores = self.rouge.compute( + predictions=predictions, + references=references, + use_stemmer=self.config.use_stemmer, + ) + + if self.config.aggregate: + return {k: float(np.mean(v)) for k, v in rouge_scores.items()} + else: + return rouge_scores \ No newline at end of file diff --git a/src/atgen/metrics/classic_metrics/sacrebleu.py b/src/atgen/metrics/classic_metrics/sacrebleu.py new file mode 100644 index 0000000..f4f30ee --- /dev/null +++ b/src/atgen/metrics/classic_metrics/sacrebleu.py @@ -0,0 +1,37 @@ +from time import time +from typing import List, Optional +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from evaluate import load +import numpy as np + + + +sacrebleu = load("sacrebleu") + + +class SacrebleuConfig(MetricConfig): + pass + + +class Sacrebleu(BaseMetric): + def __init__(self, config: SacrebleuConfig): + super().__init__(config) + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + if not isinstance(references[0], list): + sacrebleu_references = [[ref] for ref in references] + sacrebleu_result = sacrebleu.compute( + predictions=predictions, references=sacrebleu_references + ) + return float(sacrebleu_result.pop("score")) + else: + sacrebleu_scores = [] + for pred, ref in zip(predictions, references): + sacrebleu_result = sacrebleu.compute( + predictions=[pred], references=[ref] + ) + sacrebleu_scores.append(sacrebleu_result.pop("score")) + if self.config.aggregate: + return {"sacrebleu": float(np.mean(sacrebleu_scores))} + else: + return {"sacrebleu": sacrebleu_scores} \ No newline at end of file diff --git a/src/atgen/metrics/classic_metrics/word_length.py b/src/atgen/metrics/classic_metrics/word_length.py new file mode 100644 index 0000000..c790d02 --- /dev/null +++ b/src/atgen/metrics/classic_metrics/word_length.py @@ -0,0 +1,36 @@ +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional +import numpy as np + + +class WordLengthConfig(MetricConfig): + pass + + +class WordLength(BaseMetric): + def __init__(self, config: WordLengthConfig): + super().__init__(config) + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + # Calculate generated text lengths + gen_word_lengths = np.array([len(text.split()) for text in predictions]) + + # Calculate reference text lengths + if isinstance(references[0], list): + ref_word_lengths = np.array( + [ + np.mean([len(text.split()) for text in ref]) + for ref in references + ] + ) + else: + ref_word_lengths = np.array([len(ref.split()) for ref in references]) + + # Avoid division by zero + ref_word_lengths_safe = np.where(ref_word_lengths > 0, ref_word_lengths, 1) + relative_lengths = gen_word_lengths / ref_word_lengths_safe + + if self.config.aggregate: + return {"word_length": float(np.mean(relative_lengths))} + else: + return {"word_length": relative_lengths} \ No newline at end of file diff --git a/src/atgen/metrics/compute_metrics.py b/src/atgen/metrics/compute_metrics.py index 2864372..e28a41f 100644 --- a/src/atgen/metrics/compute_metrics.py +++ b/src/atgen/metrics/compute_metrics.py @@ -1,305 +1,96 @@ -import string -from time import time import logging -from typing import Literal, Optional +from time import time +from typing import List, Dict, Literal, Optional + from omegaconf import DictConfig -import re -import numpy as np -from evaluate import load -from tqdm import tqdm +# The factory is now the single entry point 
to get a metric runner. +# We'll assume it's located in atgen/metrics/factory.py +from atgen.metrics.factory import MetricFactory -from .metrics import ( - pair_bleu, - calculate_bart_score, - calculate_alignscore, - calculate_deepeval_metrics, - is_bart_score_available, - is_alignscore_available, -) -from .deepeval_supported_models_and_metrics import API_MODELS, DEEPEVAL_METRICS +log = logging.getLogger(__name__) +# This mapping replaces the large if/elif/else block. It's declarative and easy to modify. +TASK_TO_DEFAULT_METRICS = { + "summarization": ["exact_match", "sacrebleu", "bleu", "rouge", "word_length"], + "open-qa": ["exact_match"], + "multi-choice-qa": ["exact_match"], + "translation": ["exact_match", "sacrebleu", "bleu", "word_length"], + "math": ["exact_match_math"], +} -log = logging.getLogger() +# Define which metrics require the 'original_texts' (sources) input. +# This avoids passing it to metrics that don't need it. +METRICS_REQUIRING_SOURCE = {"bartscore", "alignscore", "deepeval"} def compute_metrics( - generated_texts, - reference_texts, - original_texts, + generated_texts: List[str], + reference_texts: Optional[List[str]], + original_texts: Optional[List[str]], task: Literal["summarization", "open-qa", "multi-choice-qa", "translation", "math"], config: DictConfig, - cache_dir: str = "cache", -) -> dict[str, float]: - """ - Compute various metrics for generated texts. - - Args: - generated_texts: List of generated texts to evaluate - reference_texts: List of reference texts (ground truth) or list of lists of reference texts - original_texts: List of source texts - task: Task type (summarization, open-qa, multi-choice-qa, translation) - config: Configuration for evaluation - - additional_metrics: List of additional metrics to use. 
Options include: - - "bartscore": BARTScore metrics - - "alignscore": AlignScore metrics - - DeepEval metrics (requires API key): - - "deepeval_answer_relevance": Evaluates how well the output answers the input - - "deepeval_faithfulness": Evaluates factual consistency with the input - - "deepeval_summarization": Evaluates summarization quality - - "deepeval_prompt_alignment": Evaluates alignment with the expected output - - provider: API key for the provider - - api_key: Model identifier to use - - model: Provider name (openai, anthropic, openrouter, or custom) - - base_url: API base URL (if None, uses default for the provider) - - deepeval_threshold: Threshold for DeepEval metrics (default: 0.5) - - deepeval_include_reason: Include reason for evaluation score (default: False) - - deepeval_strict_mode: Enforce binary metric score (default: False) - - deepeval_async_mode: Enable concurrent execution (default: True) - - deepeval_verbose_mode: Print intermediate steps (default: False) - - deepeval_truths_extraction_limit: Maximum number of factual truths to extract (default: None) - Returns: - Dictionary with metric scores - - Note: - The OpenRouterLLM class is also available for direct use with DeepEval metrics: - - ```python - from atgen.metrics import OpenRouterLLM - from deepeval.metrics import AnswerRelevanceMetric - - llm = OpenRouterLLM( - api_key="your_openrouter_api_key", - model="openai/gpt-4o-2024-11-20" - ) - - metric = AnswerRelevanceMetric(model=llm) - ``` - """ - if task == "multi-choice-qa": - metrics_to_calculate = ["exact_match"] + list(config.additional_metrics) - elif task == "open-qa": - metrics_to_calculate = ["exact_match"] + list(config.additional_metrics) - elif task == "summarization": - metrics_to_calculate = ["exact_match", "sacrebleu", "bleu", "rouge", "word_length"] + list(config.additional_metrics) - elif task == "translation": - metrics_to_calculate = ["exact_match", "sacrebleu", "bleu", "word_length"] + list(config.additional_metrics) - elif task == "math": - metrics_to_calculate = ["exact_match_math"] + list(config.additional_metrics) - else: - raise NotImplementedError(f"Task {task} not implemented") - - if "sacrebleu" in metrics_to_calculate: - sacrebleu = load("sacrebleu", cache_dir=cache_dir) - if "rouge" in metrics_to_calculate: - rouge = load("rouge", cache_dir=cache_dir) - - result = {} - if "word_length" in metrics_to_calculate: - result["word_length_gen"] = np.array( - [len(text.split()) for text in generated_texts] - ) - + cache_dir: Optional[str] = None, +) -> Dict[str, float]: + + if task not in TASK_TO_DEFAULT_METRICS: + raise NotImplementedError(f"Task '{task}' is not implemented in TASK_TO_DEFAULT_METRICS.") + + # 1. Determine the full list of metrics to run + base_metrics = TASK_TO_DEFAULT_METRICS.get(task, []) + additional_metrics = list(config.get("additional_metrics", [])) + # Use a set to handle duplicates, then sort for predictable execution order + metrics_to_calculate = sorted(list(set(base_metrics + additional_metrics))) + + if not metrics_to_calculate: + log.warning("No metrics specified for calculation. Returning empty results.") + return {} + + # 2. 
Instantiate the factory that will build our metric runners + metric_factory = MetricFactory(config, cache_dir=cache_dir) + + final_results = {} time_dict = {} - # Metrics that use both the generated texts and the original texts and - # those that do not require reference texts - if "bartscore" in metrics_to_calculate and is_bart_score_available: - log.info("Calculating BARTScore scores...") - start_time = time() - result.update( - calculate_bart_score( - preds=generated_texts, - texts=original_texts, - refs=reference_texts, - batch_size=4, - cache_dir=cache_dir, - ) - ) - time_dict["time_bartscore"] = time() - start_time - # Metrics that use both the generated texts and the reference texts - if reference_texts is not None: - # Exact match - if "exact_match" in metrics_to_calculate: - if isinstance(reference_texts[0], list): - result["exact_match"] = np.array( - [ - any(_preprocess_text(pred) == _preprocess_text(one_ref) for one_ref in ref) - for pred, ref in zip(generated_texts, reference_texts) - ] - ) - else: - result["exact_match"] = np.array( - [_preprocess_text(pred) == _preprocess_text(ref) for pred, ref in zip(generated_texts, reference_texts)] - ) - if "exact_match_math" in metrics_to_calculate: - # result["exact_match_math"] = np.array( - # [ - # pred.split("Answer: ")[-1].lower() == ref.lower() - # for pred, ref in zip(generated_texts, reference_texts) - # ] - # ) - result["exact_match_math"] = np.array( - [ - pred.split("#### ")[-1].lower() == ref.split("#### ")[-1].lower() - for pred, ref in zip(generated_texts, reference_texts) - ] - ) - if "bleu" in metrics_to_calculate: - # BLEU - start_time = time() - result["bleu"] = np.array( - [ - pair_bleu(references=ref, prediction=pred) - for pred, ref in tqdm(zip(generated_texts, reference_texts)) - ] - ) - time_dict["time_bleu"] = time() - start_time - if "rouge" in metrics_to_calculate: - # ROUGE - start_time = time() - result.update( - rouge.compute( - predictions=generated_texts, - references=reference_texts, - use_stemmer=True, - ) - ) - time_dict["time_rouge"] = time() - start_time - if "sacrebleu" in metrics_to_calculate: - # Sacrebleu - start_time = time() - if not isinstance(reference_texts[0], list): - sacrebleu_references = [[ref] for ref in reference_texts] - sacrebleu_result = sacrebleu.compute( - predictions=generated_texts, references=sacrebleu_references - ) - result["sacrebleu"] = sacrebleu_result.pop("score") - else: - sacrebleu_scores = [] - for pred, ref in zip(generated_texts, reference_texts): - sacrebleu_result = sacrebleu.compute( - predictions=[pred], references=[ref] - ) - sacrebleu_scores.append(sacrebleu_result.pop("score")) - result["sacrebleu"] = sacrebleu_scores - - time_dict["time_sacrebleu"] = time() - start_time - if "word_length" in metrics_to_calculate: - # Lengths - if isinstance(reference_texts[0], list): - ref_word_lengths = np.array( - [ - np.mean([len(text.split()) for text in ref]) - for ref in reference_texts - ] - ) - else: - ref_word_lengths = np.array([len(ref.split()) for ref in reference_texts]) - # Avoid division by zero - ref_word_lengths_safe = np.where(ref_word_lengths > 0, ref_word_lengths, 1) - result["word_length_rel"] = result["word_length_gen"] / ref_word_lengths_safe + log.info(f"Starting evaluation for task '{task}' with metrics: {', '.join(metrics_to_calculate)}") - # AlignScore - if "alignscore" in metrics_to_calculate and is_alignscore_available: - log.info("Calculating AlignScore scores...") + # 3. 
Loop through metrics, delegate calculation, and collect results + for metric_name in metrics_to_calculate: + try: + log.info(f"--> Calculating metric: {metric_name}") start_time = time() - alignscores = calculate_alignscore( - generated_texts, reference_texts, original_texts - ) - if alignscores is not None: - result.update(alignscores) - time_dict["time_alignscore"] = time() - start_time - - # DeepEval metrics - deepeval_metrics_to_calculate = [ - metric for metric in DEEPEVAL_METRICS if metric in config.additional_metrics - ] - - if deepeval_metrics_to_calculate: - if isinstance(reference_texts[0], list): - log.error("DeepEval does not support multiple references. Skipping...") - else: - # Validate OpenRouter model - only warn if not in predefined list, but still use it - provider = config["provider"] - if config.model not in API_MODELS.get(provider): - log.warning( - f"Using custom model: {config.model}. " - + ( - f"Available models: {API_MODELS[provider]}" - if provider in API_MODELS - else "" - ) - ) - log.info( - f"Calculating DeepEval metrics: {', '.join(deepeval_metrics_to_calculate)}..." - ) - start_time = time() - result.update( - calculate_deepeval_metrics( - predictions=generated_texts, - references=reference_texts, - original_texts=original_texts, - metrics_to_calculate=deepeval_metrics_to_calculate, - base_url=config.base_url, - api_key=config.api_key, - model=config.model, - threshold=config.deepeval_threshold, - include_reason=config.deepeval_include_reason, - strict_mode=config.deepeval_strict_mode, - async_mode=config.deepeval_async_mode, - verbose_mode=config.deepeval_verbose_mode, - truths_extraction_limit=config.deepeval_truths_extraction_limit, - ) - ) - time_dict["time_deepeval"] = time() - start_time - - for key, value in result.items(): - if isinstance(value, np.ndarray): - result[key] = float(np.mean(value)) - elif isinstance(value, (int, float)): - # Ensure numerical values are converted to float - result[key] = float(value) - # Make sure non-numerical values that aren't reasons are preserved - elif not key.endswith("_reasons") and not "_reason" in key.lower(): - continue - - # Filter out reason fields from the final aggregated results - more robust filtering - result = { - key: value - for key, value in sorted(result.items()) - if not key.endswith("_reasons") - and not "_reason" in key.lower() - and isinstance(value, (int, float)) # Ensure we only keep numerical metrics - } - - return result -def _preprocess_text(text: str, do_lowercase: bool = True, do_remove_punctuation: bool = True, do_remove_extra_spaces: bool = True, do_remove_stopwords: bool = False, stopwords: Optional[list[str]] = None) -> str: - # Convert to lowercase - if do_lowercase: - text = text.lower() - - # Remove punctuation - if do_remove_punctuation: - # Keep hyphens within words, remove other punctuation - text = re.sub(r'(? 
float: + results = {} + + metrics = [] + metric_name_mapping = {} # Maps metric class name to the deepeval metric name + + # Dictionary to store test cases for each metric + metric_test_cases = {} + + if "deepeval_answer_relevance" in self.config.metrics_to_calculate: + metric = AnswerRelevancyMetric( + threshold=self.config.threshold, + model=self.llm, + include_reason=self.config.include_reason, + strict_mode=self.config.strict_mode, + async_mode=self.config.async_mode, + ) + metrics.append(metric) + metric_name_mapping[metric.__class__.__name__] = "deepeval_answer_relevance" + + # Create specific test cases for AnswerRelevancy metric + answer_relevance_test_cases = [] + for i, (pred, src) in enumerate(zip(predictions, original_texts)): + test_case = LLMTestCase( + input=src, + actual_output=pred, + ) + answer_relevance_test_cases.append(test_case) + metric_test_cases[metric.__class__.__name__] = answer_relevance_test_cases + + if "deepeval_faithfulness" in self.config.metrics_to_calculate: + metric = FaithfulnessMetric( + threshold=self.config.threshold, + model=self.llm, + include_reason=self.config.include_reason, + strict_mode=self.config.strict_mode, + async_mode=self.config.async_mode, + truths_extraction_limit=self.config.truths_extraction_limit, + ) + metrics.append(metric) + metric_name_mapping[metric.__class__.__name__] = "deepeval_faithfulness" + + # Create specific test cases for Faithfulness metric + faithfulness_test_cases = [] + for i, (pred, src) in enumerate(zip(predictions, original_texts)): + test_case = LLMTestCase( + input=src, + actual_output=pred, + retrieval_context=[src], + ) + faithfulness_test_cases.append(test_case) + metric_test_cases[metric.__class__.__name__] = faithfulness_test_cases + + if "deepeval_summarization" in self.config.metrics_to_calculate: + metric = SummarizationMetric( + threshold=self.config.threshold, + model=self.llm, + include_reason=self.config.include_reason, + strict_mode=self.config.strict_mode, + async_mode=self.config.async_mode, + ) + metrics.append(metric) + metric_name_mapping[metric.__class__.__name__] = "deepeval_summarization" + + # Create specific test cases for Summarization metric + summarization_test_cases = [] + for i, (pred, src) in enumerate(zip(predictions, original_texts)): + test_case = LLMTestCase( + input=src, + actual_output=pred, + ) + summarization_test_cases.append(test_case) + metric_test_cases[metric.__class__.__name__] = summarization_test_cases + + if "deepeval_prompt_alignment" in self.config.metrics_to_calculate: + metric = PromptAlignmentMetric( + threshold=self.config.threshold, + model=self.llm, + prompt_instructions=["Do what you are told to do in the prompt"], + include_reason=self.config.include_reason, + strict_mode=self.config.strict_mode, + async_mode=self.config.async_mode, + ) + metrics.append(metric) + metric_name_mapping[metric.__class__.__name__] = "deepeval_prompt_alignment" + + # Create specific test cases for PromptAlignment metric + prompt_alignment_test_cases = [] + for i, (pred, ref, src) in enumerate( + zip(predictions, references, original_texts) + ): + test_case = LLMTestCase( + input=src, + actual_output=pred, + expected_output=ref, + ) + prompt_alignment_test_cases.append(test_case) + metric_test_cases[metric.__class__.__name__] = prompt_alignment_test_cases + + for metric in metrics: + metric_class_name = metric.__class__.__name__ + test_cases = metric_test_cases.get(metric_class_name, []) + + if test_cases: + original_stdout = sys.stdout + if not self.config.verbose_mode: + 
sys.stdout = open(os.devnull, "w") + + try: + # Run evaluation for this specific metric + evaluation_results = evaluate( + test_cases=test_cases, + metrics=[metric], + run_async=self.config.async_mode, + ) + + deepeval_metric_name = metric_name_mapping.get(metric_class_name) + scores = [] + reasons = [] + + # Process results for this metric + for result in evaluation_results.test_results: + scores.append(1 if result.success else 0) + + # Calculate average score + if scores: + results[deepeval_metric_name] = np.mean(scores) + + finally: + # Restore stdout + if not self.config.verbose_mode: + sys.stdout.close() + sys.stdout = original_stdout + print("================================================") + print("Results:") + print(results) + print("================================================") + + return {"deepeval": results} + diff --git a/src/atgen/metrics/deep_eval/evaluationllm.py b/src/atgen/metrics/deep_eval/evaluationllm.py new file mode 100644 index 0000000..88d0a41 --- /dev/null +++ b/src/atgen/metrics/deep_eval/evaluationllm.py @@ -0,0 +1,99 @@ +from deepeval import evaluate +from deepeval.models.base_model import DeepEvalBaseLLM +from deepeval.test_case import LLMTestCase +from deepeval.metrics import ( + AnswerRelevancyMetric, + FaithfulnessMetric, + SummarizationMetric, + PromptAlignmentMetric, +) +from openai import OpenAI, AsyncOpenAI + + +class EvaluationLLM(DeepEvalBaseLLM): + """ + Custom Evaluation LLM implementation for DeepEval. + + This class implements the DeepEvalBaseLLM interface to allow using + custom models with DeepEval metrics. + """ + + def __init__( + self, + api_key=None, + model="openai/gpt-4o-2024-11-20", + base_url="https://openrouter.ai/api/v1", + ): + """ + Initialize the Evaluation LLM. + + Args: + api_key: Evaluation API key + model: Model identifier (e.g., "openai/gpt-4o-2024-11-20") + base_url: Evaluation API base URL + """ + self.api_key = api_key + + self.model_name = model + self.base_url = base_url + self.client = None + self.async_client = None + self.OpenAI = OpenAI + self.AsyncOpenAI = AsyncOpenAI + + def load_model(self): + """Load and return the client.""" + if self.client is None: + self.client = self.OpenAI( + base_url=self.base_url, + api_key=self.api_key, + ) + return self.client + + def load_async_model(self): + """Load and return the async client.""" + if self.async_client is None: + self.async_client = self.AsyncOpenAI( + base_url=self.base_url, + api_key=self.api_key, + ) + return self.async_client + + def generate(self, prompt: str) -> str: + """ + Generate a response from the evaluation model. + + Args: + prompt: The prompt to send to the model + + Returns: + The model's response as a string + """ + client = self.load_model() + response = client.chat.completions.create( + model=self.model_name, + messages=[{"role": "user", "content": prompt}], + ) + return response.choices[0].message.content + + async def a_generate(self, prompt: str) -> str: + """ + Asynchronously generate a response from the evaluation model. 
+ + Args: + prompt: The prompt to send to the model + + Returns: + The model's response as a string + """ + # Use the async client for async operations + client = self.load_async_model() + response = await client.chat.completions.create( + model=self.model_name, + messages=[{"role": "user", "content": prompt}], + ) + return response.choices[0].message.content + + def get_model_name(self): + """Return the name of the model.""" + return f"EvaluationLLM: {self.model_name}" diff --git a/src/atgen/metrics/deepeval_supported_models_and_metrics.py b/src/atgen/metrics/deepeval_supported_models_and_metrics.py deleted file mode 100644 index 40b3a1b..0000000 --- a/src/atgen/metrics/deepeval_supported_models_and_metrics.py +++ /dev/null @@ -1,64 +0,0 @@ -from typing import Literal - -# Available DeepEval metrics -DEEPEVAL_METRICS = [ - "deepeval_answer_relevance", - "deepeval_faithfulness", - "deepeval_summarization", - "deepeval_prompt_alignment", -] - -# Available API models by provider -# TODO: somehow update periodically -API_MODELS = { - "openai": ["gpt-4o", "gpt-4o-mini", "gpt-4.5", "o1-preview"], - "anthropic": [ - "claude-3.7-sonnet:thinking", - "claude-3.7-sonnet", - "claude-3-5-sonnet", - "claude-3-opus", - "claude-3-sonnet", - "claude-3-haiku", - ], - "openrouter": [ - "openai/gpt-4o-2024-11-20", - "google/gemini-2.0-flash-001", - "anthropic/claude-3.5-sonnet", - "openai/gpt-4o-mini", - "mistralai/mistral-nemo", - "meta-llama/llama-3.1-70b-instruct", - ], -} - - -def get_available_models( - provider: Literal["openai", "anthropic", "openrouter"] = "openai", -): - """ - Get a list of all available models for a specific provider. - - Args: - provider: The provider to get models for Literal["openai", "anthropic", "openrouter"] - - Returns: - List of available models for the specified provider - """ - provider = provider.lower() - if provider in API_MODELS: - return API_MODELS[provider] - return [] - - -def get_available_metrics(): - """ - Get a list of all available metrics. - - Returns: - List of available metrics - """ - basic_metrics = [ - "bartscore", - "alignscore", - ] - - return basic_metrics + DEEPEVAL_METRICS diff --git a/src/atgen/metrics/factory.py b/src/atgen/metrics/factory.py new file mode 100644 index 0000000..d68a7a9 --- /dev/null +++ b/src/atgen/metrics/factory.py @@ -0,0 +1,40 @@ +import logging +from omegaconf import DictConfig, OmegaConf +from .registry import METRICS_REGISTRY +from .base_metric import BaseMetric +from typing import Optional +log = logging.getLogger(__name__) + +class MetricFactory: + def __init__(self, config: DictConfig, cache_dir: Optional[str] = None): + """ + Initializes the factory with the global configuration. + + Args: + config: The main OmegaConf DictConfig object. + """ + self.config = config + self.cache_dir = cache_dir + def get_metric(self, metric_name: str) -> BaseMetric: + """ + Instantiates and returns a metric runner based on its name. + + It automatically creates the specific config for the metric + by extracting relevant parameters from the main config object. + + Args: + metric_name: The name of the metric to instantiate. + + Returns: + An instance of a class derived from BaseMetric. 
+ """ + if metric_name not in METRICS_REGISTRY: + if "deepeval" in metric_name: + metric_name = "deepeval" + else: + raise ValueError(f"Metric '{metric_name}' not found in registry.") + + MetricClass, ConfigClass = METRICS_REGISTRY[metric_name] + metric_config = ConfigClass() + + return MetricClass(config=metric_config) diff --git a/src/atgen/metrics/metrics.py b/src/atgen/metrics/metrics.py deleted file mode 100644 index 52f9504..0000000 --- a/src/atgen/metrics/metrics.py +++ /dev/null @@ -1,639 +0,0 @@ -from math import ceil -import os -import sys -from openai import OpenAI, AsyncOpenAI -from typing import Union -from urllib.request import urlretrieve -import logging -from pathlib import Path -import nltk -import numpy as np -import torch -import torch.nn.functional as F -from datasets import Dataset -from nltk import ngrams -from nltk.stem import porter -from nltk.tokenize import word_tokenize, sent_tokenize -from nltk.translate.bleu_score import corpus_bleu -from rouge_score import tokenize -from torch.utils.data import DataLoader -from transformers import ( - AutoModelForSequenceClassification, - AutoTokenizer, - AutoModel, - DataCollatorWithPadding, -) - -log = logging.getLogger(__name__) - -try: - from .bart_score import BARTScorer - - is_bart_score_available = True -except ImportError: - log.warning( - "BARTScorer not found, please install it (see `install.sh`). Skipping the BARTScore metric." - ) - is_bart_score_available = False - -try: - from alignscore import AlignScore - - is_alignscore_available = True -except ImportError: - log.warning( - "AlignScore not found, please install it (see `install.sh`). Skipping the AlignScore metric." - ) - is_alignscore_available = False - -from deepeval import evaluate -from deepeval.models.base_model import DeepEvalBaseLLM -from deepeval.test_case import LLMTestCase -from deepeval.metrics import ( - AnswerRelevancyMetric, - FaithfulnessMetric, - SummarizationMetric, - PromptAlignmentMetric, -) - - -ALIGNSCORE_CHECKPOINT_PATH = os.getenv( - "ALIGNSCORE_CHECKPOINT_PATH", - # Going up 3 levels from metrics.py: src/atgen/metrics -> repository root - os.path.join( - Path(__file__).parents[3], - "cache/AlignScore-base.ckpt", - ), -) - - -def decode(eval_preds, tokenizer): - predictions, labels, *inputs = eval_preds - predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id) - decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) - # Replace -100 in the labels as we can't decode them. - labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - - decoded_preds = [pred.strip() for pred in decoded_preds] - decoded_labels = [label.strip() for label in decoded_labels] - - if len(inputs) > 0: - input_ids = inputs[0] - input_ids = np.where(input_ids != -100, input_ids, tokenizer.pad_token_id) - decoded_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True) - decoded_texts = [text.strip() for text in decoded_texts] - return decoded_preds, decoded_labels, decoded_texts - - return decoded_preds, decoded_labels - - -def smoothing_function(p_n, references, hypothesis, hyp_len): - """ - Smooth-BLEU (BLEUS) as proposed in the paper: - Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic - evaluation metrics for machine translation. COLING 2004. 
- """ - smoothed_p_n = [] - for i, p_i in enumerate(p_n, start=1): - # Smoothing is not applied for unigrams - if i > 1: - # If hypothesis length is lower than the current order, its value equals (0 + 1) / (0 + 1) = 0 - if hyp_len < i: - assert p_i.denominator == 1 - smoothed_p_n.append(1) - # Otherwise apply smoothing - else: - smoothed_p_i = (p_i.numerator + 1) / (p_i.denominator + 1) - smoothed_p_n.append(smoothed_p_i) - else: - smoothed_p_n.append(p_i) - return smoothed_p_n - - -def pair_bleu(references: list[str] | str, prediction: str): - """ - Compute the bleu score between two given texts. - A smoothing function is used to avoid zero scores when - there are no common higher order n-grams between the - texts. - """ - if isinstance(references, str): - tok_ref = [[word_tokenize(references)]] - else: - tok_ref = [[word_tokenize(ref) for ref in references]] - tok_pred = [word_tokenize(prediction)] - try: - return corpus_bleu(tok_ref, tok_pred, smoothing_function=smoothing_function) - except (KeyError, ZeroDivisionError): - return 0.0 - - -def calculate_bart_score( - preds, - refs=None, - texts=None, - scorer=None, - batch_size=4, - aggregate=True, - cache_dir: str = "cache", -): - if not is_bart_score_available: - return None - if scorer is None: - scorer = BARTScorer(cache_dir=cache_dir) - scores = {} - if texts is not None: - scores["BARTScore-sh"] = np.array( - scorer.score(texts, preds, batch_size=batch_size) - ) - if refs is not None: - # scores["BARTScore-rh"] = np.array(scorer.score(refs, preds, batch_size=batch_size)) - if isinstance(refs[0], list): - scores_hr = [] - for ref, pred in zip(refs, preds): - inst_pred = [pred for _ in range(len(ref))] - # Take a maximum within the observation similar to ROUGE - inst_score_hr = max(scorer.score(inst_pred, ref, batch_size=batch_size)) - scores_hr.append(inst_score_hr) - scores["BARTScore-hr"] = np.array(scores_hr) - else: - scores["BARTScore-hr"] = np.array( - scorer.score(preds, refs, batch_size=batch_size) - ) - # scores["BARTScore-fa"] = (scores["BARTScore-rh"] + scores["BARTScore-hr"]) / 2 - - if aggregate: - scores = {key: np.mean(value) for key, value in scores.items()} - return scores - - -def calculate_abstractiveness_scores( - predictions, texts, references=None, aggregate: bool = True -): - stemmer = porter.PorterStemmer() - tokenized_preds = [tokenize.tokenize(x, stemmer) for x in predictions] - tokenized_texts = [tokenize.tokenize(x, stemmer) for x in texts] - if references is not None: - tokenized_refs = [tokenize.tokenize(x, stemmer) for x in references] - else: - tokenized_refs = tokenized_preds - - result = {} - for use_modified in [False, True]: - for n in range(1, 5): - pred_ngram_overlaps = [] - label_ngram_overlaps = [] - for pred, label, text in zip( - tokenized_preds, tokenized_refs, tokenized_texts - ): - pred_pair_ngram_overlap = calculate_ngram_overlap( - pred, text, n, use_modified - ) - pred_ngram_overlaps.append(pred_pair_ngram_overlap) - if references is not None: - label_pair_ngram_overlap = calculate_ngram_overlap( - label, text, n, use_modified - ) - label_ngram_overlaps.append(label_pair_ngram_overlap) - key = f"ngram_overlap_{n}" if use_modified else f"novel_ngrams_{n}" - - pred_ngram_overlaps = np.array(pred_ngram_overlaps) - cond_abs = ~np.isnan(pred_ngram_overlaps) - result[key + "_abs"] = pred_ngram_overlaps[cond_abs] - - if references is not None: - label_ngram_overlaps = np.array(label_ngram_overlaps) - cond_rel = cond_abs & ~np.isnan(label_ngram_overlaps) - result[key + "_rel"] = ( - 
pred_ngram_overlaps[cond_rel] / label_ngram_overlaps[cond_rel] - ) - - if aggregate: - for key, value in result.items(): - result[key] = np.mean(value) - - return result - - -def calculate_ngram_overlap(summary, text, n=1, use_modified=True): - summary_ngrams = list(ngrams(summary, n)) - text_ngrams = list(ngrams(text, n)) - - if len(summary_ngrams) > 0: - ngrams_intersection = set(summary_ngrams).intersection(set(text_ngrams)) - if use_modified: - word_is_part_of_ngram_copied = [ - any((x in ngram for ngram in ngrams_intersection)) for x in summary - ] - return 1 - sum(word_is_part_of_ngram_copied) / len( - word_is_part_of_ngram_copied - ) - else: - return sum([x not in ngrams_intersection for x in summary_ngrams]) / len( - summary_ngrams - ) - return np.nan - - -class SentBert: - def __init__( - self, - checkpoint: str = "sentence-transformers/all-mpnet-base-v2", - device: str = "cuda", - cache_dir: str = "cache", - ): - self.tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir) - self.model = AutoModel.from_pretrained(checkpoint, cache_dir=cache_dir).to( - device - ) - self.device = device - - def __call__( - self, source_texts: list[str], ref_texts: list[str], batch_size: int = 32 - ) -> np.ndarray: - assert len(source_texts) == len(ref_texts) - # Make batch_size an even number - if batch_size % 2 == 0: - batch_size -= 1 - half_batch_size = batch_size // 2 - n_texts = len(source_texts) - scores = np.empty(n_texts, dtype=np.float32) - start = 0 - end = 0 - - while end < n_texts: - end += half_batch_size - batch_idx = slice(start, end) - # Tokenize sentences - encoded_input = self.tokenizer( - source_texts[batch_idx] + ref_texts[batch_idx], - padding=True, - truncation=True, - return_tensors="pt", - ) - encoded_input = { - key: value.to(self.device) for key, value in encoded_input.items() - } - # Calculate the probability of belonging to the positive class - model_output = self.model(**encoded_input) - # Perform pooling - sent_embs = self.mean_pooling(model_output, encoded_input["attention_mask"]) - # Normalize embeddings - sent_embs = F.normalize(sent_embs, p=2, dim=1) - n_source_embs = len(sent_embs) // 2 - scores[batch_idx] = ( - (sent_embs[:n_source_embs] * sent_embs[n_source_embs:]) - .sum(-1) - .cpu() - .detach() - .numpy() - ) - start = end - - return scores - - @staticmethod - def mean_pooling(model_output, attention_mask): - """ - Mean Pooling - Take attention mask into account for correct averaging - """ - token_embeddings = model_output[ - 0 - ] # First element of model_output contains all token embeddings - input_mask_expanded = ( - attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - ) - return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( - input_mask_expanded.sum(1), min=1e-9 - ) - - -def calculate_alignscore( - predictions: list[str], - references: Union[list[str], list[list[str]]], - original_texts: list[str], - batch_size: int = 32, - device: str = "cuda", - cache_dir: str = "cache", -): - if not is_alignscore_available: - return None - if isinstance(references[0], list): - log.error("AlignScore does not support multiple references. 
Skipping...") - return None - if not os.path.exists(ALIGNSCORE_CHECKPOINT_PATH): - urlretrieve( - "https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-base.ckpt", - ALIGNSCORE_CHECKPOINT_PATH, - ) - - scorer = AlignScore( - model="roberta-base", - batch_size=batch_size, - device=device, - ckpt_path=ALIGNSCORE_CHECKPOINT_PATH, - evaluation_mode="nli_sp", - ) - # Fix: alignscore outputs an error if a text is empty, so we need to add some content to such texts - original_texts = [text if text else " " for text in original_texts] - predictions = [text if text else " " for text in predictions] - references = [text if text else " " for text in references] - - scores_ref = scorer.score(contexts=original_texts, claims=predictions) - if isinstance(references[0], list): - scores_baseline = [] - for orig_text, refs in zip(original_texts, references): - inst_baseline_scores = scorer.score( - contexts=[orig_text] * len(refs), claims=refs - ) - scores_baseline.append(max(inst_baseline_scores)) - else: - scores_baseline = scorer.score(contexts=original_texts, claims=references) - scores_rel = np.array(scores_ref) / np.array(scores_baseline) - return {"alignscore": scores_ref, "alignscore_rel": scores_rel} - - -class EvaluationLLM(DeepEvalBaseLLM): - """ - Custom Evaluation LLM implementation for DeepEval. - - This class implements the DeepEvalBaseLLM interface to allow using - custom models with DeepEval metrics. - """ - - def __init__( - self, - api_key=None, - model="openai/gpt-4o-2024-11-20", - base_url="https://openrouter.ai/api/v1", - ): - """ - Initialize the Evaluation LLM. - - Args: - api_key: Evaluation API key - model: Model identifier (e.g., "openai/gpt-4o-2024-11-20") - base_url: Evaluation API base URL - """ - self.api_key = api_key - - self.model_name = model - self.base_url = base_url - self.client = None - self.async_client = None - self.OpenAI = OpenAI - self.AsyncOpenAI = AsyncOpenAI - - def load_model(self): - """Load and return the client.""" - if self.client is None: - self.client = self.OpenAI( - base_url=self.base_url, - api_key=self.api_key, - ) - return self.client - - def load_async_model(self): - """Load and return the async client.""" - if self.async_client is None: - self.async_client = self.AsyncOpenAI( - base_url=self.base_url, - api_key=self.api_key, - ) - return self.async_client - - def generate(self, prompt: str) -> str: - """ - Generate a response from the evaluation model. - - Args: - prompt: The prompt to send to the model - - Returns: - The model's response as a string - """ - client = self.load_model() - response = client.chat.completions.create( - model=self.model_name, - messages=[{"role": "user", "content": prompt}], - ) - return response.choices[0].message.content - - async def a_generate(self, prompt: str) -> str: - """ - Asynchronously generate a response from the evaluation model. 
- - Args: - prompt: The prompt to send to the model - - Returns: - The model's response as a string - """ - # Use the async client for async operations - client = self.load_async_model() - response = await client.chat.completions.create( - model=self.model_name, - messages=[{"role": "user", "content": prompt}], - ) - return response.choices[0].message.content - - def get_model_name(self): - """Return the name of the model.""" - return f"EvaluationLLM: {self.model_name}" - - -def calculate_deepeval_metrics( - predictions, - references, - original_texts, - metrics_to_calculate=None, - base_url: str = "https://openrouter.ai/api/v1", - api_key: str = None, - model="openai/gpt-4o-2024-11-20", - threshold=0.5, - include_reason=False, - strict_mode=False, - async_mode=True, - verbose_mode=False, - truths_extraction_limit=None, -): - """ - Calculate DeepEval metrics using EvaluationLLM. - - Args: - predictions: list of generated texts - references: list of reference texts - original_texts: list of source texts - metrics_to_calculate: list of metrics to calculate. Options: - ["deepeval_answer_relevance", "deepeval_faithfulness", "deepeval_summarization", "deepeval_prompt_alignment"] - api_key: Evaluation API key - base_url: Evaluation API base URL - model: Evaluation model to use - threshold: Threshold for metrics (default: 0.5) - include_reason: Include reason for evaluation score (default: False) - strict_mode: Enforce binary metric score (1 for perfection, 0 otherwise) (default: False) - async_mode: Enable concurrent execution (default: True) - verbose_mode: Print intermediate steps (default: False) - truths_extraction_limit: Maximum number of factual truths to extract (default: None) - - Returns: - dictionary with metric scores - """ - - if not metrics_to_calculate: - metrics_to_calculate = [ - "deepeval_answer_relevance", - "deepeval_faithfulness", - "deepeval_summarization", - "deepeval_prompt_alignment", - ] - - # Create EvaluationLLM instance - llm = EvaluationLLM( - base_url=base_url, - api_key=api_key, - model=model, - ) - - results = {} - - # Create metrics based on selected options - metrics = [] - metric_name_mapping = {} # Maps metric class name to the deepeval metric name - - # Dictionary to store test cases for each metric - metric_test_cases = {} - - if "deepeval_answer_relevance" in metrics_to_calculate: - metric = AnswerRelevancyMetric( - threshold=threshold, - model=llm, - include_reason=include_reason, - strict_mode=strict_mode, - async_mode=async_mode, - ) - metrics.append(metric) - metric_name_mapping[metric.__class__.__name__] = "deepeval_answer_relevance" - - # Create specific test cases for AnswerRelevancy metric - answer_relevance_test_cases = [] - for i, (pred, src) in enumerate(zip(predictions, original_texts)): - test_case = LLMTestCase( - input=src, - actual_output=pred, - ) - answer_relevance_test_cases.append(test_case) - metric_test_cases[metric.__class__.__name__] = answer_relevance_test_cases - - if "deepeval_faithfulness" in metrics_to_calculate: - metric = FaithfulnessMetric( - threshold=threshold, - model=llm, - include_reason=include_reason, - strict_mode=strict_mode, - async_mode=async_mode, - truths_extraction_limit=truths_extraction_limit, - ) - metrics.append(metric) - metric_name_mapping[metric.__class__.__name__] = "deepeval_faithfulness" - - # Create specific test cases for Faithfulness metric - faithfulness_test_cases = [] - for i, (pred, src) in enumerate(zip(predictions, original_texts)): - test_case = LLMTestCase( - input=src, - actual_output=pred, 
- retrieval_context=[src], - ) - faithfulness_test_cases.append(test_case) - metric_test_cases[metric.__class__.__name__] = faithfulness_test_cases - - if "deepeval_summarization" in metrics_to_calculate: - metric = SummarizationMetric( - threshold=threshold, - model=llm, - include_reason=include_reason, - strict_mode=strict_mode, - async_mode=async_mode, - ) - metrics.append(metric) - metric_name_mapping[metric.__class__.__name__] = "deepeval_summarization" - - # Create specific test cases for Summarization metric - summarization_test_cases = [] - for i, (pred, src) in enumerate(zip(predictions, original_texts)): - test_case = LLMTestCase( - input=src, - actual_output=pred, - ) - summarization_test_cases.append(test_case) - metric_test_cases[metric.__class__.__name__] = summarization_test_cases - - if "deepeval_prompt_alignment" in metrics_to_calculate: - metric = PromptAlignmentMetric( - threshold=threshold, - model=llm, - prompt_instructions=["Do what you are told to do in the prompt"], - include_reason=include_reason, - strict_mode=strict_mode, - async_mode=async_mode, - ) - metrics.append(metric) - metric_name_mapping[metric.__class__.__name__] = "deepeval_prompt_alignment" - - # Create specific test cases for PromptAlignment metric - prompt_alignment_test_cases = [] - for i, (pred, ref, src) in enumerate( - zip(predictions, references, original_texts) - ): - test_case = LLMTestCase( - input=src, - actual_output=pred, - expected_output=ref, - ) - prompt_alignment_test_cases.append(test_case) - metric_test_cases[metric.__class__.__name__] = prompt_alignment_test_cases - - # Run evaluation for each metric separately - for metric in metrics: - metric_class_name = metric.__class__.__name__ - test_cases = metric_test_cases.get(metric_class_name, []) - - if test_cases: - # Disable printing to console during evaluation if not verbose - original_stdout = sys.stdout - if not verbose_mode: - sys.stdout = open(os.devnull, "w") - - try: - # Run evaluation for this specific metric - evaluation_results = evaluate( - test_cases=test_cases, - metrics=[metric], - run_async=async_mode, - ) - - deepeval_metric_name = metric_name_mapping.get(metric_class_name) - scores = [] - reasons = [] - - # Process results for this metric - for result in evaluation_results.test_results: - scores.append(1 if result.success else 0) - - # Calculate average score - if scores: - results[deepeval_metric_name] = np.mean(scores) - - finally: - # Restore stdout - if not verbose_mode: - sys.stdout.close() - sys.stdout = original_stdout - print("================================================") - print("Results:") - print(results) - print("================================================") - - return results diff --git a/src/atgen/metrics/model_based/__init__.py b/src/atgen/metrics/model_based/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/atgen/metrics/model_based/alignscore.py b/src/atgen/metrics/model_based/alignscore.py new file mode 100644 index 0000000..353e8b2 --- /dev/null +++ b/src/atgen/metrics/model_based/alignscore.py @@ -0,0 +1,60 @@ +from logging import log +from multiprocessing import reduction +import os +from pathlib import Path +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional +from urllib.request import urlretrieve +import numpy as np +from alignscore import AlignScore as AlignScoreModel + +class AlignScoreConfig(MetricConfig): + batch_size: int = 32 + device: str = "cuda" + cache_dir: str = "cache" + + +class AlignScore(BaseMetric): 
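+    """Model-based factual-consistency metric built on AlignScore.
+
+    The AlignScore-base checkpoint is downloaded on first use unless
+    ALIGNSCORE_CHECKPOINT_PATH already points at a local copy; compute() scores
+    predictions against the source texts and against a reference baseline,
+    which yields the relative "alignscore_rel" value.
+    """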
+ def __init__(self, config: AlignScoreConfig): + super().__init__(config) + self.ALIGNSCORE_CHECKPOINT_PATH = os.getenv( + "ALIGNSCORE_CHECKPOINT_PATH", '../../cache/AlignScore-base.ckpt', + ) + + + if not os.path.exists(self.ALIGNSCORE_CHECKPOINT_PATH): + os.makedirs(os.path.dirname(self.ALIGNSCORE_CHECKPOINT_PATH), exist_ok=True) + urlretrieve( + "https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-base.ckpt", + self.ALIGNSCORE_CHECKPOINT_PATH, + ) + + self.scorer = AlignScoreModel( + model="roberta-base", + batch_size=self.config.batch_size, + device=self.config.device, + ckpt_path=self.ALIGNSCORE_CHECKPOINT_PATH, + evaluation_mode="nli_sp", + ) + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + original_texts = [text if text else " " for text in sources] + predictions = [text if text else " " for text in predictions] + references = [text if text else " " for text in references] + + scores_ref = self.scorer.score(contexts=original_texts, claims=predictions) + if isinstance(references[0], list): + scores_baseline = [] + for orig_text, refs in zip(original_texts, references): + inst_baseline_scores = self.scorer.score( + contexts=[orig_text] * len(refs), claims=refs + ) + scores_baseline.append(max(inst_baseline_scores)) + else: + scores_baseline = self.scorer.score(contexts=original_texts, claims=references) + scores_rel = np.array(scores_ref) / np.array(scores_baseline) + + if self.config.aggregate: + return {"alignscore": float(np.mean(scores_ref)), "alignscore_rel": float(np.mean(scores_rel))} + else: + return {"alignscore": scores_ref, "alignscore_rel": scores_rel} diff --git a/src/atgen/metrics/bart_score.py b/src/atgen/metrics/model_based/bartscore.py similarity index 70% rename from src/atgen/metrics/bart_score.py rename to src/atgen/metrics/model_based/bartscore.py index 6133693..77a4ce1 100644 --- a/src/atgen/metrics/bart_score.py +++ b/src/atgen/metrics/model_based/bartscore.py @@ -1,4 +1,6 @@ -# %% +from math import prod +from atgen.metrics.base_metric import BaseMetric, MetricConfig +from typing import List, Optional import traceback from typing import List from tqdm import tqdm @@ -116,3 +118,45 @@ def test(self, batch_size=3): tgt_list = ["That's stupid.", "What's the problem?", "He is trustworthy."] print(self.score(src_list, tgt_list, batch_size)) + + +class BartScoreConfig(MetricConfig): + device: str = "cuda" + max_length: int = 1024 + checkpoint: str = "facebook/bart-large-cnn" + cache_dir: str = "cache" + batch_size: int = 4 + +class BartScore(BaseMetric): + def __init__(self, config: BartScoreConfig): + super().__init__(config) + self.scorer = BARTScorer( + device=self.config.device, + max_length=self.config.max_length, + checkpoint=self.config.checkpoint, + cache_dir=self.config.cache_dir, + ) + + def compute(self, predictions: List[str], references: List[str], sources: Optional[List[str]] = None, **kwargs) -> float: + scores = {} + if references is not None: + scores["BARTScore-sh"] = np.array( + self.scorer.score(references, predictions, batch_size=self.config.batch_size) + ) + if references is not None: + if isinstance(references[0], list): + scores_hr = [] + for ref, pred in zip(references, predictions): + inst_pred = [pred for _ in range(len(ref))] + # Take a maximum within the observation similar to ROUGE + inst_score_hr = max(self.scorer.score(inst_pred, ref, batch_size=self.config.batch_size)) + scores_hr.append(inst_score_hr) + scores["BARTScore-hr"] = 
np.array(scores_hr) + else: + scores["BARTScore-hr"] = np.array( + self.scorer.score(predictions, references, batch_size=self.config.batch_size) + ) + + if self.config.aggregate: + scores = {key: np.mean(value) for key, value in scores.items()} + return scores diff --git a/src/atgen/metrics/registry.py b/src/atgen/metrics/registry.py new file mode 100644 index 0000000..2735eda --- /dev/null +++ b/src/atgen/metrics/registry.py @@ -0,0 +1,22 @@ +from atgen.metrics.classic_metrics.bleu import Bleu, BleuConfig +from atgen.metrics.classic_metrics.rouge import Rouge, RougeConfig +from atgen.metrics.classic_metrics.sacrebleu import Sacrebleu, SacrebleuConfig +from atgen.metrics.classic_metrics.exact_match import ExactMatch, ExactMatchConfig +from atgen.metrics.classic_metrics.word_length import WordLength, WordLengthConfig +from atgen.metrics.model_based.bartscore import BartScore, BartScoreConfig +from atgen.metrics.model_based.alignscore import AlignScore, AlignScoreConfig +from atgen.metrics.deep_eval.deepeval_metrics import Deepeval, DeepevalConfig + + +METRICS_REGISTRY = { + "bleu": (Bleu, BleuConfig), + "rouge": (Rouge, RougeConfig), + "rouge1": (Rouge, RougeConfig), + "sacrebleu": (Sacrebleu, SacrebleuConfig), + "exact_match": (ExactMatch, ExactMatchConfig), + "word_length": (WordLength, WordLengthConfig), + "bartscore": (BartScore, BartScoreConfig), + "alignscore": (AlignScore, AlignScoreConfig), + "deepeval": (Deepeval, DeepevalConfig), + +} \ No newline at end of file
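For reviewers, a minimal usage sketch of the refactored entry points introduced by this patch (`compute_metrics`, `MetricFactory`, and the registry-backed `BaseMetric.compute` interface). It is illustrative only: the config contents and example texts below are assumptions, not part of the patch.

```python
# Illustrative sketch only: exercises the factory/registry path added in this patch.
# The config keys and example strings are assumptions chosen for the demo.
from omegaconf import OmegaConf

from atgen.metrics.compute_metrics import compute_metrics
from atgen.metrics.factory import MetricFactory

config = OmegaConf.create({"additional_metrics": []})

predictions = ["The cat sat on the mat."]
references = ["A cat was sitting on the mat."]
sources = ["My cat spends most of the day sitting on the mat in the hallway."]

# High-level path: task defaults for "summarization" (exact_match, sacrebleu,
# bleu, rouge, word_length) plus anything listed in config.additional_metrics.
results = compute_metrics(
    generated_texts=predictions,
    reference_texts=references,
    original_texts=sources,
    task="summarization",
    config=config,
)
print(results)

# Single-metric path: build one runner through the factory and call it directly.
factory = MetricFactory(config)
rouge = factory.get_metric("rouge")
print(rouge.compute(predictions=predictions, references=references))
```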