126 changes: 78 additions & 48 deletions src/cleanlab_tlm/utils/rag.py
@@ -54,6 +54,20 @@
from cleanlab_tlm.tlm import TLMOptions


# Criteria constants for response_helpfulness
_RESPONSE_HELPFULNESS_BINARY_CRITERIA = """Does the AI Assistant Response avoid answering or deflect from the User Query?

A Response is considered as avoiding/deflecting (answer "Yes") if it refuses to answer, such as by saying or implying things like "I don't know", "Sorry", "No information available", or any other form of refusal or deflection.

A Response is considered as attempting to answer (answer "No") if the Assistant makes a genuine attempt to answer the question, even if the answer is incorrect or incomplete. Factual inaccuracies should not affect the assessment."""

_RESPONSE_HELPFULNESS_NUMERIC_CRITERIA = """Assess whether the AI Assistant Response is a helpful answer to the User Query.

A Response is considered helpful if it makes a genuine attempt to answer the question, even if the answer is incorrect or incomplete. Factual inaccuracies should not affect the assessment. The only thing that matters is whether the Assistant tries to answer the question.

A Response is considered not helpful if it avoids answering the question. For example, by saying or implying things like "I don't know", "Sorry", "No information available", or any other form of refusal or deflection."""
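The diff defines both a binary and a numeric criteria variant for `response_helpfulness`. As a purely hypothetical illustration (this helper is not part of the diff), the criteria text would presumably be chosen to match the Eval's mode:

```python
# Hypothetical sketch, not code from this PR: pick the criteria text that
# matches the Eval's mode for the response_helpfulness evaluation.
def _response_helpfulness_criteria(mode: str) -> str:
    if mode == "binary":
        return _RESPONSE_HELPFULNESS_BINARY_CRITERIA
    return _RESPONSE_HELPFULNESS_NUMERIC_CRITERIA
```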


class TrustworthyRAG(BaseTLM):
"""
Real-time Evals for Retrieval-Augmented Generation (RAG) systems, powered by Cleanlab's Trustworthy Language Model (TLM).
@@ -123,6 +137,7 @@ def __init__(
query_identifier=eval_config.get(_TLM_EVAL_QUERY_IDENTIFIER_KEY),
@jwmueller jwmueller Sep 26, 2025


Since you're using "numeric" and "binary" in many places, those should be defined as variables in constants.py:

_NUMERIC_STR = "numeric"
_BINARY_STR = "binary" 

and then you should use those variables throughout.
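For illustration, a minimal sketch of what that suggestion could look like (module path, tuple name, and helper function are assumptions, not code from this PR):

```python
# constants.py (hypothetical module path) -- define the mode strings once
_NUMERIC_STR = "numeric"
_BINARY_STR = "binary"
_VALID_EVAL_MODES = (_NUMERIC_STR, _BINARY_STR)

# rag.py -- reuse the constants instead of repeating string literals
def _validate_eval_mode(mode: str) -> None:
    # Illustrative helper; mirrors the validation added in Eval.__init__ below.
    if mode not in _VALID_EVAL_MODES:
        raise ValueError(f"mode must be '{_NUMERIC_STR}' or '{_BINARY_STR}'")
```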

context_identifier=eval_config.get(_TLM_EVAL_CONTEXT_IDENTIFIER_KEY),
response_identifier=eval_config.get(_TLM_EVAL_RESPONSE_IDENTIFIER_KEY),
mode=eval_config.get("mode") or "numeric", # Default to numeric if not specified
@jwmueller jwmueller Sep 26, 2025


Why do we want this `or "numeric"` part? Isn't the mode always specified for our default built-in Evals?
I think we should make sure the default built-in Evals always specify the mode.
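One way to enforce that (a sketch under the assumption that every built-in config carries a "mode" key; not part of this diff):

```python
# Sketch only: fail loudly if a default built-in Eval omits "mode",
# rather than silently falling back to "numeric".
for _eval_config in _DEFAULT_EVALS:
    assert "mode" in _eval_config, f"default Eval {_eval_config['name']!r} must specify a mode"
```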

)
for eval_config in _DEFAULT_EVALS
]
@@ -861,6 +876,11 @@ class Eval:
response_identifier (str, optional): The exact string used in your evaluation `criteria` to reference the RAG/LLM response.
For example, specifying `response_identifier` as "AI Answer" means your `criteria` should refer to the response as "AI Answer".
Leave this value as None (the default) if this Eval doesn't consider the response.
mode (str, optional): The evaluation mode, either "numeric" (default) or "binary".
- "numeric": For evaluations that naturally have a continuous score range (e.g., helpfulness, coherence).
- "binary": For yes/no evaluations (e.g., does response mention a company, is query appropriate).

Suggested change
- "binary": For yes/no evaluations (e.g., does response mention a company, is query appropriate).
- "binary": For yes/no evaluations (e.g., does response mention a particular company or not).

Both modes return numeric scores in the 0-1 range. For binary evaluations detecting issues,
low scores typically correspond to "Yes" (issue detected) and high scores to "No" (issue not detected).
Comment on lines +882 to +883

Suggested change
Both modes return numeric scores in the 0-1 range. For binary evaluations detecting issues,
low scores typically correspond to "Yes" (issue detected) and high scores to "No" (issue not detected).
Both modes return numeric scores in the 0-1 range.
For numeric evaluations, your `criteria` should define what good vs. bad looks like (low evaluation scores will correspond to cases deemed bad).
For binary evaluations, your `criteria` should be a Yes/No question (low evaluation scores will correspond to "Yes" cases, so phrase your question such that the likelihood of "Yes" matches the likelihood of the particular problem you wish to detect).
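As a concrete illustration of that guidance (the name and criteria text below are hypothetical, not from this PR), a binary-mode Eval phrases its criteria as a Yes/No question where "Yes" flags the problem:

```python
# Hypothetical binary Eval: "Yes" corresponds to the issue being present,
# so low scores flag responses that reveal PII.
pii_eval = Eval(
    name="response_contains_pii",
    criteria="Does the AI Assistant Response reveal any personally identifiable information? Answer Yes or No.",
    response_identifier="AI Assistant Response",
    mode="binary",
)
```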


Note on handling Tool Calls: By default, when a tool call response is detected, evaluations that analyze the response content
(those with a `response_identifier`) are assigned `score=None`. You can override this behavior for specific evals via
@@ -874,6 +894,7 @@ def __init__(
query_identifier: Optional[str] = None,
context_identifier: Optional[str] = None,
response_identifier: Optional[str] = None,
mode: str = "numeric",
):
"""
lazydocs: ignore
@@ -884,11 +905,16 @@ def __init__(
"At least one of query_identifier, context_identifier, or response_identifier must be specified."
)

# Validate mode parameter
if mode not in ["numeric", "binary"]:
raise ValueError("mode must be 'numeric' or 'binary'")

self.name = name
self.criteria = criteria
self.query_identifier = query_identifier
self.context_identifier = context_identifier
self.response_identifier = response_identifier
self.mode = mode

def __repr__(self) -> str:
"""
@@ -903,7 +929,8 @@ def __repr__(self) -> str:
f" 'criteria': '{self.criteria}',\n"
f" 'query_identifier': {self.query_identifier!r},\n"
f" 'context_identifier': {self.context_identifier!r},\n"
f" 'response_identifier': {self.response_identifier!r}\n"
f" 'response_identifier': {self.response_identifier!r},\n"
f" 'mode': '{self.mode}'\n"
f"}}"
)

@@ -915,22 +942,23 @@ def __repr__(self) -> str:
"query_identifier": "Question",
"context_identifier": "Document",
"response_identifier": None,
"mode": "numeric",
},
{
"name": "response_groundedness",
"criteria": "Review the Response to the Query and assess whether every factual claim in the Response is explicitly supported by the provided Context. A Response meets the criteria if all information is directly backed by evidence in the Context, without relying on assumptions, external knowledge, or unstated inferences. The focus is on whether the Response is fully grounded in the Context, rather than whether it fully addresses the Query. If any claim in the Response lacks direct support or introduces information not present in the Context, the Response is bad and does not meet the criteria.",
"query_identifier": "Query",
"context_identifier": "Context",
"response_identifier": "Response",
"mode": "numeric",
},
{
"name": "response_helpfulness",
"criteria": """Assess whether the AI Assistant Response is a helpful answer to the User Query.
A Response is considered helpful if it makes a genuine attempt to answer the question, even if the answer is incorrect or incomplete. Factual inaccuracies should not affect the assessment. The only thing that matters is whether the Assistant tries to answer the question.
A Response is considered not helpful if it avoids answering the question. For example, by saying or implying things like "I don't know", "Sorry", "No information available", or any other form of refusal or deflection.""",
"criteria": _RESPONSE_HELPFULNESS_BINARY_CRITERIA, # Use binary criteria by default
"query_identifier": "User Query",
"context_identifier": None,
"response_identifier": "AI Assistant Response",
"mode": "binary",
},
{
"name": "query_ease",
@@ -942,30 +970,31 @@ def __repr__(self) -> str:
"query_identifier": "User Request",
"context_identifier": None,
"response_identifier": None,
"mode": "numeric",
},
]


def get_default_evals() -> list[Eval]:
"""
Get the evaluation criteria that are run in TrustworthyRAG by default.
Get the evaluation criteria that are run in TrustworthyRAG by default.

Returns:
list[Eval]: A list of [Eval](#class-eval) objects based on pre-configured criteria
that can be used with TrustworthyRAG.
Returns:
list[Eval]: A list of [Eval](#class-eval) objects based on pre-configured criteria
that can be used with TrustworthyRAG.

Example:
```python
default_evaluations = get_default_evals()
Example:
```python
default_evaluations = get_default_evals()

# You can modify the default Evals by:
# 1. Adding new evaluation criteria
# 2. Updating existing criteria with custom text
# 3. Removing specific evaluations you don't need
# You can modify the default Evals by:
# 1. Adding new evaluation criteria
# 2. Updating existing criteria with custom text
# 3. Removing specific evaluations you don't need

# Run TrustworthyRAG with your modified Evals
trustworthy_rag = TrustworthyRAG(evals=modified_evaluations)
```
# Run TrustworthyRAG with your modified Evals
trustworthy_rag = TrustworthyRAG(evals=modified_evaluations)
```
"""
return [
Eval(
@@ -974,6 +1003,7 @@ def get_default_evals() -> list[Eval]:
query_identifier=eval_config.get("query_identifier"),
context_identifier=eval_config.get("context_identifier"),
response_identifier=eval_config.get("response_identifier"),
mode=eval_config.get("mode") or "numeric",
@jwmueller jwmueller Sep 26, 2025


Same as my comment at the top: why do we have a fallback mode for the default built-in Evals? The default built-in Evals should not be allowed to have mode unspecified.

)
for eval_config in _DEFAULT_EVALS
]
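For reference, the docstring's `modified_evaluations` could be built along these lines (a hypothetical usage sketch, not part of this diff):

```python
# Hypothetical usage: start from the defaults, drop an Eval you don't need,
# and pass the remainder to TrustworthyRAG.
default_evaluations = get_default_evals()
modified_evaluations = [e for e in default_evaluations if e.name != "query_ease"]
trustworthy_rag = TrustworthyRAG(evals=modified_evaluations)
```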
@@ -997,26 +1027,26 @@ class EvalMetric(TypedDict):
class TrustworthyRAGResponse(dict[str, Union[Optional[str], EvalMetric]]):
"""Object returned by `TrustworthyRAG.generate()` containing generated text and evaluation scores. This class is a dictionary with specific keys.

Attributes:
response (str): The generated response text.
trustworthiness ([EvalMetric](#class-evalmetric)): Overall trustworthiness of the response.
Additional keys: Various evaluation metrics (context_sufficiency, response_helpfulness, etc.),
each following the [EvalMetric](#class-evalmetric) structure.

Example:
```python
{
"response": "<response text>",
"trustworthiness": {
"score": 0.92,
"log": {"explanation": "Did not find a reason to doubt trustworthiness."}
},
"context_informativeness": {
"score": 0.65
},
...
}
```
Attributes:
response (str): The generated response text.
trustworthiness ([EvalMetric](#class-evalmetric)): Overall trustworthiness of the response.
Additional keys: Various evaluation metrics (context_sufficiency, response_helpfulness, etc.),
each following the [EvalMetric](#class-evalmetric) structure.

Example:
```python
{
"response": "<response text>",
"trustworthiness": {
"score": 0.92,
"log": {"explanation": "Did not find a reason to doubt trustworthiness."}
},
"context_informativeness": {
"score": 0.65
},
...
}
```
"""


@@ -1031,15 +1061,15 @@ class TrustworthyRAGScore(dict[str, EvalMetric]):

Example:
```python
{
"trustworthiness": {
"score": 0.92,
"log": {"explanation": "Did not find a reason to doubt trustworthiness."}
},
"context_informativeness": {
"score": 0.65
},
...
}
{
"trustworthiness": {
"score": 0.92,
"log": {"explanation": "Did not find a reason to doubt trustworthiness."}
},
"context_informativeness": {
"score": 0.65
},
...
}
```
"""