From 22df91d8f132014d14999946e6cbbbf7f3cc87b4 Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Sat, 3 May 2025 15:28:51 -0400 Subject: [PATCH 1/4] adding in error messages & timeout for user permission message --- dspy/teleprompt/mipro_optimizer_v2.py | 67 +++++++++++++++++++-------- 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index d650d7da71..3b33a25306 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -3,11 +3,14 @@ import textwrap from collections import defaultdict from typing import Any, Callable, Dict, List, Literal, Optional, Tuple +import select +import sys +import time import numpy as np import optuna from optuna.distributions import CategoricalDistribution - +import math import dspy from dspy.evaluate.evaluate import Evaluate from dspy.propose import GroundedProposer @@ -53,10 +56,8 @@ def __init__( teacher_settings: Dict = {}, max_bootstrapped_demos: int = 4, max_labeled_demos: int = 4, - auto: Optional[Literal["light", "medium", "heavy"]] = "medium", - num_candidates: int = 10, - num_fewshot_candidates: Optional[int] = None, - num_instruct_candidates: Optional[int] = None, + auto: Optional[Literal["light", "medium", "heavy"]] = "light", + num_candidates: Optional[int] = None, num_threads: Optional[int] = None, max_errors: int = 10, seed: int = 9, @@ -71,9 +72,9 @@ def __init__( if auto not in allowed_modes: raise ValueError(f"Invalid value for auto: {auto}. Must be one of {allowed_modes}.") self.auto = auto - - self.num_fewshot_candidates = num_fewshot_candidates or num_candidates - self.num_instruct_candidates = num_instruct_candidates or num_candidates + self.num_fewshot_candidates = num_candidates + self.num_instruct_candidates = num_candidates + self.num_candidates = num_candidates self.metric = metric self.init_temperature = init_temperature self.task_model = task_model if task_model else dspy.settings.lm @@ -99,7 +100,7 @@ def compile( trainset: List, teacher: Any = None, valset: Optional[List] = None, - num_trials: int = 30, + num_trials: Optional[int] = None, max_bootstrapped_demos: Optional[int] = None, max_labeled_demos: Optional[int] = None, seed: Optional[int] = None, @@ -114,6 +115,21 @@ def compile( requires_permission_to_run: bool = True, provide_traceback: Optional[bool] = None, ) -> Any: + + zeroshot_opt = (self.max_bootstrapped_demos == 0) and (self.max_labeled_demos == 0) + + # If auto is None, and num_trials is not provided (but num_candidates is), raise an error that suggests a good num_trials value + if self.auto is None and (self.num_candidates is not None and num_trials is None): + raise ValueError(f"If auto is None, num_trials must also be provided. Given num_candidates={self.num_candidates}, we'd recommend setting num_trials to ~{self._set_num_trials_from_num_candidates(student, zeroshot_opt, self.num_candidates)}.") + + # If auto is None, and num_candidates or num_trials is None, raise an error + if self.auto is None and (self.num_candidates is None or num_trials is None): + raise ValueError("If auto is None, num_candidates must also be provided.") + + # If auto is provided, and either num_candidates or num_trials is not None, raise an error + if self.auto is not None and (self.num_candidates is not None or num_trials is not None): + raise ValueError("If auto is not None, num_candidates and num_trials cannot be set, since they would be overrided by the auto settings. 
Please either set auto to None, or do not specify num_candidates and num_trials.") + # Set random seeds seed = seed or self.seed self._set_random_seeds(seed) @@ -128,7 +144,6 @@ def compile( trainset, valset = self._set_and_validate_datasets(trainset, valset) # Set hyperparameters based on run mode (if set) - zeroshot_opt = (self.max_bootstrapped_demos == 0) and (self.max_labeled_demos == 0) num_trials, valset, minibatch = self._set_hyperparams_from_run_mode( student, num_trials, minibatch, zeroshot_opt, valset ) @@ -204,6 +219,15 @@ def _set_random_seeds(self, seed): self.rng = random.Random(seed) np.random.seed(seed) + def _set_num_trials_from_num_candidates(self, program, zeroshot_opt, num_candidates): + num_vars = len(program.predictors()) + if not zeroshot_opt: + num_vars *= 2 # Account for few-shot examples + instruction variables + # Trials = MAX(c*M*log(N), c=2, 3/2*N) + num_trials = int(max(2 * num_vars * np.log2(num_candidates), 1.5 * num_candidates)) + + return num_trials + def _set_hyperparams_from_run_mode( self, program: Any, @@ -226,11 +250,7 @@ def _set_hyperparams_from_run_mode( self.num_instruct_candidates = auto_settings["n"] if zeroshot_opt else int(auto_settings["n"] * 0.5) self.num_fewshot_candidates = auto_settings["n"] - num_vars = len(program.predictors()) - if not zeroshot_opt: - num_vars *= 2 # Account for few-shot examples + instruction variables - # Trials = MAX(c*M*log(N), c=2, 3/2*N) - num_trials = max(2 * num_vars * np.log(auto_settings["n"]), 1.5 * auto_settings["n"]) + num_trials = self._set_num_trials_from_num_candidates(program, zeroshot_opt, auto_settings["n"]) return num_trials, valset, minibatch @@ -353,6 +373,7 @@ def _get_user_confirmation( user_confirmation_message = textwrap.dedent( f"""\ To proceed with the execution of this program, please confirm by typing {BLUE}'y'{ENDC} for yes or {BLUE}'n'{ENDC} for no. + If no input is received within 20 seconds, the program will proceed automatically. If you would like to bypass this confirmation step in future executions, set the {YELLOW}`requires_permission_to_run`{ENDC} flag to {YELLOW}`False`{ENDC} when calling compile. @@ -360,10 +381,18 @@ def _get_user_confirmation( """ ) - user_input = ( - input(f"{user_message}\n{user_confirmation_message}\nDo you wish to continue? (y/n): ").strip().lower() - ) - return user_input == "y" + print(f"{user_message}\n{user_confirmation_message}\nDo you wish to continue? (y/n): ", end='', flush=True) + + # Wait for input with timeout + start_time = time.time() + while time.time() - start_time < 20: + if select.select([sys.stdin], [], [], 0.1)[0]: + user_input = sys.stdin.readline().strip().lower() + return user_input == "y" + time.sleep(0.1) + + print("\nNo input received within 20 seconds. 
Proceeding with execution...") + return True def _bootstrap_fewshot_examples(self, program: Any, trainset: List, seed: int, teacher: Any) -> Optional[List]: logger.info("\n==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==") From f2408fab373fc6aa6980605c03994fffa2a582b1 Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Mon, 2 Jun 2025 14:11:56 -0400 Subject: [PATCH 2/4] wip --- dspy/teleprompt/simba_utils.py | 89 ++++++++++++++++++++++++++-------- pyproject.toml | 2 +- 2 files changed, 69 insertions(+), 22 deletions(-) diff --git a/dspy/teleprompt/simba_utils.py b/dspy/teleprompt/simba_utils.py index 3765a33f1f..3a23eb6d11 100644 --- a/dspy/teleprompt/simba_utils.py +++ b/dspy/teleprompt/simba_utils.py @@ -3,20 +3,40 @@ import inspect import logging import textwrap +import re from dspy.adapters.utils import get_field_description_string from dspy.signatures import InputField, OutputField -from typing import Callable +from typing import Callable, Optional, Dict, Any logger = logging.getLogger(__name__) +def prepare_models_for_resampling(program: dspy.Module, n: int, teacher_settings: Optional[Dict] = None): + + models = [] + if teacher_settings: + with dspy.settings.context(trace=[], **teacher_settings): + lm = dspy.settings.lm + models.append(lm) -def prepare_models_for_resampling(program: dspy.Module, n: int): lm = program.get_lm() or dspy.settings.lm - temps = [lm.kwargs["temperature"]] + [0.5 + i * (0.5 / n) for i in range(n)] - temps = list(dict.fromkeys(temps))[:n] - return [lm.copy(temperature=t) for t in temps] + # Check to see if our model is a reasoning model, which means temp must stay as 1.0 + model_family = lm.model.split("/")[-1].lower() if "/" in lm.model else lm.model.lower() + model_pattern = re.match(r"^o([13])(?:-mini)?", model_family) + + if model_pattern: # Vary the seed + start_seed = 0 if "seed" not in lm.kwargs else lm.kwargs["seed"] + seeds = [start_seed + 1 + i for i in range(n-len(models))] + seeds = list(dict.fromkeys(seeds))[:(n-len(models))] + models.extend([lm.copy(seed=seed) for seed in seeds]) + else: # Vary the temperature + start_temp = 0 if "temperature" not in lm.kwargs else lm.kwargs["temperature"] + temps = [start_temp + 0.5 + i * (0.5 / n) for i in range(n-len(models))] + temps = list(dict.fromkeys(temps))[:(n-len(models))] + models.extend([lm.copy(temperature=t) for t in temps]) + + return models def wrap_program(program: dspy.Module, metric: Callable): def wrapped_program(example): @@ -25,33 +45,53 @@ def wrapped_program(example): try: prediction = program(**example.inputs()) except Exception as e: - print(e) + logger.info(e) trace = dspy.settings.trace.copy() + output = None + score = 0.0 + output_metadata = {} + try: - score = metric(example, prediction) + output = metric(example, prediction) + if isinstance(output, (int, float)): + score = output + elif isinstance(output, dspy.Prediction): + if not hasattr(output, 'score'): + raise ValueError("dspy.Prediction must contain a 'score' attribute") + score = output.score + # Just extract fields from _store, excluding 'score' + output_metadata = { + k: v for k, v in output._store.items() if k != "score" + } except Exception as e: - print(e) + logger.info(e) - # Include the `example` in the output for subsequent usage in buckets/strategies. 
return { "prediction": prediction, "trace": trace, "score": score, - "example": example + "example": example, + "output_metadata": output_metadata } return wrapped_program - - def append_a_demo(demo_input_field_maxlen): def append_a_demo_(bucket, system, **kwargs): predictor2name, name2predictor = kwargs["predictor2name"], kwargs["name2predictor"] + batch_10p_score = kwargs["batch_10p_score"] - trace = bucket[0]["trace"] + logger.info(f"Appending a demo with max length {demo_input_field_maxlen}") + + good = bucket[0] + trace = good["trace"] name2demo = {} + if good["score"] <= batch_10p_score: + logger.info(f"Skipping appending a demo as good score {good['score']} is at or below the 10th percentile (<={batch_10p_score}).") + return False + for step in trace: predictor, _inputs, _outputs = step @@ -62,28 +102,29 @@ def append_a_demo_(bucket, system, **kwargs): demo = dspy.Example(augmented=True, **_inputs, **_outputs) name = predictor2name[id(predictor)] name2demo[name] = demo # keep the last demo for each predictor - for name, demo in name2demo.items(): predictor = name2predictor[name] predictor.demos.append(demo) - logger.info(f"Added {len(name2demo)} demos (one each) across all predictors.") + logger.info(f"Added {len(name2demo)} demos (one each) across all predictors. Each predictor now has {len(predictor.demos)} demos total.") return True return append_a_demo_ def append_a_rule(bucket, system, **kwargs): + # Read in kwargs predictor2name = kwargs["predictor2name"] batch_10p_score, batch_90p_score = kwargs["batch_10p_score"], kwargs["batch_90p_score"] + prompt_model = kwargs["prompt_model"] or dspy.settings.lm module_names = [name for name, _ in system.named_predictors()] good, bad = bucket[0], bucket[-1] example = good["example"] - if good["score"] < batch_10p_score or bad["score"] > batch_90p_score: - logger.info(f"Skipping rule generation as good score {good['score']} is below the 10th percentile " - f"*or* bad score {bad['score']} is above the 90th percentile.") + if good["score"] <= batch_10p_score or bad["score"] >= batch_90p_score: + logger.info(f"Skipping rule generation as good score {good['score']} is at or below the 10th percentile (<={batch_10p_score}) " + f"*or* bad score {bad['score']} is at or above the 90th percentile, (>={batch_90p_score}).") return False if good["score"] <= bad["score"]: @@ -116,12 +157,17 @@ def append_a_rule(bucket, system, **kwargs): worse_program_outputs=dict(bad["prediction"] or {}), worse_reward_value=bad["score"], better_reward_value=good["score"], + worse_reward_info=bad["output_metadata"], + better_reward_info=good["output_metadata"], module_names=module_names, ) kwargs = {k: v if isinstance(v, str) else ujson.dumps(recursive_mask(v), indent=2) for k, v in kwargs.items()} - advice = dspy.Predict(OfferFeedback)(**kwargs).module_advice + + with dspy.settings.context(trace=[], lm=prompt_model): + advice_program = dspy.Predict(OfferFeedback) + advice = advice_program(**kwargs).module_advice for name, predictor in system.named_predictors(): if name in advice: @@ -155,11 +201,13 @@ class OfferFeedback(dspy.Signature): ) worse_program_outputs: str = InputField(desc="The outputs of the program that we are analyzing") worse_reward_value: float = InputField(desc="The reward value assigned to the program's outputs") + worse_reward_info: str = InputField(desc="Additional information that might be helpful to understanding the assigned reward value.") better_program_trajectory: str = InputField( desc="The trajectory of the program's execution, showing each 
module's I/O" ) better_program_outputs: str = InputField(desc="The outputs of the program that we are analyzing") better_reward_value: float = InputField(desc="The reward value assigned to the program's outputs") + better_reward_info: str = InputField(desc="Additional information that might be helpful to understanding the assigned reward value.") module_names: list[str] = InputField(desc="The names of the modules in the program, for which we seek advice") discussion: str = OutputField(desc="Discussing blame of where each module went wrong, if it did") module_advice: dict[str, str] = OutputField( @@ -169,7 +217,6 @@ class OfferFeedback(dspy.Signature): "like the successful trajectory rather than the lower-scoring trajectory." ) - def inspect_modules(program): separator = "-" * 80 output = [separator] @@ -209,4 +256,4 @@ def recursive_mask(o): return tuple(recursive_mask(v) for v in o) # Otherwise, replace it with a placeholder string (or use repr(o)). else: - return f"" + return f"" \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 78c1e637d8..6eb315a221 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ dependencies = [ "backoff>=2.2", "joblib~=1.3", - "openai>=0.28.1", + "openai>=0.28.1,<=1.67.0", "pandas>=2.1.1", "regex>=2023.10.3", "ujson>=5.8.0", From 8b3b6e60afdabd91505002a1463354c25db72d9a Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Mon, 2 Jun 2025 14:29:57 -0400 Subject: [PATCH 3/4] wip --- dspy/teleprompt/mipro_optimizer_v2.py | 37 +-------------------------- 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index 879d25711a..05dd4eec1a 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -7,14 +7,9 @@ from typing import TYPE_CHECKING from collections import defaultdict from typing import Any, Callable, Dict, List, Literal, Optional, Tuple -import select -import sys -import time import numpy as np -import optuna -from optuna.distributions import CategoricalDistribution -import math + import dspy from dspy.evaluate.evaluate import Evaluate from dspy.propose import GroundedProposer @@ -122,23 +117,6 @@ def compile( requires_permission_to_run: bool = True, provide_traceback: Optional[bool] = None, ) -> Any: -<<<<<<< HEAD - - zeroshot_opt = (self.max_bootstrapped_demos == 0) and (self.max_labeled_demos == 0) - - # If auto is None, and num_trials is not provided (but num_candidates is), raise an error that suggests a good num_trials value - if self.auto is None and (self.num_candidates is not None and num_trials is None): - raise ValueError(f"If auto is None, num_trials must also be provided. Given num_candidates={self.num_candidates}, we'd recommend setting num_trials to ~{self._set_num_trials_from_num_candidates(student, zeroshot_opt, self.num_candidates)}.") - - # If auto is None, and num_candidates or num_trials is None, raise an error - if self.auto is None and (self.num_candidates is None or num_trials is None): - raise ValueError("If auto is None, num_candidates must also be provided.") - - # If auto is provided, and either num_candidates or num_trials is not None, raise an error - if self.auto is not None and (self.num_candidates is not None or num_trials is not None): - raise ValueError("If auto is not None, num_candidates and num_trials cannot be set, since they would be overrided by the auto settings. 
Please either set auto to None, or do not specify num_candidates and num_trials.") - -======= zeroshot_opt = (self.max_bootstrapped_demos == 0) and (self.max_labeled_demos == 0) @@ -154,7 +132,6 @@ def compile( if self.auto is not None and (self.num_candidates is not None or num_trials is not None): raise ValueError("If auto is not None, num_candidates and num_trials cannot be set, since they would be overrided by the auto settings. Please either set auto to None, or do not specify num_candidates and num_trials.") ->>>>>>> 82d3878b12b4632b3c549d9c4e85eaef360ad1f7 # Set random seeds seed = seed or self.seed self._set_random_seeds(seed) @@ -252,11 +229,7 @@ def _set_num_trials_from_num_candidates(self, program, zeroshot_opt, num_candida num_trials = int(max(2 * num_vars * np.log2(num_candidates), 1.5 * num_candidates)) return num_trials -<<<<<<< HEAD - -======= ->>>>>>> 82d3878b12b4632b3c549d9c4e85eaef360ad1f7 def _set_hyperparams_from_run_mode( self, program: Any, @@ -411,11 +384,7 @@ def _get_user_confirmation( ) print(f"{user_message}\n{user_confirmation_message}\nDo you wish to continue? (y/n): ", end='', flush=True) -<<<<<<< HEAD - -======= ->>>>>>> 82d3878b12b4632b3c549d9c4e85eaef360ad1f7 # Wait for input with timeout start_time = time.time() while time.time() - start_time < 20: @@ -423,11 +392,7 @@ def _get_user_confirmation( user_input = sys.stdin.readline().strip().lower() return user_input == "y" time.sleep(0.1) -<<<<<<< HEAD - -======= ->>>>>>> 82d3878b12b4632b3c549d9c4e85eaef360ad1f7 print("\nNo input received within 20 seconds. Proceeding with execution...") return True From 44313d9264181dfe0b6786f41c4b74d56ca33fcc Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Fri, 8 Aug 2025 12:32:04 -0400 Subject: [PATCH 4/4] fixing litellm logging --- dspy/clients/__init__.py | 44 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/dspy/clients/__init__.py b/dspy/clients/__init__.py index 735691e9ce..496aa48eb7 100644 --- a/dspy/clients/__init__.py +++ b/dspy/clients/__init__.py @@ -3,9 +3,43 @@ from pathlib import Path from typing import Optional +# Set environment variables before importing litellm +os.environ["LITELLM_LOG"] = "ERROR" +os.environ["OPENAI_LOG"] = "ERROR" + import litellm from litellm.caching.caching import Cache as LitellmCache +def _configure_litellm_logging(level: str = "ERROR"): + """Configure LiteLLM logging to the specified level.""" + # Update environment variables + os.environ["LITELLM_LOG"] = level + os.environ["OPENAI_LOG"] = level + + # Cover both capitalization variants used by LiteLLM + logger_names = [ + "LiteLLM", + "LiteLLM.utils", + "LiteLLM.proxy.utils", + "litellm", + "litellm.utils", + "litellm.proxy.utils", + ] + _level = getattr(logging, level) + for logger_name in logger_names: + lg = logging.getLogger(logger_name) + lg.setLevel(_level) + lg.propagate = False + # Remove all existing handlers or force them to the desired level + for h in lg.handlers[:]: + h.setLevel(_level) + # Ensure there is at least a NullHandler to swallow logs + if not lg.handlers: + lg.addHandler(logging.NullHandler()) + +# Immediately disable LiteLLM logging after import +_configure_litellm_logging("ERROR") + from dspy.clients.base_lm import BaseLM, inspect_history from dspy.clients.cache import Cache from dspy.clients.embedding import Embedder @@ -86,9 +120,6 @@ def configure_cache( memory_max_entries=1000000, ) -# Turn off by default to avoid LiteLLM logging during every LM call. 
-litellm.suppress_debug_info = True - if "LITELLM_LOCAL_MODEL_COST_MAP" not in os.environ: # Accessed at run time by litellm; i.e., fine to keep after import os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" @@ -96,10 +127,17 @@ def configure_cache( def enable_litellm_logging(): litellm.suppress_debug_info = False + _configure_litellm_logging("INFO") + # Remove environment variables to allow logging + if "LITELLM_LOG" in os.environ: + del os.environ["LITELLM_LOG"] + if "OPENAI_LOG" in os.environ: + del os.environ["OPENAI_LOG"] def disable_litellm_logging(): litellm.suppress_debug_info = True + _configure_litellm_logging("ERROR") __all__ = [