use rate limit headers for smarter retry in http backoff

hanouticelina · hanouticelina · commit 31b3bb5ab4c5 · 2025-11-27T12:30:37.000+01:00
diff --git a/src/huggingface_hub/utils/_http.py b/src/huggingface_hub/utils/_http.py
@@ -362,6 +362,7 @@ def _http_backoff_base(
 
     nb_tries = 0
     sleep_time = base_wait_time
+    ratelimit_reset: Optional[int] = None  # seconds to wait for rate limit reset if 429 response
 
     # If `data` is used and is a file object (or any IO), it will be consumed on the
     # first HTTP request. We need to save the initial position so that the full content
@@ -373,6 +374,7 @@ def _http_backoff_base(
     client = get_session()
     while True:
         nb_tries += 1
+        ratelimit_reset = None
         try:
             # If `data` is used and is a file object (or any IO), set back cursor to
             # initial position.
@@ -382,6 +384,8 @@ def _http_backoff_base(
             # Perform request and handle response
             def _should_retry(response: httpx.Response) -> bool:
                 """Handle response and return True if should retry, False if should return/yield."""
+                nonlocal ratelimit_reset
+
                 if response.status_code not in retry_on_status_codes:
                     return False  # Success, don't retry
 
@@ -393,6 +397,12 @@ def _should_retry(response: httpx.Response) -> bool:
                     # user ask for retry on a status code that doesn't raise_for_status.
                     return False  # Don't retry, return/yield response
 
+                # On a 429 response, read the rate limit reset delay (in seconds) from the headers
+                if response.status_code == 429:
+                    ratelimit_info = parse_ratelimit_headers(response.headers)
+                    if ratelimit_info is not None:
+                        ratelimit_reset = ratelimit_info.reset_in_seconds
+
                 return True  # Should retry
 
             if stream:
@@ -415,9 +425,15 @@ def _should_retry(response: httpx.Response) -> bool:
             if nb_tries > max_retries:
                 raise err
 
-        # Sleep for X seconds
-        logger.warning(f"Retrying in {sleep_time}s [Retry {nb_tries}/{max_retries}].")
-        time.sleep(sleep_time)
+        # Wait for the rate limit reset if known, else exponential backoff (both capped at max_wait_time)
+        if ratelimit_reset is not None:
+            actual_sleep = min(max_wait_time, float(ratelimit_reset))
+            logger.warning(f"Rate limited. Waiting {actual_sleep}s before retry [Retry {nb_tries}/{max_retries}].")
+        else:
+            actual_sleep = min(max_wait_time, sleep_time)
+            logger.warning(f"Retrying in {actual_sleep}s [Retry {nb_tries}/{max_retries}].")
+
+        time.sleep(actual_sleep)
 
         # Update sleep time for next retry
         sleep_time = min(max_wait_time, sleep_time * 2)  # Exponential backoff
diff --git a/tests/test_utils_http.py b/tests/test_utils_http.py
@@ -151,6 +151,62 @@ def _side_effect_timer() -> Generator[ConnectTimeout, None, None]:
         expected_sleep_times = [0.1, 0.2, 0.4, 0.5, 0.5]
         self.assertListEqual(sleep_times, expected_sleep_times)
 
+    def test_backoff_on_429_uses_ratelimit_header_capped(self) -> None:
+        """Test that 429 wait time is capped by max_wait_time."""
+        sleep_times = []
+
+        def _side_effect_timer() -> Generator:
+            t0 = time.time()
+            mock_429 = Mock()
+            mock_429.status_code = 429
+            mock_429.headers = {"ratelimit": '"api";r=0;t=1'}  # Server says wait 1s
+            yield mock_429
+            t1 = time.time()
+            sleep_times.append(round(t1 - t0, 1))
+            t0 = t1
+            mock_200 = Mock()
+            mock_200.status_code = 200
+            yield mock_200
+
+        self.mock_request.side_effect = _side_effect_timer()
+
+        # max_wait_time=0.5 is less than t=1, so wait should be capped at 0.5
+        response = http_backoff(
+            "GET", URL, base_wait_time=0.1, max_wait_time=0.5, max_retries=3, retry_on_status_codes=429
+        )
+
+        self.assertEqual(self.mock_request.call_count, 2)
+        self.assertEqual(sleep_times, [0.5])  # Capped at max_wait_time
+        self.assertEqual(response.status_code, 200)
+
+    def test_backoff_on_429_uses_ratelimit_header_not_capped(self) -> None:
+        """Test that 429 wait time uses full reset time when under max_wait_time."""
+        sleep_times = []
+
+        def _side_effect_timer() -> Generator:
+            t0 = time.time()
+            mock_429 = Mock()
+            mock_429.status_code = 429
+            mock_429.headers = {"ratelimit": '"api";r=0;t=1'}  # Server says wait 1s
+            yield mock_429
+            t1 = time.time()
+            sleep_times.append(round(t1 - t0, 1))
+            t0 = t1
+            mock_200 = Mock()
+            mock_200.status_code = 200
+            yield mock_200
+
+        self.mock_request.side_effect = _side_effect_timer()
+
+        # max_wait_time=5 is greater than t=1, so wait should be full 1s
+        response = http_backoff(
+            "GET", URL, base_wait_time=0.1, max_wait_time=5.0, max_retries=3, retry_on_status_codes=429
+        )
+
+        self.assertEqual(self.mock_request.call_count, 2)
+        self.assertEqual(sleep_times, [1.0])  # Full reset time (not capped)
+        self.assertEqual(response.status_code, 200)
+
 
 class TestConfigureSession(unittest.TestCase):
     def setUp(self) -> None: