Commit dd8d310
feat(endpoints): scaling metric and threshold (#3525)

* feat(endpoints): scaling metric and threshold

  Add the possibility to customize both the scaling metric and threshold
  when creating or updating an endpoint.

  Signed-off-by: Raphael Glon <[email protected]>

* Review

  Co-authored-by: célina <[email protected]>

---------

Signed-off-by: Raphael Glon <[email protected]>
Co-authored-by: Raphael Glon <[email protected]>
Co-authored-by: célina <[email protected]>
1 parent 743fd8c commit dd8d310
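
At the Python API level, the change surfaces as two new keyword arguments on `HfApi.create_inference_endpoint` and `HfApi.update_inference_endpoint` (diffs below). A minimal usage sketch; the endpoint name and hardware/region values are placeholders, not part of this commit:

```python
from huggingface_hub import HfApi
from huggingface_hub._inference_endpoints import InferenceEndpointScalingMetric

api = HfApi()

# Placeholder endpoint configuration; only the last two kwargs are new in this commit.
endpoint = api.create_inference_endpoint(
    "my-endpoint",
    repository="gpt2",
    framework="pytorch",
    accelerator="cpu",
    instance_size="x2",
    instance_type="intel-icl",
    region="us-east-1",
    vendor="aws",
    min_replica=0,
    max_replica=2,
    scale_to_zero_timeout=15,
    scaling_metric=InferenceEndpointScalingMetric.PENDING_REQUESTS,  # or "pendingRequests"
    scaling_threshold=1.5,
)
```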

5 files changed: +90 −3 lines

docs/source/en/package_reference/cli.md (+7 −0)

```diff
@@ -401,6 +401,11 @@ $ hf endpoints deploy [OPTIONS] NAME
 * `--namespace TEXT`: The namespace associated with the Inference Endpoint. Defaults to the current user's namespace.
 * `--task TEXT`: The task on which to deploy the model (e.g. 'text-classification').
 * `--token TEXT`: A User Access Token generated from https://huggingface.co/settings/tokens.
+* `--min-replica INTEGER`: The minimum number of replicas (instances) to keep running for the Inference Endpoint. [default: 1]
+* `--max-replica INTEGER`: The maximum number of replicas (instances) to scale to for the Inference Endpoint. [default: 1]
+* `--scale-to-zero-timeout INTEGER`: The duration in minutes before an inactive endpoint is scaled to zero.
+* `--scaling-metric [pendingRequests|hardwareUsage]`: The metric reference for scaling.
+* `--scaling-threshold FLOAT`: The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.
 * `--help`: Show this message and exit.
 
 ### `hf endpoints describe`
@@ -542,6 +547,8 @@ $ hf endpoints update [OPTIONS] NAME
 * `--min-replica INTEGER`: The minimum number of replicas (instances) to keep running for the Inference Endpoint.
 * `--max-replica INTEGER`: The maximum number of replicas (instances) to scale to for the Inference Endpoint.
 * `--scale-to-zero-timeout INTEGER`: The duration in minutes before an inactive endpoint is scaled to zero.
+* `--scaling-metric [pendingRequests|hardwareUsage]`: The metric reference for scaling.
+* `--scaling-threshold FLOAT`: The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.
 * `--token TEXT`: A User Access Token generated from https://huggingface.co/settings/tokens.
 * `--help`: Show this message and exit.
```
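
The same options can be exercised from a script; a sketch driving the CLI through `subprocess` (the endpoint name is a placeholder, and any other required deploy options, e.g. the model repo, are elided here):

```python
import subprocess

# Flag names are taken from the docs diff above; values are illustrative.
subprocess.run(
    [
        "hf", "endpoints", "deploy", "my-endpoint",
        "--min-replica", "0",
        "--max-replica", "2",
        "--scale-to-zero-timeout", "15",
        "--scaling-metric", "pendingRequests",
        "--scaling-threshold", "1.5",
    ],
    check=True,
)
```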

src/huggingface_hub/_inference_endpoints.py (+5 −0)

```diff
@@ -34,6 +34,11 @@ class InferenceEndpointType(str, Enum):
     PRIVATE = "private"
 
 
+class InferenceEndpointScalingMetric(str, Enum):
+    PENDING_REQUESTS = "pendingRequests"
+    HARDWARE_USAGE = "hardwareUsage"
+
+
 @dataclass
 class InferenceEndpoint:
     """
```

src/huggingface_hub/cli/inference_endpoints.py (+50 −1)

```diff
@@ -5,7 +5,7 @@
 
 import typer
 
-from huggingface_hub._inference_endpoints import InferenceEndpoint
+from huggingface_hub._inference_endpoints import InferenceEndpoint, InferenceEndpointScalingMetric
 from huggingface_hub.errors import HfHubHTTPError
 
 from ._cli_utils import TokenOpt, get_hf_api, typer_factory
@@ -112,6 +112,36 @@ def deploy(
         ),
     ] = None,
     token: TokenOpt = None,
+    min_replica: Annotated[
+        int,
+        typer.Option(
+            help="The minimum number of replicas (instances) to keep running for the Inference Endpoint.",
+        ),
+    ] = 1,
+    max_replica: Annotated[
+        int,
+        typer.Option(
+            help="The maximum number of replicas (instances) to scale to for the Inference Endpoint.",
+        ),
+    ] = 1,
+    scale_to_zero_timeout: Annotated[
+        Optional[int],
+        typer.Option(
+            help="The duration in minutes before an inactive endpoint is scaled to zero.",
+        ),
+    ] = None,
+    scaling_metric: Annotated[
+        Optional[InferenceEndpointScalingMetric],
+        typer.Option(
+            help="The metric reference for scaling.",
+        ),
+    ] = None,
+    scaling_threshold: Annotated[
+        Optional[float],
+        typer.Option(
+            help="The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.",
+        ),
+    ] = None,
 ) -> None:
     """Deploy an Inference Endpoint from a Hub repository."""
     api = get_hf_api(token=token)
@@ -127,6 +157,11 @@ def deploy(
         namespace=namespace,
         task=task,
         token=token,
+        min_replica=min_replica,
+        max_replica=max_replica,
+        scaling_metric=scaling_metric,
+        scaling_threshold=scaling_threshold,
+        scale_to_zero_timeout=scale_to_zero_timeout,
     )
 
     _print_endpoint(endpoint)
@@ -262,6 +297,18 @@ def update(
             help="The duration in minutes before an inactive endpoint is scaled to zero.",
         ),
     ] = None,
+    scaling_metric: Annotated[
+        Optional[InferenceEndpointScalingMetric],
+        typer.Option(
+            help="The metric reference for scaling.",
+        ),
+    ] = None,
+    scaling_threshold: Annotated[
+        Optional[float],
+        typer.Option(
+            help="The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.",
+        ),
+    ] = None,
     token: TokenOpt = None,
 ) -> None:
     """Update an existing endpoint."""
@@ -280,6 +327,8 @@ def update(
             min_replica=min_replica,
             max_replica=max_replica,
             scale_to_zero_timeout=scale_to_zero_timeout,
+            scaling_metric=scaling_metric,
+            scaling_threshold=scaling_threshold,
             token=token,
         )
     except HfHubHTTPError as error:
```
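
Typer derives the `[pendingRequests|hardwareUsage]` choice syntax seen in the CLI docs directly from the enum annotation and validates input against the enum values. A self-contained sketch of the pattern with a toy app (not the actual huggingface_hub CLI wiring):

```python
from enum import Enum
from typing import Annotated, Optional

import typer

app = typer.Typer()


class ScalingMetric(str, Enum):  # mirrors InferenceEndpointScalingMetric
    PENDING_REQUESTS = "pendingRequests"
    HARDWARE_USAGE = "hardwareUsage"


@app.command()
def update(
    scaling_metric: Annotated[
        Optional[ScalingMetric],
        typer.Option(help="The metric reference for scaling."),
    ] = None,
) -> None:
    # Typer passes a validated enum member (or None if the flag is omitted).
    typer.echo(f"scaling_metric={scaling_metric}")


if __name__ == "__main__":
    app()  # e.g. `python app.py --scaling-metric pendingRequests`
```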

src/huggingface_hub/hf_api.py (+21 −2)

```diff
@@ -59,7 +59,7 @@
     _upload_files,
     _warn_on_overwriting_operations,
 )
-from ._inference_endpoints import InferenceEndpoint, InferenceEndpointType
+from ._inference_endpoints import InferenceEndpoint, InferenceEndpointScalingMetric, InferenceEndpointType
 from ._jobs_api import JobInfo, JobSpec, ScheduledJobInfo, _create_job_spec
 from ._space_api import SpaceHardware, SpaceRuntime, SpaceStorage, SpaceVariable
 from ._upload_large_folder import upload_large_folder_internal
@@ -7409,6 +7409,8 @@ def create_inference_endpoint(
         account_id: Optional[str] = None,
         min_replica: int = 1,
         max_replica: int = 1,
+        scaling_metric: Optional[InferenceEndpointScalingMetric] = None,
+        scaling_threshold: Optional[float] = None,
         scale_to_zero_timeout: Optional[int] = None,
         revision: Optional[str] = None,
         task: Optional[str] = None,
@@ -7449,6 +7451,12 @@ def create_inference_endpoint(
                 scaling to zero, set this value to 0 and adjust `scale_to_zero_timeout` accordingly. Defaults to 1.
             max_replica (`int`, *optional*):
                 The maximum number of replicas (instances) to scale to for the Inference Endpoint. Defaults to 1.
+            scaling_metric (`str` or [`InferenceEndpointScalingMetric`], *optional*):
+                The metric reference for scaling. Either "pendingRequests" or "hardwareUsage" when provided.
+                Defaults to None (meaning: let the HF Endpoints service specify the metric).
+            scaling_threshold (`float`, *optional*):
+                The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.
+                Defaults to None (meaning: let the HF Endpoints service specify the threshold).
             scale_to_zero_timeout (`int`, *optional*):
                 The duration in minutes before an inactive endpoint is scaled to zero, or no scaling to zero if
                 set to None and `min_replica` is not 0. Defaults to None.
@@ -7600,6 +7608,8 @@ def create_inference_endpoint(
             },
             "type": type,
         }
+        if scaling_metric:
+            payload["compute"]["scaling"]["measure"] = {scaling_metric: scaling_threshold}
         if env:
             payload["model"]["env"] = env
         if secrets:
@@ -7764,6 +7774,8 @@ def update_inference_endpoint(
         min_replica: Optional[int] = None,
         max_replica: Optional[int] = None,
         scale_to_zero_timeout: Optional[int] = None,
+        scaling_metric: Optional[InferenceEndpointScalingMetric] = None,
+        scaling_threshold: Optional[float] = None,
         # Model update
         repository: Optional[str] = None,
         framework: Optional[str] = None,
@@ -7804,7 +7816,12 @@ def update_inference_endpoint(
                 The maximum number of replicas (instances) to scale to for the Inference Endpoint.
             scale_to_zero_timeout (`int`, *optional*):
                 The duration in minutes before an inactive endpoint is scaled to zero.
-
+            scaling_metric (`str` or [`InferenceEndpointScalingMetric`], *optional*):
+                The metric reference for scaling. Either "pendingRequests" or "hardwareUsage" when provided.
+                Defaults to None.
+            scaling_threshold (`float`, *optional*):
+                The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.
+                Defaults to None.
             repository (`str`, *optional*):
                 The name of the model repository associated with the Inference Endpoint (e.g. `"gpt2"`).
             framework (`str`, *optional*):
@@ -7858,6 +7875,8 @@ def update_inference_endpoint(
             payload["compute"]["scaling"]["minReplica"] = min_replica
         if scale_to_zero_timeout is not None:
             payload["compute"]["scaling"]["scaleToZeroTimeout"] = scale_to_zero_timeout
+        if scaling_metric:
+            payload["compute"]["scaling"]["measure"] = {scaling_metric: scaling_threshold}
         if repository is not None:
             payload["model"]["repository"] = repository
         if framework is not None:
```
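
Because `InferenceEndpointScalingMetric` is a `str` subclass, using a member as a dict key serializes to its literal value, so the guarded `measure` entry lands in the request body as plain JSON. A standalone sketch of the payload fragment (illustrative values):

```python
import json
from enum import Enum


class InferenceEndpointScalingMetric(str, Enum):
    PENDING_REQUESTS = "pendingRequests"
    HARDWARE_USAGE = "hardwareUsage"


payload = {"compute": {"scaling": {"minReplica": 0, "maxReplica": 2}}}
scaling_metric = InferenceEndpointScalingMetric.PENDING_REQUESTS
scaling_threshold = 1.5

if scaling_metric:  # same guard as in hf_api.py above
    payload["compute"]["scaling"]["measure"] = {scaling_metric: scaling_threshold}

print(json.dumps(payload))
# {"compute": {"scaling": {"minReplica": 0, "maxReplica": 2, "measure": {"pendingRequests": 1.5}}}}
```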

tests/test_cli.py (+7 −0)

```diff
@@ -1330,6 +1330,11 @@ def test_deploy_from_hub(self, runner: CliRunner) -> None:
             namespace=None,
             token=None,
             task=None,
+            min_replica=1,
+            max_replica=1,
+            scaling_metric=None,
+            scaling_threshold=None,
+            scale_to_zero_timeout=None,
         )
         assert '"name": "hub"' in result.stdout
 
@@ -1404,6 +1409,8 @@ def test_update(self, runner: CliRunner) -> None:
             max_replica=None,
             scale_to_zero_timeout=None,
             token=None,
+            scaling_metric=None,
+            scaling_threshold=None,
         )
         assert '"name": "updated"' in result.stdout
```
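
These assertions follow the usual forward-the-kwargs pattern: patch the `HfApi` method, drive the command through `CliRunner`, and assert on the recorded call. A self-contained sketch of that pattern with a toy command and hypothetical names (not the actual test fixtures):

```python
from typing import Annotated, Optional
from unittest.mock import MagicMock

import typer
from typer.testing import CliRunner

app = typer.Typer()
api = MagicMock()  # stand-in for the patched HfApi instance


@app.command()
def update(
    name: str,
    scaling_threshold: Annotated[Optional[float], typer.Option()] = None,
) -> None:
    # Forward the parsed CLI option to the (mocked) API client.
    api.update_inference_endpoint(name, scaling_threshold=scaling_threshold)


def test_update_forwards_scaling_threshold() -> None:
    result = CliRunner().invoke(app, ["dummy-endpoint", "--scaling-threshold", "1.5"])
    assert result.exit_code == 0
    api.update_inference_endpoint.assert_called_once_with(
        "dummy-endpoint", scaling_threshold=1.5
    )
```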
