Commit dd8d310
feat(endpoints): scaling metric and threshold (#3525)

* feat(endpoints): scaling metric and threshold

  Add the possibility to customize both the scaling metric and threshold
  when creating or updating an endpoint.

  Signed-off-by: Raphael Glon <[email protected]>

* Review

  Co-authored-by: célina <[email protected]>

---------

Signed-off-by: Raphael Glon <[email protected]>
Co-authored-by: Raphael Glon <[email protected]>
Co-authored-by: célina <[email protected]>
1 parent 743fd8c commit dd8d310
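
At the Python API level, the change surfaces as two new keyword arguments on `HfApi.create_inference_endpoint` and `HfApi.update_inference_endpoint` (diffs below). A minimal usage sketch; the endpoint name and hardware/region values are placeholders, not part of this commit:

```python
from huggingface_hub import HfApi
from huggingface_hub._inference_endpoints import InferenceEndpointScalingMetric

api = HfApi()

# Placeholder endpoint configuration; only the last two kwargs are new in this commit.
endpoint = api.create_inference_endpoint(
    "my-endpoint",
    repository="gpt2",
    framework="pytorch",
    accelerator="cpu",
    instance_size="x2",
    instance_type="intel-icl",
    region="us-east-1",
    vendor="aws",
    min_replica=0,
    max_replica=2,
    scale_to_zero_timeout=15,
    scaling_metric=InferenceEndpointScalingMetric.PENDING_REQUESTS,  # or "pendingRequests"
    scaling_threshold=1.5,
)
```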

5 files changed: +90 −3 lines

docs/source/en/package_reference/cli.md (+7 −0)

```diff
@@ -401,6 +401,11 @@ $ hf endpoints deploy [OPTIONS] NAME
 * `--namespace TEXT`: The namespace associated with the Inference Endpoint. Defaults to the current user's namespace.
 * `--task TEXT`: The task on which to deploy the model (e.g. 'text-classification').
 * `--token TEXT`: A User Access Token generated from https://huggingface.co/settings/tokens.
+* `--min-replica INTEGER`: The minimum number of replicas (instances) to keep running for the Inference Endpoint. [default: 1]
+* `--max-replica INTEGER`: The maximum number of replicas (instances) to scale to for the Inference Endpoint. [default: 1]
+* `--scale-to-zero-timeout INTEGER`: The duration in minutes before an inactive endpoint is scaled to zero.
+* `--scaling-metric [pendingRequests|hardwareUsage]`: The metric reference for scaling.
+* `--scaling-threshold FLOAT`: The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.
 * `--help`: Show this message and exit.
 
 ### `hf endpoints describe`
@@ -542,6 +547,8 @@ $ hf endpoints update [OPTIONS] NAME
 * `--min-replica INTEGER`: The minimum number of replicas (instances) to keep running for the Inference Endpoint.
 * `--max-replica INTEGER`: The maximum number of replicas (instances) to scale to for the Inference Endpoint.
 * `--scale-to-zero-timeout INTEGER`: The duration in minutes before an inactive endpoint is scaled to zero.
+* `--scaling-metric [pendingRequests|hardwareUsage]`: The metric reference for scaling.
+* `--scaling-threshold FLOAT`: The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.
 * `--token TEXT`: A User Access Token generated from https://huggingface.co/settings/tokens.
 * `--help`: Show this message and exit.
```
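
The same options can be exercised from a script; a sketch driving the CLI through `subprocess` (the endpoint name is a placeholder, and any other required deploy options, e.g. the model repo, are elided here):

```python
import subprocess

# Flag names are taken from the docs diff above; values are illustrative.
subprocess.run(
    [
        "hf", "endpoints", "deploy", "my-endpoint",
        "--min-replica", "0",
        "--max-replica", "2",
        "--scale-to-zero-timeout", "15",
        "--scaling-metric", "pendingRequests",
        "--scaling-threshold", "1.5",
    ],
    check=True,
)
```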

src/huggingface_hub/_inference_endpoints.py (+5 −0)

```diff
@@ -34,6 +34,11 @@ class InferenceEndpointType(str, Enum):
     PRIVATE = "private"
 
 
+class InferenceEndpointScalingMetric(str, Enum):
+    PENDING_REQUESTS = "pendingRequests"
+    HARDWARE_USAGE = "hardwareUsage"
+
+
 @dataclass
 class InferenceEndpoint:
     """
```

src/huggingface_hub/cli/inference_endpoints.py (+50 −1)

```diff
@@ -5,7 +5,7 @@
 
 import typer
 
-from huggingface_hub._inference_endpoints import InferenceEndpoint
+from huggingface_hub._inference_endpoints import InferenceEndpoint, InferenceEndpointScalingMetric
 from huggingface_hub.errors import HfHubHTTPError
 
 from ._cli_utils import TokenOpt, get_hf_api, typer_factory
@@ -112,6 +112,36 @@ def deploy(
         ),
     ] = None,
     token: TokenOpt = None,
+    min_replica: Annotated[
+        int,
+        typer.Option(
+            help="The minimum number of replicas (instances) to keep running for the Inference Endpoint.",
+        ),
+    ] = 1,
+    max_replica: Annotated[
+        int,
+        typer.Option(
+            help="The maximum number of replicas (instances) to scale to for the Inference Endpoint.",
+        ),
+    ] = 1,
+    scale_to_zero_timeout: Annotated[
+        Optional[int],
+        typer.Option(
+            help="The duration in minutes before an inactive endpoint is scaled to zero.",
+        ),
+    ] = None,
+    scaling_metric: Annotated[
+        Optional[InferenceEndpointScalingMetric],
+        typer.Option(
+            help="The metric reference for scaling.",
+        ),
+    ] = None,
+    scaling_threshold: Annotated[
+        Optional[float],
+        typer.Option(
+            help="The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.",
+        ),
+    ] = None,
 ) -> None:
     """Deploy an Inference Endpoint from a Hub repository."""
     api = get_hf_api(token=token)
@@ -127,6 +157,11 @@ def deploy(
         namespace=namespace,
         task=task,
         token=token,
+        min_replica=min_replica,
+        max_replica=max_replica,
+        scaling_metric=scaling_metric,
+        scaling_threshold=scaling_threshold,
+        scale_to_zero_timeout=scale_to_zero_timeout,
     )
 
     _print_endpoint(endpoint)
@@ -262,6 +297,18 @@ def update(
             help="The duration in minutes before an inactive endpoint is scaled to zero.",
         ),
     ] = None,
+    scaling_metric: Annotated[
+        Optional[InferenceEndpointScalingMetric],
+        typer.Option(
+            help="The metric reference for scaling.",
+        ),
+    ] = None,
+    scaling_threshold: Annotated[
+        Optional[float],
+        typer.Option(
+            help="The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.",
+        ),
+    ] = None,
     token: TokenOpt = None,
 ) -> None:
     """Update an existing endpoint."""
@@ -280,6 +327,8 @@ def update(
             min_replica=min_replica,
             max_replica=max_replica,
             scale_to_zero_timeout=scale_to_zero_timeout,
+            scaling_metric=scaling_metric,
+            scaling_threshold=scaling_threshold,
             token=token,
         )
     except HfHubHTTPError as error:
```
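
Typer derives the `[pendingRequests|hardwareUsage]` choice syntax seen in the CLI docs directly from the enum annotation and validates input against the enum values. A self-contained sketch of the pattern with a toy app (not the actual huggingface_hub CLI wiring):

```python
from enum import Enum
from typing import Annotated, Optional

import typer

app = typer.Typer()


class ScalingMetric(str, Enum):  # mirrors InferenceEndpointScalingMetric
    PENDING_REQUESTS = "pendingRequests"
    HARDWARE_USAGE = "hardwareUsage"


@app.command()
def update(
    scaling_metric: Annotated[
        Optional[ScalingMetric],
        typer.Option(help="The metric reference for scaling."),
    ] = None,
) -> None:
    # Typer passes a validated enum member (or None if the flag is omitted).
    typer.echo(f"scaling_metric={scaling_metric}")


if __name__ == "__main__":
    app()  # e.g. `python app.py --scaling-metric pendingRequests`
```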

src/huggingface_hub/hf_api.py (+21 −2)

```diff
@@ -59,7 +59,7 @@
     _upload_files,
     _warn_on_overwriting_operations,
 )
-from ._inference_endpoints import InferenceEndpoint, InferenceEndpointType
+from ._inference_endpoints import InferenceEndpoint, InferenceEndpointScalingMetric, InferenceEndpointType
 from ._jobs_api import JobInfo, JobSpec, ScheduledJobInfo, _create_job_spec
 from ._space_api import SpaceHardware, SpaceRuntime, SpaceStorage, SpaceVariable
 from ._upload_large_folder import upload_large_folder_internal
@@ -7409,6 +7409,8 @@ def create_inference_endpoint(
         account_id: Optional[str] = None,
         min_replica: int = 1,
         max_replica: int = 1,
+        scaling_metric: Optional[InferenceEndpointScalingMetric] = None,
+        scaling_threshold: Optional[float] = None,
         scale_to_zero_timeout: Optional[int] = None,
         revision: Optional[str] = None,
         task: Optional[str] = None,
@@ -7449,6 +7451,12 @@ def create_inference_endpoint(
                 scaling to zero, set this value to 0 and adjust `scale_to_zero_timeout` accordingly. Defaults to 1.
             max_replica (`int`, *optional*):
                 The maximum number of replicas (instances) to scale to for the Inference Endpoint. Defaults to 1.
+            scaling_metric (`str` or [`InferenceEndpointScalingMetric`], *optional*):
+                The metric reference for scaling. Either "pendingRequests" or "hardwareUsage" when provided.
+                Defaults to None (meaning: let the HF Endpoints service specify the metric).
+            scaling_threshold (`float`, *optional*):
+                The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.
+                Defaults to None (meaning: let the HF Endpoints service specify the threshold).
             scale_to_zero_timeout (`int`, *optional*):
                 The duration in minutes before an inactive endpoint is scaled to zero, or no scaling to zero if
                 set to None and `min_replica` is not 0. Defaults to None.
@@ -7600,6 +7608,8 @@ def create_inference_endpoint(
             },
             "type": type,
         }
+        if scaling_metric:
+            payload["compute"]["scaling"]["measure"] = {scaling_metric: scaling_threshold}
         if env:
             payload["model"]["env"] = env
         if secrets:
@@ -7764,6 +7774,8 @@ def update_inference_endpoint(
         min_replica: Optional[int] = None,
         max_replica: Optional[int] = None,
         scale_to_zero_timeout: Optional[int] = None,
+        scaling_metric: Optional[InferenceEndpointScalingMetric] = None,
+        scaling_threshold: Optional[float] = None,
         # Model update
         repository: Optional[str] = None,
         framework: Optional[str] = None,
@@ -7804,7 +7816,12 @@ def update_inference_endpoint(
                 The maximum number of replicas (instances) to scale to for the Inference Endpoint.
             scale_to_zero_timeout (`int`, *optional*):
                 The duration in minutes before an inactive endpoint is scaled to zero.
-
+            scaling_metric (`str` or [`InferenceEndpointScalingMetric`], *optional*):
+                The metric reference for scaling. Either "pendingRequests" or "hardwareUsage" when provided.
+                Defaults to None.
+            scaling_threshold (`float`, *optional*):
+                The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.
+                Defaults to None.
             repository (`str`, *optional*):
                 The name of the model repository associated with the Inference Endpoint (e.g. `"gpt2"`).
             framework (`str`, *optional*):
@@ -7858,6 +7875,8 @@ def update_inference_endpoint(
             payload["compute"]["scaling"]["minReplica"] = min_replica
         if scale_to_zero_timeout is not None:
             payload["compute"]["scaling"]["scaleToZeroTimeout"] = scale_to_zero_timeout
+        if scaling_metric:
+            payload["compute"]["scaling"]["measure"] = {scaling_metric: scaling_threshold}
         if repository is not None:
             payload["model"]["repository"] = repository
         if framework is not None:
```
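
Because `InferenceEndpointScalingMetric` is a `str` subclass, using a member as a dict key serializes to its literal value, so the guarded `measure` entry lands in the request body as plain JSON. A standalone sketch of the payload fragment (illustrative values):

```python
import json
from enum import Enum


class InferenceEndpointScalingMetric(str, Enum):
    PENDING_REQUESTS = "pendingRequests"
    HARDWARE_USAGE = "hardwareUsage"


payload = {"compute": {"scaling": {"minReplica": 0, "maxReplica": 2}}}
scaling_metric = InferenceEndpointScalingMetric.PENDING_REQUESTS
scaling_threshold = 1.5

if scaling_metric:  # same guard as in hf_api.py above
    payload["compute"]["scaling"]["measure"] = {scaling_metric: scaling_threshold}

print(json.dumps(payload))
# {"compute": {"scaling": {"minReplica": 0, "maxReplica": 2, "measure": {"pendingRequests": 1.5}}}}
```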

tests/test_cli.py (+7 −0)

```diff
@@ -1330,6 +1330,11 @@ def test_deploy_from_hub(self, runner: CliRunner) -> None:
             namespace=None,
             token=None,
             task=None,
+            min_replica=1,
+            max_replica=1,
+            scaling_metric=None,
+            scaling_threshold=None,
+            scale_to_zero_timeout=None,
         )
         assert '"name": "hub"' in result.stdout
 
@@ -1404,6 +1409,8 @@ def test_update(self, runner: CliRunner) -> None:
             max_replica=None,
             scale_to_zero_timeout=None,
             token=None,
+            scaling_metric=None,
+            scaling_threshold=None,
         )
         assert '"name": "updated"' in result.stdout
```
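
These assertions follow the usual forward-the-kwargs pattern: patch the `HfApi` method, drive the command through `CliRunner`, and assert on the recorded call. A self-contained sketch of that pattern with a toy command and hypothetical names (not the actual test fixtures):

```python
from typing import Annotated, Optional
from unittest.mock import MagicMock

import typer
from typer.testing import CliRunner

app = typer.Typer()
api = MagicMock()  # stand-in for the patched HfApi instance


@app.command()
def update(
    name: str,
    scaling_threshold: Annotated[Optional[float], typer.Option()] = None,
) -> None:
    # Forward the parsed CLI option to the (mocked) API client.
    api.update_inference_endpoint(name, scaling_threshold=scaling_threshold)


def test_update_forwards_scaling_threshold() -> None:
    result = CliRunner().invoke(app, ["dummy-endpoint", "--scaling-threshold", "1.5"])
    assert result.exit_code == 0
    api.update_inference_endpoint.assert_called_once_with(
        "dummy-endpoint", scaling_threshold=1.5
    )
```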
