Commit c05af1d

feat(api,ui,sdk): Make CPU limits configurable (#586)
# Description

At present, users are not able to configure the CPU limits of the pods in which Merlin models and transformers are deployed - these limits are instead determined automatically at the platform level (by the Merlin API server). Depending on how the API server has been configured, one of the following happens:

- the CPU limit of a model is set to its CPU request value multiplied by a [scaling factor](https://github.com/caraml-dev/merlin/blob/f1ebe099ea168988b365ee72ce08543b127826e1/api/config/config.go#L364) (e.g. 2 CPU * 1.5), **or**
  - note that this is the existing way memory limits are automatically set by the Merlin API server
- the CPU limit is left unset
  - note that because KServe does not currently allow CPU limits to be completely unset, the Merlin API server instead sets an [arbitrary value](https://github.com/caraml-dev/merlin/blob/f1ebe099ea168988b365ee72ce08543b127826e1/api/config/config.go#L363) (ideally one that is very large) as the CPU limit

This PR introduces a new workflow that allows users to override the platform-level CPU limits (described above) set on a model. This workflow is available via the UI, the SDK and, by extension, by calling the API server's endpoint directly.

UI:

![Screenshot 2024-05-24 at 2 13 46 PM](https://github.com/caraml-dev/merlin/assets/36802364/a2b59c1e-df2d-4070-92ff-b4f375256da1)
![Screenshot 2024-05-24 at 2 23 42 PM](https://github.com/caraml-dev/merlin/assets/36802364/616f0d03-0d36-4b82-8dd5-051d098a78c2)

SDK:

```python
merlin.deploy(
    version_1,
    resource_request=merlin.ResourceRequest(
        min_replica=0,
        max_replica=0,
        cpu_request="0.5",
        cpu_limit="2",
        memory_request="1Gi",
    ),
)
```

In addition, this PR adds a new configuration, `DefaultEnvVarsWithoutCPULimits`, a list of env vars that get added automatically to all Merlin models and transformers when CPU limits are not set. This allows operators of the Merlin API server to set env vars platform-wide that can potentially improve these deployments' performance, e.g. env vars involving concurrency. (A sketch illustrating the resulting CPU limit resolution order follows the release notes below.)

# Modifications

- `api/cluster/resource/templater.go` - Refactored the templater methods to set default env vars when CPU limits are not explicitly set and the CPU limit scaling factor is set to 0
- `api/config/config.go` - Added the new field `DefaultEnvVarsWithoutCPULimits`
- `api/config/config_test.go` - Added a new unit test for parsing configs from .yaml files
- `docs/user/templates/model_deployment/01_deploying_a_model_version.md` - Added docs demonstrating how the platform-level CPU limits can be overridden
- `python/sdk/merlin/resource_request.py` - Added a new CPU limit field to the resource request class
- `ui/src/pages/version/components/forms/components/CPULimitsFormGroup.js` - Added a new form group to allow CPU limits to be specified on the UI

# Tests

- [x] Deploying existing models (and transformers) with and without CPU limits set

# Checklist

- [x] Added PR label
- [x] Added unit test, integration, and/or e2e tests
- [x] Tested locally
- [x] Updated documentation
- [x] Update Swagger spec if the PR introduces API changes
- [x] Regenerated Golang and Python client if the PR introduces API changes

# Release Notes

<!--
Does this PR introduce a user-facing change?
If no, just write "NONE" in the release-note block below.
If yes, a release note is required. Enter your extended release note in the block below.
If the PR requires additional action from users switching to the new release, include the string "action required".
For more information about release notes, see kubernetes' guide here:
http://git.k8s.io/community/contributors/guide/release-notes.md
-->

```release-note
NONE
```
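For illustration, below is a minimal, self-contained Go sketch of the CPU limit resolution order described above: a user-specified limit always wins; otherwise the limit is the CPU request scaled by the platform factor; and only when no factor is configured does the platform default limit apply, together with the `DefaultEnvVarsWithoutCPULimits` env vars. The `cpuLimitConfig` struct, its field names, and the `WORKERS` env var are placeholders invented for this sketch and are not Merlin's actual config types or defaults; the real logic lives in `getResourceLimitsAndEnvVars` in `api/cluster/resource/templater.go` (see the diff further below).

```go
package main

import (
    "fmt"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
)

// cpuLimitConfig is an illustrative stand-in for the relevant platform-level settings;
// it is NOT Merlin's actual config type (see api/config/config.go in this PR).
type cpuLimitConfig struct {
    LimitRequestFactor      float64         // e.g. 1.5; 0 means "do not scale the CPU request"
    DefaultLimit            string          // fallback limit, needed while KServe requires a CPU limit
    EnvVarsWithoutCPULimits []corev1.EnvVar // mirrors the new DefaultEnvVarsWithoutCPULimits list
}

// resolveCPULimit sketches the precedence introduced by this PR:
// user-specified limit, then request * scaling factor, then the platform default limit.
func resolveCPULimit(
    cfg cpuLimitConfig,
    request resource.Quantity,
    userLimit *resource.Quantity,
) (resource.Quantity, []corev1.EnvVar, error) {
    if userLimit != nil && !userLimit.IsZero() {
        // The user's explicit CPU limit overrides any platform-level behaviour.
        return *userLimit, nil, nil
    }
    if cfg.LimitRequestFactor != 0 {
        // Scale the CPU request by the configured factor (simplified; Merlin uses ScaleQuantity).
        scaled := resource.NewMilliQuantity(
            int64(float64(request.MilliValue())*cfg.LimitRequestFactor), resource.DecimalSI,
        )
        return *scaled, nil, nil
    }
    // No user limit and no scaling factor: fall back to the (very large) default limit
    // and inject the platform-wide env vars meant for deployments without real CPU limits.
    limit, err := resource.ParseQuantity(cfg.DefaultLimit)
    if err != nil {
        return resource.Quantity{}, nil, err
    }
    return limit, cfg.EnvVarsWithoutCPULimits, nil
}

func main() {
    cfg := cpuLimitConfig{
        LimitRequestFactor: 0,     // no scaling configured ...
        DefaultLimit:       "100", // ... so an intentionally large default limit is used instead
        EnvVarsWithoutCPULimits: []corev1.EnvVar{
            {Name: "WORKERS", Value: "1"}, // hypothetical concurrency-related env var
        },
    }

    // A user-specified limit takes precedence over everything else.
    userLimit := resource.MustParse("2")
    limit, _, _ := resolveCPULimit(cfg, resource.MustParse("0.5"), &userLimit)
    fmt.Println(limit.String()) // prints "2"

    // Without a user limit (and without a scaling factor), the default limit and env vars apply.
    limit, envVars, _ := resolveCPULimit(cfg, resource.MustParse("0.5"), nil)
    fmt.Println(limit.String(), envVars)
}
```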
1 parent d948254 commit c05af1d

39 files changed (+1310 / -184 lines)
.github/workflows/codecov-config/codecov.yml

Lines changed: 5 additions & 0 deletions
```diff
@@ -0,0 +1,5 @@
+coverage:
+  status:
+    patch:
+      default:
+        threshold: 0.03%
```

.github/workflows/merlin.yml

Lines changed: 2 additions & 0 deletions
```diff
@@ -141,6 +141,7 @@ jobs:
           name: sdk-test-${{ matrix.python-version }}
           token: ${{ secrets.CODECOV_TOKEN }}
           working-directory: ./python/sdk
+          codecov_yml_path: ../../.github/workflows/codecov-config/codecov.yml
 
   lint-api:
     runs-on: ubuntu-latest
@@ -191,6 +192,7 @@ jobs:
           name: api-test
           token: ${{ secrets.CODECOV_TOKEN }}
           working-directory: ./api
+          codecov_yml_path: ../.github/workflows/codecov-config/codecov.yml
 
   test-observation-publisher:
     runs-on: ubuntu-latest
```

api/client/model_resource_request.go

Lines changed: 36 additions & 0 deletions
Some generated files are not rendered by default.

api/cluster/resource/templater.go

Lines changed: 58 additions & 40 deletions
```diff
@@ -192,22 +192,11 @@ func (t *InferenceServiceTemplater) CreateInferenceServiceSpec(modelService *mod
 }
 
 func (t *InferenceServiceTemplater) createPredictorSpec(modelService *models.Service) (kservev1beta1.PredictorSpec, error) {
-    envVars := modelService.EnvVars
-
-    // Set resource limits to request * userContainerCPULimitRequestFactor or UserContainerMemoryLimitRequestFactor
-    limits := map[corev1.ResourceName]resource.Quantity{}
-    if t.deploymentConfig.UserContainerCPULimitRequestFactor != 0 {
-        limits[corev1.ResourceCPU] = ScaleQuantity(
-            modelService.ResourceRequest.CPURequest, t.deploymentConfig.UserContainerCPULimitRequestFactor,
-        )
-    } else {
-        // TODO: Remove this else-block when KServe finally allows default CPU limits to be removed
-        var err error
-        limits[corev1.ResourceCPU], err = resource.ParseQuantity(t.deploymentConfig.UserContainerCPUDefaultLimit)
-        if err != nil {
-            return kservev1beta1.PredictorSpec{}, err
-        }
+    limits, envVars, err := t.getResourceLimitsAndEnvVars(modelService.ResourceRequest, modelService.EnvVars)
+    if err != nil {
+        return kservev1beta1.PredictorSpec{}, err
     }
+
     if t.deploymentConfig.UserContainerMemoryLimitRequestFactor != 0 {
         limits[corev1.ResourceMemory] = ScaleQuantity(
             modelService.ResourceRequest.MemoryRequest, t.deploymentConfig.UserContainerMemoryLimitRequestFactor,
@@ -329,7 +318,7 @@ func (t *InferenceServiceTemplater) createPredictorSpec(modelService *models.Ser
     // 1. PyFunc default env
     // 2. User environment variable
     // 3. Default env variable that can be override by user environment
-    higherPriorityEnvVars := models.MergeEnvVars(modelService.EnvVars, pyfuncDefaultEnv)
+    higherPriorityEnvVars := models.MergeEnvVars(envVars, pyfuncDefaultEnv)
     lowerPriorityEnvVars := models.EnvVars{}
     if modelService.Protocol == protocol.UpiV1 {
         lowerPriorityEnvVars = append(lowerPriorityEnvVars, models.EnvVar{Name: envGRPCOptions, Value: t.deploymentConfig.PyfuncGRPCOptions})
@@ -364,7 +353,7 @@ func (t *InferenceServiceTemplater) createPredictorSpec(modelService *models.Ser
     }
 
     case models.ModelTypeCustom:
-        predictorSpec = createCustomPredictorSpec(modelService, resources, nodeSelector, tolerations)
+        predictorSpec = createCustomPredictorSpec(modelService, envVars, resources, nodeSelector, tolerations)
     }
 
     if len(nodeSelector) > 0 {
@@ -392,28 +381,11 @@ func (t *InferenceServiceTemplater) createTransformerSpec(
     modelService *models.Service,
     transformer *models.Transformer,
 ) (*kservev1beta1.TransformerSpec, error) {
-    // Set resource limits to request * userContainerCPULimitRequestFactor or UserContainerMemoryLimitRequestFactor
-    limits := map[corev1.ResourceName]resource.Quantity{}
-    if t.deploymentConfig.UserContainerCPULimitRequestFactor != 0 {
-        limits[corev1.ResourceCPU] = ScaleQuantity(
-            transformer.ResourceRequest.CPURequest, t.deploymentConfig.UserContainerCPULimitRequestFactor,
-        )
-    } else {
-        // TODO: Remove this else-block when KServe finally allows default CPU limits to be removed
-        var err error
-        limits[corev1.ResourceCPU], err = resource.ParseQuantity(t.deploymentConfig.UserContainerCPUDefaultLimit)
-        if err != nil {
-            return nil, err
-        }
-    }
-    if t.deploymentConfig.UserContainerMemoryLimitRequestFactor != 0 {
-        limits[corev1.ResourceMemory] = ScaleQuantity(
-            transformer.ResourceRequest.MemoryRequest, t.deploymentConfig.UserContainerMemoryLimitRequestFactor,
-        )
+    limits, envVars, err := t.getResourceLimitsAndEnvVars(transformer.ResourceRequest, transformer.EnvVars)
+    if err != nil {
+        return nil, err
     }
 
-    envVars := transformer.EnvVars
-
     // Put in defaults if not provided by users (user's input is used)
     if transformer.TransformerType == models.StandardTransformerType {
         transformer.Image = t.deploymentConfig.StandardTransformer.ImageName
@@ -780,9 +752,13 @@ func createDefaultPredictorEnvVars(modelService *models.Service) models.EnvVars
     return defaultEnvVars
 }
 
-func createCustomPredictorSpec(modelService *models.Service, resources corev1.ResourceRequirements, nodeSelector map[string]string, tolerations []corev1.Toleration) kservev1beta1.PredictorSpec {
-    envVars := modelService.EnvVars
-
+func createCustomPredictorSpec(
+    modelService *models.Service,
+    envVars models.EnvVars,
+    resources corev1.ResourceRequirements,
+    nodeSelector map[string]string,
+    tolerations []corev1.Toleration,
+) kservev1beta1.PredictorSpec {
     // Add default env var (Overwrite by user not allowed)
     defaultEnvVar := createDefaultPredictorEnvVars(modelService)
     envVars = models.MergeEnvVars(envVars, defaultEnvVar)
@@ -910,3 +886,45 @@ func (t *InferenceServiceTemplater) applyDefaults(service *models.Service) {
         }
     }
 }
+
+func (t *InferenceServiceTemplater) getResourceLimitsAndEnvVars(
+    resourceRequest *models.ResourceRequest,
+    envVars models.EnvVars,
+) (map[corev1.ResourceName]resource.Quantity, models.EnvVars, error) {
+    // Set resource limits to request * userContainerCPULimitRequestFactor or UserContainerMemoryLimitRequestFactor
+    limits := map[corev1.ResourceName]resource.Quantity{}
+    // Set cpu resource limits automatically if they have not been set
+    if resourceRequest.CPULimit == nil || resourceRequest.CPULimit.IsZero() {
+        if t.deploymentConfig.UserContainerCPULimitRequestFactor != 0 {
+            limits[corev1.ResourceCPU] = ScaleQuantity(
+                resourceRequest.CPURequest, t.deploymentConfig.UserContainerCPULimitRequestFactor,
+            )
+        } else {
+            // TODO: Remove this else-block when KServe finally allows default CPU limits to be removed
+            var err error
+            limits[corev1.ResourceCPU], err = resource.ParseQuantity(t.deploymentConfig.UserContainerCPUDefaultLimit)
+            if err != nil {
+                return nil, nil, err
+            }
+            // Set additional env vars to manage concurrency so model performance improves when no CPU limits are set
+            envVars = models.MergeEnvVars(ParseEnvVars(t.deploymentConfig.DefaultEnvVarsWithoutCPULimits), envVars)
+        }
+    } else {
+        limits[corev1.ResourceCPU] = *resourceRequest.CPULimit
+    }
+
+    if t.deploymentConfig.UserContainerMemoryLimitRequestFactor != 0 {
+        limits[corev1.ResourceMemory] = ScaleQuantity(
+            resourceRequest.MemoryRequest, t.deploymentConfig.UserContainerMemoryLimitRequestFactor,
+        )
+    }
+    return limits, envVars, nil
+}
+
+func ParseEnvVars(envVars []corev1.EnvVar) models.EnvVars {
+    var parsedEnvVars models.EnvVars
+    for _, envVar := range envVars {
+        parsedEnvVars = append(parsedEnvVars, models.EnvVar{Name: envVar.Name, Value: envVar.Value})
+    }
+    return parsedEnvVars
+}
```

api/cluster/resource/templater_gpu_test.go

Lines changed: 1 addition & 1 deletion
```diff
@@ -58,7 +58,7 @@ var (
             "nvidia.com/gpu": resource.MustParse("1"),
         },
         Limits: corev1.ResourceList{
-            corev1.ResourceCPU:    resource.MustParse("8"),
+            corev1.ResourceCPU:    resource.MustParse("10"),
             corev1.ResourceMemory: ScaleQuantity(defaultModelResourceRequests.MemoryRequest, 2),
             "nvidia.com/gpu":      resource.MustParse("1"),
         },
```
