Skip to content

Commit 153d581

Browse files
Levi080513 and Levi080513 authored
fix: set default ulimit 65536 for ray container (#160)
Co-authored-by: Levi080513 <[email protected]>
1 parent 50ef05a commit 153d581

File tree

3 files changed

+60
-152
lines changed

3 files changed

+60
-152
lines changed

internal/cluster/kubernetes_cluster_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@ import (
55
"testing"
66

77
v1 "github.com/neutree-ai/neutree/api/v1"
8-
"github.com/neutree-ai/neutree/internal/util"
98
"github.com/stretchr/testify/require"
109
"k8s.io/apimachinery/pkg/api/resource"
1110
"k8s.io/client-go/kubernetes/scheme"
@@ -17,6 +16,7 @@ import (
1716

1817
acceleratormocks "github.com/neutree-ai/neutree/internal/accelerator/mocks"
1918
plugin "github.com/neutree-ai/neutree/internal/accelerator/plugin"
19+
"github.com/neutree-ai/neutree/internal/util"
2020
)
2121

2222
func newNode(name string, schedulable bool, resources map[corev1.ResourceName]resource.Quantity, labels map[string]string) *corev1.Node {

internal/cluster/ray_ssh_operation.go

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -295,21 +295,23 @@ func (c *sshRayClusterReconciler) generateRayClusterConfig(reconcileContext *Rec
295295
"--cap-add=SYS_ADMIN",
296296
"--security-opt=seccomp=unconfined",
297297
"-e RAY_kill_child_processes_on_worker_exit_with_raylet_subreaper=true",
298+
// Increase nofile ulimit to avoid "Too many open files" error in Ray workers
299+
"--ulimit nofile=65536:65536",
298300
}
299301

300302
rayClusterConfig.HeadStartRayCommands = []string{
301303
"ray stop",
302-
fmt.Sprintf(`python /home/ray/start.py --head --port=6379 --metrics-export-port=%d --disable-usage-stats --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 --labels='{"%s":"%s"}'`, //nolint:lll
304+
fmt.Sprintf(`ulimit -n 65536; python /home/ray/start.py --head --port=6379 --metrics-export-port=%d --disable-usage-stats --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 --labels='{"%s":"%s"}'`, //nolint:lll
303305
v1.RayletMetricsPort, v1.NeutreeServingVersionLabel, cluster.Spec.Version),
304306
}
305307
rayClusterConfig.WorkerStartRayCommands = []string{
306308
"ray stop",
307-
fmt.Sprintf(`python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=%d --disable-usage-stats --labels='{"%s":"%s","%s":"%s"}'`,
309+
fmt.Sprintf(`ulimit -n 65536; python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=%d --disable-usage-stats --labels='{"%s":"%s","%s":"%s"}'`,
308310
v1.RayletMetricsPort, v1.NeutreeNodeProvisionTypeLabel, v1.AutoScaleNodeProvisionType, v1.NeutreeServingVersionLabel, cluster.Spec.Version),
309311
}
310312
rayClusterConfig.StaticWorkerStartRayCommands = []string{
311313
"ray stop",
312-
fmt.Sprintf(`python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=%d --disable-usage-stats --labels='{"%s":"%s","%s":"%s"}'`,
314+
fmt.Sprintf(`ulimit -n 65536; python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=%d --disable-usage-stats --labels='{"%s":"%s","%s":"%s"}'`,
313315
v1.RayletMetricsPort, v1.NeutreeNodeProvisionTypeLabel, v1.StaticNodeProvisionType, v1.NeutreeServingVersionLabel, cluster.Spec.Version),
314316
}
315317

internal/cluster/ray_ssh_operation_test.go

Lines changed: 54 additions & 148 deletions
Original file line number | Diff line number | Diff line change
@@ -885,12 +885,50 @@ func TestDownCluster(t *testing.T) {
885885
}
886886

887887
func TestGenerateRayClusterConfig(t *testing.T) {
888+
defaultExpectedConfig := func() *v1.RayClusterConfig {
889+
return &v1.RayClusterConfig{
890+
ClusterName: "test-cluster",
891+
Provider: v1.Provider{
892+
Type: "local",
893+
},
894+
Auth: v1.Auth{
895+
SSHUser: "root",
896+
},
897+
Docker: v1.Docker{
898+
ContainerName: "ray_container",
899+
PullBeforeRun: true,
900+
Image: "registry.example.com/neutree/neutree-serve:v1.0.0",
901+
RunOptions: []string{
902+
"--privileged",
903+
"--cap-add=SYS_ADMIN",
904+
"--security-opt=seccomp=unconfined",
905+
"-e RAY_kill_child_processes_on_worker_exit_with_raylet_subreaper=true",
906+
"--ulimit nofile=65536:65536",
907+
},
908+
},
909+
HeadStartRayCommands: []string{
910+
"ray stop",
911+
`ulimit -n 65536; python /home/ray/start.py --head --port=6379 --metrics-export-port=54311 --disable-usage-stats --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 --labels='{"neutree.ai/neutree-serving-version":"v1.0.0"}'`,
912+
},
913+
WorkerStartRayCommands: []string{
914+
"ray stop",
915+
`ulimit -n 65536; python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=54311 --disable-usage-stats --labels='{"neutree.ai/node-provision-type":"autoscaler","neutree.ai/neutree-serving-version":"v1.0.0"}'`,
916+
},
917+
StaticWorkerStartRayCommands: []string{
918+
"ray stop",
919+
`ulimit -n 65536; python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=54311 --disable-usage-stats --labels='{"neutree.ai/node-provision-type":"static","neutree.ai/neutree-serving-version":"v1.0.0"}'`,
920+
},
921+
InitializationCommands: []string{
922+
"docker login registry.example.com -u 'user' -p 'pass'",
923+
},
924+
}
925+
}
888926
tests := []struct {
889927
name string
890928
cluster *v1.Cluster
891929
imageRegistry *v1.ImageRegistry
892930
inputConfig *v1.RayClusterConfig
893-
expectedConfig *v1.RayClusterConfig
931+
expectedConfig func() *v1.RayClusterConfig
894932
expectError bool
895933
}{
896934
{
@@ -920,86 +958,8 @@ func TestGenerateRayClusterConfig(t *testing.T) {
920958
inputConfig: &v1.RayClusterConfig{
921959
ClusterName: "test-cluster",
922960
},
923-
expectedConfig: &v1.RayClusterConfig{
924-
ClusterName: "test-cluster",
925-
Provider: v1.Provider{
926-
Type: "local",
927-
},
928-
Docker: v1.Docker{
929-
ContainerName: "ray_container",
930-
PullBeforeRun: true,
931-
Image: "registry.example.com/neutree/neutree-serve:v1.0.0",
932-
},
933-
HeadStartRayCommands: []string{
934-
"ray stop",
935-
`python /home/ray/start.py --head --port=6379 --metrics-export-port=54311 --disable-usage-stats --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 --labels='{"neutree.ai/neutree-serving-version":"v1.0.0"}'`,
936-
},
937-
WorkerStartRayCommands: []string{
938-
"ray stop",
939-
`python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=54311 --disable-usage-stats --labels='{"neutree.ai/node-provision-type":"autoscaler","neutree.ai/neutree-serving-version":"v1.0.0"}'`,
940-
},
941-
StaticWorkerStartRayCommands: []string{
942-
"ray stop",
943-
`python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=54311 --disable-usage-stats --labels='{"neutree.ai/node-provision-type":"static","neutree.ai/neutree-serving-version":"v1.0.0"}'`,
944-
},
945-
InitializationCommands: []string{
946-
"docker login registry.example.com -u 'user' -p 'pass'",
947-
},
948-
},
949-
expectError: false,
950-
},
951-
{
952-
name: "success - always use neutree cluster name",
953-
cluster: &v1.Cluster{
954-
Metadata: &v1.Metadata{Name: "test-cluster"},
955-
Spec: &v1.ClusterSpec{
956-
Version: "v1.0.0",
957-
Config: map[string]interface{}{
958-
"auth": map[string]interface{}{
959-
"ssh_user": "root",
960-
},
961-
},
962-
},
963-
},
964-
imageRegistry: &v1.ImageRegistry{
965-
Spec: &v1.ImageRegistrySpec{
966-
URL: "http://registry.example.com",
967-
Repository: "neutree",
968-
AuthConfig: v1.ImageRegistryAuthConfig{
969-
Username: "user",
970-
Password: "pass",
971-
},
972-
Ca: "Y2EK",
973-
},
974-
},
975-
inputConfig: &v1.RayClusterConfig{
976-
ClusterName: "test-cluster-1",
977-
},
978-
expectedConfig: &v1.RayClusterConfig{
979-
ClusterName: "test-cluster",
980-
Provider: v1.Provider{
981-
Type: "local",
982-
},
983-
Docker: v1.Docker{
984-
ContainerName: "ray_container",
985-
PullBeforeRun: true,
986-
Image: "registry.example.com/neutree/neutree-serve:v1.0.0",
987-
},
988-
HeadStartRayCommands: []string{
989-
"ray stop",
990-
`python /home/ray/start.py --head --port=6379 --metrics-export-port=54311 --disable-usage-stats --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 --labels='{"neutree.ai/neutree-serving-version":"v1.0.0"}'`,
991-
},
992-
WorkerStartRayCommands: []string{
993-
"ray stop",
994-
`python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=54311 --disable-usage-stats --labels='{"neutree.ai/node-provision-type":"autoscaler","neutree.ai/neutree-serving-version":"v1.0.0"}'`,
995-
},
996-
StaticWorkerStartRayCommands: []string{
997-
"ray stop",
998-
`python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=54311 --disable-usage-stats --labels='{"neutree.ai/node-provision-type":"static","neutree.ai/neutree-serving-version":"v1.0.0"}'`,
999-
},
1000-
InitializationCommands: []string{
1001-
"docker login registry.example.com -u 'user' -p 'pass'",
1002-
},
961+
expectedConfig: func() *v1.RayClusterConfig {
962+
return defaultExpectedConfig()
1003963
},
1004964
expectError: false,
1005965
},
@@ -1030,36 +990,13 @@ func TestGenerateRayClusterConfig(t *testing.T) {
1030990
inputConfig: &v1.RayClusterConfig{
1031991
ClusterName: "test-cluster-1",
1032992
},
1033-
expectedConfig: &v1.RayClusterConfig{
1034-
ClusterName: "test-cluster",
1035-
Provider: v1.Provider{
1036-
Type: "local",
1037-
},
1038-
Docker: v1.Docker{
1039-
ContainerName: "ray_container",
1040-
PullBeforeRun: true,
1041-
Image: "registry.example.com/neutree/neutree-serve:v1.0.0",
1042-
},
1043-
HeadStartRayCommands: []string{
1044-
"ray stop",
1045-
`python /home/ray/start.py --head --port=6379 --metrics-export-port=54311 --disable-usage-stats --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 --labels='{"neutree.ai/neutree-serving-version":"v1.0.0"}'`,
1046-
},
1047-
WorkerStartRayCommands: []string{
1048-
"ray stop",
1049-
`python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=54311 --disable-usage-stats --labels='{"neutree.ai/node-provision-type":"autoscaler","neutree.ai/neutree-serving-version":"v1.0.0"}'`,
1050-
},
1051-
StaticWorkerStartRayCommands: []string{
1052-
"ray stop",
1053-
`python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=54311 --disable-usage-stats --labels='{"neutree.ai/node-provision-type":"static","neutree.ai/neutree-serving-version":"v1.0.0"}'`,
1054-
},
1055-
InitializationCommands: []string{
1056-
"docker login registry.example.com -u 'user' -p 'pass'",
1057-
},
993+
expectedConfig: func() *v1.RayClusterConfig {
994+
return defaultExpectedConfig()
1058995
},
1059996
expectError: false,
1060997
},
1061998
{
1062-
name: "success - registry without CA",
999+
name: "success - without registry auth",
10631000
cluster: &v1.Cluster{
10641001
Metadata: &v1.Metadata{Name: "test-cluster"},
10651002
Spec: &v1.ClusterSpec{
@@ -1075,40 +1012,13 @@ func TestGenerateRayClusterConfig(t *testing.T) {
10751012
Spec: &v1.ImageRegistrySpec{
10761013
URL: "http://registry.example.com",
10771014
Repository: "neutree",
1078-
AuthConfig: v1.ImageRegistryAuthConfig{
1079-
Username: "user",
1080-
Password: "pass",
1081-
},
10821015
},
10831016
},
1084-
inputConfig: &v1.RayClusterConfig{
1085-
ClusterName: "test-cluster-1",
1086-
},
1087-
expectedConfig: &v1.RayClusterConfig{
1088-
ClusterName: "test-cluster",
1089-
Provider: v1.Provider{
1090-
Type: "local",
1091-
},
1092-
Docker: v1.Docker{
1093-
ContainerName: "ray_container",
1094-
PullBeforeRun: true,
1095-
Image: "registry.example.com/neutree/neutree-serve:v1.0.0",
1096-
},
1097-
HeadStartRayCommands: []string{
1098-
"ray stop",
1099-
`python /home/ray/start.py --head --port=6379 --metrics-export-port=54311 --disable-usage-stats --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 --labels='{"neutree.ai/neutree-serving-version":"v1.0.0"}'`,
1100-
},
1101-
WorkerStartRayCommands: []string{
1102-
"ray stop",
1103-
`python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=54311 --disable-usage-stats --labels='{"neutree.ai/node-provision-type":"autoscaler","neutree.ai/neutree-serving-version":"v1.0.0"}'`,
1104-
},
1105-
StaticWorkerStartRayCommands: []string{
1106-
"ray stop",
1107-
`python /home/ray/start.py --address=$RAY_HEAD_IP:6379 --metrics-export-port=54311 --disable-usage-stats --labels='{"neutree.ai/node-provision-type":"static","neutree.ai/neutree-serving-version":"v1.0.0"}'`,
1108-
},
1109-
InitializationCommands: []string{
1110-
"docker login registry.example.com -u 'user' -p 'pass'",
1111-
},
1017+
inputConfig: &v1.RayClusterConfig{},
1018+
expectedConfig: func() *v1.RayClusterConfig {
1019+
config := defaultExpectedConfig()
1020+
config.InitializationCommands = []string{}
1021+
return config
11121022
},
11131023
expectError: false,
11141024
},
@@ -1154,15 +1064,11 @@ func TestGenerateRayClusterConfig(t *testing.T) {
11541064
assert.Error(t, err)
11551065
} else {
11561066
assert.NoError(t, err)
1157-
assert.Equal(t, tt.expectedConfig.ClusterName, config.ClusterName)
1158-
assert.Equal(t, tt.expectedConfig.Provider.Type, config.Provider.Type)
1159-
assert.Equal(t, tt.expectedConfig.Docker.ContainerName, config.Docker.ContainerName)
1160-
assert.Equal(t, tt.expectedConfig.Docker.PullBeforeRun, config.Docker.PullBeforeRun)
1161-
assert.Equal(t, tt.expectedConfig.Docker.Image, config.Docker.Image)
1162-
assert.Equal(t, tt.expectedConfig.HeadStartRayCommands, config.HeadStartRayCommands)
1163-
assert.Equal(t, tt.expectedConfig.WorkerStartRayCommands, config.WorkerStartRayCommands)
1164-
assert.Equal(t, tt.expectedConfig.StaticWorkerStartRayCommands, config.StaticWorkerStartRayCommands)
1165-
assert.Equal(t, tt.expectedConfig.InitializationCommands, config.InitializationCommands)
1067+
expectedConfig := tt.expectedConfig()
1068+
eq := assert.ObjectsAreEqual(config, expectedConfig)
1069+
if !eq {
1070+
t.Errorf("Generated config does not match expected config.\nGot: %+v\nExpected: %+v", config, expectedConfig)
1071+
}
11661072
}
11671073
})
11681074
}

0 commit comments

Comments
 (0)