# Patch summary (text before the first "diff --git" header is skipped by patch tools).
#
# Replaces the single hand-written Buildkite pipeline with a config-plus-jobs layout:
#   * adds   .buildkite/.pipeline_gen_v2   (new file, no content hunk)
#   * adds   .buildkite/ci_config.yaml     (CI name, repo, job_dirs, registry, repositories)
#   * adds   .buildkite/jobs/build.yaml    (group "Build": builds and pushes the CI image)
#   * adds   .buildkite/jobs/tests.yaml    (group "Tests": four test steps)
#   * deletes .buildkite/pipeline.yml
#
# Reviewer notes:
#   * The new test steps drop the explicit agents/plugins blocks and instead use the
#     keys no_gpu, no_plugin, gpu and num_gpus. Presumably these are expanded by a
#     pipeline generator that reads ci_config.yaml -- TODO confirm against that tool.
#   * "Simple Unit Test" had no depends_on in pipeline.yml; it now sits in a group that
#     declares depends_on: image-build. Whether that applies per step depends on the
#     generator -- TODO confirm this ordering change is intended.
#   * NOTE(review): this patch was reflowed from a whitespace-collapsed copy. Indentation
#     inside the hunks was reconstructed from YAML structure, and the four blank separator
#     lines between steps in the deleted pipeline.yml hunk were restored so that the hunk
#     body matches its "@@ -1,87 +0,0 @@" header. Verify the "-" lines against the
#     repository before applying.
diff --git a/.buildkite/.pipeline_gen_v2 b/.buildkite/.pipeline_gen_v2
new file mode 100644
index 000000000..e69de29bb
diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
new file mode 100644
index 000000000..fad862980
--- /dev/null
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,10 @@
+name: vllm_omni_ci
+github_repo_name: vllm-project/vllm-omni
+job_dirs:
+  - ".buildkite/jobs"
+run_all_patterns: []
+run_all_exclude_patterns: []
+registries: public.ecr.aws/q9t5s3a7
+repositories:
+  main: "vllm-ci-postmerge-repo"
+  premerge: "vllm-ci-test-repo"
diff --git a/.buildkite/jobs/build.yaml b/.buildkite/jobs/build.yaml
new file mode 100644
index 000000000..8ecb51b40
--- /dev/null
+++ b/.buildkite/jobs/build.yaml
@@ -0,0 +1,10 @@
+group: Build
+steps:
+  - label: ":docker: build vllm-omni-ci image"
+    key: image-build
+    depends_on: []
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker build --file docker/Dockerfile.ci -t vllm-omni-ci ."
+      - "docker tag vllm-omni-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
diff --git a/.buildkite/jobs/tests.yaml b/.buildkite/jobs/tests.yaml
new file mode 100644
index 000000000..e9c24550d
--- /dev/null
+++ b/.buildkite/jobs/tests.yaml
@@ -0,0 +1,30 @@
+group: Tests
+depends_on:
+  - image-build
+steps:
+- label: "Simple Unit Test"
+  commands:
+    - ".buildkite/scripts/simple_test.sh"
+  no_gpu: true
+  no_plugin: true
+
+- label: "Diffusion Model Test"
+  timeout_in_minutes: 15
+  commands:
+    - pytest -s -v tests/single_stage/test_diffusion_model.py
+
+- label: "Omni Model Test"
+  timeout_in_minutes: 15
+  num_gpus: 4
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/multi_stages/
+
+- label: "Omni Model Test with H100"
+  timeout_in_minutes: 20
+  gpu: h100
+  num_gpus: 2
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/multi_stages_h100/
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
deleted file mode 100644
index cdf111a1f..000000000
--- a/.buildkite/pipeline.yml
+++ /dev/null
@@ -1,87 +0,0 @@
-steps:
-  - label: ":docker: Build image"
-    key: image-build
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker build --file docker/Dockerfile.ci -t vllm-omni-ci ."
-      - "docker tag vllm-omni-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
-    agents:
-      queue: "cpu_queue_premerge_us_east_1"
-
-  - label: "Simple Unit Test"
-    commands:
-      - ".buildkite/scripts/simple_test.sh"
-    agents:
-      queue: "cpu_queue_premerge"
-
-  - label: "Diffusion Model Test"
-    timeout_in_minutes: 15
-    depends_on: image-build
-    commands:
-      - pytest -s -v tests/single_stage/test_diffusion_model.py
-    agents:
-      queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
-    plugins:
-      - docker#v5.2.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          always-pull: true
-          propagate-environment: true
-          environment:
-            - "HF_HOME=/fsx/hf_cache"
-          volumes:
-            - "/fsx/hf_cache:/fsx/hf_cache"
-
-  - label: "Omni Model Test"
-    timeout_in_minutes: 15
-    depends_on: image-build
-    commands:
-      - export VLLM_LOGGING_LEVEL=DEBUG
-      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/multi_stages/
-    agents:
-      queue: "gpu_4_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
-    plugins:
-      - docker#v5.2.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          always-pull: true
-          propagate-environment: true
-          environment:
-            - "HF_HOME=/fsx/hf_cache"
-          volumes:
-            - "/fsx/hf_cache:/fsx/hf_cache"
-
-  - label: "Omni Model Test with H100"
-    timeout_in_minutes: 20
-    depends_on: image-build
-    commands:
-      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/multi_stages_h100/
-    agents:
-      queue: "mithril-h100-pool"
-    plugins:
-      - kubernetes:
-          podSpec:
-            containers:
-              - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-                resources:
-                  limits:
-                    nvidia.com/gpu: 2
-                volumeMounts:
-                  - name: devshm
-                    mountPath: /dev/shm
-                  - name: hf-cache
-                    mountPath: /root/.cache/huggingface
-                env:
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-            nodeSelector:
-              node.kubernetes.io/instance-type: gpu-h100-sxm
-            volumes:
-              - name: devshm
-                emptyDir:
-                  medium: Memory
-              - name: hf-cache
-                hostPath:
-                  path: /mnt/hf-cache
-                  type: DirectoryOrCreate