Skip to content

Commit 5138ef3

Browse files
authored
[None][infra] Add fallback when getting the wheel from the build stage fails (#9290)
Signed-off-by: ZhanruiSunCh <[email protected]>
1 parent e2a372a commit 5138ef3

File tree

1 file changed

+30
-11
lines changed

1 file changed

+30
-11
lines changed

jenkins/BuildDockerImage.groovy

Lines changed: 30 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -333,7 +333,7 @@ def buildImage(config, imageKeyToTag)
333333
}
334334
}
335335

336-
args += prepareWheelFromBuildStage(dockerfileStage, arch)
336+
def buildWheelArgs = prepareWheelFromBuildStage(dockerfileStage, arch)
337337
// Avoid the frequency of OOM issue when building the wheel
338338
if (target == "trtllm") {
339339
if (arch == "x86_64") {
@@ -346,15 +346,34 @@ def buildImage(config, imageKeyToTag)
346346
sh "env | sort"
347347
def randomSleep = (Math.random() * 600 + 600).toInteger()
348348
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
349-
trtllm_utils.llmExecStepWithRetry(this, script: """
350-
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
351-
BASE_IMAGE=${BASE_IMAGE} \
352-
TRITON_IMAGE=${TRITON_IMAGE} \
353-
TORCH_INSTALL_TYPE=${torchInstallType} \
354-
IMAGE_WITH_TAG=${imageWithTag} \
355-
STAGE=${dockerfileStage} \
356-
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
357-
""", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
349+
try {
350+
trtllm_utils.llmExecStepWithRetry(this, script: """
351+
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
352+
BASE_IMAGE=${BASE_IMAGE} \
353+
TRITON_IMAGE=${TRITON_IMAGE} \
354+
TORCH_INSTALL_TYPE=${torchInstallType} \
355+
IMAGE_WITH_TAG=${imageWithTag} \
356+
STAGE=${dockerfileStage} \
357+
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
358+
""", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
359+
} catch (InterruptedException ex) {
360+
throw ex
361+
} catch (Exception ex) {
362+
if (buildWheelArgs.trim().isEmpty()) {
363+
throw ex
364+
}
365+
echo "Build failed with wheel arguments, retrying without them"
366+
buildWheelArgs = ""
367+
trtllm_utils.llmExecStepWithRetry(this, script: """
368+
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
369+
BASE_IMAGE=${BASE_IMAGE} \
370+
TRITON_IMAGE=${TRITON_IMAGE} \
371+
TORCH_INSTALL_TYPE=${torchInstallType} \
372+
IMAGE_WITH_TAG=${imageWithTag} \
373+
STAGE=${dockerfileStage} \
374+
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
375+
""", sleepInSecs: randomSleep, numRetries: 2, shortCommondRunTimeMax: 7200)
376+
}
358377
if (target == "ngc-release") {
359378
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
360379
}
@@ -369,7 +388,7 @@ def buildImage(config, imageKeyToTag)
369388
TORCH_INSTALL_TYPE=${torchInstallType} \
370389
IMAGE_WITH_TAG=${customImageWithTag} \
371390
STAGE=${dockerfileStage} \
372-
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
391+
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
373392
"""
374393
}
375394
}

0 commit comments

Comments
 (0)