3232 required : true
3333 description : ' Dependency packages, you can also set a specific version'
3434 type : string
35- default : ' packaging transformers_stream_generator transformers datasets matplotlib'
35+ default : ' packaging transformers_stream_generator transformers datasets matplotlib jmespath'
3636 default_tp :
3737 required : true
3838 description : ' Default tp value'
3939 type : string
4040 default : ' --tp 1'
41+ log_level :
42+ required : true
43+ description : ' Default ERROR, can also be set to INFO'
44+ type : string
45+ default : ' ERROR'
46+ kvint_quantization :
47+ required : true
48+ description : ' Default kvint4 and kvint8'
49+ type : string
50+ default : " ['kvint4','kvint8']"
4151 models :
4252 required : true
4353 description : ' Set models run benchmark'
5262 DATASET_FILE : /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
5363 TP_INFO : --tp 1
5464 LOOP_NUM : 3
65+ TRITON_PTXAS_PATH : /usr/local/cuda/bin/ptxas
5566
5667
5768jobs :
@@ -93,15 +104,15 @@ jobs:
93104 generation_benchmark :
94105 needs : linux-build
95106 if : ${{github.event_name == 'schedule' || (!cancelled() && contains(fromJSON(github.event.inputs.benchmark_type), 'generation'))}}
96- runs-on : [self-hosted, linux-a100-2 ]
107+ runs-on : [self-hosted, linux-a100]
97108 strategy :
98109 fail-fast : false
99110 matrix :
100111 model : ${{fromJSON(github.event.inputs.models)}}
101112 timeout-minutes : 120
102113 env :
103114 MODEL_PATH : /nvme/qa_test_models/${{matrix.model}}
104- CUDA_VISIBLE_DEVICES : 4,5
115+ CUDA_VISIBLE_DEVICES : 6,7
105116 container :
106117 image : nvcr.io/nvidia/tritonserver:22.12-py3
107118 options : " --gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip"
@@ -119,7 +130,7 @@ jobs:
119130 ref : ${{github.event.inputs.repo_ref || 'main'}}
120131 - name : Copy repository - offline
121132 if : ${{inputs.offline_mode}}
122- run : cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
133+ run : cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
123134 - name : Download Artifacts
124135 if : ${{github.event_name == 'schedule' || !inputs.offline_mode}}
125136 uses : actions/download-artifact@v4
@@ -133,7 +144,7 @@ jobs:
133144 run : |
134145 python3 -m pip install ${{inputs.dependency_pkgs}}
135146 # manually install flash attn
136- # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
147+ # the install package is from https://github.com/Dao-AILab/flash-attention/releases
137148 python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
138149 - name : Install lmdeploy
139150 if : ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -216,7 +227,7 @@ jobs:
216227 ref : ${{github.event.inputs.repo_ref || 'main'}}
217228 - name : Copy repository - offline
218229 if : ${{inputs.offline_mode}}
219- run : cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
230+ run : cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
220231 - name : Download Artifacts
221232 if : ${{github.event_name == 'schedule' || !inputs.offline_mode}}
222233 uses : actions/download-artifact@v4
@@ -230,7 +241,7 @@ jobs:
230241 run : |
231242 python3 -m pip install ${{inputs.dependency_pkgs}}
232243 # manually install flash attn
233- # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
244+ # the install package is from https://github.com/Dao-AILab/flash-attention/releases
234245 python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
235246 - name : Install lmdeploy
236247 if : ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -266,7 +277,7 @@ jobs:
266277 done
267278 done
268279 - name : Run throughput benchmark - kvint4
269- if : contains(fromJSON(github.event.inputs.backend), 'turbomind')
280+ if : contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
270281 env :
271282 result_dir : benchmark-throughput-turbomind-kvint4
272283 run : |
@@ -281,7 +292,7 @@ jobs:
281292 done
282293 done
283294 - name : Run throughput benchmark - kvint8
284- if : contains(fromJSON(github.event.inputs.backend), 'turbomind')
295+ if : contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
285296 env :
286297 result_dir : benchmark-throughput-turbomind-kvint8
287298 run : |
@@ -357,7 +368,7 @@ jobs:
357368 ref : ${{github.event.inputs.repo_ref || 'main'}}
358369 - name : Copy repository - offline
359370 if : ${{inputs.offline_mode}}
360- run : cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/* .
371+ run : cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
361372 - name : Download Artifacts
362373 if : ${{github.event_name == 'schedule' || !inputs.offline_mode}}
363374 uses : actions/download-artifact@v4
@@ -371,7 +382,7 @@ jobs:
371382 run : |
372383 python3 -m pip install ${{inputs.dependency_pkgs}}
373384 # manually install flash attn
374- # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
385+ # the install package is from https://github.com/Dao-AILab/flash-attention/releases
375386 python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
376387 - name : Install lmdeploy
377388 if : ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -394,7 +405,7 @@ jobs:
394405 - name : Start restful api turbomind
395406 if : contains(fromJSON(github.event.inputs.backend), 'turbomind')
396407 run : |
397- lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --log-level INFO > turbomind_run.log 2>&1 &
408+ lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --log-level ${{inputs.log_level}} > turbomind_run.log 2>&1 &
398409 echo "restful_pid=$!" >> "$GITHUB_ENV"
399410 sleep 180s
400411 - name : Run restful benchmark
@@ -414,17 +425,17 @@ jobs:
414425 done
415426 - name : Kill restful api turbomind
416427 continue-on-error : true
417- if : always( )
428+ if : contains(fromJSON(github.event.inputs.backend), 'turbomind' )
418429 run : |
419430 kill -15 "$restful_pid"
420431 - name : Start restful api turbomind - kvint4
421- if : contains(fromJSON(github.event.inputs.backend), 'turbomind')
432+ if : contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
422433 run : |
423- lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 4 --log-level INFO > turbomind_kvint4_run.log 2>&1 &
434+ lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 4 --log-level ${{inputs.log_level}} > turbomind_kvint4_run.log 2>&1 &
424435 echo "restful_pid=$!" >> "$GITHUB_ENV"
425436 sleep 180s
426437 - name : Run restful benchmark -kvint4
427- if : contains(fromJSON(github.event.inputs.backend), 'turbomind')
438+ if : contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
428439 env :
429440 result_dir : benchmark-restful-turbomind-kvint4
430441 run : |
@@ -439,18 +450,17 @@ jobs:
439450 done
440451 done
441452 - name : Kill restful api turbomind - kvint4
442- continue-on-error : true
443- if : always()
453+ if : contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint4')
444454 run : |
445455 kill -15 "$restful_pid"
446456 - name : Start restful api turbomind - kvint8
447- if : contains(fromJSON(github.event.inputs.backend), 'turbomind')
457+ if : contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
448458 run : |
449- lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 8 --log-level INFO > turbomind_kvint8_run.log 2>&1 &
459+ lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --quant-policy 8 --log-level ${{inputs.log_level}} > turbomind_kvint8_run.log 2>&1 &
450460 echo "restful_pid=$!" >> "$GITHUB_ENV"
451461 sleep 180s
452462 - name : Run restful benchmark -kvint8
453- if : contains(fromJSON(github.event.inputs.backend), 'turbomind')
463+ if : contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
454464 env :
455465 result_dir : benchmark-restful-turbomind-kvint8
456466 run : |
@@ -465,14 +475,13 @@ jobs:
465475 done
466476 done
467477 - name : Kill restful api turbomind - kvint8
468- continue-on-error : true
469- if : always()
478+ if : contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.kvint_quantization), 'kvint8')
470479 run : |
471480 kill -15 "$restful_pid"
472481 - name : Start restful api pytorch
473482 if : (!contains(env.MODEL_FORMAT, 'awq') && contains(fromJSON(github.event.inputs.backend), 'pytorch'))
474483 run : |
475- lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --backend pytorch --log-level INFO > pytorch_run.log 2>&1 &
484+ lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --backend pytorch --log-level ${{inputs.log_level}} > pytorch_run.log 2>&1 &
476485 echo "restful_pid=$!" >> "$GITHUB_ENV"
477486 sleep 120s
478487 - name : Run restful benchmark - pytorch
@@ -491,7 +500,7 @@ jobs:
491500 done
492501 done
493502 - name : Kill restful api pytorch
494- if : always( )
503+ if : (!contains(env.MODEL_FORMAT, 'awq') && contains(fromJSON(github.event.inputs.backend), 'pytorch') )
495504 run : |
496505 kill -15 "$restful_pid"
497506 - name : Save reports
@@ -521,7 +530,7 @@ jobs:
521530 WORKDIR : /nvme/qa_test_models/triton_workspace
522531 OFFLINE_PKGS : /nvme/qa_test_models/offline_pkg
523532 MODEL_PATH : /nvme/qa_test_models/autotest_model/workspace_${{matrix.model}}
524- DEVICE : device=7
533+ DEVICE : device=4
525534 GRPC_PORT : 33337
526535 strategy :
527536 fail-fast : false
@@ -537,7 +546,7 @@ jobs:
537546 - name : Set params
538547 if : (contains( matrix.model, 'internlm2-chat-20b'))
539548 run : |
540- echo 'DEVICE="device=6,7 "' >> "$GITHUB_ENV"
549+ echo 'DEVICE="device=4,5 "' >> "$GITHUB_ENV"
541550 - name : Create test container
542551 run : |
543552 export date_today="$(date +'%H%M%S')"
0 commit comments