FastDeploy/.github/workflows/_accuracy_test.yml at develop · cloudforge1/FastDeploy · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# Reusable workflow (invoked through `workflow_call`) that runs the FastDeploy
# accuracy test inside a Docker container on a self-hosted GPU runner.
name: Accuracy Test
# NOTE(review): `description` is not listed among the documented top-level
# workflow keys; kept as-is — confirm GitHub accepts it for this repository.
description: "Run Accuracy Tests"

on:
  workflow_call:
    inputs:
      DOCKER_IMAGE:
        description: "Docker image in which the cleanup and accuracy-test commands run."
        # Optional: with `required: true` the default below could never be
        # used, because callers would be forced to pass a value.
        required: false
        type: string
        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
      FASTDEPLOY_ARCHIVE_URL:
        description: "URL of the compressed FastDeploy code archive."
        required: true
        type: string
      FASTDEPLOY_WHEEL_URL:
        description: "URL of the FastDeploy Wheel."
        required: true
        type: string
      CACHE_DIR:
        description: "Host directory providing gitconfig, .cache and ConfigDir for the container; when empty, the directory two levels above the workspace is used."
        required: false
        type: string
        default: ""
      MODEL_CACHE_DIR:
        description: "Host directory with the model files, mounted at /MODELDATA; the job fails if it does not exist."
        required: false
        type: string
        default: ""

jobs:
  # Single job: unpack the FastDeploy sources, then run the accuracy test in Docker.
  accuracy_tests:
    runs-on: [self-hosted, GPU-h20-1Cards]
    timeout-minutes: 60
    steps:
      # Removes leftovers of a previous run from the workspace (from inside the
      # container image), then downloads and unpacks the FastDeploy source
      # archive.
      - name: Code Prepare
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
        run: |
          set -x
          # GITHUB_REPOSITORY is "owner/name"; keep only the name part. Reading
          # it from the environment avoids splicing an expression into the
          # script text.
          REPO_NAME="${GITHUB_REPOSITORY##*/}"
          if [ -z "${REPO_NAME}" ]; then
            # An empty name would turn the cleanup glob below into "rm -rf *".
            echo "ERROR: could not derive the repository name from GITHUB_REPOSITORY='${GITHUB_REPOSITORY}'"
            exit 1
          fi
          docker pull "${docker_image}"
          # Clean the repository directory before starting
          docker run --rm --net=host -v "$(pwd):/workspace" -w /workspace \
          -e "REPO_NAME=${REPO_NAME}" \
          "${docker_image}" /bin/bash -c '
            CLEAN_RETRIES=3
            CLEAN_COUNT=0

            # Try the removal up to CLEAN_RETRIES times, pausing after each try.
            while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do
              echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..."
              rm -rf "${REPO_NAME}"* || true
              sleep 2

              # Check if anything matching ${REPO_NAME}* still exists
              if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then
                echo "All ${REPO_NAME}* removed successfully"
                break
              fi

              CLEAN_COUNT=$((CLEAN_COUNT + 1))
            done

            # Fail the step if something still matches after all attempts.
            if ls "${REPO_NAME}"* >/dev/null 2>&1; then
              echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
              ls -ld "${REPO_NAME}"*
              exit 1
            fi
          '

          # -O pins the local file name so the tar call below does not depend
          # on the basename of the URL.
          wget -q --no-proxy -O FastDeploy.tar.gz "${fd_archive_url}"
          tar -xf FastDeploy.tar.gz
          rm -rf FastDeploy.tar.gz
          cd FastDeploy
          git config --global user.name "FastDeployCI"
          git config --global user.email "fastdeploy_ci@example.com"
          git log -n 3 --oneline

      # Frees the service ports, starts a container bound to the GPU(s) encoded
      # in the runner name, installs the wheel under test, launches the
      # deployment helper and runs gsm8k.py. The result of gsm8k.py is handed
      # back to the host through FastDeploy/exit_code.env.
      - name: Run FastDeploy Base Tests
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
          CACHE_DIR: ${{ inputs.CACHE_DIR }}
          MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
        run: |
          # The last "-"-separated field of the runner name is split into
          # single characters, each used as one GPU device id; the first one
          # also selects the port offset.
          runner_name="${{ runner.name }}"
          CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
          if ! [[ "${CARD_ID}" =~ ^[0-9]+$ ]]; then
            # Without this check a non-numeric suffix would silently evaluate
            # to port offset 0 in the arithmetic below.
            echo "Error: runner name '${runner_name}' does not end in a numeric GPU card id."
            exit 1
          fi
          DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
          DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)

          # Each port is shifted by 100 per device index.
          FLASK_PORT=$((8068 + DEVICE_PORT * 100))
          FD_API_PORT=$((8088 + DEVICE_PORT * 100))
          FD_ENGINE_QUEUE_PORT=$((8058 + DEVICE_PORT * 100))
          FD_METRICS_PORT=$((8078 + DEVICE_PORT * 100))
          FD_CACHE_QUEUE_PORT=$((8098 + DEVICE_PORT * 100))
          echo "Test ENV Parameter:"
          echo "========================================================="
          echo "FLASK_PORT=${FLASK_PORT}"
          echo "FD_API_PORT=${FD_API_PORT}"
          echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
          echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
          echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}"
          echo "DEVICES=${DEVICES}"
          echo "========================================================="

          # Fall back to the directory two levels above the workspace when no
          # cache directory was passed in.
          CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
          echo "CACHE_DIR is set to ${CACHE_DIR}"
          if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
            touch "${CACHE_DIR}/gitconfig"
          fi
          if [ ! -d "${MODEL_CACHE_DIR}" ]; then
            echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
            exit 1
          fi

          PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT $FD_CACHE_QUEUE_PORT)
          LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
          echo "==== LOG_FILE is ${LOG_FILE} ===="

          echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a "$LOG_FILE"

          # Kill whatever still holds one of the ports computed above.
          for port in "${PORTS[@]}"; do
              PIDS=$(lsof -t -i :$port || true)
              if [ -n "$PIDS" ]; then
                  echo "Port $port is occupied by PID(s): $PIDS" | tee -a "$LOG_FILE"
                  echo "$PIDS" | xargs -r kill -9
                  echo "Port $port cleared" | tee -a "$LOG_FILE"
              else
                  echo "Port $port is free" | tee -a "$LOG_FILE"
              fi
          done

          echo "==== PORT CLEAN COMPLETE ====" | tee -a "$LOG_FILE"

          echo "========================================================="
          echo "Ensuring no stale container named ${runner_name} ..."
          if [ "$(docker ps -a -q -f "name=${runner_name}")" ]; then
            echo "Removing stale container: ${runner_name}"
            docker rm -f "${runner_name}" || true
          fi

          docker run --rm --ipc=host --pid=host --net=host \
          --name "${runner_name}" \
          -v "$(pwd):/workspace" \
          -w /workspace \
          -e "fastdeploy_wheel_url=${fastdeploy_wheel_url}" \
          -e "FD_API_PORT=${FD_API_PORT}" \
          -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
          -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
          -e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
          -e "FLASK_PORT=${FLASK_PORT}" \
          -v "${MODEL_CACHE_DIR}:/MODELDATA" \
          -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
          -v "${CACHE_DIR}/.cache:/root/.cache" \
          -v "${CACHE_DIR}/ConfigDir:/root/.config" \
          -e TZ="Asia/Shanghai" \
          --gpus '"device='"${DEVICES}"'"' "${docker_image}" /bin/bash -xc '
          python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/

          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

          # Abort when the wheel under test cannot be installed; otherwise the
          # tests could run against a package that is already in the image.
          python -m pip install "${fastdeploy_wheel_url}" || { echo "ERROR: failed to install the FastDeploy wheel"; exit 1; }
          python -m pip install pytest

          wget --no-proxy https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
          chmod +x ./llm-deploy-linux-amd64
          ./llm-deploy-linux-amd64 -python python3.10 \
          -model_name ERNIE-4.5-0.3B-Paddle \
          -model_path /MODELDATA \
          --skip install,model

          git config --global --add safe.directory /workspace/FastDeploy
          cd FastDeploy
          pushd tests/ce/deploy
          # NOTE(review): the container runs with --pid=host, so these two
          # lines match every process whose ps line contains the port number.
          # Confirm they are still needed on top of the host-side lsof cleanup.
          ps -ef | grep "${FD_CACHE_QUEUE_PORT}" | grep -v grep | awk "{print \$2}" | xargs -r kill -9
          ps -ef | grep "${FD_ENGINE_QUEUE_PORT}" | grep -v grep | awk "{print \$2}" | xargs -r kill -9
          python3.10 deploy.py > dd.log 2>&1 &
          sleep 3
          curl -X POST http://0.0.0.0:${FLASK_PORT}/start \
            -H "Content-Type: application/json" \
            -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"

          # Quoted so that the "?" in the URL is never treated as a glob.
          curl -X POST "http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90"
          popd

          pushd tests/ce/accuracy_cases
          export URL=http://localhost:${FD_API_PORT}/v1/chat/completions
          export TEMPLATE=TOKEN_LOGPROB
          export MODEL_SIZE=0.3B
          TEST_EXIT_CODE=0
          python gsm8k.py || TEST_EXIT_CODE=1
          popd
          # Overwrite rather than append so the file holds exactly one result.
          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" > /workspace/FastDeploy/exit_code.env
          '
          # Assume failure until the container reports a result: with the
          # variable unset, a bare "exit" would return the status of the
          # preceding echo, i.e. success.
          TEST_EXIT_CODE=1
          if [ -f ./FastDeploy/exit_code.env ]; then
            source ./FastDeploy/exit_code.env
            cat ./FastDeploy/exit_code.env >> "$GITHUB_ENV"
          else
            echo "Error: ./FastDeploy/exit_code.env was not produced; treating the run as failed."
          fi
          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
          exit "${TEST_EXIT_CODE:-1}"