# Source: FastDeploy/.github/workflows/_gpu_4cards_case_test.yml (develop branch, cloudforge1/FastDeploy on GitHub)
# Reusable workflow: runs the FastDeploy end-to-end tests on a 4-GPU runner.
name: 4-GPU E2E Tests
description: "Run FastDeploy e2e tests on 4 GPUs"

on:
  workflow_call:
    inputs:
      # Image used both for workspace cleanup and for running the tests.
      # Optional so that the default below actually takes effect when a caller
      # omits it (a required input never falls back to its default).
      DOCKER_IMAGE:
        description: "Build Images"
        required: false
        type: string
        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-paddle-dev"
      # Downloaded and unpacked into the runner workspace by the "Code Prepare" step.
      FASTDEPLOY_ARCHIVE_URL:
        description: "URL of the compressed FastDeploy code archive."
        required: true
        type: string
      # Installed with pip inside the test container.
      FASTDEPLOY_WHEEL_URL:
        description: "URL of the FastDeploy Wheel."
        required: true
        type: string
      # Host directory holding gitconfig, .cache and ConfigDir; when empty the
      # test step derives it from the workspace path.
      CACHE_DIR:
        description: "Cache Dir Use"
        required: false
        type: string
        default: ""
      # Host directory mounted read-only at /ModelData inside the test container.
      MODEL_CACHE_DIR:
        description: "Model Cache Dir Use (mounted read-only at /ModelData)"
        required: false
        type: string
        default: ""
    secrets:
      # Forwarded to the check-bypass workflow.
      github-token:
        required: true

jobs:
  # Delegates to the shared check-bypass workflow; the test job below reads
  # its `can-skip` output to decide whether to run.
  check_bypass:
    uses: ./.github/workflows/check-bypass.yml
    with:
      workflow-name: gpu_4cards_test
    secrets:
      github-token: ${{ secrets.github-token }}

  # Test job: runs on a self-hosted runner carrying the GPU-h20-4Cards label.
  run_4_cards_tests:
    needs: check_bypass
    # Skipped when no wheel URL was supplied or when check_bypass reports can-skip == 'true'.
    if: inputs.FASTDEPLOY_WHEEL_URL != '' && needs.check_bypass.outputs.can-skip != 'true'
    runs-on:
      - self-hosted
      - GPU-h20-4Cards
    timeout-minutes: 30
    steps:
      # Removes any previous checkout from the runner workspace, then downloads
      # and unpacks the FastDeploy source archive that the tests run against.
      - name: Code Prepare
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
        run: |
          set -x
          FULL_REPO="${{ github.repository }}"
          # Strip the "owner/" prefix to get the bare repository name.
          REPO_NAME="${FULL_REPO##*/}"
          docker pull "${docker_image}"
          # Clean the repository directory before starting.
          # NOTE(review): the removal runs inside the container, presumably because
          # earlier container runs leave root-owned files in the workspace - confirm.
          docker run --rm --net=host -v "$(pwd):/workspace" -w /workspace \
          -e "REPO_NAME=${REPO_NAME}" \
          "${docker_image}" /bin/bash -c '
            CLEAN_RETRIES=3
            CLEAN_COUNT=0

            while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do
              echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..."
              rm -rf "${REPO_NAME}"* || true
              sleep 2

              # Check if anything matching ${REPO_NAME}* still exists
              if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then
                echo "All ${REPO_NAME}* removed successfully"
                break
              fi

              CLEAN_COUNT=$((CLEAN_COUNT + 1))
            done

            if ls "${REPO_NAME}"* >/dev/null 2>&1; then
              echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
              ls -ld "${REPO_NAME}"*
              exit 1
            fi
          '

          # Pin the output file name so the tar step below does not depend on the
          # URL basename, and so a stale archive is overwritten instead of wget
          # saving the new download under a ".1" suffix.
          wget -q --no-proxy -O FastDeploy.tar.gz "${fd_archive_url}"
          tar -xf FastDeploy.tar.gz
          rm -rf FastDeploy.tar.gz
          cd FastDeploy
          git config --global user.name "FastDeployCI"
          git config --global user.email "fastdeploy_ci@example.com"
          # Log the most recent commits of the unpacked checkout.
          git log -n 3 --oneline

      # Derives the GPU ids and a per-runner port range from the runner name,
      # frees those ports on the host, then runs scripts/run_gpu_4cards.sh inside
      # the test container with the workspace and cache directories mounted.
      - name: Run Four Cards Tests
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
          CACHE_DIR: ${{ inputs.CACHE_DIR }}
          BASE_REF: ${{ github.event.pull_request.base.ref }}
          MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
          IS_PR: ${{ github.event_name == 'pull_request' }}
        run: |
          if [[ "$IS_PR" == "true" ]]; then
            echo "Running on PR"
          else
            echo "Not a PR"
          fi

          # The last "-"-separated field of the runner name is read as a string of
          # single-digit GPU ids (e.g. "0123" -> devices 0,1,2,3).
          runner_name="${{ runner.name }}"
          CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
          # A non-numeric suffix would silently evaluate to 0 in the port arithmetic
          # below, so reject it up front.
          if [[ ! "${CARD_ID}" =~ ^[0-9]+$ ]]; then
            echo "ERROR: runner name '${runner_name}' does not end in a numeric GPU id suffix"
            exit 1
          fi
          DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
          DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)

          # An empty source would make the /ModelData bind mount below an invalid
          # volume spec; fail here with a clear message before touching any ports.
          if [ -z "${MODEL_CACHE_DIR}" ]; then
            echo "ERROR: MODEL_CACHE_DIR is empty; it must name the host directory mounted at /ModelData"
            exit 1
          fi

          # Each port is a fixed base offset by 100 per first GPU id.
          FLASK_PORT=$((8068 + DEVICE_PORT * 100))
          FD_API_PORT=$((8088 + DEVICE_PORT * 100))
          FD_ENGINE_QUEUE_PORT=$((8058 + DEVICE_PORT * 100))
          FD_METRICS_PORT=$((8078 + DEVICE_PORT * 100))
          FD_CACHE_QUEUE_PORT=$((8098 + DEVICE_PORT * 100))
          FD_ROUTER_PORT=$((8048 + DEVICE_PORT * 100))
          FD_CONNECTOR_PORT=$((8038 + DEVICE_PORT * 100))
          FD_RDMA_PORT=$((8028 + DEVICE_PORT * 100))
          echo "Test ENV Parameter:"
          echo "========================================================="
          echo "FLASK_PORT=${FLASK_PORT}"
          echo "FD_API_PORT=${FD_API_PORT}"
          echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
          echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
          echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}"
          echo "FD_ROUTER_PORT=${FD_ROUTER_PORT}"
          echo "FD_CONNECTOR_PORT=${FD_CONNECTOR_PORT}"
          echo "FD_RDMA_PORT=${FD_RDMA_PORT}"
          echo "DEVICES=${DEVICES}"
          echo "========================================================="

          # Fall back to the directory two levels above the workspace when no
          # cache directory was supplied by the caller.
          CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
          echo "CACHE_DIR is set to ${CACHE_DIR}"
          # The file is bind-mounted below, so it has to exist on the host first.
          if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
            touch "${CACHE_DIR}/gitconfig"
          fi

          # Every port announced above is cleaned and forwarded to the container.
          # NOTE(review): the router/connector/RDMA ports were previously computed
          # and echoed but never used - confirm the test scripts expect them.
          PORTS=(
            "$FLASK_PORT" "$FD_API_PORT" "$FD_ENGINE_QUEUE_PORT" "$FD_METRICS_PORT"
            "$FD_CACHE_QUEUE_PORT" "$FD_ROUTER_PORT" "$FD_CONNECTOR_PORT" "$FD_RDMA_PORT"
          )
          LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
          echo "==== LOG_FILE is ${LOG_FILE} ===="

          echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a "$LOG_FILE"

          for port in "${PORTS[@]}"; do
              # lsof exits non-zero when nothing uses the port; that is not an error here.
              PIDS=$(lsof -t -i ":$port" || true)
              if [ -n "$PIDS" ]; then
                  echo "Port $port is occupied by PID(s): $PIDS" | tee -a "$LOG_FILE"
                  echo "$PIDS" | xargs -r kill -9
                  echo "Port $port cleared" | tee -a "$LOG_FILE"
              else
                  echo "Port $port is free" | tee -a "$LOG_FILE"
              fi
          done

          echo "==== PORT CLEAN COMPLETE ====" | tee -a "$LOG_FILE"

          echo "========================================================="
          echo "Ensuring no stale container named ${runner_name} ..."
          if [ "$(docker ps -a -q -f "name=${runner_name}")" ]; then
            echo "Removing stale container: ${runner_name}"
            docker rm -f "${runner_name}" || true
          fi

          docker run --rm --ipc=host --net=host \
          --name "${runner_name}" \
          -v "$(pwd):/workspace" -w /workspace \
          -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
          -v "${CACHE_DIR}/.cache:/root/.cache" \
          -v "${CACHE_DIR}/ConfigDir:/root/.config" \
          -v "${MODEL_CACHE_DIR}:/ModelData:ro" \
          -e "MODEL_PATH=/ModelData" \
          -e "FD_API_PORT=${FD_API_PORT}" \
          -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
          -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
          -e "FLASK_PORT=${FLASK_PORT}" \
          -e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
          -e "FD_ROUTER_PORT=${FD_ROUTER_PORT}" \
          -e "FD_CONNECTOR_PORT=${FD_CONNECTOR_PORT}" \
          -e "FD_RDMA_PORT=${FD_RDMA_PORT}" \
          -e TZ="Asia/Shanghai" \
          -e "fd_wheel_url=${fd_wheel_url}" \
          -e "BASE_REF=${BASE_REF}" \
          -e "IS_PR=${IS_PR}" \
          --gpus '"device='"${DEVICES}"'"' "${docker_image}" /bin/bash -c '

          git config --global --add safe.directory /workspace/FastDeploy
          cd FastDeploy
          # diff.txt holds the diff against the base branch; BASE_REF is empty
          # outside a pull request, in which case an empty file is written.
          if [ -n "${BASE_REF}" ]; then
            git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt
          else
            : > diff.txt
          fi

          python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
          pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

          python -m pip install -r scripts/unittest_requirement.txt
          python -m pip install "${fd_wheel_url}"
          # Replace the source-tree package with the wheel contents so that imports
          # resolved through PYTHONPATH come from the built wheel.
          rm -rf fastdeploy
          python -m pip install "${fd_wheel_url}" --no-deps --target=/workspace/FastDeploy
          export PYTHONPATH=/workspace/FastDeploy/

          export CUDA_VISIBLE_DEVICES=0,1,2,3
          bash scripts/run_gpu_4cards.sh
          '