diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index dfe4baabb..c9b7ecbb4 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -267,6 +267,41 @@ jobs:
         run: |
           source ./env-setup/12.8_env_setup.sh
           rm -rf ./gpu-simulator/gpgpu-sim
+
+          # Clone gpgpu-sim with fork-aware branch selection
+          echo "Cloning gpgpu-sim with fork-aware branch selection..."
+          git clone --quiet git@github.com:accel-sim/gpgpu-sim_distribution.git ./gpu-simulator/gpgpu-sim
+
+          # Try to checkout the same branch from the same owner's fork first
+          if [[ ${{ github.event_name }} == 'pull_request' ]]; then
+            current_owner=$(echo ${{ github.event.pull_request.head.repo.full_name }} | cut -d'/' -f1)
+          else
+            current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
+          fi
+          current_repo=$(echo ${{ github.repository }} | cut -d'/' -f2)
+
+          gpgpusim_repo=$(echo $current_repo | sed 's/accel-sim-framework/gpgpu-sim_distribution/')
+
+          echo "Attempting to checkout branch '$BRANCH_NAME' from '$current_owner/$gpgpusim_repo'"
+
+          # First, try to add the fork owner's repository as a remote and check if the branch exists
+          if git -C ./gpu-simulator/gpgpu-sim/ remote add fork-owner git@github.com:$current_owner/$gpgpusim_repo.git 2>/dev/null; then
+            # Exact ref lookup: --exit-code makes ls-remote fail when the fork has no such branch (no substring matches)
+            if git -C ./gpu-simulator/gpgpu-sim/ ls-remote --exit-code fork-owner "refs/heads/$BRANCH_NAME" >/dev/null; then
+              echo "Found branch '$BRANCH_NAME' in '$current_owner/$gpgpusim_repo' repository, checking it out"
+              git -C ./gpu-simulator/gpgpu-sim/ fetch fork-owner
+              git -C ./gpu-simulator/gpgpu-sim/ checkout -B "$BRANCH_NAME" "fork-owner/$BRANCH_NAME"
+            else
+              echo "Branch '$BRANCH_NAME' not found in '$current_owner/$gpgpusim_repo' repository, falling back to upstream dev branch"
+              git -C ./gpu-simulator/gpgpu-sim/ checkout -B dev origin/dev
+            fi
+            # Remove the temporary remote
+            git -C ./gpu-simulator/gpgpu-sim/ remote remove fork-owner
+          else
+            echo "Could not add '$current_owner/$gpgpusim_repo' remote, falling back to upstream dev branch"
+            git -C ./gpu-simulator/gpgpu-sim/ checkout -B dev origin/dev
+          fi
+
           source ./gpu-simulator/setup_environment.sh
           make clean -C gpu-simulator
           make -j20 -C gpu-simulator
diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml
index 5a2889573..814ecb8bc 100644
--- a/.github/workflows/weekly.yml
+++ b/.github/workflows/weekly.yml
@@ -5,9 +5,11 @@ on:
   # push:
   schedule:
     - cron: '0 20 * * FRI' # 8:00 PM every Friday
-
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 jobs:
   Tracer-Weekly:
+    timeout-minutes: 720
     if: ${{ github.repository == 'accel-sim/accel-sim-framework' || github.event_name == 'workflow_dispatch' }}
     runs-on: tgrogers-gpu01
     defaults:
@@ -38,6 +40,7 @@ jobs:
           git -C ./gpu-app-collection/ submodule update --init -- ./src/cuda/cuda-samples
           source ./gpu-app-collection/src/setup_environment
           ln -s /home/tgrogers-raid/a/common/data_dirs ./gpu-app-collection/
+          make -j8 -C ./gpu-app-collection/src rodinia_2.0-ft
           make -j8 -C ./gpu-app-collection/src rodinia-3.1
           make -j8 -C ./gpu-app-collection/src GPU_Microbenchmark
           # make -j8 -C ./gpu-app-collection/src Deepbench_nvidia
@@ -49,30 +52,22 @@ jobs:
           source ./env-setup/12.8_env_setup.sh
           source ./gpu-app-collection/src/setup_environment
           rm -rf ./hw_run/
+          ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7
           rm -rf /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
           mkdir -p /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
-          ln -s /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces ./hw_run
-          ./util/tracer_nvbit/run_hw_trace.py -B rodinia-3.1,GPU_Microbenchmark -D 7
-          # ./util/tracer_nvbit/run_hw_trace.py -B rodinia-3.1,GPU_Microbenchmark,parboil,polybench,cutlass_5_trace,Deepbench_nvidia_tencore,Deepbench_nvidia_normal -D 7
+          mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run
+          # ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark,parboil,polybench,cutlass_5_trace,Deepbench_nvidia_tencore,Deepbench_nvidia_normal -D 7
       - name: generate-spinlock-traces-spinlock_handling
         run: |
           source ./env-setup/12.8_env_setup.sh
           source ./gpu-app-collection/src/setup_environment
           rm -rf ./hw_run/
           ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling fast_forward
-          mv ./hw_run ./hw_run_fast_forward
+          mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_fast_forward
           ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling none
-          mv ./hw_run ./hw_run_none
-      - name: test-new-traces-spinlock_handling
-        # Test only fast-forwarded traces as the none one takes too long to run (~2-3 hr)
-        run: |
-          source ./env-setup/12.8_env_setup.sh
-          source ./gpu-simulator/setup_environment.sh
-          ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T ./hw_run_fast_forward/traces/device-7/ -N spinlock-microbenchmark-$$-fast_forward
-          ./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-fast_forward
-          # ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T ./hw_run_none/traces/device-7/ -N spinlock-microbenchmark-$$-none
-          # ./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-none
+          mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_none
   SASS-Weekly:
+    timeout-minutes: 720
     needs: [Tracer-Weekly]
     if: ${{ github.repository == 'accel-sim/accel-sim-framework' || github.event_name == 'workflow_dispatch' }}
     runs-on: tgrogers-raid
@@ -93,6 +88,36 @@ jobs:
         run: |
           source ./env-setup/12.8_env_setup.sh
           rm -rf ./gpu-simulator/gpgpu-sim
+
+          # Clone gpgpu-sim with fork-aware branch selection
+          echo "Cloning gpgpu-sim with fork-aware branch selection..."
+          git clone --quiet git@github.com:accel-sim/gpgpu-sim_distribution.git ./gpu-simulator/gpgpu-sim
+
+          current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
+          current_repo=$(echo $GITHUB_REPOSITORY | cut -d'/' -f2)
+
+          gpgpusim_repo=$(echo $current_repo | sed 's/accel-sim-framework/gpgpu-sim_distribution/')
+
+          echo "Attempting to checkout branch '$BRANCH_NAME' from '$current_owner/$gpgpusim_repo'"
+
+          # First, try to add the fork owner's repository as a remote and check if the branch exists
+          if git -C ./gpu-simulator/gpgpu-sim/ remote add fork-owner git@github.com:$current_owner/$gpgpusim_repo.git 2>/dev/null; then
+            # Exact ref lookup: --exit-code makes ls-remote fail when the fork has no such branch (no substring matches)
+            if git -C ./gpu-simulator/gpgpu-sim/ ls-remote --exit-code fork-owner "refs/heads/$BRANCH_NAME" >/dev/null; then
+              echo "Found branch '$BRANCH_NAME' in '$current_owner/$gpgpusim_repo' repository, checking it out"
+              git -C ./gpu-simulator/gpgpu-sim/ fetch fork-owner
+              git -C ./gpu-simulator/gpgpu-sim/ checkout -B "$BRANCH_NAME" "fork-owner/$BRANCH_NAME"
+            else
+              echo "Branch '$BRANCH_NAME' not found in '$current_owner/$gpgpusim_repo' repository, falling back to accel-sim dev branch"
+              git -C ./gpu-simulator/gpgpu-sim/ checkout -B dev origin/dev
+            fi
+            # Remove the temporary remote
+            git -C ./gpu-simulator/gpgpu-sim/ remote remove fork-owner
+          else
+            echo "Could not add '$current_owner/$gpgpusim_repo' remote, falling back to upstream dev branch"
+            git -C ./gpu-simulator/gpgpu-sim/ checkout -B dev origin/dev
+          fi
+
           source ./gpu-simulator/setup_environment.sh
           make clean -C gpu-simulator
           make -j -C gpu-simulator
@@ -100,7 +125,31 @@ jobs:
         run: |
           source ./env-setup/12.8_env_setup.sh
           source ./gpu-simulator/setup_environment.sh
-          ln -s /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces ./hw_run
-          # ./util/job_launching/run_simulations.py -B rodinia-3.1,GPU_Microbenchmark,sdk-4.2-scaled,parboil,polybench,cutlass_5_trace,Deepbench_nvidia_tencore,Deepbench_nvidia_normal -C QV100-SASS-5B_INSN -T ./hw_run/traces/device-7/12.8 -N weekly-$$ -M 70G
-          ./util/job_launching/run_simulations.py -B rodinia-3.1,GPU_Microbenchmark -C QV100-SASS-5B_INSN -T ./hw_run/traces/device-7/12.8 -N weekly-$$ -M 70G
+          ln -s /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run ./hw_run
+          ./util/job_launching/run_simulations.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -C QV100-SASS -T ./hw_run/traces/device-7/12.8 -N weekly-$$ -M 70G
           ./util/job_launching/monitor_func_test.py -T 12 -S 1800 -I -v -s weekly-stats-per-app.csv -N weekly-$$
+      - name: test-new-traces-spinlock_handling
+        # Simulate only the fast_forward traces; the "none" traces take too long to run (~2-3 hr)
+        run: |
+          source ./env-setup/12.8_env_setup.sh
+          source ./gpu-simulator/setup_environment.sh
+          ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_fast_forward/traces/device-7/ -N spinlock-microbenchmark-$$-fast_forward
+          ./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-fast_forward
+          # ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_none/traces/device-7/ -N spinlock-microbenchmark-$$-none
+          # ./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-none
+  failures:
+    if: failure()
+    env:
+      ACTION_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+      REPORT_URL: ""
+    runs-on: tgrogers-raid
+    needs: [Tracer-Weekly, SASS-Weekly]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Notify Failure
+        run: |
+          # Setup envs
+          git clone --quiet --branch cluster-ubuntu git@github.com:purdue-aalp/env-setup.git
+          source ./env-setup/common/common_inc.sh
+          export BRANCH_NAME="Weekly Tests"
+          python3 .github/scripts/send_ci_email.py -t failure