Skip to content

Commit 59dbbb0

Browse files
Add comprehensive documentation and examples for matrix batching
Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
1 parent d465f04 commit 59dbbb0

File tree

2 files changed

+450
-0
lines changed

2 files changed

+450
-0
lines changed
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
name: 'Example: Batched Matrix Workflow'

# Example workflow: shows how the batching feature can be used to stay
# within the 256-job limit that GitHub Actions places on a matrix.

on:
  # Manual trigger only; both inputs are required strings.
  workflow_dispatch:
    inputs:
      model-prefix:
        description: 'Model prefix to benchmark'
        required: true
        type: string
      seq-lens:
        description: 'Sequence length config (e.g., 1k1k)'
        required: true
        type: string
17+
18+
jobs:
  # Step 1: Determine how many batches are needed.
  #
  # Outputs:
  #   batch-count   - whatever generate_sweep_configs.py prints for
  #                   --get-batch-count (assumed to be a non-negative
  #                   integer; the step fails if it is not)
  #   batch-indices - JSON array [0, 1, ..., batch-count-1]. The workflow
  #                   expression language has no range() function, so the
  #                   index list for the downstream matrix is built here.
  get-batch-count:
    runs-on: ubuntu-latest
    outputs:
      batch-count: ${{ steps.count.outputs.batch-count }}
      batch-indices: ${{ steps.count.outputs.batch-indices }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - id: count
        # User-supplied inputs are handed over as environment variables
        # rather than interpolated into the script text, so their content
        # can never be executed as shell code.
        env:
          SEQ_LENS: ${{ inputs.seq-lens }}
          MODEL_PREFIX: ${{ inputs.model-prefix }}
        run: |
          pip install pydantic
          BATCH_COUNT=$(python3 "${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py" \
            full-sweep \
            --config-files "${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml" \
                           "${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml" \
            --seq-lens "$SEQ_LENS" \
            --model-prefix "$MODEL_PREFIX" \
            --get-batch-count)
          BATCH_INDICES=$(python3 -c 'import json, sys; print(json.dumps(list(range(int(sys.argv[1])))))' "$BATCH_COUNT")
          echo "batch-count=$BATCH_COUNT" >> "$GITHUB_OUTPUT"
          echo "batch-indices=$BATCH_INDICES" >> "$GITHUB_OUTPUT"
          echo "Total batches needed: $BATCH_COUNT"

  # Step 2: Generate the config for each batch.
  # This job runs once per batch index produced by get-batch-count.
  get-batch-configs:
    needs: get-batch-count
    # An empty matrix is a workflow error, so skip when there are no batches.
    if: ${{ fromJson(needs.get-batch-count.outputs.batch-count) > 0 }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # One matrix leg per batch: [0, 1, 2, ..., batch-count-1]
        batch-index: ${{ fromJson(needs.get-batch-count.outputs.batch-indices) }}
    outputs:
      # Output ids must be static. Each matrix leg writes only the output
      # named after its own index, and the outputs of all legs are combined.
      # Add a `configs-N` line here for every benchmark-batch-N job you add.
      configs-0: ${{ steps.get-configs.outputs.configs-0 }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - id: get-configs
        # Same env-var hand-off as in get-batch-count (avoids injection).
        env:
          SEQ_LENS: ${{ inputs.seq-lens }}
          MODEL_PREFIX: ${{ inputs.model-prefix }}
          BATCH_INDEX: ${{ matrix.batch-index }}
        run: |
          pip install pydantic
          CONFIG_JSON=$(python3 "${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py" \
            full-sweep \
            --config-files "${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml" \
                           "${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml" \
            --seq-lens "$SEQ_LENS" \
            --model-prefix "$MODEL_PREFIX" \
            --batch-index "$BATCH_INDEX")
          echo "configs-${BATCH_INDEX}=$CONFIG_JSON" >> "$GITHUB_OUTPUT"
          echo "Generated batch ${BATCH_INDEX}"

  # Step 3: Run benchmarks for batch 0.
  # For more batches, add similar benchmark-batch-N jobs (each reading
  # `configs-N`) together with the matching output on get-batch-configs.
  benchmark-batch-0:
    # get-batch-count must be a direct dependency: the `needs` context only
    # exposes outputs of jobs listed here, and the `if` below reads one.
    needs: [get-batch-count, get-batch-configs]
    # Only run if batch 0 exists
    if: ${{ fromJson(needs.get-batch-count.outputs.batch-count) > 0 }}
    uses: ./.github/workflows/benchmark-tmpl.yml
    name: ${{ inputs.model-prefix }} ${{ inputs.seq-lens }} batch-0 /
    strategy:
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.get-batch-configs.outputs.configs-0) }}
    secrets: inherit
    with:
      exp-name: "${{ inputs.model-prefix }}_${{ inputs.seq-lens }}_batch0"
      isl: 1024
      osl: 1024
      max-model-len: 2048
      runner: ${{ matrix.config.runner }}
      image: ${{ matrix.config.image }}
      model: ${{ matrix.config.model }}
      framework: ${{ matrix.config.framework }}
      precision: ${{ matrix.config.precision }}
      tp: ${{ matrix.config.tp }}
      ep: ${{ matrix.config.ep }}
      dp-attn: ${{ matrix.config.dp-attn }}
      conc: ${{ matrix.config.conc }}

  # Step 4 (optional): Collect results from all batches.
  # `always()` makes this run even when the benchmark job failed or was
  # skipped.
  collect-results:
    needs: [get-batch-count, benchmark-batch-0]
    if: ${{ always() }}
    runs-on: ubuntu-latest
    steps:
      - name: Summary
        run: |
          echo "Processed ${{ needs.get-batch-count.outputs.batch-count }} batch(es)"
          echo "Benchmark complete"
109+
110+
# Note: For production use with multiple batches, you would either:
111+
# 1. Create multiple benchmark-batch-N jobs (one per possible batch)
112+
# 2. Use a dynamic workflow generation approach
113+
# 3. Use GitHub's reusable workflows with a loop construct (when available)
114+
#
115+
# The current InferenceMAX workflows split by model-prefix instead,
116+
# which naturally keeps each matrix under the 256-job limit.

0 commit comments

Comments
 (0)