Skip to content

Commit 492604b

Browse files
trigger functional tests
1 parent 9c6e3c6 commit 492604b

File tree

3 files changed

+121
-1
lines changed

3 files changed

+121
-1
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
ENV_VARS:
2+
CUDA_DEVICE_MAX_CONNECTIONS: 1
3+
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
4+
NCCL_ALGO: Ring
5+
CUBLAS_WORKSPACE_CONFIG: :4096:8
6+
TEST_TYPE: frozen-start
7+
MODE: inference
8+
MODEL_ARGS:
9+
--tiktoken-pattern: v2
10+
--use-mcore-models: true
11+
--tokenizer-type: TikTokenizer
12+
--tokenizer-model: /mnt/artifacts/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json
13+
--auto-detect-ckpt-format: true
14+
--max-tokens-to-oom: 3600000
15+
--inference-max-seq-length: 4096
16+
--attention-backend: flash
17+
--use-checkpoint-args: true
18+
--micro-batch-size: 1
19+
--no-load-optim: true
20+
--no-use-tokenizer-model-from-checkpoint-args: true
21+
--timing-log-level: 0
22+
--load: /mnt/artifacts/model/mcore_mistral/nemo_minitron-0.5b/v1
23+
--distributed-backend: nccl
24+
--log-interval: 1
25+
--transformer-impl: transformer_engine
26+
--tensor-model-parallel-size: 1
27+
--pipeline-model-parallel-size: 1
28+
--deterministic-mode: true
29+
--ckpt-format: torch_dist
30+
--bf16: true
31+
--fp8-recipe: tensorwise
32+
--fp8-format: hybrid
33+
--fp8-param-gather: true
34+
--first-last-layers-bf16: true
35+
--log-memory-to-tensorboard: true
36+
--log-num-zeros-in-grad: true
37+
--log-validation-ppl-to-tensorboard: true
38+
--log-timers-to-tensorboard: true
39+
--num-layers: 24
40+
--hidden-size: 1152
41+
--num-attention-heads: 16
42+
--max-position-embeddings: 1024
43+
--seq-length: 1024
44+
--temperature: 1.0
45+
--top_k: 1
46+
--return-log-probs: true
47+
--num-tokens-to-generate: 30
48+
--enable-cuda-graph: true
49+
--decode-only-cuda-graphs: true
50+
--inference-dynamic-batching-buffer-guaranteed-fraction: 0
51+
--inference-dynamic-batching-buffer-overflow-factor: 0.2
52+
--inference-dynamic-batching-buffer-size-gb: 20
53+
--dist-ckpt-strictness: log_unexpected
54+
--inference-ckpt-non-strict: true # To handle the extra_state errors
55+
--output-path: /opt/megatron-lm/runs/35444deb-9f42-4b45-a169-6bfd3f6068e9/generations_dev_.json
56+
--prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
57+
--incoming-requests-per-sec: -1 # all requests arrive up front.
58+
--inference-repeat-n: 8
59+
METRICS:
60+
- "generated_tokens"
61+
- "logprobs"
62+
- "throughput"

tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ MODEL_ARGS:
5151
--prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
5252
--incoming-requests-per-step: 32
5353
--use-flashinfer-fused-rope: true
54-
--use-inference-optimized-layers: true
54+
--use-inference-optimized-layers: true
5555

5656
METRICS:
5757
- "generated_tokens"
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
ENV_VARS:
2+
CUDA_DEVICE_MAX_CONNECTIONS: 1
3+
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
4+
NCCL_ALGO: Ring
5+
CUBLAS_WORKSPACE_CONFIG: :4096:8
6+
TEST_TYPE: frozen-start
7+
MODE: inference
8+
MODEL_ARGS:
9+
--tiktoken-pattern: v2
10+
--use-mcore-models: true
11+
--tokenizer-type: TikTokenizer
12+
--tokenizer-model: /mnt/artifacts/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json
13+
--auto-detect-ckpt-format: true
14+
--max-tokens-to-oom: 3600000
15+
--inference-max-seq-length: 4096
16+
--attention-backend: flash
17+
--use-checkpoint-args: true
18+
--micro-batch-size: 1
19+
--no-load-optim: true
20+
--no-use-tokenizer-model-from-checkpoint-args: true
21+
--timing-log-level: 0
22+
--load: /mnt/artifacts/model/mcore_mistral/nemo_minitron-0.5b/v1/
23+
--distributed-backend: nccl
24+
--log-interval: 1
25+
--transformer-impl: transformer_engine
26+
--tensor-model-parallel-size: 1
27+
--pipeline-model-parallel-size: 1
28+
--deterministic-mode: true
29+
--ckpt-format: torch_dist
30+
--bf16: true
31+
--log-memory-to-tensorboard: true
32+
--log-num-zeros-in-grad: true
33+
--log-validation-ppl-to-tensorboard: true
34+
--log-timers-to-tensorboard: true
35+
--num-layers: 24
36+
--hidden-size: 1152
37+
--num-attention-heads: 16
38+
--max-position-embeddings: 1024
39+
--seq-length: 1024
40+
--temperature: 1.0
41+
--top_k: 1
42+
--return-log-probs: true
43+
--num-tokens-to-generate: 30
44+
--inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility
45+
--inference-dynamic-batching-buffer-guaranteed-fraction: 0
46+
--inference-dynamic-batching-buffer-overflow-factor: 0.2
47+
--inference-dynamic-batching-buffer-size-gb: 20
48+
--dist-ckpt-strictness: log_unexpected
49+
--inference-ckpt-non-strict: true # To handle the extra_state errors
50+
--output-path: /opt/megatron-lm/runs/5f57e0c4-be52-44f7-9c9f-0a27a2b1cb65/generations_dev_.json
51+
--prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies."
52+
--incoming-requests-per-step: 32
53+
--use-flashinfer-fused-rope: true
54+
--use-inference-optimized-layers: true
55+
56+
METRICS:
57+
- "generated_tokens"
58+
- "logprobs"

0 commit comments

Comments
 (0)