Commit 45d6d33
Merge pull request #103 from NVIDIA/23.07
23.07 changes
2 parents 59a05d2 + 23b6cda commit 45d6d33

File tree

16 files changed: +642 −62 lines changed

README.md

Lines changed: 233 additions & 40 deletions
Large diffs are not rendered by default.

auto_configurator/conf/config.yaml

Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ fastertransformer_path: ${auto_configurator_path}/../FasterTransformer
 base_results_dir: ${auto_configurator_path}/results
 data_dir: ${launcher_scripts_path}/data

-training_container: nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
 container_mounts:
 - null

auto_configurator/tests/config_tests/test_main_config.py

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ def test_config(self):
 base_results_dir: ${auto_configurator_path}/results
 data_dir: ${launcher_scripts_path}/data

-training_container: nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
 container_mounts:
 - null
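
The test change mirrors the config change: the expected YAML baked into test_main_config.py must carry the same container tag as auto_configurator/conf/config.yaml. Below is a minimal standalone check in the same spirit, assuming a local checkout and PyYAML installed; it is an illustration, not the repo's actual test.

# Illustrative check (not the repo's test): load the auto_configurator config
# and assert that it now points at the 23.07 training container.
import yaml

with open("auto_configurator/conf/config.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["training_container"] == "nvcr.io/ea-bignlp/nemofw-training:23.07-py3"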

csp_tools/aws/Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-FROM nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+FROM nvcr.io/ea-bignlp/nemofw-training:23.07-py3

 ARG EFA_INSTALLER_VERSION=latest
 ARG AWS_OFI_NCCL_VERSION=1.4.0-aws

launcher_scripts/conf/config.yaml

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ data_dir: ${launcher_scripts_path}/data # Location to store and read the data.
 base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs.
 container_mounts: # List of additional paths to mount to container. They will be mounted to same path.
 - null
-container: nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3

 wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line.
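
launcher_scripts/conf/config.yaml is consumed through Hydra/OmegaConf, so the updated container value resolves alongside the ${launcher_scripts_path} interpolations and can also be overridden at launch time without editing the file. A minimal OmegaConf sketch of how these fields resolve; the path below is an assumed example and this is not the launcher's entry point.

# Minimal sketch of how these config fields resolve with OmegaConf; this only
# illustrates the config structure shown in the diff above.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "launcher_scripts_path": "/opt/launcher_scripts",  # assumed example path
        "data_dir": "${launcher_scripts_path}/data",
        "base_results_dir": "${launcher_scripts_path}/results",
        "container_mounts": [None],
        "container": "nvcr.io/ea-bignlp/nemofw-training:23.07-py3",
    }
)

print(OmegaConf.to_yaml(cfg, resolve=True))
# A command-line override (Hydra style) would look something like:
#   python3 main.py container=nvcr.io/ea-bignlp/nemofw-training:23.07-py3 ...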

Lines changed: 176 additions & 0 deletions (new file)

run:
  name: ${.task_name}_${.model_train_name}
  time_limit: "04:00:00"
  dependency: "singleton"
  convert_name: convert_nemo
  model_train_name: gpt3_sft
  convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name}
  task_name: "custom_task" # Rename this name to be more clear
  results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name}

trainer:
  devices: 8
  accelerator: gpu
  num_nodes: 1
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: 1
  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10 # frequency with which training steps are logged
  val_check_interval: 300 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch
  gradient_clip_val: 1.0

exp_manager:
  explicit_log_dir: ${fine_tuning.run.results_dir}/results
  exp_dir: null
  name: megatron_gpt3_${fine_tuning.run.task_name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: nemo_gpt3_${fine_tuning.run.task_name}
    name: ${fine_tuning.run.name}
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: validation_${fine_tuning.model.data.validation_ds.metric.name}
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: True
    filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${fine_tuning.model.tensor_model_parallel_size}, ${fine_tuning.model.pipeline_model_parallel_size}}
    save_best_model: True

model:
  seed: 1234
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  global_batch_size: 8
  micro_batch_size: 1
  restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_gpt.nemo # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  sync_batch_comm: False
  megatron_amp_O2: True

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
  sequence_parallel: False

  ## Activation Checkpoint
  activations_checkpoint_granularity: selective # 'selective' or 'full'
  activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'
  answer_only_loss: True # not used right now
  gradient_as_bucket_view: False
  seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value
  use_flash_attention: True # if not None, will match the base model's value

  hidden_dropout: 0.1
  attention_dropout: 0.1
  ffn_dropout: 0.1

  data:
    chat: False # whether use chatbot data or not
    train_ds:
      # Example of how to specify paths to multiple datasets
      # file_names:
      #   - /path/to/squad.jsonl
      #   - /path/to/mnli.jsonl
      #   - /path/to/boolq.jsonl
      # Example of how each dataset is formatted
      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
      file_names: ??? # Path to a list of JSONL files corresponding to the source data.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      # Example of how to specify concat_sampling_probabilities
      # concat_sampling_probabilities:
      #   - 0.5
      #   - 0.25
      #   - 0.25
      concat_sampling_probabilities: ??? # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
      context_key: 'input'
      label_key: 'output'
      add_eos: True
      add_sep: False
      add_bos: False
      separate_prompt_and_response_with_newline: True
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: null # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"

    validation_ds:
      file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      names: null # Names of the corresponding datasets used to log metrics.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      context_key: 'input'
      label_key: 'output'
      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null

    test_ds:
      file_names: ${fine_tuning.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      names: null # Names of the corresponding datasets used to log metrics.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      context_key: 'input'
      label_key: 'output'
      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null

  optim:
    name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
    lr: 5e-6
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
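
In the new fine-tuning config above, the only values without defaults are the ??? fields: train_ds.file_names, train_ds.concat_sampling_probabilities, and validation_ds.file_names. Below is a short sketch of producing data in the expected JSONL shape, with matching Hydra-style overrides shown as comments; the file names and override strings are invented for illustration and are not values from the repository.

# Illustrative data prep for the fine-tuning config above: each JSONL record
# carries the keys named by context_key ('input') and label_key ('output').
import json

records = [
    {"input": "Q: What did the math of artificial viscosity do?",
     "output": "smoothed the shock transition without sacrificing basic physics"},
]

with open("my_task_train.jsonl", "w") as f:  # example file name
    for record in records:
        f.write(json.dumps(record) + "\n")

# Matching launcher overrides (Hydra list syntax) might then look like:
#   fine_tuning.model.data.train_ds.file_names=[my_task_train.jsonl]
#   fine_tuning.model.data.train_ds.concat_sampling_probabilities=[1.0]
#   fine_tuning.model.data.validation_ds.file_names=[my_task_val.jsonl]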
