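# Supervised fine-tuning (SFT) configuration for a GPT-3 style model.
# The ${fine_tuning.*} and ${base_results_dir} interpolations assume this file is composed
# under a top-level `fine_tuning` key of a larger launcher configuration.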
run:
  name: ${.task_name}_${.model_train_name}
  time_limit: "04:00:00"
  dependency: "singleton"
  convert_name: convert_nemo
  model_train_name: gpt3_sft
  convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name}
  task_name: "custom_task" # rename this to something descriptive for your task
  results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name}
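  # ${.task_name} and ${.model_train_name} are OmegaConf interpolations relative to this `run`
  # block, so `name` resolves to "custom_task_gpt3_sft" with the defaults above.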

trainer:
  devices: 8
  accelerator: gpu
  num_nodes: 1
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: 1
  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10 # frequency with which training steps are logged
  val_check_interval: 300 # if an int n > 1, run validation every n training steps; if a float in [0.0, 1.0], run validation at that fraction of every epoch (e.g. 0.25 runs validation four times per epoch)
  gradient_clip_val: 1.0
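  # Under the usual Megatron batch accounting: with devices: 8, num_nodes: 1 and tensor/pipeline
  # parallel sizes of 1 (see the model section below), data_parallel_size = 8, so a global batch of 8
  # with micro_batch_size 1 is one micro batch per GPU and no gradient accumulation.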

exp_manager:
  explicit_log_dir: ${fine_tuning.run.results_dir}/results
  exp_dir: null
  name: megatron_gpt3_${fine_tuning.run.task_name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: nemo_gpt3_${fine_tuning.run.task_name}
    name: ${fine_tuning.run.name}
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: validation_${fine_tuning.model.data.validation_ds.metric.name}
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: True
    filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${fine_tuning.model.tensor_model_parallel_size}, ${fine_tuning.model.pipeline_model_parallel_size}}
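    # `multiply` is not a standard OmegaConf resolver; it is assumed to be registered by the
    # training scripts and yields tensor_model_parallel_size * pipeline_model_parallel_size (1 * 1 = 1 here).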
    save_best_model: True

model:
  seed: 1234
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  global_batch_size: 8
  micro_batch_size: 1
  restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_gpt.nemo # Path to the existing .nemo checkpoint to fine-tune (here, the output of the convert_nemo stage)
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training; restores the whole state including the epoch, step, LR schedulers, apex, etc.
  save_nemo_on_validation_end: True # Saves an inference-ready .nemo file every time a checkpoint is saved during training.
  sync_batch_comm: False
  megatron_amp_O2: True

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout along the sequence dimension.
  # See "Reducing Activation Recomputation in Large Transformer Models" (https://arxiv.org/abs/2205.05198) for more details.
  sequence_parallel: False
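  # Sequence parallelism splits these operations across the tensor-parallel group, so it only has an
  # effect when tensor_model_parallel_size > 1; with the settings above it would be a no-op even if enabled.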

  ## Activation Checkpointing
  activations_checkpoint_granularity: selective # 'selective' or 'full'
  activations_checkpoint_method: uniform # 'uniform' or 'block'; not used with 'selective'
  # 'uniform' divides the transformer layers into uniformly sized chunks and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'
  answer_only_loss: True # not used right now
  gradient_as_bucket_view: False
  seq_len_interpolation_factor: null # if null, the base model's value is used
  use_flash_attention: True # if null, the base model's value is used

  hidden_dropout: 0.1
  attention_dropout: 0.1
  ffn_dropout: 0.1

  data:
    chat: False # whether to use chat-style (chatbot) data
    train_ds:
      # Example of how to specify paths to multiple datasets
      # file_names:
      #   - /path/to/squad.jsonl
      #   - /path/to/mnli.jsonl
      #   - /path/to/boolq.jsonl
      # Example of how each dataset is formatted
      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
      file_names: ??? # Path to a list of JSONL files corresponding to the source data.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      # Example of how to specify concat_sampling_probabilities
      # concat_sampling_probabilities:
      #   - 0.5
      #   - 0.25
      #   - 0.25
      concat_sampling_probabilities: ??? # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
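      # The probabilities are expected to line up one-to-one with the entries in file_names and to
      # sum to 1.0, as in the commented example above.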
      context_key: 'input'
      label_key: 'output'
      add_eos: True
      add_sep: False
      add_bos: False
      separate_prompt_and_response_with_newline: True
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: null # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
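      # For illustration, with prompt_template: "Q: {input}\nA: {output}", a record such as
      # {'input': 'Is the sky blue?', 'output': 'Yes'} would be rendered as "Q: Is the sky blue?\nA: Yes".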

    validation_ds:
      file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      names: null # Names of the corresponding datasets used to log metrics.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      context_key: 'input'
      label_key: 'output'
      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for metrics such as 'F1' and 'accuracy'; refer to torchmetrics for metrics where this is supported.
        num_classes: null
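        # With name: "loss", the checkpoint monitor in exp_manager resolves to `validation_loss`,
        # which is why checkpoint_callback_params.mode is set to min.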

    test_ds:
      file_names: ${fine_tuning.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      names: null # Names of the corresponding datasets used to log metrics.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      context_key: 'input'
      label_key: 'output'
      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for metrics such as 'F1' and 'accuracy'; refer to torchmetrics for metrics where this is supported.
        num_classes: null
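      # test_ds reuses the validation file list by default (file_names above interpolates
      # validation_ds.file_names); point it at a separate held-out split to test on different data.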

  optim:
    name: fused_adam # Supports the distributed optimizer for memory savings; to enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
    lr: 5e-6
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
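    # Optionally, NeMo's optimizer config also accepts a nested `sched` block; a commented-out
    # sketch with illustrative (untuned) values:
    # sched:
    #   name: CosineAnnealing
    #   warmup_steps: 50
    #   constant_steps: 0
    #   min_lr: 1e-7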