Merge pull request #103 from NVIDIA/23.07: 23.07 changes

Showing 16 changed files with 642 additions and 62 deletions.
launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml (176 additions, 0 deletions)

@@ -0,0 +1,176 @@
run:
  name: ${.task_name}_${.model_train_name}
  time_limit: "04:00:00"
  dependency: "singleton"
  convert_name: convert_nemo
  model_train_name: gpt3_sft
  convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name}
  task_name: "custom_task"  # Rename this to something that describes your task
  results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name}

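# With the values above, the interpolations resolve to, e.g.,
# results_dir = ${base_results_dir}/gpt3_sft/custom_task and
# convert_dir = ${base_results_dir}/gpt3_sft/convert_nemo.
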
trainer:
  devices: 8
  accelerator: gpu
  num_nodes: 1
  precision: bf16
  logger: False  # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: 1
  max_steps: -1  # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10  # frequency with which training steps are logged
  val_check_interval: 300  # if an int n > 1, runs validation every n training steps; if a float in [0.0, 1.0], runs validation at that fraction of each epoch (e.g., 0.25 runs validation every quarter epoch)
  gradient_clip_val: 1.0

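# Worked example of the consumed_samples formula above, assuming the usual
# Megatron definition data_parallel_size = devices * num_nodes / (TP * PP):
# here data_parallel_size = 8 * 1 / (1 * 1) = 8, and accumulate_grad_batches =
# global_batch_size / (micro_batch_size * data_parallel_size) = 8 / (1 * 8) = 1,
# so consumed_samples = global_step * 1 * 8 * 1 = 8 * global_step.
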
exp_manager:
  explicit_log_dir: ${fine_tuning.run.results_dir}/results
  exp_dir: null
  name: megatron_gpt3_${fine_tuning.run.task_name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: nemo_gpt3_${fine_tuning.run.task_name}
    name: ${fine_tuning.run.name}
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: validation_${fine_tuning.model.data.validation_ds.metric.name}
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: True
    filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${fine_tuning.model.tensor_model_parallel_size}, ${fine_tuning.model.pipeline_model_parallel_size}}
    save_best_model: True

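# With metric.name: "loss" below, the checkpoint monitor resolves to
# "validation_loss" (hence mode: min), and model_parallel_size resolves to
# 1 * 1 = 1 via the multiply resolver (presumably registered by the launcher).
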
model:
  seed: 1234
  tensor_model_parallel_size: 1  # intra-layer model parallelism
  pipeline_model_parallel_size: 1  # inter-layer model parallelism
  global_batch_size: 8
  micro_batch_size: 1
  restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_gpt.nemo  # Path to the existing .nemo model to fine-tune or run inference with
  resume_from_checkpoint: null  # Path to a checkpoint file from which to continue training; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
  save_nemo_on_validation_end: True  # Saves an inference-ready .nemo file every time a checkpoint is saved during training.
  sync_batch_comm: False
  megatron_amp_O2: True

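  # With the run settings above, restore_from_path resolves to
  # ${base_results_dir}/gpt3_sft/convert_nemo/results/megatron_gpt.nemo,
  # i.e. the .nemo file produced by the checkpoint-conversion stage is
  # expected at that location.
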
  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially.
  # See "Reducing Activation Recomputation in Large Transformer Models" (https://arxiv.org/abs/2205.05198) for more details.
  sequence_parallel: False

  ## Activation Checkpoint
  activations_checkpoint_granularity: selective  # 'selective' or 'full'
  activations_checkpoint_method: uniform  # 'uniform' or 'block'; not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity.
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity.
  activations_checkpoint_num_layers: null  # not used with 'selective'
  answer_only_loss: True  # not used right now
  gradient_as_bucket_view: False
  seq_len_interpolation_factor: null  # if not None, seq_len_interpolation_factor will match the base model's value
  use_flash_attention: True  # if not None, will match the base model's value

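  # Hypothetical override (not part of this config): to checkpoint every
  # transformer layer's activations instead of the selective default, one
  # might set
  #   activations_checkpoint_granularity: full
  #   activations_checkpoint_method: block
  #   activations_checkpoint_num_layers: 1
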
  hidden_dropout: 0.1
  attention_dropout: 0.1
  ffn_dropout: 0.1

  data:
    chat: False  # whether or not to use chat-style data
    train_ds:
      # Example of how to specify paths to multiple datasets
      # file_names:
      #   - /path/to/squad.jsonl
      #   - /path/to/mnli.jsonl
      #   - /path/to/boolq.jsonl
      # Example of how each dataset is formatted
      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
      file_names: ???  # Path to a list of JSONL files corresponding to the source data.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      # Example of how to specify concat_sampling_probabilities
      # concat_sampling_probabilities:
      #   - 0.5
      #   - 0.25
      #   - 0.25
      concat_sampling_probabilities: ???  # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
      context_key: 'input'
      label_key: 'output'
      add_eos: True
      add_sep: False
      add_bos: False
      separate_prompt_and_response_with_newline: True
      truncation_field: "context"  # Options: ['context', 'answer']
      index_mapping_dir: null  # Path to a directory to write index mapping files.
      prompt_template: null  # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"

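    # Illustration: with prompt_template: "Q: {input}\nA: {output}", the sample
    # record above would be rendered roughly as
    #   "Q: John von Neumann\n... Q: What did the math of artificial viscosity do?\nA: smoothed the shock transition without sacrificing basic physics"
    # where context_key fills {input} and label_key fills {output}.
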
    validation_ds:
      file_names: ???  # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      names: null  # Names of the corresponding datasets used to log metrics.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      context_key: 'input'
      label_key: 'output'
      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null  # Prefix of the file to write predictions to.
      truncation_field: "context"  # Options: ['context', 'answer']
      index_mapping_dir: null  # Path to a directory to write index mapping files.
      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template}  # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"

      metric:
        name: "loss"  # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null  # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy', etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null

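    # Note: if metric.name were switched to 'exact_string_match', the checkpoint
    # monitor in exp_manager would resolve to 'validation_exact_string_match';
    # since higher is better for that metric, checkpoint_callback_params.mode
    # would presumably need to change from 'min' to 'max'.
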
    test_ds:
      file_names: ${fine_tuning.model.data.validation_ds.file_names}  # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      names: null  # Names of the corresponding datasets used to log metrics.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      context_key: 'input'
      label_key: 'output'
      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null  # Prefix of the file to write predictions to.
      truncation_field: "context"  # Options: ['context', 'answer']
      index_mapping_dir: null  # Path to a directory to write index mapping files.
      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template}  # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"

      metric:
        name: "loss"  # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null  # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy', etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null

  optim:
    name: fused_adam  # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
    lr: 5e-6
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98

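# A sketch of how this config might be launched (assuming the launcher's usual
# Hydra override syntax; the entry point and stage names may differ in your
# checkout). The ??? fields above must be supplied:
#   python main.py stages=[fine_tuning] \
#     fine_tuning=gpt3/custom_task \
#     fine_tuning.model.data.train_ds.file_names=[/path/to/train.jsonl] \
#     fine_tuning.model.data.train_ds.concat_sampling_probabilities=[1.0] \
#     fine_tuning.model.data.validation_ds.file_names=[/path/to/val.jsonl]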