Commit 45d6d33
Merge pull request #103 from NVIDIA/23.07
23.07 changes
dimapihtar committed Aug 4, 2023
2 parents 59a05d2 + 23b6cda commit 45d6d33
Showing 16 changed files with 642 additions and 62 deletions.
273 changes: 233 additions & 40 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion auto_configurator/conf/config.yaml
@@ -19,7 +19,7 @@ fastertransformer_path: ${auto_configurator_path}/../FasterTransformer
 base_results_dir: ${auto_configurator_path}/results
 data_dir: ${launcher_scripts_path}/data
 
-training_container: nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
 container_mounts:
 - null
 
2 changes: 1 addition & 1 deletion auto_configurator/tests/config_tests/test_main_config.py
@@ -26,7 +26,7 @@ def test_config(self):
 base_results_dir: ${auto_configurator_path}/results
 data_dir: ${launcher_scripts_path}/data
-training_container: nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
 container_mounts:
 - null
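The config test now expects the 23.07 image tag in the rendered configuration. As a rough illustration of the same regression check, the sketch below is a simplified stand-in for the repository's string-comparison test, not the actual test code; it assumes PyYAML is installed and that it is run from the repository root:

# check_container_tag.py -- illustrative only; the real test in
# auto_configurator/tests/config_tests/test_main_config.py compares the whole
# rendered config against an expected string.
import yaml  # assumes PyYAML is available

EXPECTED = "nvcr.io/ea-bignlp/nemofw-training:23.07-py3"

with open("auto_configurator/conf/config.yaml") as f:
    cfg = yaml.safe_load(f)  # Hydra interpolations are loaded as plain strings here

assert cfg["training_container"] == EXPECTED, cfg["training_container"]
print("training_container is pinned to", EXPECTED)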
2 changes: 1 addition & 1 deletion csp_tools/aws/Dockerfile
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+FROM nvcr.io/ea-bignlp/nemofw-training:23.07-py3
 
 ARG EFA_INSTALLER_VERSION=latest
 ARG AWS_OFI_NCCL_VERSION=1.4.0-aws
2 changes: 1 addition & 1 deletion launcher_scripts/conf/config.yaml
@@ -31,7 +31,7 @@ data_dir: ${launcher_scripts_path}/data # Location to store and read the data.
 base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs.
 container_mounts: # List of additional paths to mount to container. They will be mounted to same path.
 - null
-container: nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
 
 wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line.
 
176 changes: 176 additions & 0 deletions launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml
@@ -0,0 +1,176 @@
run:
  name: ${.task_name}_${.model_train_name}
  time_limit: "04:00:00"
  dependency: "singleton"
  convert_name: convert_nemo
  model_train_name: gpt3_sft
  convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name}
  task_name: "custom_task" # Rename this to something more descriptive for your task
  results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name}

trainer:
  devices: 8
  accelerator: gpu
  num_nodes: 1
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: 1
  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10 # frequency with which training steps are logged
  val_check_interval: 300 # if an int n > 1, runs validation every n training steps; if a float in 0.0 - 1.0, runs validation every fraction of an epoch (e.g. 0.25 runs validation every quarter epoch)
  gradient_clip_val: 1.0

exp_manager:
  explicit_log_dir: ${fine_tuning.run.results_dir}/results
  exp_dir: null
  name: megatron_gpt3_${fine_tuning.run.task_name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: nemo_gpt3_${fine_tuning.run.task_name}
    name: ${fine_tuning.run.name}
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: validation_${fine_tuning.model.data.validation_ds.metric.name}
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: True
    filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${fine_tuning.model.tensor_model_parallel_size}, ${fine_tuning.model.pipeline_model_parallel_size}}
    save_best_model: True

model:
  seed: 1234
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  global_batch_size: 8
  micro_batch_size: 1
  restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_gpt.nemo # Path to the converted base .nemo model to fine-tune or run inference with
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training; restores the whole state including the epoch, step, LR schedulers, apex, etc.
  save_nemo_on_validation_end: True # Saves an inference-ready .nemo file every time a checkpoint is saved during training.
  sync_batch_comm: False
  megatron_amp_O2: True

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
  sequence_parallel: False

  ## Activation Checkpointing
  activations_checkpoint_granularity: selective # 'selective' or 'full'
  activations_checkpoint_method: uniform # 'uniform' or 'block'; not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'
  answer_only_loss: True # not used right now
  gradient_as_bucket_view: False
  seq_len_interpolation_factor: null # if null, the base model's value is used; if set, this overrides it
  use_flash_attention: True # if null, the base model's value is used; if set, this overrides it

  hidden_dropout: 0.1
  attention_dropout: 0.1
  ffn_dropout: 0.1

  data:
    chat: False # whether to use chatbot data or not
    train_ds:
      # Example of how to specify paths to multiple datasets
      # file_names:
      #   - /path/to/squad.jsonl
      #   - /path/to/mnli.jsonl
      #   - /path/to/boolq.jsonl
      # Example of how each dataset is formatted
      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
      file_names: ??? # Path to a list of JSONL files corresponding to the source data.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      # Example of how to specify concat_sampling_probabilities
      # concat_sampling_probabilities:
      #   - 0.5
      #   - 0.25
      #   - 0.25
      concat_sampling_probabilities: ??? # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
      context_key: 'input'
      label_key: 'output'
      add_eos: True
      add_sep: False
      add_bos: False
      separate_prompt_and_response_with_newline: True
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: null # fstring to use for the assistant prompt. Example: "Q: {input}\nA: {output}"

    validation_ds:
      file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      names: null # Names of the corresponding datasets used to log metrics.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      context_key: 'input'
      label_key: 'output'
      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for the assistant prompt. Example: "Q: {input}\nA: {output}"

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy', etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null

    test_ds:
      file_names: ${fine_tuning.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      names: null # Names of the corresponding datasets used to log metrics.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      context_key: 'input'
      label_key: 'output'
      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for the assistant prompt. Example: "Q: {input}\nA: {output}"

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy', etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null

  optim:
    name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
    lr: 5e-6
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
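
For reference, train_ds.file_names expects JSONL files whose records carry the keys named by context_key and label_key ('input' and 'output' above), and concat_sampling_probabilities should provide one weight per listed file. A minimal data-preparation sketch follows; the file name and example records are hypothetical, and only the standard library is used:

# make_sft_data.py -- minimal sketch of the JSONL format described in the
# train_ds comments above; the output path and records are hypothetical.
import json

records = [
    {"input": "Q: What is the capital of France?", "output": "Paris"},
    {"input": "Summarize: NeMo provides tools for training large language models.",
     "output": "NeMo is a toolkit for LLM training."},
]

with open("custom_task_train.jsonl", "w") as f:  # hypothetical file name
    for rec in records:
        f.write(json.dumps(rec) + "\n")  # one JSON object per line

# When several files are listed under file_names, concat_sampling_probabilities
# needs one entry per file (e.g. three files -> 0.5, 0.25, 0.25 as in the comment above).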

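To make the batch and parallelism settings above concrete, the worked example below assumes the usual Megatron-style relationships (data-parallel size is world size divided by the tensor- and pipeline-parallel product, and gradient accumulation follows from the global and micro batch sizes); only the consumed_samples formula is stated in the file itself, the rest is an assumption:

# batch_math.py -- worked example for the trainer/model settings above
# (devices=8, num_nodes=1, TP=1, PP=1, global_batch_size=8, micro_batch_size=1).
devices, num_nodes = 8, 1
tensor_parallel, pipeline_parallel = 1, 1
global_batch_size, micro_batch_size = 8, 1

world_size = devices * num_nodes                            # 8 GPUs total
model_parallel_size = tensor_parallel * pipeline_parallel   # matches the ${multiply:...} resolver -> 1
data_parallel_size = world_size // model_parallel_size      # 8
accumulate_grad_batches = global_batch_size // (micro_batch_size * data_parallel_size)  # 1

# consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
samples_per_step = micro_batch_size * data_parallel_size * accumulate_grad_batches
print(model_parallel_size, data_parallel_size, accumulate_grad_batches, samples_per_step)  # 1 8 1 8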
