Commit 45d6d33
Merge pull request #103 from NVIDIA/23.07
23.07 changes
dimapihtar committed Aug 4, 2023
2 parents 59a05d2 + 23b6cda commit 45d6d33
Showing 16 changed files with 642 additions and 62 deletions.
273 changes: 233 additions & 40 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion auto_configurator/conf/config.yaml
@@ -19,7 +19,7 @@ fastertransformer_path: ${auto_configurator_path}/../FasterTransformer
 base_results_dir: ${auto_configurator_path}/results
 data_dir: ${launcher_scripts_path}/data
 
-training_container: nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
 container_mounts:
 - null
 
2 changes: 1 addition & 1 deletion auto_configurator/tests/config_tests/test_main_config.py
@@ -26,7 +26,7 @@ def test_config(self):
 base_results_dir: ${auto_configurator_path}/results
 data_dir: ${launcher_scripts_path}/data
-training_container: nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
 container_mounts:
 - null
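The config test now expects the 23.07 image tag in the rendered configuration. As a rough illustration of the same regression check, the sketch below is a simplified stand-in for the repository's string-comparison test, not the actual test code; it assumes PyYAML is installed and that it is run from the repository root:

# check_container_tag.py -- illustrative only; the real test in
# auto_configurator/tests/config_tests/test_main_config.py compares the whole
# rendered config against an expected string.
import yaml  # assumes PyYAML is available

EXPECTED = "nvcr.io/ea-bignlp/nemofw-training:23.07-py3"

with open("auto_configurator/conf/config.yaml") as f:
    cfg = yaml.safe_load(f)  # Hydra interpolations are loaded as plain strings here

assert cfg["training_container"] == EXPECTED, cfg["training_container"]
print("training_container is pinned to", EXPECTED)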
2 changes: 1 addition & 1 deletion csp_tools/aws/Dockerfile
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+FROM nvcr.io/ea-bignlp/nemofw-training:23.07-py3
 
 ARG EFA_INSTALLER_VERSION=latest
 ARG AWS_OFI_NCCL_VERSION=1.4.0-aws
2 changes: 1 addition & 1 deletion launcher_scripts/conf/config.yaml
@@ -31,7 +31,7 @@ data_dir: ${launcher_scripts_path}/data # Location to store and read the data.
 base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs.
 container_mounts: # List of additional paths to mount to container. They will be mounted to same path.
 - null
-container: nvcr.io/ea-bignlp/nemofw-training:23.05-py3
+container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
 
 wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line.
 
176 changes: 176 additions & 0 deletions launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml
@@ -0,0 +1,176 @@
run:
  name: ${.task_name}_${.model_train_name}
  time_limit: "04:00:00"
  dependency: "singleton"
  convert_name: convert_nemo
  model_train_name: gpt3_sft
  convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name}
  task_name: "custom_task" # Rename this to something more descriptive for your task
  results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name}

trainer:
  devices: 8
  accelerator: gpu
  num_nodes: 1
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: 1
  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10 # frequency with which training steps are logged
  val_check_interval: 300 # if an int n > 1, runs validation every n training steps; if a float in 0.0 - 1.0, runs validation every fraction of an epoch (e.g. 0.25 runs validation every quarter epoch)
  gradient_clip_val: 1.0

exp_manager:
  explicit_log_dir: ${fine_tuning.run.results_dir}/results
  exp_dir: null
  name: megatron_gpt3_${fine_tuning.run.task_name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: nemo_gpt3_${fine_tuning.run.task_name}
    name: ${fine_tuning.run.name}
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: validation_${fine_tuning.model.data.validation_ds.metric.name}
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: True
    filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${fine_tuning.model.tensor_model_parallel_size}, ${fine_tuning.model.pipeline_model_parallel_size}}
    save_best_model: True

model:
  seed: 1234
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  global_batch_size: 8
  micro_batch_size: 1
  restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_gpt.nemo # Path to the converted base .nemo model to fine-tune or run inference with
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training; restores the whole state including the epoch, step, LR schedulers, apex, etc.
  save_nemo_on_validation_end: True # Saves an inference-ready .nemo file every time a checkpoint is saved during training.
  sync_batch_comm: False
  megatron_amp_O2: True

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
  sequence_parallel: False

  ## Activation Checkpointing
  activations_checkpoint_granularity: selective # 'selective' or 'full'
  activations_checkpoint_method: uniform # 'uniform' or 'block'; not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'
  answer_only_loss: True # not used right now
  gradient_as_bucket_view: False
  seq_len_interpolation_factor: null # if null, the base model's value is used; if set, this overrides it
  use_flash_attention: True # if null, the base model's value is used; if set, this overrides it

  hidden_dropout: 0.1
  attention_dropout: 0.1
  ffn_dropout: 0.1

  data:
    chat: False # whether to use chatbot data or not
    train_ds:
      # Example of how to specify paths to multiple datasets
      # file_names:
      #   - /path/to/squad.jsonl
      #   - /path/to/mnli.jsonl
      #   - /path/to/boolq.jsonl
      # Example of how each dataset is formatted
      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
      file_names: ??? # Path to a list of JSONL files corresponding to the source data.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      # Example of how to specify concat_sampling_probabilities
      # concat_sampling_probabilities:
      #   - 0.5
      #   - 0.25
      #   - 0.25
      concat_sampling_probabilities: ??? # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
      context_key: 'input'
      label_key: 'output'
      add_eos: True
      add_sep: False
      add_bos: False
      separate_prompt_and_response_with_newline: True
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: null # fstring to use for the assistant prompt. Example: "Q: {input}\nA: {output}"

    validation_ds:
      file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      names: null # Names of the corresponding datasets used to log metrics.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      context_key: 'input'
      label_key: 'output'
      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for the assistant prompt. Example: "Q: {input}\nA: {output}"

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy', etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null

    test_ds:
      file_names: ${fine_tuning.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      names: null # Names of the corresponding datasets used to log metrics.
      global_batch_size: ${fine_tuning.model.global_batch_size}
      micro_batch_size: ${fine_tuning.model.micro_batch_size}
      shuffle: True
      num_workers: 4
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      context_key: 'input'
      label_key: 'output'
      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for the assistant prompt. Example: "Q: {input}\nA: {output}"

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy', etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null

  optim:
    name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
    lr: 5e-6
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
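
For reference, train_ds.file_names expects JSONL files whose records carry the keys named by context_key and label_key ('input' and 'output' above), and concat_sampling_probabilities should provide one weight per listed file. A minimal data-preparation sketch follows; the file name and example records are hypothetical, and only the standard library is used:

# make_sft_data.py -- minimal sketch of the JSONL format described in the
# train_ds comments above; the output path and records are hypothetical.
import json

records = [
    {"input": "Q: What is the capital of France?", "output": "Paris"},
    {"input": "Summarize: NeMo provides tools for training large language models.",
     "output": "NeMo is a toolkit for LLM training."},
]

with open("custom_task_train.jsonl", "w") as f:  # hypothetical file name
    for rec in records:
        f.write(json.dumps(rec) + "\n")  # one JSON object per line

# When several files are listed under file_names, concat_sampling_probabilities
# needs one entry per file (e.g. three files -> 0.5, 0.25, 0.25 as in the comment above).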

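To make the batch and parallelism settings above concrete, the worked example below assumes the usual Megatron-style relationships (data-parallel size is world size divided by the tensor- and pipeline-parallel product, and gradient accumulation follows from the global and micro batch sizes); only the consumed_samples formula is stated in the file itself, the rest is an assumption:

# batch_math.py -- worked example for the trainer/model settings above
# (devices=8, num_nodes=1, TP=1, PP=1, global_batch_size=8, micro_batch_size=1).
devices, num_nodes = 8, 1
tensor_parallel, pipeline_parallel = 1, 1
global_batch_size, micro_batch_size = 8, 1

world_size = devices * num_nodes                            # 8 GPUs total
model_parallel_size = tensor_parallel * pipeline_parallel   # matches the ${multiply:...} resolver -> 1
data_parallel_size = world_size // model_parallel_size      # 8
accumulate_grad_batches = global_batch_size // (micro_batch_size * data_parallel_size)  # 1

# consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
samples_per_step = micro_batch_size * data_parallel_size * accumulate_grad_batches
print(model_parallel_size, data_parallel_size, accumulate_grad_batches, samples_per_step)  # 1 8 1 8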
