From d2318a1387734810bf0fe0252b7f1a5c75e05570 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 16 Jul 2025 17:23:28 -0500 Subject: [PATCH 1/2] YAdd training v5.1 files --- mlperf_logging/benchmark_meta.py | 14 ++- mlperf_logging/compliance_checker/README.md | 12 +- .../training_5.1.0/closed_common.yaml | 2 +- .../training_5.1.0/closed_flux1.yaml | 77 +++++++++++++ .../training_5.1.0/closed_llama31_405b.yaml | 85 ++++++++++++++ .../training_5.1.0/closed_llama31_8b.yaml | 49 ++++++++ .../training_5.1.0/common.yaml | 4 +- .../training_5.1.0/open_common.yaml | 2 +- .../training_5.1.0/open_flux1.yaml | 32 ++++++ .../training_5.1.0/open_llama31_405b.yaml | 78 +++++++++++++ .../training_5.1.0/open_llama31_8b.yaml | 8 ++ mlperf_logging/mllog/constants.py | 2 + mlperf_logging/rcp_checker/rcp_checker.py | 2 +- .../training_5.1.0/rcps_flux1.json | 65 +++++++++++ .../training_5.1.0/rcps_llama31_405b.json | 106 ++++++++++++++++++ .../training_5.1.0/rcps_llama31_8b.json | 25 +++++ .../visualization_scripts/rcp_viewer.py | 2 +- mlperf_logging/result_summarizer/config.yaml | 4 +- 18 files changed, 555 insertions(+), 14 deletions(-) create mode 100644 mlperf_logging/compliance_checker/training_5.1.0/closed_flux1.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.1.0/open_flux1.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.1.0/open_llama31_405b.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.1.0/open_llama31_8b.yaml create mode 100644 mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json create mode 100644 mlperf_logging/rcp_checker/training_5.1.0/rcps_llama31_405b.json create mode 100644 mlperf_logging/rcp_checker/training_5.1.0/rcps_llama31_8b.json diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index d323050..cf7c3e3 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -20,6 +20,9 @@ 'rgat': 10, 'llama2_70b_lora': 10, 'llama31_405b': 3, + # TODO: Update with official values + 'llama31_8b': 10, + 'flux1': 10, }, 'hpc' : { @@ -143,7 +146,16 @@ 'llama2_70b_lora', 'rgat', 'llama31_405b' - ] + ], + '5.1': [ + 'llama31_8b', + 'dlrm_dcnv2', + 'retinanet', + 'flux1', + 'llama2_70b_lora', + 'rgat', + 'llama31_405b' + ] }, 'hpc': { diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md index d9f3dee..523ce1b 100644 --- a/mlperf_logging/compliance_checker/README.md +++ b/mlperf_logging/compliance_checker/README.md @@ -12,7 +12,7 @@ To check a log file for compliance: By default, 5.1.0 training edition rules are used and the default config is set to `5.1.0/common.yaml`. This config will check all common keys and enqueue benchmark specific config to be checked as well. -Old training editions, still supported are 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0 +Old training editions, still supported are 5.0.0, 4.1.0 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0 To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0. @@ -26,17 +26,19 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_ 5.1.0/closed_common.yaml - the common rules file for closed submissions. 
These rules apply to all benchmarks 5.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks 5.1.0/closed_retinanet.yaml - Per-benchmark rules, closed submissions. - 5.1.0/closed_bert.yaml + 5.1.0/closed_llama31_8b.yaml + 5.1.0/closed_llama31_405b.yaml 5.1.0/closed_dlrm_dcnv2.yaml 5.1.0/closed_rgat.yaml 5.1.0/closed_llama2_70b_lora.yaml - 5.1.0/closed_flux.yaml + 5.1.0/closed_flux1.yaml 5.1.0/open_retinanet.yaml - Per-benchmark rules, open submissions. - 5.1.0/open_bert.yaml + 5.1.0/open_llama31_8b.yaml + 5.1.0/open_llama31_405b.yaml 5.1.0/open_dlrm_dcnv2.yaml 5.1.0/open_rgat.yaml 5.1.0/open_llama2_70b_lora.yaml - 5.1.0/open_flux.yaml + 5.1.0/open_flux1.yaml ### Existing config files for HPC submissions diff --git a/mlperf_logging/compliance_checker/training_5.1.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_5.1.0/closed_common.yaml index 2c49169..8639eeb 100755 --- a/mlperf_logging/compliance_checker/training_5.1.0/closed_common.yaml +++ b/mlperf_logging/compliance_checker/training_5.1.0/closed_common.yaml @@ -2,7 +2,7 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'rgat', 'llama2_70b_lora', 'flux'] " + CHECK: " v['value'] in ['retinanet', 'flux1', 'dlrm_dcnv2', 'llama31_8b', 'rgat', 'llama2_70b_lora', 'llama31_405b'] " POST: " enqueue_config('training_5.1.0/closed_{}.yaml'.format(v['value'])) " - KEY: diff --git a/mlperf_logging/compliance_checker/training_5.1.0/closed_flux1.yaml b/mlperf_logging/compliance_checker/training_5.1.0/closed_flux1.yaml new file mode 100644 index 0000000..984e9d0 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.1.0/closed_flux1.yaml @@ -0,0 +1,77 @@ +# Stable diffusion uses two metrics, FID and CLIP. +# These metrics can be calculated offline, using different scripts +# and logged seperatly. 
Therefore, we create a virtual key +# called aggregated_eval_accuracy, which aggregates +# both metrics into a single log line + +- BEGIN: + CODE: | + from dataclasses import replace + agg_eval_lines = {} + for line in loglines: + if line.key == "eval_accuracy" and 'metric' in line.value['metadata']: + samples_count = line.value['metadata']['samples_count'] + if samples_count not in agg_eval_lines: + new_line = replace(line) # Make a copy + new_line.key = "aggregated_eval_accuracy" + new_line.full_string = "" # Not needed + new_line.lineno = -1 # Not needed + new_line.value = {'value': {'samples_count': samples_count}, 'metadata':{}} + agg_eval_lines[samples_count] = new_line + + agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp) + agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value'] + loglines.extend(agg_eval_lines.values()) + +- KEY: + NAME: global_batch_size + REQ: AT_LEAST_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-08 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0 " + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +# TODO: Update with official metric name +- KEY: + NAME: averaged_validation_loss + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml new file mode 100644 index 0000000..c47fd87 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_405b.yaml @@ -0,0 +1,85 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] * 1152 == s['global_batch_size'] * 8e-5 " + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-05 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: 
EXACTLY_ONE + CHECK: " v['value'] == 5760 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0" + +- KEY: + NAME: init_checkpoint_step + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + diff --git a/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml new file mode 100644 index 0000000..3619827 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml @@ -0,0 +1,49 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +# TODO: Update with official compliance requirements +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_training_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: num_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: start_warmup_step + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_weight_decay_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_5.1.0/common.yaml b/mlperf_logging/compliance_checker/training_5.1.0/common.yaml index cfdd5a6..360854c 100755 --- a/mlperf_logging/compliance_checker/training_5.1.0/common.yaml +++ b/mlperf_logging/compliance_checker/training_5.1.0/common.yaml @@ -107,13 +107,13 @@ NAME: epoch_start REQ: AT_LEAST_ONE_OR(block_start) CHECK: - - "'epoch_num' in v['metadata']" + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" - KEY: NAME: epoch_stop REQ: AT_LEAST_ONE_OR(block_stop) CHECK: - - "'epoch_num' in v['metadata']" + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" # making sure previous eval did print it's accuracy result - KEY: diff --git a/mlperf_logging/compliance_checker/training_5.1.0/open_common.yaml b/mlperf_logging/compliance_checker/training_5.1.0/open_common.yaml index 97abafc..41015a8 100644 --- a/mlperf_logging/compliance_checker/training_5.1.0/open_common.yaml +++ b/mlperf_logging/compliance_checker/training_5.1.0/open_common.yaml @@ -2,5 +2,5 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['retinanet', 'dlrm_dcnv2', 'bert', 'rgat', 'llama2_70b_lora', 'flux'] " + CHECK: " v['value'] in ['retinanet', 'flux1', 'dlrm_dcnv2', 'llama31_8b', 'rgat', 'llama2_70b_lora', 'llama31_405b'] " POST: " enqueue_config('training_5.1.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_5.1.0/open_flux1.yaml b/mlperf_logging/compliance_checker/training_5.1.0/open_flux1.yaml new file mode 100644 index 0000000..19a69fa --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.1.0/open_flux1.yaml @@ -0,0 +1,32 @@ +# Stable diffusion uses two metrics, FID and CLIP. +# These metrics can be calculated offline, using different scripts +# and logged seperatly. 
Therefore, we create a virtual key +# called aggregated_eval_accuracy, which aggregates +# both metrics into a single log line + +- BEGIN: + CODE: | + from dataclasses import replace + agg_eval_lines = {} + for line in loglines: + if line.key == "eval_accuracy" and 'metric' in line.value['metadata']: + samples_count = line.value['metadata']['samples_count'] + if samples_count not in agg_eval_lines: + new_line = replace(line) # Make a copy + new_line.key = "aggregated_eval_accuracy" + new_line.full_string = "" # Not needed + new_line.lineno = -1 # Not needed + new_line.value = {'value': {'samples_count': samples_count}, 'metadata':{}} + agg_eval_lines[samples_count] = new_line + + agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp) + agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value'] + loglines.extend(agg_eval_lines.values()) + +# TODO: Update with official metric name +- KEY: + NAME: averaged_validation_loss + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_5.1.0/open_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_5.1.0/open_llama31_405b.yaml new file mode 100644 index 0000000..0a29e8b --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.1.0/open_llama31_405b.yaml @@ -0,0 +1,78 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 5760 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0" + +- KEY: + NAME: init_checkpoint_step + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + diff --git a/mlperf_logging/compliance_checker/training_5.1.0/open_llama31_8b.yaml b/mlperf_logging/compliance_checker/training_5.1.0/open_llama31_8b.yaml new file mode 100644 index 0000000..ff3f204 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.1.0/open_llama31_8b.yaml @@ -0,0 +1,8 @@ + +# TODO: Update with official compliance requirements +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py index d272c1e..880a814 100644 --- a/mlperf_logging/mllog/constants.py +++ b/mlperf_logging/mllog/constants.py @@ -55,6 +55,8 @@ GNN = "gnn" RGAT = "rgat" LLAMA31_405B = "llama31_405b" +LLAMA31_8B = "llama31_8b" 
+FLUX1 = "flux1" # Constant values - model info ADAGRAD = "adagrad" diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 3806a90..1fb028b 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -441,7 +441,7 @@ def _set_results_scaling(self, scale_factor, results_dir): def _eval_submission_record(self, rcp_record, subm_epochs, results_dir): '''Compare reference and submission convergence.''' - if self.ruleset == "5.0.0" and self.benchmark == "llama31_405b": + if self.ruleset in ["5.0.0", "5.1.0"] and self.benchmark == "llama31_405b": rcp_record['Max Speedup'] = rcp_record['RCP Mean'] / (rcp_record['Min Epochs'] - 46080) subm_epochs.sort() diff --git a/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json b/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json new file mode 100644 index 0000000..e6c0fa8 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json @@ -0,0 +1,65 @@ +{ + "flux1_ref_1024": + { + "Benchmark": "flux1", + "Creator": "", + "When": "", + "Platform": "", + "BS": 1024, + "Hyperparams": { + "opt_adamw_beta_1": 0, + "opt_adamw_beta_2": 0, + "opt_adamw_epsilon": 0, + "opt_adamw_weight_decay": 0, + "opt_base_learning_rate": 0, + "opt_learning_rate_warmup_steps": 0 + }, + "Epochs to converge": [ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 + ] + }, + "flux1_ref_2048": + { + "Benchmark": "flux1", + "Creator": "", + "When": "", + "Platform": "", + "BS": 2048, + "Hyperparams": { + "opt_adamw_beta_1": 0, + "opt_adamw_beta_2": 0, + "opt_adamw_epsilon": 0, + "opt_adamw_weight_decay": 0, + "opt_base_learning_rate": 0, + "opt_learning_rate_warmup_steps": 0 + }, + "Epochs to converge": [ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 + ] + }, + "flux1_ref_4096": + { + "Benchmark": "flux1", + "Creator": "", + "When": "", + "Platform": "", + "BS": 4096, + "Hyperparams": { + "opt_adamw_beta_1": 0, + "opt_adamw_beta_2": 0, + "opt_adamw_epsilon": 0, + "opt_adamw_weight_decay": 0, + "opt_base_learning_rate": 0, + "opt_learning_rate_warmup_steps": 0 + }, + "Epochs to converge": [ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 + ] + } +} \ No newline at end of file diff --git a/mlperf_logging/rcp_checker/training_5.1.0/rcps_llama31_405b.json b/mlperf_logging/rcp_checker/training_5.1.0/rcps_llama31_405b.json new file mode 100644 index 0000000..70adaf5 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_5.1.0/rcps_llama31_405b.json @@ -0,0 +1,106 @@ +{ + "llama31_405b_ref_1008": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.0 submission", + "Platform": "288xDGX-H100", + "BS": 1008, + "Hyperparams": { + "opt_base_learning_rate": 7e-05, + "opt_learning_rate_warmup_steps": 9143, + "gradient_accumulation_steps": 126 + }, + "Epochs to converge": [ + 324576,324576,324576, + 324576,324576,324576 + ] + }, + "llama31_405b_ref_1152": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.0 submission", + "Platform": "288xDGX-H100", + "BS": 1152, + "Hyperparams": { + "opt_base_learning_rate": 8e-05, + "opt_learning_rate_warmup_steps": 8000, + "gradient_accumulation_steps": 144 + }, + "Epochs to converge": [ + 322560,322560,322560, + 322560,322560,322560 + ] + }, + + "llama31_405b_ref_2304": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.0 submission", + "Platform": "288xDGX-H100", + "BS": 
2304, + "Hyperparams": { + "opt_base_learning_rate": 16e-05, + "opt_learning_rate_warmup_steps": 4000, + "gradient_accumulation_steps": 288 + }, + "Epochs to converge": [ + 368640,368640,368640, + 368640,414720,414720 + ] + }, + "llama31_405b_ref_4608": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.0 submission", + "Platform": "288xDGX-H100", + "BS": 4608, + "Hyperparams": { + "opt_base_learning_rate": 32e-05, + "opt_learning_rate_warmup_steps": 2000, + "gradient_accumulation_steps": 576 + }, + "Epochs to converge": [ + 460800,460800,506880, + 506880,506880,506880 + ] + }, + "llama31_405b_ref_6912": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.0 submission", + "Platform": "72xDGX-H100", + "BS": 6912, + "Hyperparams": { + "opt_base_learning_rate": 48e-05, + "opt_learning_rate_warmup_steps": 1334, + "gradient_accumulation_steps": 3456 + }, + "Epochs to converge": [ + 580608,580608,580608, + 628992,628992,628992 + ] + }, + "llama31_405b_ref_9216": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.0 submission", + "Platform": "288xDGX-H100", + "BS": 9216, + "Hyperparams": { + "opt_base_learning_rate": 64e-05, + "opt_learning_rate_warmup_steps": 1000, + "gradient_accumulation_steps": 1152 + }, + "Epochs to converge": [ + 645120,645120,691200, + 691200,737280,737280 + ] + } + } + \ No newline at end of file diff --git a/mlperf_logging/rcp_checker/training_5.1.0/rcps_llama31_8b.json b/mlperf_logging/rcp_checker/training_5.1.0/rcps_llama31_8b.json new file mode 100644 index 0000000..bed1d1c --- /dev/null +++ b/mlperf_logging/rcp_checker/training_5.1.0/rcps_llama31_8b.json @@ -0,0 +1,25 @@ +{ + + "llama31_8b_ref_X": + { + "Benchmark": "", + "Creator": "", + "When": "", + "Platform": "", + "BS": 0, + "Hyperparams": { + "opt_base_learning_rate": 0, + "opt_epsilon": 0, + "opt_learning_rate_training_steps": 0, + "num_warmup_steps": 0, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0, + "opt_lamb_beta_2": 0, + "opt_lamb_weight_decay_rate": 0, + "gradient_accumulation_steps": 0 + }, + "Epochs to converge": [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + } +} diff --git a/mlperf_logging/rcp_checker/visualization_scripts/rcp_viewer.py b/mlperf_logging/rcp_checker/visualization_scripts/rcp_viewer.py index 1c4b968..cba24d7 100755 --- a/mlperf_logging/rcp_checker/visualization_scripts/rcp_viewer.py +++ b/mlperf_logging/rcp_checker/visualization_scripts/rcp_viewer.py @@ -22,7 +22,7 @@ def main(): parser.add_argument('--usage', type=str, default='training', choices=['training', 'hpc'], help="the WG that produced the benchmark") - parser.add_argument('--version', type=str, default='5.0.0', + parser.add_argument('--version', type=str, default='5.1.0', help='what version of the ruleset') parser.add_argument('--verbose', action='store_true') parser.add_argument('--unpruned', action='store_true', diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index e0306fa..897d29a 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -94,12 +94,12 @@ columns: llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"] default: [" ", " ", " "] "5.1.0": - bert: ["Benchmark results (minutes)", "NLP", "Wikipedia", "BERT"] dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"] retinanet: ["Benchmark 
results (minutes)", "Object detection, light-weight", "OpenImages", "RetinaNet"] - flux: ["Benchmark results (minutes)", "Text to image", "CC12M and Coco-2014", "Flux"] + flux1: ["Benchmark results (minutes)", "Text to image", "CC12M and Coco-2014 for eval", "Flux1"] llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] rgat: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"] + llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"] llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"] default: [" ", " ", " "] From 7ccd11ab07c689e4f29c46c348ccfa34ca7d8043 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Thu, 17 Jul 2025 17:09:43 -0500 Subject: [PATCH 2/2] Minor fixes and renames --- mlperf_logging/compliance_checker/README.md | 2 +- .../training_5.1.0/closed_flux.yaml | 41 --------- .../training_5.1.0/closed_flux1.yaml | 21 +---- .../training_5.1.0/open_flux.yaml | 6 -- .../training_5.1.0/open_flux1.yaml | 19 ---- mlperf_logging/rcp_checker/rcp_checker.py | 6 +- .../rcp_checker/training_5.1.0/rcps_flux.json | 65 ------------- .../training_5.1.0/rcps_flux1.json | 92 +++++++++---------- 8 files changed, 51 insertions(+), 201 deletions(-) delete mode 100644 mlperf_logging/compliance_checker/training_5.1.0/closed_flux.yaml delete mode 100644 mlperf_logging/compliance_checker/training_5.1.0/open_flux.yaml delete mode 100644 mlperf_logging/rcp_checker/training_5.1.0/rcps_flux.json diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md index 523ce1b..48c6ed5 100644 --- a/mlperf_logging/compliance_checker/README.md +++ b/mlperf_logging/compliance_checker/README.md @@ -12,7 +12,7 @@ To check a log file for compliance: By default, 5.1.0 training edition rules are used and the default config is set to `5.1.0/common.yaml`. This config will check all common keys and enqueue benchmark specific config to be checked as well. -Old training editions, still supported are 5.0.0, 4.1.0 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0 +Old training editions, still supported are 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0 To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0. 
diff --git a/mlperf_logging/compliance_checker/training_5.1.0/closed_flux.yaml b/mlperf_logging/compliance_checker/training_5.1.0/closed_flux.yaml deleted file mode 100644 index d0ed330..0000000 --- a/mlperf_logging/compliance_checker/training_5.1.0/closed_flux.yaml +++ /dev/null @@ -1,41 +0,0 @@ -- KEY: - NAME: global_batch_size - REQ: EXACTLY_ONE - POST: > - s['global_batch_size'] = v['value'] - - -- KEY: - NAME: opt_learning_rate_warmup_steps - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_base_learning_rate - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_gradient_clip_norm - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_adamw_weight_decay - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_adamw_epsilon - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_adamw_beta_1 - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_adamw_beta_2 - REQ: EXACTLY_ONE - -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'samples_count' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 0.6) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_5.1.0/closed_flux1.yaml b/mlperf_logging/compliance_checker/training_5.1.0/closed_flux1.yaml index 984e9d0..23955bc 100644 --- a/mlperf_logging/compliance_checker/training_5.1.0/closed_flux1.yaml +++ b/mlperf_logging/compliance_checker/training_5.1.0/closed_flux1.yaml @@ -4,25 +4,6 @@ # called aggregated_eval_accuracy, which aggregates # both metrics into a single log line -- BEGIN: - CODE: | - from dataclasses import replace - agg_eval_lines = {} - for line in loglines: - if line.key == "eval_accuracy" and 'metric' in line.value['metadata']: - samples_count = line.value['metadata']['samples_count'] - if samples_count not in agg_eval_lines: - new_line = replace(line) # Make a copy - new_line.key = "aggregated_eval_accuracy" - new_line.full_string = "" # Not needed - new_line.lineno = -1 # Not needed - new_line.value = {'value': {'samples_count': samples_count}, 'metadata':{}} - agg_eval_lines[samples_count] = new_line - - agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp) - agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value'] - loglines.extend(agg_eval_lines.values()) - - KEY: NAME: global_batch_size REQ: AT_LEAST_ONE @@ -70,7 +51,7 @@ # TODO: Update with official metric name - KEY: - NAME: averaged_validation_loss + NAME: eval_accuracy REQ: AT_LEAST_ONE CHECK: - "'epoch_num' in v['metadata']" diff --git a/mlperf_logging/compliance_checker/training_5.1.0/open_flux.yaml b/mlperf_logging/compliance_checker/training_5.1.0/open_flux.yaml deleted file mode 100644 index f732825..0000000 --- a/mlperf_logging/compliance_checker/training_5.1.0/open_flux.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'samples_count' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 0.6) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_5.1.0/open_flux1.yaml b/mlperf_logging/compliance_checker/training_5.1.0/open_flux1.yaml index 19a69fa..4144e05 100644 --- a/mlperf_logging/compliance_checker/training_5.1.0/open_flux1.yaml +++ b/mlperf_logging/compliance_checker/training_5.1.0/open_flux1.yaml @@ -4,25 +4,6 @@ # called aggregated_eval_accuracy, which aggregates # both metrics into a single log line -- BEGIN: - CODE: | - from dataclasses import replace - agg_eval_lines = {} - for line in loglines: - if line.key == "eval_accuracy" and 'metric' in line.value['metadata']: - samples_count = line.value['metadata']['samples_count'] 
- if samples_count not in agg_eval_lines: - new_line = replace(line) # Make a copy - new_line.key = "aggregated_eval_accuracy" - new_line.full_string = "" # Not needed - new_line.lineno = -1 # Not needed - new_line.value = {'value': {'samples_count': samples_count}, 'metadata':{}} - agg_eval_lines[samples_count] = new_line - - agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp) - agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value'] - loglines.extend(agg_eval_lines.values()) - # TODO: Update with official metric name - KEY: NAME: averaged_validation_loss diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 1fb028b..c08f4ea 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -32,7 +32,7 @@ 'gnn': 10, 'rgat': 10, 'llama2_70b_lora': 10, - 'flux': 10, + 'flux1': 10, 'llama31_405b': 3, }, "hpc": { @@ -83,7 +83,7 @@ def read_submission_file(result_file, ruleset, use_train_samples): eval_metric = json.loads(eval_accuracy_str)["metadata"]["metric"] eval_score = json.loads(eval_accuracy_str)["value"] stable_diffusion_eval_results[eval_step][eval_metric] = eval_score - elif benchmark in {"llama2_70b_lora", "flux", "llama31_405b"} and ("eval_error" in str or "eval_accuracy" in str): + elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b"} and ("eval_error" in str or "eval_accuracy" in str): eval_accuracy_str = str conv_epoch = json.loads(eval_accuracy_str)["metadata"]["samples_count"] eval_score = json.loads(eval_accuracy_str)["value"] @@ -210,7 +210,7 @@ def _process_raw_rcp_data(self, raw_rcp_data): ''' processed_rcps = {} for record, record_contents in raw_rcp_data.items(): - conv_unit = "samples to converge" if record_contents['Benchmark'] in ['llama2_70b_lora', 'flux'] else "Epochs to converge" + conv_unit = "samples to converge" if record_contents['Benchmark'] in ['llama2_70b_lora', 'flux1'] else "Epochs to converge" processed_record = {'Benchmark': record_contents['Benchmark'], 'BS': record_contents['BS'], 'Hyperparams': record_contents['Hyperparams'], diff --git a/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux.json b/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux.json deleted file mode 100644 index ba15ef9..0000000 --- a/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux.json +++ /dev/null @@ -1,65 +0,0 @@ -{ - "flux_ref_1024": { - "Benchmark": "flux", - "Creator": "NVIDIA", - "When": "Reference RCPs before v5.1", - "Platform": "8xDGX-B200", - "BS": 1024, - "Hyperparams": { - "opt_adamw_beta_1": 0.9, - "opt_adamw_beta_2": 0.95, - "opt_adamw_epsilon": 1e-8, - "opt_adamw_weight_decay": 0.1, - "opt_base_learning_rate": 2.0e-4, - "opt_learning_rate_warmup_steps": 0, - "opt_gradient_clip_norm": 1.0 - }, - "samples to converge": [ - 8912896, 8650752, 9437184, 8126464, 8388608, 9175040, 8650752, 8126464, - 8388608, 9961472, 7864320, 8126464, 9699328, 8650752, 9437184, 8912896, - 8388608, 9175040, 8126464, 9175040 - ] - }, - "flux_ref_2048": { - "Benchmark": "flux", - "Creator": "NVIDIA", - "When": "Reference RCPs before v5.1", - "Platform": "8xDGX-B200", - "BS": 2048, - "Hyperparams": { - "opt_adamw_beta_1": 0.9, - "opt_adamw_beta_2": 0.95, - "opt_adamw_epsilon": 1e-8, - "opt_adamw_weight_decay": 0.1, - "opt_base_learning_rate": 2.5e-4, - "opt_learning_rate_warmup_steps": 0, - "opt_gradient_clip_norm": 1.0 - }, - "samples to converge": [ - 11272192, 10223616, 11534336, 10747904, 
9699328, 10485760, 11010048, - 10223616, 11796480, 10485760, 10747904, 11272192, 9699328, 10485760, - 11534336, 9961472, 10485760, 10485760, 11272192, 11272192 - ] - }, - "flux_ref_4096": { - "Benchmark": "flux", - "Creator": "NVIDIA", - "When": "Reference RCPs before v5.1", - "Platform": "8xDGX-B200", - "BS": 4096, - "Hyperparams": { - "opt_adamw_beta_1": 0.9, - "opt_adamw_beta_2": 0.95, - "opt_adamw_epsilon": 1e-8, - "opt_adamw_weight_decay": 0.1, - "opt_base_learning_rate": 4.0e-4, - "opt_learning_rate_warmup_steps": 100, - "opt_gradient_clip_norm": 1.0 - }, - "samples to converge": [ - 15466496, 15728640, 15990784, 15466496, 15728640, 15466496, 14942208, - 14680064, 15728640, 15990784, 15990784, 15728640, 15728640, 16252928, - 14942208, 15728640, 16252928, 15204352, 16515072, 14942208 - ] - } -} diff --git a/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json b/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json index e6c0fa8..3fb9815 100644 --- a/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json +++ b/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json @@ -1,65 +1,65 @@ { - "flux1_ref_1024": - { + "flux_ref_1024": { "Benchmark": "flux1", - "Creator": "", - "When": "", - "Platform": "", + "Creator": "NVIDIA", + "When": "Reference RCPs before v5.1", + "Platform": "8xDGX-B200", "BS": 1024, "Hyperparams": { - "opt_adamw_beta_1": 0, - "opt_adamw_beta_2": 0, - "opt_adamw_epsilon": 0, - "opt_adamw_weight_decay": 0, - "opt_base_learning_rate": 0, - "opt_learning_rate_warmup_steps": 0 + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 2.0e-4, + "opt_learning_rate_warmup_steps": 0, + "opt_gradient_clip_norm": 1.0 }, - "Epochs to converge": [ - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0 + "samples to converge": [ + 8912896, 8650752, 9437184, 8126464, 8388608, 9175040, 8650752, 8126464, + 8388608, 9961472, 7864320, 8126464, 9699328, 8650752, 9437184, 8912896, + 8388608, 9175040, 8126464, 9175040 ] }, - "flux1_ref_2048": - { + "flux_ref_2048": { "Benchmark": "flux1", - "Creator": "", - "When": "", - "Platform": "", + "Creator": "NVIDIA", + "When": "Reference RCPs before v5.1", + "Platform": "8xDGX-B200", "BS": 2048, "Hyperparams": { - "opt_adamw_beta_1": 0, - "opt_adamw_beta_2": 0, - "opt_adamw_epsilon": 0, - "opt_adamw_weight_decay": 0, - "opt_base_learning_rate": 0, - "opt_learning_rate_warmup_steps": 0 + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 2.5e-4, + "opt_learning_rate_warmup_steps": 0, + "opt_gradient_clip_norm": 1.0 }, - "Epochs to converge": [ - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0 + "samples to converge": [ + 11272192, 10223616, 11534336, 10747904, 9699328, 10485760, 11010048, + 10223616, 11796480, 10485760, 10747904, 11272192, 9699328, 10485760, + 11534336, 9961472, 10485760, 10485760, 11272192, 11272192 ] }, - "flux1_ref_4096": - { + "flux_ref_4096": { "Benchmark": "flux1", - "Creator": "", - "When": "", - "Platform": "", + "Creator": "NVIDIA", + "When": "Reference RCPs before v5.1", + "Platform": "8xDGX-B200", "BS": 4096, "Hyperparams": { - "opt_adamw_beta_1": 0, - "opt_adamw_beta_2": 0, - "opt_adamw_epsilon": 0, - "opt_adamw_weight_decay": 0, - "opt_base_learning_rate": 0, - "opt_learning_rate_warmup_steps": 0 + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 
0.1, + "opt_base_learning_rate": 4.0e-4, + "opt_learning_rate_warmup_steps": 100, + "opt_gradient_clip_norm": 1.0 }, - "Epochs to converge": [ - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0 + "samples to converge": [ + 15466496, 15728640, 15990784, 15466496, 15728640, 15466496, 14942208, + 14680064, 15728640, 15990784, 15990784, 15728640, 15728640, 16252928, + 14942208, 15728640, 16252928, 15204352, 16515072, 14942208 ] } -} \ No newline at end of file +}
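
For context on the rcp_checker.py change earlier in this series (extending the llama31_405b max-speedup relaxation from ruleset 5.0.0 to 5.1.0), here is a small worked example of the formula using the BS=1152 reference from rcps_llama31_405b.json. It assumes RCP Mean and Min Epochs are taken directly from the listed convergence points; the actual checker may additionally prune outliers before computing them.

    # Illustrative only: values come from the "llama31_405b_ref_1152" RCP entry.
    convergence_points = [322560] * 6          # samples to converge at BS = 1152
    rcp_mean = sum(convergence_points) / len(convergence_points)   # 322560.0
    min_epochs = min(convergence_points)                           # 322560

    # rcp_checker.py, for ruleset in {"5.0.0", "5.1.0"} and benchmark == "llama31_405b":
    max_speedup = rcp_mean / (min_epochs - 46080)                  # ~1.1667

    # i.e. roughly a 1.17x allowed speedup relative to the reference mean
    # convergence for this benchmark.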