Add training v5.1 files #422

Open · wants to merge 2 commits into `master`
14 changes: 13 additions & 1 deletion mlperf_logging/benchmark_meta.py
@@ -20,6 +20,9 @@
'rgat': 10,
'llama2_70b_lora': 10,
'llama31_405b': 3,
# TODO: Update with official values
'llama31_8b': 10,
'flux1': 10,
},

'hpc' : {
@@ -143,7 +146,16 @@
'llama2_70b_lora',
'rgat',
'llama31_405b'
]
],
'5.1': [
'llama31_8b',
'dlrm_dcnv2',
'retinanet',
'flux1',
'llama2_70b_lora',
'rgat',
'llama31_405b'
]
},

'hpc': {
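In `benchmark_meta.py`, the first table maps each benchmark to the number of result files a submission must contain (the new `llama31_8b` and `flux1` entries are placeholders per the TODO), and the second maps each training version to its benchmark list. A minimal sketch of consuming metadata shaped like this — the helper below is hypothetical, not part of `mlperf_logging`:

```python
# Hedged sketch: looks up result counts from metadata shaped like
# benchmark_meta.py. `get_result_count` is a hypothetical helper, not a
# function exported by mlperf_logging.

RESULT_FILE_COUNTS = {
    'training': {
        # ...existing benchmarks...
        'llama31_405b': 3,
        # placeholders in this PR until official values are set
        'llama31_8b': 10,
        'flux1': 10,
    },
}

ALLOWED_BENCHMARKS = {
    'training': {
        '5.1': ['llama31_8b', 'dlrm_dcnv2', 'retinanet', 'flux1',
                'llama2_70b_lora', 'rgat', 'llama31_405b'],
    },
}

def get_result_count(usage: str, benchmark: str) -> int:
    """Return how many result files a benchmark requires."""
    return RESULT_FILE_COUNTS[usage][benchmark]

assert get_result_count('training', 'flux1') == 10
```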
12 changes: 7 additions & 5 deletions mlperf_logging/compliance_checker/README.md
@@ -12,7 +12,7 @@ To check a log file for compliance:

By default, 5.1.0 training edition rules are used and the default config is set to `5.1.0/common.yaml`.
This config will check all common keys and enqueue the benchmark-specific config to be checked as well.
Older training editions are still supported: 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0.
Older training editions are still supported: 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0.

To check HPC compliance rules (only the 1.0.0 ruleset is supported), set `--usage hpc --ruleset 1.0.0`.
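For example, a 5.1.0 training log check would be invoked along these lines (the log path is a placeholder; the flags mirror the `--usage`/`--ruleset` options above):

```
python3 -m mlperf_logging.compliance_checker --usage training --ruleset 5.1.0 path/to/result_0.txt
```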

@@ -26,17 +26,19 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
5.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
5.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
5.1.0/closed_retinanet.yaml - Per-benchmark rules, closed submissions.
5.1.0/closed_bert.yaml
5.1.0/closed_llama31_8b.yaml
5.1.0/closed_llama31_405b.yaml
5.1.0/closed_dlrm_dcnv2.yaml
5.1.0/closed_rgat.yaml
5.1.0/closed_llama2_70b_lora.yaml
5.1.0/closed_flux.yaml
5.1.0/closed_flux1.yaml
5.1.0/open_retinanet.yaml - Per-benchmark rules, open submissions.
5.1.0/open_bert.yaml
5.1.0/open_llama31_8b.yaml
5.1.0/open_llama31_405b.yaml
5.1.0/open_dlrm_dcnv2.yaml
5.1.0/open_rgat.yaml
5.1.0/open_llama2_70b_lora.yaml
5.1.0/open_flux.yaml
5.1.0/open_flux1.yaml

### Existing config files for HPC submissions

mlperf_logging/compliance_checker/training_5.1.0/closed_common.yaml
@@ -2,7 +2,7 @@
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'rgat', 'llama2_70b_lora', 'flux'] "
CHECK: " v['value'] in ['retinanet', 'flux1', 'dlrm_dcnv2', 'llama31_8b', 'rgat', 'llama2_70b_lora', 'llama31_405b'] "
POST: " enqueue_config('training_5.1.0/closed_{}.yaml'.format(v['value'])) "

- KEY:
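The common closed config checks the benchmark name once (`EXACTLY_ONE`) and, on success, its POST enqueues the matching per-benchmark rules file. A simplified sketch of that flow — an illustration of the mechanism, not the checker's actual implementation:

```python
# Hedged sketch of the CHECK/POST mechanism: the checker evaluates each
# rule's CHECK against the logged event `v`, and a POST hook can enqueue
# more config files. `rule_queue` and `check_submission_benchmark` are
# illustrative names, not the checker's real internals.
from collections import deque

rule_queue = deque(['training_5.1.0/closed_common.yaml'])

ALLOWED = ['retinanet', 'flux1', 'dlrm_dcnv2', 'llama31_8b',
           'rgat', 'llama2_70b_lora', 'llama31_405b']

def check_submission_benchmark(v):
    if v['value'] not in ALLOWED:
        return False
    # POST: enqueue the per-benchmark closed rules for this submission
    rule_queue.append('training_5.1.0/closed_{}.yaml'.format(v['value']))
    return True

assert check_submission_benchmark({'value': 'flux1'})
assert rule_queue[-1] == 'training_5.1.0/closed_flux1.yaml'
```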
mlperf_logging/compliance_checker/training_5.1.0/closed_flux1.yaml
@@ -0,0 +1,58 @@
# Stable diffusion uses two metrics, FID and CLIP.
# These metrics can be calculated offline, using different scripts
# and logged separately. Therefore, we create a virtual key
# called aggregated_eval_accuracy, which aggregates
# both metrics into a single log line

- KEY:
NAME: global_batch_size
REQ: AT_LEAST_ONE
CHECK: " v['value'] >= 0 "

- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'adamw' "

- KEY:
NAME: opt_adamw_beta_1
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.9 "

- KEY:
NAME: opt_adamw_beta_2
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.95 "

- KEY:
NAME: opt_adamw_epsilon
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1e-08 "

- KEY:
NAME: opt_adamw_weight_decay
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.1 "

- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0.0 "

- KEY:
NAME: opt_learning_rate_warmup_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0 "

- KEY:
NAME: opt_gradient_clip_norm
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1.0 "

# TODO: Update with official metric name
- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0"
mlperf_logging/compliance_checker/training_5.1.0/closed_llama31_8b.yaml
@@ -0,0 +1,85 @@
- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE
POST: >
s['global_batch_size'] = v['value']

- KEY:
NAME: max_sequence_length
REQ: EXACTLY_ONE
CHECK: " v['value'] == 8192 "

- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'adamw' "

- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE
CHECK: " v['value'] * 1152 == s['global_batch_size'] * 8e-5 "

- KEY:
NAME: opt_end_learning_rate
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_decay_steps
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_warmup_steps
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_decay_schedule
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'cosine with linear warmup' "

- KEY:
NAME: opt_adamw_beta_1
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.9 "

- KEY:
NAME: opt_adamw_beta_2
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.95 "

- KEY:
NAME: opt_adamw_epsilon
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1e-05 "

- KEY:
NAME: opt_adamw_weight_decay
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.1 "

- KEY:
NAME: opt_gradient_clip_norm
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1.0 "

- KEY:
NAME: gradient_accumulation_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0 "

- KEY:
NAME: eval_samples
REQ: EXACTLY_ONE
CHECK: " v['value'] == 5760 "

- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'samples_count' in v['metadata']"
ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0"

- KEY:
NAME: init_checkpoint_step
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0 "

@@ -4,38 +4,46 @@
POST: >
s['global_batch_size'] = v['value']

# TODO: Update with official compliance requirements
- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_warmup_steps
NAME: opt_lamb_epsilon
REQ: EXACTLY_ONE

- KEY:
NAME: opt_base_learning_rate
NAME: opt_learning_rate_training_steps
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_warmup_steps
REQ: EXACTLY_ONE

- KEY:
NAME: opt_gradient_clip_norm
NAME: num_warmup_steps
REQ: EXACTLY_ONE

- KEY:
NAME: opt_adamw_weight_decay
NAME: start_warmup_step
REQ: EXACTLY_ONE

- KEY:
NAME: opt_adamw_epsilon
NAME: opt_lamb_beta_1
REQ: EXACTLY_ONE

- KEY:
NAME: opt_adamw_beta_1
NAME: opt_lamb_beta_2
REQ: EXACTLY_ONE

- KEY:
NAME: opt_adamw_beta_2
NAME: opt_lamb_weight_decay_rate
REQ: EXACTLY_ONE

- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'samples_count' in v['metadata']"
ATLEAST_ONE_CHECK: "(v['value'] <= 0.6) and v['value'] > 0.0"
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0"
mlperf_logging/compliance_checker/training_5.1.0/common.yaml
@@ -107,13 +107,13 @@
NAME: epoch_start
REQ: AT_LEAST_ONE_OR(block_start)
CHECK:
- "'epoch_num' in v['metadata']"
- "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"

- KEY:
NAME: epoch_stop
REQ: AT_LEAST_ONE_OR(block_stop)
CHECK:
- "'epoch_num' in v['metadata']"
- "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"

# making sure the previous eval did print its accuracy result
- KEY:
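The relaxed rules above let `epoch_start`/`epoch_stop` carry either an `epoch_num` or a `samples_count` in their metadata (benchmarks such as llama31_8b report progress in samples rather than epochs, as its `eval_accuracy` rule shows). A hedged sketch of both logging styles with `mllog`; the keys shown are the ones checked here, the values are illustrative:

```python
# Hedged sketch: either metadata form below satisfies the relaxed check
# ('epoch_num' in metadata) | ('samples_count' in metadata).
from mlperf_logging import mllog

mllogger = mllog.get_mllogger()

# Epoch-based benchmarks keep logging epoch_num:
mllogger.start(key='epoch_start', metadata={'epoch_num': 1})

# Sample-count-based benchmarks may log samples_count instead:
mllogger.start(key='epoch_start', metadata={'samples_count': 4096})
```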
mlperf_logging/compliance_checker/training_5.1.0/open_common.yaml
@@ -2,5 +2,5 @@
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['retinanet', 'dlrm_dcnv2', 'bert', 'rgat', 'llama2_70b_lora', 'flux'] "
CHECK: " v['value'] in ['retinanet', 'flux1', 'dlrm_dcnv2', 'llama31_8b', 'rgat', 'llama2_70b_lora', 'llama31_405b'] "
POST: " enqueue_config('training_5.1.0/open_{}.yaml'.format(v['value'])) "

This file was deleted.

mlperf_logging/compliance_checker/training_5.1.0/open_flux1.yaml
@@ -0,0 +1,13 @@
# Stable diffusion uses two metrics, FID and CLIP.
# These metrics can be calculated offline, using different scripts
# and logged separately. Therefore, we create a virtual key
# called aggregated_eval_accuracy, which aggregates
# both metrics into a single log line

# TODO: Update with official metric name
- KEY:
NAME: averaged_validation_loss
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0"
mlperf_logging/compliance_checker/training_5.1.0/open_llama31_8b.yaml
@@ -0,0 +1,78 @@
- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE
POST: >
s['global_batch_size'] = v['value']

- KEY:
NAME: max_sequence_length
REQ: EXACTLY_ONE
CHECK: " v['value'] == 8192 "

- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'adamw' "

- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE

- KEY:
NAME: opt_end_learning_rate
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_decay_steps
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_warmup_steps
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_decay_schedule
REQ: EXACTLY_ONE

- KEY:
NAME: opt_adamw_beta_1
REQ: EXACTLY_ONE

- KEY:
NAME: opt_adamw_beta_2
REQ: EXACTLY_ONE

- KEY:
NAME: opt_adamw_epsilon
REQ: EXACTLY_ONE

- KEY:
NAME: opt_adamw_weight_decay
REQ: EXACTLY_ONE

- KEY:
NAME: opt_gradient_clip_norm
REQ: EXACTLY_ONE

- KEY:
NAME: gradient_accumulation_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0 "

- KEY:
NAME: eval_samples
REQ: EXACTLY_ONE
CHECK: " v['value'] == 5760 "

- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0"

- KEY:
NAME: init_checkpoint_step
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0 "
