diff --git a/script/get-dlrm-data-mlperf-inference/_cm.yaml b/script/get-dlrm-data-mlperf-inference/_cm.yaml
index 9a46c31a6..ab0e46e8b 100644
--- a/script/get-dlrm-data-mlperf-inference/_cm.yaml
+++ b/script/get-dlrm-data-mlperf-inference/_cm.yaml
@@ -16,6 +16,16 @@ new_env_keys:
   - DLRM_DATA_PATH
 input_mapping:
   dlrm_data_path: CM_DLRM_DATA_PATH
+  criteo_day23_raw_data_path: CM_CRITEO_DAY23_RAW_DATA_PATH
+prehook_deps:
+  - tags: get,ml-model,dlrm,_pytorch
+    enable_if_env:
+      CM_DLRM_MODEL_DOWNLOAD:
+        - "on"
+  - tags: get,dataset,preprocessed,criteo,_mlc
+    enable_if_env:
+      CM_DLRM_DATASET_DOWNLOAD:
+        - "on"
 variations:
   nvidia:
     group: implementation
diff --git a/script/get-dlrm-data-mlperf-inference/customize.py b/script/get-dlrm-data-mlperf-inference/customize.py
index 1d3799163..1e72e38b3 100644
--- a/script/get-dlrm-data-mlperf-inference/customize.py
+++ b/script/get-dlrm-data-mlperf-inference/customize.py
@@ -8,8 +8,23 @@ def preprocess(i):
     env = i['env']
 
     dlrm_data_path = env.get('CM_DLRM_DATA_PATH', env.get('DLRM_DATA_PATH', ''))
-    if dlrm_data_path == '' or not os.path.exists(dlrm_data_path):
-        return {'return': 1, 'error': f'Please input a valid path as --dlrm_data_path'}
+    if dlrm_data_path == '':
+        print(f'Data path is not given as input through --dlrm_data_path. Using the cache directory:{os.getcwd()} as the data path')
+        dlrm_data_path = os.getcwd()
+    elif not os.path.exists(dlrm_data_path):
+        return {'return':1, 'error':"given dlrm data path does not exists"}
+
+    # creating required folders inside the dlrm data path if not exists
+    # criteo dataset
+    criteo_fp32_path = os.path.join(dlrm_data_path, "criteo", "day23", "fp32")
+    if not os.path.exists(criteo_fp32_path):
+        os.makedirs(criteo_fp32_path)
+
+    # dlrm model
+    model_path = os.path.join(dlrm_data_path, "model")
+    if not os.path.exists(model_path):
+        os.makedirs(model_path)
+
 
     meta = i['meta']
     script_path=i['run_script_input']['path']
@@ -22,46 +37,60 @@ def preprocess(i):
 
     if variation == "nvidia":
         if not os.path.exists(os.path.join(dlrm_data_path, "model")):
-            return {'return': 1, 'error': f'model directory is missing inside {dlrm_data_path}'}
+            print(f'model directory is missing inside {dlrm_data_path}')
+            env['CM_DLRM_MODEL_DOWNLOAD'] = True
         if not os.path.exists(os.path.join(dlrm_data_path, "criteo")):
-            return {'return': 1, 'error': f'criteo directory is missing inside {dlrm_data_path}'}
+            print(f'criteo directory is missing inside {dlrm_data_path}')
+            env['CM_DLRM_DATASET_DOWNLOAD'] = True
         if not os.path.exists(os.path.join(dlrm_data_path, "model", "model_weights")):
-            return {'return': 1, 'error': f'model_weights directory is missing inside {dlrm_data_path}/model'}
+            print(f'model_weights directory is missing inside {dlrm_data_path}/model')
+            env['CM_DLRM_MODEL_DOWNLOAD'] = True
         if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23")):
-            return {'return': 1, 'error': f'day23 directory is missing inside {dlrm_data_path}/day23'}
+            print(f'day23 directory is missing inside {dlrm_data_path}/criteo')  # day23 lives under criteo/
+            env['CM_DLRM_DATASET_DOWNLOAD'] = True
         if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32")):
-            return {'return': 1, 'error': f'fp32 directory is missing inside {dlrm_data_path}/day23'}
-
+            print(f'fp32 directory is missing inside {dlrm_data_path}/criteo/day23')
+            env['CM_DLRM_DATASET_DOWNLOAD'] = True
         if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot.npz")) and not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot_unpacked")):
-            return {'return': 1, 'error': f'day_23_sparse_multi_hot.npz is missing inside {dlrm_data_path}/criteo/day23/fp32'}
+            print(f'day_23_sparse_multi_hot.npz or day_23_sparse_multi_hot_unpacked is missing inside {dlrm_data_path}/criteo/day23/fp32')
+            env['CM_DLRM_DATASET_DOWNLOAD'] = True
         if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_dense.npy")):
-            return {'return': 1, 'error': f'day_23_dense.npy is missing inside {dlrm_data_path}/criteo/day23/fp32'}
+            print(f'day_23_dense.npy is missing inside {dlrm_data_path}/criteo/day23/fp32')
+            env['CM_DLRM_DATASET_DOWNLOAD'] = True
         if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_labels.npy")):
-            return {'return': 1, 'error': f'day_23_labels.npy is missing inside {dlrm_data_path}/criteo/day23/fp32'}
+            print(f'day_23_labels.npy is missing inside {dlrm_data_path}/criteo/day23/fp32')
+            env['CM_DLRM_DATASET_DOWNLOAD'] = True
         if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "raw_data")):
-            return {'return': 1, 'error': f'raw_data is missing inside {dlrm_data_path}/criteo/day23'}
-
+            if env.get('CM_CRITEO_DAY23_RAW_DATA_PATH', '') == '':
+                return {'return':1, 'error':f'Raw data missing inside {dlrm_data_path}/criteo/day23. Specify the target folder through input mapping(--criteo_day23_raw_data_path="path to raw criteo dataset")'}  # f-string so the path is interpolated
 
-    if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot_unpacked")):
-        os.system(f"unzip {os.path.join(dlrm_data_path, 'criteo', 'day23', 'fp32', 'day_23_sparse_multi_hot.npz')} -d {os.path.join(dlrm_data_path, 'criteo', 'day23', 'fp32', 'day_23_sparse_multi_hot_unpacked')}")
-
-    xsep = ' && '
     run_cmd = ''
-    if os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot.npz")):
-        file_path = os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot.npz")
-        run_cmd = ("echo {} {} | md5sum -c").format('c46b7e31ec6f2f8768fa60bdfc0f6e40', file_path)
-
-    if run_cmd != '':
-        run_cmd += xsep
+    xsep = ' && '
+    # addition of run command to download the datasets and model
+    if env.get('CM_DLRM_DATASET_DOWNLOAD', False) == True:
+        run_cmd += 'cp -r "$CM_CRITEO_PREPROCESSED_PATH"/. ' + os.path.join(dlrm_data_path,"criteo","day23","fp32") + xsep
+    if env.get('CM_DLRM_MODEL_DOWNLOAD', False) == True:
+        run_cmd += 'cp -r "$CM_ML_MODEL_FILE_WITH_PATH"/. ' + os.path.join(dlrm_data_path, "model") + xsep
+
+    if env.get('CM_DLRM_DATASET_DOWNLOAD', '') != True:
+        if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot_unpacked")):
+            os.system(f"unzip {os.path.join(dlrm_data_path, 'criteo', 'day23', 'fp32', 'day_23_sparse_multi_hot.npz')} -d {os.path.join(dlrm_data_path, 'criteo', 'day23', 'fp32', 'day_23_sparse_multi_hot_unpacked')}")
+    else:
+        run_cmd += f"unzip {os.path.join(dlrm_data_path, 'criteo', 'day23', 'fp32', 'day_23_sparse_multi_hot.npz')} -d {os.path.join(dlrm_data_path, 'criteo', 'day23', 'fp32', 'day_23_sparse_multi_hot_unpacked')}" + xsep
+
+    if os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot.npz")) or env.get('CM_DLRM_DATASET_DOWNLOAD', False) == True:  # .get(): the flag is unset when nothing was missing
+        file_path = os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot.npz")
+        run_cmd += ("echo {} {} | md5sum -c").format('c46b7e31ec6f2f8768fa60bdfc0f6e40', file_path) + xsep
+
 
     file_path = os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_dense.npy")
-    run_cmd += ("echo {} {} | md5sum -c").format('cdf7af87cbc7e9b468c0be46b1767601', file_path)
+    run_cmd += ("echo {} {} | md5sum -c").format('cdf7af87cbc7e9b468c0be46b1767601', file_path) + xsep
 
     file_path = os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_labels.npy")
-    run_cmd += xsep + ("echo {} {} | md5sum -c").format('dd68f93301812026ed6f58dfb0757fa7', file_path)
+    run_cmd += ("echo {} {} | md5sum -c").format('dd68f93301812026ed6f58dfb0757fa7', file_path) + xsep
 
     dir_path = os.path.join(dlrm_data_path, "criteo", "day23", "fp32")
-    run_cmd += xsep + ("cd {}; md5sum -c {}").format(dir_path, os.path.join(script_path, "checksums.txt" ))
+    run_cmd += ("cd {}; md5sum -c {}").format(dir_path, os.path.join(script_path, "checksums.txt" ))
 
     env['CM_DLRM_V2_DAY23_FILE_PATH'] = os.path.join(dlrm_data_path, "criteo", "day23", "raw_data")
     env['CM_DLRM_V2_AGGREGATION_TRACE_FILE_PATH'] = os.path.join(dlrm_data_path, "criteo", "day23", "sample_partition.txt")
@@ -74,6 +103,9 @@ def postprocess(i):
 
     env = i['env']
 
-    env['CM_GET_DEPENDENT_CACHED_PATH'] = env.get('CM_DLRM_DATA_PATH', env['DLRM_DATA_PATH'])
+    if env.get('CM_DLRM_DATA_PATH', '') == '' and env.get('DLRM_DATA_PATH', '') == '':
+        env['CM_DLRM_DATA_PATH'] = os.getcwd()
+    else:
+        env['CM_GET_DEPENDENT_CACHED_PATH'] = env.get('CM_DLRM_DATA_PATH') or env['DLRM_DATA_PATH']  # lazy fallback: no KeyError when only CM_DLRM_DATA_PATH is set
 
     return {'return':0}