Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download and organise DLRM data automatically #93

Merged
14 changes: 14 additions & 0 deletions script/get-dlrm-data-mlperf-inference/_cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,20 @@ new_env_keys:
- DLRM_DATA_PATH
input_mapping:
dlrm_data_path: CM_DLRM_DATA_PATH

prehook_deps:
- tags: get,ml-model,dlrm,_pytorch
update_tags_from_env_with_prefix:
download_path: CM_DLRM_DATA_PATH

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_ missing? Also this uses a list right?

enable_if_env:
CM_DLRM_MODEL_DOWNLOAD:
- "on"
- tags: get,dataset,preprocessed,criteo,_mlc
update_tags_from_env_with_prefix:
output_dir: CM_DLRM_DATA_PATH

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is output_dir a tag?

skip_if_env:
CM_DLRM_DATASET_DOWNLOAD:
- "on"
variations:
nvidia:
group: implementation
Expand Down
59 changes: 35 additions & 24 deletions script/get-dlrm-data-mlperf-inference/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ def preprocess(i):

dlrm_data_path = env.get('CM_DLRM_DATA_PATH', env.get('DLRM_DATA_PATH', ''))
if dlrm_data_path == '' or not os.path.exists(dlrm_data_path):
return {'return': 1, 'error': f'Please input a valid path as --dlrm_data_path'}
# No usable --dlrm_data_path was supplied (empty or non-existent, per the
# check just above), so fall back to the current working directory and tell
# the user which directory is being used.
print(f'Data path is not given as input through --dlrm_data_path. Using the cache directory:{os.getcwd()} as the data path')
dlrm_data_path = os.getcwd()

meta = i['meta']

script_path=i['run_script_input']['path']
Expand All @@ -22,40 +24,46 @@ def preprocess(i):

if variation == "nvidia":
if not os.path.exists(os.path.join(dlrm_data_path, "model")):
return {'return': 1, 'error': f'model directory is missing inside {dlrm_data_path}'}
print(f'model directory is missing inside {dlrm_data_path}')
env['CM_DLRM_MODEL_DOWNLOAD'] = True
if not os.path.exists(os.path.join(dlrm_data_path, "criteo")):
return {'return': 1, 'error': f'criteo directory is missing inside {dlrm_data_path}'}
print(f'criteo directory is missing inside {dlrm_data_path}')
env['CM_DLRM_DATASET_DOWNLOAD'] = True
if not os.path.exists(os.path.join(dlrm_data_path, "model", "model_weights")):
return {'return': 1, 'error': f'model_weights directory is missing inside {dlrm_data_path}/model'}
print(f'model_weights directory is missing inside {dlrm_data_path}/model')
env['CM_DLRM_MODEL_DOWNLOAD'] = True
if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23")):
return {'return': 1, 'error': f'day23 directory is missing inside {dlrm_data_path}/day23'}
print(f'day23 directory is missing inside {dlrm_data_path}/day23')
env['CM_DLRM_DATASET_DOWNLOAD'] = True
if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32")):
return {'return': 1, 'error': f'fp32 directory is missing inside {dlrm_data_path}/day23'}

print(f'fp32 directory is missing inside {dlrm_data_path}/criteo/day23')
env['CM_DLRM_DATASET_DOWNLOAD'] = True
if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot.npz")) and not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot_unpacked")):
return {'return': 1, 'error': f'day_23_sparse_multi_hot.npz is missing inside {dlrm_data_path}/criteo/day23/fp32'}
print(f'day_23_sparse_multi_hot.npz or day_23_sparse_multi_hot_unpacked is missing inside {dlrm_data_path}/criteo/day23/fp32')
env['CM_DLRM_DATASET_DOWNLOAD'] = True
if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_dense.npy")):
return {'return': 1, 'error': f'day_23_dense.npy is missing inside {dlrm_data_path}/criteo/day23/fp32'}
print(f'day_23_dense.npy is missing inside {dlrm_data_path}/criteo/day23/fp32')
env['CM_DLRM_DATASET_DOWNLOAD'] = True
if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_labels.npy")):
return {'return': 1, 'error': f'day_23_labels.npy is missing inside {dlrm_data_path}/criteo/day23/fp32'}
print(f'day_23_labels.npy is missing inside {dlrm_data_path}/criteo/day23/fp32')
env['CM_DLRM_DATASET_DOWNLOAD'] = True
if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "raw_data")):
return {'return': 1, 'error': f'raw_data is missing inside {dlrm_data_path}/criteo/day23'}


if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot_unpacked")):
os.system(f"unzip {os.path.join(dlrm_data_path, 'criteo', 'day23', 'fp32', 'day_23_sparse_multi_hot.npz')} -d {os.path.join(dlrm_data_path, 'criteo', 'day23', 'fp32', 'day_23_sparse_multi_hot_unpacked')}")

xsep = ' && '
print(f'raw_data is missing inside {dlrm_data_path}/criteo/day23')
env['CM_DLRM_DATASET_DOWNLOAD'] = True

run_cmd = ''
if os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot.npz")):
file_path = os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot.npz")
run_cmd = ("echo {} {} | md5sum -c").format('c46b7e31ec6f2f8768fa60bdfc0f6e40', file_path)
xsep = ' && '

if run_cmd != '':
run_cmd += xsep
if env['CM_DLRM_DATASET_DOWNLOAD'] != True:
if not os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot_unpacked")):
os.system(f"unzip {os.path.join(dlrm_data_path, 'criteo', 'day23', 'fp32', 'day_23_sparse_multi_hot.npz')} -d {os.path.join(dlrm_data_path, 'criteo', 'day23', 'fp32', 'day_23_sparse_multi_hot_unpacked')}")

if os.path.exists(os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot.npz")) or env['CM_DLRM_DATASET_DOWNLOAD'] == True:
file_path = os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_sparse_multi_hot.npz")
run_cmd = xsep + ("echo {} {} | md5sum -c").format('c46b7e31ec6f2f8768fa60bdfc0f6e40', file_path)

file_path = os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_dense.npy")
run_cmd += ("echo {} {} | md5sum -c").format('cdf7af87cbc7e9b468c0be46b1767601', file_path)
run_cmd += xsep + ("echo {} {} | md5sum -c").format('cdf7af87cbc7e9b468c0be46b1767601', file_path)

file_path = os.path.join(dlrm_data_path, "criteo", "day23", "fp32", "day_23_labels.npy")
run_cmd += xsep + ("echo {} {} | md5sum -c").format('dd68f93301812026ed6f58dfb0757fa7', file_path)
Expand All @@ -74,6 +82,9 @@ def postprocess(i):

env = i['env']

env['CM_GET_DEPENDENT_CACHED_PATH'] = env.get('CM_DLRM_DATA_PATH', env['DLRM_DATA_PATH'])
if env.get('CM_DLRM_DATA_PATH', '') == '' and env.get('DLRM_DATA_PATH', '') == '':
env['CM_DLRM_DATA_PATH'] = os.getcwd()
else:
env['CM_GET_DEPENDENT_CACHED_PATH'] = env.get('CM_DLRM_DATA_PATH', env['DLRM_DATA_PATH'])

return {'return':0}
19 changes: 19 additions & 0 deletions script/get-dlrm-data-mlperf-inference/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,24 @@ function run() {
fi
}

# Copy freshly downloaded artefacts into the DLRM data directory layout.
#
# Destination root: $CM_DLRM_DATA_PATH when it is set and non-empty,
# otherwise the current working directory.
dlrm_data_root="${CM_DLRM_DATA_PATH:-$(pwd)}"

# is_enabled VALUE
# Succeeds when VALUE spells a true flag. customize.py assigns Python `True`
# to the download flags, so accept that spelling as well as the shell-style
# lowercase `true`.
# NOTE(review): confirm how the framework exports boolean env values.
is_enabled() {
  case "$1" in
    true|True|TRUE) return 0 ;;
    *) return 1 ;;
  esac
}

if is_enabled "$CM_DLRM_DATASET_DOWNLOAD"; then
  # The flag is raised precisely when criteo/ or one of its subdirectories is
  # missing, and cp does not create missing parent directories, so create the
  # parent first. Only the parent is created so that cp's destination
  # semantics (copy *as* fp32 when fp32 does not exist) are unchanged.
  mkdir -p "$dlrm_data_root/criteo/day23" && \
    cp -r "$CM_CRITEO_PREPROCESSED_PATH" "$dlrm_data_root/criteo/day23/fp32"
  exit_if_error
fi

if is_enabled "$CM_DLRM_MODEL_DOWNLOAD"; then
  # Make sure the destination root exists before copying the model into it.
  mkdir -p "$dlrm_data_root" && \
    cp -r "$CM_ML_MODEL_FILE_WITH_PATH" "$dlrm_data_root/model"
  exit_if_error
fi


#Add your run commands here...
run "$CM_RUN_CMD"
Loading