From d12deb381a77d820f6d509141696443d5659af53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?=
Date: Tue, 15 Feb 2022 19:48:54 -0500
Subject: [PATCH 01/14] Implement MedPerf Model Hello World

---
 medperf/model/mlcube/mlcube.yaml              | 36 +++++++++++
 .../workspace/additional_files/greetings.csv  |  4 ++
 medperf/model/mlcube/workspace/names.csv      |  4 ++
 .../model/mlcube/workspace/parameters.yaml    |  5 ++
 .../model/mlcube/workspace/predictions.csv    | 13 ++++
 medperf/model/project/Dockerfile              | 29 +++++++++
 medperf/model/project/app.py                  | 61 +++++++++++++++++++
 medperf/model/project/mlcube.py               | 54 ++++++++++++++++
 medperf/model/project/requirements.txt        |  2 +
 9 files changed, 208 insertions(+)
 create mode 100644 medperf/model/mlcube/mlcube.yaml
 create mode 100644 medperf/model/mlcube/workspace/additional_files/greetings.csv
 create mode 100644 medperf/model/mlcube/workspace/names.csv
 create mode 100644 medperf/model/mlcube/workspace/parameters.yaml
 create mode 100644 medperf/model/mlcube/workspace/predictions.csv
 create mode 100644 medperf/model/project/Dockerfile
 create mode 100644 medperf/model/project/app.py
 create mode 100644 medperf/model/project/mlcube.py
 create mode 100644 medperf/model/project/requirements.txt

diff --git a/medperf/model/mlcube/mlcube.yaml b/medperf/model/mlcube/mlcube.yaml
new file mode 100644
index 0000000..a1bed7f
--- /dev/null
+++ b/medperf/model/mlcube/mlcube.yaml
@@ -0,0 +1,36 @@
+name: Hello-World Medperf Model MLCube
+description: MLCommons demonstration MLCube for building models for MedPerf
+authors:
+  - {name: "MLCommons Medical Working Group"}
+
+platform:
+  accelerator_count: 0
+
+docker:
+  # Image name.
+  image: medical-hello-world
+  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
+  build_context: "../project"
+  # Docker file name within docker build context, default is `Dockerfile`.
+  build_file: "Dockerfile"
+
+tasks:
+  # Model MLCubes require only a single task: `infer`.
+  # This task takes input data, as well as configuration parameters
+  # and/or extra artifacts, and generates predictions on the data
+  infer:
+    parameters:
+      inputs: {
+        data_path: names.csv, # Required. Where to find the data to run predictions on
+        parameters_file: parameters.yaml, # Required. Helper file to provide additional arguments. Value MUST be parameters.yaml
+        # If you need any additional files that should
+        # not be included inside the mlcube image,
+        # add them inside `additional_files` folder
+        # E.g. model weights
+
+        # Toy Hello World example
+        greetings: additional_files/greetings.csv
+      }
+      outputs: {
+        output_path: {type: file, default: predictions.csv} # Required. Where to store predictions artifact. Value MUST be predictions.csv (This will probably be an issue)
+      }
\ No newline at end of file
diff --git a/medperf/model/mlcube/workspace/additional_files/greetings.csv b/medperf/model/mlcube/workspace/additional_files/greetings.csv
new file mode 100644
index 0000000..25da071
--- /dev/null
+++ b/medperf/model/mlcube/workspace/additional_files/greetings.csv
@@ -0,0 +1,4 @@
+Hello
+Howdy
+Greetings
+Bonjour
\ No newline at end of file
diff --git a/medperf/model/mlcube/workspace/names.csv b/medperf/model/mlcube/workspace/names.csv
new file mode 100644
index 0000000..e25f6fb
--- /dev/null
+++ b/medperf/model/mlcube/workspace/names.csv
@@ -0,0 +1,4 @@
+First name,Last name
+Adam,Smith
+John,Smith
+Michael,Stevens
\ No newline at end of file
diff --git a/medperf/model/mlcube/workspace/parameters.yaml b/medperf/model/mlcube/workspace/parameters.yaml
new file mode 100644
index 0000000..5221348
--- /dev/null
+++ b/medperf/model/mlcube/workspace/parameters.yaml
@@ -0,0 +1,5 @@
+# Here you can store any key-value arguments that should be easily modifiable
+# by external users. E.g. batch_size
+
+# example argument for Hello World
+uppercase: false
\ No newline at end of file
diff --git a/medperf/model/mlcube/workspace/predictions.csv b/medperf/model/mlcube/workspace/predictions.csv
new file mode 100644
index 0000000..e591d71
--- /dev/null
+++ b/medperf/model/mlcube/workspace/predictions.csv
@@ -0,0 +1,13 @@
+id,greeting
+0,"Hello, Adam Smith"
+1,"Hello, John Smith"
+2,"Hello, Michael Stevens"
+3,"Howdy, Adam Smith"
+4,"Howdy, John Smith"
+5,"Howdy, Michael Stevens"
+6,"Greetings, Adam Smith"
+7,"Greetings, John Smith"
+8,"Greetings, Michael Stevens"
+9,"Bonjour, Adam Smith"
+10,"Bonjour, John Smith"
+11,"Bonjour, Michael Stevens"
diff --git a/medperf/model/project/Dockerfile b/medperf/model/project/Dockerfile
new file mode 100644
index 0000000..8e348f6
--- /dev/null
+++ b/medperf/model/project/Dockerfile
@@ -0,0 +1,29 @@
+FROM ubuntu:18.04
+MAINTAINER MLPerf MLBox Working Group
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    software-properties-common \
+    python3-dev \
+    curl && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN add-apt-repository ppa:deadsnakes/ppa -y && apt-get update
+
+RUN apt-get install python3 -y
+
+RUN apt-get install python3-pip -y
+
+COPY ./requirements.txt project/requirements.txt
+
+RUN pip3 install --upgrade pip
+
+RUN pip3 install --no-cache-dir -r project/requirements.txt
+
+ENV LANG C.UTF-8
+
+COPY . /project
+
+WORKDIR /project
+
+ENTRYPOINT ["python3", "mlcube.py"]
\ No newline at end of file
diff --git a/medperf/model/project/app.py b/medperf/model/project/app.py
new file mode 100644
index 0000000..e7fd35b
--- /dev/null
+++ b/medperf/model/project/app.py
@@ -0,0 +1,61 @@
+# Hello World Script
+#
+# This script is unrelated to the MLCube interface. It could be run
+# independently without issues. It provides the actual implementation
+# of the app.
+import csv
+import argparse
+
+def hello_world(greetings, names, uppercase=False):
+    """Generates a combination of greetings and names
+
+    Args:
+        greetings (List[str]): list of greetings
+        names (List[str]): list of names
+        uppercase (bool): Whether to uppercase the whole greeting
+
+    Returns:
+        List[str]: combinations of greetings and names
+    """
+    full_greetings = []
+
+    for greeting in greetings:
+        for name, last_name in names:
+            full_greeting = f"{greeting}, {name} {last_name}"
+            if uppercase:
+                full_greeting = full_greeting.upper()
+            full_greetings.append(full_greeting)
+
+    return full_greetings
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser("MedPerf Model Hello World Example")
+    parser.add_argument('--names', dest="names", type=str, help="file containing names. CSV expected")
+    parser.add_argument('--uppercase', dest="uppercase", type=bool, help="whether to return uppercase greetings")
+    parser.add_argument('--greetings', dest="greetings", type=str, help="file containing greetings. CSV expected")
+    parser.add_argument('--out', dest="out", type=str, help="file to store resulting greetings")
+
+    args = parser.parse_args()
+
+    names = []
+    greetings = []
+
+    with open(args.names, "r") as f:
+        reader = csv.reader(f)
+        next(reader)  # skip header
+        for row in reader:
+            names.append(row)
+
+    with open(args.greetings, "r") as f:
+        reader = csv.reader(f)
+        for row in reader:
+            greetings.append(row[0])
+
+    full_greetings = hello_world(greetings, names)
+
+    with open(args.out, "w") as f:
+        writer = csv.writer(f)
+        writer.writerow(["id", "greeting"])
+        for idx, full_greeting in enumerate(full_greetings):
+            writer.writerow([idx, full_greeting])
\ No newline at end of file
diff --git a/medperf/model/project/mlcube.py b/medperf/model/project/mlcube.py
new file mode 100644
index 0000000..f73da59
--- /dev/null
+++ b/medperf/model/project/mlcube.py
@@ -0,0 +1,54 @@
+# MLCube Entrypoint
+#
+# This script shows how you can bridge your app with an MLCube interface.
+# MLCubes expect the entrypoint to behave like a CLI, where tasks are
+# commands, and input/output parameters are command-line arguments.
+# You can provide that interface to MLCube in any way you prefer.
+# Here, we show a way that requires minimal intrusion to the original code,
+# by running the application through subprocesses.
+
+import yaml
+import typer
+import subprocess
+
+app = typer.Typer()
+
+def exec_python(cmd: str) -> None:
+    """Execute a python script as a subprocess
+
+    Args:
+        cmd (str): command to run as would be written inside the terminal
+    """
+    splitted_cmd = cmd.split()
+    process = subprocess.Popen(splitted_cmd, cwd=".")
+    process.wait()
+
+@app.command("infer")
+def infer(
+    data_path: str = typer.Option(..., "--data_path"),
+    params_file: str = typer.Option(..., "--parameters_file"),
+    greetings: str = typer.Option(..., "--greetings"),
+    out_path: str = typer.Option(..., "--output_path")
+):
+    """infer task command. This is what gets executed when we run:
+    `mlcube run infer`
+
+    Args:
+        data_path (str): Location of the data to run inference with. Required for Medperf Model MLCubes.
+        params_file (str): Location of the parameters.yaml file. Required for Medperf Model MLCubes.
+        greetings (str): Example of an extra parameter that uses `additional_files`.
+        out_path (str): Location to store prediction results. Required for Medperf Model MLCubes.
+ """ + with open(params_file, "r") as f: + params = yaml.safe_load(f) + + uppercase = params["uppercase"] + cmd = f"python3 app.py --names={data_path} --uppercase={uppercase} --greetings={greetings} --out={out_path}" + exec_python(cmd) + +@app.command("hotfix") +def hotfix(): + pass + +if __name__ == "__main__": + app() \ No newline at end of file diff --git a/medperf/model/project/requirements.txt b/medperf/model/project/requirements.txt new file mode 100644 index 0000000..d56bee7 --- /dev/null +++ b/medperf/model/project/requirements.txt @@ -0,0 +1,2 @@ +pyYAML +typer \ No newline at end of file From 0034513fb3f174071f6c80c0ba7d86f0715ce167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?= Date: Wed, 16 Feb 2022 10:15:04 -0500 Subject: [PATCH 02/14] Pass uppercase to hello_world funciton --- medperf/model/project/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medperf/model/project/app.py b/medperf/model/project/app.py index e7fd35b..bba5e79 100644 --- a/medperf/model/project/app.py +++ b/medperf/model/project/app.py @@ -52,10 +52,10 @@ def hello_world(greetings, names, uppercase=False): for row in reader: greetings.append(row[0]) - full_greetings = hello_world(greetings, names) + full_greetings = hello_world(greetings, names, args.uppercase) with open(args.out, "w") as f: writer = csv.writer(f) writer.writerow(["id", "greeting"]) for idx, full_greeting in enumerate(full_greetings): - writer.writerow([idx, full_greeting]) \ No newline at end of file + writer.writerow([idx, full_greeting]) From 8f6f3683edf5aa8a0dbb03464243e98ff3d78bc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?= Date: Wed, 16 Feb 2022 13:07:07 -0500 Subject: [PATCH 03/14] Create README --- medperf/model/README.md | 125 +++++++++++++++++++++++++++++++ medperf/model/mlcube/mlcube.yaml | 2 +- 2 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 medperf/model/README.md diff --git a/medperf/model/README.md b/medperf/model/README.md new file mode 100644 index 0000000..4685f9d --- /dev/null +++ b/medperf/model/README.md @@ -0,0 +1,125 @@ +# MedPerf's Model MLCube Template +This is a Hello World implementation, following the structure and conventions MedPerf uses to successfully run Models on the platform. + +## Purpose: +At the time of writing, model MLCubes have the only purpose of obtaining predictions on data. This means that we expect all models inside MedPerf to already be trained. + +## How to run: +This is a working template, which means it should work out-of-the-box. Follow the next steps: + +1. Clone the repository +2. cd to the repository + ```bash + cd mlcube_examples + ``` +3. Install mlcube and mlcube-docker + + ```bash + pip install mlcube mlcube-docker + ``` +4. cd to current example's `mlcube` folder + + ```bash + cd medperf/model/mlcube + ``` +5. execute the `infer` task with mlcube + ```bash + mlcube run --task=infer + ``` +6. check resulting predictions + ```bash + cat workspace/predictions.csv + ``` +That's it! You just built and ran a hello-world model mlcube! + +## Contents + +MLCubes usually share a similar folder structure and files. Here's a brief description of the role for the relevant files + +1. __`mlcube/mlcube.yaml`__: + + The `mlcube.yaml` file contains metadata about your model, including its interface. For MedPerf, we require an `infer` function that takes in (at minimum) arguments for `data_path` and `parameters_file` and produces `predictions.csv`. 
You see this definition in the mlcube.yaml file as: + + ```yml + tasks: + # Model MLCubes require only a single task: `infer`. + # This task takes input data, as well as configuration parameters + # and/or extra artifacts, and generates predictions on the data + infer: + parameters: + inputs: { + data_path: names.csv, # Required. Where to find the data to run predictions on + parameters_file: parameters.yaml, # Required. Helper file to provide additional arguments. Value MUST be parameters.yaml + # If you need any additional files that should + # not be included inside the mlcube image, + # add them inside `additional_files` folder + # E.g. model weights + + # Toy Hello World example + greetings: additional_files/greetings.csv + } + outputs: { + output_path: {type: file, default: predictions.csv} # Required. Where to store predictions artifact. Value MUST be predictions.csv + } + + ``` + In this case, we’ve added an extra “greetings” argument to our infer function. Note that the default value will always be used. + +2. __`mlcube/workspace/parameters.yaml`__: + + This file provides ways to parameterize your model. You can set any key-value pairs that should be easily modifiable in order to adjust you model's behavior. Current example shows a basic usage case for changing generated Hello world examples to uppercase: + ```yml + # Here you can store any key-value arguments that should be easily modifiable + # by external users. E.g. batch_size + + # example argument for Hello World + uppercase: false + ``` + + In real-case scenarios, you may want to enable various parameter settings in your model. For example, perhaps you want to enable patching for large volumes to save on memory consumption, where patches of the volume are inference and the outputs stitched together. To do this, you would include something like “patching: …” in your parameters.yaml file, and then you would register multiple “mlcubes” with different parameters.yaml files. Though we use the term “registering in mlcube”, really you register an mlcube and a parameters.yaml file, such that you have separate registrations for different configurations of your cube. In this way, one registered “mlcube” might be your model with “patching: true”, and another might be “patching: false”. These two registered cubes would share the same image file, and medperf/mlcube will be smart about re-using the downloaded image while downloading each of the parameters.yaml files. In our example, we have implemented one cube, registered it twice, each with a different parameters.yaml file, and our benchmark will now compare our model with patching against our model without patching. + +3. __`mlcube/workspace/additional_files/*`__: + + You may require additional files that should not be packaged inside the mlcube (due to size or usability constrains) like weights. For these cases, we provide an additional folder called `additional_files`. Here, you can provide any other files that should be present at the time of inference. At the time of mlcube registration, this folder must be compressed into a tarball (`.tar.gz`) and hosted somewhere on the web. MedPerf will then be able to download, verify and reposition those files in the expected location for model execution. In order to reference such files, you can provide additional parameters to the `mlcube.yaml` task definition, as we demonstrate with the `greetings` parameter. + + + +4. __`project`__: + + Contains the actual implementation of the mlcube. 
This includes all project-specific code, `Dockerfile` for building docker containers of the project and requirements for running the code. + +5. __`project/mlcube.py`__: + + MLCube expects an entrypoint to the project in order to run the code and the specified tasks. It expects this entrypoint to behave like a CLI, in which each MLCube task (e.g. `infer`) is executed as a subcommand, and each input/output parameter is passed as a CLI argument. An example of the expected interface is: + ```bash + python3 project/mlcube.py infer --data_path= --parameters_file= --greetings= --output_path= + ``` + `mlcube.py` provides such interface for this toy example. As long as you follow such CLI interface, you can implement it however you want. We provide an example that requirems minimal modifications to the original project code, by running any project task through subprocesses. + + #### __What is that “hotfix” function I see in mlcube.py?__ + + In short, it’s benign and there to avoid a potential cli issue, so you can just leave it and forget about it. + + For those who care, when using typer/click for your cli, like we do, you need more than one @app.command, or typer/click will not parse the command-line in the way mlcube expects. This is a silly, known issue that goes away as soon as you have more than one task in your mlcube interface. But since our model cubes currently only have one task, we add an extra, blank typer command to avoid this issue. If you don’t use typer/click, you likely don’t need this dummy command. + +## How to modify +If you want to adjust this template for your own use-case, then the following list serves as a step-by-step guide: +1. Remove demo artifacts from `/mlcube/workspace`: + - `/mlcube/workspace/names.csv` + - `/mlcube/workspace/predictions.csv` + - `/mlcube/workspace/additional_files/greetings.csv` +2. Pass your original code to the `/project` folder (removing `app.py`) +3. Adjust your code and the `/project/mlcube.py` file so that commands point to the respective code and receive the expected arguments +4. Modify `/project/requirements.txt` so that it contains all code dependencies for your project +5. Default `/project/Dockerfile` should suffice, but feel free to add/modify it to work with your needs. As long as it has an entrypoint pointing to `mlcube.py` +6. Inside `/mlcube/workspace` add the data you want your model to use for inference +7. Inside `/mlcube/workspace/additional_files` add any files that are required for model execution (e.g. model weights) +8. Adjust `/mlcube/mlcube.yaml` so that: +9. `data_path` points to the location where you expect data to be +10. `parameters_file` should NOT be modified in any way +11. remove demo `greetings` parameter +12. Add any other required parameters that point to `additional_files` (e.g. model_weights). Naming can be arbitrary, but all files referenced from now on should be contained inside `additional_files` +13. `output_path` should NOT be modified in any way + +## Requirements are negotiable +The required fields in the mlcube task interface show what medperf currently assumes. As we are in alpha, this is a great time to raise concerns or requests about these requirements! Now is the best time for us to make changes. diff --git a/medperf/model/mlcube/mlcube.yaml b/medperf/model/mlcube/mlcube.yaml index a1bed7f..75ec902 100644 --- a/medperf/model/mlcube/mlcube.yaml +++ b/medperf/model/mlcube/mlcube.yaml @@ -21,7 +21,7 @@ tasks: infer: parameters: inputs: { - data_path: names.csv, # Required. 
Where to find the data to run predictions on + data_path: names.csv, # Required. Where to find the data to run predictions on parameters_file: parameters.yaml, # Required. Helper file to provide additional arguments. Value MUST be parameters.yaml # If you need any additional files that should # not be included inside the mlcube image, From d7a10d84ac1244ea6c548dbc307a085169808cc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?= Date: Wed, 16 Feb 2022 15:10:08 -0500 Subject: [PATCH 04/14] Correct README mlcube.yaml instructions --- medperf/model/README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/medperf/model/README.md b/medperf/model/README.md index 4685f9d..f838cdc 100644 --- a/medperf/model/README.md +++ b/medperf/model/README.md @@ -115,11 +115,12 @@ If you want to adjust this template for your own use-case, then the following li 6. Inside `/mlcube/workspace` add the data you want your model to use for inference 7. Inside `/mlcube/workspace/additional_files` add any files that are required for model execution (e.g. model weights) 8. Adjust `/mlcube/mlcube.yaml` so that: -9. `data_path` points to the location where you expect data to be -10. `parameters_file` should NOT be modified in any way -11. remove demo `greetings` parameter -12. Add any other required parameters that point to `additional_files` (e.g. model_weights). Naming can be arbitrary, but all files referenced from now on should be contained inside `additional_files` -13. `output_path` should NOT be modified in any way + 1. metadata such as `name`, `description`, `authors` and `image_name` are correctly assigned. + 2. `data_path` points to the location where you expect data to be inside the `workspace` directory. + 3. `parameters_file` should NOT be modified in any way. + 4. remove demo `greetings` parameter. + 5. Add any other required parameters that point to `additional_files` (e.g. model_weights). Naming can be arbitrary, but all files referenced from now on should be contained inside `additional_files`. + 6. `output_path` should NOT be modified in any way. ## Requirements are negotiable The required fields in the mlcube task interface show what medperf currently assumes. As we are in alpha, this is a great time to raise concerns or requests about these requirements! Now is the best time for us to make changes. 
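The four patches above fully define the model cube's command-line contract. As a quick, illustrative check of that contract, the minimal sketch below drives `project/app.py` directly on throwaway inputs. It is a sketch only, not part of the patch series: it assumes Python 3 is on the PATH and that it is run from the `medperf/model` directory created by PATCH 01, and the sample names and greetings are invented for illustration.

```python
# Illustrative sketch: exercises the app.py CLI described in the README above.
# Assumes the working directory is medperf/model, so project/app.py resolves.
import csv
import os
import subprocess
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    names = os.path.join(tmp, "names.csv")
    greetings = os.path.join(tmp, "greetings.csv")
    out = os.path.join(tmp, "predictions.csv")

    with open(names, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["First name", "Last name"])  # header row, skipped by app.py
        writer.writerow(["Ada", "Lovelace"])          # hypothetical sample entry

    with open(greetings, "w", newline="") as f:
        csv.writer(f).writerow(["Hello"])  # app.py reads the first column of each row

    # Same invocation the MLCube entrypoint builds internally via exec_python
    subprocess.run(
        ["python3", "project/app.py",
         f"--names={names}", f"--greetings={greetings}", f"--out={out}"],
        check=True,
    )

    with open(out) as f:
        print(f.read())  # expected: id,greeting / 0,"Hello, Ada Lovelace"
```

Running it prints the same `id,greeting` artifact shape that `mlcube run --task=infer` produces through the container.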
From ac24dbf8ca6b8abcaf0a473bf9cc73ec98197aab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?=
Date: Wed, 16 Feb 2022 15:52:05 -0500
Subject: [PATCH 05/14] Start working on data preparator template

---
 medperf/data_preparator/mlcube/mlcube.yaml    | 51 ++++++++++++++++++
 .../mlcube/workspace/labels.csv               | 13 +++++
 .../mlcube/workspace/names.txt                |  3 ++
 .../mlcube/workspace/parameters.yaml          |  1 +
 medperf/data_preparator/project/Dockerfile    | 29 ++++++++++
 medperf/data_preparator/project/mlcube.py     | 54 +++++++++++++++++++
 .../data_preparator/project/requirements.txt  |  2 +
 7 files changed, 153 insertions(+)
 create mode 100644 medperf/data_preparator/mlcube/mlcube.yaml
 create mode 100644 medperf/data_preparator/mlcube/workspace/labels.csv
 create mode 100644 medperf/data_preparator/mlcube/workspace/names.txt
 create mode 100644 medperf/data_preparator/mlcube/workspace/parameters.yaml
 create mode 100644 medperf/data_preparator/project/Dockerfile
 create mode 100644 medperf/data_preparator/project/mlcube.py
 create mode 100644 medperf/data_preparator/project/requirements.txt

diff --git a/medperf/data_preparator/mlcube/mlcube.yaml b/medperf/data_preparator/mlcube/mlcube.yaml
new file mode 100644
index 0000000..6ec9603
--- /dev/null
+++ b/medperf/data_preparator/mlcube/mlcube.yaml
@@ -0,0 +1,51 @@
+name: Hello World Medperf Data Preparator Cube
+description: MLCommons demonstration MLCube for building data preparators for MedPerf
+authors:
+  - {name: "MLCommons Medical Working Group"}
+
+platform:
+  accelerator_count: 0
+
+docker:
+  # Image name.
+  image: medical-data-prep-hello-world
+  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
+  build_context: "../project"
+  # Docker file name within docker build context, default is `Dockerfile`.
+  build_file: "Dockerfile"
+
+tasks:
+  prepare:
+    # This task is in charge of transforming the input data into the format
+    # expected by the model cubes.
+    parameters:
+      inputs: {
+        data_path: names.txt, # Required. Value must point to the location of the raw data inside workspace
+        labels_path: labels.csv, # Required. Value must point to the file containing labels for the data
+        parameters_file: parameters.yaml # Required. Value must be `parameters.yaml`
+      }
+      outputs: {
+        output_path: data/ # Required. Indicates where to store the transformed data
+      }
+  sanity_check:
+    # This task ensures that the previously transformed data was transformed correctly.
+    # It runs a set of tests that check the quality of the data. The rigor of those
+    # tests is determined by the cube author.
+    parameters:
+      inputs: {
+        data_path: data/, # Required. Value should be the output of the prepare task
+        parameters_file: parameters.yaml # Required. Value must be `parameters.yaml`
+      }
+  statistics:
+    # This task computes statistics on the prepared dataset. Its purpose is to get a high-level
+    # idea of what is contained inside the data, without providing any specifics of any single entry
+    parameters:
+      inputs: {
+        data_path: data/, # Required. Value should be the output of the prepare task
+        parameters_file: parameters.yaml # Required. Value must be `parameters.yaml`
+      }
+      outputs: {
+        output_path: {
+          type: file, default: statistics.yaml # Required. Value must be `statistics.yaml`
+        }
+      }
\ No newline at end of file
diff --git a/medperf/data_preparator/mlcube/workspace/labels.csv b/medperf/data_preparator/mlcube/workspace/labels.csv
new file mode 100644
index 0000000..e591d71
--- /dev/null
+++ b/medperf/data_preparator/mlcube/workspace/labels.csv
@@ -0,0 +1,13 @@
+id,greeting
+0,"Hello, Adam Smith"
+1,"Hello, John Smith"
+2,"Hello, Michael Stevens"
+3,"Howdy, Adam Smith"
+4,"Howdy, John Smith"
+5,"Howdy, Michael Stevens"
+6,"Greetings, Adam Smith"
+7,"Greetings, John Smith"
+8,"Greetings, Michael Stevens"
+9,"Bonjour, Adam Smith"
+10,"Bonjour, John Smith"
+11,"Bonjour, Michael Stevens"
diff --git a/medperf/data_preparator/mlcube/workspace/names.txt b/medperf/data_preparator/mlcube/workspace/names.txt
new file mode 100644
index 0000000..491910d
--- /dev/null
+++ b/medperf/data_preparator/mlcube/workspace/names.txt
@@ -0,0 +1,3 @@
+Adam Smith Miller
+John Smith Jones
+Michael M. Stevens Taylor
\ No newline at end of file
diff --git a/medperf/data_preparator/mlcube/workspace/parameters.yaml b/medperf/data_preparator/mlcube/workspace/parameters.yaml
new file mode 100644
index 0000000..ef399fe
--- /dev/null
+++ b/medperf/data_preparator/mlcube/workspace/parameters.yaml
@@ -0,0 +1 @@
+num_words: 2
\ No newline at end of file
diff --git a/medperf/data_preparator/project/Dockerfile b/medperf/data_preparator/project/Dockerfile
new file mode 100644
index 0000000..8e348f6
--- /dev/null
+++ b/medperf/data_preparator/project/Dockerfile
@@ -0,0 +1,29 @@
+FROM ubuntu:18.04
+MAINTAINER MLPerf MLBox Working Group
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    software-properties-common \
+    python3-dev \
+    curl && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN add-apt-repository ppa:deadsnakes/ppa -y && apt-get update
+
+RUN apt-get install python3 -y
+
+RUN apt-get install python3-pip -y
+
+COPY ./requirements.txt project/requirements.txt
+
+RUN pip3 install --upgrade pip
+
+RUN pip3 install --no-cache-dir -r project/requirements.txt
+
+ENV LANG C.UTF-8
+
+COPY . /project
+
+WORKDIR /project
+
+ENTRYPOINT ["python3", "mlcube.py"]
\ No newline at end of file
diff --git a/medperf/data_preparator/project/mlcube.py b/medperf/data_preparator/project/mlcube.py
new file mode 100644
index 0000000..8ddd305
--- /dev/null
+++ b/medperf/data_preparator/project/mlcube.py
@@ -0,0 +1,54 @@
+# MLCube Entrypoint
+#
+# This script shows how you can bridge your app with an MLCube interface.
+# MLCubes expect the entrypoint to behave like a CLI, where tasks are
+# commands, and input/output parameters are command-line arguments.
+# You can provide that interface to MLCube in any way you prefer.
+# Here, we show a way that requires minimal intrusion to the original code,
+# by running the application through subprocesses.
+
+import yaml
+import typer
+import subprocess
+
+app = typer.Typer()
+
+def exec_python(cmd: str) -> None:
+    """Execute a python script as a subprocess
+
+    Args:
+        cmd (str): command to run as would be written inside the terminal
+    """
+    splitted_cmd = cmd.split()
+    process = subprocess.Popen(splitted_cmd, cwd=".")
+    process.wait()
+
+@app.command("prepare")
+def prepare(
+    data_path: str = typer.Option(..., "--data_path"),
+    labels_path: str = typer.Option(..., "--labels_path"),
+    params_file: str = typer.Option(..., "--parameters_file"),
+    out_path: str = typer.Option(..., "--output_path")
+):
+    """infer task command. This is what gets executed when we run:
+    `mlcube run infer`
+
+    Args:
+        data_path (str): Location of the data to transform. Required for Medperf Data Preparation MLCubes.
+        labels_path (str): Location of the labels. Required for Medperf Data Preparation MLCubes
+        params_file (str): Location of the parameters.yaml file. Required for Medperf Data Preparation MLCubes.
+        out_path (str): Location to store transformed data. Required for Medperf Data Preparation MLCubes.
+    """
+    with open(params_file, "r") as f:
+        params = yaml.safe_load(f)
+
+    num_words = params["num_words"]
+    cmd = f"python3 transform.py --data={data_path} --labels={labels_path} --num_words={num_words} --out={out_path}"
+    exec_python(cmd)
+
+@app.command("hotfix")
+def hotfix():
+    pass
+
+if __name__ == "__main__":
+    app()
\ No newline at end of file
diff --git a/medperf/data_preparator/project/requirements.txt b/medperf/data_preparator/project/requirements.txt
new file mode 100644
index 0000000..d56bee7
--- /dev/null
+++ b/medperf/data_preparator/project/requirements.txt
@@ -0,0 +1,2 @@
+pyYAML
+typer
\ No newline at end of file

From 473b769b61dfd1d79020f6d509141696443d5659af53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?=
Date: Thu, 17 Feb 2022 17:37:57 -0500
Subject: [PATCH 06/14] Update template to recent conceptual changes

---
 medperf/model/README.md                       | 36 +++++++++----------
 medperf/model/mlcube/mlcube.yaml              |  4 +--
 .../mlcube/workspace/{ => data}/names.csv     |  0
 .../model/mlcube/workspace/predictions.csv    | 13 ------
 .../workspace/predictions/predictions.csv     | 13 ++++++
 medperf/model/project/app.py                  |  6 ++--
 medperf/model/project/mlcube.py               |  4 ++-
 7 files changed, 40 insertions(+), 36 deletions(-)
 rename medperf/model/mlcube/workspace/{ => data}/names.csv (100%)
 delete mode 100644 medperf/model/mlcube/workspace/predictions.csv
 create mode 100644 medperf/model/mlcube/workspace/predictions/predictions.csv

diff --git a/medperf/model/README.md b/medperf/model/README.md
index f838cdc..e279a8e 100644
--- a/medperf/model/README.md
+++ b/medperf/model/README.md
@@ -38,28 +38,28 @@ MLCubes usually share a similar folder structure and files. Here's a brief descr
 
 1. __`mlcube/mlcube.yaml`__:
 
-   The `mlcube.yaml` file contains metadata about your model, including its interface. For MedPerf, we require an `infer` function that takes in (at minimum) arguments for `data_path` and `parameters_file` and produces `predictions.csv`. You see this definition in the mlcube.yaml file as:
+   The `mlcube.yaml` file contains metadata about your model, including its interface. For MedPerf, we require an `infer` function that takes in (at minimum) arguments for `data_path` and `parameters_file` and produces prediction artifacts inside the `output_path`. You see this definition in the mlcube.yaml file as:
 
    ```yml
    tasks:
-     # Model MLCubes require only a single task: `infer`.
-     # This task takes input data, as well as configuration parameters
-     # and/or extra artifacts, and generates predictions on the data
-     infer:
-       parameters:
+     # Model MLCubes require only a single task: `infer`.
+     # This task takes input data, as well as configuration parameters
+     # and/or extra artifacts, and generates predictions on the data
+     infer:
+       parameters:
         inputs: {
-          data_path: names.csv, # Required. Where to find the data to run predictions on
-          parameters_file: parameters.yaml, # Required. Helper file to provide additional arguments. Value MUST be parameters.yaml
-          # If you need any additional files that should
-          # not be included inside the mlcube image,
-          # add them inside `additional_files` folder
-          # E.g. model weights
-
-          # Toy Hello World example
-          greetings: additional_files/greetings.csv
+          data_path: data, # Required. Where to find the data to run predictions on. MUST be a folder
+          parameters_file: parameters.yaml, # Required. Helper file to provide additional arguments. Value MUST be parameters.yaml
+          # If you need any additional files that should
+          # not be included inside the mlcube image,
+          # add them inside `additional_files` folder
+          # E.g. model weights
+
+          # Toy Hello World example
+          greetings: additional_files/greetings.csv
         }
         outputs: {
-          output_path: {type: file, default: predictions.csv} # Required. Where to store predictions artifact. Value MUST be predictions.csv
+          output_path: {type: directory, default: predictions} # Required. Where to store prediction artifacts. MUST be a folder
         }
   ```
   In this case, we’ve added an extra “greetings” argument to our infer function. Note that the default value will always be used.
@@ -105,8 +105,8 @@ MLCubes usually share a similar folder structure and files. Here's a brief descr
 ## How to modify
 If you want to adjust this template for your own use-case, then the following list serves as a step-by-step guide:
 1. Remove demo artifacts from `/mlcube/workspace`:
-   - `/mlcube/workspace/names.csv`
-   - `/mlcube/workspace/predictions.csv`
+   - `/mlcube/workspace/data/*`
+   - `/mlcube/workspace/predictions/*`
    - `/mlcube/workspace/additional_files/greetings.csv`
 2. Pass your original code to the `/project` folder (removing `app.py`)
 3. Adjust your code and the `/project/mlcube.py` file so that commands point to the respective code and receive the expected arguments
diff --git a/medperf/model/mlcube/mlcube.yaml b/medperf/model/mlcube/mlcube.yaml
index 75ec902..d9f6c2b 100644
--- a/medperf/model/mlcube/mlcube.yaml
+++ b/medperf/model/mlcube/mlcube.yaml
@@ -21,7 +21,7 @@ tasks:
   infer:
     parameters:
       inputs: {
-        data_path: names.csv, # Required. Where to find the data to run predictions on
+        data_path: data, # Required. Where to find the data to run predictions on. MUST be a folder
         parameters_file: parameters.yaml, # Required. Helper file to provide additional arguments. Value MUST be parameters.yaml
         # If you need any additional files that should
         # not be included inside the mlcube image,
         # add them inside `additional_files` folder
@@ -32,5 +32,5 @@ tasks:
         greetings: additional_files/greetings.csv
       }
       outputs: {
-        output_path: {type: file, default: predictions.csv} # Required. Where to store predictions artifact. Value MUST be predictions.csv (This will probably be an issue)
+        output_path: {type: directory, default: predictions} # Required. Where to store prediction artifacts. MUST be a folder
       }
\ No newline at end of file
diff --git a/medperf/model/mlcube/workspace/names.csv b/medperf/model/mlcube/workspace/data/names.csv
similarity index 100%
rename from medperf/model/mlcube/workspace/names.csv
rename to medperf/model/mlcube/workspace/data/names.csv
diff --git a/medperf/model/mlcube/workspace/predictions.csv b/medperf/model/mlcube/workspace/predictions.csv
deleted file mode 100644
index e591d71..0000000
--- a/medperf/model/mlcube/workspace/predictions.csv
+++ /dev/null
@@ -1,13 +0,0 @@
-id,greeting
-0,"Hello, Adam Smith"
-1,"Hello, John Smith"
-2,"Hello, Michael Stevens"
-3,"Howdy, Adam Smith"
-4,"Howdy, John Smith"
-5,"Howdy, Michael Stevens"
-6,"Greetings, Adam Smith"
-7,"Greetings, John Smith"
-8,"Greetings, Michael Stevens"
-9,"Bonjour, Adam Smith"
-10,"Bonjour, John Smith"
-11,"Bonjour, Michael Stevens"
diff --git a/medperf/model/mlcube/workspace/predictions/predictions.csv b/medperf/model/mlcube/workspace/predictions/predictions.csv
new file mode 100644
index 0000000..23f5e8b
--- /dev/null
+++ b/medperf/model/mlcube/workspace/predictions/predictions.csv
@@ -0,0 +1,13 @@
+id,greeting
+0,"HELLO, ADAM SMITH"
+1,"HELLO, JOHN SMITH"
+2,"HELLO, MICHAEL STEVENS"
+3,"HOWDY, ADAM SMITH"
+4,"HOWDY, JOHN SMITH"
+5,"HOWDY, MICHAEL STEVENS"
+6,"GREETINGS, ADAM SMITH"
+7,"GREETINGS, JOHN SMITH"
+8,"GREETINGS, MICHAEL STEVENS"
+9,"BONJOUR, ADAM SMITH"
+10,"BONJOUR, JOHN SMITH"
+11,"BONJOUR, MICHAEL STEVENS"
diff --git a/medperf/model/project/app.py b/medperf/model/project/app.py
index bba5e79..99e64e1 100644
--- a/medperf/model/project/app.py
+++ b/medperf/model/project/app.py
@@ -3,6 +3,7 @@
 # This script is unrelated to the MLCube interface. It could be run
 # independently without issues. It provides the actual implementation
 # of the app.
+import os
 import csv
 import argparse
 
@@ -34,7 +35,7 @@ def hello_world(greetings, names, uppercase=False):
     parser.add_argument('--names', dest="names", type=str, help="file containing names. CSV expected")
     parser.add_argument('--uppercase', dest="uppercase", type=bool, help="whether to return uppercase greetings")
     parser.add_argument('--greetings', dest="greetings", type=str, help="file containing greetings. CSV expected")
-    parser.add_argument('--out', dest="out", type=str, help="file to store resulting greetings")
+    parser.add_argument('--out', dest="out", type=str, help="path to store resulting greetings")
 
     args = parser.parse_args()
 
@@ -54,7 +55,8 @@ def hello_world(greetings, names, uppercase=False):
 
     full_greetings = hello_world(greetings, names, args.uppercase)
 
-    with open(args.out, "w") as f:
+    out_file = os.path.join(args.out, "predictions.csv")
+    with open(out_file, "w") as f:
         writer = csv.writer(f)
         writer.writerow(["id", "greeting"])
         for idx, full_greeting in enumerate(full_greetings):
diff --git a/medperf/model/project/mlcube.py b/medperf/model/project/mlcube.py
index f73da59..66894a3 100644
--- a/medperf/model/project/mlcube.py
+++ b/medperf/model/project/mlcube.py
@@ -7,6 +7,7 @@
 # Here, we show a way that requires minimal intrusion to the original code,
 # by running the application through subprocesses.
 
+import os
 import yaml
 import typer
 import subprocess
@@ -42,8 +43,9 @@ def infer(
     with open(params_file, "r") as f:
         params = yaml.safe_load(f)
 
+    names_file = os.path.join(data_path, "names.csv")
     uppercase = params["uppercase"]
-    cmd = f"python3 app.py --names={data_path} --uppercase={uppercase} --greetings={greetings} --out={out_path}"
+    cmd = f"python3 app.py --names={names_file} --uppercase={uppercase} --greetings={greetings} --out={out_path}"
     exec_python(cmd)
 
 @app.command("hotfix")

From ce9a20428e07c7707ab1a2273094fc479e3050a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?=
Date: Thu, 24 Feb 2022 18:38:18 -0500
Subject: [PATCH 07/14] Implement Data Preparator template

---
 medperf/data_preparator/mlcube/mlcube.yaml     |  6 +-
 .../mlcube/workspace/{ => labels}/labels.csv   |  0
 .../mlcube/workspace/{ => names}/names.txt     |  0
 .../mlcube/workspace/parameters.yaml           |  1 -
 medperf/data_preparator/project/mlcube.py      | 45 +++++++++++---
 medperf/data_preparator/project/prepare.py     | 60 +++++++++++++++++++
 .../data_preparator/project/requirements.txt   |  3 +-
 .../data_preparator/project/sanity_check.py    | 26 ++++++++
 medperf/data_preparator/project/statistics.py  | 54 +++++++++++++++++
 9 files changed, 181 insertions(+), 14 deletions(-)
 rename medperf/data_preparator/mlcube/workspace/{ => labels}/labels.csv (100%)
 rename medperf/data_preparator/mlcube/workspace/{ => names}/names.txt (100%)
 create mode 100644 medperf/data_preparator/project/prepare.py
 create mode 100644 medperf/data_preparator/project/sanity_check.py
 create mode 100644 medperf/data_preparator/project/statistics.py

diff --git a/medperf/data_preparator/mlcube/mlcube.yaml b/medperf/data_preparator/mlcube/mlcube.yaml
index 6ec9603..49b3461 100644
--- a/medperf/data_preparator/mlcube/mlcube.yaml
+++ b/medperf/data_preparator/mlcube/mlcube.yaml
@@ -20,12 +20,12 @@ tasks:
   prepare:
     # This task is in charge of transforming the input data into the format
    # expected by the model cubes.
     parameters:
       inputs: {
-        data_path: names.txt, # Required. Value must point to the location of the raw data inside workspace
-        labels_path: labels.csv, # Required. Value must point to the file containing labels for the data
+        data_path: names/, # Required. Value must point to a directory containing the raw data inside workspace
+        labels_path: labels/, # Required. Value must point to a directory containing labels for the data
         parameters_file: parameters.yaml # Required. Value must be `parameters.yaml`
       }
       outputs: {
-        output_path: data/ # Required. Indicates where to store the transformed data
+        output_path: data/ # Required. Indicates where to store the transformed data. Must contain transformed data and labels
       }
   sanity_check:
     # This task ensures that the previously transformed data was transformed correctly.
diff --git a/medperf/data_preparator/mlcube/workspace/labels.csv b/medperf/data_preparator/mlcube/workspace/labels/labels.csv
similarity index 100%
rename from medperf/data_preparator/mlcube/workspace/labels.csv
rename to medperf/data_preparator/mlcube/workspace/labels/labels.csv
diff --git a/medperf/data_preparator/mlcube/workspace/names.txt b/medperf/data_preparator/mlcube/workspace/names/names.txt
similarity index 100%
rename from medperf/data_preparator/mlcube/workspace/names.txt
rename to medperf/data_preparator/mlcube/workspace/names/names.txt
diff --git a/medperf/data_preparator/mlcube/workspace/parameters.yaml b/medperf/data_preparator/mlcube/workspace/parameters.yaml
index ef399fe..e69de29 100644
--- a/medperf/data_preparator/mlcube/workspace/parameters.yaml
+++ b/medperf/data_preparator/mlcube/workspace/parameters.yaml
@@ -1 +0,0 @@
-num_words: 2
\ No newline at end of file
diff --git a/medperf/data_preparator/project/mlcube.py b/medperf/data_preparator/project/mlcube.py
index 8ddd305..60edf0d 100644
--- a/medperf/data_preparator/project/mlcube.py
+++ b/medperf/data_preparator/project/mlcube.py
@@ -30,8 +30,8 @@ def prepare(
     params_file: str = typer.Option(..., "--parameters_file"),
     out_path: str = typer.Option(..., "--output_path")
 ):
-    """infer task command. This is what gets executed when we run:
-    `mlcube run infer`
+    """Prepare task command. This is what gets executed when we run:
+    `mlcube run --task=prepare`
 
     Args:
         data_path (str): Location of the data to transform. Required for Medperf Data Preparation MLCubes.
@@ -39,16 +39,43 @@ def prepare(
         params_file (str): Location of the parameters.yaml file. Required for Medperf Data Preparation MLCubes.
         out_path (str): Location to store transformed data. Required for Medperf Data Preparation MLCubes.
     """
-    with open(params_file, "r") as f:
-        params = yaml.safe_load(f)
+    cmd = f"python3 prepare.py --names_path={data_path} --labels_path={labels_path} --out={out_path}"
+    exec_python(cmd)
+
+@app.command("sanity_check")
+def sanity_check(
+    data_path: str = typer.Option(..., "--data_path"),
+    params_file: str = typer.Option(..., "--parameters_file")
+):
+    """Sanity check task command. This is what gets executed when we run:
+    `mlcube run --task=sanity_check`
 
-    num_words = params["num_words"]
-    cmd = f"python3 transform.py --data={data_path} --labels={labels_path} --num_words={num_words} --out={out_path}"
+    Args:
+        data_path (str): Location of the prepared data. Required for Medperf Data Preparation MLCubes.
+        params_file (str): Location of the parameters.yaml file. Required for Medperf Data Preparation MLCubes.
+    """
+    cmd = f"python3 sanity_check.py --data_path={data_path}"
     exec_python(cmd)
 
-@app.command("hotfix")
-def hotfix():
-    pass
+@app.command("statistics")
+def statistics(
+    data_path: str = typer.Option(..., "--data_path"),
+    params_file: str = typer.Option(..., "--parameters_file"),
+    output_path: str = typer.Option(..., "--output_path")
+):
+    """Computes statistics about the data. These statistics are uploaded
+    to the Medperf platform under the data owner's approval. Include
+    every statistic you consider useful for determining the nature of the
+    data, but keep in mind that we want to keep the data as private as
+    possible.
+
+    Args:
+        data_path (str): Location of the prepared data. Required for Medperf Data Preparation MLCubes.
+        params_file (str): Location of the parameters.yaml file. Required for Medperf Data Preparation MLCubes.
+        output_path (str): File to store the statistics. Must be statistics.yaml. Required for Medperf Data Preparation MLCubes.
+    """
+    cmd = f"python3 statistics.py --data_path={data_path} --out_file={output_path}"
+    exec_python(cmd)
 
 if __name__ == "__main__":
     app()
\ No newline at end of file
diff --git a/medperf/data_preparator/project/prepare.py b/medperf/data_preparator/project/prepare.py
new file mode 100644
index 0000000..b86610d
--- /dev/null
+++ b/medperf/data_preparator/project/prepare.py
@@ -0,0 +1,60 @@
+import os
+import shutil
+import argparse
+import pandas as pd
+
+def prepare(names: pd.DataFrame):
+    """Takes a list of names and formats them into [First Name, Last Name]
+
+    Args:
+        names (pd.DataFrame): DataFrame containing the names to be prepared
+    """
+    names["First Name"] = names["Name"].str.split().str[0]
+    names["Last Name"] = names["Name"].str.split().str[-2]
+    names.drop("Name", axis="columns", inplace=True)
+
+    return names
+
+def get_names_df(files, column_name):
+    names_files = os.listdir(files)
+    csv_files = [file for file in names_files if file.endswith(".csv")]
+    tsv_files = [file for file in names_files if file.endswith(".tsv")]
+    txt_files = [file for file in names_files if file.endswith(".txt")]
+
+    if len(csv_files):
+        filepath = os.path.join(files, csv_files[0])
+        df = pd.read_csv(filepath, usecols=[column_name])
+        return df
+    if len(tsv_files):
+        filepath = os.path.join(files, tsv_files[0])
+        df = pd.read_csv(filepath, usecols=[column_name], sep='\t')
+        return df
+    if len(txt_files):
+        filepath = os.path.join(files, txt_files[0])
+        with open(filepath, "r") as f:
+            names = f.readlines()
+
+        df = pd.DataFrame(data=names, columns=[column_name])
+        return df
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser("Medperf Data Preparator Example")
+    parser.add_argument("--names_path", dest="names", type=str, help="path containing raw names")
+    parser.add_argument("--labels_path", dest="labels", type=str, help="path containing labels")
+    parser.add_argument("--out", dest="out", type=str, help="path to store prepared data")
+
+    args = parser.parse_args()
+
+    # One of the intended use-cases of the data preparator cube
+    # is to accept multiple data formats depending on the task needs
+    names_df = get_names_df(args.names, "Name")
+    prepared_names = prepare(names_df)
+
+    # add the labels to the output folder. In this case we're going to assume
+    # the labels will always follow the same format
+    in_labels = os.path.join(args.labels, "labels.csv")
+    out_labels = os.path.join(args.out, "labels.csv")
+    shutil.copyfile(in_labels, out_labels)
+
+    out_file = os.path.join(args.out, "names.csv")
+    prepared_names.to_csv(out_file, index=False)
\ No newline at end of file
diff --git a/medperf/data_preparator/project/requirements.txt b/medperf/data_preparator/project/requirements.txt
index d56bee7..6db6faf 100644
--- a/medperf/data_preparator/project/requirements.txt
+++ b/medperf/data_preparator/project/requirements.txt
@@ -1,2 +1,3 @@
 pyYAML
-typer
\ No newline at end of file
+typer
+pandas
\ No newline at end of file
diff --git a/medperf/data_preparator/project/sanity_check.py b/medperf/data_preparator/project/sanity_check.py
new file mode 100644
index 0000000..6801f3e
--- /dev/null
+++ b/medperf/data_preparator/project/sanity_check.py
@@ -0,0 +1,26 @@
+import os
+import argparse
+import pandas as pd
+
+def sanity_check(names_df):
+    """Runs a few checks to ensure data quality and integrity
+
+    Args:
+        names_df (pd.DataFrame): DataFrame containing transformed data.
+ """ + # Here you must add all the checks you consider important regarding the + # state of the data + assert names_df.columns.tolist() == ["First Name", "Last Name"], "Column mismatch" + assert names_df["First Name"].isna().sum() == 0, "There are empty fields" + assert names_df["Last Name"].isna().sum() == 0, "There are empty fields" + +if __name__ == '__main__': + parser = argparse.ArgumentParser("Medperf Model Sanity Check Example") + parser.add_argument("--data_path", dest="data", type=str, help="directory containing the prepared data") + + args = parser.parse_args() + + names_file = os.path.join(args.data, "names.csv") + names_df = pd.read_csv(names_file) + + sanity_check(names_df) \ No newline at end of file diff --git a/medperf/data_preparator/project/statistics.py b/medperf/data_preparator/project/statistics.py new file mode 100644 index 0000000..6792cb4 --- /dev/null +++ b/medperf/data_preparator/project/statistics.py @@ -0,0 +1,54 @@ +import os +import yaml +import argparse +import pandas as pd + +def get_statistics(names_df: pd.DataFrame) -> dict: + """Computes statistics about the data. This statistics are uploaded + to the Medperf platform under the data owner's approval. Include + every statistic you consider useful for determining the nature of the + data, but keep in mind that we want to keep the data as private as + possible. + + Args: + names_df (pd.DataFrame): DataFrame containing the prepared dataset + + Returns: + dict: dictionary with all the computed statistics + """ + fname_len = names_df["First Name"].str.len() + lname_len = names_df["Last Name"].str.len() + + stats = { + "First Name": { + "length mean": float(fname_len.mean()), + "length std": float(fname_len.std()), + "length min": int(fname_len.min()), + "length max": int(fname_len.max()) + }, + "Last Name": { + "length mean": float(lname_len.mean()), + "length std": float(lname_len.std()), + "length min": int(lname_len.min()), + "length max": int(lname_len.max()) + }, + "size": len(names_df) + } + + return stats + +if __name__ == '__main__': + parser = argparse.ArgumentParser("MedPerf Statistics Example") + parser.add_argument("--data_path", dest="data", type=str, help="directory containing the prepared data") + parser.add_argument("--out_file", dest="out_file", type=str, help="file to store statistics") + + args = parser.parse_args() + + namesfile = os.path.join(args.data, "names.csv") + names_df = pd.read_csv(namesfile) + + stats = get_statistics(names_df) + + with open(args.out_file, "w") as f: + yaml.dump(stats, f) + From 89497e7882ab5a3fb182abb5d4f5463df8b68086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?= Date: Thu, 24 Feb 2022 18:38:48 -0500 Subject: [PATCH 08/14] Update outdated model documentation --- medperf/model/project/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medperf/model/project/app.py b/medperf/model/project/app.py index 99e64e1..fde4bef 100644 --- a/medperf/model/project/app.py +++ b/medperf/model/project/app.py @@ -32,9 +32,9 @@ def hello_world(greetings, names, uppercase=False): if __name__ == '__main__': parser = argparse.ArgumentParser("MedPerf Model Hello World Example") - parser.add_argument('--names', dest="names", type=str, help="file containing names. 
CSV expected") + parser.add_argument('--names', dest="names", type=str, help="directory containing names") parser.add_argument('--uppercase', dest="uppercase", type=bool, help="wether to return uppercase greetings") - parser.add_argument('--greetings', dest="greetings", type=str, help="file containing greetings. CSV expected") + parser.add_argument('--greetings', dest="greetings", type=str, help="directory containing greetings") parser.add_argument('--out', dest="out", type=str, help="path to store resulting greetings") args = parser.parse_args() From f901fff3b5dcb8ae7911e9b78cfecb7727be7670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?= Date: Fri, 25 Feb 2022 17:04:12 -0500 Subject: [PATCH 09/14] Write Data Preparator README --- medperf/data_preparator/README.md | 130 ++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 medperf/data_preparator/README.md diff --git a/medperf/data_preparator/README.md b/medperf/data_preparator/README.md new file mode 100644 index 0000000..98b184a --- /dev/null +++ b/medperf/data_preparator/README.md @@ -0,0 +1,130 @@ +# MedPerf's Data Preparator MLCube Template +This is a Hello World implementation, following the structure and conventions MedPerf uses to process and transform raw datasets. + +## Purpose: +At the time of writing, Data Preparators are in charge of standardizing the input data format models expect to receive. Additionally, they provide tools for testing the integrity of the data and for extracting useful insights from it. + +## How to run: +This template was built so it can work out-of-the-box. Follow the next steps: + +1. Clone the repository +2. cd to the repository + ```bash + cd mlcube_examples + ``` +3. Install mlcube and mlcube-docker + + ```bash + pip install mlcube mlcube-docker + ``` +4. cd to current example's `mlcube` folder + + ```bash + cd medperf/data_preparator/mlcube + ``` +5. execute the `prepare` task with mlcube + ```bash + mlcube run --task=prepare + ``` +6. check resulting data + ```bash + ls workspace/data + ``` +7. execute the `sanity_check` task + ```bash + mlcube run --task=sanity_check + ``` +8. execute the `statistics` task + ```bash + mlcube run --task=statistics + ``` +9. check the resulting statistics + ```bash + cat workspace/statistics.yaml + ``` +That's it! You just built and ran a hello-world data preparator mlcube! + +## Contents + +MLCubes usually share a similar folder structure and files. Here's a brief description of the role for the relevant files + +1. __`mlcube/mlcube.yaml`__: + + The `mlcube.yaml` file contains metadata about your data preparation procedure, including its interface. For MedPerf, we require three tasks: `prepare`, `sanity_check` and `statistics`. The description of the tasks and their input/outputs are described in the file: + + ```yml + tasks: + prepare: + # This task is in charge of transforming the input data into the format + # expected by the model cubes. + parameters: + inputs: { + data_path: names/, # Required. Value must point to a directory containing the raw data inside workspace + labels_path: labels/, # Required. Value must point to a directory containing labels for the data + parameters_file: parameters.yaml # Required. Value must be `parameters.yaml` + } + outputs: { + output_path: data/ # Required. Indicates where to store the transformed data. Must contain transformed data and labels + } + sanity_check: + # This task ensures that the previously transformed data was transformed correctly. 
+ # It runs a set of tests that check que quality of the data. The rigurosity of those + # tests is determined by the cube author. + parameters: + inputs: { + data_path: data/, # Required. Value should be the output of the prepare task + parameters_file: parameters.yaml # Required. Value must be `parameters.yaml` + } + statistics: + # This task computes statistics on the prepared dataset. Its purpose is to get a high-level + # idea of what is contained inside the data, without providing any specifics of any single entry + parameters: + inputs: { + data_path: data/, # Required. Value should be the output of the prepare task + parameters_file: parameters.yaml # Required. Value must be `parameters.yaml` + } + outputs: { + output_path: { + type: file, default: statistics.yaml # Required. Value must be `statistics.yaml` + } + } + ``` + +2. __`mlcube/workspace/parameters.yaml`__: + + This file provides ways to parameterize the data preparation process. You can set any key-value pairs that should be easily modifiable in order to adjust you mlcube's behavior. This file is mandatory, but can be left blank if parametrization is not needed, as is the case in this example. + +3. __`project`__: + + Contains the actual implementation of the mlcube. This includes all project-specific code, `Dockerfile` for building docker containers of the project and requirements for running the code. + +5. __`project/mlcube.py`__: + + MLCube expects an entrypoint to the project in order to run the code and the specified tasks. It expects this entrypoint to behave like a CLI, in which each MLCube task (e.g. `prepare`) is executed as a subcommand, and each input/output parameter is passed as a CLI argument. An example of the expected interface is: + ```bash + python3 project/mlcube.py prepare --data_path= --labels_path= --parameters_file= --output_path= + ``` + `mlcube.py` provides such interface for this toy example. As long as you follow such CLI interface, you can implement it however you want. We provide an example that requirems minimal modifications to the original project code, by running any project task through subprocesses. + +## How to modify +If you want to adjust this template for your own use-case, then the following list serves as a step-by-step guide: +1. Remove demo artifacts from `/mlcube/workspace`: + - `/mlcube/workspace/data` + - `/mlcube/workspace/labels` + - `/mlcube/workspace/names` + - `/mlcube/workspace/statistics.yaml` +2. Pass your original code to the `/project` folder (removing everything but `mlcube.py`) +3. Adjust your code and the `/project/mlcube.py` file so that commands point to the respective code and receive the expected arguments +4. Modify `/project/requirements.txt` so that it contains all code dependencies for your project +5. Default `/project/Dockerfile` should suffice, but feel free to add/modify it to work with your needs. As long as it has an entrypoint pointing to `mlcube.py` +6. Inside `/mlcube/workspace` add the input folders for preparing data. +7. Inside `/mlcube/workspace/additional_files` add any files that are required for model execution (e.g. model weights) +8. Adjust `/mlcube/mlcube.yaml` so that: + 1. metadata such as `name`, `description`, `authors` and `image_name` are correctly assigned. + 2. `data_path`, `labels_path` and other IO parameters point to the location where you expect data to be inside the `workspace` directory. + 3. `parameters_file` should NOT be modified in any way. + 4. Add any other required parameters that point to `additional_files` (e.g. 
+
+## How to modify
+If you want to adjust this template for your own use-case, then the following list serves as a step-by-step guide:
+1. Remove demo artifacts from `/mlcube/workspace`:
+   - `/mlcube/workspace/data`
+   - `/mlcube/workspace/labels`
+   - `/mlcube/workspace/names`
+   - `/mlcube/workspace/statistics.yaml`
+2. Pass your original code to the `/project` folder (removing everything but `mlcube.py`)
+3. Adjust your code and the `/project/mlcube.py` file so that commands point to the respective code and receive the expected arguments
+4. Modify `/project/requirements.txt` so that it contains all code dependencies for your project
+5. The default `/project/Dockerfile` should suffice, but feel free to add to or modify it to fit your needs, as long as it keeps an entrypoint pointing to `mlcube.py`
+6. Inside `/mlcube/workspace` add the input folders for preparing data.
+7. Inside `/mlcube/workspace/additional_files` add any files that are required for task execution (e.g. model weights)
+8. Adjust `/mlcube/mlcube.yaml` so that:
+   1. metadata such as `name`, `description`, `authors` and `image_name` are correctly assigned.
+   2. `data_path`, `labels_path` and other IO parameters point to the locations where you expect data to be inside the `workspace` directory.
+   3. `parameters_file` should NOT be modified in any way.
+   4. Add any other required parameters that point to `additional_files` (e.g. model_weights). Naming can be arbitrary, but all files referenced from now on should be contained inside `additional_files`. An illustrative adjustment is sketched after this list.
+   5. `output_path`s should NOT be modified in any way.
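+
+For example, after these adjustments the `prepare` task of a hypothetical cube might look like this (all folder and parameter names here are illustrative, not requirements):
+```yml
+prepare:
+  parameters:
+    inputs: {
+      data_path: scans/,                        # your raw data folder inside workspace
+      labels_path: segmentations/,              # your labels folder inside workspace
+      parameters_file: parameters.yaml,         # unchanged
+      mapping_file: additional_files/map.csv    # extra artifact, kept inside additional_files
+    }
+    outputs: {
+      output_path: data/                        # unchanged
+    }
+```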
+
+## Requirements are negotiable
+The required fields in the mlcube task interface show what medperf currently assumes. As we are in alpha, this is a great time to raise concerns or requests about these requirements! Now is the best time for us to make changes.
\ No newline at end of file

From 1cb58395bfa6a9e5173cdfc1937f89dfddbde6bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?=
Date: Fri, 25 Feb 2022 17:04:22 -0500
Subject: [PATCH 10/14] README Small fixes

---
 medperf/model/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/medperf/model/README.md b/medperf/model/README.md
index e279a8e..be7b815 100644
--- a/medperf/model/README.md
+++ b/medperf/model/README.md
@@ -5,7 +5,7 @@ This is a Hello World implementation, following the structure and conventions Me
 At the time of writing, model MLCubes have the only purpose of obtaining predictions on data. This means that we expect all models inside MedPerf to already be trained.
 
 ## How to run:
-This is a working template, which means it should work out-of-the-box. Follow the next steps:
+This template was built so it can work out-of-the-box. Follow the next steps:
 
 1. Clone the repository
 2. cd to the repository
@@ -28,7 +28,7 @@ This is a working template, which means it should work out-of-the-box. Follow th
    ```
 6. check resulting predictions
    ```bash
-   cat workspace/predictions.csv
+   cat workspace/predictions/predictions.csv
    ```
 That's it! You just built and ran a hello-world model mlcube!

From 9dcf220618b121c2d6197122fbcbd5fc3d3e471a Mon Sep 17 00:00:00 2001
From: Sarthak Pati
Date: Thu, 3 Mar 2022 15:23:07 -0500
Subject: [PATCH 11/14] added instructions to create & activate venv

---
 medperf/model/README.md | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/medperf/model/README.md b/medperf/model/README.md
index be7b815..297a5d9 100644
--- a/medperf/model/README.md
+++ b/medperf/model/README.md
@@ -12,21 +12,24 @@ This template was built so it can work out-of-the-box. Follow the next steps:
    ```bash
    cd mlcube_examples
    ```
-3. Install mlcube and mlcube-docker
-
+3. Create and activate virtual environment
+   ```bash
+   conda create -n venv_mlcub python=3.7 -y # change to your preferred python version
+   conda activate venv_mlcub
+   ```
+4. Install mlcube and mlcube-docker
    ```bash
    pip install mlcube mlcube-docker
    ```
-4. cd to current example's `mlcube` folder
-
+5. cd to current example's `mlcube` folder
    ```bash
    cd medperf/model/mlcube
    ```
-5. execute the `infer` task with mlcube
+6. execute the `infer` task with mlcube
    ```bash
    mlcube run --task=infer
    ```
-6. check resulting predictions
+7. check resulting predictions
    ```bash
    cat workspace/predictions/predictions.csv
    ```

From 4b5414f9e30b587e53881c09f9f80ae191391224 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?=
Date: Fri, 4 Mar 2022 11:14:24 -0500
Subject: [PATCH 12/14] Add docker username to image name

---
 medperf/data_preparator/mlcube/mlcube.yaml | 2 +-
 medperf/model/mlcube/mlcube.yaml           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/medperf/data_preparator/mlcube/mlcube.yaml b/medperf/data_preparator/mlcube/mlcube.yaml
index 49b3461..3be87b3 100644
--- a/medperf/data_preparator/mlcube/mlcube.yaml
+++ b/medperf/data_preparator/mlcube/mlcube.yaml
@@ -8,7 +8,7 @@ platform:
 
 docker:
   # Image name.
-  image: medical-data-prep-hello-world
+  image: mlcommons/medical-data-prep-hello-world
   # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
   build_context: "../project"
   # Docker file name within docker build context, default is `Dockerfile`.

diff --git a/medperf/model/mlcube/mlcube.yaml b/medperf/model/mlcube/mlcube.yaml
index d9f6c2b..4dba3e8 100644
--- a/medperf/model/mlcube/mlcube.yaml
+++ b/medperf/model/mlcube/mlcube.yaml
@@ -8,7 +8,7 @@ platform:
 
 docker:
   # Image name.
-  image: medical-hello-world
+  image: mlcommons/medical-hello-world
   # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
   build_context: "../project"
   # Docker file name within docker build context, default is `Dockerfile`.

From 29060a51c0ddd95368dfcc0e3e66fab7f85c41ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?=
Date: Fri, 4 Mar 2022 11:54:46 -0500
Subject: [PATCH 13/14] Write metrics template

---
 medperf/metrics/mlcube/mlcube.yaml               | 35 +++++++
 .../metrics/mlcube/workspace/parameters.yaml     | 12 +++
 medperf/metrics/project/Dockerfile               | 26 +++++
 medperf/metrics/project/app.py                   | 99 +++++++++++++++++++
 medperf/metrics/project/mlcube.py                | 48 +++++++++
 medperf/metrics/project/requirements.txt         |  3 +
 medperf/model/project/mlcube.py                  |  4 +-
 7 files changed, 226 insertions(+), 1 deletion(-)
 create mode 100644 medperf/metrics/mlcube/mlcube.yaml
 create mode 100644 medperf/metrics/mlcube/workspace/parameters.yaml
 create mode 100644 medperf/metrics/project/Dockerfile
 create mode 100644 medperf/metrics/project/app.py
 create mode 100644 medperf/metrics/project/mlcube.py
 create mode 100644 medperf/metrics/project/requirements.txt

diff --git a/medperf/metrics/mlcube/mlcube.yaml b/medperf/metrics/mlcube/mlcube.yaml
new file mode 100644
index 0000000..27d874b
--- /dev/null
+++ b/medperf/metrics/mlcube/mlcube.yaml
@@ -0,0 +1,35 @@
+name: Hello-World Medperf Metrics MLCube
+description: MLCommons demonstration MLCube for writing metrics for MedPerf
+authors:
+  - {name: "MLCommons Medical Working Group"}
+
+platform:
+  accelerator_count: 0
+
+docker:
+  # Image name.
+  image: mlcommons/hello-world-metrics
+  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
+  build_context: "../project"
+  # Docker file name within docker build context, default is `Dockerfile`.
+  build_file: "Dockerfile"
+
+tasks:
+  # Metrics MLCubes require only a single task: `evaluate`
+  # This task takes the predictions generated by the model mlcube (as a directory)
+  # and the output of the Data Preparation MLCube containing the labels (as a directory)
+  # to compute metrics, which are then stored inside the output_path
+  evaluate:
+    # Executes a number of metrics specified by the params file
+    parameters:
+      inputs: {
+        predictions: predictions, # Required. Where to find the predictions. MUST be a folder
+        labels: labels, # Required. Where to find the labels. MUST be a folder
+        parameters_file: parameters.yaml # Required. Helper file to provide additional arguments. Value MUST be parameters.yaml
+        # If you need any additional files that should
+        # not be included inside the mlcube image,
+        # add them inside `additional_files` folder
+      }
+      outputs: {
+        output_path: {type: "file", default: "results.yaml"} # Required. Where to write the metrics results. Value MUST be results.yaml
+      }
\ No newline at end of file

diff --git a/medperf/metrics/mlcube/workspace/parameters.yaml b/medperf/metrics/mlcube/workspace/parameters.yaml
new file mode 100644
index 0000000..e46bfa9
--- /dev/null
+++ b/medperf/metrics/mlcube/workspace/parameters.yaml
@@ -0,0 +1,12 @@
+# File for parametrizing your metrics calculations
+
+metrics:
+  # List of metrics to run
+  - ACC
+
+label columns:
+  # Label columns that are going to be evaluated
+  - greeting
+
+# Common identifier column for labels and predictions
+id column: id
\ No newline at end of file

diff --git a/medperf/metrics/project/Dockerfile b/medperf/metrics/project/Dockerfile
new file mode 100644
index 0000000..af853de
--- /dev/null
+++ b/medperf/metrics/project/Dockerfile
@@ -0,0 +1,26 @@
+FROM ubuntu:18.04
+MAINTAINER MLPerf MLBox Working Group
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    software-properties-common \
+    curl && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN add-apt-repository ppa:deadsnakes/ppa -y && apt-get update
+
+RUN apt-get install python3.8 python3-pip -y
+
+COPY ./requirements.txt project/requirements.txt
+
+RUN pip3 install --upgrade pip
+
+RUN pip3 install --default-timeout=10000 --no-cache-dir -r project/requirements.txt
+
+ENV LANG C.UTF-8
+
+COPY . /project
+
+WORKDIR /project
+
+ENTRYPOINT ["python3", "mlcube.py"]
\ No newline at end of file

diff --git a/medperf/metrics/project/app.py b/medperf/metrics/project/app.py
new file mode 100644
index 0000000..5ded938
--- /dev/null
+++ b/medperf/metrics/project/app.py
@@ -0,0 +1,99 @@
+# Hello World Script
+#
+# This script is unrelated to the MLCube interface. It could be run
+# independently without issues. It provides the actual implementation
+# of the metrics. This file is executed by MLCube through mlcube.py
+import argparse
+import yaml
+import pandas as pd
+
+
+class ACC:
+    # Given this is a toy example, the metric is implemented by hand.
+    # It is recommended that metrics are obtained from trusted
+    # libraries
+    @staticmethod
+    def run(labels, preds):
+        total_count = len(labels)
+        correct_count = (labels == preds).sum()
+        return correct_count / total_count
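+# For instance, a production cube might compute per-column accuracy with
+# scikit-learn instead (illustrative only; sklearn is not a dependency of
+# this toy example):
+#
+#     from sklearn.metrics import accuracy_score
+#     acc = accuracy_score(labels["greeting"], preds["greeting"])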
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--labels_csv",
+        "--labels-csv",
+        type=str,
+        required=True,
+        help="File containing the labels",
+    )
+    parser.add_argument(
+        "--preds_csv",
+        "--preds-csv",
+        type=str,
+        required=True,
+        help="File containing the predictions",
+    )
+    parser.add_argument(
+        "--output_file",
+        "--output-file",
+        type=str,
+        required=True,
+        help="File to store metrics results as YAML",
+    )
+    parser.add_argument(
+        "--parameters_file",
+        "--parameters-file",
+        type=str,
+        required=True,
+        help="File containing parameters for evaluation",
+    )
+    args = parser.parse_args()
+
+    # Load all files
+
+    with open(args.parameters_file, "r") as f:
+        params = yaml.full_load(f)
+
+    labels = pd.read_csv(args.labels_csv)
+    preds = pd.read_csv(args.preds_csv)
+
+    labels = reformat_data(labels, params)
+    preds = reformat_data(preds, params)
+
+    available_metrics = {
+        "ACC": ACC,
+    }
+    results = {}
+    cols = list(labels.columns)
+    for metric_name in params["metrics"]:
+        metric = available_metrics[metric_name]
+        scores = metric.run(labels, preds)
+        scores = {col: score for col, score in zip(cols, scores)}
+        results[metric_name] = scores
+
+    with open(args.output_file, "w") as f:
+        yaml.dump(results, f)
+
+
+def reformat_data(df, params):
+    """Ensures that the dataframe contains the desired label columns and
+    is sorted by a defined identifier column
+
+    Args:
+        df (pd.DataFrame): dataframe containing data labels and an identifier for each row
+        params (dict): dictionary containing key-value pairs for identifying the labels-of-interest and the common identifier column
+
+    Returns:
+        pd.DataFrame: the reformatted dataframe
+    """
+    label_cols = params["label columns"]
+    id_col = params["id column"]
+    select_cols = label_cols + [id_col]
+
+    df = df[select_cols]
+    df = df.set_index(id_col)
+    df = df.sort_index()
+    return df
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

diff --git a/medperf/metrics/project/mlcube.py b/medperf/metrics/project/mlcube.py
new file mode 100644
index 0000000..f244388
--- /dev/null
+++ b/medperf/metrics/project/mlcube.py
@@ -0,0 +1,48 @@
+# MLCube Entrypoint
+#
+# This script shows how you can bridge your app with an MLCube interface.
+# MLCubes expect the entrypoint to behave like a CLI, where tasks are
+# commands, and input/output parameters are command-line arguments.
+# You can provide that interface to MLCube in any way you prefer.
+# Here, we show a way that requires minimal intrusion to the original code,
+# by running the application through subprocesses.
+import os
+import typer
+import subprocess
+
+
+app = typer.Typer()
+
+
+def exec_python(cmd: str) -> None:
+    """Execute a python script as a subprocess
+
+    Args:
+        cmd (str): command to run as would be written inside the terminal
+    """
+    split_cmd = cmd.split()
+    process = subprocess.Popen(split_cmd, cwd=".")
+    process.wait()
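+    # Note: naive whitespace splitting assumes no argument contains spaces;
+    # shlex.split(cmd) would be a safer alternative if paths may have spaces.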
+
+
+@app.command("evaluate")
+def evaluate(
+    labels: str = typer.Option(..., "--labels"),
+    predictions: str = typer.Option(..., "--predictions"),
+    parameters_file: str = typer.Option(..., "--parameters_file"),
+    output_path: str = typer.Option(..., "--output_path"),
+):
+    labels_csv = os.path.join(labels, "labels.csv")
+    preds_csv = os.path.join(predictions, "predictions.csv")
+
+    cmd = f"python3 app.py --labels_csv={labels_csv} --preds_csv={preds_csv} --parameters_file={parameters_file} --output_file={output_path}"
+    exec_python(cmd)
+
+
+@app.command("hotfix")
+def hotfix():
+    pass
+
+
+if __name__ == "__main__":
+    app()
\ No newline at end of file

diff --git a/medperf/metrics/project/requirements.txt b/medperf/metrics/project/requirements.txt
new file mode 100644
index 0000000..7e80e76
--- /dev/null
+++ b/medperf/metrics/project/requirements.txt
@@ -0,0 +1,3 @@
+PyYAML~=5.3
+pandas~=1.1
+typer
\ No newline at end of file

diff --git a/medperf/model/project/mlcube.py b/medperf/model/project/mlcube.py
index 66894a3..0a9d1a8 100644
--- a/medperf/model/project/mlcube.py
+++ b/medperf/model/project/mlcube.py
@@ -45,7 +45,9 @@ def infer(
     names_file = os.path.join(data_path, "names.csv")
     uppercase = params["uppercase"]
 
-    cmd = f"python3 app.py --names={names_file} --uppercase={uppercase} --greetings={greetings} --out={out_path}"
+    cmd = f"python3 app.py --names={names_file} --greetings={greetings} --out={out_path}"
+    if uppercase:
+        cmd += f" --uppercase={uppercase}"
     exec_python(cmd)
 
 @app.command("hotfix")

From a0c7b1a3420be395035eac22fdf2faea69d5b18f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Aristiz=C3=A1bal?=
Date: Fri, 4 Mar 2022 12:19:49 -0500
Subject: [PATCH 14/14] Add README to metrics template

---
 medperf/metrics/README.md | 128 ++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 medperf/metrics/README.md

diff --git a/medperf/metrics/README.md b/medperf/metrics/README.md
new file mode 100644
index 0000000..e3edfc0
--- /dev/null
+++ b/medperf/metrics/README.md
@@ -0,0 +1,128 @@
+# MedPerf's Metrics MLCube Template
+## Purpose:
+At the time of writing, metrics MLCubes have the only purpose of computing metrics on predictions. A metrics cube receives the predictions generated by the model, as well as the output of the data preparation cube, in order to extract the labels and calculate the model's performance.
+
+## How to run:
+This template was built so it can work out-of-the-box. Follow the next steps:
+
+1. Clone the repository
+2. cd to the repository
+   ```bash
+   cd mlcube_examples
+   ```
+3. Install mlcube and mlcube-docker
+
+   ```bash
+   pip install mlcube mlcube-docker
+   ```
+4. cd to current example's `mlcube` folder
+
+   ```bash
+   cd medperf/metrics/mlcube
+   ```
+5. execute the `evaluate` task with mlcube
+   ```bash
+   mlcube run --task=evaluate -Pdocker.build_strategy=auto
+   ```
+6. check the resulting metrics
+   ```bash
+   cat workspace/results.yaml
+   ```
+That's it! You just built and ran a hello-world metrics mlcube!
+
+## Contents
+
+MLCubes usually share a similar folder structure and files. Here's a brief description of the role of the relevant files:
+
+1. __`mlcube/mlcube.yaml`__:
+
+   The `mlcube.yaml` file contains metadata about your project, including its interface. For MedPerf, we require an `evaluate` task that takes in (at minimum) arguments for `predictions`, `labels` and `parameters_file`, and outputs model performance artifacts inside the `output_path`. You can see this definition in the mlcube.yaml file as:
+
+   ```yml
+   tasks:
+     # Metrics MLCubes require only a single task: `evaluate`
+     # This task takes the predictions generated by the model mlcube (as a directory)
+     # and the output of the Data Preparation MLCube containing the labels (as a directory)
+     # to compute metrics, which are then stored inside the output_path
+     evaluate:
+       # Executes a number of metrics specified by the params file
+       parameters:
+         inputs: {
+           predictions: predictions, # Required. Where to find the predictions. MUST be a folder
+           labels: labels, # Required. Where to find the labels. MUST be a folder
+           parameters_file: parameters.yaml # Required. Helper file to provide additional arguments. Value MUST be parameters.yaml
+           # If you need any additional files that should
+           # not be included inside the mlcube image,
+           # add them inside `additional_files` folder
+         }
+         outputs: {
+           output_path: {type: "file", default: "results.yaml"} # Required. Where to write the metrics results. Value MUST be results.yaml
+         }
+   ```
+   The output generated by the metrics mlcube is expected to be a file named `results.yaml`, which contains the results of the computed metrics.
+
+2. __`mlcube/workspace/parameters.yaml`__:
+
+   This file provides ways to parameterize your metrics. You can set any key-value pairs that should be easily modifiable in order to adjust the cube's behavior. The current example shows how we can specify the metrics we want to compute (`metrics`), the labels to evaluate (`label columns`), and the column we use for matching each true label with its prediction (`id column`):
+   ```yml
+   # File for parametrizing your metrics calculations
+
+   metrics:
+     # List of metrics to run
+     - ACC
+
+   label columns:
+     # Label columns that are going to be evaluated
+     - greeting
+
+   # Common identifier column for labels and predictions
+   id column: id
+   ```
+
+   This structure follows how we've been specifying metrics parametrization. Your metrics don't need to follow this parameters structure.
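+
+   For reference, the `results.yaml` produced by this toy example follows a `{metric: {column: score}}` shape: `app.py` stores one score per evaluated label column under each metric name. With the single `ACC` metric over the `greeting` column it would look something like this (the score value shown is illustrative):
+   ```yml
+   ACC:
+     greeting: 1.0
+   ```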
+
+3. __`mlcube/workspace/additional_files/*`__:
+
+   You may require additional files that should not be packaged inside the mlcube (due to size or usability constraints), like model weights. For these cases, we provide an additional folder called `additional_files`. Here, you can provide any other files that should be present at the time of inference. At the time of mlcube registration, this folder must be compressed into a tarball (`.tar.gz`) and hosted somewhere on the web. MedPerf will then be able to download, verify and reposition those files in the expected location for mlcube execution.
+
+4. __`project`__:
+
+   Contains the actual implementation of the mlcube. This includes all project-specific code, the `Dockerfile` for building docker containers of the project, and the requirements for running the code.
+
+5. __`project/mlcube.py`__:
+
+   MLCube expects an entrypoint to the project in order to run the code and the specified tasks. It expects this entrypoint to behave like a CLI, in which each MLCube task (e.g. `evaluate`) is executed as a subcommand, and each input/output parameter is passed as a CLI argument. An example of the expected interface is:
+   ```bash
+   python3 project/mlcube.py evaluate --predictions=<PREDICTIONS_PATH> --labels=<LABELS_PATH> --parameters_file=<PARAMETERS_FILE> --output_path=<OUTPUT_PATH>
+   ```
+   `mlcube.py` provides such an interface for this toy example. As long as you follow this CLI interface, you can implement it however you want. We provide an example that requires minimal modifications to the original project code, by running any project task through subprocesses.
+
+   #### __What is that “hotfix” function I see in mlcube.py?__
+
+   In short, it’s benign and there to avoid a potential cli issue, so you can just leave it and forget about it.
+
+   For those who care: when using typer/click for your cli, like we do, you need more than one @app.command, or typer/click will not parse the command-line in the way mlcube expects. This is a silly, known issue that goes away as soon as you have more than one task in your mlcube interface. But since our metrics cubes currently only have one task, we add an extra, blank typer command to avoid this issue. If you don’t use typer/click, you likely don’t need this dummy command. The pattern looks like the sketch below.
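+
+   As a minimal illustration of the workaround (same names as this example's `mlcube.py`; task bodies omitted):
+   ```python
+   import typer
+
+   app = typer.Typer()
+
+   @app.command("evaluate")
+   def evaluate():
+       ...  # the real task
+
+   @app.command("hotfix")
+   def hotfix():
+       pass  # dummy second command so typer parses subcommands as mlcube expects
+
+   if __name__ == "__main__":
+       app()
+   ```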
+
+## How to modify
+If you want to adjust this template for your own use-case, then the following list serves as a step-by-step guide:
+1. Remove demo artifacts from `/mlcube/workspace`:
+   - `/mlcube/workspace/labels/*`
+   - `/mlcube/workspace/predictions/*`
+   - `/mlcube/workspace/`
+2. Pass your original code to the `/project` folder (removing `app.py`)
+3. Adjust your code and the `/project/mlcube.py` file so that commands point to the respective code and receive the expected arguments
+4. Modify `/project/requirements.txt` so that it contains all code dependencies for your project
+5. The default `/project/Dockerfile` should suffice, but feel free to add to or modify it to fit your needs, as long as it keeps an entrypoint pointing to `mlcube.py`
+6. Inside `/mlcube/workspace` add the data you want your cube to compute metrics for
+7. Inside `/mlcube/workspace/additional_files` add any additional files that are required for metrics execution
+8. Adjust `/mlcube/mlcube.yaml` so that:
+   1. metadata such as `name`, `description`, `authors` and `image_name` are correctly assigned.
+   2. `labels` points to the location where you expect labels to be inside the `workspace` directory.
+   3. `predictions` points to the location where you expect predictions to be inside the `workspace` directory.
+   4. `parameters_file` should NOT be modified in any way.
+   5. Add any other required parameters that point to `additional_files`. Naming can be arbitrary, but all files referenced from now on should be contained inside `additional_files`.
+   6. `output_path` should NOT be modified in any way.
+
+## Requirements are negotiable
+The required fields in the mlcube task interface show what medperf currently assumes. As we are in alpha, this is a great time to raise concerns or requests about these requirements! Now is the best time for us to make changes.
\ No newline at end of file