Docker Regression Tests Github action (#2403)
* Testing docker regression

* Run docker regression nightly

* Addressed review comments

* code cleanup

* clean up gpu runner

* changes based on feedback

* lint failure
agunapal committed Jul 14, 2023
1 parent 2f5a784 commit 5390025
Showing 13 changed files with 167 additions and 24 deletions.
55 changes: 55 additions & 0 deletions .github/workflows/regression_tests_docker.yml
@@ -0,0 +1,55 @@
name: Run Regression Tests on Docker

on:
  # run every day at 5:15am
  schedule:
    - cron: '15 5 * * *'

concurrency:
  group: ci-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/master' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  docker-regression:
    strategy:
      fail-fast: false
      matrix:
        hardware: [ubuntu-20.04, [self-hosted, regression-test-gpu]]
    runs-on:
      - ${{ matrix.hardware }}
    steps:
      - name: Clean up previous run
        run: |
          echo "Cleaning up previous run"
          ls -la ./
          sudo rm -rf ./* || true
          sudo rm -rf ./.??* || true
          ls -la ./
          docker system prune -f
      - name: Checkout TorchServe
        uses: actions/checkout@v3
      - name: Branch name
        run: |
          echo $GITHUB_REF_NAME
      - name: Build CPU Docker Image
        if: contains(matrix.hardware, 'ubuntu')
        run: |
          cd docker
          ./build_image.sh -bt ci -b $GITHUB_REF_NAME -t pytorch/torchserve:ci
      - name: Build GPU Docker Image
        if: false == contains(matrix.hardware, 'ubuntu')
        run: |
          cd docker
          ./build_image.sh -g -cv cu117 -bt ci -b $GITHUB_REF_NAME -t pytorch/torchserve:ci
      - name: Torchserve GPU Regression Tests
        if: false == contains(matrix.hardware, 'ubuntu')
        run: |
          docker run --gpus all -v $GITHUB_WORKSPACE:/home/serve pytorch/torchserve:ci
      - name: Torchserve CPU Regression Tests
        if: contains(matrix.hardware, 'ubuntu')
        run: |
          docker run -v $GITHUB_WORKSPACE:/home/serve pytorch/torchserve:ci
      - name: Cleanup Docker Images
        if: success()
        run: |
          docker system prune -f && docker rmi pytorch/torchserve:ci
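For local debugging, the same flow can be reproduced outside Actions. A minimal sketch, assuming a TorchServe checkout as the working directory and a working Docker install; the branch, image tag, and mount path mirror the workflow above:

```python
# Rough local equivalent of the nightly job above -- a sketch, not the
# workflow itself.
import os
import subprocess

branch = os.environ.get("GITHUB_REF_NAME", "master")
use_gpu = False  # set True on a host with NVIDIA drivers

# Build the CI image (mirrors the "Build ... Docker Image" steps).
build_cmd = ["./build_image.sh", "-bt", "ci", "-b", branch, "-t", "pytorch/torchserve:ci"]
if use_gpu:
    build_cmd[1:1] = ["-g", "-cv", "cu117"]
subprocess.run(build_cmd, cwd="docker", check=True)

# Run the regression suite inside the container (mirrors the test steps).
run_cmd = ["docker", "run", "-v", f"{os.getcwd()}:/home/serve", "pytorch/torchserve:ci"]
if use_gpu:
    run_cmd[2:2] = ["--gpus", "all"]
subprocess.run(run_cmd, check=True)
```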
9 changes: 4 additions & 5 deletions .github/workflows/regression_tests_gpu.yml
@@ -20,11 +20,10 @@ jobs:
      - name: Clean up previous run
        run: |
          echo "Cleaning up previous run"
-         cd $RUNNER_WORKSPACE
-         pwd
-         cd ..
-         pwd
-         rm -rf _tool
+         ls -la ./
+         sudo rm -rf ./* || true
+         sudo rm -rf ./.??* || true
+         ls -la ./
      - name: Update git
        run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt-get update && sudo apt-get install git -y
      - name: Check git version
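The rewritten cleanup step removes both regular and hidden entries in the workspace. The `./.??*` glob is the usual trick for matching dotfiles without ever matching `.` or `..`: it requires a dot followed by at least two more characters. A quick Python illustration of the same pattern:

```python
# Demonstrates the ".??*" shell pattern used by "sudo rm -rf ./.??*" above.
import fnmatch

entries = [".", "..", ".git", ".github", ".cache", "serve", "README.md"]
hidden = [e for e in entries if fnmatch.fnmatch(e, ".??*")]
print(hidden)  # ['.git', '.github', '.cache'] -- "." and ".." can never match
# Caveat: two-character names like ".a" also escape the pattern.
```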
47 changes: 46 additions & 1 deletion docker/Dockerfile
@@ -85,7 +85,7 @@ RUN \
RUN python -m pip install --no-cache-dir torchserve torch-model-archiver torch-workflow-archiver

# Final image for production
-FROM ${BASE_IMAGE} AS runtime-image
+FROM ${BASE_IMAGE} AS production-image
# Re-state ARG PYTHON_VERSION to make it active in this build-stage (uses default define at the top)
ARG PYTHON_VERSION
ENV PYTHONUNBUFFERED TRUE
@@ -130,3 +130,48 @@ WORKDIR /home/model-server
ENV TEMP=/home/model-server/tmp
ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"]
CMD ["serve"]

# Final image for docker regression
FROM ${BASE_IMAGE} AS ci-image
# Re-state ARG PYTHON_VERSION to make it active in this build-stage (uses default define at the top)
ARG PYTHON_VERSION
ARG BRANCH_NAME
ENV PYTHONUNBUFFERED TRUE

RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update && \
    apt-get upgrade -y && \
    apt-get install software-properties-common -y && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
    apt remove python-pip python3-pip && \
    DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
    python$PYTHON_VERSION \
    python3-distutils \
    python$PYTHON_VERSION-dev \
    python$PYTHON_VERSION-venv \
    # using openjdk-17-jdk due to circular dependency(ca-certificates) bug in openjdk-17-jre-headless debian package
    # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1009905
    openjdk-17-jdk \
    build-essential \
    wget \
    numactl \
    nodejs \
    npm \
    zip \
    unzip \
    && npm install -g newman newman-reporter-htmlextra markdown-link-check \
    && rm -rf /var/lib/apt/lists/* \
    && cd /tmp


COPY --from=compile-image /home/venv /home/venv

ENV PATH="/home/venv/bin:$PATH"

RUN python -m pip install --no-cache-dir -r https://github.com/pytorch/serve/$BRANCH_NAME/requirements/developer.txt

RUN mkdir /home/serve
ENV TS_RUN_IN_DOCKER True

WORKDIR /home/serve
CMD ["python", "test/regression_tests.py"]
4 changes: 2 additions & 2 deletions docker/README.md
@@ -28,13 +28,13 @@ cd serve/docker

# Create TorchServe docker image

-Use `build_image.sh` script to build the docker images. The script builds the `production`, `dev` and `codebuild` docker images.
+Use `build_image.sh` script to build the docker images. The script builds the `production`, `dev`, `ci` and `codebuild` docker images.
| Parameter | Description |
|------|------|
|-h, --help|Show script help|
|-b, --branch_name|Specify a branch name to use. Default: master |
|-g, --gpu|Build image with GPU based ubuntu base image|
-|-bt, --buildtype|Which type of docker image to build. Can be one of : production, dev, codebuild|
+|-bt, --buildtype|Which type of docker image to build. Can be one of : production, dev, ci, codebuild|
|-t, --tag|Tag name for image. If not specified, script uses torchserve default tag names.|
|-cv, --cudaversion| Specify to cuda version to use. Supported values `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118`. Default `cu117`|
|-ipex, --build-with-ipex| Specify to build with intel_extension_for_pytorch. If not specified, script builds without intel_extension_for_pytorch.|
5 changes: 4 additions & 1 deletion docker/build_image.sh
@@ -137,7 +137,10 @@ fi

if [ "${BUILD_TYPE}" == "production" ]
then
-  DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}" -t "${DOCKER_TAG}" .
+  DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}" -t "${DOCKER_TAG}" --target production-image .
+elif [ "${BUILD_TYPE}" == "ci" ]
+then
+  DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}" --build-arg BRANCH_NAME="${BRANCH_NAME}" -t "${DOCKER_TAG}" --target ci-image .
elif [ "${BUILD_TYPE}" == "benchmark" ]
then
  DOCKER_BUILDKIT=1 docker build --pull --no-cache --file Dockerfile.benchmark --build-arg USE_LOCAL_SERVE_FOLDER=$USE_LOCAL_SERVE_FOLDER --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg CUDA_VERSION="${CUDA_VERSION}" --build-arg MACHINE_TYPE="${MACHINE}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}" -t "${DOCKER_TAG}" .
1 change: 0 additions & 1 deletion examples/dcgan_fashiongen/create_mar.sh
@@ -16,7 +16,6 @@ function cleanup {
trap cleanup EXIT

# Download and Extract model's source code
-sudo apt-get install zip unzip -y

wget https://github.com/facebookresearch/pytorch_GAN_zoo/archive/$SRCZIP
unzip $SRCZIP
16 changes: 16 additions & 0 deletions test/pytest/test_example_intel_extension_for_pytorch.py
@@ -88,6 +88,10 @@ def scale_workers_with_core_pinning(scaled_num_workers):
    or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
    reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available",
)
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
def test_single_worker_affinity():
    num_workers = 1
    worker_idx = 0
@@ -112,6 +116,10 @@ def test_single_worker_affinity():
    or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
    reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available",
)
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
def test_multi_worker_affinity():
    num_workers = 2
    setup_torchserve()
@@ -138,6 +146,10 @@ def test_multi_worker_affinity():
    or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
    reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available",
)
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
def test_worker_scale_up_affinity():
    initial_num_workers = 1
    setup_torchserve()
@@ -171,6 +183,10 @@ def test_worker_scale_up_affinity():
    or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
    reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available",
)
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
def test_worker_scale_down_affinity():
    initial_num_workers = 2
    setup_torchserve()
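The same four-line marker is repeated on each affinity test above. A hypothetical refactor (not part of this commit) that names the guard once and reuses it:

```python
import os

import pytest

# Hypothetical shared marker -- shown only to illustrate how the repeated
# skipif condition could live in one place.
skip_in_docker = pytest.mark.skipif(
    os.environ.get("TS_RUN_IN_DOCKER", False),
    reason="Test to be run outside docker",
)

@skip_in_docker
def test_single_worker_affinity():
    ...
```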
6 changes: 4 additions & 2 deletions test/pytest/test_example_scriptable_tokenzier.py
@@ -318,8 +318,10 @@ def test_inference_with_pretrained_model(model_store, test_file, torchserve):
assert "Positive" in result_entries

assert float(result_entries["Negative"]) == pytest.approx(
0.0001851904089562595, 1e-3
0.0001851904089562595, abs=1e-6
)
assert float(result_entries["Positive"]) == pytest.approx(
0.9998148083686829, abs=1e-6
)
assert float(result_entries["Positive"]) == pytest.approx(0.9998148083686829, 1e-3)

test_utils.unregister_model(model_name)
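The tolerance change here is substantive: `pytest.approx`'s second positional argument is a *relative* tolerance, so the old assertions allowed an error of roughly 1.9e-7 on the Negative score but roughly 1e-3 on the Positive score. Switching to `abs=1e-6` applies one uniform absolute bound to both. A small sketch of the difference:

```python
# Relative vs. absolute tolerance in pytest.approx (values from the test above).
from pytest import approx

expected = 0.9998148083686829

# Relative: |error| <= 1e-3 * |expected| ~= 1e-3 -- quite loose here.
assert expected + 5e-4 == approx(expected, 1e-3)

# Absolute: |error| <= 1e-6 regardless of magnitude -- much stricter.
assert expected + 5e-7 == approx(expected, abs=1e-6)
assert not (expected + 5e-4 == approx(expected, abs=1e-6))
```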
4 changes: 4 additions & 0 deletions test/pytest/test_handler.py
@@ -285,6 +285,10 @@ def test_kserve_mnist_model_register_and_inference_on_valid_model_explain():
test_utils.unregister_model("mnist")


@pytest.mark.skipif(
os.environ.get("TS_RUN_IN_DOCKER", False),
reason="Test to be run outside docker",
)
def test_huggingface_bert_batch_inference():
batch_size = 2
batch_delay = 10000 # 10 seconds
12 changes: 12 additions & 0 deletions test/pytest/test_sm_mme_requirements.py
@@ -15,6 +15,10 @@
)


+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
def test_no_model_loaded():
    """
    Validates that TorchServe returns response code 404 if no model is loaded.
@@ -34,6 +38,10 @@ def test_no_model_loaded():
    not ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
    reason="Test to be run on GPU only",
)
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
def test_oom_on_model_load():
    """
    Validates that TorchServe returns response code 507 if there is OOM on model loading.
@@ -63,6 +71,10 @@ def test_oom_on_model_load():
    not ((torch.cuda.device_count() > 0) and torch.cuda.is_available()),
    reason="Test to be run on GPU only",
)
+@pytest.mark.skipif(
+    os.environ.get("TS_RUN_IN_DOCKER", False),
+    reason="Test to be run outside docker",
+)
def test_oom_on_invoke():
    # Create model store directory
    pathlib.Path(test_utils.MODEL_STORE).mkdir(parents=True, exist_ok=True)
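Note that `test_oom_on_model_load` and `test_oom_on_invoke` now carry two `skipif` markers. Stacked markers act as an OR: the test is skipped when any condition holds, so these tests run only on a GPU host outside Docker. A hypothetical single-guard equivalent (not in the commit):

```python
import os

import pytest
import torch

# Equivalent combined condition: skip when there is no usable GPU OR when
# running inside the CI container.
gpu_only_outside_docker = pytest.mark.skipif(
    not (torch.cuda.is_available() and torch.cuda.device_count() > 0)
    or bool(os.environ.get("TS_RUN_IN_DOCKER", False)),
    reason="Test to be run on a GPU host outside docker",
)

@gpu_only_outside_docker
def test_oom_on_invoke():
    ...
```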
12 changes: 12 additions & 0 deletions test/pytest/test_torch_compile.py
@@ -64,6 +64,10 @@ def test_start_torchserve(self):
        assert len(glob.glob("logs/model_log.log")) == 1
        assert len(glob.glob("logs/ts_log.log")) == 1

+    @pytest.mark.skipif(
+        os.environ.get("TS_RUN_IN_DOCKER", False),
+        reason="Test to be run outside docker",
+    )
    def test_server_status(self):
        result = subprocess.run(
            "curl http://localhost:8080/ping",
@@ -75,6 +79,10 @@ def test_server_status(self):
        expected_server_status = json.loads(expected_server_status_str)
        assert json.loads(result.stdout) == expected_server_status

+    @pytest.mark.skipif(
+        os.environ.get("TS_RUN_IN_DOCKER", False),
+        reason="Test to be run outside docker",
+    )
    def test_registered_model(self):
        result = subprocess.run(
            "curl http://localhost:8081/models",
@@ -86,6 +94,10 @@ def test_registered_model(self):
        expected_registered_model = json.loads(expected_registered_model_str)
        assert json.loads(result.stdout) == expected_registered_model

+    @pytest.mark.skipif(
+        os.environ.get("TS_RUN_IN_DOCKER", False),
+        reason="Test to be run outside docker",
+    )
    def test_serve_inference(self):
        request_data = {"instances": [[1.0], [2.0], [3.0]]}
        request_json = json.dumps(request_data)
19 changes: 7 additions & 12 deletions test/regression_tests.py
@@ -1,28 +1,23 @@
-import sys
import os
-from pygit2 import Repository
+import sys

# To help discover local modules
REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
sys.path.append(REPO_ROOT)

+import datetime
+
+from ts_scripts import marsgen as mg
+from ts_scripts.api_utils import test_api
from ts_scripts.install_from_src import install_from_src
from ts_scripts.regression_utils import test_regression
-from ts_scripts.api_utils import test_api
-from ts_scripts import print_env_info as build_hdr_printer
from ts_scripts.utils import check_python_version
-from ts_scripts import marsgen as mg
-
-import datetime

now = datetime.datetime.now()
print("Current date and time : " + now.strftime("%Y-%m-%d %H:%M:%S"))

check_python_version()

-git_branch = Repository('.').head.shorthand
-build_hdr_printer.main(git_branch)
-
# Install from source
install_from_src()
@@ -32,10 +27,10 @@
# Run newman api tests
test_api(
    "all"
-) #"all" > management, inference, increased_timeout_inference, https collections
+) # "all" > management, inference, increased_timeout_inference, https collections

# Run regression tests
test_regression()

# delete mar_gen_dir
-mg.delete_model_store_gen_dir()
\ No newline at end of file
+mg.delete_model_store_gen_dir()
1 change: 1 addition & 0 deletions ts_scripts/spellcheck_conf/wordlist.txt
@@ -1062,4 +1062,5 @@ XLA
inferentia
ActionSLAM
statins
+ci
chatGPT
