CI Infra updates (#19903)

This PR makes a number of changes to make it more stable: Remove SafeDocker client which uses python docker package to run containers. Change to use "docker run" command directly using subprocess.call(), because the python-docker client does not support a gpus parameter which newer docker versions use and we don't get timeout issues when using the docker command directly. This will allow us to update our AMIs to use newer docker versions. In order to support both docker variants simultaneously, we first try to use the --gpus all parameter to docker run, if it fails with error code 125 (which means docker run command itself failed,) then we retry using the old --runtime nvidia parameter. Remove the extra custom codecov calls (which usually fail and have to retry multiple times, even though the initial codecov command works.)
apache · Feb 19, 2021 · 2c4f5aa · 2c4f5aa
1 parent ecadeac
commit 2c4f5aa
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 740 deletions.
diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy
@@ -112,20 +112,7 @@ def get_git_commit_hash() {
 }
 
 def publish_test_coverage() {
-    // CodeCovs auto detection has trouble with our CIs PR validation due the merging strategy
-    git_commit_hash = get_git_commit_hash()
-
-    if (env.CHANGE_ID) {
-      // PR execution
-      codecovArgs = "-B ${env.CHANGE_TARGET} -C ${git_commit_hash} -P ${env.CHANGE_ID}"
-    } else {
-      // Branch execution
-      codecovArgs = "-B ${env.BRANCH_NAME} -C ${git_commit_hash}"
-    }
-
-    // To make sure we never fail because test coverage reporting is not available
-    // Fall back to our own copy of the bash helper if it failed to download the public version
-    sh "(curl --retry 10 -s https://codecov.io/bash | bash -s - ${codecovArgs}) || (curl --retry 10 -s https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data/codecov-bash.txt | bash -s - ${codecovArgs}) || true"
+    sh "curl -s https://codecov.io/bash | bash"
 }
 
 def collect_test_results_unix(original_file_name, new_file_name) {

diff --git a/ci/build.py b/ci/build.py
@@ -34,7 +34,6 @@
 
 import yaml
 
-from safe_docker_run import SafeDockerClient
 from util import *
 
 
@@ -104,8 +103,7 @@ def default_ccache_dir() -> str:
     return os.path.join(os.path.expanduser("~"), ".ccache")
 
 
-def container_run(docker_client: SafeDockerClient,
-                  platform: str,
+def container_run(platform: str,
                   nvidia_runtime: bool,
                   docker_registry: str,
                   shared_memory_size: str,
@@ -114,17 +112,12 @@ def container_run(docker_client: SafeDockerClient,
                   environment: Dict[str, str],
                   dry_run: bool = False) -> int:
     """Run command in a container"""
-    container_wait_s = 600
-    #
-    # Environment setup
-    #
+    # set default environment variables
     environment.update({
         'CCACHE_MAXSIZE': '500G',
         'CCACHE_TEMPDIR': '/tmp/ccache',  # temp dir should be local and not shared
-        'CCACHE_DIR': '/work/ccache',  # this path is inside the container as /work/ccache is
-                                       # mounted
-        'CCACHE_LOGFILE': '/tmp/ccache.log',  # a container-scoped log, useful for ccache
-                                              # verification.
+        'CCACHE_DIR': '/work/ccache',  # this path is inside the container as /work/ccache is mounted
+        'CCACHE_LOGFILE': '/tmp/ccache.log',  # a container-scoped log, useful for ccache verification.
     })
     environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ})
 
@@ -136,13 +129,9 @@ def container_run(docker_client: SafeDockerClient,
     os.makedirs(local_ccache_dir, exist_ok=True)
     logging.info("Using ccache directory: %s", local_ccache_dir)
 
-    # Equivalent command
-    docker_cmd_list = [
-        "docker",
-        'run',
-        "--gpus all" if nvidia_runtime else "",
-        "--cap-add",
-        "SYS_PTRACE", # Required by ASAN
+    # Build docker command
+    docker_arg_list = [
+        "--cap-add", "SYS_PTRACE", # Required by ASAN
         '--rm',
         '--shm-size={}'.format(shared_memory_size),
         # mount mxnet root
@@ -158,40 +147,27 @@ def container_run(docker_client: SafeDockerClient,
         '-e', "CCACHE_DIR={}".format(environment['CCACHE_DIR']),
         # a container-scoped log, useful for ccache verification.
         '-e', "CCACHE_LOGFILE={}".format(environment['CCACHE_LOGFILE']),
-        '-ti',
-        tag]
-    docker_cmd_list.extend(command)
-    docker_cmd = ' \\\n\t'.join(docker_cmd_list)
-    logging.info("Running %s in container %s", command, tag)
-    logging.info("Executing the equivalent of:\n%s\n", docker_cmd)
+    ]
+    docker_arg_list += [tag]
+    docker_arg_list.extend(command)
+
+    def docker_run_cmd(cmd):
+        logging.info("Running %s in container %s", command, tag)
+        logging.info("Executing command:\n%s\n", ' \\\n\t'.join(cmd))
+        subprocess.run(cmd, stdout=sys.stdout, stderr=sys.stderr, check=True)
 
     if not dry_run:
-        #############################
-        #
-        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM})
-        # noinspection PyShadowingNames
-        runtime = None
-        if nvidia_runtime:
-            # noinspection PyShadowingNames
-            # runc is default (docker info | grep -i runtime)
-            runtime = 'nvidia'
-
-        return docker_client.run(
-            tag,
-            runtime=runtime,
-            command=command,
-            shm_size=shared_memory_size,
-            user='{}:{}'.format(os.getuid(), os.getgid()),
-            cap_add='SYS_PTRACE',
-            volumes={
-                mx_root:
-                    {'bind': '/work/mxnet', 'mode': 'rw'},
-                local_build_folder:
-                    {'bind': '/work/build', 'mode': 'rw'},
-                local_ccache_dir:
-                    {'bind': '/work/ccache', 'mode': 'rw'},
-            },
-            environment=environment)
+        if not nvidia_runtime:
+            docker_run_cmd(['docker', 'run'] + docker_arg_list)
+        else:
+            try:
+                docker_run_cmd(['docker', 'run', '--gpus', 'all'] + docker_arg_list)
+            except subprocess.CalledProcessError as e:
+                if e.returncode == 125:
+                    docker_run_cmd(['docker', 'run', '--runtime', 'nvidia'] + docker_arg_list)
+                else:
+                    raise
+
     return 0
 
 
@@ -292,8 +268,6 @@ def main() -> int:
     args = parser.parse_args()
 
     command = list(chain.from_iterable(args.command))
-    docker_client = SafeDockerClient()
-
     environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e]))
                         for e in args.environment])
 
@@ -318,29 +292,28 @@ def main() -> int:
         ret = 0
         if command:
             ret = container_run(
-                docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
+                platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
                 local_ccache_dir=args.ccache_dir, environment=environment)
         elif args.print_docker_run:
             command = []
             ret = container_run(
-                docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
+                platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
                 local_ccache_dir=args.ccache_dir, dry_run=True, environment=environment)
         else:
             # With no commands, execute a build function for the target platform
             command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
             logging.info("No command specified, trying default build: %s", ' '.join(command))
             ret = container_run(
-                docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
+                platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
                 local_ccache_dir=args.ccache_dir, environment=environment)
 
         if ret != 0:
             logging.critical("Execution of %s failed with status: %d", command, ret)
             return ret
 
-
     else:
         parser.print_help()
         list_platforms()