From 4f8ddb824b346011982a9aa62d9698839a1d49cc Mon Sep 17 00:00:00 2001 From: Rovanion Luckey Date: Wed, 19 Apr 2023 14:20:56 +0200 Subject: [PATCH 1/5] Build reproducible tarballs from git --- easybuild/tools/filetools.py | 14 ++++--- test/framework/filetools.py | 72 +++++++++++++++++++++++------------- 2 files changed, 56 insertions(+), 30 deletions(-) diff --git a/easybuild/tools/filetools.py b/easybuild/tools/filetools.py index 80a6ba6560..7a431c8ee5 100644 --- a/easybuild/tools/filetools.py +++ b/easybuild/tools/filetools.py @@ -2690,11 +2690,15 @@ def get_source_tarball_from_git(filename, targetdir, git_config): for cmd in cmds: run.run_cmd(cmd, log_all=True, simple=True, regexp=False, path=repo_name, trace=False) - # create an archive and delete the git repo directory - if keep_git_dir: - tar_cmd = ['tar', 'cfvz', targetpath, repo_name] - else: - tar_cmd = ['tar', 'cfvz', targetpath, '--exclude', '.git', repo_name] + # When CentOS 7 is phased out and tar>1.28 is everywhere, replace find-sort-pipe with tar-flag + # '--sort=name' and place LC_ALL in front of tar. Also remove flags --null, --no-recursion, and + # --files-from - from the flags to tar. See https://reproducible-builds.org/docs/archives/ + tar_cmd = ['find', repo_name, '-print0', '-path \'*/.git\' -prune' if not keep_git_dir else '', '|', + 'LC_ALL=C', 'sort', '--zero-terminated', '|', + 'GZIP=--no-name', 'tar', '--create', '--file', targetpath, '--no-recursion', + '--gzip', '--mtime="1970-01-01 00:00Z"', '--owner=0', '--group=0', + '--numeric-owner', '--format=gnu', '--null', + '--no-recursion', '--files-from -'] run.run_cmd(' '.join(tar_cmd), log_all=True, simple=True, regexp=False, trace=False) # cleanup (repo_name dir does not exist in dry run mode) diff --git a/test/framework/filetools.py b/test/framework/filetools.py index fcaebe16d4..5e2d0412fe 100644 --- a/test/framework/filetools.py +++ b/test/framework/filetools.py @@ -2787,41 +2787,57 @@ def run_check(): 'url': 'git@github.com:easybuilders', 'tag': 'tag_for_tests', } - git_repo = {'git_repo': 'git@github.com:easybuilders/testrepository.git'} # Just to make the below shorter + string_args = { + 'git_repo': 'git@github.com:easybuilders/testrepository.git', + 'test_prefix': self.test_prefix, + } + expected = '\n'.join([ r' running command "git clone --depth 1 --branch tag_for_tests %(git_repo)s"', - r" \(in /.*\)", - r' running command "tar cfvz .*/target/test.tar.gz --exclude .git testrepository"', - r" \(in /.*\)", - ]) % git_repo + r" \(in .*/tmp.*\)", + r' running command "find testrepository -print0 -path \'*/.git\' -prune | LC_ALL=C sort --zero-terminated' + r' | GZIP=--no-name tar --create --file %(test_prefix)s/target/test.tar.gz --no-recursion' + r' --gzip --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu' + r' --null --no-recursion --files-from -"', + r" \(in .*/tmp.*\)", + ]) % string_args run_check() git_config['clone_into'] = 'test123' expected = '\n'.join([ r' running command "git clone --depth 1 --branch tag_for_tests %(git_repo)s test123"', - r" \(in /.*\)", - r' running command "tar cfvz .*/target/test.tar.gz --exclude .git test123"', - r" \(in /.*\)", - ]) % git_repo + r" \(in .*/tmp.*\)", + r' running command "find test123 -print0 -path \'*/.git\' -prune | LC_ALL=C sort --zero-terminated' + r' | GZIP=--no-name tar --create --file #(test_fprefix)s/target/test.tar.gz --no-recursion' + r' --gzip --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu' + r' --null --no-recursion --files-from -"', + r" \(in .*/tmp.*\)", + ]) % string_args run_check() del git_config['clone_into'] git_config['recursive'] = True expected = '\n'.join([ r' running command "git clone --depth 1 --branch tag_for_tests --recursive %(git_repo)s"', - r" \(in /.*\)", - r' running command "tar cfvz .*/target/test.tar.gz --exclude .git testrepository"', - r" \(in /.*\)", - ]) % git_repo + r" \(in .*/tmp.*\)", + r' running command "find testrepository -print0 -path \'*/.git\' -prune | LC_ALL=C sort --zero-terminated' + r' | GZIP=--no-name tar --create --file #(test_fprefix)s/target/test.tar.gz --no-recursion' + r' --gzip --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu' + r' --null --no-recursion --files-from -"', + r" \(in .*/tmp.*\)", + ]) % string_args run_check() git_config['keep_git_dir'] = True expected = '\n'.join([ r' running command "git clone --branch tag_for_tests --recursive %(git_repo)s"', - r" \(in /.*\)", - r' running command "tar cfvz .*/target/test.tar.gz testrepository"', - r" \(in /.*\)", - ]) % git_repo + r" \(in .*/tmp.*\)", + r' running command "find testrepository -print0 | LC_ALL=C sort --zero-terminated | GZIP=--no-name tar' + r' --create --file #(test_fprefix)s/target/test.tar.gz --no-recursion --gzip' + r' --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu --null --no-recursion' + r' --files-from -"', + r" \(in .*/tmp.*\)", + ]) % string_args run_check() del git_config['keep_git_dir'] @@ -2829,23 +2845,29 @@ def run_check(): git_config['commit'] = '8456f86' expected = '\n'.join([ r' running command "git clone --no-checkout %(git_repo)s"', - r" \(in /.*\)", + r" \(in .*/tmp.*\)", r' running command "git checkout 8456f86 && git submodule update --init --recursive"', r" \(in testrepository\)", - r' running command "tar cfvz .*/target/test.tar.gz --exclude .git testrepository"', - r" \(in /.*\)", - ]) % git_repo + r' running command "find testrepository -print0 -path \'*/.git\' -prune | LC_ALL=C sort --zero-terminated' + r' | GZIP=--no-name tar --create --file #(test_fprefix)s/target/test.tar.gz --no-recursion' + r' --gzip --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu' + r' --null --no-recursion --files-from -"', + r" \(in .*/tmp.*\)", + ]) % string_args run_check() del git_config['recursive'] expected = '\n'.join([ r' running command "git clone --no-checkout %(git_repo)s"', - r" \(in /.*\)", + r" \(in .*/tmp.*\)", r' running command "git checkout 8456f86"', r" \(in testrepository\)", - r' running command "tar cfvz .*/target/test.tar.gz --exclude .git testrepository"', - r" \(in /.*\)", - ]) % git_repo + r' running command "find testrepository -print0 -path \'*/.git\' -prune | LC_ALL=C sort --zero-terminated' + r' | GZIP=--no-name tar --create --file #(test_fprefix)s/target/test.tar.gz --no-recursion' + r' --gzip --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu' + r' --null --no-recursion --files-from -"', + r" \(in .*/tmp.*\)", + ]) % string_args run_check() # Test with real data. From 8d9e14eb6d72490c675adb8a70415a4c735cc0bc Mon Sep 17 00:00:00 2001 From: Alex Domingo Date: Thu, 29 Feb 2024 02:40:57 +0100 Subject: [PATCH 2/5] make reproducible archives only of git repos without .git dir and reset timestamps with touch --- easybuild/tools/filetools.py | 42 ++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/easybuild/tools/filetools.py b/easybuild/tools/filetools.py index 12fe557a60..5b7406fffa 100644 --- a/easybuild/tools/filetools.py +++ b/easybuild/tools/filetools.py @@ -2540,12 +2540,12 @@ def copy(paths, target_path, force_in_dry_run=False, **kwargs): raise EasyBuildError("Specified path to copy is not an existing file or directory: %s", path) -def get_source_tarball_from_git(filename, targetdir, git_config): +def get_source_tarball_from_git(filename, target_dir, git_config): """ Downloads a git repository, at a specific tag or commit, recursively or not, and make an archive with it :param filename: name of the archive to save the code to (must be .tar.gz) - :param targetdir: target directory where to save the archive to + :param target_dir: target directory where to save the archive to :param git_config: dictionary containing url, repo_name, recursive, and one of tag or commit """ # sanity check on git_config value being passed @@ -2584,8 +2584,7 @@ def get_source_tarball_from_git(filename, targetdir, git_config): raise EasyBuildError("git_config currently only supports filename ending in .tar.gz") # prepare target directory and clone repository - mkdir(targetdir, parents=True) - targetpath = os.path.join(targetdir, filename) + mkdir(target_dir, parents=True) # compose 'git clone' command, and run it if extra_config_params: @@ -2668,21 +2667,36 @@ def get_source_tarball_from_git(filename, targetdir, git_config): for cmd in cmds: run_shell_cmd(cmd, work_dir=work_dir, hidden=True, verbose_dry_run=True) - # When CentOS 7 is phased out and tar>1.28 is everywhere, replace find-sort-pipe with tar-flag - # '--sort=name' and place LC_ALL in front of tar. Also remove flags --null, --no-recursion, and - # --files-from - from the flags to tar. See https://reproducible-builds.org/docs/archives/ - tar_cmd = ['find', repo_name, '-print0', '-path \'*/.git\' -prune' if not keep_git_dir else '', '|', - 'LC_ALL=C', 'sort', '--zero-terminated', '|', - 'GZIP=--no-name', 'tar', '--create', '--file', targetpath, '--no-recursion', - '--gzip', '--mtime="1970-01-01 00:00Z"', '--owner=0', '--group=0', - '--numeric-owner', '--format=gnu', '--null', - '--no-recursion', '--files-from -'] + # Create archive + archive_path = os.path.join(target_dir, filename) + + if keep_git_dir: + # create archive of git repo including .git directory + tar_cmd = ['tar', 'cfvz', archive_path, repo_name] + else: + # create reproducible archive + # see https://reproducible-builds.org/docs/archives/ + # TODO: when CentOS 7 is phased out and tar>1.28 is everywhere, replace sort step + # in the pipe with tar-flag '--sort=name' and place LC_ALL in front of tar. + tar_cmd = [ + # print names of all files and folders excluding .git directory + 'find', repo_name, '-name ".git"', '-prune', '-o', '-print0', + # reset access and modification timestamps + '-exec', 'touch', '-t 197001010100', '{}', '\;', '|', + # sort file list + 'LC_ALL=C', 'sort', '--zero-terminated', '|', + # create tarball in GNU format with ownership reset + 'tar', '--create', '--no-recursion', '--owner=0', '--group=0', '--numeric-owner', '--format=gnu', + '--null', '--files-from', '-', '|', + # compress tarball with gzip without original file name and timestamp + 'gzip', '--no-name', '>', archive_path + ] run_shell_cmd(' '.join(tar_cmd), work_dir=tmpdir, hidden=True, verbose_dry_run=True) # cleanup (repo_name dir does not exist in dry run mode) remove(tmpdir) - return targetpath + return archive_path def move_file(path, target_path, force_in_dry_run=False): From 46e61c85d881dc081936347930cfa44bdbd42bde Mon Sep 17 00:00:00 2001 From: Alex Domingo Date: Thu, 29 Feb 2024 02:52:25 +0100 Subject: [PATCH 3/5] update FileToolsTest.test_github_get_source_tarball_from_git tests with reproducible tar commands --- test/framework/filetools.py | 63 +++++++++++++++---------------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/test/framework/filetools.py b/test/framework/filetools.py index 3adb736586..8d823ef59c 100644 --- a/test/framework/filetools.py +++ b/test/framework/filetools.py @@ -2802,26 +2802,25 @@ def run_check(): 'git_repo': 'git@github.com:easybuilders/testrepository.git', 'test_prefix': self.test_prefix, } + reprod_tar_cmd_pattern = ( + r' running shell command "find {} -name \".git\" -prune -o -print0 -exec touch -t 197001010100 {{}} \; |' + r' LC_ALL=C sort --zero-terminated | tar --create --no-recursion --owner=0 --group=0 --numeric-owner' + r' --format=gnu --null --files-from - | gzip --no-name > %(test_prefix)s/target/test.tar.gz' + ) expected = '\n'.join([ - r' running command "git clone --depth 1 --branch tag_for_tests %(git_repo)s"', + r' running shell command "git clone --depth 1 --branch tag_for_tests %(git_repo)s"', r" \(in .*/tmp.*\)", - r' running command "find testrepository -print0 -path \'*/.git\' -prune | LC_ALL=C sort --zero-terminated' - r' | GZIP=--no-name tar --create --file %(test_prefix)s/target/test.tar.gz --no-recursion' - r' --gzip --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu' - r' --null --no-recursion --files-from -"', + reprod_tar_cmd_pattern.format("testrepository"), r" \(in .*/tmp.*\)", ]) % string_args run_check() git_config['clone_into'] = 'test123' expected = '\n'.join([ - r' running command "git clone --depth 1 --branch tag_for_tests %(git_repo)s test123"', + r' running shell command "git clone --depth 1 --branch tag_for_tests %(git_repo)s test123"', r" \(in .*/tmp.*\)", - r' running command "find test123 -print0 -path \'*/.git\' -prune | LC_ALL=C sort --zero-terminated' - r' | GZIP=--no-name tar --create --file #(test_fprefix)s/target/test.tar.gz --no-recursion' - r' --gzip --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu' - r' --null --no-recursion --files-from -"', + reprod_tar_cmd_pattern.format("test123"), r" \(in .*/tmp.*\)", ]) % string_args run_check() @@ -2829,12 +2828,9 @@ def run_check(): git_config['recursive'] = True expected = '\n'.join([ - r' running command "git clone --depth 1 --branch tag_for_tests --recursive %(git_repo)s"', + r' running shell command "git clone --depth 1 --branch tag_for_tests --recursive %(git_repo)s"', r" \(in .*/tmp.*\)", - r' running command "find testrepository -print0 -path \'*/.git\' -prune | LC_ALL=C sort --zero-terminated' - r' | GZIP=--no-name tar --create --file #(test_fprefix)s/target/test.tar.gz --no-recursion' - r' --gzip --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu' - r' --null --no-recursion --files-from -"', + reprod_tar_cmd_pattern.format("testrepository"), r" \(in .*/tmp.*\)", ]) % string_args run_check() @@ -2844,9 +2840,9 @@ def run_check(): ' running shell command "git clone --depth 1 --branch tag_for_tests --recursive' + ' --recurse-submodules=\'!vcflib\' --recurse-submodules=\'!sdsl-lite\' %(git_repo)s"', r" \(in .*/tmp.*\)", - r' running shell command "tar cfvz .*/target/test.tar.gz --exclude .git testrepository"', + reprod_tar_cmd_pattern.format("testrepository"), r" \(in .*/tmp.*\)", - ]) % git_repo + ]) % string_args run_check() git_config['extra_config_params'] = [ @@ -2858,21 +2854,18 @@ def run_check(): + ' clone --depth 1 --branch tag_for_tests --recursive' + ' --recurse-submodules=\'!vcflib\' --recurse-submodules=\'!sdsl-lite\' %(git_repo)s"', r" \(in .*/tmp.*\)", - r' running shell command "tar cfvz .*/target/test.tar.gz --exclude .git testrepository"', + reprod_tar_cmd_pattern.format("testrepository"), r" \(in .*/tmp.*\)", - ]) % git_repo + ]) % string_args run_check() del git_config['recurse_submodules'] del git_config['extra_config_params'] git_config['keep_git_dir'] = True expected = '\n'.join([ - r' running command "git clone --branch tag_for_tests --recursive %(git_repo)s"', + r' running shell command "git clone --branch tag_for_tests --recursive %(git_repo)s"', r" \(in .*/tmp.*\)", - r' running command "find testrepository -print0 | LC_ALL=C sort --zero-terminated | GZIP=--no-name tar' - r' --create --file #(test_fprefix)s/target/test.tar.gz --no-recursion --gzip' - r' --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu --null --no-recursion' - r' --files-from -"', + r' running shell command "tar cfvz .*/target/test.tar.gz testrepository"', r" \(in .*/tmp.*\)", ]) % string_args run_check() @@ -2881,28 +2874,22 @@ def run_check(): del git_config['tag'] git_config['commit'] = '8456f86' expected = '\n'.join([ - r' running command "git clone --no-checkout %(git_repo)s"', + r' running shell command "git clone --no-checkout %(git_repo)s"', r" \(in .*/tmp.*\)", - r' running command "git checkout 8456f86 && git submodule update --init --recursive"', + r' running shell command "git checkout 8456f86 && git submodule update --init --recursive"', r" \(in testrepository\)", - r' running command "find testrepository -print0 -path \'*/.git\' -prune | LC_ALL=C sort --zero-terminated' - r' | GZIP=--no-name tar --create --file #(test_fprefix)s/target/test.tar.gz --no-recursion' - r' --gzip --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu' - r' --null --no-recursion --files-from -"', + reprod_tar_cmd_pattern.format("testrepository"), r" \(in .*/tmp.*\)", ]) % string_args run_check() git_config['recurse_submodules'] = ['!vcflib', '!sdsl-lite'] expected = '\n'.join([ - r' running command "git clone --no-checkout %(git_repo)s"', + r' running shell command "git clone --no-checkout %(git_repo)s"', r" \(in .*/tmp.*\)", - r' running command "git checkout 8456f86"', + r' running shell command "git checkout 8456f86"', r" \(in testrepository\)", - r' running command "find testrepository -print0 -path \'*/.git\' -prune | LC_ALL=C sort --zero-terminated' - r' | GZIP=--no-name tar --create --file #(test_fprefix)s/target/test.tar.gz --no-recursion' - r' --gzip --mtime="1970-01-01 00:00Z" --owner=0 --group=0 --numeric-owner --format=gnu' - r' --null --no-recursion --files-from -"', + reprod_tar_cmd_pattern.format("testrepository"), r" \(in .*/tmp.*\)", ]) % string_args run_check() @@ -2914,9 +2901,9 @@ def run_check(): r" \(in /.*\)", r' running shell command "git checkout 8456f86"', r" \(in /.*/testrepository\)", - r' running shell command "tar cfvz .*/target/test.tar.gz --exclude .git testrepository"', + reprod_tar_cmd_pattern.format("testrepository"), r" \(in /.*\)", - ]) % git_repo + ]) % string_args run_check() # Test with real data. From 8c76561bf026831bfb7ff065a923636f20dd4b1f Mon Sep 17 00:00:00 2001 From: Alex Domingo Date: Thu, 29 Feb 2024 09:56:49 +0100 Subject: [PATCH 4/5] remove TODO about sort option in tar as it is not supported across implementations --- easybuild/tools/filetools.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/easybuild/tools/filetools.py b/easybuild/tools/filetools.py index 5b7406fffa..470e347e88 100644 --- a/easybuild/tools/filetools.py +++ b/easybuild/tools/filetools.py @@ -2676,8 +2676,6 @@ def get_source_tarball_from_git(filename, target_dir, git_config): else: # create reproducible archive # see https://reproducible-builds.org/docs/archives/ - # TODO: when CentOS 7 is phased out and tar>1.28 is everywhere, replace sort step - # in the pipe with tar-flag '--sort=name' and place LC_ALL in front of tar. tar_cmd = [ # print names of all files and folders excluding .git directory 'find', repo_name, '-name ".git"', '-prune', '-o', '-print0', From 4363f9f371cbb94623e0be62c5ca3de37469fd88 Mon Sep 17 00:00:00 2001 From: Rovanion Luckey Date: Fri, 1 Mar 2024 10:40:09 +0100 Subject: [PATCH 5/5] filetools: Fix missing raw string for \; in get_source_tarball_from_git. --- easybuild/tools/filetools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/tools/filetools.py b/easybuild/tools/filetools.py index 470e347e88..330c89366b 100644 --- a/easybuild/tools/filetools.py +++ b/easybuild/tools/filetools.py @@ -2680,7 +2680,7 @@ def get_source_tarball_from_git(filename, target_dir, git_config): # print names of all files and folders excluding .git directory 'find', repo_name, '-name ".git"', '-prune', '-o', '-print0', # reset access and modification timestamps - '-exec', 'touch', '-t 197001010100', '{}', '\;', '|', + '-exec', 'touch', '-t 197001010100', '{}', r'\;', '|', # sort file list 'LC_ALL=C', 'sort', '--zero-terminated', '|', # create tarball in GNU format with ownership reset