Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion easybuild/tools/build_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def init_logging(logfile, logtostdout=False, silent=False, colorize=fancylogger.
os.close(fd)

fancylogger.logToFile(logfile, max_bytes=0)
print_msg('temporary log file in case of crash %s' % (logfile), log=None, silent=silent)
print_msg('Temporary log file in case of crash %s' % (logfile), log=None, silent=silent)

log = fancylogger.getLogger(fname=False)

Expand Down
1 change: 1 addition & 0 deletions easybuild/tools/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
'filter_env_vars',
'hide_deps',
'hide_toolchains',
'http_header_fields_urlpat',
'force_download',
'from_pr',
'git_working_dirs_path',
Expand Down
89 changes: 89 additions & 0 deletions easybuild/tools/filetools.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,75 @@ def derive_alt_pypi_url(url):
return alt_pypi_url


def parse_http_header_fields_urlpat(arg, urlpat=None, header=None, urlpat_headers_collection=None, maxdepth=3):
    """
    Recurse into multi-line string "[URLPAT::][HEADER:]FILE|FIELD" where FILE may be another such string or file
    containing lines matching the same format, such as "^https://www.example.com::/path/to/headers.txt", and flatten
    the result to dict e.g. {'^https://www.example.com': ['Authorization: Basic token', 'User-Agent: Special Agent']}

    :param arg: multi-line string (or path to a file) in [URLPAT::][HEADER:]FILE|FIELD format
    :param urlpat: URL pattern inherited from the parent call (used while recursing)
    :param header: header name inherited from the parent call (used while recursing)
    :param urlpat_headers_collection: dict of {urlpat: [header, ...]} collected so far (not modified in place)
    :param maxdepth: remaining file-inclusion depth; guards against a file that includes itself
    :return: dict mapping each URL pattern to a list of "HEADER:FIELD" strings
    """
    if urlpat_headers_collection is None:
        # this function call is not a recursive call
        urlpat_headers = {}
    else:
        # copy existing header data to avoid modifying it
        urlpat_headers = urlpat_headers_collection.copy()

    # stop infinite recursion that might happen if a file.txt refers to itself
    if maxdepth < 0:
        raise EasyBuildError("Failed to parse_http_header_fields_urlpat (recursion limit)")

    if not isinstance(arg, str):
        raise EasyBuildError("Failed to parse_http_header_fields_urlpat (argument not a string)")

    # HTTP header fields are separated by CRLF but splitting on LF is more convenient
    for argline in arg.split('\n'):
        argline = argline.strip()  # remove optional whitespace (e.g. remaining CR)
        if argline == '' or argline.startswith('#'):
            continue  # permit comment lines: ignore them

        if os.path.isfile(os.path.join(os.getcwd(), argline)):
            # expand existing relative path to absolute
            argline = os.path.join(os.getcwd(), argline)
        if os.path.isfile(argline):
            # argline is a file path, so read that instead; decrement depth to bound file inclusion
            _log.debug('File included in parse_http_header_fields_urlpat: %s' % argline)
            argline = read_file(argline)
            urlpat_headers = parse_http_header_fields_urlpat(argline, urlpat, header, urlpat_headers, maxdepth - 1)
            continue

        # URL pattern is separated by '::' from a HTTP header field
        if '::' in argline:
            urlpat, argline = argline.split('::', 1)  # get the urlpat
            # the remainder may be another parseable argument, recurse with same depth
            urlpat_headers = parse_http_header_fields_urlpat(argline, urlpat, header, urlpat_headers, maxdepth)
            continue

        # Header field has format HEADER: FIELD, and FIELD may be another parseable argument
        # except if FIELD contains colons, then argline is the final HEADER: FIELD to be returned
        if argline.count(':') == 1:
            argheader, argline = argline.split(':', 1)  # get the header and the remainder
            # the remainder may be another parseable argument, recurse with same depth
            # note that argheader would be forgotten in favor of the urlpat_headers returned by recursion,
            # so pass on the header for reconstruction just in case there was nothing to recurse in
            urlpat_headers = parse_http_header_fields_urlpat(argline, urlpat, argheader, urlpat_headers, maxdepth)
            continue

        if header is not None:
            # parent caller didn't want to forget about the header, reconstruct as recursion stops here.
            argline = header.strip() + ':' + argline

        if urlpat is not None:
            # append this header to the urlpat's list, creating the list on first use
            urlpat_headers.setdefault(urlpat, []).append(argline)
        else:
            _log.warning("Non-empty argument to http-header-fields-urlpat ignored (missing URL pattern)")

    # return a dict full of {urlpat: [list, of, headers]}
    return urlpat_headers


def download_file(filename, url, path, forced=False):
"""Download a file from the given URL, to the specified path."""

Expand All @@ -581,6 +650,15 @@ def download_file(filename, url, path, forced=False):
timeout = 10
_log.debug("Using timeout of %s seconds for initiating download" % timeout)

# parse option HTTP header fields for URLs containing a pattern
http_header_fields_urlpat = build_option('http_header_fields_urlpat')
# compile a dict full of {urlpat: [header, list]}
urlpat_headers = dict()
if http_header_fields_urlpat is not None:
# there may be multiple options given, parse them all, while updating urlpat_headers
for arg in http_header_fields_urlpat:
urlpat_headers.update(parse_http_header_fields_urlpat(arg))

# make sure directory exists
basedir = os.path.dirname(path)
mkdir(basedir, parents=True)
Expand All @@ -592,6 +670,17 @@ def download_file(filename, url, path, forced=False):

# use custom HTTP header
headers = {'User-Agent': 'EasyBuild', 'Accept': '*/*'}

# permit additional or override headers via http_headers_fields_urlpat option
# only append/override HTTP header fields that match current url
if urlpat_headers is not None:
for urlpatkey, http_header_fields in urlpat_headers.items():
if re.search(urlpatkey, url):
extraheaders = dict(hf.split(':', 1) for hf in http_header_fields)
for key, val in extraheaders.items():
headers[key] = val
_log.debug("Custom HTTP header field set: %s (value omitted from log)", key)

# for backward compatibility, and to avoid relying on 3rd party Python library 'requests'
url_req = std_urllib.Request(url, headers=headers)
used_urllib = std_urllib
Expand Down
5 changes: 5 additions & 0 deletions easybuild/tools/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,11 @@ def override_options(self):
"(e.g. --hide-deps=zlib,ncurses)", 'strlist', 'extend', None),
'hide-toolchains': ("Comma separated list of toolchains that you want automatically hidden, "
"(e.g. --hide-toolchains=GCCcore)", 'strlist', 'extend', None),
'http-header-fields-urlpat': ("Set extra HTTP header FIELDs when downloading files from URL PATterns. "
"To not log sensitive values, specify a file containing newline separated "
"FIELDs. e.g. \"^https://www.example.com::/path/to/headers.txt\" or "
"\"client[A-z0-9]*.example.com': ['Authorization: Basic token']\".",
None, 'append', None, {'metavar': '[URLPAT::][HEADER:]FILE|FIELD'}),
'ignore-checksums': ("Ignore failing checksum verification", None, 'store_true', False),
'ignore-osdeps': ("Ignore any listed OS dependencies", None, 'store_true', False),
'install-latest-eb-release': ("Install latest known version of easybuild", None, 'store_true', False),
Expand Down
2 changes: 1 addition & 1 deletion test/framework/build_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ def test_init_logging(self):
self.assertTrue(os.path.exists(logfile))
self.assertEqual(os.path.dirname(logfile), tmpdir)
self.assertTrue(isinstance(log, EasyBuildLog))
self.assertTrue(stdout.startswith("== temporary log file in case of crash"))
self.assertTrue(stdout.startswith("== Temporary log file in case of crash"))

stop_logging(logfile)

Expand Down
178 changes: 177 additions & 1 deletion test/framework/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@
from easybuild.tools.config import find_last_log, get_build_log_path, get_module_syntax, module_classes
from easybuild.tools.environment import modify_env
from easybuild.tools.filetools import change_dir, copy_dir, copy_file, download_file, is_patch_file, mkdir
from easybuild.tools.filetools import read_file, remove_dir, remove_file, which, write_file
from easybuild.tools.filetools import parse_http_header_fields_urlpat, read_file, remove_dir, remove_file
from easybuild.tools.filetools import which, write_file
from easybuild.tools.github import GITHUB_RAW, GITHUB_EB_MAIN, GITHUB_EASYCONFIGS_REPO
from easybuild.tools.github import URL_SEPARATOR, fetch_github_token
from easybuild.tools.modules import Lmod
Expand Down Expand Up @@ -2563,6 +2564,181 @@ def test_hide_toolchains(self):
self.assertTrue(re.search(r'module: GCC/\.4\.9\.2', outtxt))
self.assertTrue(re.search(r'module: gzip/1\.6-GCC-4\.9\.2', outtxt))

def test_parse_http_header_fields_urlpat(self):
"""Test function parse_http_header_fields_urlpat"""
urlex = "example.com"
urlgnu = "gnu.org"
hdrauth = "Authorization"
valauth = "Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ=="
hdragent = "User-Agent"
valagent = "James/0.0.7 (MI6)"
hdrrefer = "Referer"
valrefer = "http://www.example.com/"
filesub1 = os.path.join(self.test_prefix, "testhttpheaders1.txt")
filesub2 = os.path.join(self.test_prefix, "testhttpheaders2.txt")
filesub3 = os.path.join(self.test_prefix, "testhttpheaders3.txt")
filesub4 = os.path.join(self.test_prefix, "testhttpheaders4.txt")
fileauth = os.path.join(self.test_prefix, "testhttpheadersauth.txt")
write_file(filesub4, filesub3)
write_file(filesub3, filesub2)
write_file(filesub2, filesub1)
write_file(filesub1, "%s::%s:%s\n" % (urlgnu, hdrauth, valauth))
write_file(filesub2, "%s::%s\n" % (urlex, filesub1))
write_file(filesub3, "%s::%s:%s\n" % (urlex, hdragent, filesub2))
write_file(fileauth, "%s\n" % (valauth))

# Case A: basic pattern
args = "%s::%s:%s" % (urlgnu, hdragent, valagent)
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlgnu: ["%s:%s" % (hdragent, valagent)]}, urlpat_headers)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's better to hardcode stuff here in the tests, since it makes things a whole lot more explicit:

args = "gnu.org::User-Agent:James/0.0.7 (MI6)"
urlpat_headers = parse_http_header_fields_urlpat(args)
expected = {'gnu.org': ['User-Agent:James/0.0.7 (MI6)']}
self.assertEqual(expected, urlpat_headers)


# Case B: urlpat has another urlpat: retain deepest level
args = "%s::%s::%s::%s:%s" % (urlgnu, urlgnu, urlex, hdragent, valagent)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wondering: what's the use case for nesting of URL patterns?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well it is not really useful on the same line, but it to facilitate the case where another URL pattern is specified in a file that is read. If recursive calls to several files occur, it should pick up the deepest level url pattern for the header field.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, makes sense.

urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlex: ["%s:%s" % (hdragent, valagent)]}, urlpat_headers)

# Case C: header value has a colon
args = "%s::%s:%s" % (urlex, hdrrefer, valrefer)
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlex: ["%s:%s" % (hdrrefer, valrefer)]}, urlpat_headers)

# Case D: recurse into files
args = filesub3
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers)

# Case E: recurse into files as header
args = "%s::%s" % (urlex, filesub3)
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers)

# Case F: recurse into files as value (header is replaced)
args = "%s::%s:%s" % (urlex, hdrrefer, filesub3)
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers)

# Case G: recurse into files as value (header is retained)
args = "%s::%s:%s" % (urlgnu, hdrauth, fileauth)
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers)

# Case H: recurse into files but hit limit
args = filesub4
error_regex = r"Failed to parse_http_header_fields_urlpat \(recursion limit\)"
self.assertErrorRegex(EasyBuildError, error_regex, parse_http_header_fields_urlpat, args)

# Case I: argument is not a string
args = list("foobar")
error_regex = r"Failed to parse_http_header_fields_urlpat \(argument not a string\)"
self.assertErrorRegex(EasyBuildError, error_regex, parse_http_header_fields_urlpat, args)

def test_http_header_fields_urlpat(self):
"""Test use of --http-header-fields-urlpat."""
test_ecs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'easyconfigs', 'test_ecs')
ec_file = os.path.join(test_ecs_dir, 'g', 'gzip', 'gzip-1.6-GCC-4.9.2.eb')
common_args = [
ec_file,
'--stop=fetch',
'--debug',
'--force',
'--force-download',
'--logtostdout',
]

# define header fields:values that should (not) show up in the logs, either
# because they are secret or because they are not matched for the url
testdohdr = 'HeaderAPPLIED'
testdoval = 'SECRETvalue'
testdonthdr = 'HeaderIGNORED'
testdontval = 'BOGUSvalue'

# header fields (or its values) could be files to be read instead of literals
testcmdfile = os.path.join(self.test_prefix, 'testhttpheaderscmdline.txt')
testincfile = os.path.join(self.test_prefix, 'testhttpheadersvalinc.txt')
testexcfile = os.path.join(self.test_prefix, 'testhttpheadersvalexc.txt')
testinchdrfile = os.path.join(self.test_prefix, 'testhttpheadershdrinc.txt')
testexchdrfile = os.path.join(self.test_prefix, 'testhttpheadershdrexc.txt')
testurlpatfile = os.path.join(self.test_prefix, 'testhttpheadersurlpat.txt')

# log mention format upon header or file inclusion
mentionhdr = 'Custom HTTP header field set: %s'
mentionfile = 'File included in parse_http_header_fields_urlpat: %s'

def run_and_assert(args, msg, words_expected=None, words_unexpected=None):
stdout, stderr = self._run_mock_eb(args, do_build=True, raise_error=True, testing=False)
if words_expected is not None:
for thestring in words_expected:
self.assertTrue(re.compile(thestring).search(stdout), "Pattern '%s' missing from log (%s)" %
(thestring, msg))
if words_unexpected is not None:
for thestring in words_unexpected:
self.assertFalse(re.compile(thestring).search(stdout), "Pattern '%s' leaked into log (%s)" %
(thestring, msg))

# A: simple direct case (all is logged because passed directly via EasyBuild configuration options)
args = list(common_args)
args.extend([
'--http-header-fields-urlpat=gnu.org::%s:%s' % (testdohdr, testdoval),
'--http-header-fields-urlpat=nomatch.com::%s:%s' % (testdonthdr, testdontval),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should also check with single --http-header-fields-urlpat that has both entries, comma-separated?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried that, but decided not to use the strlist style option, because the comma could be used as a field value. The separation character is \n, but that is awkward to use on the command line. The user can still use the option twice or more times if needed, or just use a file to specify an array of fields... So there is no support for --http-header-fields-urlpat=a,b,c at the moment. Is it ok?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a good technical reason not to support it, so fine by me :)

])
# expect to find everything passed on cmdline
expected = [mentionhdr % (testdohdr), testdoval, testdonthdr, testdontval]
run_and_assert(args, "case A", expected)

# all subsequent tests share this argument list
args = common_args
args.append('--http-header-fields-urlpat=%s' % (testcmdfile))

# B: simple file case (secrets in file are not logged)
txt = '\n'.join([
'gnu.org::%s: %s' % (testdohdr, testdoval),
'nomatch.com::%s: %s' % (testdonthdr, testdontval),
'',
])
write_file(testcmdfile, txt)
# expect to find only the header key (not its value) and only for the appropriate url
expected = [mentionhdr % testdohdr, mentionfile % testcmdfile]
not_expected = [testdoval, testdonthdr, testdontval]
run_and_assert(args, "case B", expected, not_expected)

# C: recursion one: header value is another file
txt = '\n'.join([
'gnu.org::%s: %s' % (testdohdr, testincfile),
'nomatch.com::%s: %s' % (testdonthdr, testexcfile),
'',
])
write_file(testcmdfile, txt)
write_file(testincfile, '%s\n' % (testdoval))
write_file(testexcfile, '%s\n' % (testdontval))
# expect to find only the header key (not its value and not the filename) and only for the appropriate url
expected = [mentionhdr % (testdohdr), mentionfile % (testcmdfile),
mentionfile % (testincfile), mentionfile % (testexcfile)]
not_expected = [testdoval, testdonthdr, testdontval]
run_and_assert(args, "case C", expected, not_expected)

# D: recursion two: header field+value is another file,
write_file(testcmdfile, '\n'.join(['gnu.org::%s' % (testinchdrfile), 'nomatch.com::%s' % (testexchdrfile), '']))
write_file(testinchdrfile, '%s: %s\n' % (testdohdr, testdoval))
write_file(testexchdrfile, '%s: %s\n' % (testdonthdr, testdontval))
# expect to find only the header key (and the literal filename) and only for the appropriate url
expected = [mentionhdr % (testdohdr), mentionfile % (testcmdfile),
mentionfile % (testinchdrfile), mentionfile % (testexchdrfile)]
not_expected = [testdoval, testdonthdr, testdontval]
run_and_assert(args, "case D", expected, not_expected)

# E: recursion three: url pattern + header field + value in another file
write_file(testcmdfile, '%s\n' % (testurlpatfile))
txt = '\n'.join([
'gnu.org::%s: %s' % (testdohdr, testdoval),
'nomatch.com::%s: %s' % (testdonthdr, testdontval),
'',
])
write_file(testurlpatfile, txt)
# expect to find only the header key (but not the literal filename) and only for the appropriate url
expected = [mentionhdr % (testdohdr), mentionfile % (testcmdfile), mentionfile % (testurlpatfile)]
not_expected = [testdoval, testdonthdr, testdontval]
run_and_assert(args, "case E", expected, not_expected)

def test_test_report_env_filter(self):
"""Test use of --test-report-env-filter."""

Expand Down