diff --git a/easybuild/tools/build_log.py b/easybuild/tools/build_log.py
index db6d6ddc67..2cf97c5f2d 100644
--- a/easybuild/tools/build_log.py
+++ b/easybuild/tools/build_log.py
@@ -215,7 +215,7 @@ def init_logging(logfile, logtostdout=False, silent=False, colorize=fancylogger.
             os.close(fd)
 
         fancylogger.logToFile(logfile, max_bytes=0)
-        print_msg('temporary log file in case of crash %s' % (logfile), log=None, silent=silent)
+        print_msg('Temporary log file in case of crash %s' % (logfile), log=None, silent=silent)
 
     log = fancylogger.getLogger(fname=False)
 
diff --git a/easybuild/tools/config.py b/easybuild/tools/config.py
index b97186f3c5..9ed3c9d719 100644
--- a/easybuild/tools/config.py
+++ b/easybuild/tools/config.py
@@ -178,6 +178,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
     'filter_env_vars',
     'hide_deps',
     'hide_toolchains',
+    'http_header_fields_urlpat',
     'force_download',
     'from_pr',
     'git_working_dirs_path',
diff --git a/easybuild/tools/filetools.py b/easybuild/tools/filetools.py
index 51e5bc9d03..08a8097e2e 100644
--- a/easybuild/tools/filetools.py
+++ b/easybuild/tools/filetools.py
@@ -569,6 +569,75 @@ def derive_alt_pypi_url(url):
     return alt_pypi_url
 
 
+def parse_http_header_fields_urlpat(arg, urlpat=None, header=None, urlpat_headers_collection=None, maxdepth=3):
+    """
+    Recurse into multi-line string "[URLPAT::][HEADER:]FILE|FIELD" where FILE may be another such string or a file
+    containing lines matching the same format, such as "^https://www.example.com::/path/to/headers.txt", and flatten
+    the result to a dict, e.g. {'^https://www.example.com': ['Authorization: Basic token', 'User-Agent: Special Agent']}
+    """
+    if urlpat_headers_collection is None:
+        # this function call is not a recursive call
+        urlpat_headers = {}
+    else:
+        # copy existing header data to avoid modifying it
+        urlpat_headers = urlpat_headers_collection.copy()
+
+    # stop infinite recursion that might happen if a file.txt refers to itself
+    if maxdepth < 0:
+        raise EasyBuildError("Failed to parse_http_header_fields_urlpat (recursion limit)")
+
+    if not isinstance(arg, str):
+        raise EasyBuildError("Failed to parse_http_header_fields_urlpat (argument not a string)")
+
+    # HTTP header fields are separated by CRLF, but splitting on LF is more convenient
+    for argline in arg.split('\n'):
+        argline = argline.strip()  # remove optional whitespace (e.g. remaining CR)
+        if argline == '' or argline.startswith('#'):
+            continue  # permit comment lines: ignore them
+
+        if os.path.isfile(os.path.join(os.getcwd(), argline)):
+            # expand existing relative path to absolute
+            argline = os.path.join(os.getcwd(), argline)
+        if os.path.isfile(argline):
+            # argline is a file path, so read that instead
+            _log.debug('File included in parse_http_header_fields_urlpat: %s' % argline)
+            argline = read_file(argline)
+            urlpat_headers = parse_http_header_fields_urlpat(argline, urlpat, header, urlpat_headers, maxdepth - 1)
+            continue
+
+        # URL pattern is separated by '::' from an HTTP header field
+        if '::' in argline:
+            [urlpat, argline] = argline.split('::', 1)  # get the urlpat
+            # the remainder may be another parseable argument, recurse with the same depth
+            urlpat_headers = parse_http_header_fields_urlpat(argline, urlpat, header, urlpat_headers, maxdepth)
+            continue
+
+        # a header field has the format HEADER: FIELD, and FIELD may be another parseable argument,
+        # unless FIELD contains extra colons: then argline is the final HEADER: FIELD to be returned
+        if argline.count(':') == 1:
+            [argheader, argline] = argline.split(':', 1)  # get the header and the remainder
+            # the remainder may be another parseable argument, recurse with the same depth;
+            # note that argheader would be forgotten in favor of the urlpat_headers returned by the recursion,
+            # so pass on the header for reconstruction in case there was nothing to recurse into
+            urlpat_headers = parse_http_header_fields_urlpat(argline, urlpat, argheader, urlpat_headers, maxdepth)
+            continue
+
+        if header is not None:
+            # parent caller didn't want to forget about the header, reconstruct it as recursion stops here
+            argline = header.strip() + ':' + argline
+
+        if urlpat is not None:
+            if urlpat in urlpat_headers:
+                urlpat_headers[urlpat].append(argline)  # add header to the list for this urlpat
+            else:
+                urlpat_headers[urlpat] = [argline]  # new list of headers for this urlpat
+        else:
+            _log.warning("Non-empty argument to http-header-fields-urlpat ignored (missing URL pattern)")
+
+    # return a dict full of {urlpat: [list, of, headers]}
+    return urlpat_headers
+
+
 def download_file(filename, url, path, forced=False):
     """Download a file from the given URL, to the specified path."""
 
@@ -581,6 +650,15 @@ def download_file(filename, url, path, forced=False):
         timeout = 10
     _log.debug("Using timeout of %s seconds for initiating download" % timeout)
 
+    # parse the option that specifies extra HTTP header fields for URLs matching a pattern
+    http_header_fields_urlpat = build_option('http_header_fields_urlpat')
+    # compile a dict full of {urlpat: [header, list]}
+    urlpat_headers = {}
+    if http_header_fields_urlpat is not None:
+        # there may be multiple options given, parse them all while updating urlpat_headers
+        for arg in http_header_fields_urlpat:
+            urlpat_headers.update(parse_http_header_fields_urlpat(arg))
+
     # make sure directory exists
     basedir = os.path.dirname(path)
     mkdir(basedir, parents=True)
@@ -592,6 +670,17 @@ def download_file(filename, url, path, forced=False):
 
     # use custom HTTP header
     headers = {'User-Agent': 'EasyBuild', 'Accept': '*/*'}
+
+    # permit additional or overriding headers via the http_header_fields_urlpat option;
+    # only append/override HTTP header fields whose URL pattern matches the current url
+    if urlpat_headers:
+        for urlpatkey, http_header_fields in urlpat_headers.items():
+            if re.search(urlpatkey, url):
+                extraheaders = dict(hf.split(':', 1) for hf in http_header_fields)
+                for key, val in extraheaders.items():
+                    headers[key] = val
+                    _log.debug("Custom HTTP header field set: %s (value omitted from log)", key)
+
     # for backward compatibility, and to avoid relying on 3rd party Python library 'requests'
     url_req = std_urllib.Request(url, headers=headers)
     used_urllib = std_urllib
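For reference, a minimal sketch of what the new parser returns (not part of the patch; the URL pattern and credentials are made-up values). Whitespace around the remainder is stripped at each recursion step, so flattened entries come back as "HEADER:FIELD" without a space after the colon:

```python
from easybuild.tools.filetools import parse_http_header_fields_urlpat

# two header fields for the same (made-up) URL pattern, plus a comment line
arg = '\n'.join([
    '# comment lines and empty lines are skipped',
    '^https://www.example.com::Authorization: Basic dXNlcjpwYXNz',
    '^https://www.example.com::User-Agent: Special Agent',
])
print(parse_http_header_fields_urlpat(arg))
# {'^https://www.example.com': ['Authorization:Basic dXNlcjpwYXNz', 'User-Agent:Special Agent']}
```

In download_file() above, each entry is split again on the first ':' and merged into the default headers for any URL matching the pattern (here overriding the default 'User-Agent: EasyBuild'), while only the field names are written to the debug log.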
diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py
index a363cde325..ebffd26eae 100644
--- a/easybuild/tools/options.py
+++ b/easybuild/tools/options.py
@@ -399,6 +399,11 @@ def override_options(self):
                           "(e.g. --hide-deps=zlib,ncurses)", 'strlist', 'extend', None),
             'hide-toolchains': ("Comma separated list of toolchains that you want automatically hidden, "
                                 "(e.g. --hide-toolchains=GCCcore)", 'strlist', 'extend', None),
+            'http-header-fields-urlpat': ("Set extra HTTP header FIELDs when downloading files from URL PATterns. "
+                                          "To avoid logging sensitive values, specify a file containing newline-separated "
+                                          "FIELDs, e.g. \"^https://www.example.com::/path/to/headers.txt\", or pass FIELDs "
+                                          "directly, e.g. \"client[A-Za-z0-9]*.example.com::Authorization: Basic token\".",
+                                          None, 'append', None, {'metavar': '[URLPAT::][HEADER:]FILE|FIELD'}),
             'ignore-checksums': ("Ignore failing checksum verification", None, 'store_true', False),
             'ignore-osdeps': ("Ignore any listed OS dependencies", None, 'store_true', False),
             'install-latest-eb-release': ("Install latest known version of easybuild", None, 'store_true', False),
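A usage sketch for the new option (the file path and patterns are made up; the gzip easyconfig and --stop=fetch mirror the tests below). The option may be given multiple times and the parsed results are merged; keeping the secret in a file means only header names end up in the log:

```
$ cat /path/to/secret-headers.txt
Authorization: Basic dXNlcjpwYXNz

$ eb gzip-1.6-GCC-4.9.2.eb --stop=fetch \
    --http-header-fields-urlpat='gnu.org::User-Agent: Special Agent' \
    --http-header-fields-urlpat='^https://www.example.com::/path/to/secret-headers.txt'
```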
diff --git a/test/framework/build_log.py b/test/framework/build_log.py
index 32aefc7b74..4fccf30e5c 100644
--- a/test/framework/build_log.py
+++ b/test/framework/build_log.py
@@ -419,7 +419,7 @@ def test_init_logging(self):
         self.assertTrue(os.path.exists(logfile))
         self.assertEqual(os.path.dirname(logfile), tmpdir)
         self.assertTrue(isinstance(log, EasyBuildLog))
-        self.assertTrue(stdout.startswith("== temporary log file in case of crash"))
+        self.assertTrue(stdout.startswith("== Temporary log file in case of crash"))
 
         stop_logging(logfile)
 
diff --git a/test/framework/options.py b/test/framework/options.py
index cbdd57cc71..ecd7536b3d 100644
--- a/test/framework/options.py
+++ b/test/framework/options.py
@@ -52,7 +52,8 @@
 from easybuild.tools.config import find_last_log, get_build_log_path, get_module_syntax, module_classes
 from easybuild.tools.environment import modify_env
 from easybuild.tools.filetools import change_dir, copy_dir, copy_file, download_file, is_patch_file, mkdir
-from easybuild.tools.filetools import read_file, remove_dir, remove_file, which, write_file
+from easybuild.tools.filetools import parse_http_header_fields_urlpat, read_file, remove_dir, remove_file
+from easybuild.tools.filetools import which, write_file
 from easybuild.tools.github import GITHUB_RAW, GITHUB_EB_MAIN, GITHUB_EASYCONFIGS_REPO
 from easybuild.tools.github import URL_SEPARATOR, fetch_github_token
 from easybuild.tools.modules import Lmod
@@ -2563,6 +2564,181 @@ def test_hide_toolchains(self):
         self.assertTrue(re.search(r'module: GCC/\.4\.9\.2', outtxt))
         self.assertTrue(re.search(r'module: gzip/1\.6-GCC-4\.9\.2', outtxt))
 
+    def test_parse_http_header_fields_urlpat(self):
+        """Test function parse_http_header_fields_urlpat."""
+        urlex = "example.com"
+        urlgnu = "gnu.org"
+        hdrauth = "Authorization"
+        valauth = "Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ=="
+        hdragent = "User-Agent"
+        valagent = "James/0.0.7 (MI6)"
+        hdrrefer = "Referer"
+        valrefer = "http://www.example.com/"
+        filesub1 = os.path.join(self.test_prefix, "testhttpheaders1.txt")
+        filesub2 = os.path.join(self.test_prefix, "testhttpheaders2.txt")
+        filesub3 = os.path.join(self.test_prefix, "testhttpheaders3.txt")
+        filesub4 = os.path.join(self.test_prefix, "testhttpheaders4.txt")
+        fileauth = os.path.join(self.test_prefix, "testhttpheadersauth.txt")
+        write_file(filesub4, filesub3)
+        write_file(filesub3, filesub2)
+        write_file(filesub2, filesub1)
+        write_file(filesub1, "%s::%s:%s\n" % (urlgnu, hdrauth, valauth))
+        write_file(filesub2, "%s::%s\n" % (urlex, filesub1))
+        write_file(filesub3, "%s::%s:%s\n" % (urlex, hdragent, filesub2))
+        write_file(fileauth, "%s\n" % (valauth))
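+
+        # the writes above chain the fixture files: filesub4 -> filesub3 -> filesub2 -> filesub1,
+        # with filesub1 holding the actual gnu.org header field; every file read costs one level
+        # of the default maxdepth=3, so parsing from filesub3 (three reads, case D below) succeeds,
+        # while parsing from filesub4 (four reads, case H below) trips the recursion limit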
"testhttpheaders3.txt") + filesub4 = os.path.join(self.test_prefix, "testhttpheaders4.txt") + fileauth = os.path.join(self.test_prefix, "testhttpheadersauth.txt") + write_file(filesub4, filesub3) + write_file(filesub3, filesub2) + write_file(filesub2, filesub1) + write_file(filesub1, "%s::%s:%s\n" % (urlgnu, hdrauth, valauth)) + write_file(filesub2, "%s::%s\n" % (urlex, filesub1)) + write_file(filesub3, "%s::%s:%s\n" % (urlex, hdragent, filesub2)) + write_file(fileauth, "%s\n" % (valauth)) + + # Case A: basic pattern + args = "%s::%s:%s" % (urlgnu, hdragent, valagent) + urlpat_headers = parse_http_header_fields_urlpat(args) + self.assertEqual({urlgnu: ["%s:%s" % (hdragent, valagent)]}, urlpat_headers) + + # Case B: urlpat has another urlpat: retain deepest level + args = "%s::%s::%s::%s:%s" % (urlgnu, urlgnu, urlex, hdragent, valagent) + urlpat_headers = parse_http_header_fields_urlpat(args) + self.assertEqual({urlex: ["%s:%s" % (hdragent, valagent)]}, urlpat_headers) + + # Case C: header value has a colon + args = "%s::%s:%s" % (urlex, hdrrefer, valrefer) + urlpat_headers = parse_http_header_fields_urlpat(args) + self.assertEqual({urlex: ["%s:%s" % (hdrrefer, valrefer)]}, urlpat_headers) + + # Case D: recurse into files + args = filesub3 + urlpat_headers = parse_http_header_fields_urlpat(args) + self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers) + + # Case E: recurse into files as header + args = "%s::%s" % (urlex, filesub3) + urlpat_headers = parse_http_header_fields_urlpat(args) + self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers) + + # Case F: recurse into files as value (header is replaced) + args = "%s::%s:%s" % (urlex, hdrrefer, filesub3) + urlpat_headers = parse_http_header_fields_urlpat(args) + self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers) + + # Case G: recurse into files as value (header is retained) + args = "%s::%s:%s" % (urlgnu, hdrauth, fileauth) + urlpat_headers = parse_http_header_fields_urlpat(args) + self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers) + + # Case H: recurse into files but hit limit + args = filesub4 + error_regex = r"Failed to parse_http_header_fields_urlpat \(recursion limit\)" + self.assertErrorRegex(EasyBuildError, error_regex, parse_http_header_fields_urlpat, args) + + # Case I: argument is not a string + args = list("foobar") + error_regex = r"Failed to parse_http_header_fields_urlpat \(argument not a string\)" + self.assertErrorRegex(EasyBuildError, error_regex, parse_http_header_fields_urlpat, args) + + def test_http_header_fields_urlpat(self): + """Test use of --http-header-fields-urlpat.""" + test_ecs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'easyconfigs', 'test_ecs') + ec_file = os.path.join(test_ecs_dir, 'g', 'gzip', 'gzip-1.6-GCC-4.9.2.eb') + common_args = [ + ec_file, + '--stop=fetch', + '--debug', + '--force', + '--force-download', + '--logtostdout', + ] + + # define header fields:values that should (not) show up in the logs, either + # because they are secret or because they are not matched for the url + testdohdr = 'HeaderAPPLIED' + testdoval = 'SECRETvalue' + testdonthdr = 'HeaderIGNORED' + testdontval = 'BOGUSvalue' + + # header fields (or its values) could be files to be read instead of literals + testcmdfile = os.path.join(self.test_prefix, 'testhttpheaderscmdline.txt') + testincfile = os.path.join(self.test_prefix, 'testhttpheadersvalinc.txt') + testexcfile = os.path.join(self.test_prefix, 
+    def test_http_header_fields_urlpat(self):
+        """Test use of --http-header-fields-urlpat."""
+        test_ecs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'easyconfigs', 'test_ecs')
+        ec_file = os.path.join(test_ecs_dir, 'g', 'gzip', 'gzip-1.6-GCC-4.9.2.eb')
+        common_args = [
+            ec_file,
+            '--stop=fetch',
+            '--debug',
+            '--force',
+            '--force-download',
+            '--logtostdout',
+        ]
+
+        # define header fields/values that should (not) show up in the logs, either
+        # because they are secret or because they are not matched for the url
+        testdohdr = 'HeaderAPPLIED'
+        testdoval = 'SECRETvalue'
+        testdonthdr = 'HeaderIGNORED'
+        testdontval = 'BOGUSvalue'
+
+        # header fields (or their values) can be files to be read instead of literals
+        testcmdfile = os.path.join(self.test_prefix, 'testhttpheaderscmdline.txt')
+        testincfile = os.path.join(self.test_prefix, 'testhttpheadersvalinc.txt')
+        testexcfile = os.path.join(self.test_prefix, 'testhttpheadersvalexc.txt')
+        testinchdrfile = os.path.join(self.test_prefix, 'testhttpheadershdrinc.txt')
+        testexchdrfile = os.path.join(self.test_prefix, 'testhttpheadershdrexc.txt')
+        testurlpatfile = os.path.join(self.test_prefix, 'testhttpheadersurlpat.txt')
+
+        # log mentions that appear upon header or file inclusion
+        mentionhdr = 'Custom HTTP header field set: %s'
+        mentionfile = 'File included in parse_http_header_fields_urlpat: %s'
+
+        def run_and_assert(args, msg, words_expected=None, words_unexpected=None):
+            stdout, stderr = self._run_mock_eb(args, do_build=True, raise_error=True, testing=False)
+            if words_expected is not None:
+                for thestring in words_expected:
+                    self.assertTrue(re.compile(thestring).search(stdout),
+                                    "Pattern '%s' missing from log (%s)" % (thestring, msg))
+            if words_unexpected is not None:
+                for thestring in words_unexpected:
+                    self.assertFalse(re.compile(thestring).search(stdout),
+                                     "Pattern '%s' leaked into log (%s)" % (thestring, msg))
+
+        # A: simple direct case (everything is logged because it is passed directly via configuration options)
+        args = list(common_args)
+        args.extend([
+            '--http-header-fields-urlpat=gnu.org::%s:%s' % (testdohdr, testdoval),
+            '--http-header-fields-urlpat=nomatch.com::%s:%s' % (testdonthdr, testdontval),
+        ])
+        # expect to find everything passed on the command line
+        expected = [mentionhdr % (testdohdr), testdoval, testdonthdr, testdontval]
+        run_and_assert(args, "case A", expected)
+
+        # all subsequent tests share this argument list
+        args = list(common_args)
+        args.append('--http-header-fields-urlpat=%s' % (testcmdfile))
+
+        # B: simple file case (secrets in the file are not logged)
+        txt = '\n'.join([
+            'gnu.org::%s: %s' % (testdohdr, testdoval),
+            'nomatch.com::%s: %s' % (testdonthdr, testdontval),
+            '',
+        ])
+        write_file(testcmdfile, txt)
+        # expect to find only the header key (not its value) and only for the appropriate url
+        expected = [mentionhdr % testdohdr, mentionfile % testcmdfile]
+        not_expected = [testdoval, testdonthdr, testdontval]
+        run_and_assert(args, "case B", expected, not_expected)
+
+        # C: first level of recursion: the header value is another file
+        txt = '\n'.join([
+            'gnu.org::%s: %s' % (testdohdr, testincfile),
+            'nomatch.com::%s: %s' % (testdonthdr, testexcfile),
+            '',
+        ])
+        write_file(testcmdfile, txt)
+        write_file(testincfile, '%s\n' % (testdoval))
+        write_file(testexcfile, '%s\n' % (testdontval))
+        # expect to find only the header key (not its value) and only for the appropriate url
+        expected = [mentionhdr % (testdohdr), mentionfile % (testcmdfile),
+                    mentionfile % (testincfile), mentionfile % (testexcfile)]
+        not_expected = [testdoval, testdonthdr, testdontval]
+        run_and_assert(args, "case C", expected, not_expected)
+
+        # D: second level of recursion: the header field+value is another file
+        write_file(testcmdfile, '\n'.join(['gnu.org::%s' % (testinchdrfile), 'nomatch.com::%s' % (testexchdrfile), '']))
+        write_file(testinchdrfile, '%s: %s\n' % (testdohdr, testdoval))
+        write_file(testexchdrfile, '%s: %s\n' % (testdonthdr, testdontval))
+        # expect to find only the header key (and the literal filename) and only for the appropriate url
+        expected = [mentionhdr % (testdohdr), mentionfile % (testcmdfile),
+                    mentionfile % (testinchdrfile), mentionfile % (testexchdrfile)]
+        not_expected = [testdoval, testdonthdr, testdontval]
+        run_and_assert(args, "case D", expected, not_expected)
+
+        # E: third level of recursion: url pattern + header field + value are all in another file
+        write_file(testcmdfile, '%s\n' % (testurlpatfile))
+        txt = '\n'.join([
+            'gnu.org::%s: %s' % (testdohdr, testdoval),
+            'nomatch.com::%s: %s' % (testdonthdr, testdontval),
+            '',
+        ])
+        write_file(testurlpatfile, txt)
+        # expect to find only the header key (but not the literal filename) and only for the appropriate url
+        expected = [mentionhdr % (testdohdr), mentionfile % (testcmdfile), mentionfile % (testurlpatfile)]
+        not_expected = [testdoval, testdonthdr, testdontval]
+        run_and_assert(args, "case E", expected, not_expected)
+
     def test_test_report_env_filter(self):
         """Test use of --test-report-env-filter."""