Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion easybuild/tools/build_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def init_logging(logfile, logtostdout=False, silent=False, colorize=fancylogger.
os.close(fd)

fancylogger.logToFile(logfile, max_bytes=0)
print_msg('temporary log file in case of crash %s' % (logfile), log=None, silent=silent)
print_msg('Temporary log file in case of crash %s' % (logfile), log=None, silent=silent)

log = fancylogger.getLogger(fname=False)

Expand Down
1 change: 1 addition & 0 deletions easybuild/tools/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
'filter_env_vars',
'hide_deps',
'hide_toolchains',
'http_header_fields_urlpat',
'force_download',
'from_pr',
'git_working_dirs_path',
Expand Down
89 changes: 89 additions & 0 deletions easybuild/tools/filetools.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,75 @@ def derive_alt_pypi_url(url):
return alt_pypi_url


def parse_http_header_fields_urlpat(arg, urlpat=None, header=None, urlpat_headers_collection=None, maxdepth=3):
    """
    Recurse into multi-line string "[URLPAT::][HEADER:]FILE|FIELD" where FILE may be another such string or file
    containing lines matching the same format, such as "^https://www.example.com::/path/to/headers.txt", and flatten
    the result to dict e.g. {'^https://www.example.com': ['Authorization: Basic token', 'User-Agent: Special Agent']}

    :param arg: multi-line string (or path to a file) in [URLPAT::][HEADER:]FILE|FIELD format
    :param urlpat: URL pattern inherited from the parent call (used while recursing)
    :param header: header name inherited from the parent call (used while recursing)
    :param urlpat_headers_collection: dict of {urlpat: [header, ...]} collected so far (not modified in place)
    :param maxdepth: remaining file-inclusion depth; guards against a file that includes itself
    :return: dict mapping each URL pattern to a list of "HEADER:FIELD" strings
    """
    if urlpat_headers_collection is None:
        # this function call is not a recursive call
        urlpat_headers = {}
    else:
        # copy existing header data to avoid modifying it
        urlpat_headers = urlpat_headers_collection.copy()

    # stop infinite recursion that might happen if a file.txt refers to itself
    if maxdepth < 0:
        raise EasyBuildError("Failed to parse_http_header_fields_urlpat (recursion limit)")

    if not isinstance(arg, str):
        raise EasyBuildError("Failed to parse_http_header_fields_urlpat (argument not a string)")

    # HTTP header fields are separated by CRLF but splitting on LF is more convenient
    for argline in arg.split('\n'):
        argline = argline.strip()  # remove optional whitespace (e.g. remaining CR)
        if argline == '' or argline.startswith('#'):
            continue  # permit comment lines: ignore them

        if os.path.isfile(os.path.join(os.getcwd(), argline)):
            # expand existing relative path to absolute
            argline = os.path.join(os.getcwd(), argline)
        if os.path.isfile(argline):
            # argline is a file path, so read that instead; decrement depth to bound file inclusion
            _log.debug('File included in parse_http_header_fields_urlpat: %s' % argline)
            argline = read_file(argline)
            urlpat_headers = parse_http_header_fields_urlpat(argline, urlpat, header, urlpat_headers, maxdepth - 1)
            continue

        # URL pattern is separated by '::' from a HTTP header field
        if '::' in argline:
            urlpat, argline = argline.split('::', 1)  # get the urlpat
            # the remainder may be another parseable argument, recurse with same depth
            urlpat_headers = parse_http_header_fields_urlpat(argline, urlpat, header, urlpat_headers, maxdepth)
            continue

        # Header field has format HEADER: FIELD, and FIELD may be another parseable argument
        # except if FIELD contains colons, then argline is the final HEADER: FIELD to be returned
        if argline.count(':') == 1:
            argheader, argline = argline.split(':', 1)  # get the header and the remainder
            # the remainder may be another parseable argument, recurse with same depth
            # note that argheader would be forgotten in favor of the urlpat_headers returned by recursion,
            # so pass on the header for reconstruction just in case there was nothing to recurse in
            urlpat_headers = parse_http_header_fields_urlpat(argline, urlpat, argheader, urlpat_headers, maxdepth)
            continue

        if header is not None:
            # parent caller didn't want to forget about the header, reconstruct as recursion stops here.
            argline = header.strip() + ':' + argline

        if urlpat is not None:
            # append this header to the urlpat's list, creating the list on first use
            urlpat_headers.setdefault(urlpat, []).append(argline)
        else:
            _log.warning("Non-empty argument to http-header-fields-urlpat ignored (missing URL pattern)")

    # return a dict full of {urlpat: [list, of, headers]}
    return urlpat_headers


def download_file(filename, url, path, forced=False):
"""Download a file from the given URL, to the specified path."""

Expand All @@ -581,6 +650,15 @@ def download_file(filename, url, path, forced=False):
timeout = 10
_log.debug("Using timeout of %s seconds for initiating download" % timeout)

# parse option HTTP header fields for URLs containing a pattern
http_header_fields_urlpat = build_option('http_header_fields_urlpat')
# compile a dict full of {urlpat: [header, list]}
urlpat_headers = dict()
if http_header_fields_urlpat is not None:
# there may be multiple options given, parse them all, while updating urlpat_headers
for arg in http_header_fields_urlpat:
urlpat_headers.update(parse_http_header_fields_urlpat(arg))

# make sure directory exists
basedir = os.path.dirname(path)
mkdir(basedir, parents=True)
Expand All @@ -592,6 +670,17 @@ def download_file(filename, url, path, forced=False):

# use custom HTTP header
headers = {'User-Agent': 'EasyBuild', 'Accept': '*/*'}

# permit additional or override headers via http_headers_fields_urlpat option
# only append/override HTTP header fields that match current url
if urlpat_headers is not None:
for urlpatkey, http_header_fields in urlpat_headers.items():
if re.search(urlpatkey, url):
extraheaders = dict(hf.split(':', 1) for hf in http_header_fields)
for key, val in extraheaders.items():
headers[key] = val
_log.debug("Custom HTTP header field set: %s (value omitted from log)", key)

# for backward compatibility, and to avoid relying on 3rd party Python library 'requests'
url_req = std_urllib.Request(url, headers=headers)
used_urllib = std_urllib
Expand Down
5 changes: 5 additions & 0 deletions easybuild/tools/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,11 @@ def override_options(self):
"(e.g. --hide-deps=zlib,ncurses)", 'strlist', 'extend', None),
'hide-toolchains': ("Comma separated list of toolchains that you want automatically hidden, "
"(e.g. --hide-toolchains=GCCcore)", 'strlist', 'extend', None),
'http-header-fields-urlpat': ("Set extra HTTP header FIELDs when downloading files from URL PATterns. "
"To not log sensitive values, specify a file containing newline separated "
"FIELDs. e.g. \"^https://www.example.com::/path/to/headers.txt\" or "
"\"client[A-z0-9]*.example.com': ['Authorization: Basic token']\".",
None, 'append', None, {'metavar': '[URLPAT::][HEADER:]FILE|FIELD'}),
'ignore-checksums': ("Ignore failing checksum verification", None, 'store_true', False),
'ignore-osdeps': ("Ignore any listed OS dependencies", None, 'store_true', False),
'install-latest-eb-release': ("Install latest known version of easybuild", None, 'store_true', False),
Expand Down
2 changes: 1 addition & 1 deletion test/framework/build_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ def test_init_logging(self):
self.assertTrue(os.path.exists(logfile))
self.assertEqual(os.path.dirname(logfile), tmpdir)
self.assertTrue(isinstance(log, EasyBuildLog))
self.assertTrue(stdout.startswith("== temporary log file in case of crash"))
self.assertTrue(stdout.startswith("== Temporary log file in case of crash"))

stop_logging(logfile)

Expand Down
178 changes: 177 additions & 1 deletion test/framework/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@
from easybuild.tools.config import find_last_log, get_build_log_path, get_module_syntax, module_classes
from easybuild.tools.environment import modify_env
from easybuild.tools.filetools import change_dir, copy_dir, copy_file, download_file, is_patch_file, mkdir
from easybuild.tools.filetools import read_file, remove_dir, remove_file, which, write_file
from easybuild.tools.filetools import parse_http_header_fields_urlpat, read_file, remove_dir, remove_file
from easybuild.tools.filetools import which, write_file
from easybuild.tools.github import GITHUB_RAW, GITHUB_EB_MAIN, GITHUB_EASYCONFIGS_REPO
from easybuild.tools.github import URL_SEPARATOR, fetch_github_token
from easybuild.tools.modules import Lmod
Expand Down Expand Up @@ -2563,6 +2564,181 @@ def test_hide_toolchains(self):
self.assertTrue(re.search(r'module: GCC/\.4\.9\.2', outtxt))
self.assertTrue(re.search(r'module: gzip/1\.6-GCC-4\.9\.2', outtxt))

def test_parse_http_header_fields_urlpat(self):
"""Test function parse_http_header_fields_urlpat"""
urlex = "example.com"
urlgnu = "gnu.org"
hdrauth = "Authorization"
valauth = "Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ=="
hdragent = "User-Agent"
valagent = "James/0.0.7 (MI6)"
hdrrefer = "Referer"
valrefer = "http://www.example.com/"
filesub1 = os.path.join(self.test_prefix, "testhttpheaders1.txt")
filesub2 = os.path.join(self.test_prefix, "testhttpheaders2.txt")
filesub3 = os.path.join(self.test_prefix, "testhttpheaders3.txt")
filesub4 = os.path.join(self.test_prefix, "testhttpheaders4.txt")
fileauth = os.path.join(self.test_prefix, "testhttpheadersauth.txt")
write_file(filesub4, filesub3)
write_file(filesub3, filesub2)
write_file(filesub2, filesub1)
write_file(filesub1, "%s::%s:%s\n" % (urlgnu, hdrauth, valauth))
write_file(filesub2, "%s::%s\n" % (urlex, filesub1))
write_file(filesub3, "%s::%s:%s\n" % (urlex, hdragent, filesub2))
write_file(fileauth, "%s\n" % (valauth))

# Case A: basic pattern
args = "%s::%s:%s" % (urlgnu, hdragent, valagent)
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlgnu: ["%s:%s" % (hdragent, valagent)]}, urlpat_headers)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's better to hardcode stuff here in the tests, since it makes things a whole lot more explicit:

args = "gnu.org::User-Agent:James/0.0.7 (MI6)"
urlpat_headers = parse_http_header_fields_urlpat(args)
expected = {'gnu.org': ['User-Agent:James/0.0.7 (MI6)']}
self.assertEqual(expected, urlpat_headers)


# Case B: urlpat has another urlpat: retain deepest level
args = "%s::%s::%s::%s:%s" % (urlgnu, urlgnu, urlex, hdragent, valagent)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wondering: what's the use case for nesting of URL patterns?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well it is not really useful on the same line, but it to facilitate the case where another URL pattern is specified in a file that is read. If recursive calls to several files occur, it should pick up the deepest level url pattern for the header field.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, makes sense.

urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlex: ["%s:%s" % (hdragent, valagent)]}, urlpat_headers)

# Case C: header value has a colon
args = "%s::%s:%s" % (urlex, hdrrefer, valrefer)
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlex: ["%s:%s" % (hdrrefer, valrefer)]}, urlpat_headers)

# Case D: recurse into files
args = filesub3
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers)

# Case E: recurse into files as header
args = "%s::%s" % (urlex, filesub3)
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers)

# Case F: recurse into files as value (header is replaced)
args = "%s::%s:%s" % (urlex, hdrrefer, filesub3)
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers)

# Case G: recurse into files as value (header is retained)
args = "%s::%s:%s" % (urlgnu, hdrauth, fileauth)
urlpat_headers = parse_http_header_fields_urlpat(args)
self.assertEqual({urlgnu: ["%s:%s" % (hdrauth, valauth)]}, urlpat_headers)

# Case H: recurse into files but hit limit
args = filesub4
error_regex = r"Failed to parse_http_header_fields_urlpat \(recursion limit\)"
self.assertErrorRegex(EasyBuildError, error_regex, parse_http_header_fields_urlpat, args)

# Case I: argument is not a string
args = list("foobar")
error_regex = r"Failed to parse_http_header_fields_urlpat \(argument not a string\)"
self.assertErrorRegex(EasyBuildError, error_regex, parse_http_header_fields_urlpat, args)

def test_http_header_fields_urlpat(self):
"""Test use of --http-header-fields-urlpat."""
test_ecs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'easyconfigs', 'test_ecs')
ec_file = os.path.join(test_ecs_dir, 'g', 'gzip', 'gzip-1.6-GCC-4.9.2.eb')
common_args = [
ec_file,
'--stop=fetch',
'--debug',
'--force',
'--force-download',
'--logtostdout',
]

# define header fields:values that should (not) show up in the logs, either
# because they are secret or because they are not matched for the url
testdohdr = 'HeaderAPPLIED'
testdoval = 'SECRETvalue'
testdonthdr = 'HeaderIGNORED'
testdontval = 'BOGUSvalue'

# header fields (or its values) could be files to be read instead of literals
testcmdfile = os.path.join(self.test_prefix, 'testhttpheaderscmdline.txt')
testincfile = os.path.join(self.test_prefix, 'testhttpheadersvalinc.txt')
testexcfile = os.path.join(self.test_prefix, 'testhttpheadersvalexc.txt')
testinchdrfile = os.path.join(self.test_prefix, 'testhttpheadershdrinc.txt')
testexchdrfile = os.path.join(self.test_prefix, 'testhttpheadershdrexc.txt')
testurlpatfile = os.path.join(self.test_prefix, 'testhttpheadersurlpat.txt')

# log mention format upon header or file inclusion
mentionhdr = 'Custom HTTP header field set: %s'
mentionfile = 'File included in parse_http_header_fields_urlpat: %s'

def run_and_assert(args, msg, words_expected=None, words_unexpected=None):
stdout, stderr = self._run_mock_eb(args, do_build=True, raise_error=True, testing=False)
if words_expected is not None:
for thestring in words_expected:
self.assertTrue(re.compile(thestring).search(stdout), "Pattern '%s' missing from log (%s)" %
(thestring, msg))
if words_unexpected is not None:
for thestring in words_unexpected:
self.assertFalse(re.compile(thestring).search(stdout), "Pattern '%s' leaked into log (%s)" %
(thestring, msg))

# A: simple direct case (all is logged because passed directly via EasyBuild configuration options)
args = list(common_args)
args.extend([
'--http-header-fields-urlpat=gnu.org::%s:%s' % (testdohdr, testdoval),
'--http-header-fields-urlpat=nomatch.com::%s:%s' % (testdonthdr, testdontval),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should also check with single --http-header-fields-urlpat that has both entries, comma-separated?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried that, but decided not to use the strlist style option, because the comma could be used as a field value. The separation character is \n, but that is awkward to use on the command line. The user can still use the option twice or more times if needed, or just use a file to specify an array of fields... So there is no support for --http-header-fields-urlpat=a,b,c at the moment. Is it ok?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a good technical reason not to support it, so fine by me :)

])
# expect to find everything passed on cmdline
expected = [mentionhdr % (testdohdr), testdoval, testdonthdr, testdontval]
run_and_assert(args, "case A", expected)

# all subsequent tests share this argument list
args = common_args
args.append('--http-header-fields-urlpat=%s' % (testcmdfile))

# B: simple file case (secrets in file are not logged)
txt = '\n'.join([
'gnu.org::%s: %s' % (testdohdr, testdoval),
'nomatch.com::%s: %s' % (testdonthdr, testdontval),
'',
])
write_file(testcmdfile, txt)
# expect to find only the header key (not its value) and only for the appropriate url
expected = [mentionhdr % testdohdr, mentionfile % testcmdfile]
not_expected = [testdoval, testdonthdr, testdontval]
run_and_assert(args, "case B", expected, not_expected)

# C: recursion one: header value is another file
txt = '\n'.join([
'gnu.org::%s: %s' % (testdohdr, testincfile),
'nomatch.com::%s: %s' % (testdonthdr, testexcfile),
'',
])
write_file(testcmdfile, txt)
write_file(testincfile, '%s\n' % (testdoval))
write_file(testexcfile, '%s\n' % (testdontval))
# expect to find only the header key (not its value and not the filename) and only for the appropriate url
expected = [mentionhdr % (testdohdr), mentionfile % (testcmdfile),
mentionfile % (testincfile), mentionfile % (testexcfile)]
not_expected = [testdoval, testdonthdr, testdontval]
run_and_assert(args, "case C", expected, not_expected)

# D: recursion two: header field+value is another file,
write_file(testcmdfile, '\n'.join(['gnu.org::%s' % (testinchdrfile), 'nomatch.com::%s' % (testexchdrfile), '']))
write_file(testinchdrfile, '%s: %s\n' % (testdohdr, testdoval))
write_file(testexchdrfile, '%s: %s\n' % (testdonthdr, testdontval))
# expect to find only the header key (and the literal filename) and only for the appropriate url
expected = [mentionhdr % (testdohdr), mentionfile % (testcmdfile),
mentionfile % (testinchdrfile), mentionfile % (testexchdrfile)]
not_expected = [testdoval, testdonthdr, testdontval]
run_and_assert(args, "case D", expected, not_expected)

# E: recursion three: url pattern + header field + value in another file
write_file(testcmdfile, '%s\n' % (testurlpatfile))
txt = '\n'.join([
'gnu.org::%s: %s' % (testdohdr, testdoval),
'nomatch.com::%s: %s' % (testdonthdr, testdontval),
'',
])
write_file(testurlpatfile, txt)
# expect to find only the header key (but not the literal filename) and only for the appropriate url
expected = [mentionhdr % (testdohdr), mentionfile % (testcmdfile), mentionfile % (testurlpatfile)]
not_expected = [testdoval, testdonthdr, testdontval]
run_and_assert(args, "case E", expected, not_expected)

def test_test_report_env_filter(self):
"""Test use of --test-report-env-filter."""

Expand Down