diff --git a/WikiExtractor.py b/WikiExtractor.py
index 730b3bab..791a71e3 100755
--- a/WikiExtractor.py
+++ b/WikiExtractor.py
@@ -2865,10 +2865,36 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     :param process_count: number of extraction processes to spawn.
     """
 
+    def hook_compressed_encoded(encoding):
+        """Build a fileinput openhook that yields text decoded with `encoding`.
+
+        Like fileinput.hook_compressed, the opener is picked from the file
+        extension ('.gz' -> gzip, '.bz2' -> bz2, otherwise builtin open).
+        """
+        def hook(filename, mode):
+            ext = os.path.splitext(filename)[1]
+            if ext == '.gz':
+                import gzip
+                opener = gzip.open
+            elif ext == '.bz2':
+                import bz2
+                opener = bz2.open
+            else:
+                opener = open
+            if 'b' in mode:
+                # Binary mode was requested explicitly: no decoding applies.
+                return opener(filename, mode)
+            if opener is not open and 't' not in mode:
+                # gzip/bz2 treat a bare 'r' as binary and reject `encoding`
+                # there; text mode must be requested explicitly.
+                mode += 't'
+            return opener(filename, mode, encoding=encoding)
+        return hook
+
     if input_file == '-':
         input = sys.stdin
     else:
-        input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+        input = fileinput.FileInput(input_file, openhook=hook_compressed_encoded('utf-8'))
 
     # collect siteinfo
     for line in input: