Skip to content

Commit 3bfb95e

Browse files
authored
Add extractor for bzip2-compressed files (#4421)
1 parent daedfce commit 3bfb95e

File tree

2 files changed

+25
-4
lines changed

2 files changed

+25
-4
lines changed

src/datasets/utils/extract.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import bz2
12
import gzip
23
import lzma
34
import os
@@ -157,9 +158,29 @@ def extract(input_path: str, output_path: str):
157158
dctx.copy_stream(ifh, ofh)
158159

159160

161+
class Bzip2Extractor:
    """Detect and decompress bzip2-compressed files."""

    @staticmethod
    def is_extractable(path: str) -> bool:
        """Return whether the file at ``path`` starts with the bytes ``b"BZh"``.

        Args:
            path: Path of the file to inspect.

        Returns:
            ``True`` if the first three bytes of the file are ``b"BZh"``,
            ``False`` otherwise (including when the file is shorter than three
            bytes or reading the header raises ``OSError``).

        Raises:
            OSError: If the file cannot be opened; only errors raised while
                reading the header are converted to ``False``.
        """
        with open(path, "rb") as f:
            try:
                header_magic_bytes = f.read(3)
            except OSError:
                return False
        # Compare directly instead of branching to return True/False.
        return header_magic_bytes == b"BZh"

    @staticmethod
    def extract(input_path: str, output_path: str) -> None:
        """Decompress ``input_path`` and write the raw bytes to ``output_path``.

        Args:
            input_path: Path of the bzip2-compressed file to read.
            output_path: Path of the file to create (or overwrite) with the
                decompressed content.
        """
        # Stream from the decompressor to the output file rather than reading
        # the whole payload into memory.
        with bz2.open(input_path, "rb") as compressed_file, open(output_path, "wb") as extracted_file:
            shutil.copyfileobj(compressed_file, extracted_file)
179+
180+
160181
class Extractor:
161182
# Keep the zip extractor after the tar and gzip extractors, because other files may be wrongly detected as zip (I guess this means: tar or gzip files)
162-
extractors = [TarExtractor, GzipExtractor, ZipExtractor, XzExtractor, RarExtractor, ZstdExtractor]
183+
extractors = [TarExtractor, GzipExtractor, ZipExtractor, XzExtractor, RarExtractor, ZstdExtractor, Bzip2Extractor]
163184

164185
@classmethod
165186
def is_extractable(cls, path, return_extractor=False):

tests/test_extract.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ def test_zstd_extractor(zstd_file, tmp_path, text_file):
1919

2020

2121
@require_zstandard
22-
@pytest.mark.parametrize("compression_format", ["gzip", "xz", "zstd"])
23-
def test_extractor(compression_format, gz_file, xz_file, zstd_file, tmp_path, text_file):
24-
input_paths = {"gzip": gz_file, "xz": xz_file, "zstd": zstd_file}
22+
@pytest.mark.parametrize("compression_format", ["gzip", "xz", "zstd", "bz2"])
23+
def test_extractor(compression_format, gz_file, xz_file, zstd_file, bz2_file, tmp_path, text_file):
24+
input_paths = {"gzip": gz_file, "xz": xz_file, "zstd": zstd_file, "bz2": bz2_file}
2525
input_path = str(input_paths[compression_format])
2626
output_path = str(tmp_path / "extracted.txt")
2727
assert Extractor.is_extractable(input_path)

0 commit comments

Comments
 (0)