44import os
55import tarfile
66import textwrap
7+ import zipfile
78
89import pyarrow as pa
910import pyarrow .parquet as pq
@@ -95,12 +96,14 @@ def text_file(tmp_path_factory):
9596
9697
@pytest.fixture(scope="session")
def bz2_file(tmp_path_factory):
    """Write FILE_CONTENT (UTF-8 encoded) to a bz2-compressed file.

    The file is named "file.txt.bz2" and lives in a directory obtained from
    ``tmp_path_factory.mktemp("data")``.

    Returns:
        The path of the compressed file.
    """
    import bz2

    payload = FILE_CONTENT.encode("utf-8")
    target = tmp_path_factory.mktemp("data") / "file.txt.bz2"
    with bz2.open(target, "wb") as compressed:
        compressed.write(payload)
    return target
104107
105108
106109@pytest .fixture (scope = "session" )
@@ -114,29 +117,6 @@ def gz_file(tmp_path_factory):
114117 return path
115118
116119
117- @pytest .fixture (scope = "session" )
118- def bz2_file (tmp_path_factory ):
119- import bz2
120-
121- path = tmp_path_factory .mktemp ("data" ) / "file.txt.bz2"
122- data = bytes (FILE_CONTENT , "utf-8" )
123- with bz2 .open (path , "wb" ) as f :
124- f .write (data )
125- return path
126-
127-
128- @pytest .fixture (scope = "session" )
129- def zstd_file (tmp_path_factory ):
130- if config .ZSTANDARD_AVAILABLE :
131- import zstandard as zstd
132-
133- path = tmp_path_factory .mktemp ("data" ) / "file.txt.zst"
134- data = bytes (FILE_CONTENT , "utf-8" )
135- with zstd .open (path , "wb" ) as f :
136- f .write (data )
137- return path
138-
139-
140120@pytest .fixture (scope = "session" )
141121def lz4_file (tmp_path_factory ):
142122 if config .LZ4_AVAILABLE :
@@ -160,6 +140,43 @@ def seven_zip_file(tmp_path_factory, text_file):
160140 return path
161141
162142
@pytest.fixture(scope="session")
def tar_file(tmp_path_factory, text_file):
    """Pack the ``text_file`` fixture into an uncompressed tar archive.

    The archive is named "file.txt.tar" and lives in a directory obtained
    from ``tmp_path_factory.mktemp("data")``. The member is stored under the
    base name of ``text_file`` so no directory components end up in the
    archive.

    Returns:
        The path of the tar archive.
    """
    archive_path = tmp_path_factory.mktemp("data") / "file.txt.tar"
    member_name = os.path.basename(text_file)
    with tarfile.TarFile(archive_path, "w") as archive:
        archive.add(text_file, arcname=member_name)
    return archive_path
149+
150+
@pytest.fixture(scope="session")
def xz_file(tmp_path_factory):
    """Write FILE_CONTENT (UTF-8 encoded) to an xz-compressed file.

    The file is named "file.txt.xz" and lives in a directory obtained from
    ``tmp_path_factory.mktemp("data")``.

    Returns:
        The path of the compressed file.
    """
    payload = FILE_CONTENT.encode("utf-8")
    target = tmp_path_factory.mktemp("data") / "file.txt.xz"
    with lzma.open(target, "wb") as compressed:
        compressed.write(payload)
    return target
158+
159+
@pytest.fixture(scope="session")
def zip_file(tmp_path_factory, text_file):
    """Pack the ``text_file`` fixture into a zip archive.

    The archive is named "file.txt.zip" and lives in a directory obtained
    from ``tmp_path_factory.mktemp("data")``. The member is stored under the
    base name of ``text_file`` so no directory components end up in the
    archive.

    Returns:
        The path of the zip archive.
    """
    archive_path = tmp_path_factory.mktemp("data") / "file.txt.zip"
    member_name = os.path.basename(text_file)
    with zipfile.ZipFile(archive_path, "w") as archive:
        archive.write(text_file, arcname=member_name)
    return archive_path
166+
167+
@pytest.fixture(scope="session")
def zstd_file(tmp_path_factory):
    """Write FILE_CONTENT (UTF-8 encoded) to a Zstandard-compressed file.

    The file is named "file.txt.zst" and lives in a directory obtained from
    ``tmp_path_factory.mktemp("data")``.

    Returns:
        The path of the compressed file, or ``None`` when
        ``config.ZSTANDARD_AVAILABLE`` is falsy (nothing is written then).
    """
    # NOTE(review): this assumes the original body was entirely nested under
    # the availability check -- confirm against the unmangled source.
    if not config.ZSTANDARD_AVAILABLE:
        return None

    import zstandard as zstd

    payload = FILE_CONTENT.encode("utf-8")
    target = tmp_path_factory.mktemp("data") / "file.txt.zst"
    with zstd.open(target, "wb") as compressed:
        compressed.write(payload)
    return target
178+
179+
163180@pytest .fixture (scope = "session" )
164181def xml_file (tmp_path_factory ):
165182 filename = tmp_path_factory .mktemp ("data" ) / "file.xml"
@@ -276,8 +293,6 @@ def bz2_csv_path(csv_path, tmp_path_factory):
276293
277294@pytest .fixture (scope = "session" )
278295def zip_csv_path (csv_path , csv2_path , tmp_path_factory ):
279- import zipfile
280-
281296 path = tmp_path_factory .mktemp ("data" ) / "dataset.csv.zip"
282297 with zipfile .ZipFile (path , "w" ) as f :
283298 f .write (csv_path , arcname = os .path .basename (csv_path ))
@@ -287,8 +302,6 @@ def zip_csv_path(csv_path, csv2_path, tmp_path_factory):
287302
288303@pytest .fixture (scope = "session" )
289304def zip_csv_with_dir_path (csv_path , csv2_path , tmp_path_factory ):
290- import zipfile
291-
292305 path = tmp_path_factory .mktemp ("data" ) / "dataset_with_dir.csv.zip"
293306 with zipfile .ZipFile (path , "w" ) as f :
294307 f .write (csv_path , arcname = os .path .join ("main_dir" , os .path .basename (csv_path )))
@@ -392,8 +405,6 @@ def jsonl_gz_path(tmp_path_factory, jsonl_path):
392405
393406@pytest .fixture (scope = "session" )
394407def zip_jsonl_path (jsonl_path , jsonl2_path , tmp_path_factory ):
395- import zipfile
396-
397408 path = tmp_path_factory .mktemp ("data" ) / "dataset.jsonl.zip"
398409 with zipfile .ZipFile (path , "w" ) as f :
399410 f .write (jsonl_path , arcname = os .path .basename (jsonl_path ))
@@ -403,8 +414,6 @@ def zip_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
403414
404415@pytest .fixture (scope = "session" )
405416def zip_jsonl_with_dir_path (jsonl_path , jsonl2_path , tmp_path_factory ):
406- import zipfile
407-
408417 path = tmp_path_factory .mktemp ("data" ) / "dataset_with_dir.jsonl.zip"
409418 with zipfile .ZipFile (path , "w" ) as f :
410419 f .write (jsonl_path , arcname = os .path .join ("main_dir" , os .path .basename (jsonl_path )))
@@ -451,8 +460,6 @@ def text2_path(tmp_path_factory):
451460
452461@pytest .fixture (scope = "session" )
453462def zip_text_path (text_path , text2_path , tmp_path_factory ):
454- import zipfile
455-
456463 path = tmp_path_factory .mktemp ("data" ) / "dataset.text.zip"
457464 with zipfile .ZipFile (path , "w" ) as f :
458465 f .write (text_path , arcname = os .path .basename (text_path ))
@@ -462,8 +469,6 @@ def zip_text_path(text_path, text2_path, tmp_path_factory):
462469
463470@pytest .fixture (scope = "session" )
464471def zip_text_with_dir_path (text_path , text2_path , tmp_path_factory ):
465- import zipfile
466-
467472 path = tmp_path_factory .mktemp ("data" ) / "dataset_with_dir.text.zip"
468473 with zipfile .ZipFile (path , "w" ) as f :
469474 f .write (text_path , arcname = os .path .join ("main_dir" , os .path .basename (text_path )))
@@ -487,8 +492,6 @@ def image_file():
487492
488493@pytest .fixture (scope = "session" )
489494def zip_image_path (image_file , tmp_path_factory ):
490- import zipfile
491-
492495 path = tmp_path_factory .mktemp ("data" ) / "dataset.img.zip"
493496 with zipfile .ZipFile (path , "w" ) as f :
494497 f .write (image_file , arcname = os .path .basename (image_file ))
0 commit comments