|
15 | 15 | logger = datasets.utils.logging.get_logger(__name__) |
16 | 16 |
|
17 | 17 |
|
if datasets.config.PYARROW_VERSION.major < 7:

    def pa_table_from_pylist(mapping):
        """Build a ``pa.Table`` from a list of row dictionaries.

        Fallback used when the installed pyarrow major version is below 7.
        Adapted from the pyarrow implementation of ``Table.from_pylist``:
        https://github.com/apache/arrow/blob/master/python/pyarrow/table.pxi#L5193

        Args:
            mapping: List of rows, each row mapping column names to values.

        Returns:
            A table with one column per key of the first row. An empty
            ``mapping`` yields a table with no columns.
        """
        # The first row alone decides which columns exist; keys that only
        # show up in later rows are not picked up.
        column_names = list(mapping[0].keys()) if mapping else []
        # A row lacking one of the columns contributes None for that cell.
        columns = [
            [record[name] if name in record else None for record in mapping]
            for name in column_names
        ]
        return pa.Table.from_arrays(columns, column_names)

else:

    def pa_table_from_pylist(mapping):
        """Build a ``pa.Table`` from a list of row dictionaries.

        Thin wrapper that delegates to ``pa.Table.from_pylist`` when the
        installed pyarrow major version is 7 or newer.
        """
        return pa.Table.from_pylist(mapping)
| 35 | + |
| 36 | + |
18 | 37 | @dataclass |
19 | 38 | class JsonConfig(datasets.BuilderConfig): |
20 | 39 | """BuilderConfig for JSON.""" |
@@ -125,18 +144,29 @@ def _generate_tables(self, files): |
125 | 144 | ) |
126 | 145 | block_size *= 2 |
127 | 146 | except pa.ArrowInvalid as e: |
128 | | - logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}") |
129 | 147 | try: |
130 | 148 | with open(file, encoding="utf-8") as f: |
131 | 149 | dataset = json.load(f) |
132 | 150 | except json.JSONDecodeError: |
| 151 | + logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}") |
133 | 152 | raise e |
134 | | - raise ValueError( |
135 | | - f"Not able to read records in the JSON file at {file}. " |
136 | | - f"You should probably indicate the field of the JSON file containing your records. " |
137 | | - f"This JSON file contain the following fields: {str(list(dataset.keys()))}. " |
138 | | - f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. " |
139 | | - ) from None |
| 153 | + # If possible, parse the file as a list of json objects and exit the loop |
| 154 | + if isinstance(dataset, list): # list is the only sequence type supported in JSON |
| 155 | + try: |
| 156 | + pa_table = pa_table_from_pylist(dataset) |
| 157 | + except (pa.ArrowInvalid, AttributeError) as e: |
| 158 | + logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}") |
| 159 | + raise ValueError(f"Not able to read records in the JSON file at {file}.") from None |
| 160 | + yield file_idx, self._cast_table(pa_table) |
| 161 | + break |
| 162 | + else: |
| 163 | + logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}") |
| 164 | + raise ValueError( |
| 165 | + f"Not able to read records in the JSON file at {file}. " |
| 166 | + f"You should probably indicate the field of the JSON file containing your records. " |
| 167 | + f"This JSON file contain the following fields: {str(list(dataset.keys()))}. " |
| 168 | + f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. " |
| 169 | + ) from None |
140 | 170 | # Uncomment for debugging (will print the Arrow table size and elements) |
141 | 171 | # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}") |
142 | 172 | # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows))) |
|
0 commit comments