huggingface · lhoestq · Apr 26, 2021 · Apr 21, 2021 · Apr 21, 2021 · Apr 21, 2021
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -1771,8 +1771,21 @@ def init_buffer_and_writer():
         # Optionally initialize the writer as a context manager
         with contextlib.ExitStack() as stack:
             try:
+                # Only load the columns we actually need
+                if input_columns:
+                    input_dataset = self.with_format(
+                        self._format_type,
+                        columns=input_columns,
+                        output_all_columns=False,
+                        **self._format_kwargs
+                    )
+                    if remove_columns:
+                        remove_columns = list(set(remove_columns) & set(input_columns))
+                else:
+                    input_dataset = self
+
                 # Loop over single examples or batches and write to buffer/file if examples are to be updated
-                pbar_iterable = self if not batched else range(0, len(self), batch_size)
+                pbar_iterable = input_dataset if not batched else range(0, len(input_dataset), batch_size)
                 pbar_unit = "ex" if not batched else "ba"
                 pbar_desc = "#" + str(rank) if rank is not None else None
                 pbar = tqdm(pbar_iterable, disable=not_verbose, position=rank, unit=pbar_unit, desc=pbar_desc)
@@ -1790,13 +1803,13 @@ def init_buffer_and_writer():
                                 writer.write(example)
                 else:
                     for i in pbar:
-                        if drop_last_batch and i + batch_size > self.num_rows:
+                        if drop_last_batch and i + batch_size > input_dataset.num_rows:
                             continue
-                        batch = self[i : i + batch_size]
-                        indices = list(range(*(slice(i, i + batch_size).indices(self.num_rows))))  # Something simpler?
+                        batch = input_dataset[i : i + batch_size]
+                        indices = list(range(*(slice(i, i + batch_size).indices(input_dataset.num_rows))))  # Something simpler?
                         try:
                             batch = apply_function_on_filtered_inputs(
-                                batch, indices, check_same_num_examples=len(self.list_indexes()) > 0, offset=offset
+                                batch, indices, check_same_num_examples=len(input_dataset.list_indexes()) > 0, offset=offset
                             )
                         except NumExamplesMismatch:
                             raise DatasetTransformationNotAllowedError(

diff --git a/src/datasets/formatting/formatting.py b/src/datasets/formatting/formatting.py
@@ -90,7 +90,7 @@ def _query_table(table: Table, key: Union[int, slice, range, str, Iterable]) ->
         if len(key) == 0:
             return table.table.slice(0, 0)
         # don't use pyarrow.Table.take even for pyarrow >=1.0 (see https://issues.apache.org/jira/browse/ARROW-9773)
-        return pa.concat_tables(table.fast_slice(int(i) % table.num_rows, 1) for i in key)
+        return table.fast_gather(np.array(key) % table.num_rows)
 
     _raise_bad_key_type(key)
 

diff --git a/src/datasets/table.py b/src/datasets/table.py
@@ -97,6 +97,19 @@ def __init__(self, table: pa.Table):
         self._batches = table.to_batches()
         self._offsets = np.cumsum([0] + [len(b) for b in self._batches])
 
+    def fast_gather(self, indices) -> pa.Table:
+        """
+        Create a pa.Table by gathering the records at the specified indices, in the order given. Should be faster
+        than pa.concat_tables(table.fast_slice(int(i) % table.num_rows, 1) for i in indices) since NumPy computes
+        all the binary searches over the batch offsets in one vectorized call, in highly optimized C.
+        Raises ValueError if `indices` is empty; indices are expected to be already in [0, num_rows).
+        """
+        if len(indices) == 0:  # explicit check instead of `assert`, which is stripped under `python -O`
+            raise ValueError("Indices must be non-empty")
+        batch_indices = np.searchsorted(self._offsets, indices, side="right") - 1
+        rows = [self._batches[b].slice(i - self._offsets[b], 1) for b, i in zip(batch_indices, indices)]
+        return pa.Table.from_batches(rows, schema=self._schema)
+
     def fast_slice(self, offset=0, length=None) -> pa.Table:
         """
         Slice the Table using interpolation search.