Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions bodo/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1634,16 +1634,29 @@ def __init__(self, series):

def tokenize(
    self,
    # Annotation is quoted: `transformers` is imported lazily inside the body,
    # so an unquoted annotation would raise NameError at class-definition time
    # unless `from __future__ import annotations` is in effect.
    tokenizer: "Callable[[], transformers.PreTrainedTokenizerBase] | transformers.PreTrainedTokenizerBase",  # noqa: F821
) -> BodoSeries:
    """Tokenize a series of strings into a series of lists of int64 token ids.

    Args:
        tokenizer: Either an already-constructed transformers tokenizer
            instance, or a zero-argument callable that builds one (the
            callable form lets each worker construct its own tokenizer).

    Returns:
        BodoSeries: one list of int64 token ids per input string.

    Raises:
        ImportError: if the optional 'transformers' package is not installed.
    """
    self._check_ai_input("tokenize")

    try:
        import transformers
    except ImportError as err:
        # Chain the original ImportError so the real failure is visible.
        raise ImportError(
            "Series.ai.tokenize() requires the 'transformers' package to be installed. "
            "Please install it using 'pip install transformers'."
        ) from err

    if isinstance(tokenizer, transformers.PreTrainedTokenizerBase):
        # Normalize the instance to a factory so map_with_state always
        # receives a callable for its state-initialization argument.
        def tokenizer_func():
            return tokenizer
    else:
        tokenizer_func = tokenizer

    def per_row(tokenizer, row):
        # Encode a single string; add_special_tokens matches the plain
        # `tokenizer.encode(...)` behavior tested against pandas.
        return tokenizer.encode(row, add_special_tokens=True)

    list_of_int64 = pa.list_(pa.int64())
    return self._series.map_with_state(
        tokenizer_func,
        per_row,
        output_type=pd.Series(dtype=pd.ArrowDtype(list_of_int64)),
    )
Expand Down
15 changes: 8 additions & 7 deletions bodo/tests/test_df_lib/test_ai_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ def test_query_s3_vectors_error_checking():
)


def test_tokenize():
@pytest.mark.parametrize("init_func", [True, False])
def test_tokenize(init_func):
from transformers import AutoTokenizer

a = pd.Series(
Expand All @@ -160,14 +161,14 @@ def test_tokenize():
]
)
ba = bd.Series(a)
if init_func:
tokenizer = lambda: AutoTokenizer.from_pretrained("bert-base-uncased")
else:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def ret_tokenizer():
# Load a pretrained tokenizer (e.g., BERT)
return AutoTokenizer.from_pretrained("bert-base-uncased")

pd_tokenizer = ret_tokenizer()
pd_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
b = a.map(lambda x: pd_tokenizer.encode(x, add_special_tokens=True))
bb = ba.ai.tokenize(ret_tokenizer)
bb = ba.ai.tokenize(tokenizer)

_test_equal(
bb,
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/api_docs/dataframe_lib/series/ai/tokenize.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Tokenize a series of string dtype into a series of lists of int64.

<p class="api-header">Parameters</p>

: __tokenizer: *function*:__ A function returning a Transformers.PreTrainedTokenizer.
: __tokenizer: *function | Transformers.PreTrainedTokenizer*:__ Either a Transformers.PreTrainedTokenizer instance, or a function returning one.

<p class="api-header">Returns</p>

Expand Down