Merged

42 commits
effb802
feat - GPTSFTChatDataset alignment with OpenAI Messages, compatibilit…
soluwalana Apr 30, 2025
c9bf3f0
feat - GPTSFTChatDataset alignment with OpenAI Messages, compatibilit…
soluwalana Apr 30, 2025
f62aa0f
Apply isort and black reformatting
soluwalana Apr 30, 2025
aac5d65
lint
soluwalana Apr 30, 2025
c65b02e
update test case
soluwalana Apr 30, 2025
cfe0d4e
F string lint issue
soluwalana Apr 30, 2025
5b43c01
Lint line length
soluwalana Apr 30, 2025
c59d8e1
Apply isort and black reformatting
soluwalana Apr 30, 2025
50f57a8
Apply isort and black reformatting
soluwalana May 1, 2025
71d3f7b
More lint
soluwalana May 1, 2025
e84c9fd
Apply isort and black reformatting
soluwalana May 1, 2025
e03ca17
More lint
soluwalana May 1, 2025
1aa493d
Apply isort and black reformatting
soluwalana May 1, 2025
2cc6b76
Missing parameters
soluwalana May 1, 2025
99a5bea
Don't download the model everytime
soluwalana May 2, 2025
ded682d
Code lint
soluwalana May 2, 2025
09e5cb5
Rollback change on whitespace in sentencepiece tokenizer
soluwalana May 2, 2025
379787b
Ensure all loss_masks are labeled as loss_mask
soluwalana May 8, 2025
5697d16
Apply isort and black reformatting
soluwalana May 14, 2025
9e42664
Remove duplicate function left in for rebase only"
soluwalana May 14, 2025
459a257
label loss mask actually loss mask (as there is an attention mask as …
soluwalana May 14, 2025
36956f4
Lint line length
soluwalana May 14, 2025
129e3d9
Apply isort and black reformatting
soluwalana May 14, 2025
06a1e3b
default use_hf_tokenzier_chat_template = True
soluwalana May 14, 2025
2098584
Add original build_samples_mapping back in
soluwalana May 14, 2025
1f29e14
PR feedback incorporation
soluwalana May 14, 2025
508bea6
Apply isort and black reformatting
soluwalana May 14, 2025
5289918
PR feedback incorporation
soluwalana May 14, 2025
07109f9
Apply isort and black reformatting
soluwalana May 14, 2025
e02f960
Fix tests
soluwalana May 15, 2025
b54ec35
Apply isort and black reformatting
soluwalana May 15, 2025
806ac13
PR changes
soluwalana May 15, 2025
44d1826
Apply isort and black reformatting
soluwalana May 15, 2025
83bc752
Merge branch 'main' into solu/chat-dataset-changes
chtruong814 May 17, 2025
b80defd
Merge branch 'main' into solu/chat-dataset-changes
chtruong814 May 17, 2025
a0340fb
Skip eval unit test (#13635)
chtruong814 May 17, 2025
51fd732
Map directly to the NeMo tokenizer path
soluwalana May 19, 2025
464c70b
Rollback change pointing to image cache
soluwalana May 19, 2025
65bcaec
Potential fix for code scanning alert no. 14984: Unused import
soluwalana May 19, 2025
29fd40b
correct path in TestData
soluwalana May 20, 2025
45c66f0
correct path in TestData
soluwalana May 20, 2025
a74b8e3
Merge branch 'main' into solu/chat-dataset-changes
soluwalana May 27, 2025
16 changes: 14 additions & 2 deletions nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
@@ -47,6 +47,7 @@ def __init__(
use_fast: Optional[bool] = True,
trust_remote_code: Optional[bool] = False,
include_special_tokens: bool = False,
chat_template: Optional[str] = None,
):
"""
Args:
@@ -68,14 +69,18 @@
use_fast: whether to use fast HuggingFace tokenizer
include_special_tokens: when True, converting text to ids will include special tokens / prompt tokens (if
any), yielding self.tokenizer(text).input_ids
chat_template: The chat template string used to format "messages" via the underlying HF tokenizer's
apply_chat_template function
"""
try:
self._initialize_tokenizer(pretrained_model_name, vocab_file, merges_file, use_fast, trust_remote_code)
self._initialize_tokenizer(
pretrained_model_name, vocab_file, merges_file, use_fast, trust_remote_code, chat_template
)
assert self.tokenizer, "tokenizer not initialized"
except Exception:
try:
self._initialize_tokenizer(
pretrained_model_name, vocab_file, merges_file, not use_fast, trust_remote_code
pretrained_model_name, vocab_file, merges_file, not use_fast, trust_remote_code, chat_template
)
assert self.tokenizer, "tokenizer not initialized"
except Exception as e:
@@ -168,6 +173,7 @@ def _initialize_tokenizer(
merges_file: Optional[str] = None,
use_fast: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
chat_template: Optional[str] = None,
):
# this logic deals with different huggingface tokenizers having different positional args
if vocab_file is None:
@@ -192,6 +198,12 @@
trust_remote_code=trust_remote_code,
)

if chat_template is not None:
if getattr(self.tokenizer, 'chat_template', None) is not None:
logging.info("You are overwriting tokenizer's chat template, confirm this is intended.")
self.tokenizer.chat_template = chat_template
self.tokenizer.chat_template_format = "jinja"

@property
def vocab_size(self):
"""
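
For reference, a minimal usage sketch of the new `chat_template` argument added in this diff. The template string, model name, and messages below are illustrative assumptions, not taken from the PR; the wrapper class and its wrapped `self.tokenizer` attribute are as shown in `auto_tokenizer.py`.

```python
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer

# Hypothetical Jinja template; any template accepted by HF's apply_chat_template works here.
CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "<|{{ message['role'] }}|>{{ message['content'] }}\n"
    "{% endfor %}"
)

# Passing chat_template sets it on the underlying HF tokenizer; if one is already
# attached, the new code logs an info message before overwriting it.
tokenizer = AutoTokenizer(
    pretrained_model_name="meta-llama/Llama-2-7b-hf",  # assumed model name, for illustration only
    chat_template=CHAT_TEMPLATE,
)

messages = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there."},
]

# The template is applied through the wrapped HF tokenizer exposed as .tokenizer.
text = tokenizer.tokenizer.apply_chat_template(messages, tokenize=False)
print(text)
```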