Skip to content
42 changes: 28 additions & 14 deletions gguf-py/gguf/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@
from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
SentencePieceTokenizer,
)

# Optional helper that exists only in newer mistral-common releases; import it
# if available, otherwise fall back to the legacy file-selection logic below.
try:
    from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports]
        get_one_valid_tokenizer_file,
    )
except ImportError:
    # We still want the conversion to work with older mistral-common versions.
    # `None` acts as a sentinel: callers check it to decide whether to use the
    # library helper or the manual tokenizer-file search.
    get_one_valid_tokenizer_file = None
Comment thread
CISC marked this conversation as resolved.
Outdated
except ImportError:
_mistral_common_installed = False
MistralTokenizer = None
Expand Down Expand Up @@ -673,21 +681,27 @@ def __init__(self, base_path: Path):

# Find the tokenizer files
all_files = [f.as_posix() for f in base_path.glob("**/*") if f.is_file()]
valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)

if len(valid_tokenizer_files) == 0:
raise ValueError(f"No tokenizer file found in the directory: {base_path}")
# If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
if len(valid_tokenizer_files) > 1:
if "tekken.json" in valid_tokenizer_files:
tokenizer_file = "tekken.json"
else:
tokenizer_file = sorted(valid_tokenizer_files)[-1]
logger.warning(
f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
)

if get_one_valid_tokenizer_file is not None:
tokenizer_file_path = get_one_valid_tokenizer_file(all_files)
# get_one_valid_tokenizer_file returns a path rather than just the file name, e.g. "ministral3b\tekken.json" instead of "tekken.json"
tokenizer_file = Path(tokenizer_file_path).name
Comment thread
CISC marked this conversation as resolved.
Outdated
else:
tokenizer_file = valid_tokenizer_files[0]
valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)

if len(valid_tokenizer_files) == 0:
raise ValueError(f"No tokenizer file found in the directory: {base_path}")
# If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
if len(valid_tokenizer_files) > 1:
if "tekken.json" in valid_tokenizer_files:
tokenizer_file = "tekken.json"
else:
tokenizer_file = sorted(valid_tokenizer_files)[-1]
logger.warning(
f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
)
else:
tokenizer_file = valid_tokenizer_files[0]

self.tokenizer = MistralTokenizer.from_file(
base_path / tokenizer_file
Expand Down