From eec5c32c9c0475848e588f8e0f0609921e88d769 Mon Sep 17 00:00:00 2001 From: s-smits Date: Mon, 2 Mar 2026 15:59:38 +0100 Subject: [PATCH] Respect tokenizer.json padding settings in HuggingFaceEmbedder Fix #35600: Replace the hardcoded .setPadding(false) with .setPadding(info.padding() != DO_NOT_PAD), so that padding is enabled only when the tokenizer's own metadata, read via ModelInfo, specifies a padding strategy other than DO_NOT_PAD. --- .../main/java/ai/vespa/embedding/HuggingFaceEmbedder.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/model-integration/src/main/java/ai/vespa/embedding/HuggingFaceEmbedder.java b/model-integration/src/main/java/ai/vespa/embedding/HuggingFaceEmbedder.java index 4c67dfa2ed09..d83c2032ef3f 100644 --- a/model-integration/src/main/java/ai/vespa/embedding/HuggingFaceEmbedder.java +++ b/model-integration/src/main/java/ai/vespa/embedding/HuggingFaceEmbedder.java @@ -24,6 +24,7 @@ import java.util.Map; import java.util.logging.Logger; +import static com.yahoo.language.huggingface.ModelInfo.PaddingStrategy.DO_NOT_PAD; import static com.yahoo.language.huggingface.ModelInfo.TruncationStrategy.LONGEST_FIRST; @Beta @@ -110,11 +111,11 @@ public HuggingFaceEmbedder(OnnxRuntime onnx, Embedder.Runtime runtime, HuggingFa prependQuery = embedderConfig.prependQuery(); prependDocument = embedderConfig.prependDocument(); var tokenizerPath = modelHelper.getModelPathResolvingIfNecessary(embedderConfig.tokenizerPathReference()); + var info = HuggingFaceTokenizer.getModelInfo(tokenizerPath); var builder = new HuggingFaceTokenizer.Builder() .addSpecialTokens(true) .addDefaultModel(tokenizerPath) - .setPadding(false); - var info = HuggingFaceTokenizer.getModelInfo(tokenizerPath); + .setPadding(info.padding() != DO_NOT_PAD); log.fine(() -> Text.format("'%s' has info '%s'", tokenizerPath, info)); if (info.maxLength() == -1 || info.truncation() != LONGEST_FIRST) { // Force truncation to max token vector length accepted by model if tokenizer.json contains no valid truncation configuration