JohnSnowLabs · DevinTDHa · Jul 23, 2025 · Jul 16, 2025 · Jul 18, 2025
diff --git a/build.sbt b/build.sbt
@@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64)
 
 organization := "com.johnsnowlabs.nlp"
 
-version := "6.0.5"
+version := "6.1.0-rc1"
 
 (ThisBuild / scalaVersion) := scalaVer
 

diff --git a/project/Dependencies.scala b/project/Dependencies.scala
@@ -128,11 +128,11 @@ object Dependencies {
   val azureIdentity = "com.azure" % "azure-identity" % azureIdentityVersion % Provided
   val azureStorage = "com.azure" % "azure-storage-blob" % azureStorageVersion % Provided
 
-  val llamaCppVersion = "0.1.6"
-  val llamaCppCPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-cpu" % llamaCppVersion
-  val llamaCppGPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-gpu" % llamaCppVersion
-  val llamaCppSilicon = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-silicon" % llamaCppVersion
-  val llamaCppAarch64 = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-aarch64" % llamaCppVersion
+  val llamaCppVersion = "1.0.1"
+  val llamaCppCPU = "com.johnsnowlabs.nlp" % "jsl-llamacpp-cpu" % llamaCppVersion
+  val llamaCppGPU = "com.johnsnowlabs.nlp" % "jsl-llamacpp-gpu" % llamaCppVersion
+  val llamaCppSilicon = "com.johnsnowlabs.nlp" % "jsl-llamacpp-silicon" % llamaCppVersion
+  val llamaCppAarch64 = "com.johnsnowlabs.nlp" % "jsl-llamacpp-aarch64" % llamaCppVersion
 
   val jsoupVersion = "1.18.2"
 

diff --git a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py
@@ -253,7 +253,9 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFMo
             nCtx=4096,
             nBatch=512,
             embedding=False,
-            nPredict=100
+            nPredict=100,
+            nGpuLayers=99,
+            systemPrompt="You are a helpful assistant."
         )
 
     @staticmethod

diff --git a/python/sparknlp/common/properties.py b/python/sparknlp/common/properties.py
diff --git a/python/test/annotator/embeddings/auto_gguf_embeddings_test.py b/python/test/annotator/embeddings/auto_gguf_embeddings_test.py
@@ -153,8 +153,8 @@ def runTest(self):
             .setInputCols("document")
             .setOutputCol("embeddings")
             .setBatchSize(4)
-            .setNUbatch(2048)
-            .setNBatch(2048)
+            .setNUbatch(4096)
+            .setNBatch(4096)
         )
         pipeline = Pipeline().setStages([self.document_assembler, model])
         results = pipeline.fit(self.long_data).transform(self.long_data)

diff --git a/python/test/annotator/seq2seq/auto_gguf_model_test.py b/python/test/annotator/seq2seq/auto_gguf_model_test.py
@@ -49,7 +49,7 @@ def runTest(self):
             .setOutputCol("completions")
             .setBatchSize(4)
             .setNPredict(20)
-            .setNGpuLayers(5)
+            .setNGpuLayers(99)
             .setTemperature(0.4)
             .setTopK(40)
             .setTopP(0.9)
@@ -78,7 +78,7 @@ def runTest(self):
             DocumentAssembler().setInputCol("text").setOutputCol("document")
         )
 
-        model = (
+        model: AutoGGUFModel = (
             AutoGGUFModel.pretrained()
             .setInputCols("document")
             .setOutputCol("completions")
@@ -87,23 +87,23 @@ def runTest(self):
 
         # Model Parameters
         model.setNThreads(8)
-        model.setNThreadsDraft(8)
+        # model.setNThreadsDraft(8)
         model.setNThreadsBatch(8)
-        model.setNThreadsBatchDraft(8)
+        # model.setNThreadsBatchDraft(8)
         model.setNCtx(512)
         model.setNBatch(32)
         model.setNUbatch(32)
         model.setNDraft(5)
-        model.setNChunks(-1)
-        model.setNSequences(1)
-        model.setPSplit(0.1)
+        # model.setNChunks(-1)
+        # model.setNSequences(1)
+        # model.setPSplit(0.1)
         model.setNGpuLayers(99)
         model.setNGpuLayersDraft(99)
         model.setGpuSplitMode("NONE")
         model.setMainGpu(0)
-        model.setTensorSplit([])
-        model.setGrpAttnN(1)
-        model.setGrpAttnW(512)
+        # model.setTensorSplit([])
+        # model.setGrpAttnN(1)
+        # model.setGrpAttnW(512)
         model.setRopeFreqBase(1.0)
         model.setRopeFreqScale(1.0)
         model.setYarnExtFactor(1.0)
@@ -113,14 +113,14 @@ def runTest(self):
         model.setYarnOrigCtx(0)
         model.setDefragmentationThreshold(-1.0)
         model.setNumaStrategy("DISTRIBUTE")
-        model.setRopeScalingType("UNSPECIFIED")
+        model.setRopeScalingType("NONE")
         model.setPoolingType("NONE")
         model.setModelDraft("")
-        model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache")
-        model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache")
+        # model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache")
+        # model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache")
         model.setEmbedding(False)
         model.setFlashAttention(False)
-        model.setInputPrefixBos(False)
+        # model.setInputPrefixBos(False)
         model.setUseMmap(False)
         model.setUseMlock(False)
         model.setNoKvOffload(False)
@@ -164,7 +164,7 @@ def runTest(self):
         # Special PySpark Parameters (Scala StructFeatures)
         model.setTokenIdBias({0: 0.0, 1: 0.0})
         model.setTokenBias({"!": 0.0, "?": 0.0})
-        model.setLoraAdapters({" ": 0.0})
+        # model.setLoraAdapters({" ": 0.0})
 
         pipeline = Pipeline().setStages([document_assembler, model])
         results = pipeline.fit(data).transform(data)

diff --git a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala
@@ -15,8 +15,8 @@
  */
 package com.johnsnowlabs.ml.gguf
 
-import com.johnsnowlabs.nlp.llama.{LlamaModel, ModelParameters}
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
+import de.kherud.llama.{LlamaModel, ModelParameters}
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.spark.SparkFiles
 import org.apache.spark.sql.SparkSession
@@ -42,7 +42,7 @@ class GGUFWrapper(var modelFileName: String, var modelFolder: String) extends Se
         val modelFilePath = SparkFiles.get(modelFileName)
 
         if (Paths.get(modelFilePath).toFile.exists()) {
-          modelParameters.setModelFilePath(modelFilePath)
+          modelParameters.setModel(modelFilePath)
           llamaModel = GGUFWrapper.withSafeGGUFModelLoader(modelParameters)
         } else
           throw new IllegalStateException(

diff --git a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapperMultiModal.scala
@@ -15,7 +15,7 @@
  */
 package com.johnsnowlabs.ml.gguf
 
-import com.johnsnowlabs.nlp.llama.{LlamaModel, ModelParameters}
+import de.kherud.llama.{LlamaModel, ModelParameters}
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.spark.SparkFiles
@@ -44,8 +44,8 @@ class GGUFWrapperMultiModal(var modelFileName: String, var mmprojFileName: Strin
           Paths.get(modelFilePath).toFile.exists() && Paths.get(mmprojFilePath).toFile.exists()
 
         if (filesExist) {
-          modelParameters.setModelFilePath(modelFilePath)
-          modelParameters.setMMProj(mmprojFilePath)
+          modelParameters.setModel(modelFilePath)
+//          modelParameters.setMMProj(mmprojFilePath) // TODO: re-enable once vision (mmproj) model support is implemented
           llamaModel = GGUFWrapperMultiModal.withSafeGGUFModelLoader(modelParameters)
         } else
           throw new IllegalStateException(

diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala
@@ -1,8 +1,8 @@
 package com.johnsnowlabs.nlp
 
 import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel
-import com.johnsnowlabs.nlp.llama.InferenceParameters
-import com.johnsnowlabs.nlp.llama.args._
+import de.kherud.llama.InferenceParameters
+import de.kherud.llama.args._
 import com.johnsnowlabs.nlp.serialization.StructFeature
 import org.apache.spark.ml.param._