llama: add new experimental context params from llama.cpp

deadprogram · deadprogram · commit 380a4b431848 · 2026-01-05T09:00:23.000Z
Signed-off-by: deadprogram <ron@hybridgroup.com>
diff --git a/pkg/llama/context.go b/pkg/llama/context.go
@@ -23,7 +23,8 @@ var FFITypeContextParams = ffi.NewType(
 	&ffi.TypePointer, &ffi.TypePointer,
 	&ffi.TypeUint8, &ffi.TypeUint8,
 	&ffi.TypeUint8, &ffi.TypeUint8,
-	&ffi.TypeUint8, &ffi.TypeUint8)
+	&ffi.TypeUint8, &ffi.TypeUint8,
+	&ffi.TypeUint32, &ffi.TypeSint32)
 
 var (
 	// LLAMA_API struct llama_context_params        llama_context_default_params(void);
diff --git a/pkg/llama/llama.go b/pkg/llama/llama.go
@@ -341,6 +341,11 @@ type ContextParams struct {
 	OpOffload          uint8              // offload host tensor operations to device
 	SwaFull            uint8              // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 	KVUnified          uint8              // use a unified buffer across the input sequences when computing the attentions
+	// [EXPERIMENTAL]
+	// backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
+	// note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
+	Samplers  uintptr // llama_sampler_seq_config *
+	NSamplers uint32  // number of sampler chains
 }
 
 // Model quantize parameters