Commit d8d147f

feat: Smart model management
* **New UI option** – `memory_util` added to `settings.json` with a dropdown (high / medium / low) to let users control how aggressively the engine uses system memory.
* **Configuration updates** – `LlamacppConfig` now includes `memory_util`; the extension class stores it in a new `memoryMode` property and handles updates through `updateConfig`.
* **System memory handling**
  * Introduced `SystemMemory` interface and `getTotalSystemMemory()` to report combined VRAM + RAM.
  * Added helper methods `getKVCachePerToken`, `getLayerSize`, and a new `ModelPlan` type.
* **Smart model‑load planner** – `planModelLoad()` computes (a standalone sketch of this flow follows the commit metadata below):
  * Number of GPU layers that can fit in usable VRAM.
  * Maximum context length based on KV‑cache size and the selected memory utilization mode (high/medium/low).
  * Whether KV‑cache must be off‑loaded to CPU and the overall loading mode (GPU, Hybrid, CPU, Unsupported).
  * Detailed logging of the planning decision.
* **Improved support check** – `isModelSupported()` now:
  * Uses the combined VRAM/RAM totals from `getTotalSystemMemory()`.
  * Applies an 80% usable‑memory heuristic.
  * Returns **GREEN** only when both weights and KV‑cache fit in VRAM, **YELLOW** when they fit only in total memory or require CPU off‑load, and **RED** when the model cannot fit at all.
* **Cleanup** – Removed unused `GgufMetadata` import; updated imports and type definitions accordingly.
* **Documentation/comments** – Added explanatory JSDoc comments for the new methods and clarified the return semantics of `isModelSupported`.
1 parent 88fb1ac commit d8d147f
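The planning flow described in the commit message, restated as a self-contained sketch. This is illustrative only: the function name `sketchPlan`, its input shape, and the sample numbers are assumptions. The budget fractions (70% of VRAM; 70/50/40% of combined memory for high/medium/low) and the GPU → Hybrid/CPU fallback mirror `planModelLoad()` in the diff below; the per-layer last-resort fallback and requested-context clamping are omitted for brevity.

```ts
// Illustrative sketch of the memory-planning heuristic; not part of the extension's API.
type PlanMode = 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'

interface PlanInput {
  modelSizeBytes: number        // size of the GGUF weights
  layerCount: number            // <arch>.block_count from GGUF metadata
  kvCachePerTokenBytes: number  // fp16 K+V bytes per token across all layers
  totalVRAMBytes: number
  totalRAMBytes: number
  memoryMode: 'high' | 'medium' | 'low'
}

function sketchPlan(p: PlanInput): { gpuLayers: number; maxContextLength: number; mode: PlanMode } {
  const usableVRAM = p.totalVRAMBytes * 0.7 // 70% VRAM heuristic
  const fraction = { high: 0.7, medium: 0.5, low: 0.4 }[p.memoryMode]
  const usableTotal = (p.totalVRAMBytes + p.totalRAMBytes) * fraction

  // GPU layers: all of them if the weights fit the VRAM budget, otherwise as many whole layers as fit.
  const layerSize = p.modelSizeBytes / p.layerCount
  const gpuLayers =
    p.modelSizeBytes <= usableVRAM ? p.layerCount : Math.floor(usableVRAM / layerSize)

  // KV cache: prefer leftover VRAM, otherwise fall back to the combined-memory budget (KV cache on CPU).
  let kvBudget = usableVRAM - p.modelSizeBytes
  let mode: PlanMode = 'GPU'
  if (kvBudget <= 0) {
    kvBudget = usableTotal - p.modelSizeBytes
    mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
  }

  const maxContextLength = kvBudget > 0 ? Math.floor(kvBudget / p.kvCachePerTokenBytes) : 0
  if (gpuLayers <= 0 && maxContextLength <= 0) mode = 'Unsupported'

  return { gpuLayers, maxContextLength, mode }
}

// Example: 7 GB of weights, 32 layers, ~0.5 MB of KV cache per token, 8 GB VRAM + 16 GB RAM, "high" mode.
console.log(
  sketchPlan({
    modelSizeBytes: 7 * 1024 ** 3,
    layerCount: 32,
    kvCachePerTokenBytes: 524_288,
    totalVRAMBytes: 8 * 1024 ** 3,
    totalRAMBytes: 16 * 1024 ** 3,
    memoryMode: 'high',
  })
)
```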

File tree

2 files changed: +210 -34 lines changed

extensions/llamacpp-extension/settings.json

Lines changed: 15 additions & 0 deletions
@@ -36,6 +36,21 @@
     "controllerType": "checkbox",
     "controllerProps": { "value": true }
   },
+  {
+    "key": "memory_util",
+    "title": "Smart Memory utilization",
+    "description": "Smart memory utilization mode for running local GGUF models",
+    "controllerType": "dropdown",
+    "controllerProps": {
+      "value": "high",
+      "options": [
+        { "value": "high", "name": "High" },
+        { "value": "medium", "name": "Medium" },
+        { "value": "low", "name": "Low" }
+      ],
+      "recommended": "high"
+    }
+  },
   {
     "key": "threads",
     "title": "Threads",

extensions/llamacpp-extension/src/index.ts

Lines changed: 195 additions & 34 deletions
@@ -35,17 +35,15 @@ import {
 import { invoke } from '@tauri-apps/api/core'
 import { getProxyConfig } from './util'
 import { basename } from '@tauri-apps/api/path'
-import {
-  GgufMetadata,
-  readGgufMetadata,
-} from '@janhq/tauri-plugin-llamacpp-api'
+import { readGgufMetadata } from '@janhq/tauri-plugin-llamacpp-api'
 import { getSystemUsage } from '@janhq/tauri-plugin-hardware-api'
 
 type LlamacppConfig = {
   version_backend: string
   auto_update_engine: boolean
   auto_unload: boolean
   llamacpp_env: string
+  memory_util: string
   chat_template: string
   n_gpu_layers: number
   offload_mmproj: boolean
@@ -74,6 +72,13 @@ type LlamacppConfig = {
   ctx_shift: boolean
 }
 
+type ModelPlan = {
+  gpuLayers: number
+  maxContextLength: number
+  noOffloadKVCache: boolean
+  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
+}
+
 interface DownloadItem {
   url: string
   save_path: string
@@ -116,6 +121,12 @@ interface DeviceList {
   free: number
 }
 
+interface SystemMemory {
+  totalVRAM: number
+  totalRAM: number
+  totalMemory: number
+}
+
 /**
  * Override the default app.log function to use Jan's logging system.
  * @param args
@@ -159,6 +170,7 @@ export default class llamacpp_extension extends AIEngine {
   provider: string = 'llamacpp'
   autoUnload: boolean = true
   llamacpp_env: string = ''
+  memoryMode: string = 'high'
   readonly providerId: string = 'llamacpp'
 
   private config: LlamacppConfig
@@ -190,6 +202,7 @@ export default class llamacpp_extension extends AIEngine {
 
     this.autoUnload = this.config.auto_unload
     this.llamacpp_env = this.config.llamacpp_env
+    this.memoryMode = this.config.memory_util
 
     // This sets the base directory where model files for this provider are stored.
     this.providerPath = await joinPath([
@@ -836,6 +849,8 @@ export default class llamacpp_extension extends AIEngine {
       this.autoUnload = value as boolean
     } else if (key === 'llamacpp_env') {
      this.llamacpp_env = value as string
+    } else if (key === 'memory_util') {
+      this.memoryMode = value as string
     }
   }

@@ -1848,10 +1863,153 @@ export default class llamacpp_extension extends AIEngine {
       'tokenizer.chat_template'
     ]?.includes('tools')
   }
+  /**
+   * Get total system memory including both VRAM and RAM
+   */
+  private async getTotalSystemMemory(): Promise<SystemMemory> {
+    const devices = await this.getDevices()
+    let totalVRAM = 0
+
+    if (devices.length > 0) {
+      // Sum total VRAM across all GPUs
+      totalVRAM = devices
+        .map((d) => d.mem * 1024 * 1024)
+        .reduce((a, b) => a + b, 0)
+    }
+
+    // Get system RAM
+    const sys = await getSystemUsage()
+    const totalRAM = sys.total_memory * 1024 * 1024
+
+    const totalMemory = totalVRAM + totalRAM
+
+    logger.info(
+      `Total VRAM: ${totalVRAM} bytes, Total RAM: ${totalRAM} bytes, Total Memory: ${totalMemory} bytes`
+    )
+
+    return {
+      totalVRAM,
+      totalRAM,
+      totalMemory,
+    }
+  }
+  private async getKVCachePerToken(
+    meta: Record<string, string>
+  ): Promise<number> {
+    const arch = meta['general.architecture']
+    if (!arch) throw new Error('Invalid metadata: architecture not found')
+
+    const nLayer = Number(meta[`${arch}.block_count`])
+    const nHead = Number(meta[`${arch}.attention.head_count`])
+    if (!nLayer || !nHead) {
+      throw new Error('Invalid metadata: block_count or head_count not found')
+    }
+
+    const keyLen = Number(meta[`${arch}.attention.key_length`])
+    const valLen = Number(meta[`${arch}.attention.value_length`])
+    let headDim: number
+    if (keyLen && valLen) {
+      headDim = keyLen + valLen
+    } else {
+      const embeddingLen = Number(meta[`${arch}.embedding_length`])
+      if (!embeddingLen)
+        throw new Error('Invalid metadata: embedding_length not found')
+      headDim = (embeddingLen / nHead) * 2
+    }
+
+    const bytesPerElement = 2 // fp16
+    return nHead * headDim * bytesPerElement * nLayer
+  }
+
+  private async getLayerSize(
+    path: string,
+    meta: Record<string, string>
+  ): Promise<{ layerSize: number; totalLayers: number }> {
+    const modelSize = await this.getModelSize(path)
+    const arch = meta['general.architecture']
+    const totalLayers = Number(meta[`${arch}.block_count`])
+    if (!totalLayers) throw new Error('Invalid metadata: block_count not found')
+    return { layerSize: modelSize / totalLayers, totalLayers }
+  }
+
+  async planModelLoad(path: string, requestedCtx?: number): Promise<ModelPlan> {
+    const modelSize = await this.getModelSize(path)
+    const memoryInfo = await this.getTotalSystemMemory()
+    const gguf = await readGgufMetadata(path)
+
+    const { layerSize, totalLayers } = await this.getLayerSize(
+      path,
+      gguf.metadata
+    )
+    const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
+
+    // VRAM budget (70% heuristic)
+    const USABLE_VRAM_PERCENTAGE = 0.7
+    const usableVRAM = memoryInfo.totalVRAM * USABLE_VRAM_PERCENTAGE
+
+    // System RAM budget (depends on this.memoryMode: low/medium/high)
+    const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }
+    const usableSystemMemory =
+      memoryInfo.totalMemory * memoryPercentages[this.memoryMode]
+
+    // --- GPU layers ---
+    let gpuLayers = 0
+    if (modelSize <= usableVRAM) {
+      gpuLayers = totalLayers
+    } else {
+      gpuLayers = Math.floor(usableVRAM / layerSize)
+    }
+
+    // --- Context length & KV cache ---
+    let availableForKVCache = usableVRAM - modelSize
+    let maxContextLength = 0
+    let noOffloadKVCache = false
+    let mode: ModelPlan['mode'] = 'Unsupported'
+
+    if (availableForKVCache > 0) {
+      maxContextLength = Math.floor(availableForKVCache / kvCachePerToken)
+      noOffloadKVCache = false
+      mode = 'GPU'
+    }
+
+    // fallback: system RAM for KV cache
+    if (maxContextLength <= 0) {
+      availableForKVCache = usableSystemMemory - modelSize
+      if (availableForKVCache > 0) {
+        maxContextLength = Math.floor(availableForKVCache / kvCachePerToken)
+        noOffloadKVCache = true // KV cache forced to CPU
+        mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+      }
+    }
+
+    // still too big: safe reduction per layer
+    if (maxContextLength <= 0) {
+      const safeTokensPerLayer = Math.floor(
+        usableSystemMemory / ((kvCachePerToken / totalLayers) * 2)
+      )
+      maxContextLength = Math.max(0, safeTokensPerLayer)
+      noOffloadKVCache = true
+      mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+    }
+
+    // enforce user-requested context
+    if (requestedCtx) {
+      maxContextLength = Math.min(maxContextLength, requestedCtx)
+    }
+
+    if (gpuLayers <= 0 && maxContextLength <= 0) {
+      mode = 'Unsupported'
+    }
+
+    logger.info(
+      `Plan for ${path}: gpuLayers=${gpuLayers}, maxContextLength=${maxContextLength}, noOffloadKVCache=${noOffloadKVCache}, mode=${mode}`
+    )
+
+    return { gpuLayers, maxContextLength, noOffloadKVCache, mode }
+  }
 
   /**
-   * estimate KVCache size of from a given metadata
-   *
+   * estimate KVCache size from a given metadata
   */
   private async estimateKVCache(
     meta: Record<string, string>,
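To make the per-token KV-cache arithmetic above concrete, here is a worked example with made-up metadata (not from any particular model): 32 layers, 32 attention heads, key_length = value_length = 128, fp16 elements.

```ts
// Hypothetical metadata plugged into the formula used by getKVCachePerToken above.
const nLayer = 32
const nHead = 32
const headDim = 128 + 128   // key_length + value_length
const bytesPerElement = 2   // fp16
const kvPerToken = nHead * headDim * bytesPerElement * nLayer // 524,288 bytes (~0.5 MiB) per token

const ctx = 8192
console.log(`KV cache for ${ctx} tokens ≈ ${(kvPerToken * ctx) / 1024 ** 3} GiB`) // 4 GiB
```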
@@ -1891,6 +2049,7 @@ export default class llamacpp_extension extends AIEngine {
         `Using embedding_length estimation: ${embeddingLen}, calculated head_dim: ${headDim}`
       )
     }
+
     let ctxLen: number
     if (!ctx_size) {
       ctxLen = Number(meta[`${arch}.context_length`])
@@ -1925,60 +2084,62 @@ export default class llamacpp_extension extends AIEngine {
     }
   }
 
-  /*
-   * check the support status of a model by its path (local/remote)
+  /**
+   * Check the support status of a model by its path (local/remote)
    *
-   * * Returns:
-   * - "RED" → weights don't fit
-   * - "YELLOW" → weights fit, KV cache doesn't
-   * - "GREEN" → both weights + KV cache fit
+   * Returns:
+   * - "RED" → weights don't fit in total memory
+   * - "YELLOW" → weights fit in VRAM but need system RAM, or KV cache doesn't fit
+   * - "GREEN" → both weights + KV cache fit in VRAM
    */
   async isModelSupported(
     path: string,
     ctx_size?: number
   ): Promise<'RED' | 'YELLOW' | 'GREEN'> {
     try {
       const modelSize = await this.getModelSize(path)
+      const memoryInfo = await this.getTotalSystemMemory()
+
       logger.info(`modelSize: ${modelSize}`)
-      let gguf: GgufMetadata
-      gguf = await readGgufMetadata(path)
+
+      const gguf = await readGgufMetadata(path)
       let kvCacheSize: number
       if (ctx_size) {
         kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
       } else {
         kvCacheSize = await this.estimateKVCache(gguf.metadata)
       }
-      // total memory consumption = model weights + kvcache + a small buffer for outputs
-      // output buffer is small so not considering here
+
+      // Total memory consumption = model weights + kvcache
       const totalRequired = modelSize + kvCacheSize
       logger.info(
         `isModelSupported: Total memory requirement: ${totalRequired} for ${path}`
       )
-      let totalMemBytes: number
-      const devices = await this.getDevices()
-      if (devices.length > 0) {
-        // Sum total memory across all GPUs
-        totalMemBytes = devices
-          .map((d) => d.mem * 1024 * 1024)
-          .reduce((a, b) => a + b, 0)
-      } else {
-        // CPU fallback
-        const sys = await getSystemUsage()
-        totalMemBytes = sys.total_memory * 1024 * 1024
-      }
 
       // Use 80% of total memory as the usable limit
       const USABLE_MEMORY_PERCENTAGE = 0.8
-      const usableMemBytes = totalMemBytes * USABLE_MEMORY_PERCENTAGE
+      const usableTotalMemory =
+        memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
+      const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
 
-      // check model size wrt 80% of system memory
-      if (modelSize > usableMemBytes) {
+      // Check if model fits in total memory at all
+      if (modelSize > usableTotalMemory) {
         return 'RED'
-      } else if (modelSize + kvCacheSize > usableMemBytes) {
-        return 'YELLOW'
-      } else {
+      }
+
+      // Check if everything fits in VRAM (ideal case)
+      if (totalRequired <= usableVRAM) {
         return 'GREEN'
       }
+
+      // Check if model fits in VRAM but total requirement exceeds VRAM
+      // OR if total requirement fits in total memory but not in VRAM
+      if (modelSize <= usableVRAM || totalRequired <= usableTotalMemory) {
+        return 'YELLOW'
+      }
+
+      // If we get here, nothing fits properly
+      return 'RED'
     } catch (e) {
       throw new Error(String(e))
     }
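A hypothetical caller could combine the new methods like this. The wrapper function, its structural interface, and the flag mapping in the comment are assumptions for illustration; only `isModelSupported()` and `planModelLoad()` come from this commit.

```ts
// Minimal structural type for the two methods this sketch needs (the real class is llamacpp_extension).
interface PlannerLike {
  isModelSupported(path: string, ctxSize?: number): Promise<'RED' | 'YELLOW' | 'GREEN'>
  planModelLoad(path: string, requestedCtx?: number): Promise<{
    gpuLayers: number
    maxContextLength: number
    noOffloadKVCache: boolean
    mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
  }>
}

// Sketch of how a UI or loader might consume the support check and the plan.
async function previewModelLoad(engine: PlannerLike, modelPath: string) {
  const support = await engine.isModelSupported(modelPath) // 'RED' | 'YELLOW' | 'GREEN'
  if (support === 'RED') {
    throw new Error(`${modelPath} does not fit in available memory`)
  }

  const plan = await engine.planModelLoad(modelPath)
  console.log(
    `mode=${plan.mode} gpuLayers=${plan.gpuLayers} ctx=${plan.maxContextLength} kvOnCPU=${plan.noOffloadKVCache}`
  )
  // A loader would translate the plan into llama.cpp server arguments
  // (e.g. -ngl for gpuLayers, -c for context size, --no-kv-offload when
  // noOffloadKVCache is true); that wiring is not part of this commit.
  return plan
}
```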
