@@ -35,17 +35,15 @@ import {
 import { invoke } from '@tauri-apps/api/core'
 import { getProxyConfig } from './util'
 import { basename } from '@tauri-apps/api/path'
-import {
-  GgufMetadata,
-  readGgufMetadata,
-} from '@janhq/tauri-plugin-llamacpp-api'
+import { readGgufMetadata } from '@janhq/tauri-plugin-llamacpp-api'
 import { getSystemUsage } from '@janhq/tauri-plugin-hardware-api'
 
 type LlamacppConfig = {
   version_backend: string
   auto_update_engine: boolean
   auto_unload: boolean
   llamacpp_env: string
+  memory_util: string
   chat_template: string
   n_gpu_layers: number
   offload_mmproj: boolean
@@ -74,6 +72,13 @@ type LlamacppConfig = {
   ctx_shift: boolean
 }
 
+type ModelPlan = {
+  gpuLayers: number
+  maxContextLength: number
+  noOffloadKVCache: boolean
+  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
+}
+
 interface DownloadItem {
   url: string
   save_path: string
@@ -116,6 +121,12 @@ interface DeviceList {
   free: number
 }
 
+interface SystemMemory {
+  totalVRAM: number
+  totalRAM: number
+  totalMemory: number
+}
+
 /**
  * Override the default app.log function to use Jan's logging system.
  * @param args
@@ -159,6 +170,7 @@ export default class llamacpp_extension extends AIEngine {
   provider: string = 'llamacpp'
   autoUnload: boolean = true
   llamacpp_env: string = ''
+  memoryMode: string = 'high'
   readonly providerId: string = 'llamacpp'
 
   private config: LlamacppConfig
@@ -190,6 +202,7 @@ export default class llamacpp_extension extends AIEngine {
 
     this.autoUnload = this.config.auto_unload
     this.llamacpp_env = this.config.llamacpp_env
+    this.memoryMode = this.config.memory_util
 
     // This sets the base directory where model files for this provider are stored.
     this.providerPath = await joinPath([
@@ -836,6 +849,8 @@ export default class llamacpp_extension extends AIEngine {
       this.autoUnload = value as boolean
     } else if (key === 'llamacpp_env') {
       this.llamacpp_env = value as string
+    } else if (key === 'memory_util') {
+      this.memoryMode = value as string
     }
   }
 
@@ -1864,10 +1879,153 @@ export default class llamacpp_extension extends AIEngine {
       'tokenizer.chat_template'
     ]?.includes('tools')
   }
+
+  /**
+   * Get total system memory including both VRAM and RAM
+   */
+  private async getTotalSystemMemory(): Promise<SystemMemory> {
+    const devices = await this.getDevices()
+    let totalVRAM = 0
+
+    if (devices.length > 0) {
+      // Sum total VRAM across all GPUs
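+      // NOTE: d.mem is assumed to be reported in MiB, hence the conversion to bytes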
+      totalVRAM = devices
+        .map((d) => d.mem * 1024 * 1024)
+        .reduce((a, b) => a + b, 0)
+    }
+
+    // Get system RAM
+    const sys = await getSystemUsage()
+    const totalRAM = sys.total_memory * 1024 * 1024
+
+    const totalMemory = totalVRAM + totalRAM
+
+    logger.info(
+      `Total VRAM: ${totalVRAM} bytes, Total RAM: ${totalRAM} bytes, Total Memory: ${totalMemory} bytes`
+    )
+
+    return {
+      totalVRAM,
+      totalRAM,
+      totalMemory,
+    }
+  }
+
+  private async getKVCachePerToken(
+    meta: Record<string, string>
+  ): Promise<number> {
+    const arch = meta['general.architecture']
+    if (!arch) throw new Error('Invalid metadata: architecture not found')
+
+    const nLayer = Number(meta[`${arch}.block_count`])
+    const nHead = Number(meta[`${arch}.attention.head_count`])
+    if (!nLayer || !nHead) {
+      throw new Error('Invalid metadata: block_count or head_count not found')
+    }
+
+    const keyLen = Number(meta[`${arch}.attention.key_length`])
+    const valLen = Number(meta[`${arch}.attention.value_length`])
+    let headDim: number
+    if (keyLen && valLen) {
+      headDim = keyLen + valLen
+    } else {
+      const embeddingLen = Number(meta[`${arch}.embedding_length`])
+      if (!embeddingLen)
+        throw new Error('Invalid metadata: embedding_length not found')
+      headDim = (embeddingLen / nHead) * 2
+    }
+
+    const bytesPerElement = 2 // fp16
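+    // headDim already covers both K and V. Using attention.head_count means
+    // GQA models (head_count_kv < head_count) are overestimated, so this is
+    // a conservative upper bound rather than the exact allocation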
+    return nHead * headDim * bytesPerElement * nLayer
+  }
+
+  private async getLayerSize(
+    path: string,
+    meta: Record<string, string>
+  ): Promise<{ layerSize: number; totalLayers: number }> {
+    const modelSize = await this.getModelSize(path)
+    const arch = meta['general.architecture']
+    const totalLayers = Number(meta[`${arch}.block_count`])
+    if (!totalLayers) throw new Error('Invalid metadata: block_count not found')
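+    // Approximation: treats all layers as equally sized, folding the embedding
+    // and output tensors into the per-layer average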
+    return { layerSize: modelSize / totalLayers, totalLayers }
+  }
+
+  async planModelLoad(path: string, requestedCtx?: number): Promise<ModelPlan> {
+    const modelSize = await this.getModelSize(path)
+    const memoryInfo = await this.getTotalSystemMemory()
+    const gguf = await readGgufMetadata(path)
+
+    const { layerSize, totalLayers } = await this.getLayerSize(
+      path,
+      gguf.metadata
+    )
+    const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
+
+    // VRAM budget (70% heuristic)
+    const USABLE_VRAM_PERCENTAGE = 0.7
+    const usableVRAM = memoryInfo.totalVRAM * USABLE_VRAM_PERCENTAGE
+
+    // Overall memory budget, scaled by this.memoryMode (low/medium/high);
+    // note this is computed over combined VRAM + RAM, not system RAM alone
+    const memoryPercentages: Record<string, number> = { high: 0.7, medium: 0.5, low: 0.4 }
+    // Fall back to 'high' so an unrecognized mode never yields a NaN budget
+    const usableSystemMemory =
+      memoryInfo.totalMemory * (memoryPercentages[this.memoryMode] ?? memoryPercentages.high)
+
+    // --- GPU layers ---
+    let gpuLayers = 0
+    if (modelSize <= usableVRAM) {
+      gpuLayers = totalLayers
+    } else {
+      gpuLayers = Math.floor(usableVRAM / layerSize)
+    }
+
+    // --- Context length & KV cache ---
+    let availableForKVCache = usableVRAM - modelSize
+    let maxContextLength = 0
+    let noOffloadKVCache = false
+    let mode: ModelPlan['mode'] = 'Unsupported'
+
+    if (availableForKVCache > 0) {
+      maxContextLength = Math.floor(availableForKVCache / kvCachePerToken)
+      noOffloadKVCache = false
+      mode = 'GPU'
+    }
+
+    // Fallback: use system RAM for the KV cache
+    if (maxContextLength <= 0) {
+      availableForKVCache = usableSystemMemory - modelSize
+      if (availableForKVCache > 0) {
+        maxContextLength = Math.floor(availableForKVCache / kvCachePerToken)
+        noOffloadKVCache = true // KV cache forced to CPU
+        mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+      }
+    }
+
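+    // Last-resort heuristic: the divisor is the per-layer per-token KV cost
+    // with a 2x safety factor, yielding a deliberately small context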
+    // Still too big: reduce the context to a safe size
+    if (maxContextLength <= 0) {
+      const safeTokensPerLayer = Math.floor(
+        usableSystemMemory / ((kvCachePerToken / totalLayers) * 2)
+      )
+      maxContextLength = Math.max(0, safeTokensPerLayer)
+      noOffloadKVCache = true
+      mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+    }
+
+    // Enforce the user-requested context length, if any
+    if (requestedCtx) {
+      maxContextLength = Math.min(maxContextLength, requestedCtx)
+    }
+
+    if (gpuLayers <= 0 && maxContextLength <= 0) {
+      mode = 'Unsupported'
+    }
+
+    logger.info(
+      `Plan for ${path}: gpuLayers=${gpuLayers}, maxContextLength=${maxContextLength}, noOffloadKVCache=${noOffloadKVCache}, mode=${mode}`
+    )
+
+    return { gpuLayers, maxContextLength, noOffloadKVCache, mode }
+  }
 
   /**
-   * estimate KVCache size of from a given metadata
-   *
+   * Estimate KV cache size from the given metadata
    */
   private async estimateKVCache(
     meta: Record<string, string>,
@@ -1907,6 +2065,7 @@ export default class llamacpp_extension extends AIEngine {
         `Using embedding_length estimation: ${embeddingLen}, calculated head_dim: ${headDim}`
       )
     }
+
     let ctxLen: number
     if (!ctx_size) {
       ctxLen = Number(meta[`${arch}.context_length`])
@@ -1941,60 +2100,62 @@ export default class llamacpp_extension extends AIEngine {
     }
   }
 
-  /*
-   * check the support status of a model by its path (local/remote)
+  /**
+   * Check the support status of a model by its path (local/remote)
    *
-   * * Returns:
-   * - "RED" → weights don't fit
-   * - "YELLOW" → weights fit, KV cache doesn't
-   * - "GREEN" → both weights + KV cache fit
+   * Returns:
+   * - "RED" → weights don't fit in total memory
+   * - "YELLOW" → weights fit in VRAM but need system RAM, or KV cache doesn't fit
+   * - "GREEN" → both weights + KV cache fit in VRAM
    */
   async isModelSupported(
     path: string,
     ctx_size?: number
   ): Promise<'RED' | 'YELLOW' | 'GREEN'> {
     try {
       const modelSize = await this.getModelSize(path)
+      const memoryInfo = await this.getTotalSystemMemory()
+
       logger.info(`modelSize: ${modelSize}`)
-      let gguf: GgufMetadata
-      gguf = await readGgufMetadata(path)
+
+      const gguf = await readGgufMetadata(path)
       let kvCacheSize: number
       if (ctx_size) {
         kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
       } else {
         kvCacheSize = await this.estimateKVCache(gguf.metadata)
       }
-      // total memory consumption = model weights + kvcache + a small buffer for outputs
-      // output buffer is small so not considering here
+
+      // Total memory consumption = model weights + KV cache
       const totalRequired = modelSize + kvCacheSize
       logger.info(
         `isModelSupported: Total memory requirement: ${totalRequired} for ${path}`
       )
-      let totalMemBytes: number
-      const devices = await this.getDevices()
-      if (devices.length > 0) {
-        // Sum total memory across all GPUs
-        totalMemBytes = devices
-          .map((d) => d.mem * 1024 * 1024)
-          .reduce((a, b) => a + b, 0)
-      } else {
-        // CPU fallback
-        const sys = await getSystemUsage()
-        totalMemBytes = sys.total_memory * 1024 * 1024
-      }
 
       // Use 80% of total memory as the usable limit
       const USABLE_MEMORY_PERCENTAGE = 0.8
-      const usableMemBytes = totalMemBytes * USABLE_MEMORY_PERCENTAGE
+      const usableTotalMemory =
+        memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
+      const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
 
-      // check model size wrt 80% of system memory
-      if (modelSize > usableMemBytes) {
+      // Check whether the model fits in total memory at all
+      if (modelSize > usableTotalMemory) {
         return 'RED'
-      } else if (modelSize + kvCacheSize > usableMemBytes) {
-        return 'YELLOW'
-      } else {
+      }
+
+      // Everything fits in VRAM (ideal case)
+      if (totalRequired <= usableVRAM) {
         return 'GREEN'
       }
+
+      // Model weights fit in VRAM but the KV cache spills over,
+      // or the total requirement fits only in combined VRAM + RAM
+      if (modelSize <= usableVRAM || totalRequired <= usableTotalMemory) {
+        return 'YELLOW'
+      }
+
+      // Neither case holds: nothing fits
+      return 'RED'
     } catch (e) {
       throw new Error(String(e))
     }
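
A minimal sketch of how a caller might consume the new planModelLoad() result. The buildLoadArgs helper and its flag mapping (-ngl, -c, --no-kv-offload, in the style of llama-server options) are illustrative assumptions, not part of this diff:

// Hypothetical consumer of the ModelPlan type introduced above.
function buildLoadArgs(plan: ModelPlan): string[] {
  if (plan.mode === 'Unsupported') {
    throw new Error('Model does not fit in available memory')
  }
  const args = [
    '-ngl', String(plan.gpuLayers),       // layers offloaded to the GPU
    '-c', String(plan.maxContextLength),  // context window the plan allows
  ]
  if (plan.noOffloadKVCache) {
    args.push('--no-kv-offload') // keep the KV cache in system RAM
  }
  return args
}

// Example: a plan of { gpuLayers: 20, maxContextLength: 8192, noOffloadKVCache: true, mode: 'Hybrid' }
// yields: ['-ngl', '20', '-c', '8192', '--no-kv-offload']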