@@ -35,17 +35,15 @@ import {
 import { invoke } from '@tauri-apps/api/core'
 import { getProxyConfig } from './util'
 import { basename } from '@tauri-apps/api/path'
-import {
-  GgufMetadata,
-  readGgufMetadata,
-} from '@janhq/tauri-plugin-llamacpp-api'
+import { readGgufMetadata } from '@janhq/tauri-plugin-llamacpp-api'
 import { getSystemUsage } from '@janhq/tauri-plugin-hardware-api'
 
 type LlamacppConfig = {
   version_backend: string
   auto_update_engine: boolean
   auto_unload: boolean
   llamacpp_env: string
+  memory_util: string
   chat_template: string
   n_gpu_layers: number
   offload_mmproj: boolean
@@ -74,6 +72,13 @@ type LlamacppConfig = {
   ctx_shift: boolean
 }
 
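+// How a model should be split across devices: 'GPU' = weights and KV cache
+// in VRAM, 'Hybrid' = partial offload with the KV cache in system memory,
+// 'CPU' = no GPU layers, 'Unsupported' = nothing fits the budgets.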
+type ModelPlan = {
+  gpuLayers: number
+  maxContextLength: number
+  noOffloadKVCache: boolean
+  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
+}
+
 interface DownloadItem {
   url: string
   save_path: string
@@ -116,6 +121,12 @@ interface DeviceList {
   free: number
 }
 
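+// All sizes are in bytes.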
+interface SystemMemory {
+  totalVRAM: number
+  totalRAM: number
+  totalMemory: number
+}
+
 /**
  * Override the default app.log function to use Jan's logging system.
  * @param args
@@ -159,6 +170,7 @@ export default class llamacpp_extension extends AIEngine {
   provider: string = 'llamacpp'
   autoUnload: boolean = true
   llamacpp_env: string = ''
+  memoryMode: string = 'high'
   readonly providerId: string = 'llamacpp'
 
   private config: LlamacppConfig
@@ -190,6 +202,7 @@ export default class llamacpp_extension extends AIEngine {
 
     this.autoUnload = this.config.auto_unload
     this.llamacpp_env = this.config.llamacpp_env
+    this.memoryMode = this.config.memory_util
 
     // This sets the base directory where model files for this provider are stored.
     this.providerPath = await joinPath([
@@ -836,6 +849,8 @@ export default class llamacpp_extension extends AIEngine {
       this.autoUnload = value as boolean
     } else if (key === 'llamacpp_env') {
       this.llamacpp_env = value as string
+    } else if (key === 'memory_util') {
+      this.memoryMode = value as string
     }
   }
 
@@ -1848,10 +1863,153 @@ export default class llamacpp_extension extends AIEngine {
       'tokenizer.chat_template'
     ]?.includes('tools')
   }
+
+  /**
+   * Get total system memory including both VRAM and RAM
+   */
+  private async getTotalSystemMemory(): Promise<SystemMemory> {
+    const devices = await this.getDevices()
+    let totalVRAM = 0
+
+    if (devices.length > 0) {
+      // Sum total VRAM across all GPUs
+      totalVRAM = devices
+        .map((d) => d.mem * 1024 * 1024)
+        .reduce((a, b) => a + b, 0)
+    }
+
+    // Get system RAM
+    const sys = await getSystemUsage()
+    const totalRAM = sys.total_memory * 1024 * 1024
+
+    const totalMemory = totalVRAM + totalRAM
+
+    logger.info(
+      `Total VRAM: ${totalVRAM} bytes, Total RAM: ${totalRAM} bytes, Total Memory: ${totalMemory} bytes`
+    )
+
+    return {
+      totalVRAM,
+      totalRAM,
+      totalMemory,
+    }
+  }
+
+  private async getKVCachePerToken(
+    meta: Record<string, string>
+  ): Promise<number> {
+    const arch = meta['general.architecture']
+    if (!arch) throw new Error('Invalid metadata: architecture not found')
+
+    const nLayer = Number(meta[`${arch}.block_count`])
+    const nHead = Number(meta[`${arch}.attention.head_count`])
+    if (!nLayer || !nHead) {
+      throw new Error('Invalid metadata: block_count or head_count not found')
+    }
+
+    const keyLen = Number(meta[`${arch}.attention.key_length`])
+    const valLen = Number(meta[`${arch}.attention.value_length`])
+    let headDim: number
+    if (keyLen && valLen) {
+      headDim = keyLen + valLen
+    } else {
+      const embeddingLen = Number(meta[`${arch}.embedding_length`])
+      if (!embeddingLen)
+        throw new Error('Invalid metadata: embedding_length not found')
+      headDim = (embeddingLen / nHead) * 2
+    }
+
+    const bytesPerElement = 2 // fp16
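+    // Note: this uses attention.head_count rather than head_count_kv, so
+    // GQA/MQA models are over-estimated; treat the result as an upper bound.
+    // Worked example: 32 layers, 32 heads, key/value length 128 each →
+    // 32 * (128 + 128) * 2 * 32 ≈ 0.5 MiB of KV cache per token.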
+    return nHead * headDim * bytesPerElement * nLayer
+  }
+
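+  // Rough per-layer weight size; assumes all transformer blocks
+  // contribute equally to the file size.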
+  private async getLayerSize(
+    path: string,
+    meta: Record<string, string>
+  ): Promise<{ layerSize: number; totalLayers: number }> {
+    const modelSize = await this.getModelSize(path)
+    const arch = meta['general.architecture']
+    const totalLayers = Number(meta[`${arch}.block_count`])
+    if (!totalLayers) throw new Error('Invalid metadata: block_count not found')
+    return { layerSize: modelSize / totalLayers, totalLayers }
+  }
+
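+  /**
+   * Plan how a model should be loaded: weights are packed into VRAM
+   * first (70% budget), the KV cache takes whatever VRAM remains, and
+   * the memory_util-scaled combined budget is the fallback.
+   */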
+  async planModelLoad(path: string, requestedCtx?: number): Promise<ModelPlan> {
+    const modelSize = await this.getModelSize(path)
+    const memoryInfo = await this.getTotalSystemMemory()
+    const gguf = await readGgufMetadata(path)
+
+    const { layerSize, totalLayers } = await this.getLayerSize(
+      path,
+      gguf.metadata
+    )
+    const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
+
+    // VRAM budget (70% heuristic)
+    const USABLE_VRAM_PERCENTAGE = 0.7
+    const usableVRAM = memoryInfo.totalVRAM * USABLE_VRAM_PERCENTAGE
+
+    // Combined-memory budget, scaled by this.memoryMode (low/medium/high)
+    const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }
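+    // Assumes memory_util is 'high', 'medium', or 'low'; any other value
+    // would make usableSystemMemory NaN.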
+    const usableSystemMemory =
+      memoryInfo.totalMemory * memoryPercentages[this.memoryMode]
+
+    // --- GPU layers ---
+    let gpuLayers = 0
+    if (modelSize <= usableVRAM) {
+      gpuLayers = totalLayers
+    } else {
+      gpuLayers = Math.floor(usableVRAM / layerSize)
+    }
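+    // Illustrative example: 10 GB of weights over 40 layers with 8.4 GB of
+    // usable VRAM → floor(8.4 / 0.25) = 33 layers offloaded, 7 on the CPU.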
+
+    // --- Context length & KV cache ---
+    let availableForKVCache = usableVRAM - modelSize
+    let maxContextLength = 0
+    let noOffloadKVCache = false
+    let mode: ModelPlan['mode'] = 'Unsupported'
+
+    if (availableForKVCache > 0) {
+      maxContextLength = Math.floor(availableForKVCache / kvCachePerToken)
+      noOffloadKVCache = false
+      mode = 'GPU'
+    }
+
+    // Fallback: size the KV cache against the combined-memory budget
+    if (maxContextLength <= 0) {
+      availableForKVCache = usableSystemMemory - modelSize
+      if (availableForKVCache > 0) {
+        maxContextLength = Math.floor(availableForKVCache / kvCachePerToken)
+        noOffloadKVCache = true // KV cache forced to CPU
+        mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+      }
+    }
+
+    // Still too big: derive a reduced context from the per-layer KV cost,
+    // with a 2x safety margin
+    if (maxContextLength <= 0) {
+      const safeTokensPerLayer = Math.floor(
+        usableSystemMemory / ((kvCachePerToken / totalLayers) * 2)
+      )
+      maxContextLength = Math.max(0, safeTokensPerLayer)
+      noOffloadKVCache = true
+      mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+    }
+
+    // Enforce the user-requested context as an upper bound
+    if (requestedCtx) {
+      maxContextLength = Math.min(maxContextLength, requestedCtx)
+    }
+
+    if (gpuLayers <= 0 && maxContextLength <= 0) {
+      mode = 'Unsupported'
+    }
+
+    logger.info(
+      `Plan for ${path}: gpuLayers=${gpuLayers}, maxContextLength=${maxContextLength}, noOffloadKVCache=${noOffloadKVCache}, mode=${mode}`
+    )
+
+    return { gpuLayers, maxContextLength, noOffloadKVCache, mode }
+  }
 
   /**
-   * estimate KVCache size of from a given metadata
-   *
+   * Estimate the KV cache size from model metadata
    */
   private async estimateKVCache(
     meta: Record<string, string>,
@@ -1891,6 +2049,7 @@ export default class llamacpp_extension extends AIEngine {
         `Using embedding_length estimation: ${embeddingLen}, calculated head_dim: ${headDim}`
       )
     }
+
     let ctxLen: number
     if (!ctx_size) {
       ctxLen = Number(meta[`${arch}.context_length`])
@@ -1925,60 +2084,62 @@ export default class llamacpp_extension extends AIEngine {
     }
   }
 
-  /*
-   * check the support status of a model by its path (local/remote)
+  /**
+   * Check the support status of a model by its path (local/remote)
    *
-   * * Returns:
-   * - "RED" → weights don't fit
-   * - "YELLOW" → weights fit, KV cache doesn't
-   * - "GREEN" → both weights + KV cache fit
+   * Returns:
+   * - "RED" → weights don't fit in total memory
+   * - "YELLOW" → weights fit in VRAM but need system RAM, or KV cache doesn't fit
+   * - "GREEN" → both weights + KV cache fit in VRAM
    */
   async isModelSupported(
     path: string,
     ctx_size?: number
   ): Promise<'RED' | 'YELLOW' | 'GREEN'> {
     try {
       const modelSize = await this.getModelSize(path)
+      const memoryInfo = await this.getTotalSystemMemory()
+
       logger.info(`modelSize: ${modelSize}`)
-      let gguf: GgufMetadata
-      gguf = await readGgufMetadata(path)
+
+      const gguf = await readGgufMetadata(path)
       let kvCacheSize: number
       if (ctx_size) {
         kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
       } else {
         kvCacheSize = await this.estimateKVCache(gguf.metadata)
       }
-      // total memory consumption = model weights + kvcache + a small buffer for outputs
-      // output buffer is small so not considering here
+
+      // Total memory consumption = model weights + kvcache
      const totalRequired = modelSize + kvCacheSize
       logger.info(
         `isModelSupported: Total memory requirement: ${totalRequired} for ${path}`
       )
-      let totalMemBytes: number
-      const devices = await this.getDevices()
-      if (devices.length > 0) {
-        // Sum total memory across all GPUs
-        totalMemBytes = devices
-          .map((d) => d.mem * 1024 * 1024)
-          .reduce((a, b) => a + b, 0)
-      } else {
-        // CPU fallback
-        const sys = await getSystemUsage()
-        totalMemBytes = sys.total_memory * 1024 * 1024
-      }
 
       // Use 80% of total memory as the usable limit
       const USABLE_MEMORY_PERCENTAGE = 0.8
-      const usableMemBytes = totalMemBytes * USABLE_MEMORY_PERCENTAGE
+      const usableTotalMemory =
+        memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
+      const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
 
-      // check model size wrt 80% of system memory
-      if (modelSize > usableMemBytes) {
+      // Check whether the weights fit in combined memory at all
+      if (modelSize > usableTotalMemory) {
         return 'RED'
-      } else if (modelSize + kvCacheSize > usableMemBytes) {
-        return 'YELLOW'
-      } else {
+      }
+
+      // Ideal case: everything fits in VRAM
+      if (totalRequired <= usableVRAM) {
         return 'GREEN'
       }
+
+      // Weights fit in VRAM (KV cache spills to RAM), or the total
+      // requirement fits only in combined memory
+      if (modelSize <= usableVRAM || totalRequired <= usableTotalMemory) {
+        return 'YELLOW'
+      }
+
+      // Nothing fits
+      return 'RED'
     } catch (e) {
       throw new Error(String(e))
     }