Merged
20 changes: 10 additions & 10 deletions extensions/llamacpp-extension/src/index.ts
@@ -80,7 +80,7 @@ type ModelPlan = {
gpuLayers: number
maxContextLength: number
noOffloadKVCache: boolean
- noOffloadMmproj?: boolean
+ offloadMmproj?: boolean
mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
}

@@ -2049,8 +2049,8 @@ export default class llamacpp_extension extends AIEngine {

async planModelLoad(
path: string,
- requestedCtx?: number,
- mmprojPath?: string
+ mmprojPath?: string,
+ requestedCtx?: number
): Promise<ModelPlan> {
const modelSize = await this.getModelSize(path)
const memoryInfo = await this.getTotalSystemMemory()
@@ -2138,12 +2138,12 @@ export default class llamacpp_extension extends AIEngine {
)

// --- Priority 1: Allocate mmproj (if exists) ---
- let noOffloadMmproj = false
+ let offloadMmproj = false
let remainingVRAM = usableVRAM

if (mmprojSize > 0) {
if (mmprojSize <= remainingVRAM) {
- noOffloadMmproj = true
+ offloadMmproj = true
remainingVRAM -= mmprojSize
logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
} else {
@@ -2218,7 +2218,7 @@ export default class llamacpp_extension extends AIEngine {
const cpuLayers = totalLayers - gpuLayers
const modelCPUSize = cpuLayers * layerSize
const mmprojCPUSize =
- mmprojSize > 0 && !noOffloadMmproj ? mmprojSize : 0
+ mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
const systemRAMUsed = modelCPUSize + mmprojCPUSize
const availableSystemRAMForKVCache = Math.max(
0,
@@ -2277,7 +2277,7 @@ export default class llamacpp_extension extends AIEngine {
const estimatedGPUUsage =
gpuLayers * layerSize +
maxContextLength * kvCachePerToken +
- (noOffloadMmproj ? mmprojSize : 0)
+ (offloadMmproj ? mmprojSize : 0)

if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
logger.warn(
@@ -2293,7 +2293,7 @@ export default class llamacpp_extension extends AIEngine {
const newEstimate =
gpuLayers * layerSize +
maxContextLength * kvCachePerToken +
- (noOffloadMmproj ? mmprojSize : 0)
+ (offloadMmproj ? mmprojSize : 0)
if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
}

@@ -2329,7 +2329,7 @@ export default class llamacpp_extension extends AIEngine {

// Log final plan
const mmprojInfo = mmprojPath
- ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, noOffloadMmproj=${noOffloadMmproj}`
+ ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
: ''

logger.info(
@@ -2343,7 +2343,7 @@ export default class llamacpp_extension extends AIEngine {
maxContextLength,
noOffloadKVCache,
mode,
- noOffloadMmproj,
+ offloadMmproj,
}
}

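A minimal TypeScript sketch of the renamed flag in isolation, using simplified stand-in inputs (usableVRAM, mmprojSize) rather than the extension's real planning state; it only illustrates how offloadMmproj is now set when the multimodal projector fits in the VRAM budget, mirroring the hunks above:

// Sketch only: offloadMmproj replaces the old negative flag noOffloadMmproj
// and is true when the mmproj file fits in the remaining VRAM budget.
function allocateMmproj(usableVRAM: number, mmprojSize: number): {
  offloadMmproj: boolean
  remainingVRAM: number
} {
  let offloadMmproj = false
  let remainingVRAM = usableVRAM
  if (mmprojSize > 0 && mmprojSize <= remainingVRAM) {
    offloadMmproj = true // projector goes to the GPU
    remainingVRAM -= mmprojSize // reserve its VRAM before planning layers and KV cache
  }
  return { offloadMmproj, remainingVRAM }
}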
26 changes: 19 additions & 7 deletions web-app/src/containers/ModelSetting.tsx
@@ -24,47 +24,52 @@
}

export function ModelSetting({
model,
provider,
smallIcon,
}: ModelSettingProps) {
const { updateProvider } = useModelProvider()
const { t } = useTranslation()
const serviceHub = useServiceHub()

const [isPlanning, setIsPlanning] = useState(false)

// Create a debounced version of stopModel that waits 500ms after the last call
const debouncedStopModel = debounce((modelId: string) => {
serviceHub.models().stopModel(modelId)
}, 500)

const handlePlanModelLoad = async () => {
if (provider.provider !== 'llamacpp') {
console.warn('planModelLoad is only available for llamacpp provider')
return
}
setIsPlanning(true)
try {

- // Read the model config to get the actual model path
+ // Read the model config to get the actual model path and mmproj path
const modelConfig = await serviceHub.app().readYaml<{
model_path: string
+ mmproj_path?: string
}>(`llamacpp/models/${model.id}/model.yml`)

if (modelConfig && modelConfig.model_path) {
const result = await serviceHub
.models()
- .planModelLoad(modelConfig.model_path)
+ .planModelLoad(
+ modelConfig.model_path,
+ undefined,
+ modelConfig.mmproj_path
+ )

// Apply the recommended settings to the model sequentially to avoid race conditions
const settingsToUpdate: Array<{
key: string
value: number | boolean
}> = []

if (model.settings?.ngl && result.gpuLayers !== undefined) {
settingsToUpdate.push({ key: 'ngl', value: result.gpuLayers })
}

if (model.settings?.ctx_len && result.maxContextLength !== undefined) {
settingsToUpdate.push({
@@ -242,11 +247,18 @@
{provider.provider === 'llamacpp' && (
<div className="pb-4 border-b border-main-view-fg/10 my-4">
<div>
- <h3 className="font-medium mb-1">Optimize Settings</h3>
- <p className="text-main-view-fg/70 text-xs mb-3">
- Analyze your system and model, then apply optimal loading
- settings automatically
- </p>
+ <div>
+ <div className="flex items-center gap-2 mb-1">
+ <h3 className="font-medium">Optimize Settings</h3>
+ <div className="text-xs bg-main-view-fg/10 border border-main-view-fg/20 text-main-view-fg/70 rounded-full py-0.5 px-2">
+ <span>{t('mcp-servers:experimental')}</span>
+ </div>
+ </div>
+ <p className="text-main-view-fg/70 text-xs mb-3">
+ Analyze your system and model, then apply optimal loading
+ settings automatically
+ </p>
+ </div>
<Button
onClick={handlePlanModelLoad}
disabled={isPlanning}
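The hunk above stops before the loop that consumes settingsToUpdate, so this is only a rough sketch of the sequential application the in-code comment describes; updateModelSetting is a hypothetical helper standing in for whatever the component actually calls:

// Sketch: await each update before starting the next to avoid race conditions.
async function applySequentially(
  settingsToUpdate: Array<{ key: string; value: number | boolean }>,
  updateModelSetting: (key: string, value: number | boolean) => Promise<void>
): Promise<void> {
  for (const { key, value } of settingsToUpdate) {
    await updateModelSetting(key, value)
  }
}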
6 changes: 5 additions & 1 deletion web-app/src/services/models/types.ts
@@ -136,5 +136,9 @@ export interface ModelsService {
ctxSize?: number
): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'>
validateGgufFile(filePath: string): Promise<ModelValidationResult>
- planModelLoad(modelPath: string, requestedCtx?: number): Promise<ModelPlan>
+ planModelLoad(
+ modelPath: string,
+ requestedCtx?: number,
+ mmprojPath?: string
+ ): Promise<ModelPlan>
}
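A hedged usage sketch of the widened service signature, assuming a serviceHub obtained from useServiceHub() as in ModelSetting.tsx; the paths are placeholders, and only the argument order is the point, with requestedCtx left undefined so the planner picks a context size:

// Placeholder paths — mirrors the call site in ModelSetting.tsx above.
const plan = await serviceHub
  .models()
  .planModelLoad(
    'llamacpp/models/example/model.gguf',
    undefined,
    'llamacpp/models/example/mmproj.gguf'
  )
console.log(`ngl=${plan.gpuLayers}, ctx_len=${plan.maxContextLength}`)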