3 changes: 2 additions & 1 deletion common/chat.cpp
@@ -489,11 +489,12 @@ std::string common_chat_format_single(
return ss.str();
}

std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
common_chat_templates_inputs inputs;
inputs.use_jinja = use_jinja;
inputs.add_bos = tmpls->add_bos;
inputs.add_eos = tmpls->add_eos;
inputs.chat_template_kwargs = chat_template_kwargs;
auto add_simple_msg = [&](auto role, auto content) {
common_chat_msg msg;
msg.role = role;
3 changes: 2 additions & 1 deletion common/chat.h
@@ -188,7 +188,8 @@ std::string common_chat_format_single(
// Returns an example of formatted chat
std::string common_chat_format_example(
const struct common_chat_templates * tmpls,
bool use_jinja);
bool use_jinja,
const std::map<std::string, std::string> & chat_template_kwargs);

const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);
11 changes: 10 additions & 1 deletion common/common.cpp
@@ -899,7 +899,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "--mmproj") {
CHECK_ARG
params.mmproj = argv[i];
params.mmproj.path = argv[i];
return true;
}
if (arg == "--mmproj-url") {
CHECK_ARG
params.mmproj.url = argv[i];
return true;
}
if (arg == "--no-mmproj-offload") {
params.mmproj_use_gpu = false;
return true;
}
if (arg == "--image") {
37 changes: 35 additions & 2 deletions common/common.h
@@ -68,6 +68,29 @@ struct llama_control_vector_load_info;
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();

enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
LLAMA_EXAMPLE_PASSKEY,
LLAMA_EXAMPLE_IMATRIX,
LLAMA_EXAMPLE_BENCH,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,

LLAMA_EXAMPLE_COUNT,
};

//
// CLI argument parsing
//
@@ -86,6 +109,14 @@ enum common_reasoning_format {
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
};

struct model_paths {
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string docker_repo = ""; // Docker repo // NOLINT
};

struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

@@ -230,8 +261,10 @@ struct gpt_params {
std::string cache_type_k_draft = ""; // KV cache data type for K for the draft model
std::string cache_type_v_draft = ""; // KV cache data type for V for the draft model

// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
// multimodal models (see examples/mtmd)
model_paths mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)

// embedding
2 changes: 1 addition & 1 deletion examples/CMakeLists.txt
@@ -29,7 +29,6 @@ else()
add_subdirectory(imatrix)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(main)
@@ -39,6 +38,7 @@ else()
add_subdirectory(quantize-stats)
add_subdirectory(quantize)
add_subdirectory(retrieval)
add_subdirectory(mtmd)
if (GGML_RPC)
add_subdirectory(rpc)
endif()
2 changes: 1 addition & 1 deletion examples/main/main.cpp
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
if (params.conversation) {
if (params.enable_chat_template) {
//LOG_TEE("%s: chat template example: %s\n", __func__, common_chat_format_example(model, *chat_templates.template_default, params.use_jinja).c_str());
LOG_TEE("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
LOG_TEE("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja, {}).c_str());
} else {
LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
}
62 changes: 62 additions & 0 deletions examples/mtmd/CMakeLists.txt
@@ -0,0 +1,62 @@
# mtmd

find_package(Threads REQUIRED)

add_library(mtmd
mtmd.cpp
mtmd-audio.cpp
mtmd.h
clip.cpp
clip.h
clip-impl.h
mtmd-helper.cpp
mtmd-helper.h
)

target_link_libraries (mtmd PUBLIC ggml llama)
target_link_libraries (mtmd PRIVATE Threads::Threads)
target_include_directories(mtmd PUBLIC .)
target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../vendor)
target_compile_features (mtmd PRIVATE cxx_std_17)

if (BUILD_SHARED_LIBS)
set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
target_compile_definitions(mtmd PUBLIC LLAMA_SHARED)
endif()

set(MTMD_PUBLIC_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
)

set_target_properties(mtmd
PROPERTIES
PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}")

install(TARGETS mtmd LIBRARY PUBLIC_HEADER)

if (NOT MSVC)
# for stb_image.h and miniaudio.h
target_compile_options(mtmd PRIVATE -Wno-cast-qual)
endif()

if (TARGET BUILD_INFO)
add_dependencies(mtmd BUILD_INFO)
endif()

add_executable(llama-llava-cli deprecation-warning.cpp)
add_executable(llama-gemma3-cli deprecation-warning.cpp)
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
add_executable(llama-qwen2vl-cli deprecation-warning.cpp)

set(TARGET llama-mtmd-cli)
add_executable (${TARGET} mtmd-cli.cpp)
set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()
target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
63 changes: 63 additions & 0 deletions examples/mtmd/README.md
@@ -0,0 +1,63 @@
# Multimodal Support in llama.cpp

This directory provides multimodal capabilities for `llama.cpp`. Initially intended as a showcase for running LLaVA models, it has expanded significantly over time to support various other vision-capable models. As a result, LLaVA is no longer the only multimodal architecture supported.

> [!IMPORTANT]
>
> Multimodal support can be viewed as a sub-project within `llama.cpp`. It is under **very heavy development**, and **breaking changes are expected**.

The naming and structure related to multimodal support have evolved, which might cause some confusion. Here's a brief timeline to clarify:

- [#3436](https://github.com/ggml-org/llama.cpp/pull/3436): Initial support for LLaVA 1.5 was added, introducing `llava.cpp` and `clip.cpp`. The `llava-cli` binary was created for model interaction.
- [#4954](https://github.com/ggml-org/llama.cpp/pull/4954): Support for MobileVLM was added, becoming the second vision model supported. This built upon the existing `llava.cpp`, `clip.cpp`, and `llava-cli` infrastructure.
- **Expansion & Fragmentation:** Many new models were subsequently added (e.g., [#7599](https://github.com/ggml-org/llama.cpp/pull/7599), [#10361](https://github.com/ggml-org/llama.cpp/pull/10361), [#12344](https://github.com/ggml-org/llama.cpp/pull/12344), and others). However, `llava-cli` lacked support for the increasingly complex chat templates required by these models. This led to the creation of model-specific binaries like `qwen2vl-cli`, `minicpmv-cli`, and `gemma3-cli`. While functional, this proliferation of command-line tools became confusing for users.
- [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs.
- [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`.

## Pre-quantized models

See the list of pre-quantized models [here](../../docs/multimodal.md).
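
As a quick sketch of how these are meant to be used (the `-hf` download flag and the repo name below are illustrative assumptions; substitute an entry from that list):

```sh
# Sketch: fetch a pre-quantized model (and its mmproj) from Hugging Face
# and chat with it. The repo name is a placeholder.
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
```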

## How it works and what is `mmproj`?

Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model.

This approach keeps the multimodal components distinct from the core `libllama` library. Separating these allows for faster, independent development cycles. While many modern vision models are based on Vision Transformers (ViTs), their specific pre-processing and projection steps can vary significantly. Integrating this diverse complexity directly into `libllama` is currently challenging.

Consequently, running a multimodal model typically requires two GGUF files:
1. The standard language model file.
2. A corresponding **multimodal projector (`mmproj`)** file, which handles the image encoding and projection.
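
As a concrete sketch (filenames are placeholders; the flags shown correspond to the `--mmproj`, `--image`, and `--no-mmproj-offload` options wired up in this PR):

```sh
# Sketch: run a vision model together with its multimodal projector.
# model.gguf and mmproj-model.gguf stand for the two files described above;
# add --no-mmproj-offload to keep the projector on the CPU.
llama-mtmd-cli -m model.gguf --mmproj mmproj-model.gguf \
    --image input.png -p "Describe this image."
```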

## What is `libmtmd`?

As outlined in the history, `libmtmd` is the modern library designed to replace the original `llava.cpp` implementation for handling multimodal inputs.

Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advantages:
- **Unified Interface:** Aims to consolidate interaction for various multimodal models.
- **Improved UX/DX:** Features a more intuitive API, inspired by the `Processor` class in the Hugging Face `transformers` library.
- **Flexibility:** Designed to support multiple input types (text, audio, images) while respecting the wide variety of chat templates used by different models.

## How to obtain `mmproj`

Multimodal projector (`mmproj`) files are specific to each model architecture.

For the following models, you can use `convert_hf_to_gguf.py` with the `--mmproj` flag to get the `mmproj` file (see the example after the list):
- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) (see the guide [here](../../docs/multimodal/gemma3.md)); note: the 1B variant does not have vision support
- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with a `transformers`-compatible checkpoint
- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen))
- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)
- InternVL 2.5 and InternVL 3 from [OpenGVLab](https://huggingface.co/OpenGVLab) (note: conversion of the `InternVL3-*-hf` models is not supported, only the non-HF versions; the `InternLM2Model` **text** model is not supported)
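
As a sketch of the conversion step (assuming a local checkout of the Hugging Face model; the path is a placeholder):

```sh
# Sketch: passing --mmproj makes the converter emit the multimodal
# projector GGUF from a local HF checkpoint instead of the text model.
python convert_hf_to_gguf.py path/to/hf-model --mmproj
```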

For older models, please refer to the relevant guide below for instructions on how to obtain or create the `mmproj` file. Note that the conversion scripts for these are located under `tools/mtmd/legacy-models`:

- [LLaVA](../../docs/multimodal/llava.md)
- [MobileVLM](../../docs/multimodal/MobileVLM.md)
- [GLM-Edge](../../docs/multimodal/glmedge.md)
- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md)
- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md)
- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
- [IBM Granite Vision](../../docs/multimodal/granitevision.md)