jetson-containers/packages/llm/llama_cpp/config.py at master · dusty-nv/jetson-containers · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Default "-D" build definitions, joined into one space-separated string.
# This is the default value of the `flags` parameter of llama_cpp() below.
_GGUF_FLAG_LIST = (
    "-DGGML_CUDA=ON",
    "-DGGML_NATIVE=OFF",
    "-DGGML_CUDA_F16=ON",
    "-DLLAMA_CURL=ON",
    "-DLLAMA_OPENSSL=ON",
    "-DGGML_CUDA_FA_ALL_QUANTS=ON",
    "-DGGML_CUDA_PEER_MAX_BATCH_SIZE=512",
)
GGUF_FLAGS = " ".join(_GGUF_FLAG_LIST)

# Alternate definitions using the LLAMA_* names; passed explicitly for the
# 0.2.x entries in the package list at the bottom of this file.
GGML_FLAGS = " ".join((
    "-DLLAMA_CUBLAS=on",
    "-DLLAMA_CUDA_F16=1",
))
from jetson_containers import CUDA_ARCHITECTURES

def llama_cpp(version, default=False, flags=GGUF_FLAGS):
    """
    Build the pair of package definitions for one llama_cpp version.

    A version whose first character is 'b' (such as 'b9016') selects the
    llama.cpp branch of that name and pins llama-cpp-python to 0.3.22 from
    'main'. Any other version is used as the llama-cpp-python version
    (branch 'v<version>') and leaves the llama.cpp version/branch build
    args set to None.

    Returns a tuple (package, builder). The builder is a copy of the package
    named '<name>-builder' with FORCE_BUILD='on' added to its build args.
    When `default` is true, the two are additionally aliased as 'llama_cpp'
    and 'llama_cpp:builder'.
    """
    from_cpp_branch = version[0] == 'b'

    if from_cpp_branch:
        cpp_version, cpp_branch = version[1:], version
        py_version, py_branch = '0.3.22', 'main'
        test_model = "bartowski/Qwen_Qwen3-1.7B-GGUF/Qwen_Qwen3-1.7B-Q4_K_M.gguf"
    else:
        cpp_version = cpp_branch = None
        py_version, py_branch = version, f'v{version}'
        test_model = "TheBloke/Llama-2-7B-GGUF/llama-2-7b.Q4_K_S.gguf"

    # `package` here is the base definition available as a global when this
    # config runs; copy it so each version gets its own top-level dict.
    container = package.copy()
    container.update(
        name=f'llama_cpp:{version}',
        build_args={
            'LLAMA_CPP_VERSION': cpp_version,
            'LLAMA_CPP_VERSION_PY': py_version,
            'LLAMA_CPP_BRANCH': cpp_branch,
            'LLAMA_CPP_BRANCH_PY': py_branch,
            'LLAMA_CPP_FLAGS': flags,
            'CUDA_ARCHITECTURES': ';'.join(map(str, CUDA_ARCHITECTURES)),
        },
        # Concatenate into a new list so the base definition's test list
        # is not modified.
        test=container['test'] + [
            f"test_model.py --model $(huggingface-downloader {test_model})"
        ],
    )

    # The builder variant shares everything except its name and FORCE_BUILD.
    force_builder = container.copy()
    force_builder.update(
        name=container['name'] + '-builder',
        build_args=dict(container['build_args'], FORCE_BUILD='on'),
    )

    if default:
        container['alias'] = 'llama_cpp'
        force_builder['alias'] = 'llama_cpp:builder'

    return container, force_builder

# The 0.2.x entries are llama-cpp-python versions and use GGML_FLAGS.
# llama_cpp_python appears abandoned (4/25), so the default entry is a
# llama.cpp branch ('b' prefix) built with the default flags.
package = [
    *(llama_cpp(py_release, flags=GGML_FLAGS) for py_release in ('0.2.57', '0.2.70')),
    llama_cpp('b9016', default=True),
]