Merged
2 changes: 0 additions & 2 deletions examples/quantize/quantize.cpp
@@ -29,8 +29,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
{ "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.62 bpw quantization (Bitnet)", },
{ "IQ2_BN", LLAMA_FTYPE_MOSTLY_IQ2_BN, " 2.00 bpw quantization (Bitnet)", },
{ "IQ1_TN", LLAMA_FTYPE_MOSTLY_IQ1_TN, " 1.63 bpw quantization (TriLM)", },
{ "IQ2_TN", LLAMA_FTYPE_MOSTLY_IQ2_TN, " 2.00 bpw quantization (TriLM)", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
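
Review note: with these two rows gone, the tool no longer accepts "IQ1_TN" or "IQ2_TN" as type names; TriLM models are covered by the shared IQ1_BN/IQ2_BN path (see the ggml-common.h comment changes below). A minimal sketch of the name-to-ftype lookup such a table feeds, with assumed struct and helper shapes rather than the exact code:

#include <string>
#include <vector>

// Assumed row shape for QUANT_OPTIONS (field names are guesses).
struct quant_option { std::string name; int ftype; std::string desc; };

// Linear scan over the table: a removed row simply stops matching.
static bool try_parse_ftype(const std::vector<quant_option> & opts,
                            const std::string & arg, int & ftype) {
    for (const auto & opt : opts) {
        if (opt.name == arg) { ftype = opt.ftype; return true; }
    }
    return false; // "IQ1_TN" / "IQ2_TN" now fall through to the usage message
}
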
6 changes: 0 additions & 6 deletions ggml/include/ggml-backend.h
@@ -232,12 +232,6 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
 
-    // Utility to query whether cached GGML graph is in use
-    GGML_API bool ggml_use_cached_graph(ggml_backend_sched_t sched);
-
-    // Set whether or not to use GGML graph caching
-    GGML_API void ggml_set_cached_graph(ggml_backend_sched_t sched, bool set_value);
-
 
 #ifdef __cplusplus
 }
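
Review note: this pair let a caller opt a scheduler into replaying its previous graph splits. For reference, the old opt-in pattern looked roughly like the sketch below (hedged: no in-tree caller appears in this diff, and sched/graph stand for a valid ggml_backend_sched_t and ggml_cgraph pointer):

// Sketch of the removed opt-in pattern, not code from this PR.
ggml_backend_sched_graph_compute(sched, graph); // first run builds the splits
ggml_set_cached_graph(sched, true);             // later runs replay them ...
ggml_backend_sched_graph_compute(sched, graph); // ... skipping reset/alloc
ggml_set_cached_graph(sched, false);            // graph changed: rebuild next time
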
11 changes: 2 additions & 9 deletions ggml/include/ggml.h
@@ -401,8 +401,8 @@ extern "C" {
     GGML_TYPE_IQ4_K   = 139,
     GGML_TYPE_IQ5_K   = 140,
     GGML_TYPE_IQ6_K   = 141,
-    GGML_TYPE_IQ2_TN  = 142,
-    GGML_TYPE_IQ1_TN  = 143,
+    // depricated: GGML_TYPE_IQ2_TN = 142,
+    // depricated: GGML_TYPE_IQ1_TN = 143,
     GGML_TYPE_IQ4_KS  = 144,
     GGML_TYPE_IQ2_KS  = 145,
     GGML_TYPE_IQ4_KSS = 146,
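
Review note: the ids are commented out rather than deleted because GGML type ids are serialized into GGUF files; reusing 142 or 143 for a future type would make old IQ2_TN/IQ1_TN files misread silently. The same retirement pattern in miniature (illustrative enum, not ggml's):

enum my_file_type {
    MY_TYPE_A = 0,
    // retired: MY_TYPE_B = 1,  // id 1 is burned: old files must error out,
    MY_TYPE_C = 2,              // not be misparsed as a new type
};
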
@@ -597,13 +597,6 @@ extern "C" {
     GGML_TENSOR_FLAG_PARAM = 4,
 };
 
-// Flag (used on GGML_OP_CPY nodes) on whether node is associated with K or V cache
-enum ggml_kv_cache_flag {
-    GGML_KV_CACHE_FLAG_NONE = 0,
-    GGML_KV_CACHE_FLAG_K = 1,
-    GGML_KV_CACHE_FLAG_V = 2
-};
-
 // ggml object
 struct ggml_object {
     size_t offs;
45 changes: 8 additions & 37 deletions ggml/src/ggml-backend.c
@@ -1040,13 +1040,6 @@ struct ggml_backend_sched_split {
     struct ggml_cgraph graph;
 };
 
-// Object to facilitate GML graph caching
-struct ggml_cached_graph {
-    bool is_active;
-    ggml_backend_t input_backend;
-    struct ggml_tensor * input_cpy[GGML_SCHED_MAX_SPLIT_INPUTS];
-};
-
 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
     bool is_alloc;
@@ -1092,8 +1085,6 @@ struct ggml_backend_sched {
     size_t context_buffer_size;
 
     bool debug;
-
-    struct ggml_cached_graph cached_graph;
 };
 
 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1771,14 +1762,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
 
-            if (!sched->cached_graph.is_active) {
-                sched->cached_graph.input_backend = input_backend;
-                sched->cached_graph.input_cpy[j] = input_cpy;
-            } else {
-                input_backend = sched->cached_graph.input_backend;
-                input_cpy = sched->cached_graph.input_cpy[j];
-            }
-
             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
@@ -1910,8 +1893,6 @@ ggml_backend_sched_t ggml_backend_sched_new(

     ggml_backend_sched_reset(sched);
 
-    sched->cached_graph.is_active = false;
-
     return sched;
 }

@@ -1988,16 +1969,16 @@ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, st
 }
 
 enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    if(!sched->cached_graph.is_active) {
-        if (!sched->is_reset && !sched->is_alloc) {
-            ggml_backend_sched_reset(sched);
-        }
-        if (!sched->is_alloc) {
-            if (!ggml_backend_sched_alloc_graph(sched, graph)) {
-                return GGML_STATUS_ALLOC_FAILED;
-            }
-        }
-    }
+    if (!sched->is_reset && !sched->is_alloc) {
+        ggml_backend_sched_reset(sched);
+    }
+
+    if (!sched->is_alloc) {
+        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+    }
 
     return ggml_backend_sched_compute_splits(sched);
 }
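
Review note: with the cache gating gone, every async call re-walks reset/alloc before computing splits. A caller-side sketch of the flow this restores, using only the public scheduler API (build_graph is a hypothetical placeholder, not a ggml function):

struct ggml_cgraph * gf = build_graph(ctx);         // hypothetical helper
if (!ggml_backend_sched_alloc_graph(sched, gf)) {   // split + allocate buffers
    // allocation failed: bail out
}
ggml_backend_sched_graph_compute_async(sched, gf);  // always uses fresh splits now
ggml_backend_sched_synchronize(sched);              // wait for all backends
ggml_backend_sched_reset(sched);                    // ready for the next graph
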

@@ -2262,13 +2243,3 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t

     return true;
 }
-
-bool ggml_use_cached_graph(ggml_backend_sched_t sched) {
-    return sched->cached_graph.is_active;
-}
-
-void ggml_set_cached_graph(ggml_backend_sched_t sched, bool set_value) {
-    sched->cached_graph.is_active = set_value;
-}
17 changes: 2 additions & 15 deletions ggml/src/ggml-common.h
@@ -389,9 +389,7 @@ typedef struct {
 static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
 
 //
-// Bitnet - implemented as 1.625 bpw
-// The block scale is a waste, but it allows us to plug it in without any additional
-// changes to ggml.
+// Bitnet and TriLM - implemented as 1.625 bpw
 //
 #define QK_IQ1BN 64
 typedef struct {
@@ -400,24 +398,13 @@ typedef struct {
 } block_iq1_bn;
 static_assert(sizeof(block_iq1_bn) == 13, "wrong iq1_bn block size/padding");
 //
-// Bitnet - implemented as 2.0 bpw
+// Bitnet and TriLM - implemented as 2.0 bpw
 //
 #define QK_IQ2BN 64
 typedef struct {
     uint8_t qs[QK_IQ2BN/4];
 } block_iq2_bn;
 static_assert(sizeof(block_iq2_bn) == QK_IQ2BN/4, "wrong iq2_bn block size/padding");
-//
-// TriLM - implemented as 2.0625 bpw
-//
-typedef struct {
-    uint8_t qs[52];
-} block_iq1_tn;
-static_assert(sizeof(block_iq1_tn) == 52, "wrong iq1_tn block size/padding");
-typedef struct {
-    uint8_t qs[QK_K/4];
-} block_iq2_tn;
-static_assert(sizeof(block_iq2_tn) == QK_K/4, "wrong iqt_bn block size/padding");

// Used by IQ1_M quants
typedef union {
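
Review note: the bits-per-weight figures in the surviving comments follow directly from the block sizes asserted above. A quick check, with the byte counts taken from the static_asserts:

#include <cstdio>

int main() {
    // bpw = 8 * bytes_per_block / weights_per_block
    std::printf("iq1_bn: %.3f bpw\n", 8.0 * 13 / 64); // 13-byte block, 64 weights -> 1.625
    std::printf("iq2_bn: %.3f bpw\n", 8.0 * 16 / 64); // 16-byte block, 64 weights -> 2.000
    return 0;
}
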
2 changes: 0 additions & 2 deletions ggml/src/ggml-cuda.cu
@@ -2841,9 +2841,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_TYPE_IQ5_K:
         case GGML_TYPE_IQ6_K:
         case GGML_TYPE_IQ1_BN:
-        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ2_BN:
-        case GGML_TYPE_IQ2_TN:
             return true;
         default:
             return false;
2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/binbcast.cu
@@ -288,7 +288,7 @@ static void scale_f32_cuda_l(const float * x, float * dst, const void * data, co
     scale_f32_l<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, data, k);
 }
 
-void ggml_cuda_op_scale_tensor(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+static void ggml_cuda_op_scale_tensor(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
     float * dst_d = (float *)dst->data;
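
Review note: the only change here is the added static, which gives the helper internal linkage now that it has no external callers: the symbol is no longer exported by this translation unit. In miniature (illustrative file, not from this PR):

static void helper() {}          // internal linkage: invisible to other translation units
void public_api() { helper(); }  // external linkage: still in the symbol table
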
14 changes: 0 additions & 14 deletions ggml/src/ggml-cuda/common.cuh
@@ -473,27 +473,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ1_BN> {
     static constexpr int qi = QI1_BN;
 };
 
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ1_TN> {
-    static constexpr int qk = QK_IQ1BN;
-    static constexpr int qr = QR1_BN;
-    static constexpr int qi = QI1_BN;
-};
-
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_IQ2_BN> {
     static constexpr int qk = QK_IQ1BN;
     static constexpr int qr = QR1_BN;
     static constexpr int qi = QI1_BN;
 };
 
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ2_TN> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR2_K;
-    static constexpr int qi = QI2_K;
-};
-
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
     static constexpr int qk = QK4_NL;
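
Review note: deleting a ggml_cuda_type_traits specialization is what actually withdraws CUDA support for a type, since any kernel template instantiated for it no longer finds qk/qr/qi and fails to compile. A self-contained C++ analog of the pattern (illustrative names and values, not the real ggml definitions):

enum fake_type { TYPE_IQ1_BN, TYPE_IQ2_BN };

// Primary template left undefined: unsupported types are compile-time errors.
template <fake_type T> struct type_traits;

template <> struct type_traits<TYPE_IQ1_BN> { static constexpr int qk = 64; };
template <> struct type_traits<TYPE_IQ2_BN> { static constexpr int qk = 64; };

// A "kernel" parameterized on the type: instantiable only for specialized types.
template <fake_type T>
constexpr int blocks_per_row(int row_len) {
    return row_len / type_traits<T>::qk;
}

static_assert(blocks_per_row<TYPE_IQ2_BN>(4096) == 64, "64 blocks per 4096-weight row");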