@@ -142,14 +142,27 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
             raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
-        name: str = gguf.TENSOR_NAMES[key]
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
             raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
+        name: str = gguf.TENSOR_NAMES[key]
         if "{bid}" in name:
             assert bid is not None
             name = name.format(bid=bid)
         return name + suffix
 
+    def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
+        if key not in gguf.MODEL_TENSORS[self.model_arch]:
+            return False
+        key_name: str = gguf.TENSOR_NAMES[key]
+        if "{bid}" in key_name:
+            if bid is None:
+                return False
+            key_name = key_name.format(bid=bid)
+        else:
+            if bid is not None:
+                return False
+        return name == (key_name + suffix)
+
     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
         if new_name is None:
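For reference, the matching logic added above can be exercised standalone. The sketch below is not part of the commit: it mirrors `match_model_tensor_name` with a one-entry stand-in for `gguf.TENSOR_NAMES`, and the `blk.{bid}.ffn_gate_inp` template is illustrative.

```python
# Standalone sketch (not from the commit). TENSOR_NAMES is a one-entry
# stand-in for gguf.TENSOR_NAMES; the template string is illustrative.
TENSOR_NAMES = {"FFN_GATE_INP": "blk.{bid}.ffn_gate_inp"}

def match(name: str, key: str, bid: int | None, suffix: str = ".weight") -> bool:
    key_name = TENSOR_NAMES[key]
    if "{bid}" in key_name:
        if bid is None:
            return False  # per-block template requires a block id
        key_name = key_name.format(bid=bid)
    elif bid is not None:
        return False      # non-block tensor must not carry a block id
    return name == (key_name + suffix)

assert match("blk.3.ffn_gate_inp.weight", "FFN_GATE_INP", 3)
assert not match("blk.3.ffn_gate_inp.weight", "FFN_GATE_INP", None)
```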
@@ -218,12 +231,12 @@ def write_tensors(self):
         # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
         def np_fp32_to_bf16(n: np.ndarray):
             # force nan to quiet
-            n = np.where((n & 0x7fffffff) > 0x7f800000, n | (64 << 16), n)
+            n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
             # flush subnormals to zero
             n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
             # round to nearest even
             n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-            return n
+            return n.astype(np.int16)
 
         # Doing this row-wise is much, much faster than element-wise, hence the signature
         v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
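The round-to-nearest-even step is easiest to see on values that fall exactly halfway between two neighbouring bf16 representations. The snippet below is my own self-contained check, not code from the commit; it applies the same bit arithmetic to raw float32 bit patterns.

```python
import numpy as np

# Self-contained sketch (not from the commit): same bit tricks as
# np_fp32_to_bf16 above, applied to the raw float32 bit patterns.
def fp32_bits_to_bf16_bits(n: np.ndarray) -> np.ndarray:
    n = n.astype(np.uint32)
    # force nan to quiet
    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
    # flush subnormals to zero
    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
    # round to nearest even: ties go to the value whose low bit is even
    return ((n + (0x7fff + ((n >> 16) & 1))) >> 16).astype(np.uint16)

x = np.array([1.00390625, 1.01171875], dtype=np.float32)  # both exact bf16 ties
print([hex(v) for v in fp32_bits_to_bf16_bits(x.view(np.uint32))])
# ['0x3f80', '0x3f82'] -- the first tie rounds down, the second up (to even)
```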
@@ -263,10 +276,25 @@ def np_fp32_to_bf16(n: np.ndarray):
             extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
 
             # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-            extra_f32 = extra_f32 or n_dims == 1 or new_name.endswith("_norm.weight")
+            # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
+            extra_f32 = any(cond for cond in (
+                extra_f32,
+                n_dims == 1,
+                new_name.endswith("_norm.weight"),
+            ))
+
+            # Some tensor types are always in float32
+            extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
+                gguf.MODEL_TENSOR.FFN_GATE_INP,
+                gguf.MODEL_TENSOR.POS_EMBD,
+                gguf.MODEL_TENSOR.TOKEN_TYPES,
+            ))
 
             # if f16 desired, convert any float32 2-dim weight tensors to float16
-            extra_f16 = extra_f16 or (name.endswith(".weight") and n_dims >= 2)
+            extra_f16 = any(cond for cond in (
+                extra_f16,
+                (name.endswith(".weight") and n_dims >= 2),
+            ))
 
             if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
                 if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
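Assuming the branch truncated above mirrors `MOSTLY_F16` for `MOSTLY_BF16`, the net dtype selection reduces to roughly the following sketch; the `pick_dtype` helper and the string ftype names are stand-ins for the `gguf.LlamaFileType` enum, not code from this patch.

```python
# Hypothetical paraphrase of the dtype decision above; "pick_dtype" and the
# string ftype names stand in for gguf.LlamaFileType enum members.
def pick_dtype(ftype: str, extra_f32: bool, extra_f16: bool) -> str:
    if ftype != "ALL_F32" and extra_f16 and not extra_f32:
        # assumes MOSTLY_BF16 mirrors the MOSTLY_F16 branch shown above
        return "float16" if ftype == "MOSTLY_F16" else "bfloat16"
    return "float32"

assert pick_dtype("MOSTLY_F16", extra_f32=False, extra_f16=True) == "float16"
assert pick_dtype("MOSTLY_F16", extra_f32=True, extra_f16=True) == "float32"  # f32 wins
```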
@@ -2050,12 +2078,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del new_name, bid, n_dims  # unused
-
-        # not used with get_rows, must be F32
-        return name == "embeddings.token_type_embeddings.weight"
-
 
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
@@ -2453,6 +2475,8 @@ def main() -> None:
     logger.info("Set model tokenizer")
     model_instance.set_vocab()
 
+    model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
     if args.vocab_only:
         logger.info(f"Exporting model vocab to '{fname_out}'")
         model_instance.write_vocab()
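As a quick sanity check, the new key can be read back with gguf-py's `GGUFReader`. This is a sketch, not part of the commit; the file path is illustrative, and it assumes the standard `general.quantization_version` field name.

```python
from gguf import GGUFReader

# Sketch: read back the field written by the change above.
# "model.gguf" is an illustrative path, not one from the commit.
reader = GGUFReader("model.gguf")
field = reader.fields["general.quantization_version"]
print(field.parts[field.data[0]])  # expected to contain gguf.GGML_QUANT_VERSION
```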