f"Exllama kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
180
-
)
176
+
#log.warn.once(
177
+
# f"Exllama kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
178
+
#)
181
179
182
-
x=x.half()
180
+
x=x.to(dtype=torch.float16)
183
181
184
182
# TODO: need to run checks to make sure there is no performance regression padding with F.pad
185
183
# if in_features is padded, we need to pad the input as well
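The only functional change in the hunk above is the spelling of the cast: `x.half()` and `x.to(dtype=torch.float16)` are equivalent, the latter simply being the more explicit form. A standalone sketch (not part of the PR) illustrating the equivalence:

```python
import torch

# Both spellings produce an identical float16 tensor, and both are
# no-ops when x is already float16.
x = torch.randn(4, 8, dtype=torch.bfloat16)
assert torch.equal(x.half(), x.to(dtype=torch.float16))
assert x.to(dtype=torch.float16).dtype == torch.float16
```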
gptqmodel/nn_modules/qlinear/exllama_eora.py (14 additions, 18 deletions)
@@ -155,12 +155,13 @@ def post_init(self):

     def forward(self, x):
         x_dtype = x.dtype
-        if x_dtype != torch.float16:
-            log.warn.once(
-                f"Exllama EoRA kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
-            )
-
-            x = x.to(dtype=torch.float16)
+        # if x_dtype != torch.float16:
+        #     # log.warn.once(
+        #     #     f"Exllama EoRA kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
+        #     # )
+        #
+        #     # TODO FIXME...Exllam EoRA kernel must run in fp16 or else output (bfloat16) is junk
+        #     x = x.to(dtype=torch.float16)

         # sync with vllm
         # log.info(f"x shape: {x.shape}")
@@ -181,23 +182,18 @@ def forward(self, x):
         # if x.size(-1) != self.in_features:
         #     x = F.pad(x, self.in_features_padding_shape)

-        if self.adapter:
-            # only 4 bits fused eora kernel has been validated
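Taken together, these two hunks disable the fp16 guard in the EoRA forward path. For reference, the pattern the commented-out code implemented is the usual one for Exllama-family kernels: cast non-fp16 activations to float16 on entry and cast the kernel output back to the caller's dtype. A hedged sketch of that pattern, where `quant_forward` is a hypothetical stand-in for the real kernel call, not this module's API:

```python
import torch

def fp16_guarded_forward(x: torch.Tensor, quant_forward):
    # Sketch of the guard this PR comments out: the CUDA kernel only accepts
    # float16 activations, so cast on the way in and restore on the way out.
    x_dtype = x.dtype
    if x_dtype != torch.float16:
        x = x.to(dtype=torch.float16)
    out = quant_forward(x)        # placeholder for the actual quantized matmul
    return out.to(dtype=x_dtype)  # hand the caller back its original dtype
```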
f"Exllama v2 kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
236
-
)
237
+
# log.warn.once(
238
+
# f"Exllama v2 kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
239
+
# )
237
240
238
-
x=x.half()
241
+
x=x.to(dtype=torch.float16)
239
242
240
243
# TODO: need to run checks to make sure there is no performance regression padding with F.pad
241
244
# if in_features is padded, we need to pad the input as well
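The trailing TODO appears in each file and refers to padding the activation when `in_features` was padded to a kernel-friendly size. Going by the commented-out hint (`x = F.pad(x, self.in_features_padding_shape)`), the padding would look roughly like the sketch below; `in_features` here is a hypothetical padded width, not a value from the diff:

```python
import torch
import torch.nn.functional as F

in_features = 4104  # assumed padded feature width, for illustration only
x = torch.randn(2, 4096, dtype=torch.float16)
if x.size(-1) != in_features:
    # F.pad's pair pads the final dim: (left, right) -> right-pad with zeros
    x = F.pad(x, (0, in_features - x.size(-1)))
assert x.size(-1) == in_features
```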