[4/N] Enable intel GPU for unsloth #2621
Closed: leizhenyuan wants to merge 5 commits into unslothai:main from leizhenyuan:intel_zhenyuan_enable_xpu_4
Changes from 3 commits
@@ -24,6 +24,8 @@
  from transformers import __version__ as transformers_version
  from unsloth_zoo.utils import Version, _get_dtype
  from unsloth_zoo.peft_utils import SKIP_QUANTIZATION_MODULES
+ from unsloth import DEVICE_TYPE

  transformers_version = Version(transformers_version)
  # Transformers moved rotary embeddings out of all attention layers
  IS_ATTENTION_REFACTOR = transformers_version > Version("4.47.1")
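Note: the new import assumes unsloth exposes a package-level `DEVICE_TYPE` string. As a hedged illustration only (the real detection logic lives in unsloth's `__init__` and may differ), such a flag could be derived roughly like this:

```python
import torch

def _resolve_device_type() -> str:
    # Illustrative sketch, not unsloth's actual implementation.
    if torch.cuda.is_available():
        return "cuda"
    # torch.xpu is the Intel GPU backend in recent PyTorch builds.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    raise NotImplementedError("No supported accelerator (CUDA or XPU) found.")

DEVICE_TYPE = _resolve_device_type()
```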
|
|
@@ -641,7 +643,7 @@ def LlamaModel_fast_forward(
      position_ids = torch.arange(
          past_key_values_length, seq_length + past_key_values_length,
          dtype = torch.int32,
-         device = "cuda:0",
+         device = f"{DEVICE_TYPE}:0",
      )
      position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
  elif position_ids is not None:
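The f-string only swaps the backend prefix in the device specifier; a tiny check (values assumed for illustration):

```python
import torch

DEVICE_TYPE = "xpu"                         # assumed; "cuda" on NVIDIA GPUs
device = torch.device(f"{DEVICE_TYPE}:0")   # torch accepts "xpu:0" / "cuda:0" strings
print(device.type, device.index)            # -> xpu 0
```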
|
|
@@ -814,13 +816,13 @@ def LlamaModel_fast_forward(
      is_causal = True,
      sliding_window = self.config.sliding_window,
  )\
- .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda",)\
+ .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = DEVICE_TYPE,)\
  .squeeze(0).squeeze(0)

  self.GA_mask = AttentionMaskConverter(
      is_causal = True,
  )\
- .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda",)\
+ .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = DEVICE_TYPE,)\
  .squeeze(0).squeeze(0)
  pass
  pass
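For reference, `AttentionMaskConverter` is transformers' `transformers.modeling_attn_mask_utils.AttentionMaskConverter`; the change only affects which device the materialised mask lands on. A small standalone sketch (the CPU fallback is an assumption so it runs anywhere):

```python
import torch
from transformers.modeling_attn_mask_utils import AttentionMaskConverter

DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"  # "xpu" on Intel GPUs
n = 8

# Build an n x n causal mask directly on the chosen backend.
mask = AttentionMaskConverter(is_causal=True)\
    .to_causal_4d(1, n, n, dtype=torch.float16, device=DEVICE_TYPE)\
    .squeeze(0).squeeze(0)
print(mask.shape)  # (n, n): 0 on and below the diagonal, large negative values above
```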
|
|
@@ -932,11 +934,11 @@ def LlamaModel_fast_forward_inference_custom(
  bsz, q_len, hd = X.shape
  assert(q_len == 1)
  # Get saved buffers to reduce memory movement
- residual = torch.empty((bsz, q_len, hd), dtype = torch.float32, device = "cuda:0")
- _XX = torch.empty((2, bsz, q_len, hd), dtype = torch.float32, device = "cuda:0")
+ residual = torch.empty((bsz, q_len, hd), dtype = torch.float32, device = f"{DEVICE_TYPE}:0")
+ _XX = torch.empty((2, bsz, q_len, hd), dtype = torch.float32, device = f"{DEVICE_TYPE}:0")
  XX, XX2 = _XX[0], _XX[1]
- variance = torch.empty((bsz, q_len, 1), dtype = torch.float32, device = "cuda:0")
- temp_mlp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = "cuda:0")
+ variance = torch.empty((bsz, q_len, 1), dtype = torch.float32, device = f"{DEVICE_TYPE}:0")
+ temp_mlp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = f"{DEVICE_TYPE}:0")
  temp_gate, temp_up = temp_mlp[0], temp_mlp[1]

  seq_len = past_key_values[0][0].shape[-2]
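These are pre-allocated scratch buffers reused across decoding steps; the change routes them to the active backend instead of hard-coding `cuda:0`. A minimal, self-contained sketch of the same pattern (shapes and the CPU fallback are assumptions for illustration):

```python
import torch

# Pick whatever accelerator is present; fall back to CPU so the sketch runs anywhere.
if torch.cuda.is_available():
    DEVICE_TYPE = "cuda"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    DEVICE_TYPE = "xpu"
else:
    DEVICE_TYPE = "cpu"

bsz, q_len, hd, mlp_size = 1, 1, 4096, 11008   # illustrative sizes

# Allocate once, then overwrite in place every decoding step instead of re-allocating.
residual = torch.empty((bsz, q_len, hd), dtype=torch.float32, device=f"{DEVICE_TYPE}:0")
_XX      = torch.empty((2, bsz, q_len, hd), dtype=torch.float32, device=f"{DEVICE_TYPE}:0")
XX, XX2  = _XX[0], _XX[1]                      # two views over a single allocation
temp_mlp = torch.empty((2, bsz, 1, mlp_size), dtype=torch.float16, device=f"{DEVICE_TYPE}:0")
temp_gate, temp_up = temp_mlp[0], temp_mlp[1]
```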
|
|
@@ -1239,7 +1241,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=
  partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
  dim = getattr(config, "head_dim", None)
  if dim is None: dim = int((config.hidden_size // config.num_attention_heads))
- device = "cuda"
+ device = DEVICE_TYPE
  max_position_embeddings = config.max_position_embeddings
  pass
|
|
|
|
@@ -1288,7 +1290,7 @@ def extend_rope_embedding(self, x, seq_len):
  if seq_len <= self.current_rope_size: return
  # Iteratively grow by increments of 8192
  self.current_rope_size = ((seq_len // 8192) + ((seq_len % 8192) != 0)) * 8192
- self._set_cos_sin_cache(self.current_rope_size, device = "cuda", dtype = x.dtype)
+ self._set_cos_sin_cache(self.current_rope_size, device = DEVICE_TYPE, dtype = x.dtype)
  pass
  pass
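The expression here rounds `seq_len` up to the next multiple of 8192 (an integer ceiling); a quick check:

```python
def round_up_to_8192(seq_len: int) -> int:
    # Same arithmetic as in the diff: integer ceiling to a multiple of 8192.
    return ((seq_len // 8192) + ((seq_len % 8192) != 0)) * 8192

assert round_up_to_8192(8192)  == 8192    # already a multiple: unchanged
assert round_up_to_8192(8193)  == 16384   # just over: bump to the next block
assert round_up_to_8192(20000) == 24576   # 3 * 8192
```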
|
|
|
|
@@ -1334,7 +1336,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=
  base = config.rope_theta
  partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
  dim = int((config.hidden_size // config.num_attention_heads))
- device = "cuda"
+ device = DEVICE_TYPE
  max_position_embeddings = config.max_position_embeddings
  pass
|
|
|
|
@@ -1414,7 +1416,7 @@ def extend_rope_embedding(self, x, seq_len):
  if seq_len <= self.current_rope_size: return
  # Iteratively grow by increments of 8192
  self.current_rope_size = ((seq_len // 8192) + ((seq_len % 8192) != 0)) * 8192
- self._set_cos_sin_cache(self.current_rope_size, device = "cuda", dtype = x.dtype)
+ self._set_cos_sin_cache(self.current_rope_size, device = DEVICE_TYPE, dtype = x.dtype)
  pass
  pass
|
|
|
|
@@ -1441,7 +1443,7 @@ def __init__(self,
  base = config.rope_theta
  partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
  dim = int((config.hidden_size // config.num_attention_heads))
- device = "cuda"
+ device = DEVICE_TYPE
  max_position_embeddings = config.max_position_embeddings
  pass
|
|
|
|
@@ -1529,7 +1531,7 @@ def extend_rope_embedding(self, x, seq_len):
  if seq_len <= self.current_rope_size: return
  # Iteratively grow by increments of 8192
  self.current_rope_size = ((seq_len // 8192) + ((seq_len % 8192) != 0)) * 8192
- self._set_cos_sin_cache(self.current_rope_size, device = "cuda", dtype = x.dtype)
+ self._set_cos_sin_cache(self.current_rope_size, device = DEVICE_TYPE, dtype = x.dtype)
  pass
  pass
|
|
|
|
@@ -1577,7 +1579,7 @@ def unsloth_fast_generate(
  kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id)

  # Mixed precision autocast
- with torch.inference_mode(), torch.autocast(device_type = "cuda", dtype = dtype):
+ with torch.inference_mode(), torch.autocast(device_type = DEVICE_TYPE, dtype = dtype):
      output = self._old_generate(*args, **kwargs)
  pass
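`torch.autocast` takes the backend as a plain `device_type` string, so the same context manager covers CUDA and XPU. A hedged, standalone sketch (the CPU fallback and the toy model are assumptions so it runs anywhere):

```python
import torch

if torch.cuda.is_available():
    DEVICE_TYPE, dtype = "cuda", torch.float16
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    DEVICE_TYPE, dtype = "xpu", torch.bfloat16
else:
    DEVICE_TYPE, dtype = "cpu", torch.bfloat16

model = torch.nn.Linear(16, 16).to(DEVICE_TYPE)
x = torch.randn(2, 16, device=DEVICE_TYPE)

# Mixed-precision inference without hard-coding the backend.
with torch.inference_mode(), torch.autocast(device_type=DEVICE_TYPE, dtype=dtype):
    y = model(x)
print(y.dtype)  # reduced-precision output on the chosen backend
```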
|
|
|
|
@@ -1673,19 +1675,28 @@ def from_pretrained(
  if token is None: token = get_token()
  if model_patcher is None: model_patcher = FastLlamaModel
  SUPPORTS_BFLOAT16 = is_bfloat16_supported()
- gpu_stats = torch.cuda.get_device_properties(0)
+ gpu_stats = torch.cuda.get_device_properties(0) if DEVICE_TYPE == "cuda" else torch.xpu.get_device_properties(0)
  max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

- from importlib.metadata import version as importlib_version
- try: vllm_version = f" vLLM: {importlib_version('vllm')}."
- except: vllm_version = ""
- statistics = \
- f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers: {transformers_version}.{vllm_version}\n"\
- f" {chr(92)}{chr(92)} /| {gpu_stats.name}. Num GPUs = {torch.cuda.device_count()}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
- f"O^O/ {chr(92)}_/ {chr(92)} Torch: {torch.__version__}. CUDA: {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit: {torch.version.cuda}. Triton: {triton_version}\n"\
- f"{chr(92)} / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\
- f' "-____-" Free license: http://github.com/unslothai/unsloth'
+ if DEVICE_TYPE == "cuda":
+     from importlib.metadata import version as importlib_version
+     try: vllm_version = f" vLLM: {importlib_version('vllm')}."
+     except: vllm_version = ""
+
+     statistics = \
+     f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers: {transformers_version}.{vllm_version}\n"\
+     f" {chr(92)}{chr(92)} /| {gpu_stats.name}. Num GPUs = {torch.cuda.device_count()}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
+     f"O^O/ {chr(92)}_/ {chr(92)} Torch: {torch.__version__}. CUDA: {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit: {torch.version.cuda}. Triton: {triton_version}\n"\
+     f"{chr(92)} / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\
+     f' "-____-" Free license: http://github.com/unslothai/unsloth'
+ elif DEVICE_TYPE == "xpu":
+     statistics = \
+     f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers: {transformers_version}\n"\
+     f" {chr(92)}{chr(92)} /| {gpu_stats.name}. Num GPUs = {torch.xpu.device_count()}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
+     f"O^O/ {chr(92)}_/ {chr(92)} Torch: {torch.__version__}. Intel Toolkit: {torch.version.xpu}. Triton: {triton_version}\n"\
+     f"{chr(92)} / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\
+     f' "-____-" Free license: http://github.com/unslothai/unsloth'
  print(statistics)

  # Warn about fast transfers

Contributor review comment (on the added statistics block): On printouts, it's best not to make 2 - instead define new variables like …
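One way to act on that review note is sketched below. It is purely illustrative: the variable names (num_gpus, toolkit_info, extra) are hypothetical, and it reuses gpu_stats, max_memory, vllm_version and the other names already in scope inside from_pretrained rather than duplicating the whole banner per backend:

```python
# Hypothetical refactor following the review comment: one banner string,
# with only the device-specific fragments computed per backend.
if DEVICE_TYPE == "cuda":
    num_gpus     = torch.cuda.device_count()
    toolkit_info = f"CUDA: {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit: {torch.version.cuda}."
    extra        = vllm_version
elif DEVICE_TYPE == "xpu":
    num_gpus     = torch.xpu.device_count()
    toolkit_info = f"Intel Toolkit: {torch.version.xpu}."
    extra        = ""

statistics = (
    f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. "
    f"Transformers: {transformers_version}.{extra}\n"
    f"   \\\\   /|    {gpu_stats.name}. Num GPUs = {num_gpus}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"
    f"O^O/ \\_/ \\    Torch: {torch.__version__}. {toolkit_info} Triton: {triton_version}\n"
    f"\\        /    Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"
    f' "-____-"     Free license: http://github.com/unslothai/unsloth'
)
print(statistics)
```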
|
|
@@ -2129,7 +2140,7 @@ def get_peft_model(
  pass

  model.get_input_embeddings().modules_to_save.default\
-     .to(device = "cuda", dtype = new_dtype, non_blocking = True)
+     .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True)
  model.get_input_embeddings().modules_to_save.default.requires_grad_(True)

  # [TODO] Move old embed_tokens to CPU - should be disk!
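The `.to(...)` call here is standard PyTorch `Module.to`; only the target device string changes. A small standalone illustration (the CPU fallback and the toy embedding module are assumptions):

```python
import torch

DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"  # "xpu" on Intel GPUs
new_dtype = torch.float32 if DEVICE_TYPE == "cpu" else torch.bfloat16

embed = torch.nn.Embedding(1000, 64)
# non_blocking=True lets host-to-device copies overlap with compute when the
# source memory is pinned; otherwise PyTorch falls back to a blocking copy.
embed.to(device=DEVICE_TYPE, dtype=new_dtype, non_blocking=True)
embed.weight.requires_grad_(True)
print(embed.weight.device, embed.weight.dtype)
```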
|
|
@@ -2149,7 +2160,7 @@ def get_peft_model(
  pass

  model.get_output_embeddings().modules_to_save.default\
-     .to(device = "cuda", dtype = new_dtype, non_blocking = True)
+     .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True)
  model.get_output_embeddings().modules_to_save.default.requires_grad_(True)

  # [TODO] Move old lm_head to CPU - should be disk!
|
|
@@ -2406,7 +2417,7 @@ def get_peft_model(
  pass

  model.get_input_embeddings().modules_to_save.default\
-     .to(device = "cuda", dtype = new_dtype, non_blocking = True)
+     .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True)
  model.get_input_embeddings().modules_to_save.default.requires_grad_(True)
  pass
|
|
@@ -2422,7 +2433,7 @@ def get_peft_model(
  pass

  model.get_output_embeddings().modules_to_save.default\
-     .to(device = "cuda", dtype = new_dtype, non_blocking = True)
+     .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True)
  model.get_output_embeddings().modules_to_save.default.requires_grad_(True)
  pass
Review comment: is just one blank line okay?