
Commit 178beeb

Use accelerator for supporting other devices (bigscience-workshop#104)
* xpu support (bigscience-workshop#55)
* port accel abs interface
* WA for run3.6b
* move on
* fix current_device
* fix typo
* enable to run 345M GPT
* delete apex_patch
* add TODO xpu compatible tag for xpu WA
* use deepspeed launcher
* enable run3.6b bf16
* add zero2 config json
* readd enable_each_rank_log
* fix typos
* add ccl arg
* fix
* use short word
* use no-masked-softmax-fusion
* readd
* set train iters to 10
* remove duplicate line
* change assert msg
* update format
* add whitespace
* update path
* update note
* update
* fix typos
* delete notes
* update format
* update xpu check to cuda check
* update
* clean up file
* fix typos
* add python based gradient clipping
* change condition for python based path
1 parent: c240204 · commit: 178beeb

44 files changed: 282 additions & 246 deletions
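
The diffs below all follow one pattern: device-specific torch.cuda.* calls are routed through DeepSpeed's accelerator abstraction (from deepspeed.accelerator import get_accelerator) so the same code path can target CUDA, XPU, or other backends. A minimal sketch of that pattern, for orientation only — the device_report helper is made up for illustration, while the get_accelerator() calls are the ones actually used in this commit:

import torch
from deepspeed.accelerator import get_accelerator

def device_report():
    """Illustrative helper: query the active accelerator instead of torch.cuda."""
    acc = get_accelerator()
    if not acc.is_available():          # was: torch.cuda.is_available()
        return 'cpu-only'
    n = acc.device_count()              # was: torch.cuda.device_count()
    name = acc.current_device_name()    # e.g. 'cuda:0' or 'xpu:0'
    ones = acc.LongTensor([1])          # was: torch.cuda.LongTensor([1])
    return f'{n} device(s), current={name}, tensor on {ones.device}'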


megatron/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@
 # limitations under the License.
 import os
 import torch
-
+from deepspeed.accelerator import get_accelerator
 from .package_info import (
     __description__,
     __contact_names__,
@@ -64,7 +64,7 @@ def is_rank_0():
     """Check whether it is rank 0. For AML, check if it is rank 0 of a node"""
     if torch.distributed.is_initialized():
         if torch.distributed.get_rank() == 0 or (
-            is_aml() and torch.distributed.get_rank() % torch.cuda.device_count() == 0
+            is_aml() and torch.distributed.get_rank() % get_accelerator().device_count() == 0
         ):
             return True
         else:
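
In the AML branch above, rank % device_count == 0 picks out the first process on each node, on the assumption that one process is launched per accelerator. A hedged standalone sketch of that check (the function name is hypothetical; it assumes torch.distributed is already initialized):

from deepspeed.accelerator import get_accelerator
import torch.distributed as dist

def is_node_rank_0():
    """True for the first rank of every node, assuming one rank per accelerator."""
    per_node = get_accelerator().device_count()   # ranks per node (assumption)
    return dist.get_rank() % per_node == 0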

megatron/arguments.py

Lines changed: 1 addition & 1 deletion

@@ -653,7 +653,7 @@ def _add_distributed_args(parser):
     group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
                        help='Number of layers per virtual pipeline stage')
     group.add_argument('--distributed-backend', default='nccl',
-                       choices=['nccl', 'gloo'],
+                       choices=['nccl', 'gloo', 'ccl'],
                        help='Which backend to use for distributed training.')
     group.add_argument('--DDP-impl', default='local',
                        choices=['local', 'torch'],
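
The new 'ccl' choice targets Intel's oneCCL collective backend. A hedged sketch of how the selected backend would typically reach torch.distributed — the init_distributed helper and the oneccl_bindings_for_pytorch import are assumptions for illustration, not part of this commit:

import torch

def init_distributed(backend, rank, world_size, init_method):
    """Hypothetical helper: pass the --distributed-backend value straight through."""
    if backend == 'ccl':
        # Assumption: the CCL backend is registered by importing Intel's bindings.
        import oneccl_bindings_for_pytorch  # noqa: F401
    torch.distributed.init_process_group(backend=backend,
                                         world_size=world_size,
                                         rank=rank,
                                         init_method=init_method)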

megatron/checkpointing.py

Lines changed: 3 additions & 3 deletions

@@ -19,7 +19,7 @@
 import random
 import sys
 import numpy as np
-
+from deepspeed.accelerator import get_accelerator
 import torch
 
 from megatron import (get_args,
@@ -150,7 +150,7 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
         state_dict['random_rng_state'] = random.getstate()
         state_dict['np_rng_state'] = np.random.get_state()
         state_dict['torch_rng_state'] = torch.get_rng_state()
-        state_dict['cuda_rng_state'] = torch.cuda.get_rng_state()
+        state_dict['cuda_rng_state'] = get_accelerator().get_rng_state()
         state_dict['rng_tracker_states'] \
             = mpu.get_cuda_rng_tracker().get_states()
 
@@ -417,7 +417,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True
         random.setstate(state_dict['random_rng_state'])
         np.random.set_state(state_dict['np_rng_state'])
         torch.set_rng_state(state_dict['torch_rng_state'])
-        torch.cuda.set_rng_state(state_dict['cuda_rng_state'])
+        get_accelerator().set_rng_state(state_dict['cuda_rng_state'])
         # Check for empty states array
         if not state_dict['rng_tracker_states']:
             raise KeyError
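
Saving and restoring the device RNG state now goes through the accelerator object, so the same checkpoint code works whether the state came from CUDA or XPU. A minimal standalone sketch of the round trip, assuming at least one accelerator is present:

from deepspeed.accelerator import get_accelerator

# Capture the device RNG state (a CPU ByteTensor), as save_checkpoint does above.
rng_state = get_accelerator().get_rng_state()

# ... run some work that consumes random numbers, then restore the captured
# state so subsequent draws are reproducible, as load_checkpoint does above.
get_accelerator().set_rng_state(rng_state)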

megatron/data/biencoder_dataset_utils.py

Lines changed: 2 additions & 2 deletions

@@ -8,7 +8,7 @@
 from megatron.data.dataset_utils import create_masked_lm_predictions, \
     pad_and_convert_to_numpy
 from megatron.data.data_samplers import MegatronPretrainingSampler
-
+from deepspeed.accelerator import get_accelerator
 def make_attention_mask(source_block, target_block):
     """
     Returns a 2-dimensional (2-D) attention mask
@@ -187,7 +187,7 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo
     # This should be a barrier but nccl barrier assumes
     # device_index=rank which is not the case for model
     # parallel case
-    counts = torch.cuda.LongTensor([1])
+    counts = get_accelerator().LongTensor([1])
     torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
     assert counts[0].item() == torch.distributed.get_world_size(
         group=mpu.get_data_parallel_group())

megatron/data/dataset_utils.py

Lines changed: 3 additions & 3 deletions

@@ -33,7 +33,7 @@
 )
 from megatron.data.blendable_dataset import BlendableDataset
 from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
-
+from deepspeed.accelerator import get_accelerator
 DSET_TYPE_BERT = 'standard_bert'
 DSET_TYPE_ICT = 'ict'
 DSET_TYPE_T5 = 't5'
@@ -711,8 +711,8 @@ def get_samples_mapping(indexed_dataset,
     # This should be a barrier but nccl barrier assumes
     # device_index=rank which is not the case for model
     # parallel case
-    if torch.cuda.device_count() > 0: # Skip when CPU-only
-        counts = torch.cuda.LongTensor([1])
+    if get_accelerator().device_count() > 0: # Skip when CPU-only
+        counts = get_accelerator().LongTensor([1])
         torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
         torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group())
         assert counts[0].item() == (
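
The counts/all_reduce block above (and its repeats in gpt_dataset.py and realm_dataset_utils.py below) is a workaround for torch.distributed.barrier assuming device_index == rank: every rank all-reduces a ones tensor on its accelerator and asserts the sum equals the group size. A hedged standalone sketch of the pattern, assuming torch.distributed is already initialized:

import torch
from deepspeed.accelerator import get_accelerator

def pseudo_barrier(group=None):
    """Illustrative stand-in for the counts/all_reduce check used above."""
    if get_accelerator().device_count() == 0:   # skip when CPU-only
        return
    counts = get_accelerator().LongTensor([1])  # one tensor per rank, on the accelerator
    torch.distributed.all_reduce(counts, group=group)
    # Every rank contributed 1, so the reduced value must equal the group size.
    assert counts[0].item() == torch.distributed.get_world_size(group=group)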

megatron/data/gpt_dataset.py

Lines changed: 2 additions & 2 deletions

@@ -20,7 +20,7 @@
 
 import numpy as np
 import torch
-
+from deepspeed.accelerator import get_accelerator
 from megatron import mpu, is_rank_0, print_rank_0, get_args
 from megatron.data.blendable_dataset import BlendableDataset
 from megatron.data.dataset_utils import get_datasets_weights_and_num_samples
@@ -313,7 +313,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
     # This should be a barrier but nccl barrier assumes
     # device_index=rank which is not the case for model
     # parallel case
-    counts = torch.cuda.LongTensor([1])
+    counts = get_accelerator().LongTensor([1])
     torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
     torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group())
     assert counts[0].item() == (

megatron/data/realm_dataset_utils.py

Lines changed: 2 additions & 2 deletions

@@ -7,7 +7,7 @@
 from megatron import mpu, print_rank_0
 from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy
 from megatron import get_args, get_tokenizer, print_rank_0, mpu
-
+from deepspeed.accelerator import get_accelerator
 
 def get_one_epoch_dataloader(dataset, micro_batch_size=None):
     """Specifically one epoch to be used in an indexing job."""
@@ -177,7 +177,7 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo
     # This should be a barrier but nccl barrier assumes
     # device_index=rank which is not the case for model
     # parallel case
-    counts = torch.cuda.LongTensor([1])
+    counts = get_accelerator().LongTensor([1])
     torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
     assert counts[0].item() == torch.distributed.get_world_size(
         group=mpu.get_data_parallel_group())

megatron/global_vars.py

Lines changed: 3 additions & 3 deletions

@@ -24,7 +24,7 @@
 from megatron.tokenizer import build_tokenizer
 from .arguments import parse_args
 from .microbatches import build_num_microbatches_calculator
-
+from deepspeed.accelerator import get_accelerator
 _GLOBAL_ARGS = None
 _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
 _GLOBAL_TOKENIZER = None
@@ -192,14 +192,14 @@ def __init__(self, name):
     def start(self):
         """Start the timer."""
         assert not self.started_, 'timer has already been started'
-        torch.cuda.synchronize()
+        get_accelerator().synchronize()
         self.start_time = time.time()
         self.started_ = True
 
     def stop(self):
         """Stop the timer."""
         assert self.started_, 'timer is not started'
-        torch.cuda.synchronize()
+        get_accelerator().synchronize()
         self.elapsed_ += (time.time() - self.start_time)
         self.started_ = False
 
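
Synchronizing before reading the wall clock is what makes these timings meaningful on an asynchronous device; without it the timer would only measure kernel-launch time. A hedged usage sketch of the same pattern outside the Timer class (the timed helper is hypothetical):

import time
from deepspeed.accelerator import get_accelerator

def timed(fn, *args, **kwargs):
    """Illustrative helper: time a call with device synchronization around it."""
    get_accelerator().synchronize()   # drain queued kernels before starting the clock
    start = time.time()
    result = fn(*args, **kwargs)
    get_accelerator().synchronize()   # wait for the work launched by fn to finish
    return result, time.time() - start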

megatron/initialize.py

Lines changed: 12 additions & 10 deletions

@@ -21,7 +21,6 @@
 
 import numpy as np
 import torch
-
 from megatron import fused_kernels
 from megatron import get_adlr_autoresume
 from megatron import get_args
@@ -30,7 +29,7 @@
 from megatron.global_vars import set_global_variables
 from megatron.mpu import (set_tensor_model_parallel_rank,
                           set_tensor_model_parallel_world_size)
-
+from deepspeed.accelerator import get_accelerator
 import deepspeed
 import deepspeed.utils.groups as groups
 
@@ -46,7 +45,7 @@ def initialize_megatron(extra_args_provider=None, args_defaults={},
     """
     if not allow_no_cuda:
         # Make sure cuda is available.
-        assert torch.cuda.is_available(), 'Megatron requires CUDA.'
+        assert get_accelerator().is_available(), 'Megatron requires accelerator.'
 
     # Parse args, build tokenizer, and set adlr-autoresume,
     # tensorboard-writer, and timers.
@@ -107,7 +106,10 @@ def _compile_dependencies():
         compile_helper()
         print('>>> done with dataset index builder. Compilation time: {:.3f} '
               'seconds'.format(time.time() - start_time), flush=True)
-
+
+    if not get_accelerator().device_name() == 'cuda':
+        print(">fused kernel is only supported in cuda, skip loading fused kernel")
+        return
     # ==================
     # Load fused kernels
     # ==================
@@ -134,7 +136,7 @@ def _compile_dependencies():
     if _is_rank_0():
         start_time = time.time()
         print('> compiling and loading fused kernels ...', flush=True)
-        if torch.cuda.device_count() > 0: # Skip when CPU-only
+        if get_accelerator().device_count() > 0: # Skip when CPU-only
             fused_kernels.load(args)
         torch.distributed.barrier()
     else:
@@ -185,7 +187,7 @@ def setup_deepspeed_random_and_activation_checkpointing(args):
 def _initialize_distributed():
     """Initialize torch.distributed and mpu."""
     args = get_args()
-    device_count = torch.cuda.device_count()
+    device_count = get_accelerator().device_count()
     if torch.distributed.is_initialized():
 
         if args.rank == 0:
@@ -206,7 +208,7 @@ def _initialize_distributed():
         else:
            args.local_rank = device
 
-        torch.cuda.set_device(device) # only do so when device_count > 0
+        get_accelerator().set_device(device) # only do so when device_count > 0
 
         # Call the init process
         init_method = 'tcp://'
@@ -249,14 +251,14 @@ def _set_random_seed(seed_):
     if seed_ is not None and seed_ > 0:
         # Ensure that different pipeline MP stages get different seeds.
         # No need to do so for CPU-only case.
-        if torch.cuda.device_count() == 0:
+        if get_accelerator().device_count() == 0:
            seed = seed_
        else:
            seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank())
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
-        if torch.cuda.device_count() > 0:
+        if get_accelerator().device_count() > 0:
            mpu.model_parallel_cuda_manual_seed(seed)
    else:
        raise ValueError('Seed ({}) should be a positive integer.'.format(seed))
@@ -284,7 +286,7 @@ def _is_rank_0():
     """Check whether it is rank 0. For AML, check if it is rank 0 of a node"""
     if torch.distributed.is_initialized():
         if torch.distributed.get_rank() == 0 or (
-            'AZUREML_EXPERIMENT_ID' in os.environ and torch.distributed.get_rank() % torch.cuda.device_count() == 0
+            'AZUREML_EXPERIMENT_ID' in os.environ and torch.distributed.get_rank() % get_accelerator().device_count() == 0
         ):
             return True
     else:
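
_initialize_distributed derives the local device from the global rank and binds the process to it through the accelerator object. A minimal sketch of that flow, assuming one process per accelerator and ignoring the args.local_rank handling shown in the diff (the helper name is hypothetical):

from deepspeed.accelerator import get_accelerator

def bind_local_device(rank):
    """Pick and activate the accelerator for this rank; None on CPU-only hosts."""
    device_count = get_accelerator().device_count()
    if device_count == 0:
        return None
    device = rank % device_count          # local device index for this process
    get_accelerator().set_device(device)  # only do so when device_count > 0
    return device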

megatron/memory.py

Lines changed: 2 additions & 2 deletions

@@ -15,7 +15,7 @@
 
 
 import torch
-
+from deepspeed.accelerator import get_accelerator
 
 # A dictionary of all the memory buffers allocated.
 _MEM_BUFFS = dict()
@@ -58,7 +58,7 @@ def __init__(self, name, numel, dtype, track_usage):
         self.dtype = dtype
         self.data = torch.empty(self.numel,
                                 dtype=self.dtype,
-                                device=torch.cuda.current_device(),
+                                device=get_accelerator().current_device_name(),
                                 requires_grad=False)
 
         # Index tracking the start of the free memory.
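
Note the subtle change above: torch.cuda.current_device() returns an integer index, while get_accelerator().current_device_name() returns a device string such as 'cuda:0' or 'xpu:0', which torch.empty accepts on any backend. A small standalone sketch of the allocation, assuming an accelerator is present (the buffer size is illustrative):

import torch
from deepspeed.accelerator import get_accelerator

numel = 1 << 20   # illustrative buffer size
buf = torch.empty(numel,
                  dtype=torch.float16,
                  device=get_accelerator().current_device_name(),  # e.g. 'cuda:0' or 'xpu:0'
                  requires_grad=False)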
