
Commit 54a2602

Temporarily Remove Encoder PP Support (#14167)
* remove encoder pp api
* Apply isort and black reformatting
* update
* Apply isort and black reformatting
* lint
* fix test
* some additional removing in strategy lib
* update assert
* Apply isort and black reformatting

---------

Signed-off-by: yaoyu-33 <[email protected]>
Signed-off-by: yaoyu-33 <[email protected]>
Co-authored-by: yaoyu-33 <[email protected]>
1 parent ada066c · commit 54a2602

24 files changed: +57 additions, -86 deletions


examples/multimodal/convert_ckpt_to_nemo.py

Lines changed: 0 additions & 1 deletion
@@ -130,7 +130,6 @@ def convert(local_rank, rank, world_size, args):
     parallel_state.initialize_model_parallel(
         tensor_model_parallel_size=app_state.tensor_model_parallel_size,
         pipeline_model_parallel_size=app_state.pipeline_model_parallel_size,
-        pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank,
     )
 
     app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
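
For reference, the same one-line removal is applied to all six example/conversion scripts in this commit. After the change, the call reduces to the following minimal sketch, assuming the `app_state` setup these scripts already perform; the import path is an assumption, since the hunks only show the `parallel_state` name:

from megatron.core import parallel_state  # assumed import path; only `parallel_state` is visible in the hunks

parallel_state.initialize_model_parallel(
    tensor_model_parallel_size=app_state.tensor_model_parallel_size,
    pipeline_model_parallel_size=app_state.pipeline_model_parallel_size,
    # pipeline_model_parallel_split_rank is no longer forwarded: encoder/decoder
    # pipeline splitting is temporarily unsupported after this commit.
)

app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()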

examples/multimodal/text_to_image/convert_hf_ckpt_to_nemo.py

Lines changed: 0 additions & 1 deletion
@@ -178,7 +178,6 @@ def convert(local_rank, rank, world_size, args):
     parallel_state.initialize_model_parallel(
         tensor_model_parallel_size=app_state.tensor_model_parallel_size,
         pipeline_model_parallel_size=app_state.pipeline_model_parallel_size,
-        pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank,
     )
 
     app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()

examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py

Lines changed: 0 additions & 1 deletion
@@ -254,7 +254,6 @@ def convert(local_rank, rank, world_size, args):
     parallel_state.initialize_model_parallel(
         tensor_model_parallel_size=app_state.tensor_model_parallel_size,
         pipeline_model_parallel_size=app_state.pipeline_model_parallel_size,
-        pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank,
     )
 
     app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()

examples/nlp/language_modeling/megatron_ckpt_to_nemo.py

Lines changed: 0 additions & 1 deletion
@@ -187,7 +187,6 @@ def convert(local_rank, rank, world_size, args):
     parallel_state.initialize_model_parallel(
         tensor_model_parallel_size=app_state.tensor_model_parallel_size,
         pipeline_model_parallel_size=app_state.pipeline_model_parallel_size,
-        pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank,
     )
 
     app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()

examples/nlp/language_modeling/megatron_gpt_drop_layers.py

Lines changed: 0 additions & 1 deletion
@@ -152,7 +152,6 @@ def main(local_rank, rank, world_size, args):
     parallel_state.initialize_model_parallel(
         tensor_model_parallel_size=app_state.tensor_model_parallel_size,
         pipeline_model_parallel_size=app_state.pipeline_model_parallel_size,
-        pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank,
     )
 
     app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()

examples/vision/convert_ckpt_to_nemo.py

Lines changed: 0 additions & 1 deletion
@@ -124,7 +124,6 @@ def convert(local_rank, rank, world_size, args):
     parallel_state.initialize_model_parallel(
         tensor_model_parallel_size=app_state.tensor_model_parallel_size,
         pipeline_model_parallel_size=app_state.pipeline_model_parallel_size,
-        pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank,
     )
 
     app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()

nemo/collections/nlp/models/language_modeling/megatron_base_model.py

Lines changed: 2 additions & 8 deletions
@@ -201,7 +201,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
             pipeline_model_parallel_size=cfg.get('pipeline_model_parallel_size', 1),
             pipeline_model_parallel_comm_backend=cfg.get('pipeline_model_parallel_comm_backend', None),
             virtual_pipeline_model_parallel_size=vp_size,
-            pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0),
+            pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', None),
             use_tp_pp_dp_mapping=cfg.get('use_tp_pp_dp_mapping', False),
             num_distributed_optimizer_instances=self.cfg.optim.get('num_distributed_optimizer_instances', 1),
             context_parallel_size=cfg.get('context_parallel_size', 1),
@@ -1170,17 +1170,13 @@ def _get_total_params_across_model_parallel_groups_enc_dec(self, model):
             num_parameters_on_device -= num_word_embedding_parameters
 
         # Subtract decoder position embedding params that are shared with encoder.
-        if (
-            parallel_state.is_pipeline_stage_at_split()
-            and self.cfg.encoder.get("position_embedding_type", "learned_absolute") == "learned_absolute"
-        ):
+        if self.cfg.encoder.get("position_embedding_type", "learned_absolute") == "learned_absolute":
             num_position_embedding_parameters = sum([p.nelement() for p in model.position_embeddings_weight()])
             num_parameters_on_device -= num_position_embedding_parameters
 
         # Check and remove RPE embeddings from the encoder that are replicated.
         if (
             parallel_state.get_pipeline_model_parallel_world_size() > 1
-            and parallel_state.is_pipeline_stage_before_split()
             and not parallel_state.is_pipeline_first_stage()
             and self.cfg.encoder.get("position_embedding_type", "learned_absolute") == "relative"
         ):
@@ -1191,8 +1187,6 @@ def _get_total_params_across_model_parallel_groups_enc_dec(self, model):
         # Check and remove RPE embeddings from the decoder that are replicated.
         if (
             parallel_state.get_pipeline_model_parallel_world_size() > 1
-            and parallel_state.is_pipeline_stage_after_split()
-            and not parallel_state.is_pipeline_stage_at_split()
             and self.cfg.encoder.get("position_embedding_type", "learned_absolute") == "relative"
         ):
             # substract the RPE params on intermediate pipeline stages.
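
Condensed, these hunks do two things: the split-rank config entry now defaults to None rather than 0, and the enc-dec parameter count no longer consults split-stage predicates. A minimal sketch of the simplified logic, assuming the `cfg` and counters used in the hunks above:

# Configs that never set a split rank no longer imply a split at rank 0.
split_rank = cfg.get('pipeline_model_parallel_split_rank', None)

# The is_pipeline_stage_at_split()/before_split()/after_split() checks are gone,
# so the position-embedding subtraction keys only on the configured embedding type.
if cfg.encoder.get("position_embedding_type", "learned_absolute") == "learned_absolute":
    num_parameters_on_device -= num_position_embedding_parameters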

nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py

Lines changed: 3 additions & 3 deletions
@@ -124,13 +124,13 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
                 model_provider_func=self.model_provider_func,
                 wrap_with_ddp=False,
                 on_cpu=True,
-                model_type=ModelType.encoder_and_decoder,
+                model_type=ModelType.encoder_or_decoder,
             )[0]
         else:
             self.enc_dec_model = build_model(
                 model_provider_func=self.model_provider_func,
                 wrap_with_ddp=False,
-                model_type=ModelType.encoder_and_decoder,
+                model_type=ModelType.encoder_or_decoder,
             )[0]
 
         # We don't need to call it explicitly? Since it is a pytorch lightning hook function
@@ -154,7 +154,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
             True if (not self.megatron_amp_O2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False
         )
 
-        self.enc_dec_model.model_type = ModelType.encoder_and_decoder
+        self.enc_dec_model.model_type = ModelType.encoder_or_decoder
 
     def setup_optimizer_param_groups(self):
         """ModelPT override. Optimizer will get self._optimizer_param_groups"""

nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ class MegatronT5PromptLearningModel(MegatronBasePromptLearningModel):
 
     def __init__(self, cfg: DictConfig, trainer: Trainer):
         super().__init__(cfg, trainer)
-        self.model_type = ModelType.encoder_and_decoder
+        self.model_type = ModelType.encoder_or_decoder
 
     def first_stage_of_pipeline(self):
         if self.frozen_model.enc_dec_model.pre_process and parallel_state.get_pipeline_model_parallel_rank() == 0:

nemo/collections/nlp/modules/common/megatron/build_model.py

Lines changed: 0 additions & 23 deletions
@@ -94,29 +94,6 @@ def build_model(
             pre_process=parallel_state.is_pipeline_first_stage(),
             post_process=parallel_state.is_pipeline_last_stage(),
         )
-    elif model_type == ModelType.encoder_and_decoder:
-        pre_process = parallel_state.is_pipeline_first_stage()
-        post_process = parallel_state.is_pipeline_last_stage()
-        # `add_encoder` & `add_decoder` logic.
-        add_encoder, add_decoder = True, True
-        if parallel_state.get_pipeline_model_parallel_world_size() > 1:
-            split_rank = parallel_state.get_pipeline_model_parallel_split_rank()
-            if split_rank is None:
-                raise RuntimeError("Split rank needs to be specified for model with both encoder and decoder.")
-            rank = parallel_state.get_pipeline_model_parallel_rank()
-            world_size = parallel_state.get_pipeline_model_parallel_world_size()
-            pre_process = rank == 0 or rank == split_rank
-            post_process = rank == (split_rank - 1) or rank == (world_size - 1)
-            add_encoder = parallel_state.is_pipeline_stage_before_split()
-            add_decoder = parallel_state.is_pipeline_stage_after_split()
-        model = model_provider_func(
-            *args,
-            **kwargs,
-            pre_process=pre_process,
-            post_process=post_process,
-            add_encoder=add_encoder,
-            add_decoder=add_decoder,
-        )
     else:
         raise ValueError(f"Unrecognized ModelType '{model_type}'")
 
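
With the elif branch deleted, build_model no longer has a code path for ModelType.encoder_and_decoder; judging from the remaining else clause above, passing that value would now reach the ValueError. A hypothetical illustration under that assumption (the provider and arguments are placeholders, not part of this commit):

try:
    model = build_model(
        model_provider_func=my_provider,  # placeholder provider, for illustration only
        wrap_with_ddp=False,
        model_type=ModelType.encoder_and_decoder,
    )
except ValueError as err:
    # Expected after this commit: "Unrecognized ModelType '...'"
    print(err)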
