File tree Expand file tree Collapse file tree 1 file changed +18
-8
lines changed
Expand file tree Collapse file tree 1 file changed +18
-8
lines changed Original file line number Diff line number Diff line change @@ -388,16 +388,26 @@ def _verify_args(self) -> None:
388388 if self .pipeline_parallel_size > 1 :
389389 raise NotImplementedError (
390390 "Pipeline parallelism is not supported yet." )
391- if is_hip ():
391+ if not self .disable_custom_all_reduce and self .world_size > 1 :
392+ if is_hip ():
393+ self .disable_custom_all_reduce = True
394+ logger .info (
395+ "Disabled the custom all-reduce kernel because it is not "
396+ "supported on AMD GPUs." )
397+ elif self .pipeline_parallel_size > 1 :
398+ self .disable_custom_all_reduce = True
399+ logger .info (
400+ "Disabled the custom all-reduce kernel because it is not "
401+ "supported with pipeline parallelism." )
402+
403+ # FIXME(woosuk): Fix the stability issues and re-enable the custom
404+ # all-reduce kernel.
405+ if not self .disable_custom_all_reduce and self .world_size > 1 :
392406 self .disable_custom_all_reduce = True
393407 logger .info (
394- "Disabled the custom all-reduce kernel because it is not "
395- "supported on AMD GPUs." )
396- elif self .pipeline_parallel_size > 1 :
397- self .disable_custom_all_reduce = True
398- logger .info (
399- "Disabled the custom all-reduce kernel because it is not "
400- "supported with pipeline parallelism." )
408+ "Custom all-reduce kernels are temporarily disabled due to "
409+ "stability issues. We will re-enable them once the issues are "
410+ "resolved." )
401411
402412
403413class SchedulerConfig :
You can’t perform that action at this time.
0 commit comments