Skip to content

Commit cc1b624

Browse files
liquor233 and NeMo Bot
authored and committed
Fix the load checkpointing issue -- onelogger callback gets called multiple time in some case. (#14945)
* Update modelPT.py — Signed-off-by: Jiashang Hu <[email protected]>
* Update one_logger_callback.py — Signed-off-by: Jiashang Hu <[email protected]>
* Fix the test for error handling — Signed-off-by: Jiashang Hu <[email protected]>
* Update the dependency version — Signed-off-by: Jiashang Hu <[email protected]>

Signed-off-by: Jiashang Hu <[email protected]>
Signed-off-by: NeMo Bot <[email protected]>
1 parent 7d643af commit cc1b624

File tree

4 files changed

+3
-14
lines changed

4 files changed

+3
-14
lines changed

nemo/core/classes/modelPT.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -478,9 +478,6 @@ def restore_from(
478478
Returns:
479479
An instance of type cls or its underlying config (if return_config is set).
480480
"""
481-
# Notify OneLogger of checkpoint loading start for telemetry tracking
482-
CallbackGroup.get_instance().on_load_checkpoint_start()
483-
484481
if save_restore_connector is None:
485482
save_restore_connector = SaveRestoreConnector()
486483

@@ -514,9 +511,6 @@ def restore_from(
514511
if isinstance(instance, ModelPT):
515512
instance._save_restore_connector = save_restore_connector
516513

517-
# Notify OneLogger of checkpoint loading completion for telemetry tracking
518-
CallbackGroup.get_instance().on_load_checkpoint_end()
519-
520514
return instance
521515

522516
@classmethod

nemo/lightning/one_logger_callback.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,6 @@ def get_one_logger_init_config() -> Dict[str, Any]:
5959
# Important fields with defaults - provide if available from config
6060
"enable_for_current_rank": _should_enable_for_current_rank(),
6161
"world_size_or_fn": world_size,
62-
# Error handling strategy - use DISABLE_QUIETLY_AND_REPORT_METRIC_ERROR to prevent
63-
# telemetry errors from crashing the training application
64-
"error_handling_strategy": "propagate_exceptions",
6562
}
6663

6764
return init_config

requirements/requirements_lightning.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,6 @@ torchmetrics>=0.11.0
88
transformers~=4.53.0
99
wandb
1010
webdataset>=0.2.86
11-
nv_one_logger_core>=2.1.0
12-
nv_one_logger_training_telemetry>=2.1.0
13-
nv_one_logger_pytorch_lightning_integration>=2.1.0
11+
nv_one_logger_core>=2.3.0
12+
nv_one_logger_training_telemetry>=2.3.0
13+
nv_one_logger_pytorch_lightning_integration>=2.3.0

tests/lightning/test_one_logger_callback.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@ def test_init_configures_provider(
8080
"session_tag_or_fn": "test-session",
8181
"enable_for_current_rank": True,
8282
"world_size_or_fn": 1,
83-
"error_handling_strategy": "propagate_exceptions",
8483
}
8584
mock_get_config.return_value = mock_init_config
8685

@@ -119,7 +118,6 @@ def test_get_one_logger_init_config(self):
119118
assert config["session_tag_or_fn"] == "test_job"
120119
assert "enable_for_current_rank" in config
121120
assert config["world_size_or_fn"] == 4
122-
assert config["error_handling_strategy"] == "propagate_exceptions"
123121

124122
@pytest.mark.unit
125123
def test_get_one_logger_init_config_no_slurm(self):

0 commit comments

Comments (0)