Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions RecommenderSystems/dlrm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
|-- logger.py #Logger info
|-- config.py #Argument configuration
|-- train.py #Python script for train mode
|-- train_consistent_eager.sh #Shell script for starting training in eager mode
|-- train_consistent_graph.sh #Shell script for starting training in graph mode
|-- train_global_eager.sh #Shell script for starting training in eager mode
|-- train_global_graph.sh #Shell script for starting training in graph mode
|-- train_ddp.sh #Shell script for starting training in ddp mode
|-- __init__.py
└── README.md #Documentation
Expand Down Expand Up @@ -55,13 +55,13 @@ Please view [how_to_make_ofrecord_for_wdl](https://github.com/Oneflow-Inc/OneFlo
```
bash train_ddp.sh
```
### Train by graph mode in consistent view
### Train by graph mode in global view
```
bash train_consistent_graph.sh
bash train_global_graph.sh
```
### Train by eager mode in consistent view
### Train by eager mode in global view
```
bash train_consistent_eager.sh
bash train_global_eager.sh
```
## Dataset preparation
Currently OneFlow-WDL supports two types of dataset format: ofrecord and onerec, both can be transformed from HugeCTR parquet format dataset.
Expand Down
2 changes: 1 addition & 1 deletion RecommenderSystems/dlrm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def str_list(x):
else:
assert args.eval_batch_size % args.eval_batch_size_per_proc == 0

args.is_consistent = (
args.is_global = (
flow.env.get_world_size() > 1 and not args.ddp
) or args.execution_mode == "graph"

Expand Down
4 changes: 2 additions & 2 deletions RecommenderSystems/dlrm/models/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

__all__ = ["make_data_loader"]

def make_data_loader(args, mode, is_consistent=False, data_format="ofrecord"):
def make_data_loader(args, mode, is_global=False, data_format="ofrecord"):
assert mode in ("train", "val")

total_batch_size = args.batch_size
Expand All @@ -17,7 +17,7 @@ def make_data_loader(args, mode, is_consistent=False, data_format="ofrecord"):
placement = None
sbp = None

if is_consistent:
if is_global:
placement = flow.env.all_device_placement("cpu")
sbp = flow.sbp.split(0)
batch_size_per_proc = total_batch_size
Expand Down
18 changes: 9 additions & 9 deletions RecommenderSystems/dlrm/models/dlrm.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def output_feature_size(self, embedding_vec_size, dense_feature_size):

class Embedding(nn.Embedding):
def __init__(self, vocab_size, embed_size, args):
if args.is_consistent:
if args.is_global:
assert args.embedding_split_axis < 2, "Embedding model parallel split axis can only be 0 or 1."
self.split_axis = args.embedding_split_axis
else:
Expand All @@ -114,14 +114,14 @@ def __init__(self, vocab_size, embed_size, args):
# param.data = flow.tensor(W, requires_grad=True)

def set_model_parallel(self, placement=None):
# Overriding to_consistent function does not work
# because to_consistent call is not recursive
# Overriding to_global function does not work
# because to_global call is not recursive
if self.split_axis >= 0:
self.to_consistent(placement, flow.sbp.split(self.split_axis))
self.to_global(placement, flow.sbp.split(self.split_axis))

def forward(self, ids):
if self.split_axis >= 0:
ids = ids.to_consistent(sbp=flow.sbp.broadcast)
ids = ids.to_global(sbp=flow.sbp.broadcast)

# Forward
# weight ids => embedding
Expand All @@ -136,10 +136,10 @@ def forward(self, ids):

if self.split_axis == 0:
# Forward: P => S(0), Backward: S(0) => B
return embeddings.to_consistent(sbp=flow.sbp.split(0), grad_sbp=flow.sbp.broadcast)
return embeddings.to_global(sbp=flow.sbp.split(0), grad_sbp=flow.sbp.broadcast)
elif self.split_axis == 1:
# Forward: S(2) => S(0), Backward: S(0) => S(2)
return embeddings.to_consistent(sbp=flow.sbp.split(0), grad_sbp=flow.sbp.split(2))
return embeddings.to_global(sbp=flow.sbp.split(0), grad_sbp=flow.sbp.split(2))
else:
return embeddings

Expand Down Expand Up @@ -189,8 +189,8 @@ def __init__(self, vocab_size, embed_size, args):
def forward(self, ids):
bsz = ids.shape[0]
column_id = flow.ones((bsz, 1), dtype=flow.int32, sbp=ids.sbp, placement=ids.placement) * self.column_id
if (ids.is_consistent):
column_id = column_id.to_consistent(sbp=ids.sbp, placement=ids.placement)
if (ids.is_global):
column_id = column_id.to_global(sbp=ids.sbp, placement=ids.placement)
return self.one_embedding.forward(ids, column_id)
def set_model_parallel(self, placement=None):
pass
Expand Down
22 changes: 11 additions & 11 deletions RecommenderSystems/dlrm/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,18 +38,18 @@ def __init__(self):
UserWarning,
)
self.execution_mode = "eager"
self.is_consistent = args.is_consistent
self.is_global = args.is_global
self.rank = flow.env.get_rank()
self.world_size = flow.env.get_world_size()
self.cur_iter = 0
self.eval_interval = args.eval_interval
self.eval_batchs = args.eval_batchs
self.init_logger()
self.train_dataloader = make_data_loader(args, "train", self.is_consistent, self.dataset_format)
self.val_dataloader = make_data_loader(args, "val", self.is_consistent, self.dataset_format)
self.train_dataloader = make_data_loader(args, "train", self.is_global, self.dataset_format)
self.val_dataloader = make_data_loader(args, "val", self.is_global, self.dataset_format)
self.dlrm_module = make_dlrm_module(args)
if self.is_consistent:
self.dlrm_module.to_consistent(flow.env.all_device_placement("cuda"), flow.sbp.broadcast)
if self.is_global:
self.dlrm_module.to_global(flow.env.all_device_placement("cuda"), flow.sbp.broadcast)
self.dlrm_module.embedding.set_model_parallel(flow.env.all_device_placement("cuda"))
else:
self.dlrm_module.to("cuda")
Expand Down Expand Up @@ -129,8 +129,8 @@ def meter_eval(self, auc):

def load_state_dict(self):
print(f"Loading model from {self.args.model_load_dir}")
if self.is_consistent:
state_dict = flow.load(self.args.model_load_dir, consistent_src_rank=0)
if self.is_global:
state_dict = flow.load(self.args.model_load_dir, global_src_rank=0)
elif self.rank == 0:
state_dict = flow.load(self.args.model_load_dir)
else:
Expand All @@ -144,8 +144,8 @@ def save(self, subdir):
if self.rank == 0:
print(f"Saving model to {save_path}")
state_dict = self.dlrm_module.state_dict()
if self.is_consistent:
flow.save(state_dict, save_path, consistent_dst_rank=0)
if self.is_global:
flow.save(state_dict, save_path, global_dst_rank=0)
elif self.rank == 0:
flow.save(state_dict, save_path)
else:
Expand Down Expand Up @@ -250,11 +250,11 @@ def train_one_step(self):

def tol(tensor, pure_local=True):
""" to local """
if tensor.is_consistent:
if tensor.is_global:
if pure_local:
tensor = tensor.to_local()
else:
tensor = tensor.to_consistent(sbp=flow.sbp.broadcast).to_local()
tensor = tensor.to_global(sbp=flow.sbp.broadcast).to_local()

return tensor

Expand Down
5 changes: 3 additions & 2 deletions RecommenderSystems/dlrm/train_ddp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ MASTER_ADDR=127.0.0.1
NUM_NODES=1
NODE_RANK=0
# DATA_DIR=/dataset/wdl_ofrecord/ofrecord
DATA_DIR=/tank/dataset/criteo_kaggle/dlrm_ofrecord
dataset_format=ofrecord
DATA_DIR=/tank/dataset/criteo_kaggle/dlrm_$dataset_format
EMBD_SIZE=33762577 # 33762578
BATHSIZE=32

Expand Down Expand Up @@ -41,6 +42,6 @@ python3 -m oneflow.distributed.launch \
--data_part_name_suffix_length 5 \
--ddp \
--model_load_dir /tank/model_zoo/dlrm_baseline_params_emb$emb_size \
--test_name 'train_eager_conisitent_'$DEVICE_NUM_PER_NODE'gpu'
--test_name 'train_ddp_'$DEVICE_NUM_PER_NODE'gpu'
# --dataset_format torch \
# --model_load_dir /tank/xiexuan/dlrm/initial_parameters \
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,6 @@ python3 -m oneflow.distributed.launch \
--data_part_name_suffix_length 5 \
--execution_mode 'eager' \
--model_load_dir /tank/model_zoo/dlrm_baseline_params_emb$emb_size \
--test_name 'train_eager_conisitent_'$DEVICE_NUM_PER_NODE'gpu'
--test_name 'train_global_eager_'$DEVICE_NUM_PER_NODE'gpu'
# --dataset_format torch \
# --model_load_dir /tank/xiexuan/dlrm/initial_parameters \
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,6 @@ python3 -m oneflow.distributed.launch \
--data_part_name_suffix_length 5 \
--execution_mode 'graph' \
--model_load_dir /tank/model_zoo/dlrm_baseline_params_emb$emb_size \
--test_name 'train_graph_conisitent_'$DEVICE_NUM_PER_NODE'gpu'
--test_name 'train_global_graph_'$DEVICE_NUM_PER_NODE'gpu'
# --dataset_format torch \
# --model_load_dir /tank/xiexuan/dlrm/initial_parameters \
2 changes: 1 addition & 1 deletion RecommenderSystems/dlrm/train_nn_graph.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@ python3 -m oneflow.distributed.launch \
--data_part_num 256 \
--data_part_name_suffix_length 5 \
--execution_mode 'graph' \
--test_name 'train_eager_graph_'$DEVICE_NUM_PER_NODE'gpu' > $PREFIX'_n'$NUM_NODES'_g'$DEVICE_NUM_PER_NODE'_b'$BATHSIZE'_h'$HIDDEN_UNITS_NUM'_log_'$PREFIX
--test_name 'train_nn_graph_'$DEVICE_NUM_PER_NODE'gpu' > $PREFIX'_n'$NUM_NODES'_g'$DEVICE_NUM_PER_NODE'_b'$BATHSIZE'_h'$HIDDEN_UNITS_NUM'_log_'$PREFIX