deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
deepspeed \ --master_addr=rogpt1 \ --elastic_training \ --min_elastic_nodes=1 \ --max_elastic_nodes=2 \ --hostfile=hostfile \ train.py \ --deepspeed_config ds_config.json
{ "steps_per_print": 2000, "checkpoint": { "use_node_local_storage": true }, "elasticity": { "enabled": true, "micro_batch_sizes": [64,128,256], "max_train_batch_size": 1024 }, "optimizer": { "type": "Adam", "params": { "lr": 0.001, "betas": [ 0.8, 0.999 ], "eps": 1e-8, "weight_decay": 3e-7 } }, "scheduler": { "type": "WarmupLR", "params": { "warmup_min_lr": 0, "warmup_max_lr": 0.001, "warmup_num_steps": 1000 } }, "wall_clock_breakdown": false }