From b50f9c21921b27d8bae19883126a8b0575b0c480 Mon Sep 17 00:00:00 2001
From: SuperAZHE
Date: Sun, 25 Aug 2024 14:48:12 +0800
Subject: [PATCH] train scripts: use torchrun for torch >= 2.0.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use torchrun for torch >= 2.0.0. The module torch.distributed.launch is
deprecated and will be removed in a future release; use torchrun instead.
Note that --use-env is set by default in torchrun, so the flag is dropped
from the launch commands.
---
 run_scripts/coco-cn_finetune_vit-b-16_rbt-base.sh           | 3 +--
 run_scripts/flickr30k_finetune_vit-b-16_rbt-base.sh         | 3 +--
 run_scripts/flickr30k_finetune_vit-b-16_rbt-base_flip.sh    | 3 +--
 run_scripts/muge_finetune_vit-b-16_rbt-base.sh              | 3 +--
 run_scripts/muge_finetune_vit-b-16_rbt-base_distillation.sh | 3 +--
 run_scripts/muge_finetune_vit-b-16_rbt-base_flashattn.sh    | 3 +--
 run_scripts/muge_finetune_vit-b-16_rbt-base_flip.sh         | 3 +--
 7 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/run_scripts/coco-cn_finetune_vit-b-16_rbt-base.sh b/run_scripts/coco-cn_finetune_vit-b-16_rbt-base.sh
index 6eba098..d500d4e 100644
--- a/run_scripts/coco-cn_finetune_vit-b-16_rbt-base.sh
+++ b/run_scripts/coco-cn_finetune_vit-b-16_rbt-base.sh
@@ -57,14 +57,13 @@ text_model=RoBERTa-wwm-ext-base-chinese
 use_augment="--use-augment"
 # use_augment=""
 
-python3 -m torch.distributed.launch --use_env --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
+torchrun --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
           --master_addr=${MASTER_ADDR} --master_port=${MASTER_PORT} cn_clip/training/main.py \
           --train-data=${train_data} \
           --val-data=${val_data} \
           --resume=${resume} \
           ${reset_data_offset} \
           ${reset_optimizer} \
-          --logs=${output_base_dir} \
           --name=${name} \
           --save-step-frequency=${save_step_frequency} \
           --save-epoch-frequency=${save_epoch_frequency} \

diff --git a/run_scripts/flickr30k_finetune_vit-b-16_rbt-base.sh b/run_scripts/flickr30k_finetune_vit-b-16_rbt-base.sh
index d3a903d..58045a4 100644
--- a/run_scripts/flickr30k_finetune_vit-b-16_rbt-base.sh
+++ b/run_scripts/flickr30k_finetune_vit-b-16_rbt-base.sh
@@ -57,14 +57,13 @@ text_model=RoBERTa-wwm-ext-base-chinese
 use_augment="--use-augment"
 # use_augment=""
 
-python3 -m torch.distributed.launch --use_env --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
+torchrun --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
           --master_addr=${MASTER_ADDR} --master_port=${MASTER_PORT} cn_clip/training/main.py \
           --train-data=${train_data} \
           --val-data=${val_data} \
           --resume=${resume} \
           ${reset_data_offset} \
           ${reset_optimizer} \
-          --logs=${output_base_dir} \
           --name=${name} \
           --save-step-frequency=${save_step_frequency} \
           --save-epoch-frequency=${save_epoch_frequency} \

diff --git a/run_scripts/flickr30k_finetune_vit-b-16_rbt-base_flip.sh b/run_scripts/flickr30k_finetune_vit-b-16_rbt-base_flip.sh
index 66ca849..4e81e9a 100644
--- a/run_scripts/flickr30k_finetune_vit-b-16_rbt-base_flip.sh
+++ b/run_scripts/flickr30k_finetune_vit-b-16_rbt-base_flip.sh
@@ -58,14 +58,13 @@ mask_ratio=0.5 # use flip: set mask ratio
 use_augment="--use-augment"
 # use_augment=""
 
-python3 -m torch.distributed.launch --use_env --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
+torchrun --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
           --master_addr=${MASTER_ADDR} --master_port=${MASTER_PORT} cn_clip/training/main.py \
           --train-data=${train_data} \
           --val-data=${val_data} \
           --resume=${resume} \
           ${reset_data_offset} \
           ${reset_optimizer} \
-          --logs=${output_base_dir} \
           --name=${name} \
           --save-step-frequency=${save_step_frequency} \
           --save-epoch-frequency=${save_epoch_frequency} \

diff --git a/run_scripts/muge_finetune_vit-b-16_rbt-base.sh b/run_scripts/muge_finetune_vit-b-16_rbt-base.sh
index 385eff2..06cc330 100644
--- a/run_scripts/muge_finetune_vit-b-16_rbt-base.sh
+++ b/run_scripts/muge_finetune_vit-b-16_rbt-base.sh
@@ -57,14 +57,13 @@ text_model=RoBERTa-wwm-ext-base-chinese
 use_augment="--use-augment"
 # use_augment=""
 
-python3 -m torch.distributed.launch --use_env --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
+torchrun --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
           --master_addr=${MASTER_ADDR} --master_port=${MASTER_PORT} cn_clip/training/main.py \
           --train-data=${train_data} \
           --val-data=${val_data} \
           --resume=${resume} \
           ${reset_data_offset} \
           ${reset_optimizer} \
-          --logs=${output_base_dir} \
           --name=${name} \
           --save-step-frequency=${save_step_frequency} \
           --save-epoch-frequency=${save_epoch_frequency} \

diff --git a/run_scripts/muge_finetune_vit-b-16_rbt-base_distillation.sh b/run_scripts/muge_finetune_vit-b-16_rbt-base_distillation.sh
index b6e46d9..655f35c 100644
--- a/run_scripts/muge_finetune_vit-b-16_rbt-base_distillation.sh
+++ b/run_scripts/muge_finetune_vit-b-16_rbt-base_distillation.sh
@@ -59,14 +59,13 @@ distillation="--distillation"
 teacher_model_name="damo/multi-modal_team-vit-large-patch14_multi-modal-similarity"
 # use_augment=""
 
-python3 -m torch.distributed.launch --use_env --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
+torchrun --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
           --master_addr=${MASTER_ADDR} --master_port=${MASTER_PORT} cn_clip/training/main.py \
           --train-data=${train_data} \
           --val-data=${val_data} \
           --resume=${resume} \
           ${reset_data_offset} \
           ${reset_optimizer} \
-          --logs=${output_base_dir} \
           --name=${name} \
           --save-step-frequency=${save_step_frequency} \
           --save-epoch-frequency=${save_epoch_frequency} \

diff --git a/run_scripts/muge_finetune_vit-b-16_rbt-base_flashattn.sh b/run_scripts/muge_finetune_vit-b-16_rbt-base_flashattn.sh
index 5ae7349..16dc037 100644
--- a/run_scripts/muge_finetune_vit-b-16_rbt-base_flashattn.sh
+++ b/run_scripts/muge_finetune_vit-b-16_rbt-base_flashattn.sh
@@ -57,14 +57,13 @@ text_model=RoBERTa-wwm-ext-base-chinese
 use_augment="--use-augment"
 # use_augment=""
 
-python3 -m torch.distributed.launch --use_env --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
+torchrun --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
           --master_addr=${MASTER_ADDR} --master_port=${MASTER_PORT} cn_clip/training/main.py \
           --train-data=${train_data} \
           --val-data=${val_data} \
           --resume=${resume} \
           ${reset_data_offset} \
           ${reset_optimizer} \
-          --logs=${output_base_dir} \
           --name=${name} \
           --save-step-frequency=${save_step_frequency} \
           --save-epoch-frequency=${save_epoch_frequency} \

diff --git a/run_scripts/muge_finetune_vit-b-16_rbt-base_flip.sh b/run_scripts/muge_finetune_vit-b-16_rbt-base_flip.sh
index b742d9e..6060933 100644
--- a/run_scripts/muge_finetune_vit-b-16_rbt-base_flip.sh
+++ b/run_scripts/muge_finetune_vit-b-16_rbt-base_flip.sh
@@ -58,14 +58,13 @@ mask_ratio=0.5 # use flip: set mask ratio
 use_augment="--use-augment"
 # use_augment=""
 
-python3 -m torch.distributed.launch --use_env --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
+torchrun --nproc_per_node=${GPUS_PER_NODE} --nnodes=${WORKER_CNT} --node_rank=${RANK} \
           --master_addr=${MASTER_ADDR} --master_port=${MASTER_PORT} cn_clip/training/main.py \
           --train-data=${train_data} \
           --val-data=${val_data} \
           --resume=${resume} \
           ${reset_data_offset} \
           ${reset_optimizer} \
-          --logs=${output_base_dir} \
           --name=${name} \
           --save-step-frequency=${save_step_frequency} \
           --save-epoch-frequency=${save_epoch_frequency} \
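
Before running the full multi-node scripts, the new launcher can be exercised on a single node. The invocation below is a minimal sketch, not part of the patch: the GPU count, address, and port are placeholder values, and ${train_data}/${val_data} are assumed to be set as in the scripts above. torchrun exports RANK, LOCAL_RANK, and WORLD_SIZE to each worker process itself, which is why the old --use_env flag is dropped.

    # Single-node equivalent of the multi-node form (WORKER_CNT=1, RANK=0);
    # 4 GPUs and a loopback rendezvous, to be adjusted to the local machine.
    torchrun --nproc_per_node=4 --nnodes=1 --node_rank=0 \
             --master_addr=127.0.0.1 --master_port=29500 \
             cn_clip/training/main.py \
             --train-data=${train_data} \
             --val-data=${val_data}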