#!/bin/bash
# Copyright 2022 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrain_dataset_name=$1            # e.g. wikipedia-20220401.en,bookcorpus
task_name=$2                        # e.g. cola, sst2, mrpc, stsb, qqp, mnli, qnli, rte, wnli
prefix=$3                           # Any string, e.g. epoch-2_batch-size-16_init-lr-5e-5
num_epoch=$4                        # e.g. 3
batch_size=$5                       # e.g. 16
init_lr=$6                          # Initial learning rate, e.g. 5e-5
model_path=$7                       # e.g. pretrain/saved_models/xxx/elastic-step-decay_step-based-num-iter-1000_optimizer-adamw_batch-size-16_init-lr-1.0_interval-shrink-rate-1.4142_cr-k-3
model_name=$8                       # e.g. elastic-step-decay_step-based-num-iter-1000_optimizer-adamw_batch-size-16_init-lr-1.0_interval-shrink-rate-1.4142_cr-k-3
                                    # In our case, it is the basename of ${model_path} and
                                    # describes the pretrained model's lr scheduler setting
                                    # during pretraining
save_finetune_checkpoint_at_end=$9  # "True" or "False"
prerequisite_file=${10}             # None or a file path; the script will not invoke
                                    # `run_glue.py` until this file exists
num_gpu=${11}
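
# Illustrative invocation (the argument values below are hypothetical examples,
# not taken from a real experiment):
#   bash run_glue.sh wikipedia-20220401.en,bookcorpus cola \
#       epoch-3_batch-size-16_init-lr-5e-5 3 16 5e-5 \
#       pretrain/saved_models/my_model my_model True "" 4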
if [ ${num_gpu} -ne 1 -a ${num_gpu} -ne 2 -a ${num_gpu} -ne 4 -a ${num_gpu} -ne 8 -a ${num_gpu} -ne 16 ]; then
    echo "$(date): [ERROR] process $$: unsupported number of gpus ${num_gpu}. Valid options: [1, 2, 4, 8, 16]"
    exit 1
fi
per_device_train_batch_size=$(python -c "print(${batch_size} // ${num_gpu})")
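# e.g. batch_size=16 with num_gpu=4 gives per_device_train_batch_size=4, so the
# effective (global) train batch size stays equal to ${batch_size}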
per_device_eval_batch_size=16 # Default value in HuggingFace is 8
shared_dir_suffix="finetune/${pretrain_dataset_name}/${task_name}/${model_name}"
log_dir="log/${shared_dir_suffix}"
tmp_dir="tmp/${shared_dir_suffix}"
output_dir="output/${shared_dir_suffix}/${prefix}"
output_test_dir="output_test/${shared_dir_suffix}"
mkdir -p ${log_dir} ${tmp_dir} ${output_dir} ${output_test_dir}
ongoing_mark_file=${tmp_dir}/${prefix}.ongoing.mark
completed_mark_file=${tmp_dir}/${prefix}.completed.mark
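# Illustrative layout produced by the variables above (values are placeholders):
#   log/finetune/<pretrain_dataset_name>/<task_name>/<model_name>/<prefix>.log
#   tmp/finetune/<pretrain_dataset_name>/<task_name>/<model_name>/<prefix>.ongoing.mark
#   output/finetune/<pretrain_dataset_name>/<task_name>/<model_name>/<prefix>/
#   output_test/finetune/<pretrain_dataset_name>/<task_name>/<model_name>/<prefix>.txt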
# Skips the experiment if it is currently being executed by another process or
# has already been completed
if [ -f ${ongoing_mark_file} -o -f ${completed_mark_file} ]; then
    exit
fi
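# On interruption, remove the marker files and any partial output so that a
# later invocation is not skipped by the ongoing/completed checks above.
# (SIGKILL cannot be caught, so only SIGINT and SIGTERM are trapped.)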
trap "rm -rf ${ongoing_mark_file} ${completed_mark_file} ${output_dir}; exit" SIGINT SIGTERM
touch "${ongoing_mark_file}"
echo "$(date): process: $$, task: ${task_name}, num_epoch: ${num_epoch}, batch_size: ${batch_size}, init_lr: ${init_lr} start..."
# Waits until the prerequisite file exists
wait_time=0
check_frequency=30 # unit of measure: seconds
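# The prerequisite file is expected to be created by the upstream job (e.g. the
# corresponding intermediate pretraining run) once its result is ready; a plain
# `touch "${prerequisite_file}"` from that job is enough to unblock this script.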
if [ "${prerequisite_file}" != "" -a "${prerequisite_file}" != "None" ]; then
    while true; do
        if [ -f "${prerequisite_file}" ]; then
            if [ ${wait_time} -gt 0 ]; then
                echo "$(date): process $$: prerequisite file completed, start training."
            fi
            break
        fi
        if [ ${wait_time} -eq 0 ]; then
            echo "$(date): process $$: prerequisite file hasn't completed yet, waiting."
        fi
        sleep ${check_frequency}
        wait_time=$(( wait_time + check_frequency ))
        if [ ${wait_time} -eq 21600 ]; then
            echo "$(date): [WARNING] process $$ has waited for at least 6 hours; the corresponding intermediate training normally takes at most 3-4 hours to complete. Please check whether the intermediate training result for ${prerequisite_file} is normal and whether the run is still healthy."
        fi
    done
fi
# Finds the saved model binary for finetuning
#
# 1) "pretraining_experiment-" = "{job_name}-{current_run_id}" is the default
#    job name "pretraining_experiment" followed by an empty current run id
#    (also the default).
#
# 2) "epoch1000000_step-*" = "epoch-{num_epoch}_step-{actual_num_step}" is
#    the model saved at epoch 1000000 (the default number of epochs, which
#    can also be used to refer to the last saved model) together with the
#    number of steps actually run during training.
#
#    Under some scenarios, "actual_num_step" != "num_step" because of dynamic
#    behavior in the trainer, e.g. skipping steps with gradient overflows,
#    which are caused by the dynamic loss scaling used in mixed precision
#    training. For example, with num_step = 1000, actual_num_step can be 1016
#    due to gradient overflows in the first 16 steps.
#
#    This doesn't matter here: only one such directory exists, so we simply
#    take the first (and only) match.
model_bin_parent_dir="${model_path}/pretraining_experiment-"
model_bin_subdir=$(ls ${model_bin_parent_dir} | grep epoch1000000_step | head -1)
model_bin_path=${model_bin_parent_dir}/${model_bin_subdir}
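# Illustrative result (the step count 1016 is just the example from the comment
# above):
#   model_bin_path=${model_path}/pretraining_experiment-/epoch1000000_step-1016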
python run_glue.py \
    --model_name_or_path ${model_bin_path} \
    --task_name ${task_name} \
    --max_seq_length 128 \
    --output_dir ${output_dir} \
    --overwrite_output_dir \
    --do_train \
    --do_eval \
    --do_predict \
    --per_device_train_batch_size ${per_device_train_batch_size} \
    --gradient_accumulation_steps 1 \
    --per_device_eval_batch_size ${per_device_eval_batch_size} \
    --learning_rate ${init_lr} \
    --weight_decay 0.1 \
    --logging_strategy steps \
    --evaluation_strategy no \
    --save_strategy no \
    --max_grad_norm 1.0 \
    --num_train_epochs ${num_epoch} \
    --lr_scheduler_type polynomial \
    --warmup_ratio 0.06 \
    --disable_tqdm True \
    --finetune_checkpoint_at_end ${save_finetune_checkpoint_at_end} \
    > ${log_dir}/${prefix}.log \
    2> ${log_dir}/${prefix}.err
# Copies the prediction results to ${output_test_dir}
cp ${output_dir}/predict_results_${task_name}.txt ${output_test_dir}/${prefix}.txt
if [ "${task_name}" = "mnli" ]; then
    cp ${output_dir}/predict_results_${task_name}-mm.txt ${output_test_dir}/${prefix}-mm.txt
fi
# We need to keep the MNLI models since they are used for RTE, MRPC, and STSB
# finetuning. This is one of the tricks used in 24h-bert.
if [ "${task_name}" != "mnli" ]; then
    rm -rf ${output_dir}
fi
touch ${completed_mark_file}
echo "$(date): process: $$, task: ${task_name}, num_epoch: ${num_epoch}, batch_size: ${batch_size}, init_lr: ${init_lr} done"