Skip to content

Commit

Permalink
Add wenetspeech run.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
pkufool committed Feb 5, 2024
1 parent f2f4087 commit 91f1382
Show file tree
Hide file tree
Showing 2 changed files with 199 additions and 2 deletions.
197 changes: 197 additions & 0 deletions egs/wenetspeech/KWS/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#!/usr/bin/env bash
#
# Driver for the zipformer keyword-spotting recipe. Each numbered stage below
# runs when stage <= N <= stop_stage:
#   -1 placeholder for downloading a pre-trained model
#    0 train            1 decode             2 export
#    3 fine-tune        4 decode fine-tuned  5 export fine-tuned
#
# The defaults assigned below are set before shared/parse_options.sh is
# sourced; presumably that helper lets them be overridden from the command
# line (e.g. --stage / --stop-stage) -- confirm against parse_options.sh.

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -euo pipefail

export CUDA_VISIBLE_DEVICES="0,1,2,3"
# PYTHONPATH may legitimately be unset. Under `set -u` a bare $PYTHONPATH would
# abort the script, so append the previous value (and its ':' separator) only
# when there is one.
export PYTHONPATH="../../../${PYTHONPATH:+:${PYTHONPATH}}"

stage=0
stop_stage=100

pre_trained_model_host=github

. shared/parse_options.sh || exit 1

log() {
  # Print a timestamped message tagged with the caller's location, in the form
  #   YYYY-mm-dd HH:MM:SS (<file>:<line>:<function>) <message>
  # All arguments are joined into the message; `echo -e` interprets backslash
  # escapes in it. (Adapted from espnet.)
  local caller_file="${BASH_SOURCE[1]}"
  # Keep only the basename of the calling script.
  caller_file="${caller_file##*/}"
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}


# Stage -1 is currently a placeholder: it announces the download step but
# performs no work.
if [ "$stage" -le -1 ] && [ "$stop_stage" -ge -1 ]; then
  log "Stage -1: Download a pre-trained model."
fi



# Stage 0: train the zipformer model from scratch on 4 processes
# (--world-size 4), writing checkpoints to zipformer/exp.
if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Train a model."
  # Refuse to train unless the marker file under data/fbank exists.
  # NOTE(review): the marker is named ".gigaspeech.done" although this is the
  # wenetspeech recipe -- confirm against prepare.sh which file it really
  # writes before changing it.
  if [ ! -e data/fbank/.gigaspeech.done ]; then
    log "You need to run the prepare.sh first."
    # Exit statuses must lie in 0-255; `exit -1` is out of range.
    exit 1
  fi

  python ./zipformer/train.py \
    --world-size 4 \
    --exp-dir zipformer/exp \
    --decoder-dim 320 \
    --joiner-dim 320 \
    --num-encoder-layers 1,1,1,1,1,1 \
    --feedforward-dim 192,192,192,192,192,192 \
    --encoder-dim 128,128,128,128,128,128 \
    --encoder-unmasked-dim 128,128,128,128,128,128 \
    --num-epochs 15 \
    --lr-epochs 1.5 \
    --use-fp16 1 \
    --start-epoch 1 \
    --training-subset L \
    --pinyin-type partial_with_tone \
    --causal 1 \
    --lang-dir data/lang_partial_tone \
    --max-duration 1000
fi

# Stage 1: decode with the model trained in stage 0, once per keyword set,
# using ./data/commands_<set>.txt as the keyword list.
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Decode the model."
  # Shell word lists are separated by whitespace only: writing "small, large"
  # would make the first item "small," (comma included), producing a wrong
  # --test-set value and a wrong keywords file name.
  for t in small large; do
    python ./zipformer/decode.py \
      --epoch 15 \
      --avg 2 \
      --exp-dir ./zipformer/exp \
      --lang-dir ./data/lang_partial_tone \
      --pinyin-type partial_with_tone \
      --causal 1 \
      --chunk-size 16 \
      --left-context-frames 64 \
      --decoder-dim 320 \
      --joiner-dim 320 \
      --num-encoder-layers 1,1,1,1,1,1 \
      --feedforward-dim 192,192,192,192,192,192 \
      --encoder-dim 128,128,128,128,128,128 \
      --encoder-unmasked-dim 128,128,128,128,128,128 \
      --test-set "$t" \
      --keywords-score 1.0 \
      --keywords-threshold 0.35 \
      --keywords-file "./data/commands_${t}.txt" \
      --max-duration 3000
  done
fi

# Stage 2: export the trained model twice -- first with export.py, then with
# export_onnx_streaming.py. Both calls read epoch 15 / avg 2 from
# zipformer/exp.
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Export the model."

  # Model-shape flags that are identical for both exporters. They are spliced
  # in at the same position as before, so each command line is unchanged.
  export_model_args=(
    --decoder-dim 320
    --joiner-dim 320
    --num-encoder-layers 1,1,1,1,1,1
    --feedforward-dim 192,192,192,192,192,192
    --encoder-dim 128,128,128,128,128,128
    --encoder-unmasked-dim 128,128,128,128,128,128
  )

  python ./zipformer/export.py \
    --epoch 15 \
    --avg 2 \
    --exp-dir ./zipformer/exp \
    --tokens data/lang_partial_tone/tokens.txt \
    --causal 1 \
    --chunk-size 16 \
    --left-context-frames 64 \
    "${export_model_args[@]}"

  # Note: this call uses --left-context-frames 128, not the 64 used above.
  python ./zipformer/export_onnx_streaming.py \
    --exp-dir zipformer/exp \
    --tokens data/lang_partial_tone/tokens.txt \
    --epoch 15 \
    --avg 2 \
    --chunk-size 16 \
    --left-context-frames 128 \
    "${export_model_args[@]}" \
    --causal 1
fi

# Stage 3: fine-tune for 10 epochs starting from an existing checkpoint,
# writing the results to zipformer/exp_finetune.
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Finetune the model"

  # The following configuration of lr schedule should work well
  # You may also tune the following parameters to adjust learning rate schedule
  base_lr=0.0005
  lr_epochs=100
  lr_batches=100000

  # We recommend to start from an averaged model
  finetune_ckpt=zipformer/exp/pretrained.pt

  # Invoked through `python` like every other stage, so the script does not
  # need its executable bit set. Every option line must end in a backslash;
  # otherwise the command is cut short and the remaining options are run as a
  # separate (failing) command.
  python ./zipformer/finetune.py \
    --world-size 4 \
    --num-epochs 10 \
    --start-epoch 1 \
    --exp-dir zipformer/exp_finetune \
    --lang-dir ./data/lang_partial_tone \
    --pinyin-type partial_with_tone \
    --use-fp16 1 \
    --decoder-dim 320 \
    --joiner-dim 320 \
    --num-encoder-layers 1,1,1,1,1,1 \
    --feedforward-dim 192,192,192,192,192,192 \
    --encoder-dim 128,128,128,128,128,128 \
    --encoder-unmasked-dim 128,128,128,128,128,128 \
    --causal 1 \
    --base-lr "$base_lr" \
    --lr-epochs "$lr_epochs" \
    --lr-batches "$lr_batches" \
    --finetune-ckpt "$finetune_ckpt" \
    --max-duration 1500
fi

# Stage 4: decode with the fine-tuned model from zipformer/exp_finetune, once
# per keyword set, using ./data/commands_<set>.txt as the keyword list.
if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Decode the finetuned model."
  # Whitespace-separated list: a comma after "small" would become part of the
  # item and corrupt both --test-set and the keywords file name.
  for t in small large; do
    # The fine-tuning stage of this script trains for --num-epochs 10, so
    # epoch 10 is the last checkpoint that can exist in exp_finetune.
    python ./zipformer/decode.py \
      --epoch 10 \
      --avg 2 \
      --exp-dir ./zipformer/exp_finetune \
      --lang-dir ./data/lang_partial_tone \
      --pinyin-type partial_with_tone \
      --causal 1 \
      --chunk-size 16 \
      --left-context-frames 64 \
      --decoder-dim 320 \
      --joiner-dim 320 \
      --num-encoder-layers 1,1,1,1,1,1 \
      --feedforward-dim 192,192,192,192,192,192 \
      --encoder-dim 128,128,128,128,128,128 \
      --encoder-unmasked-dim 128,128,128,128,128,128 \
      --test-set "$t" \
      --keywords-score 1.0 \
      --keywords-threshold 0.35 \
      --keywords-file "./data/commands_${t}.txt" \
      --max-duration 3000
  done
fi

# Stage 5: export the fine-tuned model from zipformer/exp_finetune with
# export_onnx_streaming.py.
if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
  log "Stage 5: Export the finetuned model."

  # The fine-tuning stage of this script trains for --num-epochs 10, so
  # epoch 10 is the last checkpoint that can exist in exp_finetune.
  python ./zipformer/export_onnx_streaming.py \
    --exp-dir zipformer/exp_finetune \
    --tokens data/lang_partial_tone/tokens.txt \
    --epoch 10 \
    --avg 2 \
    --chunk-size 16 \
    --left-context-frames 128 \
    --decoder-dim 320 \
    --joiner-dim 320 \
    --num-encoder-layers 1,1,1,1,1,1 \
    --feedforward-dim 192,192,192,192,192,192 \
    --encoder-dim 128,128,128,128,128,128 \
    --encoder-unmasked-dim 128,128,128,128,128,128 \
    --causal 1
fi
4 changes: 2 additions & 2 deletions egs/wenetspeech/KWS/zipformer/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,9 @@ def get_parser():
default="partial_with_tone",
help="""
The style of the output pinyin, should be:
full_with_tone : zhong1 guo2
full_with_tone : zhōng guó
full_no_tone : zhong guo
partial_with_tone : zh ong1 g uo2
partial_with_tone : zh ōng g uó
partial_no_tone : zh ong g uo
""",
)
Expand Down

0 comments on commit 91f1382

Please sign in to comment.