From de86ce96597a57d950ccfe8d731e660253efc90c Mon Sep 17 00:00:00 2001
From: Dewi Bryn Jones
Date: Mon, 14 Sep 2020 08:34:57 +0100
Subject: [PATCH] ar gyfer/for DeepSpeech 0.8.x

---
 Dockerfile                  | 23 ++++--------
 Dockerfile.train.tmpl       | 68 ++++++++++++++++++++++++++++++++++
 Makefile                    | 29 +++++++++------
 local/build_lm_scorer.sh    | 20 ++++++----
 local/optimize_lm_scorer.sh |  2 +-
 local/run_tl_cv_cy.sh       | 48 +++++++++++++++---------
 local/transcribe.py         | 74 +++++++++++++++++++++++++++++++++++++
 local/utils/audio.py        | 10 +++--
 8 files changed, 216 insertions(+), 58 deletions(-)
 create mode 100644 Dockerfile.train.tmpl
 create mode 100755 local/transcribe.py

diff --git a/Dockerfile b/Dockerfile
index e706aba..be79698 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,30 +8,21 @@ RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.d
     libxslt1-dev libjpeg8-dev zlib1g-dev dos2unix \
     && apt-get clean \
     && git lfs install \
-    && pip install sox wget sklearn pandas python_speech_features virtualenv requests tqdm columnize \
+    && pip install sox wget sklearn pandas python_speech_features virtualenv \
+       webrtcvad requests tqdm columnize praatio \
     && rm -rf /var/lib/apt/lists/*

 ENV LC_ALL cy_GB.UTF-8
 ENV LANG cy_GB.UTF-8
 ENV LANGUAGE cy_GB.UTF-8
-
-# KenLM missing in Mozilla Dockerfile for trainig
-# Build KenLM in /DeepSpeech/native_client/kenlm folder
+#
 WORKDIR /DeepSpeech/native_client
-RUN rm -rf kenlm && \
-    git clone https://github.com/kpu/kenlm && \
-    cd kenlm && \
-    git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec && \
-    mkdir -p build && \
-    cd build && \
-    cmake .. && \
-    make -j $(nproc)
-
-# Done
-WORKDIR /DeepSpeech
-RUN python util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target .
+RUN python ../util/taskcluster.py --target .
+RUN python ../util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target .

 ENV PATH /DeepSpeech/native_client:/DeepSpeech/native_client/kenlm/build/bin:$PATH

+# Done
+WORKDIR /DeepSpeech
diff --git a/Dockerfile.train.tmpl b/Dockerfile.train.tmpl
new file mode 100644
index 0000000..3e534b6
--- /dev/null
+++ b/Dockerfile.train.tmpl
@@ -0,0 +1,68 @@
+# Please refer to the TRAINING documentation, "Basic Dockerfile for training"
+
+FROM tensorflow/tensorflow:1.15.2-gpu-py3
+ENV DEBIAN_FRONTEND=noninteractive
+
+ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO#
+ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA#
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        apt-utils \
+        bash-completion \
+        build-essential \
+        cmake \
+        curl \
+        git \
+        libboost-all-dev \
+        libbz2-dev \
+        locales \
+        python3-venv \
+        unzip \
+        wget
+
+# We need to remove it because it's breaking deepspeech install later with
+# weird errors about setuptools
+RUN apt-get purge -y python3-xdg
+
+# Install dependencies for audio augmentation
+RUN apt-get install -y --no-install-recommends libopus0 libsndfile1
+
+# Try and free some space
+RUN rm -rf /var/lib/apt/lists/*
+
+WORKDIR /
+RUN git clone $DEEPSPEECH_REPO DeepSpeech
+
+WORKDIR /DeepSpeech
+RUN git checkout $DEEPSPEECH_SHA
+
+# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
+RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
+RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
+
+# Prepare deps
+RUN pip3 install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0
+
+# Install DeepSpeech
+# - No need for the decoder since we did it earlier
+# - There is already correct TensorFlow GPU installed on the base image,
+#   we don't want to break that
+RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .
+
+# Tool to convert output graph for inference
+RUN python3 util/taskcluster.py --source tensorflow --branch r1.15 \
+        --artifact convert_graphdef_memmapped_format --target .
+
+# Build KenLM to generate new scorers
+WORKDIR /DeepSpeech/native_client
+RUN rm -rf kenlm && \
+    git clone https://github.com/kpu/kenlm && \
+    cd kenlm && \
+    git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec && \
+    mkdir -p build && \
+    cd build && \
+    cmake .. && \
+    make -j $(nproc)
+WORKDIR /DeepSpeech
+
+RUN ./bin/run-ldc93s1.sh
diff --git a/Makefile b/Makefile
index b91dbc9..2b5be99 100644
--- a/Makefile
+++ b/Makefile
@@ -1,24 +1,28 @@
 default: build

-DEEPSPEECH_RELEASE := 0.7.4
-DEEPSPEECH_BRANCH := v$(DEEPSPEECH_RELEASE)
-TECHIAITH_RELEASE := 20.06
+
+DEEPSPEECH_RELEASE := 0.8.2
+TECHIAITH_RELEASE := 20.09

 run:
-	docker run --gpus all --name techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER} -it \
+	docker run --gpus all --name techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER} -it \
 		-v ${PWD}/data/:/data \
 		-v ${PWD}/checkpoints/:/checkpoints \
 		-v ${PWD}/models/:/models \
 		-v ${PWD}/export/:/export \
 		-v ${PWD}/homedir/:/root \
 		-v ${PWD}/local/:/DeepSpeech/bin/bangor_welsh \
-		techiaith/deepspeech:${DEEPSPEECH_BRANCH} bash
+		--env DEEPSPEECH_RELEASE=${DEEPSPEECH_RELEASE} \
+		--env TECHIAITH_RELEASE=${TECHIAITH_RELEASE} \
+		techiaith/deepspeech:v${DEEPSPEECH_RELEASE} bash

 build:
+	if [ ! -d "DeepSpeech" ]; then \
-		git clone --branch $(DEEPSPEECH_BRANCH) https://github.com/mozilla/DeepSpeech.git; \
-		cd DeepSpeech && make Dockerfile.train DEEPSPEECH_SHA=tags/${DEEPSPEECH_BRANCH} && docker build --rm -t mozilla/deepspeech:${DEEPSPEECH_BRANCH} -f Dockerfile.train .; \
+		git clone https://github.com/mozilla/DeepSpeech.git; \
+		cp Dockerfile.train.tmpl DeepSpeech/; \
 	fi
+	cd DeepSpeech && make Dockerfile.train DEEPSPEECH_SHA=tags/v${DEEPSPEECH_RELEASE} && docker build --rm -t mozilla/deepspeech:v${DEEPSPEECH_RELEASE} -f Dockerfile.train .
 	if [ ! -d "checkpoints/mozilla" ]; then \
 		mkdir -p checkpoints/mozilla; \
 		cd checkpoints/mozilla && \
@@ -45,18 +49,19 @@ build:
 		wget https://github.com/techiaith/docker-deepspeech-cy/releases/download/$(TECHIAITH_RELEASE)/techiaith_bangor_macsen_$(TECHIAITH_RELEASE).scorer && \
 		wget https://github.com/techiaith/docker-deepspeech-cy/releases/download/$(TECHIAITH_RELEASE)/techiaith_bangor_transcribe_$(TECHIAITH_RELEASE).scorer;\
 	fi
-	docker build --build-arg BRANCH=${DEEPSPEECH_BRANCH} --rm -t techiaith/deepspeech:${DEEPSPEECH_BRANCH} .
+	docker build --build-arg BRANCH=v${DEEPSPEECH_RELEASE} --rm -t techiaith/deepspeech:v${DEEPSPEECH_RELEASE} .

 clean:
-	-docker rmi techiaith/deepspeech:${DEEPSPEECH_BRANCH}
-	-docker rmi mozilla/deepspeech:${DEEPSPEECH_BRANCH}
+	-docker rmi techiaith/deepspeech:v${DEEPSPEECH_RELEASE}
+	-docker rmi mozilla/deepspeech:v${DEEPSPEECH_RELEASE}
 	-docker rmi nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04
+	-docker rmi tensorflow/tensorflow:1.15.2-gpu-py3
 	sudo rm -rf DeepSpeech
 	sudo rm -rf homedir
 	sudo rm -rf checkpoints

 stop:
-	-docker stop techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER}
-	-docker rm techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER}
+	-docker stop techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER}
+	-docker rm techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER}
diff --git a/local/build_lm_scorer.sh b/local/build_lm_scorer.sh
index 9275d4f..78c9f55 100755
--- a/local/build_lm_scorer.sh
+++ b/local/build_lm_scorer.sh
@@ -68,16 +68,21 @@ echo "#### Generating package for un-optimized language model package
 echo "#### ####"
 echo "#### Default alpha and beta values used. Previous optimal values were: ####"
 echo "#### ####"
-echo "#### Voice Assistant lm : alpha: 1.7242448485503816 ####"
-echo "####                      beta: 4.9065413926676165 ####"
+echo "#### Voice Assistant Language Model ####"
+echo "####    alpha: 1.7242448485503816 ####"
+echo "####    beta: 4.9065413926676165 ####"
+echo "#### ####"
+echo "#### Transcription Language Model ####"
+echo "####    alpha: 1.1417685444561605 ####"
+echo "####    beta: 0.5798010479098541 ####"
 echo "#### ####"
 echo "####################################################################################"
 set -x
-python3 /DeepSpeech/data/lm/generate_package.py \
+/DeepSpeech/native_client/generate_scorer_package \
   --alphabet "${alphabet_file_path}" \
   --lm lm.binary \
   --vocab vocab-50000.txt \
-  --package kenlm.scorer \
+  --package kenlm.scorer \
   --default_alpha 0.75 \
   --default_beta 1.85

@@ -89,9 +94,8 @@ echo "####################################################################################"
 set -x
 python -u /DeepSpeech/evaluate.py \
 	--test_files "${test_files}" --test_batch_size 1 \
-	--alphabet_config_path "${alphabet_file_path}" \
-	--load_checkpoint_dir "${checkpoint_cy_dir}" \
-	--scorer_path kenlm.scorer
-
+	--alphabet_config_path "${alphabet_file_path}" \
+	--load_checkpoint_dir "${checkpoint_cy_dir}" \
+	--scorer_path kenlm.scorer

 cd -
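Not part of the patch: build_lm_scorer.sh above packages lm.binary and vocab-50000.txt into kenlm.scorer with the generate_scorer_package binary that 0.8.x ships in native_client (hence the Dockerfile change). A minimal Python sketch of the same call, for reference only; the lm_dir argument and the example /data/lm path are assumptions, while the binary path, file names, alphabet path and alpha/beta defaults are taken from the script.

#!/usr/bin/env python3
# Sketch only: drives the same generate_scorer_package call as build_lm_scorer.sh.
import subprocess

def build_scorer(alphabet_path, lm_dir, alpha=0.75, beta=1.85):
    cmd = [
        "/DeepSpeech/native_client/generate_scorer_package",
        "--alphabet", alphabet_path,
        "--lm", "lm.binary",
        "--vocab", "vocab-50000.txt",
        "--package", "kenlm.scorer",
        "--default_alpha", str(alpha),
        "--default_beta", str(beta),
    ]
    # lm.binary and vocab-50000.txt are expected to sit in lm_dir, as in the script.
    subprocess.run(cmd, cwd=lm_dir, check=True)

if __name__ == "__main__":
    build_scorer("/DeepSpeech/bin/bangor_welsh/alphabet.txt", "/data/lm")  # hypothetical lm_dir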
diff --git a/local/optimize_lm_scorer.sh b/local/optimize_lm_scorer.sh
index e67ff10..2be6e36 100755
--- a/local/optimize_lm_scorer.sh
+++ b/local/optimize_lm_scorer.sh
@@ -58,7 +58,7 @@ read -p "Enter best default beta: " beta
 echo "####################################################################################"
 echo "#### Generating package with optimal alpha and beta ####"
 echo "####################################################################################"
-python3 /DeepSpeech/data/lm/generate_package.py \
+/DeepSpeech/native_client/generate_scorer_package \
   --alphabet "${alphabet_file_path}" \
   --lm lm.binary \
   --vocab vocab-50000.txt \
diff --git a/local/run_tl_cv_cy.sh b/local/run_tl_cv_cy.sh
index d31423d..47ae6a6 100755
--- a/local/run_tl_cv_cy.sh
+++ b/local/run_tl_cv_cy.sh
@@ -1,19 +1,6 @@
 #!/bin/bash
 set -e

-###
-model_name='bangor'
-model_language='cy-Latn-GB'
-model_license='CC-BY-4.0'
-model_description='Welsh language acoustic model trained using transfer learning and approximately 90hrs of validated and other Welsh speech data from the Mozilla CommonVoice June 2020 release.'
-
-model_author='techiaith'
-model_contact_info='techiaith@bangor.ac.uk'
-
-model_version='20.07'
-deepspeech_version='0.7.4'
-
-
 ###
 csv_dir=''

 while getopts ":a:" opt; do
@@ -33,12 +20,34 @@ if [ -z "${csv_dir}" ]; then
   exit 2
 fi

+###
+model_name='bangor-welsh'
+model_language='cy-Latn-GB'
+model_license='CC-BY-4.0'
+model_description='Welsh language acoustic model trained using transfer learning and approximately 90hrs of validated and other Welsh speech data from the Mozilla CommonVoice June 2020 release.'
+
+model_author='techiaith'
+model_contact_info='techiaith@bangor.ac.uk'
+
+echo
+echo "####################################################################################"
+echo " model_name : ${model_name}"
+echo " model_language : ${model_language}"
+echo " model_license : ${model_license}"
+echo " model_description : ${model_description}"
+echo " model_author : ${model_author}"
+echo " model_contact_info : ${model_contact_info}"
+echo " model_version : ${TECHIAITH_RELEASE} "
+echo " DeepSpeech Version : ${DEEPSPEECH_RELEASE} "
+echo "####################################################################################"
+echo
+
 ###
 train_files=${csv_dir}/validated.clean.csv,${csv_dir}/other.clean.csv
 alphabet_cy_file=/DeepSpeech/bin/bangor_welsh/alphabet.txt
 checkpoint_dir=/checkpoints
-export_dir=/export/${deepspeech_version}_${model_version}
+export_dir=/export/${DEEPSPEECH_RELEASE}_${TECHIAITH_RELEASE}

 ### Force UTF-8 output
@@ -58,6 +67,7 @@ mkdir -p ${export_dir}
 cp -r /checkpoints/mozilla/deepspeech-en-checkpoint/ $checkpoint_en_dir

 ###
+echo
 echo "####################################################################################"
 echo "#### Transfer to WELSH model with --save_checkpoint_dir --load_checkpoint_dir ####"
 echo "####################################################################################"
@@ -72,6 +82,7 @@ python -u DeepSpeech.py \

 set +x

+echo
 echo "####################################################################################"
 echo "#### Export new Welsh checkpoint to frozen model ####"
 echo "####################################################################################"
@@ -86,21 +97,22 @@ python -u DeepSpeech.py \
   --export_dir "${export_dir}" \
   --export_author_id "${model_author}" \
   --export_model_name "${model_name}" \
-  --export_model_version "${model_version}" \
+  --export_model_version "${TECHIAITH_RELEASE}" \
   --export_contact_info "${model_contact_info}" \
   --export_license "${model_license}" \
   --export_language "${model_language}" \
-  --export_min_ds_version "${deepspeech_version}" \
-  --export_max_ds_version "${deepspeech_version}" \
+  --export_min_ds_version "${DEEPSPEECH_RELEASE}" \
+  --export_max_ds_version "${DEEPSPEECH_RELEASE}" \
   --export_description "${model_description}"

 ###
-/DeepSpeech/convert_graphdef_memmapped_format \
+/DeepSpeech/native_client/convert_graphdef_memmapped_format \
   --in_graph=${export_dir}/output_graph.pb \
   --out_graph=${export_dir}/output_graph.pbmm

 set +x

+echo
 echo "####################################################################################"
 echo "#### Exported acoustic models (.pb/.pbmm files) can be found in ${export_dir} "
 echo "####################################################################################"
diff --git a/local/transcribe.py b/local/transcribe.py
new file mode 100755
index 0000000..546a6c9
--- /dev/null
+++ b/local/transcribe.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import os
+import sys
+import pathlib
+import shlex
+import shutil
+import subprocess
+import glob
+
+import json
+
+from praatio import tgio
+from utils.audio import downsample_wavfile
+
+from utils.clean_transcript import clean_transcript
+from argparse import ArgumentParser, RawTextHelpFormatter
+
+DESCRIPTION = """
+"""
+
+ALPHABET_FILE_PATH = "/DeepSpeech/bin/bangor_welsh/alphabet.txt"
+TECHIAITH_RELEASE = os.environ['TECHIAITH_RELEASE']
+CHECKPOINTS_DIR = "/checkpoints/cy"
+LANGUAGE_MODEL = "/models/techiaith/techiaith_bangor_transcribe_%s.scorer" % TECHIAITH_RELEASE
+
+
+def convert_json_to_textgrid(wav_file_path, transcript_file_path):
+
+    textgrid_file_path = transcript_file_path.replace(".tlog", ".TextGrid")
+
+    with open(transcript_file_path) as json_file:
+        textgrid_entries_list = []
+        json_data = json.load(json_file)
+        for transcript in json_data:
+            start_seconds = float(transcript["start"] / 1000)
+            end_seconds = float(transcript["end"] / 1000)
+            textgrid_entry = (start_seconds, end_seconds, transcript["transcript"])
+            textgrid_entries_list.append(textgrid_entry)
+
+        utterance_tier = tgio.IntervalTier('utterance', textgrid_entries_list, 0, pairedWav=wav_file_path)
+        tg = tgio.Textgrid()
+        tg.addTier(utterance_tier)
+        tg.save(textgrid_file_path, useShortForm=False, outputFormat='textgrid')
+
+    print("Textgrid of transcription saved to %s" % textgrid_file_path)
+
+
+def main(wav_file_path, **args):
+
+    cmd = "python3 /DeepSpeech/transcribe.py --src %s --checkpoint_dir %s --alphabet_config_path %s --scorer %s --force"
+    cmd = cmd % (wav_file_path, CHECKPOINTS_DIR, ALPHABET_FILE_PATH, LANGUAGE_MODEL)
+
+    downsample_wavfile(wav_file_path)
+
+    import_process = subprocess.Popen(shlex.split(cmd))
+    import_process.wait()
+
+    transcript_file = wav_file_path.replace(".wav", ".tlog")
+
+    convert_json_to_textgrid(wav_file_path, transcript_file)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter)
+
+    parser.add_argument("--wavfile", dest="wav_file_path", required=True, help="path to the wav file to be transcribed")
+    #parser.add_argument("--target_dir", dest="cv_root_dir", required=True, help="target directory for extracted archive, also root directory for training data")
+
+    parser.set_defaults(func=main)
+    args = parser.parse_args()
+    args.func(**vars(args))
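Not part of the patch: the new transcribe.py wraps DeepSpeech's own transcribe.py and then turns its .tlog output into a Praat TextGrid. A small self-contained sketch of that conversion step, assuming the .tlog is a JSON list of segments with millisecond "start"/"end" timestamps (which is what convert_json_to_textgrid() above expects); the sample data is invented for illustration.

# Sketch only: the .tlog -> TextGrid interval mapping from convert_json_to_textgrid().
import json

sample_tlog = json.dumps([
    {"start": 0,    "end": 2300, "transcript": "bore da"},
    {"start": 2300, "end": 5100, "transcript": "sut mae'r tywydd heddiw"},
])

entries = []
for segment in json.loads(sample_tlog):
    # convert millisecond offsets to the seconds that praatio interval tiers expect
    start_seconds = segment["start"] / 1000.0
    end_seconds = segment["end"] / 1000.0
    entries.append((start_seconds, end_seconds, segment["transcript"]))

print(entries)
# [(0.0, 2.3, 'bore da'), (2.3, 5.1, "sut mae'r tywydd heddiw")]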
diff --git a/local/utils/audio.py b/local/utils/audio.py
index d7623b1..18f427b 100755
--- a/local/utils/audio.py
+++ b/local/utils/audio.py
@@ -6,10 +6,11 @@ import shutil

 import wave

-from sox import Transformer
+import sox

 import pandas as pd
 import numpy as np
+
 import scipy.io.wavfile as wav

 from python_speech_features import mfcc
@@ -18,6 +19,9 @@ def downsample_wavfile(wavfile):

+    if sox.file_info.sample_rate(wavfile)==16000.0:
+        return
+
     temp_48kHz_wavfile = wavfile.replace(".wav","_48kHz.wav")
     shutil.move(wavfile, temp_48kHz_wavfile)
     transform_audio(temp_48kHz_wavfile, wavfile)
@@ -37,12 +41,12 @@ def convert_mp3(mp3file):

     return False

-
 def transform_audio(old_file, new_file):
-    tf = Transformer()
+    tf = sox.Transformer()
     tf.convert(samplerate=16000, n_channels=1)
     tf.build(old_file, new_file)

+
 def get_duration_wav(wavfile):
     f = wave.open(wavfile, 'r')
     frames = f.getnframes()
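Not part of the patch: audio.py now imports the sox module directly so downsample_wavfile() can skip files that are already at 16 kHz. A short usage sketch; the file name is hypothetical, and it assumes pysox plus the SoX binary are installed and that it is run from the local/ directory (as transcribe.py is) so that utils.audio resolves.

# Sketch only: exercising the updated downsample_wavfile() helper.
import sox
from utils.audio import downsample_wavfile

wav_path = "recording.wav"                   # hypothetical input file

print(sox.file_info.sample_rate(wav_path))   # e.g. 48000.0
downsample_wavfile(wav_path)                 # returns early if the file is already 16 kHz
print(sox.file_info.sample_rate(wav_path))   # 16000.0, mono, rewritten in place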