From de86ce96597a57d950ccfe8d731e660253efc90c Mon Sep 17 00:00:00 2001
From: Dewi Bryn Jones
Date: Mon, 14 Sep 2020 08:34:57 +0100
Subject: [PATCH] ar gyfer/for DeepSpeech 0.8.x

---
 Dockerfile                  | 23 ++++--------
 Dockerfile.train.tmpl       | 68 ++++++++++++++++++++++++++++++++++
 Makefile                    | 29 +++++++++------
 local/build_lm_scorer.sh    | 20 ++++++----
 local/optimize_lm_scorer.sh |  2 +-
 local/run_tl_cv_cy.sh       | 48 +++++++++++++++---------
 local/transcribe.py         | 74 +++++++++++++++++++++++++++++++++++++
 local/utils/audio.py        | 10 +++--
 8 files changed, 216 insertions(+), 58 deletions(-)
 create mode 100644 Dockerfile.train.tmpl
 create mode 100755 local/transcribe.py

diff --git a/Dockerfile b/Dockerfile
index e706aba..be79698 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,30 +8,21 @@ RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.d
     libxslt1-dev libjpeg8-dev zlib1g-dev dos2unix \
     && apt-get clean \
     && git lfs install \
-    && pip install sox wget sklearn pandas python_speech_features virtualenv requests tqdm columnize \
+    && pip install sox wget sklearn pandas python_speech_features virtualenv \
+       webrtcvad requests tqdm columnize praatio \
     && rm -rf /var/lib/apt/lists/*

 ENV LC_ALL cy_GB.UTF-8
 ENV LANG cy_GB.UTF-8
 ENV LANGUAGE cy_GB.UTF-8
-
-# KenLM missing in Mozilla Dockerfile for trainig
-# Build KenLM in /DeepSpeech/native_client/kenlm folder
+#
 WORKDIR /DeepSpeech/native_client
-RUN rm -rf kenlm && \
-    git clone https://github.com/kpu/kenlm && \
-    cd kenlm && \
-    git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec && \
-    mkdir -p build && \
-    cd build && \
-    cmake .. && \
-    make -j $(nproc)
-
-# Done
-WORKDIR /DeepSpeech
-RUN python util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target .
+RUN python ../util/taskcluster.py --target .
+RUN python ../util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target .

 ENV PATH /DeepSpeech/native_client:/DeepSpeech/native_client/kenlm/build/bin:$PATH

+# Done
+WORKDIR /DeepSpeech
diff --git a/Dockerfile.train.tmpl b/Dockerfile.train.tmpl
new file mode 100644
index 0000000..3e534b6
--- /dev/null
+++ b/Dockerfile.train.tmpl
@@ -0,0 +1,68 @@
+# Please refer to the TRAINING documentation, "Basic Dockerfile for training"
+
+FROM tensorflow/tensorflow:1.15.2-gpu-py3
+ENV DEBIAN_FRONTEND=noninteractive
+
+ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO#
+ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA#
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        apt-utils \
+        bash-completion \
+        build-essential \
+        cmake \
+        curl \
+        git \
+        libboost-all-dev \
+        libbz2-dev \
+        locales \
+        python3-venv \
+        unzip \
+        wget
+
+# We need to remove it because it's breaking deepspeech install later with
+# weird errors about setuptools
+RUN apt-get purge -y python3-xdg
+
+# Install dependencies for audio augmentation
+RUN apt-get install -y --no-install-recommends libopus0 libsndfile1
+
+# Try and free some space
+RUN rm -rf /var/lib/apt/lists/*
+
+WORKDIR /
+RUN git clone $DEEPSPEECH_REPO DeepSpeech
+
+WORKDIR /DeepSpeech
+RUN git checkout $DEEPSPEECH_SHA
+
+# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
+RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
+RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
+
+# Prepare deps
+RUN pip3 install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0
+
+# Install DeepSpeech
+# - No need for the decoder since we did it earlier
+# - There is already correct TensorFlow GPU installed on the base image,
+#   we don't want to break that
+RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .
+
+# Tool to convert output graph for inference
+RUN python3 util/taskcluster.py --source tensorflow --branch r1.15 \
+        --artifact convert_graphdef_memmapped_format --target .
+
+# Build KenLM to generate new scorers
+WORKDIR /DeepSpeech/native_client
+RUN rm -rf kenlm && \
+    git clone https://github.com/kpu/kenlm && \
+    cd kenlm && \
+    git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec && \
+    mkdir -p build && \
+    cd build && \
+    cmake .. && \
+    make -j $(nproc)
+WORKDIR /DeepSpeech
+
+RUN ./bin/run-ldc93s1.sh
diff --git a/Makefile b/Makefile
index b91dbc9..2b5be99 100644
--- a/Makefile
+++ b/Makefile
@@ -1,24 +1,28 @@
 default: build

-DEEPSPEECH_RELEASE := 0.7.4
-DEEPSPEECH_BRANCH := v$(DEEPSPEECH_RELEASE)
-TECHIAITH_RELEASE := 20.06
+
+DEEPSPEECH_RELEASE := 0.8.2
+TECHIAITH_RELEASE := 20.09

 run:
-	docker run --gpus all --name techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER} -it \
+	docker run --gpus all --name techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER} -it \
 		-v ${PWD}/data/:/data \
 		-v ${PWD}/checkpoints/:/checkpoints \
 		-v ${PWD}/models/:/models \
 		-v ${PWD}/export/:/export \
 		-v ${PWD}/homedir/:/root \
 		-v ${PWD}/local/:/DeepSpeech/bin/bangor_welsh \
-		techiaith/deepspeech:${DEEPSPEECH_BRANCH} bash
+		--env DEEPSPEECH_RELEASE=${DEEPSPEECH_RELEASE} \
+		--env TECHIAITH_RELEASE=${TECHIAITH_RELEASE} \
+		techiaith/deepspeech:v${DEEPSPEECH_RELEASE} bash

 build:
+	if [ ! -d "DeepSpeech" ]; then \
-		git clone --branch $(DEEPSPEECH_BRANCH) https://github.com/mozilla/DeepSpeech.git; \
-		cd DeepSpeech && make Dockerfile.train DEEPSPEECH_SHA=tags/${DEEPSPEECH_BRANCH} && docker build --rm -t mozilla/deepspeech:${DEEPSPEECH_BRANCH} -f Dockerfile.train .; \
+		git clone https://github.com/mozilla/DeepSpeech.git; \
+		cp Dockerfile.train.tmpl DeepSpeech/; \
 	fi
+	cd DeepSpeech && make Dockerfile.train DEEPSPEECH_SHA=tags/v${DEEPSPEECH_RELEASE} && docker build --rm -t mozilla/deepspeech:v${DEEPSPEECH_RELEASE} -f Dockerfile.train .
 	if [ ! -d "checkpoints/mozilla" ]; then \
 		mkdir -p checkpoints/mozilla; \
 		cd checkpoints/mozilla && \
@@ -45,18 +49,19 @@ build:
 		wget https://github.com/techiaith/docker-deepspeech-cy/releases/download/$(TECHIAITH_RELEASE)/techiaith_bangor_macsen_$(TECHIAITH_RELEASE).scorer && \
 		wget https://github.com/techiaith/docker-deepspeech-cy/releases/download/$(TECHIAITH_RELEASE)/techiaith_bangor_transcribe_$(TECHIAITH_RELEASE).scorer;\
 	fi
-	docker build --build-arg BRANCH=${DEEPSPEECH_BRANCH} --rm -t techiaith/deepspeech:${DEEPSPEECH_BRANCH} .
+	docker build --build-arg BRANCH=v${DEEPSPEECH_RELEASE} --rm -t techiaith/deepspeech:v${DEEPSPEECH_RELEASE} .

 clean:
-	-docker rmi techiaith/deepspeech:${DEEPSPEECH_BRANCH}
-	-docker rmi mozilla/deepspeech:${DEEPSPEECH_BRANCH}
+	-docker rmi techiaith/deepspeech:v${DEEPSPEECH_RELEASE}
+	-docker rmi mozilla/deepspeech:v${DEEPSPEECH_RELEASE}
 	-docker rmi nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04
+	-docker rmi tensorflow/tensorflow:1.15.2-gpu-py3
 	sudo rm -rf DeepSpeech
 	sudo rm -rf homedir
 	sudo rm -rf checkpoints

 stop:
-	-docker stop techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER}
-	-docker rm techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER}
+	-docker stop techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER}
+	-docker rm techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER}
diff --git a/local/build_lm_scorer.sh b/local/build_lm_scorer.sh
index 9275d4f..78c9f55 100755
--- a/local/build_lm_scorer.sh
+++ b/local/build_lm_scorer.sh
@@ -68,16 +68,21 @@ echo "#### Generating package for un-optimized language model package
 echo "#### ####"
 echo "#### Default alpha and beta values used. Previous optimal values were: ####"
 echo "#### ####"
-echo "#### Voice Assistant lm : alpha: 1.7242448485503816 ####"
-echo "####                      beta: 4.9065413926676165 ####"
+echo "#### Voice Assistant Language Model ####"
+echo "####    alpha: 1.7242448485503816 ####"
+echo "####    beta: 4.9065413926676165 ####"
+echo "#### ####"
+echo "#### Transcription Language Model ####"
+echo "####    alpha: 1.1417685444561605 ####"
+echo "####    beta: 0.5798010479098541 ####"
 echo "#### ####"
 echo "####################################################################################"
 set -x
-python3 /DeepSpeech/data/lm/generate_package.py \
+/DeepSpeech/native_client/generate_scorer_package \
   --alphabet "${alphabet_file_path}" \
   --lm lm.binary \
   --vocab vocab-50000.txt \
-  --package kenlm.scorer \
+  --package kenlm.scorer \
   --default_alpha 0.75 \
   --default_beta 1.85

@@ -89,9 +94,8 @@ echo "####################################################################################"
 set -x
 python -u /DeepSpeech/evaluate.py \
 	--test_files "${test_files}" --test_batch_size 1 \
-	--alphabet_config_path "${alphabet_file_path}" \
-	--load_checkpoint_dir "${checkpoint_cy_dir}" \
-	--scorer_path kenlm.scorer
-
+	--alphabet_config_path "${alphabet_file_path}" \
+	--load_checkpoint_dir "${checkpoint_cy_dir}" \
+	--scorer_path kenlm.scorer

 cd -
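Not part of the patch: build_lm_scorer.sh above packages lm.binary and vocab-50000.txt into kenlm.scorer with the generate_scorer_package binary that 0.8.x ships in native_client (hence the Dockerfile change). A minimal Python sketch of the same call, for reference only; the lm_dir argument and the example /data/lm path are assumptions, while the binary path, file names, alphabet path and alpha/beta defaults are taken from the script.

#!/usr/bin/env python3
# Sketch only: drives the same generate_scorer_package call as build_lm_scorer.sh.
import subprocess

def build_scorer(alphabet_path, lm_dir, alpha=0.75, beta=1.85):
    cmd = [
        "/DeepSpeech/native_client/generate_scorer_package",
        "--alphabet", alphabet_path,
        "--lm", "lm.binary",
        "--vocab", "vocab-50000.txt",
        "--package", "kenlm.scorer",
        "--default_alpha", str(alpha),
        "--default_beta", str(beta),
    ]
    # lm.binary and vocab-50000.txt are expected to sit in lm_dir, as in the script.
    subprocess.run(cmd, cwd=lm_dir, check=True)

if __name__ == "__main__":
    build_scorer("/DeepSpeech/bin/bangor_welsh/alphabet.txt", "/data/lm")  # hypothetical lm_dir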
diff --git a/local/optimize_lm_scorer.sh b/local/optimize_lm_scorer.sh
index e67ff10..2be6e36 100755
--- a/local/optimize_lm_scorer.sh
+++ b/local/optimize_lm_scorer.sh
@@ -58,7 +58,7 @@ read -p "Enter best default beta: " beta
 echo "####################################################################################"
 echo "#### Generating package with optimal alpha and beta ####"
 echo "####################################################################################"
-python3 /DeepSpeech/data/lm/generate_package.py \
+/DeepSpeech/native_client/generate_scorer_package \
   --alphabet "${alphabet_file_path}" \
   --lm lm.binary \
   --vocab vocab-50000.txt \
diff --git a/local/run_tl_cv_cy.sh b/local/run_tl_cv_cy.sh
index d31423d..47ae6a6 100755
--- a/local/run_tl_cv_cy.sh
+++ b/local/run_tl_cv_cy.sh
@@ -1,19 +1,6 @@
 #!/bin/bash
 set -e

-###
-model_name='bangor'
-model_language='cy-Latn-GB'
-model_license='CC-BY-4.0'
-model_description='Welsh language acoustic model trained using transfer learning and approximately 90hrs of validated and other Welsh speech data from the Mozilla CommonVoice June 2020 release.'
-
-model_author='techiaith'
-model_contact_info='techiaith@bangor.ac.uk'
-
-model_version='20.07'
-deepspeech_version='0.7.4'
-
-
 ###
 csv_dir=''

 while getopts ":a:" opt; do
@@ -33,12 +20,34 @@ if [ -z "${csv_dir}" ]; then
   exit 2
 fi

+###
+model_name='bangor-welsh'
+model_language='cy-Latn-GB'
+model_license='CC-BY-4.0'
+model_description='Welsh language acoustic model trained using transfer learning and approximately 90hrs of validated and other Welsh speech data from the Mozilla CommonVoice June 2020 release.'
+
+model_author='techiaith'
+model_contact_info='techiaith@bangor.ac.uk'
+
+echo
+echo "####################################################################################"
+echo " model_name : ${model_name}"
+echo " model_language : ${model_language}"
+echo " model_license : ${model_license}"
+echo " model_description : ${model_description}"
+echo " model_author : ${model_author}"
+echo " model_contact_info : ${model_contact_info}"
+echo " model_version : ${TECHIAITH_RELEASE} "
+echo " DeepSpeech Version : ${DEEPSPEECH_RELEASE} "
+echo "####################################################################################"
+echo
+
 ###
 train_files=${csv_dir}/validated.clean.csv,${csv_dir}/other.clean.csv
 alphabet_cy_file=/DeepSpeech/bin/bangor_welsh/alphabet.txt
 checkpoint_dir=/checkpoints
-export_dir=/export/${deepspeech_version}_${model_version}
+export_dir=/export/${DEEPSPEECH_RELEASE}_${TECHIAITH_RELEASE}

 ### Force UTF-8 output
@@ -58,6 +67,7 @@ mkdir -p ${export_dir}
 cp -r /checkpoints/mozilla/deepspeech-en-checkpoint/ $checkpoint_en_dir

 ###
+echo
 echo "####################################################################################"
 echo "#### Transfer to WELSH model with --save_checkpoint_dir --load_checkpoint_dir ####"
 echo "####################################################################################"
@@ -72,6 +82,7 @@ python -u DeepSpeech.py \

 set +x

+echo
 echo "####################################################################################"
 echo "#### Export new Welsh checkpoint to frozen model ####"
 echo "####################################################################################"
@@ -86,21 +97,22 @@ python -u DeepSpeech.py \
   --export_dir "${export_dir}" \
   --export_author_id "${model_author}" \
   --export_model_name "${model_name}" \
-  --export_model_version "${model_version}" \
+  --export_model_version "${TECHIAITH_RELEASE}" \
   --export_contact_info "${model_contact_info}" \
   --export_license "${model_license}" \
   --export_language "${model_language}" \
-  --export_min_ds_version "${deepspeech_version}" \
-  --export_max_ds_version "${deepspeech_version}" \
+  --export_min_ds_version "${DEEPSPEECH_RELEASE}" \
+  --export_max_ds_version "${DEEPSPEECH_RELEASE}" \
   --export_description "${model_description}"

 ###
-/DeepSpeech/convert_graphdef_memmapped_format \
+/DeepSpeech/native_client/convert_graphdef_memmapped_format \
   --in_graph=${export_dir}/output_graph.pb \
   --out_graph=${export_dir}/output_graph.pbmm

 set +x

+echo
 echo "####################################################################################"
 echo "#### Exported acoustic models (.pb/.pbmm files) can be found in ${export_dir} "
 echo "####################################################################################"
diff --git a/local/transcribe.py b/local/transcribe.py
new file mode 100755
index 0000000..546a6c9
--- /dev/null
+++ b/local/transcribe.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import os
+import sys
+import pathlib
+import shlex
+import shutil
+import subprocess
+import glob
+
+import json
+
+from praatio import tgio
+from utils.audio import downsample_wavfile
+
+from utils.clean_transcript import clean_transcript
+from argparse import ArgumentParser, RawTextHelpFormatter
+
+DESCRIPTION = """
+"""
+
+ALPHABET_FILE_PATH = "/DeepSpeech/bin/bangor_welsh/alphabet.txt"
+TECHIAITH_RELEASE = os.environ['TECHIAITH_RELEASE']
+CHECKPOINTS_DIR = "/checkpoints/cy"
+LANGUAGE_MODEL = "/models/techiaith/techiaith_bangor_transcribe_%s.scorer" % TECHIAITH_RELEASE
+
+
+def convert_json_to_textgrid(wav_file_path, transcript_file_path):
+
+    textgrid_file_path = transcript_file_path.replace(".tlog", ".TextGrid")
+
+    with open(transcript_file_path) as json_file:
+        textgrid_entries_list = []
+        json_data = json.load(json_file)
+        for transcript in json_data:
+            start_seconds = float(transcript["start"] / 1000)
+            end_seconds = float(transcript["end"] / 1000)
+            textgrid_entry = (start_seconds, end_seconds, transcript["transcript"])
+            textgrid_entries_list.append(textgrid_entry)
+
+        utterance_tier = tgio.IntervalTier('utterance', textgrid_entries_list, 0, pairedWav=wav_file_path)
+        tg = tgio.Textgrid()
+        tg.addTier(utterance_tier)
+        tg.save(textgrid_file_path, useShortForm=False, outputFormat='textgrid')
+
+    print("Textgrid of transcription saved to %s" % textgrid_file_path)
+
+
+def main(wav_file_path, **args):
+
+    cmd = "python3 /DeepSpeech/transcribe.py --src %s --checkpoint_dir %s --alphabet_config_path %s --scorer %s --force"
+    cmd = cmd % (wav_file_path, CHECKPOINTS_DIR, ALPHABET_FILE_PATH, LANGUAGE_MODEL)
+
+    downsample_wavfile(wav_file_path)
+
+    import_process = subprocess.Popen(shlex.split(cmd))
+    import_process.wait()
+
+    transcript_file = wav_file_path.replace(".wav", ".tlog")
+
+    convert_json_to_textgrid(wav_file_path, transcript_file)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter)
+
+    parser.add_argument("--wavfile", dest="wav_file_path", required=True, help="path to the wav file to be transcribed")
+    #parser.add_argument("--target_dir", dest="cv_root_dir", required=True, help="target directory for extracted archive, also root directory for training data")
+
+    parser.set_defaults(func=main)
+    args = parser.parse_args()
+    args.func(**vars(args))
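Not part of the patch: the new transcribe.py wraps DeepSpeech's own transcribe.py and then turns its .tlog output into a Praat TextGrid. A small self-contained sketch of that conversion step, assuming the .tlog is a JSON list of segments with millisecond "start"/"end" timestamps (which is what convert_json_to_textgrid() above expects); the sample data is invented for illustration.

# Sketch only: the .tlog -> TextGrid interval mapping from convert_json_to_textgrid().
import json

sample_tlog = json.dumps([
    {"start": 0,    "end": 2300, "transcript": "bore da"},
    {"start": 2300, "end": 5100, "transcript": "sut mae'r tywydd heddiw"},
])

entries = []
for segment in json.loads(sample_tlog):
    # convert millisecond offsets to the seconds that praatio interval tiers expect
    start_seconds = segment["start"] / 1000.0
    end_seconds = segment["end"] / 1000.0
    entries.append((start_seconds, end_seconds, segment["transcript"]))

print(entries)
# [(0.0, 2.3, 'bore da'), (2.3, 5.1, "sut mae'r tywydd heddiw")]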
diff --git a/local/utils/audio.py b/local/utils/audio.py
index d7623b1..18f427b 100755
--- a/local/utils/audio.py
+++ b/local/utils/audio.py
@@ -6,10 +6,11 @@ import shutil

 import wave

-from sox import Transformer
+import sox

 import pandas as pd
 import numpy as np
+
 import scipy.io.wavfile as wav

 from python_speech_features import mfcc
@@ -18,6 +19,9 @@ def downsample_wavfile(wavfile):

+    if sox.file_info.sample_rate(wavfile)==16000.0:
+        return
+
     temp_48kHz_wavfile = wavfile.replace(".wav","_48kHz.wav")
     shutil.move(wavfile, temp_48kHz_wavfile)
     transform_audio(temp_48kHz_wavfile, wavfile)
@@ -37,12 +41,12 @@ def convert_mp3(mp3file):

     return False

-
 def transform_audio(old_file, new_file):
-    tf = Transformer()
+    tf = sox.Transformer()
     tf.convert(samplerate=16000, n_channels=1)
     tf.build(old_file, new_file)

+
 def get_duration_wav(wavfile):
     f = wave.open(wavfile, 'r')
     frames = f.getnframes()
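Not part of the patch: audio.py now imports the sox module directly so downsample_wavfile() can skip files that are already at 16 kHz. A short usage sketch; the file name is hypothetical, and it assumes pysox plus the SoX binary are installed and that it is run from the local/ directory (as transcribe.py is) so that utils.audio resolves.

# Sketch only: exercising the updated downsample_wavfile() helper.
import sox
from utils.audio import downsample_wavfile

wav_path = "recording.wav"                   # hypothetical input file

print(sox.file_info.sample_rate(wav_path))   # e.g. 48000.0
downsample_wavfile(wav_path)                 # returns early if the file is already 16 kHz
print(sox.file_info.sample_rate(wav_path))   # 16000.0, mono, rewritten in place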