Skip to content

Commit

Permalink
ar gyfer/for DeepSpeech 0.8.x
Browse files Browse the repository at this point in the history
  • Loading branch information
DewiBrynJones committed Sep 14, 2020
1 parent 478b7bd commit de86ce9
Show file tree
Hide file tree
Showing 8 changed files with 216 additions and 58 deletions.
23 changes: 7 additions & 16 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,21 @@ RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.d
libxslt1-dev libjpeg8-dev zlib1g-dev dos2unix \
&& apt-get clean \
&& git lfs install \
&& pip install sox wget sklearn pandas python_speech_features virtualenv requests tqdm columnize \
&& pip install sox wget sklearn pandas python_speech_features virtualenv \
webrtcvad requests tqdm columnize praatio \
&& rm -rf /var/lib/apt/lists/*

ENV LC_ALL cy_GB.UTF-8
ENV LANG cy_GB.UTF-8
ENV LANGUAGE cy_GB.UTF-8


# KenLM is missing from the Mozilla Dockerfile for training
# Build KenLM in /DeepSpeech/native_client/kenlm folder
#
WORKDIR /DeepSpeech/native_client
RUN rm -rf kenlm && \
git clone https://github.com/kpu/kenlm && \
cd kenlm && \
git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec && \
mkdir -p build && \
cd build && \
cmake .. && \
make -j $(nproc)

# Done
WORKDIR /DeepSpeech

RUN python util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target .
RUN python ../util/taskcluster.py --target .
RUN python ../util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target .

ENV PATH /DeepSpeech/native_client:/DeepSpeech/native_client/kenlm/build/bin:$PATH

# Done
WORKDIR /DeepSpeech
68 changes: 68 additions & 0 deletions Dockerfile.train.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Please refer to the TRAINING documentation, "Basic Dockerfile for training"

FROM tensorflow/tensorflow:1.15.2-gpu-py3
ENV DEBIAN_FRONTEND=noninteractive

ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO#
ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA#

# System packages for building and training, installed in a single layer.
#
# - libopus0 and libsndfile1 are the dependencies for audio augmentation.
# - python3-xdg is purged because it breaks the later deepspeech install with
#   weird errors about setuptools.
# - The apt package lists are deleted in the SAME RUN instruction that created
#   them: removing them in a later layer does not shrink the image, because
#   the earlier layer still contains the files.
RUN apt-get update && apt-get install -y --no-install-recommends \
        apt-utils \
        bash-completion \
        build-essential \
        cmake \
        curl \
        git \
        libboost-all-dev \
        libbz2-dev \
        locales \
        python3-venv \
        unzip \
        wget \
        libopus0 \
        libsndfile1 \
    && apt-get purge -y python3-xdg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /
RUN git clone $DEEPSPEECH_REPO DeepSpeech

WORKDIR /DeepSpeech
RUN git checkout $DEEPSPEECH_SHA

# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl

# Prepare deps
RUN pip3 install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0

# Install DeepSpeech
# - No need for the decoder since we did it earlier
# - There is already correct TensorFlow GPU installed on the base image,
# we don't want to break that
RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .

# Tool to convert output graph for inference
RUN python3 util/taskcluster.py --source tensorflow --branch r1.15 \
--artifact convert_graphdef_memmapped_format --target .

# Build KenLM to generate new scorers
WORKDIR /DeepSpeech/native_client
RUN rm -rf kenlm && \
git clone https://github.com/kpu/kenlm && \
cd kenlm && \
git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec && \
mkdir -p build && \
cd build && \
cmake .. && \
make -j $(nproc)
WORKDIR /DeepSpeech

RUN ./bin/run-ldc93s1.sh
29 changes: 17 additions & 12 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
# All targets in this Makefile are commands rather than files, so declare them
# phony: otherwise a file or directory that happens to be called `build`,
# `run`, `clean` or `stop` would make the target look up to date and silently
# skip its recipe.
.PHONY: default run build clean stop

# Running plain `make` builds the Docker images.
default: build
DEEPSPEECH_RELEASE := 0.7.4
DEEPSPEECH_BRANCH := v$(DEEPSPEECH_RELEASE)
TECHIAITH_RELEASE := 20.06

DEEPSPEECH_RELEASE := 0.8.2
TECHIAITH_RELEASE := 20.09

run:
docker run --gpus all --name techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER} -it \
docker run --gpus all --name techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER} -it \
-v ${PWD}/data/:/data \
-v ${PWD}/checkpoints/:/checkpoints \
-v ${PWD}/models/:/models \
-v ${PWD}/export/:/export \
-v ${PWD}/homedir/:/root \
-v ${PWD}/local/:/DeepSpeech/bin/bangor_welsh \
techiaith/deepspeech:${DEEPSPEECH_BRANCH} bash
--env DEEPSPEECH_RELEASE=${DEEPSPEECH_RELEASE} \
--env TECHIAITH_RELEASE=${TECHIAITH_RELEASE} \
techiaith/deepspeech:v${DEEPSPEECH_RELEASE} bash


build:

if [ ! -d "DeepSpeech" ]; then \
git clone --branch $(DEEPSPEECH_BRANCH) https://github.com/mozilla/DeepSpeech.git; \
cd DeepSpeech && make Dockerfile.train DEEPSPEECH_SHA=tags/${DEEPSPEECH_BRANCH} && docker build --rm -t mozilla/deepspeech:${DEEPSPEECH_BRANCH} -f Dockerfile.train .; \
git clone https://github.com/mozilla/DeepSpeech.git; \
cp Dockerfile.train.tmpl DeepSpeech/; \
fi
cd DeepSpeech && make Dockerfile.train DEEPSPEECH_SHA=tags/v${DEEPSPEECH_RELEASE} && docker build --rm -t mozilla/deepspeech:v${DEEPSPEECH_RELEASE} -f Dockerfile.train .
if [ ! -d "checkpoints/mozilla" ]; then \
mkdir -p checkpoints/mozilla; \
cd checkpoints/mozilla && \
Expand All @@ -45,18 +49,19 @@ build:
wget https://github.com/techiaith/docker-deepspeech-cy/releases/download/$(TECHIAITH_RELEASE)/techiaith_bangor_macsen_$(TECHIAITH_RELEASE).scorer && \
wget https://github.com/techiaith/docker-deepspeech-cy/releases/download/$(TECHIAITH_RELEASE)/techiaith_bangor_transcribe_$(TECHIAITH_RELEASE).scorer;\
fi
docker build --build-arg BRANCH=${DEEPSPEECH_BRANCH} --rm -t techiaith/deepspeech:${DEEPSPEECH_BRANCH} .
docker build --build-arg BRANCH=v${DEEPSPEECH_RELEASE} --rm -t techiaith/deepspeech:v${DEEPSPEECH_RELEASE} .


clean:
-docker rmi techiaith/deepspeech:${DEEPSPEECH_BRANCH}
-docker rmi mozilla/deepspeech:${DEEPSPEECH_BRANCH}
-docker rmi techiaith/deepspeech:v${DEEPSPEECH_RELEASE}
-docker rmi mozilla/deepspeech:v${DEEPSPEECH_RELEASE}
-docker rmi nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04
-docker rmi tensorflow/tensorflow:1.15.2-gpu-py3
sudo rm -rf DeepSpeech
sudo rm -rf homedir
sudo rm -rf checkpoints


stop:
-docker stop techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER}
-docker rm techiaith-deepspeech-${DEEPSPEECH_BRANCH}-${USER}
-docker stop techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER}
-docker rm techiaith-deepspeech-v${DEEPSPEECH_RELEASE}-${USER}
20 changes: 12 additions & 8 deletions local/build_lm_scorer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,21 @@ echo "#### Generating package for un-optimized language model package
echo "#### ####"
echo "#### Default alpha and beta values used. Previous optimal values were: ####"
echo "#### ####"
echo "#### Voice Assistant lm : alpha: 1.7242448485503816 ####"
echo "#### beta: 4.9065413926676165 ####"
echo "#### Voice Assistant Language Model ####"
echo "#### alpha: 1.7242448485503816 ####"
echo "#### beta: 4.9065413926676165 ####"
echo "#### ####"
echo "#### Transcription Language Model ####"
echo "#### alpha: 1.1417685444561605 ####"
echo "#### beta: 0.5798010479098541 ####"
echo "#### ####"
echo "####################################################################################"
set -x
python3 /DeepSpeech/data/lm/generate_package.py \
/DeepSpeech/native_client/generate_scorer_package \
--alphabet "${alphabet_file_path}" \
--lm lm.binary \
--vocab vocab-50000.txt \
--package kenlm.scorer \
--package kenlm.scorer \
--default_alpha 0.75 \
--default_beta 1.85

Expand All @@ -89,9 +94,8 @@ echo "##########################################################################
set -x
python -u /DeepSpeech/evaluate.py \
--test_files "${test_files}" --test_batch_size 1 \
--alphabet_config_path "${alphabet_file_path}" \
--load_checkpoint_dir "${checkpoint_cy_dir}" \
--scorer_path kenlm.scorer

--alphabet_config_path "${alphabet_file_path}" \
--load_checkpoint_dir "${checkpoint_cy_dir}" \
--scorer_path kenlm.scorer

cd -
2 changes: 1 addition & 1 deletion local/optimize_lm_scorer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ read -p "Enter best default beta: " beta
echo "####################################################################################"
echo "#### Generating package with optimal alpha and beta ####"
echo "####################################################################################"
python3 /DeepSpeech/data/lm/generate_package.py \
/DeepSpeech/native_client/generate_scorer_package \
--alphabet "${alphabet_file_path}" \
--lm lm.binary \
--vocab vocab-50000.txt \
Expand Down
48 changes: 30 additions & 18 deletions local/run_tl_cv_cy.sh
Original file line number Diff line number Diff line change
@@ -1,19 +1,6 @@
#!/bin/bash
set -e

###
model_name='bangor'
model_language='cy-Latn-GB'
model_license='CC-BY-4.0'
model_description='Welsh language acoustic model trained using transfer learning and approximately 90hrs of validated and other Welsh speech data from the Mozilla CommonVoice June 2020 release.'

model_author='techiaith'
model_contact_info='[email protected]'

model_version='20.07'
deepspeech_version='0.7.4'


###
csv_dir=''
while getopts ":a:" opt; do
Expand All @@ -33,12 +20,34 @@ if [ -z "${csv_dir}" ]; then
exit 2
fi

###
model_name='bangor-welsh'
model_language='cy-Latn-GB'
model_license='CC-BY-4.0'
model_description='Welsh language acoustic model trained using transfer learning and approximately 90hrs of validated and other Welsh speech data from the Mozilla CommonVoice June 2020 release.'

model_author='techiaith'
model_contact_info='[email protected]'

# Print a summary of the model metadata and the release versions in use.
# model_language is printed via its variable; "${cy-Latn-GB}" would be parsed
# by bash as "value of $cy, or the default 'Latn-GB' if unset".
echo
echo "####################################################################################"
echo " model_name : ${model_name}"
echo " model_language : ${model_language}"
echo " model_license : ${model_license}"
echo " model_description : ${model_description}"
echo " model_author : ${model_author}"
echo " model_contact_info : ${model_contact_info}"
echo " model_version : ${TECHIAITH_RELEASE} "
echo " DeepSpeech Version : ${DEEPSPEECH_RELEASE} "
echo "####################################################################################"
echo

###
train_files=${csv_dir}/validated.clean.csv,${csv_dir}/other.clean.csv
alphabet_cy_file=/DeepSpeech/bin/bangor_welsh/alphabet.txt

checkpoint_dir=/checkpoints
export_dir=/export/${deepspeech_version}_${model_version}
export_dir=/export/${DEEPSPEECH_RELEASE}_${TECHIAITH_RELEASE}


### Force UTF-8 output
Expand All @@ -58,6 +67,7 @@ mkdir -p ${export_dir}
cp -r /checkpoints/mozilla/deepspeech-en-checkpoint/ $checkpoint_en_dir

###
echo
echo "####################################################################################"
echo "#### Transfer to WELSH model with --save_checkpoint_dir --load_checkpoint_dir ####"
echo "####################################################################################"
Expand All @@ -72,6 +82,7 @@ python -u DeepSpeech.py \


set +x
echo
echo "####################################################################################"
echo "#### Export new Welsh checkpoint to frozen model ####"
echo "####################################################################################"
Expand All @@ -86,21 +97,22 @@ python -u DeepSpeech.py \
--export_dir "${export_dir}" \
--export_author_id "${model_author}" \
--export_model_name "${model_name}" \
--export_model_version "${model_version}" \
--export_model_version "${TECHIAITH_RELEASE}" \
--export_contact_info "${model_contact_info}" \
--export_license "${model_license}" \
--export_language "${model_language}" \
--export_min_ds_version "${deepspeech_version}" \
--export_max_ds_version "${deepspeech_version}" \
--export_min_ds_version "${DEEPSPEECH_RELEASE}" \
--export_max_ds_version "${DEEPSPEECH_RELEASE}" \
--export_description "${model_description}"

###
/DeepSpeech/convert_graphdef_memmapped_format \
/DeepSpeech/native_client/convert_graphdef_memmapped_format \
--in_graph=${export_dir}/output_graph.pb \
--out_graph=${export_dir}/output_graph.pbmm


set +x
echo
echo "####################################################################################"
echo "#### Exported acoustic models (.pb/.pbmm files) can be found in ${export_dir} "
echo "####################################################################################"
Expand Down
74 changes: 74 additions & 0 deletions local/transcribe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import pathlib
import shlex
import shutil
import subprocess
import glob

import json

from praatio import tgio
from utils.audio import downsample_wavfile

from utils.clean_transcript import clean_transcript
from argparse import ArgumentParser, RawTextHelpFormatter

# Description text handed to the ArgumentParser; currently only a newline.
DESCRIPTION = """
"""

# Alphabet file passed to transcribe.py as --alphabet_config_path.
ALPHABET_FILE_PATH = "/DeepSpeech/bin/bangor_welsh/alphabet.txt"
# Release tag taken from the environment; a KeyError is raised at import time
# when TECHIAITH_RELEASE is not set.
TECHIAITH_RELEASE = os.environ['TECHIAITH_RELEASE']
# Checkpoint directory passed to transcribe.py as --checkpoint_dir.
CHECKPOINTS_DIR = "/checkpoints/cy"
# Scorer file for the release above, passed to transcribe.py as --scorer.
LANGUAGE_MODEL = "/models/techiaith/techiaith_bangor_transcribe_%s.scorer" % TECHIAITH_RELEASE


def convert_json_to_textgrid(wav_file_path, transcript_file_path):
    """Write a Praat TextGrid next to a JSON transcript file.

    The transcript is loaded as a JSON list whose items provide ``start``,
    ``end`` and ``transcript`` keys. ``start`` and ``end`` are divided by 1000
    before being used as interval boundaries (presumably milliseconds to
    seconds -- confirm against the transcript producer).

    Args:
        wav_file_path: audio file handed to praatio as the tier's paired wav.
        transcript_file_path: path of the JSON transcript; the TextGrid is
            saved under the same name with a ``.TextGrid`` extension.
    """
    # Swap only the final extension. A plain str.replace(".tlog", ...) would
    # also rewrite ".tlog" inside directory names, and would return the input
    # path unchanged (so the save below would overwrite the transcript) when
    # the file does not end in ".tlog".
    textgrid_file_path = os.path.splitext(transcript_file_path)[0] + ".TextGrid"

    # Explicit encoding so the result does not depend on the process locale.
    with open(transcript_file_path, encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    textgrid_entries_list = [
        (
            float(transcript["start"] / 1000),
            float(transcript["end"] / 1000),
            transcript["transcript"],
        )
        for transcript in json_data
    ]

    utterance_tier = tgio.IntervalTier('utterance', textgrid_entries_list, 0, pairedWav=wav_file_path)
    tg = tgio.Textgrid()
    tg.addTier(utterance_tier)
    tg.save(textgrid_file_path, useShortForm=False, outputFormat='textgrid')

    print("Textgrid of transcription saved to %s" % textgrid_file_path)


def main(wav_file_path, **args):
    """Transcribe one wav file with DeepSpeech and export a TextGrid.

    Steps: run ``downsample_wavfile`` on the input, invoke
    ``/DeepSpeech/transcribe.py`` on it, then convert the resulting ``.tlog``
    transcript with ``convert_json_to_textgrid``.

    Args:
        wav_file_path: path of the wav file to transcribe.
        **args: remaining parsed command-line values; accepted so the whole
            argparse namespace can be passed in, but not used here.

    Raises:
        subprocess.CalledProcessError: if transcribe.py exits with a non-zero
            status.
    """
    # Build the argument vector directly instead of formatting a string and
    # re-splitting it with shlex: that round trip breaks as soon as a path
    # contains spaces or quote characters.
    cmd = [
        "python3", "/DeepSpeech/transcribe.py",
        "--src", wav_file_path,
        "--checkpoint_dir", CHECKPOINTS_DIR,
        "--alphabet_config_path", ALPHABET_FILE_PATH,
        "--scorer", LANGUAGE_MODEL,
        "--force",
    ]

    downsample_wavfile(wav_file_path)

    # Stop here if transcription failed, rather than carrying on and failing
    # later on a missing or stale transcript file.
    subprocess.check_call(cmd)

    # Replace only the final extension of the wav path.
    # NOTE(review): assumes transcribe.py writes "<stem>.tlog" beside the
    # source file -- confirm against the DeepSpeech version in use.
    transcript_file = os.path.splitext(wav_file_path)[0] + ".tlog"

    convert_json_to_textgrid(wav_file_path, transcript_file)



if __name__ == "__main__":

    # Command-line entry point: parse --wavfile and hand the parsed values to
    # main() through the `func` default.
    parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter)

    parser.add_argument("--wavfile", dest="wav_file_path", required=True, help="path to the wav audio file to transcribe")

    parser.set_defaults(func=main)
    args = parser.parse_args()
    # vars(args) also contains `func`; main() absorbs it through **args.
    args.func(**vars(args))
Loading

0 comments on commit de86ce9

Please sign in to comment.