-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
478b7bd
commit de86ce9
Showing
8 changed files
with
216 additions
and
58 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# Please refer to the TRAINING documentation, "Basic Dockerfile for training"
#
# Builds a DeepSpeech training image on top of the TensorFlow 1.15 GPU image:
# clones DeepSpeech at a pinned commit, builds the CTC decoder and KenLM, and
# finishes with a short smoke-test training run.

FROM tensorflow/tensorflow:1.15.2-gpu-py3

# Build-time only: keeps apt from prompting without leaking the setting into
# the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

# The #...# placeholders are left verbatim; they are expected to be
# substituted by whatever tooling renders this template before the build.
ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO#
ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA#

# OS packages, all in ONE layer:
# - update + install together, so a cached `apt-get update` layer can never
#   serve stale package lists to a later install
# - libopus0 / libsndfile1 are the dependencies for audio augmentation
# - python3-xdg is purged because it breaks the DeepSpeech install later with
#   weird errors about setuptools
# - the apt lists are removed in the same layer, otherwise the space is not
#   actually reclaimed from the image
RUN apt-get update && apt-get install -y --no-install-recommends \
        apt-utils \
        bash-completion \
        build-essential \
        cmake \
        curl \
        git \
        libboost-all-dev \
        libbz2-dev \
        libopus0 \
        libsndfile1 \
        locales \
        python3-venv \
        unzip \
        wget \
    && apt-get purge -y python3-xdg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /
RUN git clone "$DEEPSPEECH_REPO" DeepSpeech

WORKDIR /DeepSpeech
RUN git checkout "$DEEPSPEECH_SHA"

# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
RUN make -C native_client/ctcdecode NUM_PROCESSES="$(nproc)" bindings
RUN pip3 install --no-cache-dir --upgrade native_client/ctcdecode/dist/*.whl

# Prepare deps
RUN pip3 install --no-cache-dir --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0

# Install DeepSpeech
# - No need for the decoder since we did it earlier
# - There is already correct TensorFlow GPU installed on the base image,
#   we don't want to break that
RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --no-cache-dir --upgrade -e .

# Tool to convert output graph for inference
RUN python3 util/taskcluster.py --source tensorflow --branch r1.15 \
        --artifact convert_graphdef_memmapped_format --target .

# Build KenLM (pinned to a fixed commit) to generate new scorers
WORKDIR /DeepSpeech/native_client
RUN rm -rf kenlm && \
    git clone https://github.com/kpu/kenlm && \
    cd kenlm && \
    git checkout 87e85e66c99ceff1fab2500a7c60c01da7315eec && \
    mkdir -p build && \
    cd build && \
    cmake .. && \
    make -j "$(nproc)"
WORKDIR /DeepSpeech

# Smoke test: the build fails if the bundled LDC93S1 training example fails.
RUN ./bin/run-ldc93s1.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,6 @@ | ||
#!/bin/bash | ||
set -e | ||
|
||
### | ||
model_name='bangor' | ||
model_language='cy-Latn-GB' | ||
model_license='CC-BY-4.0' | ||
model_description='Welsh language acoustic model trained using transfer learning and approximately 90hrs of validated and other Welsh speech data from the Mozilla CommonVoice June 2020 release.' | ||
|
||
model_author='techiaith' | ||
model_contact_info='[email protected]' | ||
|
||
model_version='20.07' | ||
deepspeech_version='0.7.4' | ||
|
||
|
||
### | ||
csv_dir='' | ||
while getopts ":a:" opt; do | ||
|
@@ -33,12 +20,34 @@ if [ -z "${csv_dir}" ]; then | |
exit 2 | ||
fi | ||
|
||
### | ||
model_name='bangor-welsh' | ||
model_language='cy-Latn-GB' | ||
model_license='CC-BY-4.0' | ||
model_description='Welsh language acoustic model trained using transfer learning and approximately 90hrs of validated and other Welsh speech data from the Mozilla CommonVoice June 2020 release.' | ||
|
||
model_author='techiaith' | ||
model_contact_info='[email protected]' | ||
|
||
# Print a summary of the metadata that is passed to the model export below.
# model_* values are the variables assigned just above; TECHIAITH_RELEASE and
# DEEPSPEECH_RELEASE are not assigned in this script and are expected to come
# from the environment.
echo
echo "####################################################################################"
echo " model_name : ${model_name}"
# Was "${cy-Latn-GB}", which bash parses as "value of \$cy, or the default
# 'Latn-GB' if unset" and therefore never printed the configured language.
echo " model_language : ${model_language}"
echo " model_license : ${model_license}"
echo " model_description : ${model_description}"
echo " model_author : ${model_author}"
echo " model_contact_info : ${model_contact_info}"
echo " model_version : ${TECHIAITH_RELEASE} "
echo " DeepSpeech Version : ${DEEPSPEECH_RELEASE} "
echo "####################################################################################"
echo
|
||
### | ||
train_files=${csv_dir}/validated.clean.csv,${csv_dir}/other.clean.csv | ||
alphabet_cy_file=/DeepSpeech/bin/bangor_welsh/alphabet.txt | ||
|
||
checkpoint_dir=/checkpoints | ||
export_dir=/export/${deepspeech_version}_${model_version} | ||
export_dir=/export/${DEEPSPEECH_RELEASE}_${TECHIAITH_RELEASE} | ||
|
||
|
||
### Force UTF-8 output | ||
|
@@ -58,6 +67,7 @@ mkdir -p ${export_dir} | |
cp -r /checkpoints/mozilla/deepspeech-en-checkpoint/ $checkpoint_en_dir | ||
|
||
### | ||
echo | ||
echo "####################################################################################" | ||
echo "#### Transfer to WELSH model with --save_checkpoint_dir --load_checkpoint_dir ####" | ||
echo "####################################################################################" | ||
|
@@ -72,6 +82,7 @@ python -u DeepSpeech.py \ | |
|
||
|
||
set +x | ||
echo | ||
echo "####################################################################################" | ||
echo "#### Export new Welsh checkpoint to frozen model ####" | ||
echo "####################################################################################" | ||
|
@@ -86,21 +97,22 @@ python -u DeepSpeech.py \ | |
--export_dir "${export_dir}" \ | ||
--export_author_id "${model_author}" \ | ||
--export_model_name "${model_name}" \ | ||
--export_model_version "${model_version}" \ | ||
--export_model_version "${TECHIAITH_RELEASE}" \ | ||
--export_contact_info "${model_contact_info}" \ | ||
--export_license "${model_license}" \ | ||
--export_language "${model_language}" \ | ||
--export_min_ds_version "${deepspeech_version}" \ | ||
--export_max_ds_version "${deepspeech_version}" \ | ||
--export_min_ds_version "${DEEPSPEECH_RELEASE}" \ | ||
--export_max_ds_version "${DEEPSPEECH_RELEASE}" \ | ||
--export_description "${model_description}" | ||
|
||
### | ||
/DeepSpeech/convert_graphdef_memmapped_format \ | ||
/DeepSpeech/native_client/convert_graphdef_memmapped_format \ | ||
--in_graph=${export_dir}/output_graph.pb \ | ||
--out_graph=${export_dir}/output_graph.pbmm | ||
|
||
|
||
set +x | ||
echo | ||
echo "####################################################################################" | ||
echo "#### Exported acoustic models (.pb/.pbmm files) can be found in ${export_dir} " | ||
echo "####################################################################################" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
import os | ||
import sys | ||
import pathlib | ||
import shlex | ||
import shutil | ||
import subprocess | ||
import glob | ||
|
||
import json | ||
|
||
from praatio import tgio | ||
from utils.audio import downsample_wavfile | ||
|
||
from utils.clean_transcript import clean_transcript | ||
from argparse import ArgumentParser, RawTextHelpFormatter | ||
|
||
# Description handed to the argument parser; currently just a newline.
DESCRIPTION = "\n"

# Release identifier, required in the environment (KeyError if it is missing).
TECHIAITH_RELEASE = os.environ['TECHIAITH_RELEASE']

# Fixed in-container locations passed to DeepSpeech's transcribe.py.
ALPHABET_FILE_PATH = "/DeepSpeech/bin/bangor_welsh/alphabet.txt"
CHECKPOINTS_DIR = "/checkpoints/cy"

# Scorer file name is versioned by the release identifier.
LANGUAGE_MODEL = "/models/techiaith/techiaith_bangor_transcribe_{}.scorer".format(TECHIAITH_RELEASE)
|
||
|
||
def convert_json_to_textgrid(wav_file_path, transcript_file_path):
    """Convert a JSON transcript into a Praat TextGrid file.

    The transcript is read as a JSON list of objects with "start", "end" and
    "transcript" keys. "start" and "end" are divided by 1000 to obtain the
    interval boundaries in seconds. The intervals are written to a single
    tier named 'utterance'.

    Args:
        wav_file_path: path of the audio file, passed to praatio as the
            tier's paired WAV.
        transcript_file_path: path of the JSON transcript to read.

    The TextGrid is saved beside the transcript, with the transcript's
    extension replaced by ".TextGrid".
    """
    # Swap only the file extension. A plain str.replace(".tlog", ...) would
    # also rewrite a matching directory name, and would leave the path
    # unchanged (overwriting the transcript itself) when the input does not
    # end in ".tlog".
    textgrid_file_path = os.path.splitext(transcript_file_path)[0] + ".TextGrid"

    # Read explicitly as UTF-8 so non-ASCII transcript text does not depend on
    # the container's locale.
    with open(transcript_file_path, encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    textgrid_entries_list = [
        (segment["start"] / 1000, segment["end"] / 1000, segment["transcript"])
        for segment in json_data
    ]

    utterance_tier = tgio.IntervalTier('utterance', textgrid_entries_list, 0, pairedWav=wav_file_path)
    tg = tgio.Textgrid()
    tg.addTier(utterance_tier)
    tg.save(textgrid_file_path, useShortForm=False, outputFormat='textgrid')

    print("Textgrid of transcription saved to %s" % textgrid_file_path)
|
||
|
||
def main(wav_file_path, **args):
    """Transcribe one audio file with DeepSpeech and export a TextGrid.

    Steps: downsample the audio via downsample_wavfile, run
    /DeepSpeech/transcribe.py on it with the Welsh checkpoint, alphabet and
    scorer, then convert the resulting transcript to a TextGrid.

    Args:
        wav_file_path: path of the audio file to transcribe.
        **args: remaining parsed command-line values; unused here.

    Raises:
        subprocess.CalledProcessError: if transcribe.py exits with a non-zero
            status, so a failed transcription is reported directly instead of
            surfacing later as a missing or stale transcript file.
    """
    # Build the argument vector directly rather than formatting a string and
    # re-splitting it, so paths containing spaces or quotes stay intact.
    cmd = [
        "python3", "/DeepSpeech/transcribe.py",
        "--src", wav_file_path,
        "--checkpoint_dir", CHECKPOINTS_DIR,
        "--alphabet_config_path", ALPHABET_FILE_PATH,
        "--scorer", LANGUAGE_MODEL,
        "--force",
    ]

    downsample_wavfile(wav_file_path)

    subprocess.check_call(cmd)

    # Replace only the extension; str.replace(".wav", ...) would also touch
    # directory names and would miss other casings such as ".WAV".
    # NOTE(review): assumes transcribe.py writes <src without extension>.tlog
    # when no destination is given - confirm against the pinned DeepSpeech version.
    transcript_file = os.path.splitext(wav_file_path)[0] + ".tlog"

    convert_json_to_textgrid(wav_file_path, transcript_file)
|
||
|
||
|
||
if __name__ == "__main__":

    # Command-line entry point: parse --wavfile and dispatch to main().
    parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter)

    # The help text previously described a CommonVoice tar.gz archive (copied
    # from another tool); this option is the audio file to transcribe.
    parser.add_argument("--wavfile", dest="wav_file_path", required=True, help="path to the WAV audio file to transcribe")

    # main is attached as a default so the parsed namespace carries its own
    # handler; every parsed value (including func) is forwarded as a keyword.
    parser.set_defaults(func=main)
    args = parser.parse_args()
    args.func(**vars(args))
Oops, something went wrong.