Adding DVAE fine-tuning + Interface rework #34

Open · wants to merge 4 commits into base: main
3 changes: 3 additions & 0 deletions .gitignore
@@ -162,3 +162,6 @@ cython_debug/
base_models
finetune_models
/test_model
/dataset
/datasets
/tmp
2 changes: 1 addition & 1 deletion README.md
@@ -56,7 +56,7 @@ If you are looking for an option for normal XTTS use look here [https://github.c
1. Make sure you have `Cuda` installed
2. `git clone https://github.com/daswer123/xtts-finetune-webui`
3. `cd xtts-finetune-webui`
4. `pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 --index-url https://download.pytorch.org/whl/cu118`
4. `pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 torchvision==0.16.1 --index-url https://download.pytorch.org/whl/cu118`
5. `pip install -r requirements.txt`

### If you're using Windows
2 changes: 1 addition & 1 deletion install.bat
@@ -5,6 +5,6 @@ call venv/scripts/activate


pip install -r .\requirements.txt
pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 torchvision==0.16.1 --index-url https://download.pytorch.org/whl/cu118

python xtts_demo.py
6 changes: 4 additions & 2 deletions requirements.txt
@@ -2,6 +2,8 @@ faster_whisper==1.0.2
gradio==4.13.0
spacy==3.7.4
coqui-tts[languages] == 0.24.1

cutlet
fugashi[unidic-lite]
fugashi[unidic-lite]
audio-separator[gpu]
huggingface_hub[cli]
datasets
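
The new `audio-separator[gpu]`, `huggingface_hub[cli]`, and `datasets` entries back the vocal separation, Hub upload, and dataset-building code added below. As a rough, hypothetical smoke test (not part of the PR), the new dependencies can be checked like this after installing:

```python
# Hypothetical smoke test that the new dependencies resolve after `pip install -r requirements.txt`.
from audio_separator.separator import Separator  # vocal/instrumental separation
from datasets import Dataset, DatasetDict        # Hugging Face dataset objects
from huggingface_hub import HfApi                 # Hub access (the [cli] extra also installs huggingface-cli)

print("audio-separator, datasets and huggingface_hub import correctly")
```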
82 changes: 82 additions & 0 deletions utils/dataset_upload.py
@@ -0,0 +1,82 @@
import os
import json
import shutil
import pandas as pd
from datasets import Dataset, Audio, Value, Features, DatasetDict

def create_readme_from_info(info_file_path, readme_file_path):
    if os.path.exists(info_file_path):
        with open(info_file_path, 'r', encoding='utf-8') as f:
            info_data = json.load(f)

        readme_content = "# Dataset Information\n\n"
        readme_content += "## General Info\n"
        readme_content += f"- **Language**: {info_data.get('language', 'N/A')}\n"
        readme_content += f"- **Number of Segments**: {info_data.get('num_segments', 'N/A')}\n"
        readme_content += f"- **Total Duration**: {info_data.get('total_duration', 'N/A')} seconds\n\n"

        with open(readme_file_path, 'w', encoding='utf-8') as f:
            f.write(readme_content)

def create_and_upload_dataset(upload_address, dataset_path):
    # Load manifest.csv

    # dataset_path_folder = os.path.dirname(dataset_path)
    # dataset_not_abs_path = os.path.join("datasets", dataset_path_folder)

    def read_text_file(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    manifest_df = pd.read_csv(os.path.join(dataset_path, "manifest.csv"), sep="|", header=None, names=["audio_file", "text_file", "duration"])

    manifest_df['audio_file'] = manifest_df['audio_file'].apply(lambda x: os.path.join(dataset_path, x))
    manifest_df['text_file'] = manifest_df['text_file'].apply(lambda x: os.path.join(dataset_path, x))
    # manifest_df['text_file'] = manifest_df['text_file'].apply(read_text_file)

    # Split the data into train and eval
    train_size = int(0.8 * len(manifest_df))
    train_manifest_df = manifest_df[:train_size].copy()
    eval_manifest_df = manifest_df[train_size:].copy()

    # Rename the 'audio_file' and 'text_file' columns
    train_manifest_df = train_manifest_df.rename(columns={'audio_file': 'audio', 'text_file': 'text'})
    eval_manifest_df = eval_manifest_df.rename(columns={'audio_file': 'audio', 'text_file': 'text'})

    # Build the train and eval datasets
    train_dataset = Dataset.from_pandas(train_manifest_df, features=Features({
        "audio": Audio(),
        "text": Value("string"),
        "duration": Value("float32"),
    }))

    eval_dataset = Dataset.from_pandas(eval_manifest_df, features=Features({
        "audio": Audio(),
        "text": Value("string"),
        "duration": Value("float32"),
    }))

    # Update the paths to the audio and text files
    def update_paths(example):
        # example['audio'] = {'path': os.path.join(dataset_path, example['audio'])}
        with open(os.path.abspath(example['text']), 'r', encoding='utf-8') as f:
            example['text'] = f.read().strip()
        return example

    train_dataset = train_dataset.map(update_paths)
    eval_dataset = eval_dataset.map(update_paths)

    # Build the DatasetDict
    dataset_dict = DatasetDict({
        "train": train_dataset,
        "eval": eval_dataset,
    })

    # Create README.md from info.json
    info_json_path = os.path.join(dataset_path, "info.json")
    readme_md_path = os.path.join(dataset_path, "README.md")
    create_readme_from_info(info_json_path, readme_md_path)

    # Push the dataset to the Hugging Face Hub
    dataset_dict.push_to_hub(upload_address)
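
For reference, a minimal, hypothetical usage sketch of the new uploader (not part of the diff): it assumes you have already run `huggingface-cli login`, that the folder was produced by the reworked `format_audio_list` (so it contains `manifest.csv`, `wavs/`, `txt/`, and `info.json`), and that the repo id and path below are placeholders.

```python
# Hypothetical example; the repo id and local path are placeholders.
from utils.dataset_upload import create_and_upload_dataset

# dataset_path must hold manifest.csv ("wavs/...|txt/...|duration" rows)
# plus the wavs/ and txt/ folders; info.json is used to generate README.md.
create_and_upload_dataset(
    upload_address="your-username/my-xtts-dataset",
    dataset_path="datasets/my_dataset",
)
```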

Empty file added utils/dvae_tunner.py
Empty file.
176 changes: 158 additions & 18 deletions utils/formatter.py
@@ -1,7 +1,7 @@
import os
import gc
import torchaudio
import pandas
import pandas as pd
from faster_whisper import WhisperModel
from glob import glob

@@ -13,8 +13,13 @@

import torch
import torchaudio
import json
# torch.set_num_threads(1)

from audio_separator.separator import Separator

import shutil


torch.set_num_threads(16)
import os
@@ -51,15 +56,48 @@ def list_files(basePath, validExts=None, contains=None):
                audioPath = os.path.join(rootDir, filename)
                yield audioPath

def format_audio_list(audio_files, asr_model, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):

def save_dataset_info(out_path, audio_files, num_segments, total_duration, target_language):
    info_file_path = os.path.join(out_path, "info.json")

    if os.path.exists(info_file_path):
        with open(info_file_path, 'r', encoding='utf-8') as f:
            dataset_info = json.load(f)
    else:
        dataset_info = {}

    dataset_info['audio_files'] = dataset_info.get('audio_files', []) + audio_files
    dataset_info['num_segments'] = dataset_info.get('num_segments', 0) + num_segments
    dataset_info['total_duration'] = dataset_info.get('total_duration', 0) + total_duration
    dataset_info['language'] = target_language

    with open(info_file_path, 'w', encoding='utf-8') as f:
        json.dump(dataset_info, f, indent=4)


def save_transcriptions(out_path, audio_file_name, sentence):
    txt_dir = os.path.join(out_path, "txt")
    os.makedirs(txt_dir, exist_ok=True)

    txt_file_path = os.path.join(txt_dir, f"{audio_file_name}")
    with open(txt_file_path, 'w', encoding='utf-8') as f:
        f.write(sentence)


def create_manifest(out_path, audio_file, txt_file, duration):
    manifest_file_path = os.path.join(out_path, "manifest.csv")

    with open(manifest_file_path, 'a', encoding='utf-8') as f:
        f.write(f"{audio_file}|{txt_file}|{duration}\n")


def format_audio_list(audio_files, asr_model, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", use_separate_audio=True, gradio_progress=None):
    audio_total_size = 0
    num_segments = 0
    os.makedirs(out_path, exist_ok=True)

    lang_file_path = os.path.join(out_path, "lang.txt")
    current_language = None
    if os.path.exists(lang_file_path):
        with open(lang_file_path, 'r', encoding='utf-8') as existing_lang_file:
            current_language = existing_lang_file.read().strip()

    if current_language != target_language:
        with open(lang_file_path, 'w', encoding='utf-8') as lang_file:
@@ -74,11 +112,11 @@ def format_audio_list(audio_files, asr_model, target_language="en", out_path=Non

    existing_metadata = {'train': None, 'eval': None}
    if os.path.exists(train_metadata_path):
        existing_metadata['train'] = pandas.read_csv(train_metadata_path, sep="|")
        existing_metadata['train'] = pd.read_csv(train_metadata_path, sep="|")
        print("Existing training metadata found and loaded.")

    if os.path.exists(eval_metadata_path):
        existing_metadata['eval'] = pandas.read_csv(eval_metadata_path, sep="|")
        existing_metadata['eval'] = pd.read_csv(eval_metadata_path, sep="|")
        print("Existing evaluation metadata found and loaded.")

    if gradio_progress is not None:
@@ -87,7 +125,29 @@ def format_audio_list(audio_files, asr_model, target_language="en", out_path=Non
        tqdm_object = tqdm(audio_files)

    for audio_path in tqdm_object:
        audio_file_name_without_ext, _= os.path.splitext(os.path.basename(audio_path))

        # original_audio_path = audio_path
        separate_audio_path = audio_path

        if use_separate_audio:
            # Use separator
            # Load a model
            separator = Separator(output_dir="tmp")
            separator.load_model(model_filename='Kim_Vocal_2.onnx')

            # Separate multiple audio files without reloading the model
            separate_audio_path = separator.separate(audio_path)

            # Remove instrumental version
            instrumental_path = os.path.join("tmp", separate_audio_path[0])
            os.remove(instrumental_path)

            # Use vocal part as main audio
            separate_audio_path = separate_audio_path[1]
            separate_audio_path = os.path.join("tmp", separate_audio_path)
            print(f"Separated {audio_path}")

        audio_file_name_without_ext,_ = os.path.splitext(os.path.basename(audio_path))
        prefix_check = f"wavs/{audio_file_name_without_ext}_"

        skip_processing = False
@@ -109,7 +169,7 @@ def format_audio_list(audio_files, asr_model, target_language="en", out_path=Non
        wav = wav.squeeze()
        audio_total_size += (wav.size(-1) / sr)

        segments, _= asr_model.transcribe(audio_path, vad_filter=True, word_timestamps=True, language=target_language)
        segments,_ = asr_model.transcribe(separate_audio_path, vad_filter=True, word_timestamps=True, language=target_language)
        segments = list(segments)
        i = 0
        sentence = ""
@@ -137,7 +197,7 @@ def format_audio_list(audio_files, asr_model, target_language="en", out_path=Non
            if word.word[-1] in ["!", "。", ".", "?"]:
                sentence = sentence[1:]
                sentence = multilingual_cleaners(sentence, target_language)
                audio_file_name, _= os.path.splitext(os.path.basename(audio_path))
                audio_file_name,_ = os.path.splitext(os.path.basename(audio_path))
                audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"

                if word_idx + 1 < len(words_list):
@@ -152,17 +212,29 @@ def format_audio_list(audio_files, asr_model, target_language="en", out_path=Non
                i += 1
                first_word = True

                audio = wav[int(sr*sentence_start):int(sr *word_end)].unsqueeze(0)
                audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
                if audio.size(-1) >= sr / 3:
                    torchaudio.save(absolute_path, audio, sr)

                    txt_filename = f"{audio_file_name}_{str(i).zfill(8)}.txt"

                    # Save transcription
                    save_transcriptions(out_path, txt_filename, sentence)

                    # Update manifest
                    txt_file = f"txt/{txt_filename}"
                    duration = audio.size(-1) / sr
                    create_manifest(out_path, audio_file, txt_file, duration)

                    num_segments += 1
                else:
                    continue

                metadata["audio_file"].append(audio_file)
                metadata["text"].append(sentence)
                metadata["speaker_name"].append(speaker_name)

        df = pandas.DataFrame(metadata)
        df = pd.DataFrame(metadata)

        mode = 'w' if not os.path.exists(train_metadata_path) else 'a'
        header = not os.path.exists(train_metadata_path)
@@ -173,26 +245,94 @@ def format_audio_list(audio_files, asr_model, target_language="en", out_path=Non
        df.to_csv(eval_metadata_path, sep="|", index=False, mode=mode, header=header)

        metadata = {"audio_file": [], "text": [], "speaker_name": []}

        # Delete separated audio files
        if use_separate_audio:
            os.remove(separate_audio_path)


    if os.path.exists(train_metadata_path) and os.path.exists(eval_metadata_path):
        existing_train_df = existing_metadata['train']
        existing_eval_df = existing_metadata['eval']
    else:
        existing_train_df = pandas.DataFrame(columns=["audio_file", "text", "speaker_name"])
        existing_eval_df = pandas.DataFrame(columns=["audio_file", "text", "speaker_name"])
        existing_train_df = pd.DataFrame(columns=["audio_file", "text", "speaker_name"])
        existing_eval_df = pd.DataFrame(columns=["audio_file", "text", "speaker_name"])

    new_data_df = pandas.read_csv(train_metadata_path, sep="|")
    new_data_df = pd.read_csv(train_metadata_path, sep="|")

    combined_train_df = pandas.concat([existing_train_df, new_data_df], ignore_index=True).drop_duplicates().reset_index(drop=True)
    combined_eval_df = pandas.concat([existing_eval_df, new_data_df], ignore_index=True).drop_duplicates().reset_index(drop=True)
    combined_train_df = pd.concat([existing_train_df, new_data_df], ignore_index=True).drop_duplicates().reset_index(drop=True)
    combined_eval_df = pd.concat([existing_eval_df, new_data_df], ignore_index=True).drop_duplicates().reset_index(drop=True)

    combined_train_df_shuffled = combined_train_df.sample(frac=1)
    num_val_samples = int(len(combined_train_df_shuffled)* eval_percentage)
    num_val_samples = int(len(combined_train_df_shuffled) * eval_percentage)

    final_eval_set = combined_train_df_shuffled[:num_val_samples]
    final_training_set = combined_train_df_shuffled[num_val_samples:]

    final_training_set.sort_values('audio_file').to_csv(train_metadata_path, sep='|', index=False)
    final_eval_set.sort_values('audio_file').to_csv(eval_metadata_path, sep='|', index=False)

    # Save dataset info
    save_dataset_info(out_path, audio_files, num_segments, audio_total_size, target_language)

    return train_metadata_path, eval_metadata_path, audio_total_size



def merge_datasets(dataset1_path, dataset2_path, merged_dataset_path):
    # Create the directory for the merged dataset
    os.makedirs(merged_dataset_path, exist_ok=True)

    # Copy every file from the first dataset into the merged dataset
    for root, dirs, files in os.walk(dataset1_path):
        for file in files:
            src_path = os.path.join(root, file)
            dst_path = os.path.join(merged_dataset_path, os.path.relpath(src_path, dataset1_path))
            os.makedirs(os.path.dirname(dst_path), exist_ok=True)
            shutil.copy(src_path, dst_path)

    # Copy every file from the second dataset into the merged dataset, skipping duplicates
    for root, dirs, files in os.walk(dataset2_path):
        for file in files:
            src_path = os.path.join(root, file)
            dst_path = os.path.join(merged_dataset_path, os.path.relpath(src_path, dataset2_path))
            if not os.path.exists(dst_path):
                os.makedirs(os.path.dirname(dst_path), exist_ok=True)
                shutil.copy(src_path, dst_path)

    # Merge metadata_train.csv and metadata_eval.csv, dropping duplicates
    for metadata_file in ["metadata_train.csv", "metadata_eval.csv"]:
        metadata1_path = os.path.join(dataset1_path, metadata_file)
        metadata2_path = os.path.join(dataset2_path, metadata_file)
        merged_metadata_path = os.path.join(merged_dataset_path, metadata_file)

        if os.path.exists(metadata1_path) and os.path.exists(metadata2_path):
            metadata1 = pd.read_csv(metadata1_path, sep="|")
            metadata2 = pd.read_csv(metadata2_path, sep="|")
            merged_metadata = pd.concat([metadata1, metadata2]).drop_duplicates(subset="audio_file").reset_index(drop=True)
            merged_metadata.to_csv(merged_metadata_path, sep="|", index=False)
        elif os.path.exists(metadata1_path):
            shutil.copy(metadata1_path, merged_metadata_path)
        elif os.path.exists(metadata2_path):
            shutil.copy(metadata2_path, merged_metadata_path)

    # Merge the info.json files, dropping duplicates
    info1_path = os.path.join(dataset1_path, "info.json")
    info2_path = os.path.join(dataset2_path, "info.json")
    merged_info_path = os.path.join(merged_dataset_path, "info.json")

    if os.path.exists(info1_path) and os.path.exists(info2_path):
        with open(info1_path, "r") as f1, open(info2_path, "r") as f2, open(merged_info_path, "w") as f_merged:
            info1 = json.load(f1)
            info2 = json.load(f2)
            merged_info = {
                "audio_files": list(set(info1["audio_files"] + info2["audio_files"])),
                "num_segments": info1["num_segments"] + info2["num_segments"],
                "total_duration": info1["total_duration"] + info2["total_duration"],
                "language": info1["language"]
            }
            json.dump(merged_info, f_merged, indent=4)
    elif os.path.exists(info1_path):
        shutil.copy(info1_path, merged_info_path)
    elif os.path.exists(info2_path):
        shutil.copy(info2_path, merged_info_path)
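
To show how the reworked pieces fit together, here is a short, hypothetical driver (not part of the diff); the Whisper model size, folder names, and language are illustrative, and `use_separate_audio=True` assumes `audio-separator` and its `Kim_Vocal_2.onnx` model are available.

```python
# Hypothetical end-to-end sketch; all paths and names are placeholders.
from faster_whisper import WhisperModel
from utils.formatter import format_audio_list, merge_datasets

asr_model = WhisperModel("large-v3", device="cuda", compute_type="float16")

# Build a dataset, optionally separating vocals from background audio first.
train_csv, eval_csv, total_duration = format_audio_list(
    audio_files=["raw_audio/speaker_a.wav"],
    asr_model=asr_model,
    target_language="en",
    out_path="datasets/speaker_a",
    use_separate_audio=True,
)

# Combine two processed dataset folders (wavs, txt, metadata, info.json) into one.
merge_datasets("datasets/speaker_a", "datasets/speaker_b", "datasets/merged")
```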