[Feature request] Add Recipe for all 3 Training stages - XTTS V2 #3704

smallsudarshan · 2024-04-24T16:17:57Z

🚀 Feature Description
Hey, we saw that there is no training code for fine-tuning all parts of XTTS V2. We would like to contribute if it adds value.

The aim can be to make it work very reliably on a particular accent [Indian for eg.], in a particular language[English], in a particular speaking style with very little variability. We tried simply fine-tuning and it seems like it learns the accent somewhat and the speaking style, but is not super robust and mispronounces quite a lot.

Solution

First fine-tune the DVAE model using code from the Dall-E repo
Then get fine-tune the GPT-2 model - maybe this can be the recipe provided
Finally fine-tune end to end with the Hi-Fi GAN

We are not sure if the perceiver needs any fine-tuning.

If licenses permit, we will also share the data.

Does this make sense?

smallsudarshan · 2024-04-27T13:39:59Z

Ok so here you go. I picked the code for training from this repo.

train.py:

import torch
import wandb
from models.dvae import DiscreteVAE
from utils.arch_utils import TorchMelSpectrogram
from torch.utils.data import DataLoader
from utils.dvae_dataset import DVAEDataset
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_

import pdb
from TTS.tts.datasets import load_tts_samples
from TTS.config.shared_configs import BaseDatasetConfig

dvae_checkpoint = '/home/ubuntu/test_tts/SimpleTTS/xtts/run/training/XTTS_v2.0_original_model_files/dvae.pth'
mel_norm_file = '/home/ubuntu/test_tts/SimpleTTS/xtts/run/training/XTTS_v2.0_original_model_files/mel_stats.pth'

config_dataset = BaseDatasetConfig(
    formatter="ljspeech",
    dataset_name="ljspeech",
    path="/home/ubuntu/test_tts/sapien-formatted-english-22050",
    meta_file_train="/home/ubuntu/test_tts/sapien-formatted-english-22050/metadata_norm.txt",
    language="en",
)

# Add here the configs of the datasets
DATASETS_CONFIG_LIST = [config_dataset]
GRAD_CLIP_NORM = 0.5
LEARNING_RATE = 5e-05

dvae = DiscreteVAE(
            channels=80,
            normalization=None,
            positional_dims=1,
            num_tokens=1024,
            codebook_dim=512,
            hidden_dim=512,
            num_resnet_blocks=3,
            kernel_size=3,
            num_layers=2,
            use_transposed_convs=False,
        )

dvae.load_state_dict(torch.load(dvae_checkpoint), strict=False)
dvae.cuda()
opt = Adam(dvae.parameters(), lr = LEARNING_RATE)
torch_mel_spectrogram_dvae = TorchMelSpectrogram(
            mel_norm_file=mel_norm_file, sampling_rate=22050
        ).cuda()

train_samples, eval_samples = load_tts_samples(
        DATASETS_CONFIG_LIST,
        eval_split=True,
        eval_split_max_size=256,
        eval_split_size=0.01,
    )

eval_dataset = DVAEDataset(eval_samples, 22050, True)
train_dataset = DVAEDataset(train_samples, 22050, False)
epochs = 20
eval_data_loader = DataLoader(
                    eval_dataset,
                    batch_size=3,
                    shuffle=False,
                    drop_last=False,
                    collate_fn=eval_dataset.collate_fn,
                    num_workers=0,
                    pin_memory=False,
                )

train_data_loader = DataLoader(
                    train_dataset,
                    batch_size=3,
                    shuffle=False,
                    drop_last=False,
                    collate_fn=train_dataset.collate_fn,
                    num_workers=4,
                    pin_memory=False,
                )

torch.set_grad_enabled(True)
dvae.train()

wandb.init(project = 'train_dvae')
wandb.watch(dvae)

def to_cuda(x: torch.Tensor) -> torch.Tensor:
    if x is None:
        return None
    if torch.is_tensor(x):
        x = x.contiguous()
        if torch.cuda.is_available():
            x = x.cuda(non_blocking=True)
    return x

@torch.no_grad()
def format_batch(batch):
    if isinstance(batch, dict):
        for k, v in batch.items():
            batch[k] = to_cuda(v)
    elif isinstance(batch, list):
        batch = [to_cuda(v) for v in batch]

    try:
        batch['mel'] = torch_mel_spectrogram_dvae(batch['wav'])
        # if the mel spectogram is not divisible by 4 then input.shape != output.shape 
        # for dvae
        remainder = batch['mel'].shape[-1] % 4
        if remainder:
            batch['mel'] = batch['mel'][:, :, :-remainder]
    except NotImplementedError:
        pass
    return batch

for i in range(epochs):
    for cur_step, batch in enumerate(train_data_loader):

        opt.zero_grad()
        batch = format_batch(batch)
        recon_loss, commitment_loss, out = dvae(batch['mel'])
        total_loss = recon_loss + commitment_loss
        total_loss.backward()
        clip_grad_norm_(dvae.parameters(), GRAD_CLIP_NORM)
        opt.step()

        log = {'epoch': i,
               'cur_step': cur_step,
               'loss': total_loss.item(),
               'recon_loss': recon_loss.item(),
               'commit_loss': commitment_loss.item()}
        print(f"epoch: {i}", print(f"step: {cur_step}"), f'loss - {total_loss.item()}', f'recon_loss - {recon_loss.item()}', f'commit_loss - {commitment_loss.item()}')
        wandb.log(log)
        torch.cuda.empty_cache()
#     if i%10:
#         save_model(f'.dvae.pth')

# wandb.save('./dvae.pth')
# wandb.finish()

Wrote a custom DVAEDataset that is imported in the above train.py file.


import torch
import random
from utils.dataset import key_samples_by_col
from TTS.tts.models.xtts import load_audio

torch.set_num_threads(1)

class DVAEDataset(torch.utils.data.Dataset):
    def __init__(self, samples, sample_rate, is_eval):
        self.sample_rate = sample_rate
        self.is_eval = is_eval
        self.max_wav_len = 255995
        self.samples = samples
        self.training_seed = 1
        self.failed_samples = set()
        if not is_eval:
            random.seed(self.training_seed)
            # random.shuffle(self.samples)
            random.shuffle(self.samples)
            # order by language
            self.samples = key_samples_by_col(self.samples, "language")
            print(" > Sampling by language:", self.samples.keys())
        else:
            # for evaluation load and check samples that are corrupted to ensures the reproducibility
            self.check_eval_samples()

    def check_eval_samples(self):
        print(" > Filtering invalid eval samples!!")
        new_samples = []
        for sample in self.samples:
            try:
                _, wav = self.load_item(sample)
            except:
                continue
            # Basically, this audio file is nonexistent or too long to be supported by the dataset.
            if (
                wav is None
                or (self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len)
            ):
                continue
            new_samples.append(sample)
        self.samples = new_samples
        print(" > Total eval samples after filtering:", len(self.samples))

    def load_item(self, sample):
        audiopath = sample["audio_file"]
        wav = load_audio(audiopath, self.sample_rate)
        if wav is None or wav.shape[-1] < (0.5 * self.sample_rate):
            # Ultra short clips are also useless (and can cause problems within some models).
            raise ValueError

        return audiopath, wav
    
    def __getitem__(self, index):
        if self.is_eval:
            sample = self.samples[index]
            sample_id = str(index)
        else:
            # select a random language
            lang = random.choice(list(self.samples.keys()))
            # select random sample
            index = random.randint(0, len(self.samples[lang]) - 1)
            sample = self.samples[lang][index]
            # a unique id for each sampel to deal with fails
            sample_id = lang + "_" + str(index)

        # ignore samples that we already know that is not valid ones
        if sample_id in self.failed_samples:
            # call get item again to get other sample
            return self[1]

        # try to load the sample, if fails added it to the failed samples list
        try:
            audiopath, wav = self.load_item(sample)
        except:
            self.failed_samples.add(sample_id)
            return self[1]

        # check if the audio and text size limits and if it out of the limits, added it failed_samples
        if (
            wav is None
            or (self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len)
        ):
            # Basically, this audio file is nonexistent or too long to be supported by the dataset.
            # It's hard to handle this situation properly. Best bet is to return the a random valid token and skew the dataset somewhat as a result.
            self.failed_samples.add(sample_id)
            return self[1]

        res = {
            "wav": wav,
            "wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long),
            "filenames": audiopath,
        }
        return res
    
    def __len__(self):
        if self.is_eval:
            return len(self.samples)
        return sum([len(v) for v in self.samples.values()])

    def collate_fn(self, batch):
        # convert list of dicts to dict of lists
        B = len(batch)

        batch = {k: [dic[k] for dic in batch] for k in batch[0]}

        # stack for features that already have the same shape
        batch["wav_lengths"] = torch.stack(batch["wav_lengths"])

        max_wav_len = batch["wav_lengths"].max()

        # create padding tensors
        wav_padded = torch.FloatTensor(B, 1, max_wav_len)

        # initialize tensors for zero padding
        wav_padded = wav_padded.zero_()
        for i in range(B):
            wav = batch["wav"][i]
            wav_padded[i, :, : batch["wav_lengths"][i]] = torch.FloatTensor(wav)

        batch["wav"] = wav_padded
        return batch

This trains the DVAE to encode and decode mel-spectograms.

Few things:

You can see my import paths are not standard. That is because I have changed the structure of the repo a bit in my personal fork. You can follow the standard import paths as per TTS.
There is a loss called DiscretizationLoss here but I am not sure where or how this is used? So I am not using it currently.
For some reason, in the dvae.py on line 378, the author has added self.loss_fn(img, out, reduction="none"). I am not sure what is the purpose of doing reduction='none'. So I have summed it up in my code and just added it to calculate loss.
I am not sure what recipe to use for training(grad ACC steps, LR changes etc), I am just doing basic fine-tuning for now.
For a small batch, my train loss seems to be very low initially and also converge quickly:

Next step would be to fine-tune a larger dataset. @erogol @eginhard if this is in the right direction, I can convert this into a training recipe
and add to the repo.

PS: The code is a bit dirty since I have just re-used whatever was available as long as it doesn't harm my training.

smallsudarshan · 2024-04-29T04:43:19Z

I also now understand that the decoder of DVAE is not used, but instead an LM head is used on the GPT-2 to recompute the mel from the audio-codes. Need to understand this a bit better before writing the next stage training code.

ScottishFold007 · 2024-05-31T13:08:50Z

Awesome! Amazing! Did you implement the stage 'Finally fine-tune end to end with the Hi-Fi GAN' ？

ScottishFold007 · 2024-06-03T10:47:16Z

dvae

May I ask a question haha, to train the dvae model, is it only necessary to use the features of the audio file? Text is not needed?

pineking · 2024-06-04T01:40:47Z

dvae

May I ask a question haha, to train the dvae model, is it only necessary to use the features of the audio file? Text is not needed?

yes.

smallsudarshan · 2024-06-04T05:25:17Z

Awesome! Amazing! Did you implement the stage 'Finally fine-tune end to end with the Hi-Fi GAN' ？

Hey @ScottishFold007 unfortunately no, we have been experimenting with fine-tuning just the GPT2 model with larger and much more accurately annotated custom datasets.

In case you are facing quality issues, my suggestion would be to focus a lot on the dataset, it really helped us drastically improve quality. Particularly:

Consistent and clear pronunciations - eg. if randomly sometimes your word has high pitch, high speed and sometimes low, then it will not work -> but however, if there is a pattern, for eg. words before a comma are slow and words after comma are fast, it will pick it up.
Commas match pauses - if you don't have commas where you have pauses, then it will randomly pause/generate gibberish when it sees a comma etc. If you don't have commas, your speech will seem monotonous/bland.

We are yet to pick up training for the other stages, it's in my to-do list. I just deprioritized it a bit since I did not get any response either from the repo owners or someone who has previously contributed to this. And I did not want to build something that might mislead people by implementing the wrong thing without peer review.

ScottishFold007 · 2024-06-04T08:49:57Z

Awesome! Amazing! Did you implement the stage 'Finally fine-tune end to end with the Hi-Fi GAN' ？

Hey @ScottishFold007 unfortunately no, we have been experimenting with fine-tuning just the GPT2 model with larger and much more accurately annotated custom datasets.

In case you are facing quality issues, my suggestion would be to focus a lot on the dataset, it really helped us drastically improve quality. Particularly:

Consistent and clear pronunciations - eg. if randomly sometimes your word has high pitch, high speed and sometimes low, then it will not work -> but however, if there is a pattern, for eg. words before a comma are slow and words after comma are fast, it will pick it up.

Commas match pauses - if you don't have commas where you have pauses, then it will randomly pause/generate gibberish when it sees a comma etc. If you don't have commas, your speech will seem monotonous/bland.

We are yet to pick up training for the other stages, it's in my to-do list. I just deprioritized it a bit since I did not get any response either from the repo owners or someone who has previously contributed to this. And I did not want to build something that might mislead people by implementing the wrong thing without peer review.

I must say, you are very meticulous, kudos to you! Hasn't coqui-ai shut down? With no one maintaining it, I'm currently putting into practice the inspiration you provided. With a large amount of data, it still has a significant effect; moreover, training the dvae is just the first phase. After training is complete, we use this new dvae model to continue to the second phase: training the GPT model, followed by the third phase of training Hifi. I think that in the absence of peer review, we could team up to put this into practice, then report on progress and any issues that may arise, and work together to solve them. I'm not sure if you have WeChat (or any other social media), but I've started some discussion groups to explore each other's practical experiences and to pioneer together.

pineking · 2024-06-06T05:55:58Z

Awesome! Amazing! Did you implement the stage 'Finally fine-tune end to end with the Hi-Fi GAN' ？

Hey @ScottishFold007 unfortunately no, we have been experimenting with fine-tuning just the GPT2 model with larger and much more accurately annotated custom datasets.
In case you are facing quality issues, my suggestion would be to focus a lot on the dataset, it really helped us drastically improve quality. Particularly:

Consistent and clear pronunciations - eg. if randomly sometimes your word has high pitch, high speed and sometimes low, then it will not work -> but however, if there is a pattern, for eg. words before a comma are slow and words after comma are fast, it will pick it up.

Commas match pauses - if you don't have commas where you have pauses, then it will randomly pause/generate gibberish when it sees a comma etc. If you don't have commas, your speech will seem monotonous/bland.

We are yet to pick up training for the other stages, it's in my to-do list. I just deprioritized it a bit since I did not get any response either from the repo owners or someone who has previously contributed to this. And I did not want to build something that might mislead people by implementing the wrong thing without peer review.

I must say, you are very meticulous, kudos to you! Hasn't coqui-ai shut down? With no one maintaining it, I'm currently putting into practice the inspiration you provided. With a large amount of data, it still has a significant effect; moreover, training the dvae is just the first phase. After training is complete, we use this new dvae model to continue to the second phase: training the GPT model, followed by the third phase of training Hifi. I think that in the absence of peer review, we could team up to put this into practice, then report on progress and any issues that may arise, and work together to solve them. I'm not sure if you have WeChat (or any other social media), but I've started some discussion groups to explore each other's practical experiences and to pioneer together.

my wechat: pineking, we can discuss the training questions.

ScottishFold007 · 2024-06-06T06:43:02Z

Awesome! Amazing! Did you implement the stage 'Finally fine-tune end to end with the Hi-Fi GAN' ？

Hey @ScottishFold007 unfortunately no, we have been experimenting with fine-tuning just the GPT2 model with larger and much more accurately annotated custom datasets.
In case you are facing quality issues, my suggestion would be to focus a lot on the dataset, it really helped us drastically improve quality. Particularly:

Consistent and clear pronunciations - eg. if randomly sometimes your word has high pitch, high speed and sometimes low, then it will not work -> but however, if there is a pattern, for eg. words before a comma are slow and words after comma are fast, it will pick it up.

Commas match pauses - if you don't have commas where you have pauses, then it will randomly pause/generate gibberish when it sees a comma etc. If you don't have commas, your speech will seem monotonous/bland.

We are yet to pick up training for the other stages, it's in my to-do list. I just deprioritized it a bit since I did not get any response either from the repo owners or someone who has previously contributed to this. And I did not want to build something that might mislead people by implementing the wrong thing without peer review.

I must say, you are very meticulous, kudos to you! Hasn't coqui-ai shut down? With no one maintaining it, I'm currently putting into practice the inspiration you provided. With a large amount of data, it still has a significant effect; moreover, training the dvae is just the first phase. After training is complete, we use this new dvae model to continue to the second phase: training the GPT model, followed by the third phase of training Hifi. I think that in the absence of peer review, we could team up to put this into practice, then report on progress and any issues that may arise, and work together to solve them. I'm not sure if you have WeChat (or any other social media), but I've started some discussion groups to explore each other's practical experiences and to pioneer together.

my wechat: pineking, we can discuss the training questions.

好的，加你了

smallsudarshan · 2024-06-08T08:36:29Z

@ScottishFold007 @pineking unfortunately I don't use wechat. Maybe we can connect on discord?

There is this repository https://github.com/idiap/coqui-ai-TTS -> where they are maintaining a new pip package for TTS. I had asked the author if they would consider merging something like this, and he said he would, if we are able to replicate the TTS model from scratch.

Also, currently I have 2-3 projects running, so not sure if I will move on this with speed, but happy to connect and contribute in any way I can every now and then.

daswer123 · 2024-06-17T20:06:23Z

@smallsudarshan Hi, thank you for the code, I put everything in one place and made it easier for someone who will want to do a DVAE finetune,
https://github.com/daswer123/xtts-finetune-tests/tree/main/dvae-finetune

smallsudarshan · 2024-06-20T04:29:36Z

@daswer123 thanks a lot for picking up the baton!

Few things I have observed:

Currently the speaker conditioning does not work well for out of training data samples/speakers.

One of the ways to make the model more robust in this to change the training recipe a bit. Currently the ljspeech data loader completely ignores speaker information.

During training, the same sample is giving to the perceiver that needs to be synthesized. What if instead, we keep the speaker (and if applicable other characteristics like emotion) the same but use a sample with different spoken content?

That way, the model might learn that it is the style from the speaker that has to be picked and it might also work a bit better for out-of-distribution (not sure though).

The speaker conditioning just directly uses the perceiver model which does not have any explicit vectors for emotions. So how to produce voice that is angry, sad, happy etc for the same speaker?

If this has to truly work, it needs to have explicit separate vectors maybe that represent emotion and speaker info?

Point 2 is a bit of a deviation from the XTTS architecture, but point 1 seems simple to implement.

@ScottishFold007 @pineking

tuanh123789 · 2024-06-21T07:03:46Z

@smallsudarshan @daswer123 If you looking for Hifigan XTTS training code. You can checkout this: https://github.com/tuanh123789/Train_Hifigan_XTTS

daswer123 · 2024-06-21T10:08:29Z

@tuanh123789 Wow, thanks, it turns out we have the ability to fine-tune each component for XTTS.

and can you tell me approximately how fine-tuning will affect the result, can we train on multiple speakers?

And how do you think pipelines when we train one voice through all stages: DVAE -> GPT-2 -> HifiGAN , this should give a much better result than fine tuning GPT-2

tuanh123789 · 2024-06-21T10:14:20Z

I experiment with Ljspeech dataset both finetune and train from scratch and output very promising. With vietnamese I use 80h. Sure we can train on multi speakers

tuanh123789 · 2024-06-21T10:15:19Z

One problem with finetune GPT part. The short text audio output is very bad, do you solve it @daswer123

daswer123 · 2024-06-21T10:20:04Z

@tuanh123789 Yeah, I noticed that, too. Unfortunately, I haven't found a solution yet.

manmay-nakhashi · 2024-06-21T11:10:44Z

@smallsudarshan @daswer123 you never have to train a dvae , for finetuning only tune gpt-2 plus hifigan for finetung on larger datasets, dvae works for every langauge, you can even use a pretrained tortoise dvae.

manmay-nakhashi · 2024-06-21T11:11:34Z

for a shorter text it's data problem , add enough short sentences and it'll work.

tuanh123789 · 2024-06-21T11:39:12Z

for a shorter text it's data problem , add enough short sentences and it'll work.

thanks for response. After finetune gpt part with normal data, I use extra corpus about 11h of short text-audio to finetune one more time. But the results is not improve

daswer123 · 2024-06-21T11:53:18Z

@manmay-nakhashi I'm not really familiar with all the processes and maybe I don't understand something, but why is fine tuning DVAE and then passing it to GPT-2 not necessary, wouldn't pre-training DVAE on the training dataset give GPT-2 a better view of the dataset?

manmay-nakhashi · 2024-06-21T12:03:14Z

@daswer123 dvae is universal, can adapt to any language , it just learns how to compress a spectrogram.

tuanh123789 · 2024-06-21T12:12:44Z

It's true, I implement training Dvae pipeline for Vietnamese, but the results is quite the same when using pretrain on other languages. But the short text after finetune gpt is the problem

manmay-nakhashi · 2024-06-21T13:30:08Z

@tuanh123789 it's a data problem add lot's of single word and short sentences.

tuanh123789 · 2024-06-21T14:10:04Z

Yeah, Let's try

smallsudarshan · 2024-06-21T14:46:35Z

I also now understand that the decoder of DVAE is not used, but instead an LM head is used on the GPT-2 to recompute the mel from the audio-codes. Need to understand this a bit better before writing the next stage training code.

@manmay-nakhashi @tuanh123789 is the dvae even being used? I had checked it sometime back and I don't think it was being used.

And yes, short text is just a simple data problem.

One more problem - I have also seen short audio spikes at the end of speech, not sure how to solve it, but can probably be post-processed.

smallsudarshan · 2024-06-21T14:47:47Z

for a shorter text it's data problem , add enough short sentences and it'll work.

thanks for response. After finetune gpt part with normal data, I use extra corpus about 11h of short text-audio to finetune one more time. But the results is not improve

@tuanh123789 did you try mix training? Sequential had not given great results for us.

NikitaKononov · 2024-07-03T12:22:36Z

@tuanh123789 it certainly gets better when you try mix training (small and large sentences together), use a large set of single words. We are still testing this. Will post here.
Can you share the part in the code where the DVAE is being imported and used?

Hello, did you train GPT part of xtts using multi-gpu? for some reason DDP in coqui's codes doesn't work properly, did you face the same issue?

Yes i use 8 GPU to train GPT part and succesful. What problem do you get?

Do you use num_workers> 0 in dataloader?

I get those gpu load graphs with DDP (gpu0 purple, gpu1 green - all the rest GPUs behave the same)
than training hangs, gets stuck

With one GPU and num_workers > 0 things go the same way

it only works with 1 GPU and num_workers=0 in my case

It's probably not a hardware problem, tortoise TTS and some other DDP tunings go well, only coqui's Trainer has those problems

tuanh123789 · 2024-07-03T12:24:14Z

@tuanh123789 it certainly gets better when you try mix training (small and large sentences together), use a large set of single words. We are still testing this. Will post here.
Can you share the part in the code where the DVAE is being imported and used?

Hello, did you train GPT part of xtts using multi-gpu? for some reason DDP in coqui's codes doesn't work properly, did you face the same issue?

Yes i use 8 GPU to train GPT part and succesful. What problem do you get?

Do you use num_workers> 0 in dataloader?

I get those gpu load graphs with DDP (gpu0 purple, gpu1 green - all the rest GPUs behave the same) than training hangs, gets stuck

With one GPU and num_workers > 0 things go the same way

it only works with 1 GPU and num_workers=0 in my case

It's probably not a hardware problem, tortoise TTS and some other DDP tunings go well, only coqui's Trainer has those problems

Yes I set num_woker > 0. What hardware do you use?

NikitaKononov · 2024-07-03T12:26:52Z

Yes I set num_woker > 0. What hardware do you use?

x6 RTX a6000 48GB, 512GB RAM, 128 amd cores, nvme fast ssds

NikitaKononov · 2024-07-03T12:32:20Z

Yes I set num_woker > 0. What hardware do you use?

did you use standart coqui/TTS code to train?
or some kind of fork

tuanh123789 · 2024-07-05T02:22:05Z

I use code provide by coqui

NikitaKononov · 2024-07-05T10:51:34Z

I use code provide by coqui

Can you please tell me, do you use the same command?
python -m trainer.distribute --script recipes/ljspeech/xtts_v2/train_gpt_xtts.py --gpus 0,1,2,3,4,5

maybe it's the problem

tuanh123789 · 2024-07-12T02:25:08Z

@tuanh123789 it certainly gets better when you try mix training (small and large sentences together), use a large set of single words. We are still testing this. Will post here.

Can you share the part in the code where the DVAE is being imported and used?

Can you provide sentences length ratio in training dataset. You said that adding single words during training. But in the code there is a section that removes audio segments < 0.5s

manmay-nakhashi · 2024-07-12T06:04:36Z

you can reduce that to 0.3 may be if you want to just add hi, hello etc.

tuanh123789 · 2024-07-12T06:07:39Z

you can reduce that to 0.3 may be if you want to just add hi, hello etc.

Thank you 🤗

anhnh2002 · 2024-07-27T16:47:44Z

Hi @tuanh123789, have you overcome the short text error yet?

tuanh123789 · 2024-07-27T16:54:29Z

Yes. Add more short sentences. And config min_condition_length in train smaller

anhnh2002 · 2024-07-28T05:15:31Z

My thanks

anhnh2002 · 2024-07-28T06:31:32Z

Yes. Add more short sentences. And config min_condition_length in train smaller

As I understand it, min_condition_length is only related to the reference audio. So how does it address the short text problem?

tuanh123789 · 2024-07-28T06:46:40Z

Yes add more short audio and this config will solve the problem

anhnh2002 · 2024-07-28T07:31:40Z

Yes add more short audio and this config will solve the problem

Can you provide me with more information about the number of hours of short audio and the specific min_condition_length value to achieve good results?

tuanh123789 · 2024-07-30T08:48:46Z

Yes add more short audio and this config will solve the problem

Can you provide me with more information about the number of hours of short audio and the specific min_condition_length value to achieve good results?

Finetune Dvae with your data :D

thivux · 2024-07-31T11:02:22Z

@tuanh123789 hi, can you share what changes did you make to the training code to enable fine-tuning on Vietnamese data?

saiful9379 · 2024-08-13T18:23:53Z

@tuanh123789

Yes. Add more short sentences. And config min_condition_length in train smaller

`
model_args = GPTArgs(
max_conditioning_length=132300, # 6 secs
min_conditioning_length=66150 # 3 secs

`
Do you mention "min_conditioning_length" but what length do you suggest?

anhnh2002 · 2024-09-08T08:20:34Z

Hello everyone, below is my code for fine-tuning XTTS for a new language. It works well in my case with over 100 hours of audio (even for short text), based on the code by @smallsudarshan
https://github.com/nguyenhoanganh2002/XTTSv2-Finetuning-for-New-Languages

sachin-seisei · 2024-10-12T07:32:50Z

@smallsudarshan Hi, thank you for the code, I put everything in one place and made it easier for someone who will want to do a DVAE finetune, https://github.com/daswer123/xtts-finetune-tests/tree/main/dvae-finetune

Man can't thank you enough you just saved a lot of my time!!

sachin-seisei · 2024-10-12T08:01:51Z

did some one try fine tuning perceiver sampler??

iamdenay · 2024-10-16T09:04:24Z

Can we conclude that DVAE retraining is not worth it? Can anyone confirm that it had a positive effect?
Should we stick to GPT and Hifigan fine-tuning?

stale · 2024-12-08T09:24:25Z

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions. You might also look our discussion channels.

ukemamaster · 2024-12-23T17:34:13Z

Finetune Dvae with your data :D

Hi @tuanh123789 , I am trying to fine-tune xtts-v2. And i face the "short text" problem. The model hallucinates for shot text (1-3 words). I added a lot of short text data to my dataset but the problem persists. i kept min_conditioning_length in GPT config to 3s (as default).

Is it NECESSARY to fine-tune VAE before the GPT model, or just changing min_conditioning_length to 0.5s in GPT config solves the problem?
Also, it is NECESSARY to fine-tune HifiGan after GPT?

NikitaKononov · 2024-12-23T17:38:29Z

Finetune Dvae with your data :D

Hi @tuanh123789 , I am trying to fine-tune xtts-v2. And i face the "short text" problem. The model hallucinates for shot text (1-3 words). I added a lot of short text data to my dataset but the problem persists. i kept min_conditioning_length in GPT config to 3s (as default).

Is it NECESSARY to fine-tune VAE before the GPT model, or just changing min_conditioning_length to 0.5s in GPT config solves the problem? Also, it is NECESSARY to fine-tune HifiGan after GPT?

Changing condition length brakes model's abilities to voice cloning, but you can try

Hifigan tuning is not necessary
DVAE is worth tuning before gpt I think, but haven't tried that yet, it could solve the problem

ukemamaster · 2024-12-24T12:13:50Z

Yes add more short audio and this config will solve the problem

@tuanh123789 No need to train VAE?

smallsudarshan added the feature request feature requests for making TTS better. label Apr 24, 2024

smallsudarshan mentioned this issue Apr 29, 2024

[Feature request] Add Recipe for all 3 Training stages - XTTS V2 idiap/coqui-ai-TTS#15

Closed

daswer123 mentioned this issue Jun 18, 2024

Adding DVAE fine-tuning + Interface rework daswer123/xtts-finetune-webui#34

Open

VafaKnm mentioned this issue Jul 13, 2024

Roadmap for training XTTS for custom language tuanh123789/Train_Hifigan_XTTS#5

Open

erew123 mentioned this issue Aug 11, 2024

Alltalkbeta erew123/alltalk_tts#288

Merged

stale bot added the wontfix This will not be worked on but feel free to help. label Dec 8, 2024

stale bot removed the wontfix This will not be worked on but feel free to help. label Dec 24, 2024

[Feature request] Add Recipe for all 3 Training stages - XTTS V2 #3704

[Feature request] Add Recipe for all 3 Training stages - XTTS V2 #3704

Comments

smallsudarshan commented Apr 24, 2024

smallsudarshan commented Apr 27, 2024 • edited Loading

smallsudarshan commented Apr 29, 2024 • edited Loading

ScottishFold007 commented May 31, 2024

ScottishFold007 commented Jun 3, 2024

pineking commented Jun 4, 2024

smallsudarshan commented Jun 4, 2024 • edited Loading

ScottishFold007 commented Jun 4, 2024

pineking commented Jun 6, 2024

ScottishFold007 commented Jun 6, 2024

smallsudarshan commented Jun 8, 2024

daswer123 commented Jun 17, 2024 • edited Loading

smallsudarshan commented Jun 20, 2024

tuanh123789 commented Jun 21, 2024

daswer123 commented Jun 21, 2024 • edited Loading

tuanh123789 commented Jun 21, 2024

tuanh123789 commented Jun 21, 2024

daswer123 commented Jun 21, 2024

manmay-nakhashi commented Jun 21, 2024

manmay-nakhashi commented Jun 21, 2024

tuanh123789 commented Jun 21, 2024

daswer123 commented Jun 21, 2024

manmay-nakhashi commented Jun 21, 2024

tuanh123789 commented Jun 21, 2024

manmay-nakhashi commented Jun 21, 2024

tuanh123789 commented Jun 21, 2024

smallsudarshan commented Jun 21, 2024

smallsudarshan commented Jun 21, 2024

NikitaKononov commented Jul 3, 2024

tuanh123789 commented Jul 3, 2024

NikitaKononov commented Jul 3, 2024

NikitaKononov commented Jul 3, 2024

tuanh123789 commented Jul 5, 2024

NikitaKononov commented Jul 5, 2024

tuanh123789 commented Jul 12, 2024

manmay-nakhashi commented Jul 12, 2024

tuanh123789 commented Jul 12, 2024

anhnh2002 commented Jul 27, 2024

tuanh123789 commented Jul 27, 2024

anhnh2002 commented Jul 28, 2024

anhnh2002 commented Jul 28, 2024

tuanh123789 commented Jul 28, 2024

anhnh2002 commented Jul 28, 2024

tuanh123789 commented Jul 30, 2024

thivux commented Jul 31, 2024

saiful9379 commented Aug 13, 2024 • edited Loading

anhnh2002 commented Sep 8, 2024 • edited Loading

sachin-seisei commented Oct 12, 2024

sachin-seisei commented Oct 12, 2024

iamdenay commented Oct 16, 2024

stale bot commented Dec 8, 2024

ukemamaster commented Dec 23, 2024

NikitaKononov commented Dec 23, 2024

ukemamaster commented Dec 24, 2024

smallsudarshan commented Apr 27, 2024 •

edited

Loading

smallsudarshan commented Apr 29, 2024 •

edited

Loading

smallsudarshan commented Jun 4, 2024 •

edited

Loading

daswer123 commented Jun 17, 2024 •

edited

Loading

daswer123 commented Jun 21, 2024 •

edited

Loading

saiful9379 commented Aug 13, 2024 •

edited

Loading

anhnh2002 commented Sep 8, 2024 •

edited

Loading