diff --git a/aligner.py b/aligner.py new file mode 100644 index 0000000..d89fe1b --- /dev/null +++ b/aligner.py @@ -0,0 +1,235 @@ +from typing import Tuple +import numpy as np + +import torch +from torch import nn, Tensor +from torch.nn import Module +import torch.nn.functional as F + +from einops import rearrange, repeat + +from beartype import beartype +from beartype.typing import Optional + +def exists(val): + return val is not None + +class AlignerNet(Module): + """alignment model https://arxiv.org/pdf/2108.10447.pdf """ + def __init__( + self, + dim_in=80, + dim_hidden=512, + attn_channels=80, + temperature=0.0005, + ): + super().__init__() + self.temperature = temperature + + self.key_layers = nn.ModuleList([ + nn.Conv1d( + dim_hidden, + dim_hidden * 2, + kernel_size=3, + padding=1, + bias=True, + ), + nn.ReLU(inplace=True), + nn.Conv1d(dim_hidden * 2, attn_channels, kernel_size=1, padding=0, bias=True) + ]) + + self.query_layers = nn.ModuleList([ + nn.Conv1d( + dim_in, + dim_in * 2, + kernel_size=3, + padding=1, + bias=True, + ), + nn.ReLU(inplace=True), + nn.Conv1d(dim_in * 2, dim_in, kernel_size=1, padding=0, bias=True), + nn.ReLU(inplace=True), + nn.Conv1d(dim_in, attn_channels, kernel_size=1, padding=0, bias=True) + ]) + + @beartype + def forward( + self, + queries: Tensor, + keys: Tensor, + mask: Optional[Tensor] = None + ): + key_out = keys + for layer in self.key_layers: + key_out = layer(key_out) + + query_out = queries + for layer in self.query_layers: + query_out = layer(query_out) + + key_out = rearrange(key_out, 'b c t -> b t c') + query_out = rearrange(query_out, 'b c t -> b t c') + + attn_logp = torch.cdist(query_out, key_out) + attn_logp = rearrange(attn_logp, 'b ... -> b 1 ...') + + if exists(mask): + mask = rearrange(mask.bool(), '... c -> ... 1 c') + attn_logp.data.masked_fill_(~mask, -torch.finfo(attn_logp.dtype).max) + + attn = attn_logp.softmax(dim = -1) + return attn, attn_logp + +def pad_tensor(input, pad, value=0): + pad = [item for sublist in reversed(pad) for item in sublist] # Flatten the tuple + assert len(pad) // 2 == len(input.shape), 'Padding dimensions do not match input dimensions' + return F.pad(input, pad, mode='constant', value=value) + +def maximum_path(value, mask, const=None): + device = value.device + dtype = value.dtype + if not exists(const): + const = torch.tensor(float('-inf')).to(device) # Patch for Sphinx complaint + value = value * mask + + b, t_x, t_y = value.shape + direction = torch.zeros(value.shape, dtype=torch.int64, device=device) + v = torch.zeros((b, t_x), dtype=torch.float32, device=device) + x_range = torch.arange(t_x, dtype=torch.float32, device=device).view(1, -1) + + for j in range(t_y): + v0 = pad_tensor(v, ((0, 0), (1, 0)), value = const)[:, :-1] + v1 = v + max_mask = v1 >= v0 + v_max = torch.where(max_mask, v1, v0) + direction[:, :, j] = max_mask + + index_mask = x_range <= j + v = torch.where(index_mask.view(1,-1), v_max + value[:, :, j], const) + + direction = torch.where(mask.bool(), direction, 1) + + path = torch.zeros(value.shape, dtype=torch.float32, device=device) + index = mask[:, :, 0].sum(1).long() - 1 + index_range = torch.arange(b, device=device) + + for j in reversed(range(t_y)): + path[index_range, index, j] = 1 + index = index + direction[index_range, index, j] - 1 + + path = path * mask.float() + path = path.to(dtype=dtype) + return path + +class ForwardSumLoss(Module): + def __init__( + self, + blank_logprob = -1 + ): + super().__init__() + self.blank_logprob = blank_logprob + + self.ctc_loss = torch.nn.CTCLoss( + blank = 0, # check this value + zero_infinity = True + ) + + def forward(self, attn_logprob, key_lens, query_lens): + device, blank_logprob = attn_logprob.device, self.blank_logprob + max_key_len = attn_logprob.size(-1) + + # Reorder input to [query_len, batch_size, key_len] + attn_logprob = rearrange(attn_logprob, 'b 1 c t -> c b t') + + # Add blank label + attn_logprob = F.pad(attn_logprob, (1, 0, 0, 0, 0, 0), value = blank_logprob) + + # Convert to log probabilities + # Note: Mask out probs beyond key_len + mask_value = -torch.finfo(attn_logprob.dtype).max + attn_logprob.masked_fill_(torch.arange(max_key_len + 1, device=device, dtype=torch.long).view(1, 1, -1) > key_lens.view(1, -1, 1), mask_value) + + attn_logprob = attn_logprob.log_softmax(dim = -1) + + # Target sequences + target_seqs = torch.arange(1, max_key_len + 1, device=device, dtype=torch.long) + target_seqs = repeat(target_seqs, 'n -> b n', b = key_lens.numel()) + + # Evaluate CTC loss + cost = self.ctc_loss(attn_logprob, target_seqs, query_lens, key_lens) + + return cost + +class BinLoss(Module): + def forward(self, attn_hard, attn_logprob, key_lens): + batch, device = attn_logprob.shape[0], attn_logprob.device + max_key_len = attn_logprob.size(-1) + + # Reorder input to [query_len, batch_size, key_len] + attn_logprob = rearrange(attn_logprob, 'b 1 c t -> c b t') + attn_hard = rearrange(attn_hard, 'b t c -> c b t') + + mask_value = -torch.finfo(attn_logprob.dtype).max + + attn_logprob.masked_fill_(torch.arange(max_key_len, device=device, dtype=torch.long).view(1, 1, -1) > key_lens.view(1, -1, 1), mask_value) + attn_logprob = attn_logprob.log_softmax(dim = -1) + + return (attn_hard * attn_logprob).sum() / batch + +class Aligner(Module): + def __init__( + self, + dim_in, + dim_hidden, + attn_channels=80, + temperature=0.0005 + ): + super().__init__() + self.dim_in = dim_in + self.dim_hidden = dim_hidden + self.attn_channels = attn_channels + self.temperature = temperature + self.aligner = AlignerNet( + dim_in = self.dim_in, + dim_hidden = self.dim_hidden, + attn_channels = self.attn_channels, + temperature = self.temperature + ) + + def forward( + self, + x, + x_mask, + y, + y_mask + ): + alignment_soft, alignment_logprob = self.aligner(y, rearrange(x, 'b d t -> b t d'), x_mask) + + x_mask = rearrange(x_mask, '... i -> ... i 1') + y_mask = rearrange(y_mask, '... j -> ... 1 j') + attn_mask = x_mask * y_mask + attn_mask = rearrange(attn_mask, 'b 1 i j -> b i j') + + alignment_soft = rearrange(alignment_soft, 'b 1 c t -> b t c') + alignment_mask = maximum_path(alignment_soft, attn_mask) + + alignment_hard = torch.sum(alignment_mask, -1).int() + return alignment_hard, alignment_soft, alignment_logprob, alignment_mask + +if __name__ == '__main__': + batch_size = 10 + seq_len_y = 200 # length of sequence y + seq_len_x = 35 + feature_dim = 80 # feature dimension + + x = torch.randn(batch_size, 512, seq_len_x) + x = x.transpose(1,2) #dim-1 is the channels for conv + y = torch.randn(batch_size, seq_len_y, feature_dim) + y = y.transpose(1,2) #dim-1 is the channels for conv + + # Create masks + x_mask = torch.ones(batch_size, 1, seq_len_x) + y_mask = torch.ones(batch_size, 1, seq_len_y) + + align = Aligner(dim_in = 80, dim_hidden=512, attn_channels=80) + alignment_hard, alignment_soft, alignment_logprob, alignment_mas = align(x, x_mask, y, y_mask) \ No newline at end of file diff --git a/models.py b/models.py index dc8fa64..7519365 100644 --- a/models.py +++ b/models.py @@ -12,6 +12,7 @@ import modules import monotonic_align from commons import get_padding, init_weights +from .aligner import Aligner, ForwardSumLoss, BinLoss AVAILABLE_FLOW_TYPES = [ "pre_conv", @@ -1231,6 +1232,16 @@ def __init__( self.dp = DurationPredictor( hidden_channels, 256, 3, 0.5, gin_channels=gin_channels ) + + self.aligner = Aligner( + dim_in=80, + dim_hidden=self.enc_gin_channels, + attn_channels=self.enc_gin_channels, + ) + + self.aligner_loss = ForwardSumLoss() + self.bin_loss = BinLoss() + self.aligner_bin_loss_weight = 0.0 if n_speakers > 1: self.emb_g = nn.Embedding(n_speakers, gin_channels) @@ -1245,37 +1256,46 @@ def forward(self, x, x_lengths, y, y_lengths, sid=None): z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) z_p = self.flow(z, y_mask, g=g) - with torch.no_grad(): - # negative cross-entropy - s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t] - neg_cent1 = torch.sum( - -0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True - ) # [b, 1, t_s] - neg_cent2 = torch.matmul( - -0.5 * (z_p**2).transpose(1, 2), s_p_sq_r - ) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] - neg_cent3 = torch.matmul( - z_p.transpose(1, 2), (m_p * s_p_sq_r) - ) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] - neg_cent4 = torch.sum( - -0.5 * (m_p**2) * s_p_sq_r, [1], keepdim=True - ) # [b, 1, t_s] - neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4 - - if self.use_noise_scaled_mas: - epsilon = ( - torch.std(neg_cent) - * torch.randn_like(neg_cent) - * self.current_mas_noise_scale - ) - neg_cent = neg_cent + epsilon - - attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) - attn = ( - monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)) - .unsqueeze(1) - .detach() + # with torch.no_grad(): + # # negative cross-entropy + # s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t] + # neg_cent1 = torch.sum( + # -0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True + # ) # [b, 1, t_s] + # neg_cent2 = torch.matmul( + # -0.5 * (z_p**2).transpose(1, 2), s_p_sq_r + # ) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] + # neg_cent3 = torch.matmul( + # z_p.transpose(1, 2), (m_p * s_p_sq_r) + # ) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] + # neg_cent4 = torch.sum( + # -0.5 * (m_p**2) * s_p_sq_r, [1], keepdim=True + # ) # [b, 1, t_s] + # neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4 + + # if self.use_noise_scaled_mas: + # epsilon = ( + # torch.std(neg_cent) + # * torch.randn_like(neg_cent) + # * self.current_mas_noise_scale + # ) + # neg_cent = neg_cent + epsilon + + # attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) + # attn = ( + # monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)) + # .unsqueeze(1) + # .detach() + # ) + + aln_hard, aln_soft, aln_log, aln_mask = self.aligner( + m_p.transpose(1,2), x_mask, y, y_mask ) + attn = aln_mask.transpose(1,2).unsqueeze(1) + align_loss = self.aligner_loss(aln_log, x_lengths, y_lengths) + if self.aligner_bin_loss_weight > 0.: + align_bin_loss = self.bin_loss(aln_mask, aln_log, x_lengths) * self.aligner_bin_loss_weight + align_loss = align_loss + align_bin_loss w = attn.sum(2) if self.use_sdp: @@ -1307,6 +1327,7 @@ def forward(self, x, x_lengths, y, y_lengths, sid=None): y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), (x, logw, logw_), + align_loss, ) def infer( diff --git a/train_ms.py b/train_ms.py index 17d4cd3..4876436 100644 --- a/train_ms.py +++ b/train_ms.py @@ -358,6 +358,7 @@ def train_and_evaluate( z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), (hidden_x, logw, logw_), + align_loss, ) = net_g(x, x_lengths, spec, spec_lengths, speakers) if ( @@ -437,7 +438,7 @@ def train_and_evaluate( if net_dur_disc is not None: y_dur_hat_r, y_dur_hat_g = net_dur_disc(hidden_x, x_mask, logw_, logw) with autocast(enabled=False): - loss_dur = torch.sum(l_length.float()) + loss_dur = torch.sum(l_length.float()) + align_loss loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl