-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_duration_model.py
119 lines (99 loc) · 3.96 KB
/
train_duration_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import torch # isort:skip
from tqdm.cli import tqdm
from transformers import Adafactor
from torch_ema import ExponentialMovingAverage
import matplotlib.pyplot as plt
import tensorflow as tf
from models import DurationNet
device = "cuda" if torch.cuda.is_available() else "cpu"
tf.config.set_visible_devices([], 'GPU')
def load_tfdata(root, split, batch_size):
feature_description = {
"phone_idx": tf.io.FixedLenFeature([], tf.string),
"phone_duration": tf.io.FixedLenFeature([], tf.string),
}
def parse_tfrecord(r):
r = tf.io.parse_example(r, feature_description)
phone_idx = tf.reshape(tf.io.parse_tensor(r["phone_idx"], out_type=tf.int32), [-1])
phone_duration = tf.reshape(
tf.io.parse_tensor(r["phone_duration"], out_type=tf.float32), [-1]
)
return {
"phone_idx": phone_idx,
"phone_duration": phone_duration,
"phone_length": tf.shape(phone_duration)[0],
}
files = tf.data.Dataset.list_files(f"{root}/{split}/part_*.tfrecords")
return (
tf.data.TFRecordDataset(files, num_parallel_reads=4)
.map(parse_tfrecord, num_parallel_calls=4)
.shuffle(buffer_size=batch_size * 32)
.bucket_by_sequence_length(
lambda x: x["phone_length"],
bucket_boundaries=(32, 64, 128, 256, 512),
bucket_batch_sizes=[batch_size] * 6,
pad_to_bucket_boundary=False,
drop_remainder=True,
)
.prefetch(1)
)
net = DurationNet(256, 64, 4).to(device)
optim = Adafactor(net.parameters(), warmup_init=False)
ema = ExponentialMovingAverage(net.parameters(), decay=0.995)
batch_size = 4
num_epochs = 50
def loss_fn(net, batch):
token = batch["phone_idx"]
duration = batch["phone_duration"] / 1000.
length = batch["phone_length"]
mask = torch.arange(0, duration.shape[1], device=device)[None, :] < length[:, None]
y = net(token, length).squeeze(-1)
loss = torch.nn.functional.l1_loss(y, duration, reduction="none")
loss = torch.where(mask == 1, loss, 0.0)
loss = torch.sum(loss) / torch.sum(mask)
return loss
ds = load_tfdata("data/tfdata", "train", batch_size)
val_ds = load_tfdata("data/tfdata", "test", batch_size)
def prepare_batch(batch):
return {
"phone_idx": torch.from_numpy(batch["phone_idx"]).to(device, non_blocking=True),
"phone_duration": torch.from_numpy(batch["phone_duration"]).to(device, non_blocking=True),
"phone_length": torch.from_numpy(batch["phone_length"]).to(device, non_blocking=True),
}
for epoch in range(num_epochs):
losses = []
for batch in tqdm(ds.as_numpy_iterator()):
batch = prepare_batch(batch)
loss = loss_fn(net, batch)
optim.zero_grad(set_to_none=True)
loss.backward()
optim.step()
ema.update()
losses.append(loss.item())
train_loss = sum(losses) / len(losses)
losses = []
with ema.average_parameters():
with torch.inference_mode():
net.eval()
for batch in val_ds.as_numpy_iterator():
batch = prepare_batch(batch)
loss = loss_fn(net, batch)
losses.append(loss.item())
net.train()
val_loss = sum(losses) / len(losses)
print(f"epoch {epoch:<3d} train loss {train_loss:.5} val loss {val_loss:.5f}")
ema.copy_to(net.parameters())
net = net.eval()
torch.save(net.state_dict(), "ckpts/duration_model/duration_model.pth")
with torch.inference_mode():
for batch in val_ds.as_numpy_iterator():
batch = prepare_batch(batch)
duration = batch["phone_duration"] / 1000
y = net(batch["phone_idx"], batch["phone_length"]).squeeze(-1)
break
plt.figure(figsize=(10, 5))
d = duration[0].tolist()
t = y[0].tolist()
plt.plot(d, '-*', label="target")
plt.plot(t, '-*', label="predict")
plt.legend()