Commit
8429b5b (1 parent: 0007f6c)
Showing 60 changed files with 6,269 additions and 0 deletions.
@@ -0,0 +1,8 @@
from tango_edm.audioldm.ldm import LatentDiffusion
from tango_edm.audioldm.utils import seed_everything, save_wave, get_time, get_duration
# from tango_edm.audioldm.pipeline import *
@@ -0,0 +1,183 @@
#!/usr/bin/python3
import os
from tango_edm.audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time, round_up_duration, get_duration
import argparse

CACHE_DIR = os.getenv(
    "AUDIOLDM_CACHE_DIR",
    os.path.join(os.path.expanduser("~"), ".cache/audioldm"))

parser = argparse.ArgumentParser()

parser.add_argument(
    "--mode",
    type=str,
    required=False,
    default="generation",
    help="generation: text-to-audio generation; transfer: style transfer",
    choices=["generation", "transfer"]
)

parser.add_argument(
    "-t",
    "--text",
    type=str,
    required=False,
    default="",
    help="Text prompt to the model for audio generation",
)

parser.add_argument(
    "-f",
    "--file_path",
    type=str,
    required=False,
    default=None,
    help="(--mode transfer): original audio file for style transfer; or (--mode generation): the guidance audio file for generating similar audio",
)

parser.add_argument(
    "--transfer_strength",
    type=float,
    required=False,
    default=0.5,
    help="A value between 0 and 1. 0 means the original audio without transfer, 1 means completely transferring to the audio indicated by the text",
)

parser.add_argument(
    "-s",
    "--save_path",
    type=str,
    required=False,
    help="The path to save model output",
    default="./output",
)

parser.add_argument(
    "--model_name",
    type=str,
    required=False,
    help="The checkpoint to use",
    default="audioldm-s-full",
    choices=["audioldm-s-full", "audioldm-l-full", "audioldm-s-full-v2"]
)

parser.add_argument(
    "-ckpt",
    "--ckpt_path",
    type=str,
    required=False,
    help="The path to the pretrained .ckpt model",
    default=None,
)

parser.add_argument(
    "-b",
    "--batchsize",
    type=int,
    required=False,
    default=1,
    help="How many samples to generate at the same time",
)

parser.add_argument(
    "--ddim_steps",
    type=int,
    required=False,
    default=200,
    help="The number of sampling steps for DDIM",
)

parser.add_argument(
    "-gs",
    "--guidance_scale",
    type=float,
    required=False,
    default=2.5,
    help="Guidance scale (large => better quality and relevance to the text; small => better diversity)",
)

parser.add_argument(
    "-dur",
    "--duration",
    type=float,
    required=False,
    default=10.0,
    help="The duration of the samples",
)

parser.add_argument(
    "-n",
    "--n_candidate_gen_per_text",
    type=int,
    required=False,
    default=3,
    help="Automatic quality control. This controls the number of candidates (e.g., generate three audios and keep the best one). A larger value usually leads to better quality at the cost of heavier computation",
)

parser.add_argument(
    "--seed",
    type=int,
    required=False,
    default=42,
    help="Changing this value (any integer) will lead to a different generation result.",
)

args = parser.parse_args()

if(args.ckpt_path is not None):
    print("Warning: ckpt_path has no effect after version 0.0.20.")

assert args.duration % 2.5 == 0, "Duration must be a multiple of 2.5"

mode = args.mode
if(mode == "generation" and args.file_path is not None):
    mode = "generation_audio_to_audio"
    if(len(args.text) > 0):
        print("Warning: You have specified the --file_path. --text will be ignored")
        args.text = ""

save_path = os.path.join(args.save_path, mode)

if(args.file_path is not None):
    save_path = os.path.join(save_path, os.path.basename(args.file_path.split(".")[0]))

text = args.text
random_seed = args.seed
duration = args.duration
guidance_scale = args.guidance_scale
n_candidate_gen_per_text = args.n_candidate_gen_per_text

os.makedirs(save_path, exist_ok=True)
audioldm = build_model(model_name=args.model_name)

if(args.mode == "generation"):
    waveform = text_to_audio(
        audioldm,
        text,
        args.file_path,
        random_seed,
        duration=duration,
        guidance_scale=guidance_scale,
        ddim_steps=args.ddim_steps,
        n_candidate_gen_per_text=n_candidate_gen_per_text,
        batchsize=args.batchsize,
    )

elif(args.mode == "transfer"):
    assert args.file_path is not None
    assert os.path.exists(args.file_path), "The original audio file '%s' for style transfer does not exist." % args.file_path
    waveform = style_transfer(
        audioldm,
        text,
        args.file_path,
        args.transfer_strength,
        random_seed,
        duration=duration,
        guidance_scale=guidance_scale,
        ddim_steps=args.ddim_steps,
        batchsize=args.batchsize,
    )
    waveform = waveform[:, None, :]

save_wave(waveform, save_path, name="%s_%s" % (get_time(), text))
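For reference, a minimal sketch of the same generation path used programmatically rather than through the CLI. The prompt string and output directory are illustrative; the calls simply mirror the generation branch of the script above.

import os
from tango_edm.audioldm import build_model, text_to_audio, save_wave, get_time

audioldm = build_model(model_name="audioldm-s-full")
waveform = text_to_audio(
    audioldm,
    "A hammer is hitting a wooden surface",  # illustrative prompt
    None,                                    # no guidance audio file
    42,                                      # random seed
    duration=10.0,
    guidance_scale=2.5,
    ddim_steps=200,
    n_candidate_gen_per_text=3,
    batchsize=1,
)
os.makedirs("./output/generation", exist_ok=True)
save_wave(waveform, "./output/generation", name="%s_%s" % (get_time(), "example"))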
@@ -0,0 +1,2 @@
from tango_edm.audioldm.audio.tools import wav_to_fbank, read_wav_file
from tango_edm.audioldm.audio.stft import TacotronSTFT
10 binary files changed (contents not shown).
@@ -0,0 +1,100 @@
import torch
import numpy as np
import librosa.util as librosa_util
from scipy.signal import get_window


def window_sumsquare(
    window,
    n_frames,
    hop_length,
    win_length,
    n_fft,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.
    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.
    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output
    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x
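A small sanity-check sketch for window_sumsquare. The window name, frame count, and hop/window sizes are illustrative, and it assumes a librosa version compatible with the positional pad_center call above.

import numpy as np

# Illustrative STFT configuration; any consistent one would do.
wss = window_sumsquare(
    "hann",
    n_frames=64,
    hop_length=256,
    win_length=1024,
    n_fft=1024,
)

# The output length follows the docstring: n_fft + hop_length * (n_frames - 1).
assert wss.shape[0] == 1024 + 256 * (64 - 1)

# Away from the edges the envelope is roughly constant, which is what makes it
# usable for overlap-add normalization when inverting an STFT.
print(wss[0], wss[len(wss) // 2])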
def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    """

    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    angles = torch.autograd.Variable(torch.from_numpy(angles))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for i in range(n_iters):
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal
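A usage sketch for griffin_lim. It relies only on the docstring's contract (an object exposing transform and inverse); the STFT import path and constructor arguments below are assumptions, not confirmed by this diff, which only shows a TacotronSTFT export.

import torch
from tango_edm.audioldm.audio.stft import STFT  # assumed export alongside TacotronSTFT

# Assumed constructor arguments, mirroring the common Tacotron-style STFT module.
stft_fn = STFT(filter_length=1024, hop_length=256, win_length=1024)

# One second of dummy audio at 16 kHz, shaped (batch, samples).
dummy_audio = torch.randn(1, 16000)
magnitudes, _ = stft_fn.transform(dummy_audio)

# Iteratively re-estimate the phase from the fixed magnitude spectrogram.
recovered = griffin_lim(magnitudes, stft_fn, n_iters=30)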
def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return normalize_fun(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C
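A quick round-trip sketch using only the two functions above with their default arguments: decompression inverts compression up to the clip_val floor.

import torch

x = torch.tensor([1e-7, 0.01, 0.5, 2.0])
compressed = dynamic_range_compression(x)            # log(clamp(x, min=1e-5) * 1)
recovered = dynamic_range_decompression(compressed)  # exp(y) / 1

# Values below clip_val (1e-5) come back as clip_val; the rest are recovered
# up to floating-point error.
print(recovered)  # approximately [1e-5, 0.01, 0.5, 2.0]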