Skip to content

Commit

Permalink
feat: update v1.8
Browse files Browse the repository at this point in the history
feat: update voice separation logic and optimize code structure
fix: resolve ffmpeg encoding errors and phrase initialization issue
refactor: remove whisperX replicate API support for simplicity
perf: enhance prompt for broader model compatibility
style: reduce translation block size to minimize errors
  • Loading branch information
Huanshere committed Nov 13, 2024
1 parent 059e6ca commit 6e85833
Show file tree
Hide file tree
Showing 25 changed files with 506 additions and 846 deletions.
4 changes: 2 additions & 2 deletions batch/utils/video_processor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os, sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from core import step2_whisper, step1_ytdlp, step3_1_spacy_split, step3_2_splitbymeaning
from core import step1_ytdlp, step2_whisperX, step3_1_spacy_split, step3_2_splitbymeaning
from core import step4_1_summarize, step4_2_translate_all, step5_splitforsub, step6_generate_final_timeline
from core import step7_merge_sub_to_vid, step8_gen_audio_task, step10_gen_audio, step11_merge_audio_to_vid
from core.onekeycleanup import cleanup
Expand All @@ -14,7 +14,7 @@ def process_video(file, dubbing=False, is_retry=False):

steps = [
("Processing input file", partial(process_input_file, file)),
("Transcribing with Whisper", partial(step2_whisper.transcribe)),
("Transcribing with Whisper", partial(step2_whisperX.transcribe)),
("Splitting sentences", split_sentences),
("Summarizing and translating", summarize_and_translate),
("Processing and aligning subtitles", process_and_align_subtitles),
Expand Down
21 changes: 9 additions & 12 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,19 @@ api:
base_url: 'https://yunwu.zeabur.app'
model: 'gemini-1.5-pro-002'

# Replicate API settings, only necessary for whisperxapi
replicate_api_token: 'YOUR_KEY'

# Language settings, written into the prompt, can be described in natural language
target_language: 'Chinese'
target_language: '简体中文'

# Whether to use Demucs for vocal separation before transcription. Warning: this may slow down processing and cause some lines to be missed!
demucs: false

whisper:
# Whisper settings [whisperx, whisperxapi]
method: 'whisperx'
# Whisper specified recognition language [en, zh, auto] auto for automatic detection, en for forced translation to English
language: 'en'
detected_language: 'en'

# Video resolution [0x0, 640x360, 1920x1080] 0x0 will generate a 0-second black video placeholder
resolution: '640x360'
resolution: '1920x1080'

## ======================== Advanced Settings ======================== ##
# *Default resolution for downloading YouTube videos [360, 1080, best]
Expand All @@ -30,7 +28,7 @@ subtitle:
# *Maximum length of each subtitle line in characters
max_length: 75
# *Translated subtitles are slightly larger than source subtitles, affecting the reference length for subtitle splitting
target_multiplier: 1.1
target_multiplier: 1.2

# *Number of LLM multi-threaded accesses
max_workers: 8
Expand All @@ -42,7 +40,7 @@ pause_before_translate: false

## ======================== Dubbing Settings ======================== ##
# TTS selection [openai_tts, gpt_sovits, azure_tts, fish_tts]
tts_method: 'azure_tts'
tts_method: 'openai_tts'

# OpenAI TTS-1 API configuration
openai_tts:
Expand Down Expand Up @@ -80,8 +78,7 @@ min_subtitle_duration: 3
min_trim_duration: 2.50

# Volume settings
original_volume: 0.1 # Original voice volume in dubbed video (0.1 = 10% or 0)
dub_volume: 1.5 # *Dubbed audio volume (1.5 = 150%, most original dubbing audio is relatively quiet)
dub_volume: 1.3 # *Dubbed audio volume (1.3 = 130%, most original dubbing audio is relatively quiet)



Expand Down Expand Up @@ -140,4 +137,4 @@ language_split_with_space:
# Languages that do not use space as separator
language_split_without_space:
- 'zh'
- 'ja'
- 'ja'
86 changes: 26 additions & 60 deletions core/all_whisper_methods/demucs_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,87 +2,53 @@
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
import torch
from rich.console import Console
from rich import print as rprint
from demucs.pretrained import get_model
from demucs.audio import save_audio
from torch.cuda import is_available as is_cuda_available
from typing import Optional
from demucs.api import Separator
from demucs.apply import BagOfModels

class PreloadedSeparator(Separator):
def __init__(
self,
model: BagOfModels,
shifts: int = 1,
overlap: float = 0.25,
split: bool = True,
segment: Optional[int] = None,
jobs: int = 0,
):
self._model = model
self._audio_channels = model.audio_channels
self._samplerate = model.samplerate
AUDIO_DIR = "output/audio"
RAW_AUDIO_FILE = os.path.join(AUDIO_DIR, "raw.mp3")
BACKGROUND_AUDIO_FILE = os.path.join(AUDIO_DIR, "background.mp3")
VOCAL_AUDIO_FILE = os.path.join(AUDIO_DIR, "vocal.mp3")

self.update_parameter(
device="cuda" if is_cuda_available() else "mps" if torch.backends.mps.is_available() else "cpu",
shifts=shifts,
overlap=overlap,
split=split,
segment=segment,
jobs=jobs,
progress=True,
callback=None,
callback_arg=None,
)
class PreloadedSeparator(Separator):
def __init__(self, model: BagOfModels, shifts: int = 1, overlap: float = 0.25,
split: bool = True, segment: Optional[int] = None, jobs: int = 0):
self._model, self._audio_channels, self._samplerate = model, model.audio_channels, model.samplerate
device = "cuda" if is_cuda_available() else "mps" if torch.backends.mps.is_available() else "cpu"
self.update_parameter(device=device, shifts=shifts, overlap=overlap, split=split,
segment=segment, jobs=jobs, progress=True, callback=None, callback_arg=None)

def demucs_main(music_file, save_dir='output/audio', background_file='output/audio/background.mp3', original_vocal_file='output/audio/vocal.mp3'):
def demucs_main():
if os.path.exists(VOCAL_AUDIO_FILE) and os.path.exists(BACKGROUND_AUDIO_FILE):
rprint(f"[yellow]⚠️ {VOCAL_AUDIO_FILE} and {BACKGROUND_AUDIO_FILE} already exist, skip Demucs processing.[/yellow]")
return

console = Console()
os.makedirs(AUDIO_DIR, exist_ok=True)

# Ensure output directory exists
if not os.path.exists(save_dir):
os.makedirs(save_dir)

# Load model
console.print("🤖 Loading <htdemucs> model...")
model = get_model('htdemucs')
separator = PreloadedSeparator(model=model, shifts=1, overlap=0.25)

# Use fixed parameters
separator = PreloadedSeparator(
model=model,
shifts=1,
overlap=0.25,
segment=None,
split=True,
jobs=0,
)

# Separate audio
console.print("🎵 Separating audio...")
_, outputs = separator.separate_audio_file(music_file)
_, outputs = separator.separate_audio_file(RAW_AUDIO_FILE)

# Audio output parameters
kwargs = {
"samplerate": model.samplerate,
"bitrate": 64,
"preset": 4,
"clip": "rescale",
"as_float": False,
"bits_per_sample": 16,
}
kwargs = {"samplerate": model.samplerate, "bitrate": 64, "preset": 4,
"clip": "rescale", "as_float": False, "bits_per_sample": 16}

# Save vocals
console.print("🎤 Saving vocals track...")
save_audio(outputs['vocals'].cpu(), original_vocal_file, **kwargs)
save_audio(outputs['vocals'].cpu(), VOCAL_AUDIO_FILE, **kwargs)

# Create and save background music
console.print("🎹 Saving background music...")
background = torch.zeros_like(outputs['vocals'])
for source, audio in outputs.items():
if source != 'vocals':
background += audio
save_audio(background.cpu(), background_file, **kwargs)
background = sum(audio for source, audio in outputs.items() if source != 'vocals')
save_audio(background.cpu(), BACKGROUND_AUDIO_FILE, **kwargs)

console.print("[green]✨ Audio separation completed![/green]")

if __name__ == "__main__":
demucs_main("output/audio/raw_full_audio.mp3")
demucs_main()
131 changes: 131 additions & 0 deletions core/all_whisper_methods/whisperX_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import os, sys, subprocess
import pandas as pd
from moviepy.editor import AudioFileClip
from typing import Dict, List, Tuple
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from core.config_utils import update_key
from core.all_whisper_methods.demucs_vl import RAW_AUDIO_FILE, AUDIO_DIR

def convert_video_to_audio(input_file: str) -> str:
    """Extract the audio of ``input_file`` into ``RAW_AUDIO_FILE`` with FFmpeg.

    The conversion is skipped when ``RAW_AUDIO_FILE`` already exists.
    Otherwise FFmpeg is invoked with the video stream dropped (``-vn``) and
    the output forced to MP3 at 64 kbit/s, 16 kHz, one channel.

    Args:
        input_file: Path of the media file passed to FFmpeg via ``-i``.

    Returns:
        ``RAW_AUDIO_FILE``, whether it was just written or already present.

    Raises:
        subprocess.CalledProcessError: FFmpeg exited with a non-zero status
            (the call uses ``check=True``).
    """
    os.makedirs(AUDIO_DIR, exist_ok=True)

    # An earlier run already produced the audio file: reuse it as-is.
    if os.path.exists(RAW_AUDIO_FILE):
        return RAW_AUDIO_FILE

    ffmpeg_cmd = [
        'ffmpeg', '-y', '-i', input_file,
        '-vn',
        '-b:a', '64k',
        '-ar', '16000',
        '-ac', '1',
        '-metadata', 'encoding=UTF-8',
        '-f', 'mp3',
        RAW_AUDIO_FILE,
    ]

    print("🎬➡️🎵 Converting to audio with FFmpeg ......")
    # stderr is captured rather than echoed; it is attached to the
    # CalledProcessError if FFmpeg fails.
    subprocess.run(ffmpeg_cmd, check=True, stderr=subprocess.PIPE)
    print(f"🎬➡️🎵 Converted <{input_file}> to <{RAW_AUDIO_FILE}> with FFmpeg\n")

    return RAW_AUDIO_FILE

def _detect_silence(audio_file: str, start: float, end: float) -> List[float]:
    """Collect the ``silence_end`` timestamps FFmpeg reports for a segment.

    FFmpeg is run with ``-ss start -to end`` and the audio filter
    ``silencedetect=n=-30dB:d=0.5``; the encoded output is discarded
    (``-f null -``) and only the log written to stderr is parsed.

    Args:
        audio_file: Path of the audio file to analyse.
        start: Value passed to FFmpeg's ``-ss`` option.
        end: Value passed to FFmpeg's ``-to`` option.

    Returns:
        One float per stderr line containing ``silence_end``, in log order.
        NOTE(review): whether these values are relative to ``start`` or to
        the beginning of the file is not evident from this code -- confirm
        against FFmpeg's behaviour before relying on either.
    """
    ffmpeg_cmd = [
        'ffmpeg', '-y', '-i', audio_file,
        '-ss', str(start), '-to', str(end),
        '-af', 'silencedetect=n=-30dB:d=0.5',
        '-f', 'null', '-',
    ]
    completed = subprocess.run(ffmpeg_cmd, capture_output=True, text=True,
                               encoding='utf-8')

    silence_ends = []
    for log_line in completed.stderr.split('\n'):
        if 'silence_end' not in log_line:
            continue
        # The number sits between "silence_end: " and the next space.
        after_marker = log_line.split('silence_end: ')[1]
        silence_ends.append(float(after_marker.split(' ')[0]))
    return silence_ends

def split_audio(audio_file: str, target_len: int = 50*60, win: int = 60) -> List[Tuple[float, float]]:
    """Cut an audio file into consecutive ``(start, end)`` segments.

    Each segment aims for ``target_len`` seconds. Around every intended cut
    a window of ``2 * win`` seconds (clamped to the file duration) is scanned
    with ``_detect_silence``; the first reported silence whose offset from the
    window start exceeds the intended cut offset becomes the cut point. If no
    such silence exists the segment is cut at exactly ``target_len``.

    Args:
        audio_file: Path of the audio file to segment.
        target_len: Desired segment length in seconds.
        win: Half-width, in seconds, of the window searched for silence.

    Returns:
        Segment boundaries in order; each segment starts where the previous
        one ended and the last one ends at the file duration.
    """
    print("🔪 Starting audio segmentation...")

    with AudioFileClip(audio_file) as clip:
        total = clip.duration

    chunks = []
    cursor = 0
    while cursor < total:
        # Whatever is left is shorter than one target segment: emit and stop.
        if total - cursor < target_len:
            chunks.append((cursor, total))
            break

        win_start = cursor + target_len - win
        win_end = min(win_start + 2 * win, total)
        silences = _detect_silence(audio_file, win_start, win_end)

        cut = None
        if silences:
            # Offset of the intended cut measured from the window start.
            target_pos = target_len - (win_start - cursor)
            cut = next((t for t in silences if t - win_start > target_pos), None)

        # No usable silence: fall back to a hard cut at the target length.
        if not cut:
            cut = cursor + target_len

        chunks.append((cursor, cut))
        cursor = cut

    print(f"🔪 Audio split into {len(chunks)} segments")
    return chunks

def process_transcription(result: Dict) -> pd.DataFrame:
    """Flatten a transcription result into one row per word.

    Words longer than 20 characters are skipped with a warning, and French
    guillemets (``«`` / ``»``) are stripped from the remaining words. The
    input ``result`` is not modified.

    Timestamp handling:
      * Word with neither ``start`` nor ``end``: it takes the previous kept
        word's ``end`` for both values; if it is the very first kept word,
        it borrows ``start``/``end`` from the first fully timestamped word of
        the same segment.
      * Word missing only ``start``: ``start`` falls back to the previous
        kept word's ``end`` (or ``0`` when there is none).
      * Word missing only ``end``: ``end`` falls back to its own ``start``.

    Args:
        result: Mapping with a ``'segments'`` list; every segment carries a
            ``'words'`` list of dicts holding ``'word'`` and, optionally,
            ``'start'`` / ``'end'``.

    Returns:
        DataFrame with the columns ``text``, ``start`` and ``end`` (always
        present, even when no word was kept).

    Raises:
        ValueError: The first kept word has no timestamps and its segment
            contains no fully timestamped word to borrow them from.
    """
    columns = ['text', 'start', 'end']
    all_words = []

    for segment in result['segments']:
        for word in segment['words']:
            raw_text = word["word"]

            # Check word length
            if len(raw_text) > 20:
                print(f"⚠️ Warning: Detected word longer than 20 characters, skipping: {raw_text}")
                continue

            # ! For French, guillemets are stripped. Work on a local copy so
            # the caller's dict is left untouched.
            text = raw_text.replace('»', '').replace('«', '')
            previous_end = all_words[-1]['end'] if all_words else None

            if 'start' not in word and 'end' not in word:
                if previous_end is not None:
                    # Collapse the word onto the end of the previous one.
                    start = end = previous_end
                else:
                    # First kept word: borrow from the first timestamped word
                    # in this segment.
                    next_word = next(
                        (w for w in segment['words'] if 'start' in w and 'end' in w),
                        None,
                    )
                    if next_word is None:
                        raise ValueError(
                            f"No next word with timestamp found for the current word : {word}"
                        )
                    start, end = next_word['start'], next_word['end']
            else:
                # At least one timestamp is present; fill in whichever is missing.
                start = word.get('start', previous_end if previous_end is not None else 0)
                end = word.get('end', start)

            all_words.append({'text': text, 'start': start, 'end': end})

    return pd.DataFrame(all_words, columns=columns)

def save_results(df: pd.DataFrame):
    """Clean the word table and write it to ``output/log/cleaned_chunks.xlsx``.

    Rows whose ``text`` is empty and rows whose ``text`` is longer than 20
    characters are dropped (each case is reported on stdout). Every remaining
    ``text`` value is wrapped in double quotes before the sheet is written.
    The frame passed in by the caller is left unchanged.

    Args:
        df: Word table with at least a ``text`` column.
    """
    log_dir = 'output/log'
    os.makedirs(log_dir, exist_ok=True)
    excel_path = os.path.join(log_dir, "cleaned_chunks.xlsx")

    # Compute the lengths once and derive both filters from them.
    lengths = df['text'].str.len()
    non_empty = lengths > 0
    too_long = lengths > 20

    # Remove rows where 'text' is empty
    removed_rows = int((~non_empty).sum())
    if removed_rows > 0:
        print(f"ℹ️ Removed {removed_rows} row(s) with empty text.")

    # Check for and remove words longer than 20 characters
    long_count = int(too_long.sum())
    if long_count > 0:
        print(f"⚠️ Warning: Detected {long_count} word(s) longer than 20 characters. These will be removed.")

    # Take an explicit copy: assigning into a filtered frame otherwise
    # triggers pandas' SettingWithCopyWarning.
    cleaned = df[non_empty & ~too_long].copy()
    cleaned['text'] = cleaned['text'].apply(lambda x: f'"{x}"')

    cleaned.to_excel(excel_path, index=False)
    print(f"📊 Excel file saved to {excel_path}")

def save_language(language: str) -> None:
    """Record ``language`` under the ``whisper.detected_language`` key.

    Delegates to ``core.config_utils.update_key``; presumably this persists
    the value to the project configuration -- confirm against that helper.

    Args:
        language: Language value to store.
    """
    update_key("whisper.detected_language", language)
Loading

0 comments on commit 6e85833

Please sign in to comment.