feat: update v1.8

feat: update voice separation logic and optimize code structure
fix: resolve ffmpeg encoding errors and phrase initialization issue
refactor: remove whisperX replicate API support for simplicity
perf: enhance prompt for broader model compatibility
style: reduce translation block size to minimize errors
Huanshere · Nov 13, 2024 · 6e85833 · 6e85833
1 parent 059e6ca
commit 6e85833
Show file tree

Hide file tree

Showing 25 changed files with 506 additions and 846 deletions.
diff --git a/batch/utils/video_processor.py b/batch/utils/video_processor.py
@@ -1,6 +1,6 @@
 import os, sys
 sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
-from core import step2_whisper, step1_ytdlp, step3_1_spacy_split, step3_2_splitbymeaning
+from core import step1_ytdlp, step2_whisperX, step3_1_spacy_split, step3_2_splitbymeaning
 from core import step4_1_summarize, step4_2_translate_all, step5_splitforsub, step6_generate_final_timeline 
 from core import step7_merge_sub_to_vid, step8_gen_audio_task, step10_gen_audio, step11_merge_audio_to_vid
 from core.onekeycleanup import cleanup
@@ -14,7 +14,7 @@ def process_video(file, dubbing=False, is_retry=False):
 
     steps = [
         ("Processing input file", partial(process_input_file, file)),
-        ("Transcribing with Whisper", partial(step2_whisper.transcribe)),
+        ("Transcribing with Whisper", partial(step2_whisperX.transcribe)),
         ("Splitting sentences", split_sentences),
         ("Summarizing and translating", summarize_and_translate),
         ("Processing and aligning subtitles", process_and_align_subtitles),

diff --git a/config.yaml b/config.yaml
@@ -6,21 +6,19 @@ api:
   base_url: 'https://yunwu.zeabur.app'
   model: 'gemini-1.5-pro-002'
 
-# Replicate API settings, only necessary for whisperxapi
-replicate_api_token: 'YOUR_KEY'
-
 # Language settings, written into the prompt, can be described in natural language
-target_language: 'Chinese'
+target_language: '简体中文'
+
+# Whether to use Demucs for vocal separation before transcription. Warning: this may slow down processing and cause some lines to be missing!
+demucs: false
 
 whisper:
-  # Whisper settings [whisperx, whisperxapi]
-  method: 'whisperx'
   # Whisper specified recognition language [en, zh, auto] auto for automatic detection, en for forced translation to English
   language: 'en'
   detected_language: 'en'
 
 # Video resolution [0x0, 640x360, 1920x1080]  0x0 will generate a 0-second black video placeholder
-resolution: '640x360'
+resolution: '1920x1080'
 
 ## ======================== Advanced Settings ======================== ##
 # *Default resolution for downloading YouTube videos [360, 1080, best]
@@ -30,7 +28,7 @@ subtitle:
   # *Maximum length of each subtitle line in characters
   max_length: 75
   # *Translated subtitles are slightly larger than source subtitles, affecting the reference length for subtitle splitting
-  target_multiplier: 1.1
+  target_multiplier: 1.2
 
 # *Number of LLM multi-threaded accesses
 max_workers: 8
@@ -42,7 +40,7 @@ pause_before_translate: false
 
 ## ======================== Dubbing Settings ======================== ##
 # TTS selection [openai_tts, gpt_sovits, azure_tts, fish_tts]
-tts_method: 'azure_tts'
+tts_method: 'openai_tts'
 
 # OpenAI TTS-1 API configuration
 openai_tts:
@@ -80,8 +78,7 @@ min_subtitle_duration: 3
 min_trim_duration: 2.50
 
 # Volume settings
-original_volume: 0.1  # Original voice volume in dubbed video (0.1 = 10% or 0)
-dub_volume: 1.5  # *Dubbed audio volume (1.5 = 150%, most original dubbing audio is relatively quiet)
+dub_volume: 1.3  # *Dubbed audio volume (1.3 = 130%, most original dubbing audio is relatively quiet)
 
 
 
@@ -140,4 +137,4 @@ language_split_with_space:
 # Languages that do not use space as separator
 language_split_without_space:
 - 'zh'
-- 'ja'
+- 'ja'
diff --git a/core/all_whisper_methods/demucs_vl.py b/core/all_whisper_methods/demucs_vl.py
@@ -2,87 +2,53 @@
 sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
 import torch
 from rich.console import Console
+from rich import print as rprint
 from demucs.pretrained import get_model
 from demucs.audio import save_audio
 from torch.cuda import is_available as is_cuda_available
 from typing import Optional
 from demucs.api import Separator
 from demucs.apply import BagOfModels
 
-class PreloadedSeparator(Separator):
-    def __init__(
-        self,
-        model: BagOfModels,
-        shifts: int = 1,
-        overlap: float = 0.25,
-        split: bool = True,
-        segment: Optional[int] = None,
-        jobs: int = 0,
-    ):
-        self._model = model
-        self._audio_channels = model.audio_channels
-        self._samplerate = model.samplerate
+AUDIO_DIR = "output/audio"
+RAW_AUDIO_FILE = os.path.join(AUDIO_DIR, "raw.mp3")
+BACKGROUND_AUDIO_FILE = os.path.join(AUDIO_DIR, "background.mp3")
+VOCAL_AUDIO_FILE = os.path.join(AUDIO_DIR, "vocal.mp3")
 
-        self.update_parameter(
-            device="cuda" if is_cuda_available() else "mps" if torch.backends.mps.is_available() else "cpu",
-            shifts=shifts,
-            overlap=overlap,
-            split=split,
-            segment=segment,
-            jobs=jobs,
-            progress=True,
-            callback=None,
-            callback_arg=None,
-        )
+class PreloadedSeparator(Separator):
+    def __init__(self, model: BagOfModels, shifts: int = 1, overlap: float = 0.25,
+                 split: bool = True, segment: Optional[int] = None, jobs: int = 0):
+        self._model, self._audio_channels, self._samplerate = model, model.audio_channels, model.samplerate
+        device = "cuda" if is_cuda_available() else "mps" if torch.backends.mps.is_available() else "cpu"
+        self.update_parameter(device=device, shifts=shifts, overlap=overlap, split=split,
+                            segment=segment, jobs=jobs, progress=True, callback=None, callback_arg=None)
 
-def demucs_main(music_file, save_dir='output/audio', background_file='output/audio/background.mp3', original_vocal_file='output/audio/vocal.mp3'):
+def demucs_main():
+    if os.path.exists(VOCAL_AUDIO_FILE) and os.path.exists(BACKGROUND_AUDIO_FILE):
+        rprint(f"[yellow]⚠️ {VOCAL_AUDIO_FILE} and {BACKGROUND_AUDIO_FILE} already exist, skip Demucs processing.[/yellow]")
+        return
+
     console = Console()
+    os.makedirs(AUDIO_DIR, exist_ok=True)
 
-    # Ensure output directory exists
-    if not os.path.exists(save_dir):
-        os.makedirs(save_dir)
-
-    # Load model
     console.print("🤖 Loading <htdemucs> model...")
     model = get_model('htdemucs')
+    separator = PreloadedSeparator(model=model, shifts=1, overlap=0.25)
 
-    # Use fixed parameters
-    separator = PreloadedSeparator(
-        model=model,
-        shifts=1,
-        overlap=0.25,
-        segment=None,
-        split=True,
-        jobs=0,
-    )
-
-    # Separate audio
     console.print("🎵 Separating audio...")
-    _, outputs = separator.separate_audio_file(music_file)
+    _, outputs = separator.separate_audio_file(RAW_AUDIO_FILE)
 
-    # Audio output parameters
-    kwargs = {
-        "samplerate": model.samplerate,
-        "bitrate": 64,
-        "preset": 4,
-        "clip": "rescale",
-        "as_float": False,
-        "bits_per_sample": 16,
-    }
+    kwargs = {"samplerate": model.samplerate, "bitrate": 64, "preset": 4, 
+             "clip": "rescale", "as_float": False, "bits_per_sample": 16}
 
-    # Save vocals
     console.print("🎤 Saving vocals track...")
-    save_audio(outputs['vocals'].cpu(), original_vocal_file, **kwargs)
+    save_audio(outputs['vocals'].cpu(), VOCAL_AUDIO_FILE, **kwargs)
 
-    # Create and save background music
     console.print("🎹 Saving background music...")
-    background = torch.zeros_like(outputs['vocals'])
-    for source, audio in outputs.items():
-        if source != 'vocals':
-            background += audio
-    save_audio(background.cpu(), background_file, **kwargs)
+    background = sum(audio for source, audio in outputs.items() if source != 'vocals')
+    save_audio(background.cpu(), BACKGROUND_AUDIO_FILE, **kwargs)
 
     console.print("[green]✨ Audio separation completed![/green]")
 
 if __name__ == "__main__":
-    demucs_main("output/audio/raw_full_audio.mp3")
+    demucs_main()
diff --git a/core/all_whisper_methods/whisperX_utils.py b/core/all_whisper_methods/whisperX_utils.py
@@ -0,0 +1,131 @@
+import os, sys, subprocess
+import pandas as pd
+from moviepy.editor import AudioFileClip
+from typing import Dict, List, Tuple
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+from core.config_utils import update_key
+from core.all_whisper_methods.demucs_vl import RAW_AUDIO_FILE, AUDIO_DIR
+
+def convert_video_to_audio(input_file: str) -> str:
+    os.makedirs(AUDIO_DIR, exist_ok=True)
+    if not os.path.exists(RAW_AUDIO_FILE):
+        print(f"🎬➡️🎵 Converting to audio with FFmpeg ......")
+        subprocess.run([
+            'ffmpeg', '-y', '-i', input_file, '-vn', '-b:a', '64k',
+            '-ar', '16000', '-ac', '1', '-metadata', 'encoding=UTF-8',
+            '-f', 'mp3', RAW_AUDIO_FILE
+        ], check=True, stderr=subprocess.PIPE)
+        print(f"🎬➡️🎵 Converted <{input_file}> to <{RAW_AUDIO_FILE}> with FFmpeg\n")
+
+    return RAW_AUDIO_FILE
+
+def _detect_silence(audio_file: str, start: float, end: float) -> List[float]:
+    """Detect silence points in the given audio segment"""
+    cmd = ['ffmpeg', '-y', '-i', audio_file, 
+           '-ss', str(start), '-to', str(end),
+           '-af', 'silencedetect=n=-30dB:d=0.5', 
+           '-f', 'null', '-']
+
+    output = subprocess.run(cmd, capture_output=True, text=True, 
+                          encoding='utf-8').stderr
+
+    return [float(line.split('silence_end: ')[1].split(' ')[0])
+            for line in output.split('\n')
+            if 'silence_end' in line]
+
+def split_audio(audio_file: str, target_len: int = 50*60, win: int = 60) -> List[Tuple[float, float]]:
+    print("🔪 Starting audio segmentation...")
+
+    with AudioFileClip(audio_file) as audio:
+        duration = audio.duration
+    segments = []
+    pos = 0
+    while pos < duration:
+        if duration - pos < target_len:
+            segments.append((pos, duration))
+            break
+        win_start = pos + target_len - win
+        win_end = min(win_start + 2 * win, duration)
+        silences = _detect_silence(audio_file, win_start, win_end)
+
+        if silences:
+            target_pos = target_len - (win_start - pos)
+            split_at = next((t for t in silences if t - win_start > target_pos), None)
+            if split_at:
+                segments.append((pos, split_at))
+                pos = split_at
+                continue
+        segments.append((pos, pos + target_len))
+        pos += target_len
+
+    print(f"🔪 Audio split into {len(segments)} segments")
+    return segments
+
+def process_transcription(result: Dict) -> pd.DataFrame:
+    all_words = []
+    for segment in result['segments']:
+        for word in segment['words']:
+            # Check word length
+            if len(word["word"]) > 20:
+                print(f"⚠️ Warning: Detected word longer than 20 characters, skipping: {word['word']}")
+                continue
+
+            # ! For French, we need to convert guillemets to empty strings
+            word["word"] = word["word"].replace('»', '').replace('«', '')
+
+            if 'start' not in word and 'end' not in word:
+                if all_words:
+                    # Assign the end time of the previous word as the start and end time of the current word
+                    word_dict = {
+                        'text': word["word"],
+                        'start': all_words[-1]['end'],
+                        'end': all_words[-1]['end'],
+                    }
+                    all_words.append(word_dict)
+                else:
+                    # If it's the first word, look next for a timestamp then assign it to the current word
+                    next_word = next((w for w in segment['words'] if 'start' in w and 'end' in w), None)
+                    if next_word:
+                        word_dict = {
+                            'text': word["word"],
+                            'start': next_word["start"],
+                            'end': next_word["end"],
+                        }
+                        all_words.append(word_dict)
+                    else:
+                        raise Exception(f"No next word with timestamp found for the current word : {word}")
+            else:
+                # Normal case, with start and end times
+                word_dict = {
+                    'text': f'{word["word"]}',
+                    'start': word.get('start', all_words[-1]['end'] if all_words else 0),
+                    'end': word['end'],
+                }
+
+                all_words.append(word_dict)
+
+    return pd.DataFrame(all_words)
+
+def save_results(df: pd.DataFrame):
+    os.makedirs('output/log', exist_ok=True)
+    excel_path = os.path.join('output/log', "cleaned_chunks.xlsx")
+
+    # Remove rows where 'text' is empty
+    initial_rows = len(df)
+    df = df[df['text'].str.len() > 0]
+    removed_rows = initial_rows - len(df)
+    if removed_rows > 0:
+        print(f"ℹ️ Removed {removed_rows} row(s) with empty text.")
+
+    # Check for and remove words longer than 20 characters
+    long_words = df[df['text'].str.len() > 20]
+    if not long_words.empty:
+        print(f"⚠️ Warning: Detected {len(long_words)} word(s) longer than 20 characters. These will be removed.")
+        df = df[df['text'].str.len() <= 20]
+
+    df['text'] = df['text'].apply(lambda x: f'"{x}"')
+    df.to_excel(excel_path, index=False)
+    print(f"📊 Excel file saved to {excel_path}")
+
+def save_language(language: str):
+    update_key("whisper.detected_language", language)