diff --git a/.github/ISSUE_TEMPLATE/issue-report.yml b/.github/ISSUE_TEMPLATE/issue-report.yml deleted file mode 100644 index d9d64089..00000000 --- a/.github/ISSUE_TEMPLATE/issue-report.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: 使用问题报告 / Usage Problem Report -description: 报告在使用过程中遇到的问题 / Report issues encountered while using -labels: ["bug"] -body: - - type: markdown - attributes: - value: | - 感谢您花时间填写这份问题报告!在提交之前,请先搜索是否已存在类似的 issue。 - Thank you for taking the time to fill out this problem report! Before submitting, please search to see if a similar issue already exists. - - - type: checkboxes - id: search - attributes: - label: 搜索现有 issues / Search existing issues - description: 请确保没有重复的 issue。/ Please make sure there are no duplicate issues. - options: - - label: 我已经搜索了现有的 issues / I have searched the existing issues - required: true - - - type: input - id: llm-model - attributes: - label: 使用的 LLM 模型 / LLM Model Used - description: 请指明您使用的是哪个 LLM 模型 / Please specify which LLM model you are using - placeholder: 例如:GPT-3.5-turbo, BERT, etc. / For example: GPT-3.5-turbo, BERT, etc. - validations: - required: true - - - type: textarea - id: problem-step - attributes: - label: 问题发生的步骤 / Steps Where the Problem Occurred - description: 请详细描述在哪个步骤遇到了问题 / Please describe in detail at which step you encountered the problem - placeholder: | - 1. 首先我... / First, I... - 2. 然后我... / Then, I... - 3. 接着出现了... / After that... - validations: - required: true - - - type: textarea - id: command-screenshot - attributes: - label: 命令行截图 / Command Line Screenshot - description: 请提供包含完整代码的命令行截图 / Please provide a screenshot of the command line including the full code - placeholder: 请在此处粘贴您的截图 / Please paste your screenshot here - validations: - required: true - - - type: textarea - id: additional-info - attributes: - label: 其他信息 / Additional Information - description: 还有什么其他相关信息可以提供吗?/ Is there any other relevant information you can provide? - placeholder: 任何您认为可能有帮助的额外信息 / Any additional information you think might be helpful diff --git a/.gitignore b/.gitignore index 7dce1fb3..ca46faeb 100644 --- a/.gitignore +++ b/.gitignore @@ -170,4 +170,5 @@ config.backup.yaml # runtime runtime/ -dev/ \ No newline at end of file +dev/ +installer_files/ \ No newline at end of file diff --git a/OneKeyInstall&Start.bat b/OneKeyInstall&Start.bat new file mode 100644 index 00000000..d1a4e77f --- /dev/null +++ b/OneKeyInstall&Start.bat @@ -0,0 +1,67 @@ +@echo off + +cd /D "%~dp0" + +set PATH=%PATH%;%SystemRoot%\system32 + +echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. 
&& goto end + +@rem fix failed install when installing to a separate drive +set TMP=%cd%\installer_files +set TEMP=%cd%\installer_files + +@rem config +set INSTALL_DIR=%cd%\installer_files +set CONDA_ROOT_PREFIX=%cd%\installer_files\conda +set INSTALL_ENV_DIR=%cd%\installer_files\env +set MINICONDA_DOWNLOAD_URL=https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Windows-x86_64.exe +set conda_exists=F + +@rem figure out whether git and conda needs to be installed +call "%CONDA_ROOT_PREFIX%\_conda.exe" --version >nul 2>&1 +if "%ERRORLEVEL%" EQU "0" set conda_exists=T + +@rem (if necessary) install git and conda into a contained environment +@rem download conda +if "%conda_exists%" == "F" ( + echo Downloading Miniconda from %MINICONDA_DOWNLOAD_URL% to %INSTALL_DIR%\miniconda_installer.exe + + mkdir "%INSTALL_DIR%" + call curl -Lk "%MINICONDA_DOWNLOAD_URL%" > "%INSTALL_DIR%\miniconda_installer.exe" || ( echo. && echo Miniconda failed to download. && goto end ) + + echo Installing Miniconda to %CONDA_ROOT_PREFIX% + start /wait "" "%INSTALL_DIR%\miniconda_installer.exe" /InstallationType=JustMe /NoShortcuts=1 /AddToPath=0 /RegisterPython=0 /NoRegistry=1 /S /D=%CONDA_ROOT_PREFIX% + + @rem test the conda binary + echo Miniconda version: + call "%CONDA_ROOT_PREFIX%\_conda.exe" --version || ( echo. && echo Miniconda not found. && goto end ) +) + +@rem create the installer env +if not exist "%INSTALL_ENV_DIR%" ( + echo Packages to install: python=3.10.0 requests rich ruamel.yaml + call "%CONDA_ROOT_PREFIX%\_conda.exe" create --no-shortcuts -y -k --prefix "%INSTALL_ENV_DIR%" python=3.10.0 requests rich "ruamel.yaml" || ( echo. && echo Conda environment creation failed. && goto end ) +) + +@rem check if conda environment was actually created +if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end ) + +@rem environment isolation +set PYTHONNOUSERSITE=1 +set PYTHONPATH= +set PYTHONHOME= +@rem ! may cause error if we use cudnn on windows +set "CUDA_PATH=%INSTALL_ENV_DIR%" +set "CUDA_HOME=%CUDA_PATH%" + +@rem activate installer env +call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Miniconda hook not found. && goto end ) + +@rem Run pip setup +call python pip_setup.py + +echo. +echo Done! + +:end +pause diff --git a/OneKeyStart.bat b/OneKeyStart.bat deleted file mode 100644 index 29ea8efc..00000000 --- a/OneKeyStart.bat +++ /dev/null @@ -1,13 +0,0 @@ -@echo off -cd /d %~dp0 -if exist runtime ( - echo Using runtime folder... - runtime\python.exe -m streamlit run st.py -) else ( - echo Runtime folder not found. Using conda environment... - call activate videolingo - python -m streamlit run st.py - call deactivate -) - -pause \ No newline at end of file diff --git a/batch/OneKeyBatch.bat b/batch/OneKeyBatch.bat index 38086bfd..24141944 100644 --- a/batch/OneKeyBatch.bat +++ b/batch/OneKeyBatch.bat @@ -1,14 +1,24 @@ @echo off -cd /d %~dp0.. - -if exist runtime ( - echo Using runtime folder... - runtime\python.exe batch\utils\batch_processor.py -) else ( - echo Runtime folder not found. Using conda environment... - call conda activate videolingo - python batch\utils\batch_processor.py - call conda deactivate -) +cd /D "%~dp0" +cd .. 
+@rem 设置环境变量 +set INSTALL_DIR=%cd%\installer_files +set CONDA_ROOT_PREFIX=%cd%\installer_files\conda +set INSTALL_ENV_DIR=%cd%\installer_files\env + +@rem 环境隔离设置 +set PYTHONNOUSERSITE=1 +set PYTHONPATH= +set PYTHONHOME= +set "CUDA_PATH=%INSTALL_ENV_DIR%" +set "CUDA_HOME=%CUDA_PATH%" + +@rem 激活conda环境 +call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Conda environment not found && goto end ) + +@rem 运行批处理脚本 +call python batch\utils\batch_processor.py + +:end pause diff --git a/batch/README.md b/batch/README.md index bc79c3e8..d0977406 100644 --- a/batch/README.md +++ b/batch/README.md @@ -2,29 +2,27 @@ [English](./README.md) | [简体中文](./README.zh.md) -Before utilizing the batch mode, ensure you have familiarized yourself with the Streamlit mode and properly configured the parameters in `config.yaml`. +Before utilizing the batch mode, ensure you have used the Streamlit mode and properly configured the parameters in `config.yaml`. ## Usage Guide -> Note: All referenced files, with the exception of `config.yaml`, are located within the `batch` folder. - ### 1. Video File Preparation -- Upload your video files for processing to the `input` folder -- YouTube links can be specified in the subsequent step +- Place your video files in the `input` folder +- YouTube links can be specified in the next step ### 2. Task Configuration -Modify the `tasks_setting.xlsx` file as follows: +Edit the `tasks_setting.xlsx` file: | Field | Description | Acceptable Values | |-------|-------------|-------------------| -| Video File | Video filename (excluding `input/` prefix) or YouTube URL | - | -| Source Language | Original language of the video | 'en', 'zh', 'auto', or leave empty for default | -| Target Language | Desired translation language | Use natural language description, or leave empty for default | -| Dubbing | Enable or disable dubbing | 0 or empty: no dubbing; 1: enable dubbing | +| Video File | Video filename (without `input/` prefix) or YouTube URL | - | +| Source Language | Source language | 'en', 'zh', ... or leave empty for default | +| Target Language | Translation language | Use natural language description, or leave empty for default | +| Dubbing | Enable dubbing | 0 or empty: no dubbing; 1: enable dubbing | -Example configuration: +Example: | Video File | Source Language | Target Language | Dubbing | |------------|-----------------|-----------------|---------| @@ -33,24 +31,23 @@ Example configuration: ### 3. Executing Batch Processing -1. Launch `OneKeyBatch.bat` with a double-click -2. Processed files will be stored in the `output` folder -3. Monitor task progress in the `Status` column of `tasks_setting.xlsx` +1. Double-click to run `OneKeyBatch.bat` +2. Output files will be saved in the `output` folder +3. Task status can be monitored in the `Status` column of `tasks_setting.xlsx` > Note: Keep `tasks_setting.xlsx` closed during execution to prevent interruptions due to file access conflicts. - ## Important Considerations ### Handling Interruptions -In the event of an unexpected command line closure, language settings in `config.yaml` may be altered. Verify these settings before attempting to resume processing. +If the command line is closed unexpectedly, language settings in `config.yaml` may be altered. Check settings before retrying. 
### Error Management -- Files that fail to process will be relocated to the `output/ERROR` folder -- Detailed error messages are logged in the `Status` column of `tasks_setting.xlsx` -- To reattempt processing: - 1. Transfer the specific video folder from `ERROR` to the root directory - 2. Rename this folder to `output` - 3. Utilize the Streamlit mode to reinitiate processing +- Failed files will be moved to the `output/ERROR` folder +- Error messages are recorded in the `Status` column of `tasks_setting.xlsx` +- To retry: + 1. Move the single video folder from `ERROR` to the root directory + 2. Rename it to `output` + 3. Use Streamlit mode to process again diff --git a/batch/README.zh.md b/batch/README.zh.md index 8a0ed089..73103140 100644 --- a/batch/README.zh.md +++ b/batch/README.zh.md @@ -6,8 +6,6 @@ ## 使用方法 -> 注:以下所说文件除了 `config.yaml` 以外都在 `batch` 文件夹下。 - ### 1. 准备视频文件 - 将要处理的视频文件放入 `input` 文件夹 @@ -20,7 +18,7 @@ | 字段 | 说明 | 可选值 | |------|------|--------| | Video File | 视频文件名(无需 `input/` 前缀)或 YouTube 链接 | - | -| Source Language | 源语言 | 'en', 'zh', 'auto',或留空使用默认设置 | +| Source Language | 源语言 | 'en', 'zh', ... 或留空使用默认设置 | | Target Language | 翻译语言 | 使用自然语言描述,或留空使用默认设置 | | Dubbing | 是否配音 | 0 或留空:不配音;1:配音 | diff --git a/batch/utils/settings_check.py b/batch/utils/settings_check.py index a90fc53e..5ea05612 100644 --- a/batch/utils/settings_check.py +++ b/batch/utils/settings_check.py @@ -4,11 +4,17 @@ from rich.console import Console from rich.panel import Panel +# Constants +SETTINGS_FILE = 'batch/tasks_setting.xlsx' +INPUT_FOLDER = os.path.join('batch', 'input') +VALID_DUBBING_VALUES = [0, 1] + console = Console() def check_settings(): - df = pd.read_excel('batch/tasks_setting.xlsx') - input_files = set(os.listdir(os.path.join('batch', 'input'))) + os.makedirs(INPUT_FOLDER, exist_ok=True) + df = pd.read_excel(SETTINGS_FILE) + input_files = set(os.listdir(INPUT_FOLDER)) excel_files = set(df['Video File'].tolist()) files_not_in_excel = input_files - excel_files @@ -31,19 +37,14 @@ def check_settings(): if video_file.startswith('http'): url_tasks += 1 - elif os.path.isfile(os.path.join('batch', 'input', video_file)): + elif os.path.isfile(os.path.join(INPUT_FOLDER, video_file)): local_video_tasks += 1 else: console.print(Panel(f"Invalid video file or URL 「{video_file}」", title=f"[bold red]Error in row {index + 2}", expand=False)) all_passed = False - if not pd.isna(source_language): - if source_language.lower() not in ['en', 'zh', 'auto']: - console.print(Panel(f"Invalid source language 「{source_language}」", title=f"[bold red]Error in row {index + 2}", expand=False)) - all_passed = False - if not pd.isna(dubbing): - if int(dubbing) not in [0, 1]: + if int(dubbing) not in VALID_DUBBING_VALUES: console.print(Panel(f"Invalid dubbing value 「{dubbing}」", title=f"[bold red]Error in row {index + 2}", expand=False)) all_passed = False diff --git a/batch/utils/video_processor.py b/batch/utils/video_processor.py index 342e5228..3d5cb12f 100644 --- a/batch/utils/video_processor.py +++ b/batch/utils/video_processor.py @@ -1,53 +1,74 @@ import os, sys sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) -from core import step1_ytdlp, step2_whisperX, step3_1_spacy_split, step3_2_splitbymeaning -from core import step4_1_summarize, step4_2_translate_all, step5_splitforsub, step6_generate_final_timeline -from core import step7_merge_sub_to_vid, step8_gen_audio_task, step10_gen_audio, step11_merge_audio_to_vid +from st_components.imports_and_utils import * from core.onekeycleanup import 
cleanup from core.config_utils import load_key import shutil from functools import partial +from rich.panel import Panel +from rich.console import Console + +console = Console() + +INPUT_DIR = 'batch/input' +OUTPUT_DIR = 'output' +SAVE_DIR = 'batch/output' +ERROR_OUTPUT_DIR = 'batch/output/ERROR' +YTB_RESOLUTION_KEY = "ytb_resolution" def process_video(file, dubbing=False, is_retry=False): if not is_retry: - prepare_output_folder('output') + prepare_output_folder(OUTPUT_DIR) - steps = [ - ("Processing input file", partial(process_input_file, file)), - ("Transcribing with Whisper", partial(step2_whisperX.transcribe)), - ("Splitting sentences", split_sentences), - ("Summarizing and translating", summarize_and_translate), - ("Processing and aligning subtitles", process_and_align_subtitles), - ("Merging subtitles to video", step7_merge_sub_to_vid.merge_subtitles_to_video), + text_steps = [ + ("🎥 Processing input file", partial(process_input_file, file)), + ("🎙️ Transcribing with Whisper", partial(step2_whisperX.transcribe)), + ("✂️ Splitting sentences", split_sentences), + ("📝 Summarizing and translating", summarize_and_translate), + ("⚡ Processing and aligning subtitles", process_and_align_subtitles), + ("🎬 Merging subtitles to video", step7_merge_sub_to_vid.merge_subtitles_to_video), ] if dubbing: - steps.extend([ - ("Generating audio tasks", step8_gen_audio_task.gen_audio_task_main), - ("Generating audio using SoVITS", step10_gen_audio.process_sovits_tasks), - ("Merging generated audio with video", step11_merge_audio_to_vid.merge_main), - ]) + dubbing_steps = [ + ("🔊 Generating audio tasks", gen_audio_tasks), + ("🎵 Extracting reference audio", step9_extract_refer_audio.extract_refer_audio_main), + ("🗣️ Generating audio", step10_gen_audio.gen_audio), + ("🔄 Merging full audio", step11_merge_full_audio.merge_full_audio), + ("🎞️ Merging dubbing to video", step12_merge_dub_to_vid.merge_video_audio), + ] + text_steps.extend(dubbing_steps) current_step = "" - for step_name, step_func in steps: + for step_name, step_func in text_steps: current_step = step_name for attempt in range(3): try: - print(f"Executing: {step_name}...") + console.print(Panel( + f"[bold green]{step_name}[/]", + subtitle=f"Attempt {attempt + 1}/3" if attempt > 0 else None, + border_style="blue" + )) result = step_func() if result is not None: globals().update(result) break except Exception as e: if attempt == 2: - error_message = f"Error in step '{current_step}': {str(e)}" - print(error_message) - cleanup("batch/output/ERROR") - return False, current_step, error_message - print(f"Attempt {attempt + 1} failed. Retrying...") + error_panel = Panel( + f"[bold red]Error in step '{current_step}':[/]\n{str(e)}", + border_style="red" + ) + console.print(error_panel) + cleanup(ERROR_OUTPUT_DIR) + return False, current_step, str(e) + console.print(Panel( + f"[yellow]Attempt {attempt + 1} failed. Retrying...[/]", + border_style="yellow" + )) - print("All steps completed successfully!") - cleanup("batch/output") + console.print(Panel("[bold green]All steps completed successfully! 
🎉[/]", border_style="green")) + cleanup(SAVE_DIR) return True, "", "" def prepare_output_folder(output_folder): @@ -57,11 +78,11 @@ def prepare_output_folder(output_folder): def process_input_file(file): if file.startswith('http'): - step1_ytdlp.download_video_ytdlp(file, resolution=load_key("ytb_resolution"), cutoff_time=None) + step1_ytdlp.download_video_ytdlp(file, resolution=load_key(YTB_RESOLUTION_KEY), cutoff_time=None) video_file = step1_ytdlp.find_video_files() else: input_file = os.path.join('batch', 'input', file) - output_file = os.path.join('output', file) + output_file = os.path.join(OUTPUT_DIR, file) shutil.copy(input_file, output_file) video_file = output_file return {'video_file': video_file} @@ -77,3 +98,7 @@ def summarize_and_translate(): def process_and_align_subtitles(): step5_splitforsub.split_for_sub_main() step6_generate_final_timeline.align_timestamp_main() + +def gen_audio_tasks(): + step8_1_gen_audio_task.gen_audio_task_main() + step8_2_gen_dub_chunks.gen_dub_chunks() diff --git a/config.yaml b/config.yaml index cdc1ec8e..daabb838 100644 --- a/config.yaml +++ b/config.yaml @@ -1,10 +1,11 @@ # * Settings marked with * are advanced settings that won't appear in the Streamlit page and can only be modified manually in config.py +version: "2.0.0" ## ======================== Basic Settings ======================== ## # API settings api: - key: 'YOUR_KEY' - base_url: 'https://yunwu.zeabur.app' - model: 'gemini-1.5-pro-002' + key: 'YOUR_API_KEY' + base_url: 'https://api.siliconflow.cn' + model: 'Qwen/Qwen2.5-72B-Instruct' # Language settings, written into the prompt, can be described in natural language target_language: '简体中文' @@ -13,7 +14,9 @@ target_language: '简体中文' demucs: false whisper: - # Whisper specified recognition language [en, zh, auto] auto for automatic detection, en for forced translation to English + # ["medium", "large-v3", "large-v3-turbo"]. Note: for zh model will force to use Belle/large-v3 + model: 'large-v3' + # Whisper specified recognition language [en, zh, ...] 
language: 'en' detected_language: 'en' @@ -22,7 +25,7 @@ resolution: '1920x1080' ## ======================== Advanced Settings ======================== ## # *Default resolution for downloading YouTube videos [360, 1080, best] -ytb_resolution: '360' +ytb_resolution: '1080' subtitle: # *Maximum length of each subtitle line in characters @@ -39,8 +42,20 @@ max_split_length: 20 pause_before_translate: false ## ======================== Dubbing Settings ======================== ## -# TTS selection [openai_tts, gpt_sovits, azure_tts, fish_tts] -tts_method: 'openai_tts' +# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts] +tts_method: 'sf_fish_tts' + +# SiliconFlow FishTTS +sf_fish_tts: + # SiliconFlow API key + api_key: 'YOUR_API_KEY' + # only for mode "preset" + voice: 'anna' + # *only for mode "custom", dont set manually + custom_name: '' + voice_id: '' + # preset, custom, dynamic + mode: "preset" # OpenAI TTS-1 API configuration openai_tts: @@ -70,15 +85,16 @@ fish_tts: # *Audio speed range speed_factor: min: 1 + accept: 1.2 # 可以接受的最大速度 max: 1.4 - normal: 1.2 # *Considered normal speech rate # *Merge audio configuration -min_subtitle_duration: 3 -min_trim_duration: 2.50 +min_subtitle_duration: 2.5 # 最小字幕出现时间 会强制扩展 +min_trim_duration: 3.5 # 小于这个值的字幕不会切割 +tolerance: 1.5 # 允许向后延申的时间 # Volume settings -dub_volume: 1.3 # *Dubbed audio volume (1.3 = 130%, most original dubbing audio is relatively quiet) +dub_volume: 1.5 # *Dubbed audio volume (1.5 = 150%, most original dubbing audio is relatively quiet) @@ -114,6 +130,11 @@ llm_support_json: - 'gemini-1.5-pro-latest' - 'gemini-1.5-pro-002' +# have problems +# - 'Qwen/Qwen2.5-72B-Instruct' +# - 'Qwen/Qwen2.5-Coder-32B-Instruct' +# - 'Qwen/Qwen2.5-Chat-72B-Instruct-128K' + # Spacy models spacy_model_map: en: 'en_core_web_md' @@ -137,4 +158,4 @@ language_split_with_space: # Languages that do not use space as separator language_split_without_space: - 'zh' -- 'ja' \ No newline at end of file +- 'ja' diff --git a/core/all_tts_functions/estimate_duration.py b/core/all_tts_functions/estimate_duration.py new file mode 100644 index 00000000..c4f7e807 --- /dev/null +++ b/core/all_tts_functions/estimate_duration.py @@ -0,0 +1,128 @@ +import syllables +from pypinyin import pinyin, Style +from g2p_en import G2p +from typing import Optional +import re + +class AdvancedSyllableEstimator: + def __init__(self): + self.g2p_en = G2p() + self.duration_params = {'en': 0.225, 'zh': 0.21, 'ja': 0.21, 'fr': 0.22, 'es': 0.22, 'ko': 0.21, 'default': 0.22} + self.lang_patterns = { + 'zh': r'[\u4e00-\u9fff]', 'ja': r'[\u3040-\u309f\u30a0-\u30ff]', + 'fr': r'[àâçéèêëîïôùûüÿœæ]', 'es': r'[áéíóúñ¿¡]', 'en': r'[a-zA-Z]+', 'ko': r'[\uac00-\ud7af\u1100-\u11ff]'} + self.lang_joiners = {'zh': '', 'ja': '', 'en': ' ', 'fr': ' ', 'es': ' ', 'ko': ' '} + self.punctuation = { + 'mid': r'[,;:,;、]+', 'end': r'[。!?.!?]+', 'space': r'\s+', + 'pause': {'space': 0.15, 'default': 0.1} + } + + def estimate_duration(self, text: str, lang: Optional[str] = None) -> float: + syllable_count = self.count_syllables(text, lang) + return syllable_count * self.duration_params.get(lang or 'default') + + def count_syllables(self, text: str, lang: Optional[str] = None) -> int: + if not text.strip(): return 0 + lang = lang or self._detect_language(text) + + vowels_map = { + 'fr': 'aeiouyàâéèêëîïôùûüÿœæ', + 'es': 'aeiouáéíóúü' + } + + if lang == 'en': + return self._count_english_syllables(text) + elif lang == 'zh': + text = re.sub(r'[^\u4e00-\u9fff]', '', text) + return len(pinyin(text, 
style=Style.NORMAL)) + elif lang == 'ja': + text = re.sub(r'[きぎしじちぢにひびぴみり][ょゅゃ]', 'X', text) + text = re.sub(r'[っー]', '', text) + return len(re.findall(r'[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9fff]', text)) + elif lang in ('fr', 'es'): + text = re.sub(r'e\b', '', text.lower()) if lang == 'fr' else text.lower() + return max(1, len(re.findall(f'[{vowels_map[lang]}]+', text))) + elif lang == 'ko': + return len(re.findall(r'[\uac00-\ud7af]', text)) + return len(text.split()) + + def _count_english_syllables(self, text: str) -> int: + total = 0 + for word in text.strip().split(): + try: + total += syllables.estimate(word) + except: + phones = self.g2p_en(word) + total += max(1, len([p for p in phones if any(c in p for c in 'aeiou')])) + return max(1, total) + + def _detect_language(self, text: str) -> str: + for lang, pattern in self.lang_patterns.items(): + if re.search(pattern, text): return lang + return 'en' + + def process_mixed_text(self, text: str) -> dict: + result = {'language_breakdown': {}, 'total_syllables': 0, 'punctuation': [], 'spaces': []} + segments = re.split(f"({self.punctuation['space']}|{self.punctuation['mid']}|{self.punctuation['end']})", text) + total_duration = 0 + + for i, segment in enumerate(segments): + if not segment: continue + + if re.match(self.punctuation['space'], segment): + prev_lang = self._detect_language(segments[i-1]) if i > 0 else None + next_lang = self._detect_language(segments[i+1]) if i < len(segments)-1 else None + if prev_lang and next_lang and (self.lang_joiners[prev_lang] == '' or self.lang_joiners[next_lang] == ''): + result['spaces'].append(segment) + total_duration += self.punctuation['pause']['space'] + elif re.match(f"{self.punctuation['mid']}|{self.punctuation['end']}", segment): + result['punctuation'].append(segment) + total_duration += self.punctuation['pause']['default'] + else: + lang = self._detect_language(segment) + if lang: + syllables = self.count_syllables(segment, lang) + if lang not in result['language_breakdown']: + result['language_breakdown'][lang] = {'syllables': 0, 'text': ''} + result['language_breakdown'][lang]['syllables'] += syllables + result['language_breakdown'][lang]['text'] += (self.lang_joiners[lang] + segment + if result['language_breakdown'][lang]['text'] else segment) + result['total_syllables'] += syllables + total_duration += syllables * self.duration_params.get(lang, self.duration_params['default']) + + result['estimated_duration'] = total_duration + + return result + +def init_estimator(): + return AdvancedSyllableEstimator() + +def estimate_duration(text: str, estimator: AdvancedSyllableEstimator): + return estimator.process_mixed_text(text)['estimated_duration'] + +# 使用示例 +if __name__ == "__main__": + estimator = init_estimator() + print(estimate_duration('你好', estimator)) + + # 测试用例 + test_cases = [ + # "Hello world this is a test", # 纯英文 + # "你好世界 这是一个测试", # 中文带空格 + # "Hello 你好 world 世界", # 中英混合 + # "The weather is nice 所以我们去公园", # 中英混合带空格 + # "我们需要在输出中体现空格的停顿时间", + # "I couldn't help but notice the vibrant colors of the autumn leaves cascading gently from the trees" + "가을 나뭇잎이 부드럽게 떨어지는 생생한 색깔을 주목하지 않을 수 없었다" + ] + + for text in test_cases: + result = estimator.process_mixed_text(text) + print(f"\nText: {text}") + print(f"Total syllables: {result['total_syllables']}") + print(f"Estimated duration: {result['estimated_duration']:.2f}s") + print("Language breakdown:") + for lang, info in result['language_breakdown'].items(): + print(f"- {lang}: {info['syllables']} syllables ({info['text']})") + 
print(f"Punctuation: {result['punctuation']}") + print(f"Spaces: {result['spaces']}") \ No newline at end of file diff --git a/core/all_tts_functions/siliconflow_fish_tts.py b/core/all_tts_functions/siliconflow_fish_tts.py new file mode 100644 index 00000000..ce54e68e --- /dev/null +++ b/core/all_tts_functions/siliconflow_fish_tts.py @@ -0,0 +1,250 @@ +import requests +from pathlib import Path +import os, sys +import base64 +import uuid +from typing import List, Tuple +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) +from core.config_utils import load_key, update_key +from core.step1_ytdlp import find_video_files +from core.all_whisper_methods.whisperX_utils import get_audio_duration +import hashlib +from rich import print as rprint +from pydub import AudioSegment +import time +from rich.panel import Panel +from rich.text import Text + +API_URL_SPEECH = "https://api.siliconflow.cn/v1/audio/speech" +API_URL_VOICE = "https://api.siliconflow.cn/v1/uploads/audio/voice" + +AUDIO_REFERS_DIR = "output/audio/refers" +MODEL_NAME = "fishaudio/fish-speech-1.4" + +def _get_headers(): + return {"Authorization": f'Bearer {load_key("sf_fish_tts.api_key")}', "Content-Type": "application/json"} + +def siliconflow_fish_tts(text, save_path, mode="preset", voice_id=None, ref_audio=None, ref_text=None, check_duration=False): + sf_fish_set, headers = load_key("sf_fish_tts"), _get_headers() + payload = {"model": MODEL_NAME, "response_format": "wav", "stream": False, "input": text} + + if mode == "preset": + payload["voice"] = f"fishaudio/fish-speech-1.4:{sf_fish_set['voice']}" + elif mode == "custom": + if not voice_id: + raise ValueError("custom mode requires voice_id") + payload["voice"] = voice_id + elif mode == "dynamic": + if not ref_audio or not ref_text: + raise ValueError("dynamic mode requires ref_audio and ref_text") + with open(ref_audio, 'rb') as f: + audio_base64 = base64.b64encode(f.read()).decode('utf-8') + payload = { + "model": MODEL_NAME, + "response_format": "wav", + "stream": False, + "input": text, + "voice": None, + "references": [{ + "audio": f"data:audio/wav;base64,{audio_base64}", + "text": ref_text + }] + } + else: raise ValueError("Invalid mode") + + max_retries = 2 + retry_delay = 1 + + for attempt in range(max_retries): + response = requests.post(API_URL_SPEECH, json=payload, headers=headers) + if response.status_code == 200: + wav_file_path = Path(save_path).with_suffix('.wav') + wav_file_path.parent.mkdir(parents=True, exist_ok=True) + with open(wav_file_path, 'wb') as f: f.write(response.content) + + if check_duration: + duration = get_audio_duration(wav_file_path) + rprint(f"[blue]Audio Duration: {duration:.2f} seconds") + + rprint(f"[green]Successfully generated audio file: {wav_file_path}") + return True + + error_msg = response.json() + rprint(f"[red]Failed to generate audio | HTTP {response.status_code} (Attempt {attempt + 1}/{max_retries})") + rprint(f"[red]Text: {text}") + rprint(f"[red]Error details: {error_msg}") + + if attempt < max_retries - 1: + time.sleep(retry_delay) + rprint(f"[yellow]Retrying in {retry_delay} second...") + + return False + +def create_custom_voice(audio_path, text, custom_name=None): + if not Path(audio_path).exists(): + raise FileNotFoundError(f"Audio file not found at {audio_path}") + + try: + audio_base64 = f"data:audio/wav;base64,{base64.b64encode(open(audio_path, 'rb').read()).decode('utf-8')}" + rprint(f"[yellow]✅ Successfully encoded audio file") + except Exception as e: + rprint(f"[red]❌ Error reading file: {str(e)}") + 
raise + + payload = { + "audio": audio_base64, + "model": MODEL_NAME, + "customName": custom_name or str(uuid.uuid4())[:8], + "text": text + } + + rprint(f"[yellow]🚀 Sending request to create voice...") + response = requests.post(API_URL_VOICE, json=payload, headers=_get_headers()) + response_json = response.json() + + if response.status_code == 200: + voice_id = response_json.get('uri') + status_text = Text() + status_text.append("✨ Successfully created custom voice!\n", style="green") + status_text.append(f"🎙️ Voice ID: {voice_id}\n", style="green") + status_text.append(f"⌛ Creation Time: {time.strftime('%Y-%m-%d %H:%M:%S')}", style="green") + rprint(Panel(status_text, title="Voice Creation Status")) + return voice_id + + error_text = Text() + error_text.append("❌ Failed to create custom voice\n", style="red") + error_text.append(f"⚠️ HTTP Status: {response.status_code}\n", style="red") + error_text.append(f"💬 Error Details: {response_json}", style="red") + rprint(Panel(error_text, title="Error", border_style="red")) + raise ValueError(f"Failed to create custom voice 🚫 HTTP {response.status_code}, Error details: {response_json}") + +def merge_audio(files: List[str], output: str) -> bool: + """Merge audio files, add a brief silence""" + try: + # Create an empty audio segment + combined = AudioSegment.empty() + silence = AudioSegment.silent(duration=100) # 100ms silence + + # Add audio files one by one + for file in files: + audio = AudioSegment.from_wav(file) + combined += audio + silence + + # Export the combined file + combined.export(output, format="wav", parameters=[ + "-acodec", "pcm_s16le", + "-ar", "44100", + "-ac", "1" + ]) + + if os.path.getsize(output) == 0: + rprint(f"[red]Output file size is 0") + return False + + rprint(f"[green]Successfully merged audio files") + return True + + except Exception as e: + rprint(f"[red]Failed to merge audio: {str(e)}") + return False + +def get_ref_audio(task_df) -> Tuple[str, str]: + """Get reference audio and text, ensuring the combined text length does not exceed 100 characters""" + rprint(f"[blue]🎯 Starting reference audio selection process...") + + duration = 0 + selected = [] + combined_text = "" + found_first = False + + for _, row in task_df.iterrows(): + current_text = row['origin'] + + # If no valid record has been found yet + if not found_first: + if len(current_text) <= 100: + selected.append(row) + combined_text = current_text + duration += row['duration'] + found_first = True + rprint(f"[yellow]📝 Found first valid row: {current_text[:50]}...") + else: + rprint(f"[yellow]⏭️ Skipping long row: {current_text[:50]}... 
({len(current_text)} chars)") + continue + + # Check subsequent rows + new_text = combined_text + " " + current_text + if len(new_text) > 100: + break + + selected.append(row) + combined_text = new_text + duration += row['duration'] + rprint(f"[yellow]📝 Added row: {current_text[:50]}...") + + if duration > 10: + break + + if not selected: + rprint(f"[red]❌ No valid segments found (all texts exceed 100 characters)") + return None, None + + rprint(f"[blue]📊 Selected {len(selected)} segments, total duration: {duration:.2f}s") + + audio_files = [f"{AUDIO_REFERS_DIR}/{row['number']}.wav" for row in selected] + rprint(f"[yellow]🎵 Audio files to merge: {audio_files}") + + combined_audio = f"{AUDIO_REFERS_DIR}/combined_reference.wav" + success = merge_audio(audio_files, combined_audio) + + if not success: + rprint(f"[red]❌ Error: Failed to merge audio files") + return None, None + + rprint(f"[green]✅ Successfully created combined audio: {combined_audio}") + rprint(f"[green]📝 Final combined text: {combined_text} | Length: {len(combined_text)}") + + return combined_audio, combined_text + +def siliconflow_fish_tts_for_videolingo(text, save_as, number, task_df): + sf_fish_set = load_key("sf_fish_tts") + MODE = sf_fish_set["mode"] + + if MODE == "preset": + return siliconflow_fish_tts(text, save_as, mode="preset") + elif MODE == "custom": + video_file = find_video_files() + custom_name = hashlib.md5(video_file.encode()).hexdigest()[:8] + rprint(f"[yellow]Using custom name: {custom_name}") + log_name = load_key("sf_fish_tts.custom_name") + + if log_name != custom_name: + # Get the merged reference audio and text + ref_audio, ref_text = get_ref_audio(task_df) + if ref_audio is None or ref_text is None: + rprint(f"[red]Failed to get reference audio and text, falling back to preset mode") + return siliconflow_fish_tts(text, save_as, mode="preset") + + voice_id = create_custom_voice(ref_audio, ref_text, custom_name) + update_key("sf_fish_tts.voice_id", voice_id) + update_key("sf_fish_tts.custom_name", custom_name) + else: + voice_id = load_key("sf_fish_tts.voice_id") + return siliconflow_fish_tts(text=text, save_path=save_as, mode="custom", voice_id=voice_id) + elif MODE == "dynamic": + ref_audio_path = f"{AUDIO_REFERS_DIR}/{number}.wav" + if not Path(ref_audio_path).exists(): + rprint(f"[red]Reference audio not found: {ref_audio_path}, falling back to preset mode") + return siliconflow_fish_tts(text, save_as, mode="preset") + + ref_text = task_df[task_df['number'] == number]['origin'].iloc[0] + return siliconflow_fish_tts(text=text, save_path=save_as, mode="dynamic", ref_audio=str(ref_audio_path), ref_text=ref_text) + else: + raise ValueError("Invalid mode. Choose 'preset', 'custom', or 'dynamic'") + +if __name__ == '__main__': + pass + # create_custom_voice("output/audio/refers/1.wav", "Okay folks, welcome back. This is price action model number four, position trading.") + siliconflow_fish_tts("가을 나뭇잎이 부드럽게 떨어지는 생생한 색깔을 주목하지 않을 수 없었다", "preset_test.wav", mode="preset", check_duration=True) + # siliconflow_fish_tts("使用客制化音色测试", "custom_test.wav", mode="custom", voice_id="speech:your-voice-name:cm04pf7az00061413w7kz5qxs:mjtkgbyuunvtybnsvbxd") + # siliconflow_fish_tts("使用动态音色测试", "dynamic_test.wav", mode="dynamic", ref_audio="output/audio/refers/1.wav", ref_text="Okay folks, welcome back. 
This is price action model number four, position trading.") \ No newline at end of file diff --git a/core/all_tts_functions/tts_main.py b/core/all_tts_functions/tts_main.py new file mode 100644 index 00000000..91b4f2fe --- /dev/null +++ b/core/all_tts_functions/tts_main.py @@ -0,0 +1,57 @@ +import os, sys +import re +from rich import print as rprint +from pydub import AudioSegment + +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) +from core.config_utils import load_key +from core.all_whisper_methods.whisperX_utils import get_audio_duration +from core.all_tts_functions.gpt_sovits_tts import gpt_sovits_tts_for_videolingo +from core.all_tts_functions.siliconflow_fish_tts import siliconflow_fish_tts_for_videolingo +from core.all_tts_functions.openai_tts import openai_tts +from core.all_tts_functions.fish_tts import fish_tts +from core.all_tts_functions.azure_tts import azure_tts + +def tts_main(text, save_as, number, task_df): + # 检查文本是否为空或单字符,单字符配音容易触发bug + cleaned_text = re.sub(r'[^\w\s]', '', text).strip() + if not cleaned_text or len(cleaned_text) <= 1: + silence = AudioSegment.silent(duration=100) # 100ms = 0.1s + silence.export(save_as, format="wav") + rprint(f"Created silent audio for empty/single-char text: {save_as}") + return + + # 如果文件存在,跳过 + if os.path.exists(save_as): + return + + print(f"Generating <{text}...>") + TTS_METHOD = load_key("tts_method") + + max_retries = 3 + for attempt in range(max_retries): + try: + if TTS_METHOD == 'openai_tts': + openai_tts(text, save_as) + elif TTS_METHOD == 'gpt_sovits': + gpt_sovits_tts_for_videolingo(text, save_as, number, task_df) + elif TTS_METHOD == 'fish_tts': + fish_tts(text, save_as) + elif TTS_METHOD == 'azure_tts': + azure_tts(text, save_as) + elif TTS_METHOD == 'sf_fish_tts': + siliconflow_fish_tts_for_videolingo(text, save_as, number, task_df) + + # 检查生成的音频时长 + duration = get_audio_duration(save_as) + if duration > 0: + break + else: + if os.path.exists(save_as): + os.remove(save_as) + raise Exception("Generated audio duration is 0") + + except Exception as e: + if attempt == max_retries - 1: + raise Exception(f"Failed to generate audio after {max_retries} attempts: {str(e)}") + print(f"Attempt {attempt + 1} failed, retrying...") \ No newline at end of file diff --git a/core/all_whisper_methods/whisperX_utils.py b/core/all_whisper_methods/whisperX_utils.py index 2e8ed8a5..e599b930 100644 --- a/core/all_whisper_methods/whisperX_utils.py +++ b/core/all_whisper_methods/whisperX_utils.py @@ -4,20 +4,36 @@ from rich import print sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from core.config_utils import update_key -from core.all_whisper_methods.demucs_vl import RAW_AUDIO_FILE, AUDIO_DIR -def convert_video_to_audio(input_file: str) -> str: - os.makedirs(AUDIO_DIR, exist_ok=True) - if not os.path.exists(RAW_AUDIO_FILE): - print(f"🎬➡️🎵 Converting to audio with FFmpeg ......") +AUDIO_DIR = "output/audio" +RAW_AUDIO_FILE = "output/audio/raw.mp3" +CLEANED_CHUNKS_EXCEL_PATH = "output/log/cleaned_chunks.xlsx" + +def compress_audio(input_file: str, output_file: str): + """将输入音频文件压缩为低质量音频文件,用于转录""" + if not os.path.exists(output_file): + print(f"🗜️ Converting to low quality audio with FFmpeg ......") + # 16000 Hz, 1 channel, (Whisper default) , 96kbps to keep more details as well as smaller file size subprocess.run([ - 'ffmpeg', '-y', '-i', input_file, '-vn', '-b:a', '64k', + 'ffmpeg', '-y', '-i', input_file, '-vn', '-b:a', '96k', '-ar', '16000', '-ac', '1', '-metadata', 
'encoding=UTF-8', - '-f', 'mp3', RAW_AUDIO_FILE + '-f', 'mp3', output_file ], check=True, stderr=subprocess.PIPE) - print(f"🎬➡️🎵 Converted <{input_file}> to <{RAW_AUDIO_FILE}> with FFmpeg\n") + print(f"🗜️ Converted <{input_file}> to <{output_file}> with FFmpeg") + return output_file - return RAW_AUDIO_FILE +def convert_video_to_audio(video_file: str): + os.makedirs(AUDIO_DIR, exist_ok=True) + if not os.path.exists(RAW_AUDIO_FILE): + print(f"🎬➡️🎵 Converting to high quality audio with FFmpeg ......") + subprocess.run([ + 'ffmpeg', '-y', '-i', video_file, '-vn', + '-c:a', 'libmp3lame', '-b:a', '128k', + '-ar', '32000', + '-ac', '1', + '-metadata', 'encoding=UTF-8', RAW_AUDIO_FILE + ], check=True, stderr=subprocess.PIPE) + print(f"🎬➡️🎵 Converted <{video_file}> to <{RAW_AUDIO_FILE}> with FFmpeg\n") def _detect_silence(audio_file: str, start: float, end: float) -> List[float]: """Detect silence points in the given audio segment""" @@ -40,13 +56,17 @@ def get_audio_duration(audio_file: str) -> float: _, stderr = process.communicate() output = stderr.decode('utf-8', errors='ignore') - duration_str = [line for line in output.split('\n') if 'Duration' in line][0] - duration_parts = duration_str.split('Duration: ')[1].split(',')[0].split(':') - duration = float(duration_parts[0])*3600 + float(duration_parts[1])*60 + float(duration_parts[2]) - print(f"🔪 Audio duration: {duration:.2f}s") + try: + duration_str = [line for line in output.split('\n') if 'Duration' in line][0] + duration_parts = duration_str.split('Duration: ')[1].split(',')[0].split(':') + duration = float(duration_parts[0])*3600 + float(duration_parts[1])*60 + float(duration_parts[2]) + except Exception as e: + print(f"[red]❌ Error: Failed to get audio duration: {e}[/red]") + duration = 0 return duration -def split_audio(audio_file: str, target_len: int = 50*60, win: int = 60) -> List[Tuple[float, float]]: +def split_audio(audio_file: str, target_len: int = 30*60, win: int = 60) -> List[Tuple[float, float]]: + # 30 min 16000 Hz 96kbps ~ 22MB < 25MB required by whisper print("[bold blue]🔪 Starting audio segmentation...[/]") duration = get_audio_duration(audio_file) @@ -121,8 +141,7 @@ def process_transcription(result: Dict) -> pd.DataFrame: def save_results(df: pd.DataFrame): os.makedirs('output/log', exist_ok=True) - excel_path = os.path.join('output/log', "cleaned_chunks.xlsx") - + # Remove rows where 'text' is empty initial_rows = len(df) df = df[df['text'].str.len() > 0] @@ -137,8 +156,8 @@ def save_results(df: pd.DataFrame): df = df[df['text'].str.len() <= 20] df['text'] = df['text'].apply(lambda x: f'"{x}"') - df.to_excel(excel_path, index=False) - print(f"📊 Excel file saved to {excel_path}") + df.to_excel(CLEANED_CHUNKS_EXCEL_PATH, index=False) + print(f"📊 Excel file saved to {CLEANED_CHUNKS_EXCEL_PATH}") def save_language(language: str): update_key("whisper.detected_language", language) \ No newline at end of file diff --git a/core/delete_retry_dubbing.py b/core/delete_retry_dubbing.py index 29f5013b..cfb81749 100644 --- a/core/delete_retry_dubbing.py +++ b/core/delete_retry_dubbing.py @@ -4,8 +4,8 @@ def delete_dubbing_files(): files_to_delete = [ - os.path.join("output", "trans_vocal_total.wav"), - os.path.join("output", "output_video_with_audio.mp4") + os.path.join("output", "dub.wav"), + os.path.join("output", "output_dub.mp4") ] for file_path in files_to_delete: diff --git a/core/pypi_autochoose.py b/core/pypi_autochoose.py new file mode 100644 index 00000000..0d3c4d25 --- /dev/null +++ b/core/pypi_autochoose.py @@ -0,0 
+1,110 @@ +import subprocess +import time +import requests +import os +import concurrent.futures +from rich.console import Console +from rich.table import Table +from rich.progress import Progress, SpinnerColumn, TextColumn +from rich.panel import Panel +import sys + +MIRRORS = { + "Tsinghua Mirror": "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple", + "PyPI Official": "https://pypi.org/simple" +} + +console = Console() + +FAST_THRESHOLD = 1000 # ms +SLOW_THRESHOLD = 1500 # ms + +def get_optimal_thread_count(): + try: + cpu_count = os.cpu_count() + return max(cpu_count - 1, 1) + except: + return 2 + +def test_mirror_speed(name, url): + try: + start_time = time.time() + response = requests.get(url, timeout=5) + end_time = time.time() + if response.status_code == 200: + speed = (end_time - start_time) * 1000 + return name, speed + else: + return name, float('inf') + except requests.RequestException: + return name, float('inf') + +def set_pip_mirror(url): + try: + subprocess.run([sys.executable, "-m", "pip", "config", "set", "global.index-url", url], + check=True, + capture_output=True) + return True + except subprocess.CalledProcessError as e: + print(f"Failed to set pip mirror: {e}") + return False + +def get_current_pip_mirror(): + try: + result = subprocess.run([sys.executable, "-m", "pip", "config", "get", "global.index-url"], + capture_output=True, text=True, check=True) + return result.stdout.strip() + except subprocess.CalledProcessError: + return None + +def main(): + console.print(Panel.fit("🚀 PyPI Mirror Speed Test", style="bold cyan")) + + # Test all mirrors simultaneously + speeds = {} + with Progress( + SpinnerColumn(), + TextColumn("[cyan]Testing mirrors...[/cyan]"), + ) as progress: + progress.add_task("", total=None) # Indeterminate spinner + + with concurrent.futures.ThreadPoolExecutor(max_workers=get_optimal_thread_count()) as executor: + future_to_mirror = {executor.submit(test_mirror_speed, name, url): name + for name, url in MIRRORS.items()} + + for future in concurrent.futures.as_completed(future_to_mirror): + name = future_to_mirror[future] + try: + name, speed = future.result() + if speed != float('inf'): + speeds[name] = speed + except Exception as exc: + print(f'{name} generated an exception: {exc}') + + # Results display + table = Table(show_header=False) + table.add_column(style="cyan") + table.add_column(justify="right", style="magenta") + + for name, speed in sorted(speeds.items(), key=lambda x: x[1]): + table.add_row(name, f"{speed:.0f}ms") + + console.print(table) + + if speeds: + fastest_mirror = min(speeds, key=speeds.get) + fastest_url = MIRRORS[fastest_mirror] + + if set_pip_mirror(fastest_url): + current_mirror = get_current_pip_mirror() + if current_mirror == fastest_url: + console.print(f"✅ Switched to {fastest_mirror}\n🔗 {fastest_url}", style="green") + else: + console.print(f"❌ Switch failed\nExpected: {fastest_url}\nCurrent: {current_mirror}\n💡 Try running with admin privileges", style="red") + else: + console.print(f"❌ Failed to switch mirror\n💡 Check permissions and try again", style="red") + else: + console.print("❌ All mirrors unreachable\n💡 Check network connection", style="red") + +if __name__ == "__main__": + main() diff --git a/core/step10_gen_audio.py b/core/step10_gen_audio.py index a56a096b..ac85f45c 100644 --- a/core/step10_gen_audio.py +++ b/core/step10_gen_audio.py @@ -1,173 +1,215 @@ -import os, sys -import pandas as pd -from tqdm import tqdm -import soundfile as sf +import os +import sys +import time +import shutil import 
subprocess +from typing import Tuple + +import pandas as pd +from pydub import AudioSegment from rich import print as rprint -from rich.panel import Panel from rich.console import Console -import time +from rich.progress import Progress +from concurrent.futures import ThreadPoolExecutor, as_completed + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from core.all_tts_functions.gpt_sovits_tts import gpt_sovits_tts_for_videolingo -from core.all_tts_functions.openai_tts import openai_tts -from core.all_tts_functions.fish_tts import fish_tts -from core.all_tts_functions.azure_tts import azure_tts -from core.prompts_storage import get_subtitle_trim_prompt -from core.ask_gpt import ask_gpt from core.config_utils import load_key +from core.all_whisper_methods.whisperX_utils import get_audio_duration +from core.all_tts_functions.tts_main import tts_main console = Console() TEMP_DIR = 'output/audio/tmp' SEGS_DIR = 'output/audio/segs' -TASKS_FILE = "output/audio/sovits_tasks.xlsx" +TASKS_FILE = "output/audio/tts_tasks.xlsx" +OUTPUT_FILE = "output/audio/tts_tasks.xlsx" TEMP_FILE_TEMPLATE = f"{TEMP_DIR}/{{}}_temp.wav" OUTPUT_FILE_TEMPLATE = f"{SEGS_DIR}/{{}}.wav" +WARMUP_SIZE = 5 -def check_wav_duration(file_path): - try: - audio_info = sf.info(file_path) - return audio_info.duration - except Exception as e: - raise Exception(f"Error checking duration: {str(e)}") - -def parse_srt_time(time_str): +def parse_df_srt_time(time_str: str) -> float: + """Convert SRT time format to seconds""" hours, minutes, seconds = time_str.strip().split(':') - seconds, milliseconds = seconds.split(',') + seconds, milliseconds = seconds.split('.') return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(milliseconds) / 1000 -def tts_main(text, save_as, number, task_df): - TTS_METHOD = load_key("tts_method") - if TTS_METHOD == 'openai_tts': - openai_tts(text, save_as) - elif TTS_METHOD == 'gpt_sovits': - #! 
注意 gpt_sovits_tts 只支持输出中文,输入中文或英文 - gpt_sovits_tts_for_videolingo(text, save_as, number, task_df) - elif TTS_METHOD == 'fish_tts': - fish_tts(text, save_as) - elif TTS_METHOD == 'azure_tts': - azure_tts(text, save_as) - -def generate_audio(text, target_duration, save_as, number, task_df): - MIN_SPEED = load_key("speed_factor.min") - MAX_SPEED = load_key("speed_factor.max") - os.makedirs(TEMP_DIR, exist_ok=True) - temp_file = TEMP_FILE_TEMPLATE.format(number) - - # handle empty text or nan - if pd.isna(text) or not str(text).strip(): - # generate silent audio - cmd = ['ffmpeg', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=mono', '-t', '0.1', '-q:a', '0', '-y', save_as] - subprocess.run(cmd, check=True, stderr=subprocess.PIPE) - rprint(f"ℹ️ {number} Generated silent audio for empty text: {save_as}") +def adjust_audio_speed(input_file: str, output_file: str, speed_factor: float) -> None: + """Adjust audio speed and handle edge cases""" + # If the speed factor is close to 1, directly copy the file + if abs(speed_factor - 1.0) < 0.001: + shutil.copy2(input_file, output_file) return - - tts_main(text, temp_file, number, task_df) - - original_duration = check_wav_duration(temp_file) - # -0.03 to avoid the duration is too close to the target_duration - speed_factor = original_duration / (target_duration-0.03) - - # Check speed factor and adjust audio speed - if MIN_SPEED <= speed_factor <= MAX_SPEED: - change_audio_speed(temp_file, save_as, speed_factor) - final_duration = check_wav_duration(save_as) - rprint(f"✅ {number} Adjusted audio: {save_as} | Duration: {final_duration:.2f}s | Required: {target_duration:.2f}s | Speed factor: {speed_factor:.2f}") - elif speed_factor < MIN_SPEED: - change_audio_speed(temp_file, save_as, MIN_SPEED) - final_duration = check_wav_duration(save_as) - rprint(f"⚠️ {number} Adjusted audio: {save_as} | Duration: {final_duration:.2f}s | Required: {target_duration:.2f}s | Speed factor: {MIN_SPEED}") - else: # speed_factor > MAX_SPEED - rprint(f"🚨 {number} Speed factor out of range: {speed_factor:.2f}, attempting to simplify subtitle...") - original_text = text - prompt = get_subtitle_trim_prompt(text, target_duration) - response = ask_gpt(prompt, response_json=True, log_title='subtitle_trim') - shortened_text = response['result'] - - rprint(f"Original subtitle: {original_text} | Simplified subtitle: {shortened_text}") - - tts_main(shortened_text, temp_file, number, task_df) - new_original_duration = check_wav_duration(temp_file) - new_speed_factor = new_original_duration / (target_duration-0.03) - - if MIN_SPEED <= new_speed_factor <= MAX_SPEED: - change_audio_speed(temp_file, save_as, new_speed_factor) - final_duration = check_wav_duration(save_as) - rprint(f"✅ {number} Adjusted audio: {save_as} | Duration: {final_duration:.2f}s | Required: {target_duration:.2f}s | Speed factor: {new_speed_factor:.2f}") - elif new_speed_factor > MAX_SPEED: - rprint(f"🚔 {number} Speed factor still out of range after simplification: {new_speed_factor:.2f}") - change_audio_speed(temp_file, save_as, new_speed_factor) #! 
force adjust - final_duration = check_wav_duration(save_as) - rprint(f"🚔 {number} Forced adjustment: {save_as} | Duration: {final_duration:.2f}s | Required: {target_duration:.2f}s | Speed factor: {new_speed_factor}") - elif new_speed_factor < MIN_SPEED: - rprint(f"⚠️ {number} Speed factor too low after simplification: {new_speed_factor:.2f}") - change_audio_speed(temp_file, save_as, MIN_SPEED) - final_duration = check_wav_duration(save_as) - rprint(f"⚠️ {number} Forced adjustment: {save_as} | Duration: {final_duration:.2f}s | Required: {target_duration:.2f}s | Speed factor: {MIN_SPEED}") - - #! check duration for safety - if final_duration > target_duration: - rprint(f"❎ {number} Final duration is longer than target duration: {final_duration:.2f}s | Required: {target_duration:.2f}s. This is a bug, please report it.") - raise Exception() - - if os.path.exists(temp_file): - os.remove(temp_file) - -def change_audio_speed(input_file, output_file, speed_factor): atempo = speed_factor cmd = ['ffmpeg', '-i', input_file, '-filter:a', f'atempo={atempo}', '-y', output_file] - - max_retries = 3 + input_duration = get_audio_duration(input_file) + max_retries = 2 for attempt in range(max_retries): try: subprocess.run(cmd, check=True, stderr=subprocess.PIPE) - return # Success, exit the function + output_duration = get_audio_duration(output_file) + expected_duration = input_duration / speed_factor + diff = output_duration - expected_duration + # If the output duration exceeds the expected duration, but the input audio is less than 3 seconds, and the error is within 0.1 seconds, truncate to the expected length + if output_duration >= expected_duration * 1.01 and input_duration < 3 and diff <= 0.1: + audio = AudioSegment.from_wav(output_file) + trimmed_audio = audio[:(expected_duration * 1000)] # pydub uses milliseconds + trimmed_audio.export(output_file, format="wav") + print(f"✂️ Trimmed to expected duration: {expected_duration:.2f} seconds") + return + elif output_duration >= expected_duration * 1.01: + raise Exception(f"Audio duration abnormal: input file={input_file}, output file={output_file}, speed factor={speed_factor}, input duration={input_duration:.2f}s, output duration={output_duration:.2f}s") + return except subprocess.CalledProcessError as e: - if attempt < max_retries - 1: # If it's not the last attempt - rprint(f"[yellow]Warning: Failed to change audio speed, retrying in 1 second (Attempt {attempt + 1}/{max_retries})[/yellow]") + if attempt < max_retries - 1: + rprint(f"[yellow]⚠️ Audio speed adjustment failed, retrying in 1s ({attempt + 1}/{max_retries})[/yellow]") time.sleep(1) else: - rprint(f"[red]Error: Failed to change audio speed, maximum retry attempts reached ({max_retries})[/red]") - raise e # Re-raise the exception if all retries failed - -def process_sovits_tasks(): - tasks_df = pd.read_excel(TASKS_FILE) - errors = [] - os.makedirs(SEGS_DIR, exist_ok=True) - - with console.status("[bold green]Processing tasks...") as status: - for _, row in tqdm(tasks_df.iterrows(), total=len(tasks_df)): - out_file = OUTPUT_FILE_TEMPLATE.format(row["number"]) - if os.path.exists(out_file): - rprint(f"[yellow]File {out_file} already exists, skipping[/yellow]") - continue - try: - generate_audio(row['text'], float(row['duration']), out_file, row['number'], tasks_df) - except Exception as e: - errors.append(row['number']) - rprint(Panel(f"Error processing task {row['number']}: {str(e)}", title="Error", border_style="red")) - - if errors: - # Retry once, sometimes there might be network issues or 
file I/O errors - rprint(Panel(f"The following tasks encountered errors, retrying: {', '.join(map(str, errors))}", title="Retry", border_style="yellow")) - retry_tasks = errors.copy() - errors.clear() - for task_number in retry_tasks: - row = tasks_df[tasks_df['number'] == task_number].iloc[0] - out_file = OUTPUT_FILE_TEMPLATE.format(row["number"]) + rprint(f"[red]❌ Audio speed adjustment failed, max retries reached ({max_retries})[/red]") + raise e + +def process_row(row: pd.Series, tasks_df: pd.DataFrame) -> Tuple[int, float]: + """Helper function for processing single row data""" + number = row['number'] + lines = eval(row['lines']) if isinstance(row['lines'], str) else row['lines'] + real_dur = 0 + for line_index, line in enumerate(lines): + temp_file = TEMP_FILE_TEMPLATE.format(f"{number}_{line_index}") + tts_main(line, temp_file, number, tasks_df) + real_dur += get_audio_duration(temp_file) + return number, real_dur + +def generate_tts_audio(tasks_df: pd.DataFrame) -> pd.DataFrame: + """Generate TTS audio sequentially and calculate actual duration""" + tasks_df['real_dur'] = 0 + rprint("[bold green]🎯 Starting TTS audio generation...[/bold green]") + + with Progress() as progress: + task = progress.add_task("[cyan]🔄 Generating TTS audio...", total=len(tasks_df)) + + # warm up for first 5 rows + warmup_size = min(WARMUP_SIZE, len(tasks_df)) + for _, row in tasks_df.head(warmup_size).iterrows(): try: - generate_audio(row['text'], float(row['duration']), out_file, row['number'], tasks_df) + number, real_dur = process_row(row, tasks_df) + tasks_df.loc[tasks_df['number'] == number, 'real_dur'] = real_dur + progress.advance(task) except Exception as e: - errors.append(row['number']) - rprint(Panel(f"Error retrying task {row['number']}: {str(e)}", title="Error", border_style="red")) + rprint(f"[red]❌ Error in warmup: {str(e)}[/red]") + raise e + + # parallel processing for remaining tasks + if len(tasks_df) > warmup_size: + remaining_tasks = tasks_df.iloc[warmup_size:].copy() + with ThreadPoolExecutor() as executor: + futures = [ + executor.submit(process_row, row, tasks_df.copy()) + for _, row in remaining_tasks.iterrows() + ] + + for future in as_completed(futures): + try: + number, real_dur = future.result() + tasks_df.loc[tasks_df['number'] == number, 'real_dur'] = real_dur + progress.advance(task) + except Exception as e: + rprint(f"[red]❌ Error: {str(e)}[/red]") + raise e + + rprint("[bold green]✨ TTS audio generation completed![/bold green]") + return tasks_df + +def process_chunk(chunk_df: pd.DataFrame, accept: float, min_speed: float) -> tuple[float, bool]: + """Process audio chunk and calculate speed factor""" + chunk_durs = chunk_df['real_dur'].sum() + tol_durs = chunk_df['tol_dur'].sum() + durations = tol_durs - chunk_df.iloc[-1]['tolerance'] + all_gaps = chunk_df['gap'].sum() - chunk_df.iloc[-1]['gap'] + + keep_gaps = True + speed_var_error = 0.1 + + if (chunk_durs + all_gaps) / accept < durations: + speed_factor = max(min_speed, (chunk_durs + all_gaps) / (durations-speed_var_error)) + elif chunk_durs / accept < durations: + speed_factor = max(min_speed, chunk_durs / (durations-speed_var_error)) + keep_gaps = False + elif (chunk_durs + all_gaps) / accept < tol_durs: + speed_factor = max(min_speed, (chunk_durs + all_gaps) / (tol_durs-speed_var_error)) + else: + speed_factor = chunk_durs / (tol_durs-speed_var_error) + keep_gaps = False + + return round(speed_factor, 3), keep_gaps + +def merge_chunks(tasks_df: pd.DataFrame) -> pd.DataFrame: + """Merge audio chunks and adjust 
timeline""" + rprint("[bold blue]🔄 Starting audio chunks processing...[/bold blue]") + accept = load_key("speed_factor.accept") + min_speed = load_key("speed_factor.min") + chunk_start = 0 + + tasks_df['new_sub_times'] = None + + for index, row in tasks_df.iterrows(): + if row['cut_off'] == 1: + chunk_df = tasks_df.iloc[chunk_start:index+1].reset_index(drop=True) + speed_factor, keep_gaps = process_chunk(chunk_df, accept, min_speed) + + # 🎯 Step1: Start processing new timeline + chunk_start_time = parse_df_srt_time(chunk_df.iloc[0]['start_time']) + chunk_end_time = parse_df_srt_time(chunk_df.iloc[-1]['end_time']) + chunk_df.iloc[-1]['tolerance'] # 加上tolerance才是这一块的结束 + cur_time = chunk_start_time + for i, row in chunk_df.iterrows(): + # If i is not 0, which is not the first row of the chunk, cur_time needs to be added with the gap of the previous row, remember to divide by speed_factor + if i != 0 and keep_gaps: + cur_time += chunk_df.iloc[i-1]['gap']/speed_factor + new_sub_times = [] + number = row['number'] + lines = eval(row['lines']) if isinstance(row['lines'], str) else row['lines'] + for line_index, line in enumerate(lines): + # 🔄 Step2: Start speed change and save as OUTPUT_FILE_TEMPLATE + temp_file = TEMP_FILE_TEMPLATE.format(f"{number}_{line_index}") + output_file = OUTPUT_FILE_TEMPLATE.format(f"{number}_{line_index}") + adjust_audio_speed(temp_file, output_file, speed_factor) + ad_dur = get_audio_duration(output_file) + new_sub_times.append([cur_time, cur_time+ad_dur]) + cur_time += ad_dur + # 🔄 Step3: Find corresponding main DataFrame index and update new_sub_times + main_df_idx = tasks_df[tasks_df['number'] == row['number']].index[0] + tasks_df.at[main_df_idx, 'new_sub_times'] = new_sub_times + # 🎯 Step4: Choose emoji based on speed_factor and accept comparison + emoji = "⚡" if speed_factor <= accept else "⚠️" + rprint(f"[cyan]{emoji} Processed chunk {chunk_start} to {index} with speed factor {speed_factor}[/cyan]") + # 🔄 Step5: Check if the last row exceeds the range + if cur_time > chunk_end_time: + raise Exception(f"Chunk {chunk_start} to {index} exceeds the chunk end time {chunk_end_time:.2f} seconds with current time {cur_time:.2f} seconds") + chunk_start = index+1 + + rprint("[bold green]✅ Audio chunks processing completed![/bold green]") + return tasks_df - if errors: - error_msg = f"The following tasks failed to process: {', '.join(map(str, errors))}" - rprint(Panel(error_msg, title="Failed Tasks", border_style="red")) - raise Exception("tasks failed to process, please check cli output for details") +def gen_audio() -> None: + """Main function: Generate audio and process timeline""" + rprint("[bold magenta]🚀 Starting audio generation process...[/bold magenta]") + + # 🎯 Step1: Create necessary directories + os.makedirs(TEMP_DIR, exist_ok=True) + os.makedirs(SEGS_DIR, exist_ok=True) + + # 📝 Step2: Load task file + tasks_df = pd.read_excel(TASKS_FILE) + rprint("[green]📊 Loaded task file successfully[/green]") + + # 🔊 Step3: Generate TTS audio + tasks_df = generate_tts_audio(tasks_df) + + # 🔄 Step4: Merge audio chunks + tasks_df = merge_chunks(tasks_df) - rprint(Panel("Task processing completed", title="Success", border_style="green")) + # 💾 Step5: Save results + tasks_df.to_excel(OUTPUT_FILE, index=False) + rprint("[bold green]🎉 Audio generation completed successfully![/bold green]") if __name__ == "__main__": - process_sovits_tasks() \ No newline at end of file + gen_audio() diff --git a/core/step11_merge_audio_to_vid.py b/core/step11_merge_audio_to_vid.py deleted file 
mode 100644 index 04310dde..00000000 --- a/core/step11_merge_audio_to_vid.py +++ /dev/null @@ -1,110 +0,0 @@ -import sys, os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from core.config_utils import load_key -from datetime import datetime -import pandas as pd -import subprocess -from pydub import AudioSegment -from rich import print as rprint -import numpy as np -import soundfile as sf -import cv2 -from core.all_whisper_methods.demucs_vl import BACKGROUND_AUDIO_FILE -from core.step7_merge_sub_to_vid import check_gpu_available - -INPUT_EXCEL = 'output/audio/sovits_tasks.xlsx' -OUTPUT_AUDIO = 'output/trans_vocal_total.wav' -VIDEO_FILE = "output/output_video_with_subs.mp4" -OUTPUT_VIDEO = "output/output_video_with_audio.mp4" - -def time_to_datetime(time_str): - return datetime.strptime(time_str, '%H:%M:%S.%f') - -def create_silence(duration, output_file): - sample_rate = 32000 - num_samples = int(duration * sample_rate) - silence = np.zeros(num_samples, dtype=np.float32) - sf.write(output_file, silence, sample_rate) - -def merge_all_audio(): - # Define input and output paths - input_excel = INPUT_EXCEL - output_audio = OUTPUT_AUDIO - - df = pd.read_excel(input_excel) - - # Get the sample rate of the first audio file - first_audio = f'output/audio/segs/{df.iloc[0]["number"]}.wav' - sample_rate = AudioSegment.from_wav(first_audio).frame_rate - - # Create an empty AudioSegment object - merged_audio = AudioSegment.silent(duration=0, frame_rate=sample_rate) - - prev_target_start_time = None - prev_actual_duration = 0 - - for index, row in df.iterrows(): - number = row['number'] - start_time = row['start_time'] - input_audio = f'output/audio/segs/{number}.wav' - - if not os.path.exists(input_audio): - rprint(f"[bold yellow]Warning: File {input_audio} does not exist, skipping this file.[/bold yellow]") - continue - - audio_segment = AudioSegment.from_wav(input_audio) - actual_duration = len(audio_segment) / 1000 # Convert to seconds - target_start_time = time_to_datetime(start_time) - - silence_duration = (target_start_time - datetime(1900, 1, 1)).total_seconds() if prev_target_start_time is None else (target_start_time - prev_target_start_time).total_seconds() - prev_actual_duration - - if silence_duration > 0: - silence = AudioSegment.silent(duration=int(silence_duration * 1000), frame_rate=sample_rate) - merged_audio += silence - - merged_audio += audio_segment - - prev_target_start_time = target_start_time - prev_actual_duration = actual_duration - - # Export the merged audio - merged_audio.export(output_audio, format="wav") - rprint(f"[bold green]Audio file successfully merged, output file: {output_audio}[/bold green]") - -def merge_video_audio(): - """Merge video and audio, and reduce video volume""" - background_file = BACKGROUND_AUDIO_FILE - - if load_key("resolution") == '0x0': - rprint("[bold yellow]Warning: A 0-second black video will be generated as a placeholder as Resolution is set to 0x0.[/bold yellow]") - - # Create a black frame - frame = np.zeros((1080, 1920, 3), dtype=np.uint8) - fourcc = cv2.VideoWriter_fourcc(*'mp4v') - out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, 1, (1920, 1080)) - out.write(frame) - out.release() - - rprint("[bold green]Placeholder video has been generated.[/bold green]") - return - - # Merge video and audio - dub_volume = load_key("dub_volume") - cmd = ['ffmpeg', '-y', '-i', VIDEO_FILE, '-i', background_file, '-i', OUTPUT_AUDIO, - '-filter_complex', 
f'[1:a]volume=1[a1];[2:a]volume={dub_volume}[a2];[a1][a2]amix=inputs=2:duration=first:dropout_transition=3[a]'] - - if check_gpu_available(): - rprint("[bold green]Using GPU acceleration...[/bold green]") - cmd.extend(['-c:v', 'h264_nvenc']) - - cmd.extend(['-map', '0:v', '-map', '[a]', '-c:a', 'aac', '-b:a', '192k', OUTPUT_VIDEO]) - - subprocess.run(cmd) - rprint(f"[bold green]Video and audio successfully merged into {OUTPUT_VIDEO}[/bold green]") - -def merge_main(): - merge_all_audio() - merge_video_audio() - -if __name__ == "__main__": - merge_main() \ No newline at end of file diff --git a/core/step11_merge_full_audio.py b/core/step11_merge_full_audio.py new file mode 100644 index 00000000..384937d5 --- /dev/null +++ b/core/step11_merge_full_audio.py @@ -0,0 +1,144 @@ +import sys, os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import pandas as pd +import subprocess +from pydub import AudioSegment +from rich import print as rprint +from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn +from rich.console import Console +console = Console() + +INPUT_EXCEL = 'output/audio/tts_tasks.xlsx' +DUB_VOCAL_FILE = 'output/dub.mp3' + +DUB_SUB_FILE = 'output/dub.srt' +SEGS_DIR = 'output/audio/segs' +OUTPUT_FILE_TEMPLATE = f"{SEGS_DIR}/{{}}.wav" + +def load_and_flatten_data(excel_file): + """Load and flatten Excel data""" + df = pd.read_excel(excel_file) + lines = [eval(line) if isinstance(line, str) else line for line in df['lines'].tolist()] + lines = [item for sublist in lines for item in sublist] + + new_sub_times = [eval(time) if isinstance(time, str) else time for time in df['new_sub_times'].tolist()] + new_sub_times = [item for sublist in new_sub_times for item in sublist] + + return df, lines, new_sub_times + +def get_audio_files(df): + """Generate a list of audio file paths""" + audios = [] + for index, row in df.iterrows(): + number = row['number'] + line_count = len(eval(row['lines']) if isinstance(row['lines'], str) else row['lines']) + for line_index in range(line_count): + temp_file = OUTPUT_FILE_TEMPLATE.format(f"{number}_{line_index}") + audios.append(temp_file) + return audios + +def process_audio_segment(audio_file): + """Process a single audio segment with MP3 compression""" + temp_file = f"{audio_file}_temp.mp3" + ffmpeg_cmd = [ + 'ffmpeg', '-y', + '-i', audio_file, + '-ar', '16000', # 固定采样率为16kHz + '-ac', '1', # 单声道 + '-b:a', '64k', # 比特率64kbps + temp_file + ] + subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + audio_segment = AudioSegment.from_mp3(temp_file) + os.remove(temp_file) + return audio_segment + +def merge_audio_segments(audios, new_sub_times, sample_rate): + merged_audio = AudioSegment.silent(duration=0, frame_rate=sample_rate) + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + ) as progress: + merge_task = progress.add_task("🎵 Merging audio segments...", total=len(audios)) + + for i, (audio_file, time_range) in enumerate(zip(audios, new_sub_times)): + if not os.path.exists(audio_file): + console.print(f"[bold yellow]⚠️ Warning: File {audio_file} does not exist, skipping...[/bold yellow]") + progress.advance(merge_task) + continue + + audio_segment = process_audio_segment(audio_file) + start_time, end_time = time_range + + # Add silence segment + if i > 0: + prev_end = new_sub_times[i-1][1] + silence_duration = start_time - prev_end + if silence_duration > 0: + silence = 
AudioSegment.silent(duration=int(silence_duration * 1000), frame_rate=sample_rate) + merged_audio += silence + elif start_time > 0: + silence = AudioSegment.silent(duration=int(start_time * 1000), frame_rate=sample_rate) + merged_audio += silence + + merged_audio += audio_segment + progress.advance(merge_task) + + return merged_audio + +def create_srt_subtitle(): + df, lines, new_sub_times = load_and_flatten_data(INPUT_EXCEL) + + with open(DUB_SUB_FILE, 'w', encoding='utf-8') as f: + for i, ((start_time, end_time), line) in enumerate(zip(new_sub_times, lines), 1): + start_str = f"{int(start_time//3600):02d}:{int((start_time%3600)//60):02d}:{int(start_time%60):02d},{int((start_time*1000)%1000):03d}" + end_str = f"{int(end_time//3600):02d}:{int((end_time%3600)//60):02d}:{int(end_time%60):02d},{int((end_time*1000)%1000):03d}" + + f.write(f"{i}\n") + f.write(f"{start_str} --> {end_str}\n") + f.write(f"{line}\n\n") + + rprint(f"[bold green]✅ Subtitle file created: {DUB_SUB_FILE}[/bold green]") + +def merge_full_audio(): + """Main function: Process the complete audio merging process""" + console.print("\n[bold cyan]🎬 Starting audio merging process...[/bold cyan]") + + with console.status("[bold cyan]📊 Loading data from Excel...[/bold cyan]"): + df, lines, new_sub_times = load_and_flatten_data(INPUT_EXCEL) + console.print("[bold green]✅ Data loaded successfully[/bold green]") + + with console.status("[bold cyan]🔍 Getting audio file list...[/bold cyan]"): + audios = get_audio_files(df) + console.print(f"[bold green]✅ Found {len(audios)} audio segments[/bold green]") + + with console.status("[bold cyan]📝 Generating subtitle file...[/bold cyan]"): + create_srt_subtitle() + + if not os.path.exists(audios[0]): + console.print(f"[bold red]❌ Error: First audio file {audios[0]} does not exist![/bold red]") + return + + with console.status("[bold cyan]🎚️ Getting sample rate...[/bold cyan]"): + detected_rate = AudioSegment.from_wav(audios[0]).frame_rate + sample_rate = min(16000, detected_rate) + console.print(f"[bold green]✅ Sample rate: {sample_rate}Hz (detected: {detected_rate}Hz)[/bold green]") + + console.print("[bold cyan]🔄 Starting audio merge process...[/bold cyan]") + merged_audio = merge_audio_segments(audios, new_sub_times, sample_rate) + + with console.status("[bold cyan]💾 Exporting final audio file...[/bold cyan]"): + merged_audio = merged_audio.set_frame_rate(16000).set_channels(1) + merged_audio.export( + DUB_VOCAL_FILE, + format="mp3", + parameters=["-b:a", "64k"] + ) + console.print(f"[bold green]✅ Audio file successfully merged![/bold green]") + console.print(f"[bold green]📁 Output file: {DUB_VOCAL_FILE}[/bold green]") + +if __name__ == "__main__": + merge_full_audio() \ No newline at end of file diff --git a/core/step12_merge_dub_to_vid.py b/core/step12_merge_dub_to_vid.py new file mode 100644 index 00000000..f261b97f --- /dev/null +++ b/core/step12_merge_dub_to_vid.py @@ -0,0 +1,82 @@ +import os +import sys +import platform +import subprocess + +import numpy as np +import cv2 +from rich import print as rprint + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from core.all_whisper_methods.demucs_vl import BACKGROUND_AUDIO_FILE +from core.step7_merge_sub_to_vid import check_gpu_available +from core.config_utils import load_key +from core.step1_ytdlp import find_video_files + +DUB_VIDEO = "output/output_dub.mp4" +DUB_SUB_FILE = 'output/dub.srt' +DUB_AUDIO = 'output/dub.mp3' + +TRANS_FONT_SIZE = 20 +TRANS_FONT_NAME = 'Arial' +if platform.system() == 'Linux': + 
TRANS_FONT_NAME = 'NotoSansCJK-Regular' + +TRANS_FONT_COLOR = '&H00FFFF' +TRANS_OUTLINE_COLOR = '&H000000' +TRANS_OUTLINE_WIDTH = 1 +TRANS_BACK_COLOR = '&H33000000' + +def merge_video_audio(): + """Merge video and audio, and reduce video volume""" + VIDEO_FILE = find_video_files() + background_file = BACKGROUND_AUDIO_FILE + + if load_key("resolution") == '0x0': + rprint("[bold yellow]Warning: A 0-second black video will be generated as a placeholder as Resolution is set to 0x0.[/bold yellow]") + + # Create a black frame + frame = np.zeros((1080, 1920, 3), dtype=np.uint8) + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter(DUB_VIDEO, fourcc, 1, (1920, 1080)) + out.write(frame) + out.release() + + rprint("[bold green]Placeholder video has been generated.[/bold green]") + return + + # Merge video and audio with translated subtitles + dub_volume = load_key("dub_volume") + resolution = load_key("resolution") + target_width, target_height = resolution.split('x') + + subtitle_filter = ( + f"subtitles={DUB_SUB_FILE}:force_style='FontSize={TRANS_FONT_SIZE}," + f"FontName={TRANS_FONT_NAME},PrimaryColour={TRANS_FONT_COLOR}," + f"OutlineColour={TRANS_OUTLINE_COLOR},OutlineWidth={TRANS_OUTLINE_WIDTH}," + f"BackColour={TRANS_BACK_COLOR},Alignment=2,MarginV=27,BorderStyle=4'" + ) + + cmd = [ + 'ffmpeg', '-y', '-i', VIDEO_FILE, '-i', background_file, '-i', DUB_AUDIO, + '-filter_complex', + f'[0:v]scale={target_width}:{target_height}:force_original_aspect_ratio=decrease,' + f'pad={target_width}:{target_height}:(ow-iw)/2:(oh-ih)/2,' + f'{subtitle_filter}[v];' + f'[1:a]volume=1[a1];[2:a]volume={dub_volume}[a2];' + f'[a1][a2]amix=inputs=2:duration=first:dropout_transition=3[a]' + ] + + if check_gpu_available(): + rprint("[bold green]Using GPU acceleration...[/bold green]") + cmd.extend(['-map', '[v]', '-map', '[a]', '-c:v', 'h264_nvenc']) + else: + cmd.extend(['-map', '[v]', '-map', '[a]']) + + cmd.extend(['-c:a', 'aac', '-b:a', '192k', DUB_VIDEO]) + + subprocess.run(cmd) + rprint(f"[bold green]Video and audio successfully merged into {DUB_VIDEO}[/bold green]") + +if __name__ == '__main__': + merge_video_audio() diff --git a/core/step2_whisperX.py b/core/step2_whisperX.py index c0a4d6f3..064b7750 100644 --- a/core/step2_whisperX.py +++ b/core/step2_whisperX.py @@ -11,15 +11,48 @@ from rich import print as rprint import subprocess import tempfile +import time from core.config_utils import load_key from core.all_whisper_methods.demucs_vl import demucs_main, RAW_AUDIO_FILE, VOCAL_AUDIO_FILE -from core.all_whisper_methods.whisperX_utils import process_transcription, convert_video_to_audio, split_audio, save_results, save_language +from core.all_whisper_methods.whisperX_utils import process_transcription, convert_video_to_audio, split_audio, save_results, save_language, compress_audio, CLEANED_CHUNKS_EXCEL_PATH from core.step1_ytdlp import find_video_files MODEL_DIR = load_key("model_dir") +WHISPER_FILE = "output/audio/for_whisper.mp3" + +def check_hf_mirror() -> str: + """Check and return the fastest HF mirror""" + mirrors = { + 'Official': 'huggingface.co', + 'Mirror': 'hf-mirror.com' + } + fastest_url = f"https://{mirrors['Official']}" + best_time = float('inf') + rprint("[cyan]🔍 Checking HuggingFace mirrors...[/cyan]") + for name, domain in mirrors.items(): + try: + if os.name == 'nt': + cmd = ['ping', '-n', '1', '-w', '3000', domain] + else: + cmd = ['ping', '-c', '1', '-W', '3', domain] + start = time.time() + result = subprocess.run(cmd, capture_output=True, text=True) + response_time = 
time.time() - start + if result.returncode == 0: + if response_time < best_time: + best_time = response_time + fastest_url = f"https://{domain}" + rprint(f"[green]✓ {name}:[/green] {response_time:.2f}s") + except: + rprint(f"[red]✗ {name}:[/red] Failed to connect") + if best_time == float('inf'): + rprint("[yellow]⚠️ All mirrors failed, using default[/yellow]") + rprint(f"[cyan]🚀 Selected mirror:[/cyan] {fastest_url} ({best_time:.2f}s)") + return fastest_url def transcribe_audio(audio_file: str, start: float, end: float) -> Dict: + os.environ['HF_ENDPOINT'] = check_hf_mirror() #? don't know if it's working... WHISPER_LANGUAGE = load_key("whisper.language") device = "cuda" if torch.cuda.is_available() else "cpu" rprint(f"🚀 Starting WhisperX using device: {device} ...") @@ -40,8 +73,8 @@ def transcribe_audio(audio_file: str, start: float, end: float) -> Dict: model_name = "Huan69/Belle-whisper-large-v3-zh-punct-fasterwhisper" local_model = os.path.join(MODEL_DIR, "Belle-whisper-large-v3-zh-punct-fasterwhisper") else: - model_name = "large-v3" - local_model = os.path.join(MODEL_DIR, "large-v3") + model_name = load_key("whisper.model") + local_model = os.path.join(MODEL_DIR, model_name) if os.path.exists(local_model): rprint(f"[green]📥 Loading local WHISPER model:[/green] {local_model} ...") @@ -49,14 +82,8 @@ def transcribe_audio(audio_file: str, start: float, end: float) -> Dict: else: rprint(f"[green]📥 Using WHISPER model from HuggingFace:[/green] {model_name} ...") - vad_options = { - "vad_onset": 0.500, - "vad_offset": 0.363 - } - asr_options = { - "temperatures": [0], - "initial_prompt": "", - } + vad_options = {"vad_onset": 0.500,"vad_offset": 0.363} + asr_options = {"temperatures": [0],"initial_prompt": "",} whisper_language = None if 'auto' in WHISPER_LANGUAGE else WHISPER_LANGUAGE rprint("[bold yellow]**You can ignore warning of `Model was trained with torch 1.10.0+cu102, yours is 2.0.0+cu118...`**[/bold yellow]") model = whisperx.load_model(model_name, device, compute_type=compute_type, language=whisper_language, vad_options=vad_options, asr_options=asr_options, download_root=MODEL_DIR) @@ -108,7 +135,7 @@ def transcribe_audio(audio_file: str, start: float, end: float) -> Dict: raise def transcribe(): - if os.path.exists("output/log/cleaned_chunks.xlsx"): + if os.path.exists(CLEANED_CHUNKS_EXCEL_PATH): rprint("[yellow]⚠️ Transcription results already exist, skipping transcription step.[/yellow]") return @@ -120,22 +147,25 @@ def transcribe(): if load_key("demucs"): demucs_main() - whisper_file = VOCAL_AUDIO_FILE if load_key("demucs") else RAW_AUDIO_FILE + # step2 Compress audio + choose_audio = VOCAL_AUDIO_FILE if load_key("demucs") else RAW_AUDIO_FILE + whisper_audio = compress_audio(choose_audio, WHISPER_FILE) - # step2 Extract audio - segments = split_audio(whisper_file) + # step3 Extract audio + segments = split_audio(whisper_audio) - # step3 Transcribe audio + # step4 Transcribe audio all_results = [] for start, end in segments: - result = transcribe_audio(whisper_file, start, end) + result = transcribe_audio(whisper_audio, start, end) all_results.append(result) - # step4 Combine results + # step5 Combine results combined_result = {'segments': []} for result in all_results: combined_result['segments'].extend(result['segments']) + # step6 Process df df = process_transcription(combined_result) save_results(df) diff --git a/core/step4_1_summarize.py b/core/step4_1_summarize.py index d9604ddb..6a9de493 100644 --- a/core/step4_1_summarize.py +++ b/core/step4_1_summarize.py @@ -12,7 
+12,7 @@ def combine_chunks(): sentences = file.readlines() cleaned_sentences = [line.strip() for line in sentences] combined_text = ' '.join(cleaned_sentences) - return combined_text[:16000] #! Return only the first 16000 characters + return combined_text[:32000] #! Return only the first 32000 characters def search_things_to_note_in_prompt(sentence): """Search for terms to note in the given sentence""" diff --git a/core/step4_2_translate_all.py b/core/step4_2_translate_all.py index aa0fcdd0..8c9ac707 100644 --- a/core/step4_2_translate_all.py +++ b/core/step4_2_translate_all.py @@ -5,7 +5,7 @@ import concurrent.futures from core.translate_once import translate_lines from core.step4_1_summarize import search_things_to_note_in_prompt -from core.step8_gen_audio_task import check_len_then_trim +from core.step8_1_gen_audio_task import check_len_then_trim from core.step6_generate_final_timeline import align_timestamp from core.config_utils import load_key from rich.console import Console diff --git a/core/step5_splitforsub.py b/core/step5_splitforsub.py index b567c8f1..212b31b3 100644 --- a/core/step5_splitforsub.py +++ b/core/step5_splitforsub.py @@ -67,43 +67,37 @@ def valid_align(response_data): return src_parts, tr_parts, tr_remerged -def split_align_subs(src_lines: List[str], tr_lines: List[str], max_retry=5) -> Tuple[List[str], List[str], List[str]]: +def split_align_subs(src_lines: List[str], tr_lines: List[str]) -> Tuple[List[str], List[str], List[str]]: subtitle_set = load_key("subtitle") MAX_SUB_LENGTH = subtitle_set["max_length"] TARGET_SUB_MULTIPLIER = subtitle_set["target_multiplier"] remerged_tr_lines = tr_lines.copy() - for attempt in range(max_retry): - console.print(Panel(f"🔄 Split attempt {attempt + 1}", expand=False)) - to_split = [] - - for i, (src, tr) in enumerate(zip(src_lines, tr_lines)): - src, tr = str(src), str(tr) - if len(src) > MAX_SUB_LENGTH or calc_len(tr) * TARGET_SUB_MULTIPLIER > MAX_SUB_LENGTH: - to_split.append(i) - table = Table(title=f"📏 Line {i} needs to be split") - table.add_column("Type", style="cyan") - table.add_column("Content", style="magenta") - table.add_row("Source Line", src) - table.add_row("Target Line", tr) - console.print(table) - - def process(i): - split_src = split_sentence(src_lines[i], num_parts=2).strip() - src_parts, tr_parts, tr_remerged = align_subs(src_lines[i], tr_lines[i], split_src) - src_lines[i] = src_parts - tr_lines[i] = tr_parts - remerged_tr_lines[i] = tr_remerged - - with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor: - executor.map(process, to_split) - - # Flatten `src_lines` and `tr_lines` - src_lines = [item for sublist in src_lines for item in (sublist if isinstance(sublist, list) else [sublist])] - tr_lines = [item for sublist in tr_lines for item in (sublist if isinstance(sublist, list) else [sublist])] - - if all(len(src) <= MAX_SUB_LENGTH for src in src_lines) and all(calc_len(tr) * TARGET_SUB_MULTIPLIER <= MAX_SUB_LENGTH for tr in tr_lines): - break + to_split = [] + for i, (src, tr) in enumerate(zip(src_lines, tr_lines)): + src, tr = str(src), str(tr) + if len(src) > MAX_SUB_LENGTH or calc_len(tr) * TARGET_SUB_MULTIPLIER > MAX_SUB_LENGTH: + to_split.append(i) + table = Table(title=f"📏 Line {i} needs to be split") + table.add_column("Type", style="cyan") + table.add_column("Content", style="magenta") + table.add_row("Source Line", src) + table.add_row("Target Line", tr) + console.print(table) + + def process(i): + split_src = split_sentence(src_lines[i], 
num_parts=2).strip() + src_parts, tr_parts, tr_remerged = align_subs(src_lines[i], tr_lines[i], split_src) + src_lines[i] = src_parts + tr_lines[i] = tr_parts + remerged_tr_lines[i] = tr_remerged + + with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor: + executor.map(process, to_split) + + # Flatten `src_lines` and `tr_lines` + src_lines = [item for sublist in src_lines for item in (sublist if isinstance(sublist, list) else [sublist])] + tr_lines = [item for sublist in tr_lines for item in (sublist if isinstance(sublist, list) else [sublist])] return src_lines, tr_lines, remerged_tr_lines @@ -114,12 +108,25 @@ def split_for_sub_main(): src = df['Source'].tolist() trans = df['Translation'].tolist() - split_src, split_trans, remerged = split_align_subs(src.copy(), trans, max_retry=3) + subtitle_set = load_key("subtitle") + MAX_SUB_LENGTH = subtitle_set["max_length"] + TARGET_SUB_MULTIPLIER = subtitle_set["target_multiplier"] + for attempt in range(3): # 使用固定的3次重试 + console.print(Panel(f"🔄 Split attempt {attempt + 1}", expand=False)) + split_src, split_trans, remerged = split_align_subs(src.copy(), trans) + + # 检查是否所有字幕都符合长度要求 + if all(len(src) <= MAX_SUB_LENGTH for src in split_src) and \ + all(calc_len(tr) * TARGET_SUB_MULTIPLIER <= MAX_SUB_LENGTH for tr in split_trans): + break + + # 更新源数据继续下一轮分割 + src = split_src + trans = split_trans + pd.DataFrame({'Source': split_src, 'Translation': split_trans}).to_excel(OUTPUT_SPLIT_FILE, index=False) pd.DataFrame({'Source': src, 'Translation': remerged}).to_excel(OUTPUT_REMERGED_FILE, index=False) - - console.print("[bold green]✅ Subtitles splitting and remerging completed![/bold green]") if __name__ == '__main__': split_for_sub_main() diff --git a/core/step7_merge_sub_to_vid.py b/core/step7_merge_sub_to_vid.py index 07641bb1..dd9bd7ac 100644 --- a/core/step7_merge_sub_to_vid.py +++ b/core/step7_merge_sub_to_vid.py @@ -27,7 +27,7 @@ TRANS_BACK_COLOR = '&H33000000' OUTPUT_DIR = "output" -OUTPUT_VIDEO = f"{OUTPUT_DIR}/output_video_with_subs.mp4" +OUTPUT_VIDEO = f"{OUTPUT_DIR}/output_sub.mp4" SRC_SRT = f"{OUTPUT_DIR}/src.srt" TRANS_SRT = f"{OUTPUT_DIR}/trans.srt" diff --git a/core/step8_gen_audio_task.py b/core/step8_1_gen_audio_task.py similarity index 63% rename from core/step8_gen_audio_task.py rename to core/step8_1_gen_audio_task.py index 494de96f..3b63499c 100644 --- a/core/step8_gen_audio_task.py +++ b/core/step8_1_gen_audio_task.py @@ -9,38 +9,23 @@ from rich.panel import Panel from rich.console import Console from core.config_utils import load_key +from core.all_tts_functions.estimate_duration import init_estimator, estimate_duration console = Console() speed_factor = load_key("speed_factor") TRANS_SUBS_FOR_AUDIO_FILE = 'output/audio/trans_subs_for_audio.srt' SRC_SUBS_FOR_AUDIO_FILE = 'output/audio/src_subs_for_audio.srt' -SOVITS_TASKS_FILE = 'output/audio/sovits_tasks.xlsx' +SOVITS_TASKS_FILE = 'output/audio/tts_tasks.xlsx' +ESTIMATOR = None def check_len_then_trim(text, duration): - multiplier = speed_factor['normal'] * speed_factor['max'] - # Define speech speed: characters/second or words/second, punctuation/second - speed_zh_ja = 4 * multiplier # Chinese and Japanese characters per second - speed_en_and_others = 5 * multiplier # Words per second for English and other languages - speed_punctuation = 4 * multiplier # Punctuation marks per second - - # Count characters, words, and punctuation for each language - chinese_japanese_chars = 
len(re.findall(r'[\u4e00-\u9fff\u3040-\u30ff\u3400-\u4dbf\uf900-\ufaff\uff66-\uff9f]', text)) - en_and_others_words = len(re.findall(r'\b[a-zA-ZàâçéèêëîïôûùüÿñæœáéíóúüñÁÉÍÓÚÜÑàèéìíîòóùúÀÈÉÌÍÎÒÓÙÚäöüßÄÖÜа-яА-Я]+\b', text)) - punctuation_count = len(re.findall(r'[,.!?;:,。!?;:](?=.)', text)) - - # Estimate duration for each language part and punctuation - chinese_japanese_duration = chinese_japanese_chars / speed_zh_ja - en_and_others_duration = en_and_others_words / speed_en_and_others - punctuation_duration = punctuation_count / speed_punctuation - - # Total estimated duration - estimated_duration = chinese_japanese_duration + en_and_others_duration + punctuation_duration + global ESTIMATOR + if ESTIMATOR is None: + ESTIMATOR = init_estimator() + estimated_duration = estimate_duration(text, ESTIMATOR) / speed_factor['max'] console.print(f"Subtitle text: {text}, " - f"Subtitle info: Chinese/Japanese chars: {chinese_japanese_chars}, " - f"English and other language words: {en_and_others_words}, " - f"Punctuation marks: {punctuation_count}, " f"[bold green]Estimated reading duration: {estimated_duration:.2f} seconds[/bold green]") if estimated_duration > duration: @@ -62,8 +47,15 @@ def valid_trim(response): else: return text +def time_diff_seconds(t1, t2, base_date): + """Calculate the difference in seconds between two time objects""" + dt1 = datetime.datetime.combine(base_date, t1) + dt2 = datetime.datetime.combine(base_date, t2) + return (dt2 - dt1).total_seconds() + def process_srt(): """Process srt file, generate audio tasks""" + with open(TRANS_SUBS_FOR_AUDIO_FILE, 'r', encoding='utf-8') as file: content = file.read() @@ -92,8 +84,7 @@ def process_srt(): start_time, end_time = lines[1].split(' --> ') start_time = datetime.datetime.strptime(start_time, '%H:%M:%S,%f').time() end_time = datetime.datetime.strptime(end_time, '%H:%M:%S,%f').time() - duration = (datetime.datetime.combine(datetime.date.today(), end_time) - - datetime.datetime.combine(datetime.date.today(), start_time)).total_seconds() + duration = time_diff_seconds(start_time, end_time, datetime.date.today()) text = ' '.join(lines[2:]) # Remove content within parentheses (including English and Chinese parentheses) text = re.sub(r'\([^)]*\)', '', text).strip() @@ -108,48 +99,40 @@ def process_srt(): rprint(Panel(f"Unable to parse subtitle block '{block}', error: {str(e)}, skipping this subtitle block.", title="Error", border_style="red")) continue - subtitles.append({ - 'number': number, - 'start_time': start_time, - 'end_time': end_time, - 'duration': duration, - 'text': text, - 'origin': origin - }) + subtitles.append({'number': number, 'start_time': start_time, 'end_time': end_time, 'duration': duration, 'text': text, 'origin': origin}) df = pd.DataFrame(subtitles) i = 0 - MIN_SUBTITLE_DURATION = load_key("min_subtitle_duration") + MIN_SUB_DUR = load_key("min_subtitle_duration") while i < len(df): - if df.loc[i, 'duration'] < MIN_SUBTITLE_DURATION: - if i < len(df) - 1 and (datetime.datetime.combine(datetime.date.today(), df.loc[i+1, 'start_time']) - - datetime.datetime.combine(datetime.date.today(), df.loc[i, 'start_time'])).total_seconds() < MIN_SUBTITLE_DURATION: + today = datetime.date.today() + if df.loc[i, 'duration'] < MIN_SUB_DUR: + if i < len(df) - 1 and time_diff_seconds(df.loc[i, 'start_time'],df.loc[i+1, 'start_time'],today) < MIN_SUB_DUR: rprint(f"[bold yellow]Merging subtitles {i+1} and {i+2}[/bold yellow]") df.loc[i, 'text'] += ' ' + df.loc[i+1, 'text'] df.loc[i, 'origin'] += ' ' + df.loc[i+1, 'origin'] 
df.loc[i, 'end_time'] = df.loc[i+1, 'end_time'] - df.loc[i, 'duration'] = (datetime.datetime.combine(datetime.date.today(), df.loc[i, 'end_time']) - - datetime.datetime.combine(datetime.date.today(), df.loc[i, 'start_time'])).total_seconds() + df.loc[i, 'duration'] = time_diff_seconds(df.loc[i, 'start_time'],df.loc[i, 'end_time'],today) df = df.drop(i+1).reset_index(drop=True) else: if i < len(df) - 1: # Not the last audio - rprint(f"[bold blue]Extending subtitle {i+1} duration to {MIN_SUBTITLE_DURATION} seconds[/bold blue]") - df.loc[i, 'end_time'] = (datetime.datetime.combine(datetime.date.today(), df.loc[i, 'start_time']) + - datetime.timedelta(seconds=MIN_SUBTITLE_DURATION)).time() - df.loc[i, 'duration'] = MIN_SUBTITLE_DURATION + rprint(f"[bold blue]Extending subtitle {i+1} duration to {MIN_SUB_DUR} seconds[/bold blue]") + df.loc[i, 'end_time'] = (datetime.datetime.combine(today, df.loc[i, 'start_time']) + + datetime.timedelta(seconds=MIN_SUB_DUR)).time() + df.loc[i, 'duration'] = MIN_SUB_DUR else: - rprint(f"[bold red]The last subtitle {i+1} duration is less than {MIN_SUBTITLE_DURATION} seconds, but not extending[/bold red]") + rprint(f"[bold red]The last subtitle {i+1} duration is less than {MIN_SUB_DUR} seconds, but not extending[/bold red]") i += 1 else: i += 1 df['start_time'] = df['start_time'].apply(lambda x: x.strftime('%H:%M:%S.%f')[:-3]) df['end_time'] = df['end_time'].apply(lambda x: x.strftime('%H:%M:%S.%f')[:-3]) - - # check and trim subtitle length, for twice to ensure the subtitle length is within the limit - for _ in range(2): - df['text'] = df.apply(lambda x: check_len_then_trim(x['text'], x['duration']), axis=1) + + ##! No longer perform secondary trim + # check and trim subtitle length, for twice to ensure the subtitle length is within the limit, 允许tolerance + # df['text'] = df.apply(lambda x: check_len_then_trim(x['text'], x['duration']+x['tolerance']), axis=1) return df diff --git a/core/step8_2_gen_dub_chunks.py b/core/step8_2_gen_dub_chunks.py new file mode 100644 index 00000000..bc92a80a --- /dev/null +++ b/core/step8_2_gen_dub_chunks.py @@ -0,0 +1,188 @@ +import pandas as pd +import os, sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from core.config_utils import load_key +from core.all_whisper_methods.whisperX_utils import get_audio_duration +from core.step8_1_gen_audio_task import time_diff_seconds +import datetime +import re +from core.all_tts_functions.estimate_duration import init_estimator, estimate_duration +from rich import print as rprint + +INPUT_EXCEL = "output/audio/tts_tasks.xlsx" +OUTPUT_EXCEL = "output/audio/tts_tasks.xlsx" +TRANSCRIPT_FILE = "output/trans.srt" +MAX_MERGE_COUNT = 5 +AUDIO_FILE = 'output/audio/raw.mp3' +ESTIMATOR = None + +def calc_if_too_fast(est_dur, tol_dur, duration, tolerance): + accept = load_key("speed_factor.accept") # Maximum acceptable speed factor + if est_dur / accept > tol_dur: # Even max speed factor cannot adapt + return 2 + elif est_dur > tol_dur: # Speed adjustment needed within acceptable range + return 1 + elif est_dur < duration - tolerance: # Speaking speed too slow + return -1 + else: # Normal speaking speed + return 0 + +def merge_rows(df, start_idx, merge_count): + """Merge multiple rows and calculate cumulative values""" + merged = { + 'est_dur': df.iloc[start_idx]['est_dur'], + 'tol_dur': df.iloc[start_idx]['tol_dur'], + 'duration': df.iloc[start_idx]['duration'] + } + + while merge_count < MAX_MERGE_COUNT and (start_idx + merge_count) < len(df): + next_row = 
df.iloc[start_idx + merge_count] + merged['est_dur'] += next_row['est_dur'] + merged['tol_dur'] += next_row['tol_dur'] + merged['duration'] += next_row['duration'] + + speed_flag = calc_if_too_fast( + merged['est_dur'], + merged['tol_dur'], + merged['duration'], + df.iloc[start_idx + merge_count]['tolerance'] + ) + + if speed_flag <= 0 or merge_count == 2: + df.at[start_idx + merge_count, 'cut_off'] = 1 + return merge_count + 1 + + merge_count += 1 + + # If no suitable merge point is found + if merge_count >= MAX_MERGE_COUNT or (start_idx + merge_count) >= len(df): + df.at[start_idx + merge_count - 1, 'cut_off'] = 1 + return merge_count + +def analyze_subtitle_timing_and_speed(df): + rprint("[🔍 Analyzing] Calculating subtitle timing and speed...") + global ESTIMATOR + if ESTIMATOR is None: + ESTIMATOR = init_estimator() + TOLERANCE = load_key("tolerance") + whole_dur = get_audio_duration(AUDIO_FILE) + df['gap'] = 0.0 # Initialize gap column + for i in range(len(df) - 1): + current_end = datetime.datetime.strptime(df.loc[i, 'end_time'], '%H:%M:%S.%f').time() + next_start = datetime.datetime.strptime(df.loc[i + 1, 'start_time'], '%H:%M:%S.%f').time() + df.loc[i, 'gap'] = time_diff_seconds(current_end, next_start, datetime.date.today()) + + # Set the gap for the last line + last_end = datetime.datetime.strptime(df.iloc[-1]['end_time'], '%H:%M:%S.%f').time() + last_end_seconds = (last_end.hour * 3600 + last_end.minute * 60 + + last_end.second + last_end.microsecond / 1000000) + df.iloc[-1, df.columns.get_loc('gap')] = whole_dur - last_end_seconds + + df['tolerance'] = df['gap'].apply(lambda x: TOLERANCE if x > TOLERANCE else x) + df['tol_dur'] = df['duration'] + df['tolerance'] + df['est_dur'] = df.apply(lambda x: estimate_duration(x['text'], ESTIMATOR), axis=1) + + ## Calculate speed indicators + accept = load_key("speed_factor.accept") # Maximum acceptable speed factor + def calc_if_too_fast(row): + est_dur = row['est_dur'] + tol_dur = row['tol_dur'] + duration = row['duration'] + tolerance = row['tolerance'] + + if est_dur / accept > tol_dur: # Even max speed factor cannot adapt + return 2 + elif est_dur > tol_dur: # Speed adjustment needed within acceptable range + return 1 + elif est_dur < duration - tolerance: # Speaking speed too slow + return -1 + else: # Normal speaking speed + return 0 + + df['if_too_fast'] = df.apply(calc_if_too_fast, axis=1) + return df + +def process_cutoffs(df): + rprint("[✂️ Processing] Generating cutoff points...") + df['cut_off'] = 0 # Initialize cut_off column + df.loc[df['gap'] >= load_key("tolerance"), 'cut_off'] = 1 # Set to 1 when gap is greater than TOLERANCE + idx = 0 + while idx < len(df): + # Process marked split points + if df.iloc[idx]['cut_off'] == 1: + if df.iloc[idx]['if_too_fast'] == 2: + rprint(f"[⚠️ Warning] Line {idx} is too fast and cannot be fixed by speed adjustment") + idx += 1 + continue + + # Process the last line + if idx + 1 >= len(df): + df.at[idx, 'cut_off'] = 1 + break + + # Process normal or slow lines + if df.iloc[idx]['if_too_fast'] <= 0: + if df.iloc[idx + 1]['if_too_fast'] <= 0: + df.at[idx, 'cut_off'] = 1 + idx += 1 + else: + idx += merge_rows(df, idx, 1) + # Process fast lines + else: + idx += merge_rows(df, idx, 1) + + return df + +def gen_dub_chunks(): + rprint("[🎬 Starting] Generating dubbing chunks...") + df = pd.read_excel(INPUT_EXCEL) + + rprint("[📊 Processing] Analyzing timing and speed...") + df = analyze_subtitle_timing_and_speed(df) + + rprint("[✂️ Processing] Processing cutoffs...") + df = process_cutoffs(df) + + 
rprint("[📝 Reading] Loading transcript file...") + content = open(TRANSCRIPT_FILE, "r", encoding="utf-8").read() + + # Process subtitle content + content_lines = [] + for block in content.strip().split('\n\n'): + lines = [line.strip() for line in block.split('\n') if line.strip()] + if len(lines) >= 3: + text = ' '.join(lines[2:]) + # Clean text + text = re.sub(r'\([^)]*\)|([^)]*)', '', text).strip().replace('-', '') + content_lines.append(text) + + # Match processing + df['lines'] = None + last_idx = 0 + + for idx, row in df.iterrows(): + target = row['text'].replace(' ', '') + matches = [] + current = '' + + for i in range(last_idx, len(content_lines)): + line = content_lines[i].replace(' ', '') + current += line + matches.append(content_lines[i]) + + if current == target: + df.at[idx, 'lines'] = matches + last_idx = i + 1 + break + else: # If no match is found + rprint(f"[❌ Error] Matching failed at line {idx}:") + rprint(f"Target: '{target}'") + rprint(f"Current: '{current}'") + raise ValueError("Matching failed") + + # Save results + df.to_excel(OUTPUT_EXCEL, index=False) + rprint("[✅ Complete] Matching completed successfully!") + +if __name__ == "__main__": + gen_dub_chunks() \ No newline at end of file diff --git a/core/step9_extract_refer_audio.py b/core/step9_extract_refer_audio.py index 1cef1149..74777404 100644 --- a/core/step9_extract_refer_audio.py +++ b/core/step9_extract_refer_audio.py @@ -12,7 +12,7 @@ # Simplified path definitions REF_DIR = 'output/audio/refers' SEG_DIR = 'output/audio/segs' -TASKS_FILE = 'output/audio/sovits_tasks.xlsx' +TASKS_FILE = 'output/audio/tts_tasks.xlsx' def time_to_samples(time_str, sr): """Unified time conversion function""" diff --git a/docs/pages/docs/start.en-US.md b/docs/pages/docs/start.en-US.md index 5c68828a..de78adcc 100644 --- a/docs/pages/docs/start.en-US.md +++ b/docs/pages/docs/start.en-US.md @@ -1,49 +1,43 @@ # 🚀 Getting Started ## 📋 API Configuration -This project requires Large Language Models and TTS. Multiple options are provided for each component. **Please read the configuration guide carefully 😊** +This project requires Large Language Models and TTS. **Recommended to use [SiliconFlow](https://cloud.siliconflow.cn/i/ttKDEsxE)**, which offers free credits upon registration and only needs one key for all features. ### 1. **Get API_KEY for Large Language Models**: | Recommended Model | Recommended Provider | base_url | Price | Effect | |:-----|:---------|:---------|:-----|:---------| -| gemini-1.5-pro-002 | [Yunwu API](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | $0.70 / 1M tokens | 🤩 | -| claude-3-5-sonnet-20240620 | [Yunwu API](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | $1.40 / 1M tokens | 🤩 | -| gpt-4o | [Yunwu API](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | $0.70 / 1M tokens | 😃 | +| Qwen/Qwen2.5-72B-Instruct | [SiliconFlow](https://cloud.siliconflow.cn/i/ttKDEsxE) | https://api.siliconflow.cn | ¥4 / 1M tokens | 😃 | +| claude-3-5-sonnet-20240620 | / | / | $15 / 1M tokens | 🤩 | -⚠️ Warning: The prompts involve multi-step reasoning chains and complex JSON formats. Weak models are prone to errors. An 1-hour video costs about $1.40 using Claude, and about $0.70 with other models. If using the official Grok API, please note to adjust max_workers to 1 in the config. - -> Note: Yunwu API also supports OpenAI's tts-1 interface, which can be used in the dubbing step. - -
-How to get API key from Yunwu API? - -1. Go to [Yunwu API website](https://yunwu.zeabur.app/register?aff=TXMB) -2. Register an account and top up -3. Create a new key on the API key page -4. Make sure to check `Unlimited quota`, recommended channel is `Pure AZ 1.5x` -
- -
-Can I use other models? - -- ✅ Supports OAI-Like API interfaces, you can change in the Streamlit sidebar. -- ⚠️ However, other models (especially smaller ones) have weaker instruction following capabilities and are very likely to error during translation. Strongly not recommended. If errors occur, please switch models. -
+Note: Any OpenAI-compatible interface is supported, so you can try different models. However, the process involves multi-step reasoning chains and complex JSON formats, so **models smaller than 30B are not recommended**. ### 2. **TTS API** VideoLingo provides multiple TTS integration methods. Here's a comparison (skip if only using translation without dubbing) | TTS Solution | Pros | Cons | Chinese Effect | Non-Chinese Effect | |:---------|:-----|:-----|:---------|:-----------| +| 🎙️ SiliconFlow FishTTS (Recommended) | Supports cloning | Not the best | 😃 | 😃 | | 🎙️ OpenAI TTS | Realistic emotions | Chinese sounds foreign | 😕 | 🤩 | -| 🔊 Azure TTS (Recommended) | Natural effect | Difficult to top up | 🤩 | 😃 | -| 🎤 Fish TTS | Authentic native speaker | Limited official models | 😂 | 😂 | -| 🗣️ GPT-SoVITS (Testing) | Best voice cloning | Currently only supports Chinese/English, requires NVIDIA GPU for inference, configuration requires relevant knowledge | 🏆 | 🚫 | +| 🔊 Azure TTS | Natural effect | Limited emotions | 🤩 | 😃 | +| 🎤 Fish TTS | Authentic native speaker | Limited official models | 😂 | 😂 | +| 🗣️ GPT-SoVITS | Best voice cloning | Only supports Chinese/English, requires local inference, complex setup | 🏆 | 🚫 | -- For OpenAI TTS, recommended to use [Yunwu API](https://yunwu.zeabur.app/register?aff=TXMB), make sure to select `tts-1` for the model; -- For Azure TTS, register and top up on the [official website](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/get-started-text-to-speech?tabs=windows%2Cterminal&pivots=programming-language-python) (has free quota); -- For Fish TTS, register on the [official website](https://fish.audio/en/go-api/) (comes with $10 credit) +- For SiliconFlow FishTTS, get a key from [SiliconFlow](https://cloud.siliconflow.cn/i/ttKDEsxE); note that the cloning feature requires paid credits; +- For OpenAI TTS, [Yunwu API](https://yunwu.zeabur.app/register?aff=TXMB) is recommended; +- For Azure TTS, register on the official website or purchase a key from a third party; +- For Fish TTS, register on the [official website](https://fish.audio/en/go-api/) (comes with $10 of credit) + +
+SiliconFlow FishTTS Tutorial + +Three modes are currently supported: + +1. `preset`: Uses a fixed voice; you can preview the options on the [Official Playground](https://cloud.siliconflow.cn/playground/text-to-speech/17885302608). The default is `anna`. +2. `clone(stable)`: Corresponds to the FishTTS API's `custom` mode; it clones the voice from an uploaded clip, automatically sampling the first 10 seconds of the video's audio, and is more stable. +3. `clone(dynamic)`: Corresponds to the FishTTS API's `dynamic` mode; it uses each sentence as the reference audio during TTS, so the voice may vary between lines, but the overall effect is often better. + +
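If you would rather switch modes programmatically than through the Streamlit sidebar, a minimal sketch is shown below. It assumes the `sf_fish_tts` block added in this PR's `config.yaml` (keys `api_key`, `voice`, `mode`) and the `preset` / `custom` / `dynamic` values noted next to `mode` there; the helper name is illustrative only.

```python
# Hedged sketch: switch the SiliconFlow FishTTS mode in config.yaml.
# Assumes the sf_fish_tts block introduced in this PR; "preset", "custom"
# and "dynamic" are the values listed next to `mode` in config.yaml.
from ruamel.yaml import YAML

def set_sf_fish_tts_mode(mode: str, config_path: str = "config.yaml") -> None:
    assert mode in ("preset", "custom", "dynamic"), f"unknown sf_fish_tts mode: {mode}"
    yaml = YAML()  # round-trip mode keeps the comments in config.yaml
    yaml.preserve_quotes = True
    with open(config_path, "r", encoding="utf-8") as f:
        cfg = yaml.load(f)
    cfg["sf_fish_tts"]["mode"] = mode
    with open(config_path, "w", encoding="utf-8") as f:
        yaml.dump(cfg, f)

if __name__ == "__main__":
    set_sf_fish_tts_mode("custom")  # "clone(stable)" in the sidebar maps to "custom"
```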
How to choose OpenAI voices? @@ -118,12 +112,22 @@ After configuration, select `Reference Audio Mode` in the sidebar (see Yuque doc ## 🛠️ Quick Start -VideoLingo supports Windows, macOS and Linux systems, and can run on CPU or GPU. For GPU acceleration on Windows, install these dependencies: +VideoLingo supports Windows, macOS and Linux systems, and can run on CPU or GPU. + +For GPU acceleration on Windows, install these dependencies: - [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) - [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) -> Note: After installing CUDA and CUDNN, check if they're added to system path and restart computer 🔄 +> Note: After installing, add `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` to system path and restart computer 🔄 + +### Windows One-Click Install + +Make sure [Git](https://git-scm.com/downloads) is installed, + +1. Download source code locally + +2. Double click `OneKeyInstall&Start.bat` to complete installation and launch webpage ### Source Installation @@ -157,26 +161,24 @@ Basic Python knowledge required. For any issues, ask the AI assistant at [videol ``` Script will automatically install appropriate torch version -5. 🎉 Enter command or click `OneKeyStart.bat` to launch Streamlit app: +5. 🎉 Enter command to launch Streamlit app: ```bash streamlit run st.py ``` -6. Set key in sidebar of popup webpage, and note whisper method and transcription language selection +6. Set key in sidebar of popup webpage and start using~ - ![en_set](https://github.com/user-attachments/assets/2f32f49b-0b7a-4ff4-930f-4e5f9bac9002) + ![zh_set](https://github.com/user-attachments/assets/bb9381d0-8d99-4d8b-aaff-9846076fc7a3) -7. Whisper transcription will automatically download models, but for users who cannot access Huggingface through command line, you can manually download whisper models and place them in the root directory: [Baidu Drive](https://pan.baidu.com/s/1Igo_FvFV4Xcb8tSYT0ktpA?pwd=e1c7) +7. Transcription step will automatically download models from huggingface, or you can download manually and place `_model_cache` folder in VideoLingo directory: [Baidu Drive](https://pan.baidu.com/s/1Igo_FvFV4Xcb8tSYT0ktpA?pwd=e1c7) -8. More settings can be manually modified in `config.yaml`, watch command line output during operation +8. (Optional) More settings can be manually modified in `config.yaml`, watch command line output during operation ## 🚨 Common Errors -1. **'Empty Translation Line'**: This occurs when using a less capable LLM that omits short phrases during translation. Solution: Please retry with Claude 3.5 Sonnet. - -2. **'Key Error' during translation**: +1. **'Key Error' during translation**: - Reason 1: Same as above, weaker models have poor JSON format compliance. - Reason 2: LLM may refuse to translate sensitive content. Solution: Check `response` and `msg` fields in `output/gpt_log/error.json`. -3. **'Retry Failed', 'SSL', 'Connection', 'Timeout'**: Usually network issues. Solution: Users in mainland China please switch network nodes and retry. +2. **'Retry Failed', 'SSL', 'Connection', 'Timeout'**: Usually network issues. Solution: Users in mainland China please switch network nodes and retry. 
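For the 'Key Error' case above, a quick way to inspect what the LLM actually returned is sketched below. It assumes `output/gpt_log/error.json` holds either a list of error records or a single record with `response` and `msg` fields, as the error note suggests; adjust the loading step if the real layout differs.

```python
# Hedged sketch: print the `msg` and `response` fields from the GPT error log.
# The exact structure of output/gpt_log/error.json is assumed, not confirmed.
import json

LOG_FILE = "output/gpt_log/error.json"

def show_llm_errors(path: str = LOG_FILE) -> None:
    with open(path, "r", encoding="utf-8") as f:
        records = json.load(f)
    if isinstance(records, dict):  # tolerate a single record instead of a list
        records = [records]
    for i, rec in enumerate(records, 1):
        print(f"--- error {i} ---")
        print("msg:     ", rec.get("msg"))
        print("response:", rec.get("response"))

if __name__ == "__main__":
    show_llm_errors()
```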
diff --git a/docs/pages/docs/start.zh-CN.md b/docs/pages/docs/start.zh-CN.md index a972219c..202c2daf 100644 --- a/docs/pages/docs/start.zh-CN.md +++ b/docs/pages/docs/start.zh-CN.md @@ -1,34 +1,16 @@ # 🚀 开始使用 -## 📋 API 配置准备 -本项目需使用大模型 和 TTS ,每个环节都提供了多种选择,**请仔细阅读配置指南😊** +## 📋 API 配置指南 +本项目需使用大模型 和 TTS ,**推荐使用 [硅基流动](https://cloud.siliconflow.cn/i/ttKDEsxE)**,注册送积分,只需要一个 Key 即可体验全部功能。 + ### 1. **获取大模型的 API_KEY**: | 推荐模型 | 推荐提供商 | base_url | 价格 | 效果 | |:-----|:---------|:---------|:-----|:---------| -| gemini-1.5-pro-002 | [云雾 api](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | ¥7 / 1M tokens | 🤩 | -| claude-3-5-sonnet-20240620 | [云雾 api](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | ¥10 / 1M tokens | 🤩 | -| gpt-4o | [云雾 api](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | ¥7 / 1M tokens | 😃 | - -⚠️ 警告:prompt 涉及多步思维链和复杂的json格式,弱模型容易出错。1h 视频使用 claude 花费约 10 元。 - -> 注:云雾api 还支持 openai 的 tts-1 接口,可在配音步骤选用。 - -
-云雾api 如何获取 api key? - -1. 前往 [云雾 api 官网](https://yunwu.zeabur.app/register?aff=TXMB) -2. 注册账户并充值 -3. 在 api key 页面新建一个 key -4. 注意勾选 `无限额度` ,渠道建议选 `纯AZ 1.5倍` -
- -
-能用别的模型吗? +| Qwen/Qwen2.5-72B-Instruct | [硅基流动](https://cloud.siliconflow.cn/i/ttKDEsxE) | https://api.siliconflow.cn | ¥4 / 1M tokens | 😃 | +| claude-3-5-sonnet-20240620 | / | / | $15 / 1M tokens | 🤩 | -- ✅ 支持 OAI-Like 的 API 接口,需要自行在 streamlit 侧边栏更换。 -- ⚠️ 但其他模型(尤其是小模型)遵循指令要求能力弱,非常容易在翻译过程报错,强烈不推荐,遇到报错请更换模型。 -
+注:支持 Openai 接口,可自行尝试不同模型。但处理过程涉及多步思维链和复杂的json格式,**不建议使用小于 30B 的模型**。 ### 2. **TTS 的 API** @@ -36,14 +18,27 @@ VideoLingo提供了多种 tts 接入方式,以下是对比(如不使用配 | TTS 方案 | 优点 | 缺点 | 中文效果 | 非中文效果 | |:---------|:-----|:-----|:---------|:-----------| +| 🎙️ SiliconFlow FishTTS (推荐) | 支持克隆 | 不是最好 | 😃 | 😃 | | 🎙️ OpenAI TTS | 情感真实 | 中文听起来像外国人 | 😕 | 🤩 | -| 🔊 Azure TTS (推荐) | 效果自然 | 充值不方便 | 🤩 | 😃 | +| 🔊 Azure TTS | 效果自然 | 情感不够丰富 | 🤩 | 😃 | | 🎤 Fish TTS | 真是本地人 | 官方模型有限 | 😂 | 😂 | -| 🗣️ GPT-SoVITS (测试) | 最强语音克隆 | 目前只支持中英文,需要N卡推理模型,配置需要相关知识 | 🏆 | 🚫 | +| 🗣️ GPT-SoVITS | 最强语音克隆 | 只支持中英文,需要本地推理,配置麻烦 | 🏆 | 🚫 | + +- SiliconFlow FishTTS 请在 [硅基流动](https://cloud.siliconflow.cn/i/ttKDEsxE) 获取key,注意克隆功能需要付费充值积分; +- OpenAI TTS,推荐使用 [云雾 api](https://yunwu.zeabur.app/register?aff=TXMB); +- Azure TTS 可以在官网注册获取key,也可以淘宝购买; +- Fish TTS 可以在 [官网](https://fish.audio/zh-CN/go-api/) 注册(送10刀额度) + +
+SiliconFlow FishTTS 使用教程 + +目前支持 3 种模式: -- OpenAI TTS,推荐使用 [云雾 api](https://yunwu.zeabur.app/register?aff=TXMB),注意在模型处勾选 `tts-1`; -- Azure TTS 在 [官网](https://learn.microsoft.com/zh-cn/azure/ai-services/speech-service/get-started-text-to-speech?tabs=windows%2Cterminal&pivots=programming-language-python) 注册充值(有免费额度); -- Fish TTS 请自行在 [官网](https://fish.audio/zh-CN/go-api/) 注册(送10刀额度) +1. `preset`: 使用固定音色,可以在 [官网Playground](https://cloud.siliconflow.cn/playground/text-to-speech/17885302608) 试听,默认 `anna`。 +2. `clone(stable)`: 对应 fishtts api 的 `custom`,使用一段上传音频的音色,会自动采集视频前十秒声音作为音色使用,比较稳定。 +3. `clone(dynamic)`: 对应 fishtts api 的 `dynamic`,在 tts 过程使用每一句作为参考音频,可能出现音色不一致,但效果更好。 + +
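As a rough illustration of the `clone(stable)` sampling described above (the pipeline performs this step itself), the sketch below cuts the first 10 seconds of the extracted audio track as a reference clip. The paths reuse `output/audio/raw.mp3` and `output/audio/refers` from elsewhere in this PR, but the output file name is a placeholder.

```python
# Illustrative sketch of the reference-audio sampling used by clone(stable):
# take the first 10 seconds of the extracted audio as the voice reference.
# Paths follow this PR's layout; the real pipeline handles this automatically.
import os
from pydub import AudioSegment

def sample_reference_clip(src_audio: str = "output/audio/raw.mp3",
                          ref_out: str = "output/audio/refers/ref_10s.wav",
                          seconds: int = 10) -> str:
    os.makedirs(os.path.dirname(ref_out), exist_ok=True)
    audio = AudioSegment.from_file(src_audio)
    audio[: seconds * 1000].export(ref_out, format="wav")  # pydub slices in milliseconds
    return ref_out

if __name__ == "__main__":
    print(sample_reference_clip())
```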
OpenAI 声音怎么选? @@ -61,7 +56,7 @@ VideoLingo提供了多种 tts 接入方式,以下是对比(如不使用配
Fish TTS 声音怎么选? -前往 [官网](https://fish.audio/zh-CN/) 中试听��择你想要的声音,��� URL 中可以找到该声音对应的代号,例如丁真是 `54a5170264694bfc8e9ad98df7bd89c3`,热门的几种声音已添加在 `config.yaml` 中。如需使用其他声音,请在 `config.yaml` 中修改 `fish_tts.character_id_dict` 字典。 +前往 [官网](https://fish.audio/zh-CN/) 中试听选择你想要的声音,在 URL 中可以找到该声音对应的代号,例如丁真是 `54a5170264694bfc8e9ad98df7bd89c3`,热门的几种声音已添加在 `config.yaml` 中。如需使用其他声音,请在 `config.yaml` 中修改 `fish_tts.character_id_dict` 字典。
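To wire a newly found voice code into the pipeline, the lookup below mirrors how other nested keys are read in this PR. It assumes `load_key` resolves `fish_tts.character_id_dict` the same way it resolves keys such as `speed_factor.accept`; the helper and the example entry (丁真's id, quoted from the paragraph above) are for illustration only.

```python
# Hedged sketch: read a Fish TTS character id from config.yaml via load_key,
# assuming nested-key lookup works as it does elsewhere in this PR.
from core.config_utils import load_key

def get_fish_character_id(name: str) -> str:
    id_dict = load_key("fish_tts.character_id_dict")
    if name not in id_dict:
        raise KeyError(f"'{name}' is not in fish_tts.character_id_dict; add it to config.yaml first")
    return id_dict[name]

if __name__ == "__main__":
    print(get_fish_character_id("丁真"))  # expected: 54a5170264694bfc8e9ad98df7bd89c3
```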
@@ -118,12 +113,27 @@ VideoLingo提供了多种 tts 接入方式,以下是对比(如不使用配 ## 🛠️ 快速上手 -VideoLingo 支持 Windows、macOS 和 Linux 系统,可使用 CPU 或 GPU 运行。对于 Windows 系统使用 GPU 加速,需要安装以下依赖: +VideoLingo 支持 Windows、macOS 和 Linux 系统,可使用 CPU 或 GPU 运行。 + +对于 Windows 系统使用 GPU 加速,需要安装以下依赖: - [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) - [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) -> 注意:安装完 CUDA 和 CUDNN 后需要检查是否添加到了系统路径,并重启计算机 🔄 +> 注意:安装后需要将 `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` 添加至系统环境变量,并重启计算机 🔄 + +### Windows 一键安装 + +请确保已安装 [Git](https://git-scm.com/downloads), + +1. 下载源码到本地 + +2. (可选)应用汉化补丁: + - 打开项目根目录下的 `i18n/中文` 文件夹 + - 将该文件夹中的所有内容复制到项目根目录 + - 在弹出的提示中选择"替换目标中的文件" + +3. 双击 `OneKeyInstall&Start.bat` 即可完成安装并启动网页 ### 源码安装 @@ -134,7 +144,7 @@ VideoLingo 支持 Windows、macOS 和 Linux 系统,可使用 CPU 或 GPU 运 需要一定的 python 基础,遇到任何问题可以询问官方网站 [videolingo.io](https://videolingo.io) 右下角的AI助手~ -1. 打开 Anaconda Prompt 并切换到你想安装的目录,例如桌面: +1. 打开 `Anaconda Prompt` 并切换到你想安装的目录,例如桌面: ```bash cd desktop ``` @@ -151,13 +161,11 @@ VideoLingo 支持 Windows、macOS 和 Linux 系统,可使用 CPU 或 GPU 运 conda activate videolingo ``` -4. 应用汉化补丁: - - 打开项目根目录下的 `i18n/中文` 文件夹 - - 将该文件夹中的所有内容复制到项目根目录 - - 在弹出的提示中选择"替换目标中的文件" - (注意:Mac系统会删除整个目标文件夹后再复制,而Windows只会替换重复的文件。Mac用户建议手动将文件逐个移动到目标位置) +4. (可选)应用汉化补丁: + + 参照 **一键安装** 中的说明 - 完成以上步骤后,界面将切换为中文显示。 + (注意:Mac系统会删除整个目标文件夹后再复制,而Windows只会替换重复的文件。Mac用户建议手动将文件逐个移动到目标位置) 5. 运行安装脚本: ```bash @@ -174,17 +182,15 @@ VideoLingo 支持 Windows、macOS 和 Linux 系统,可使用 CPU 或 GPU 运 ![zh_set](https://github.com/user-attachments/assets/bb9381d0-8d99-4d8b-aaff-9846076fc7a3) -8. whisper 转录步骤会自动下载模型,但是对于命令行无法访问 huggingface 的用户,也可以手动下载 whisper 模型放置在根目录下:[百度网盘](https://pan.baidu.com/s/1Igo_FvFV4Xcb8tSYT0ktpA?pwd=e1c7) +8. 转录步骤会自动从 huggingface 下载模型,也可以手动下载,将 `_model_cache` 文件夹放置在 VideoLingo 目录下:[百度网盘](https://pan.baidu.com/s/1Igo_FvFV4Xcb8tSYT0ktpA?pwd=e1c7) -9. 更多设置可以在 `config.yaml` 中手动修改,运行过程请注意命令行输出 +9. (可选)更多设置可以在 `config.yaml` 中手动修改,运行过程请注意命令行输出 ## 🚨 常见报错 -1. **'Empty Translation Line'**: 这是由于选用了较笨的LLM,在翻译时把一些短语句直接省略了。解决方案:请换用Claude 3.5 Sonnet重试。 - -2. **翻译过程的 'Key Error'**: +1. **翻译过程的 'Key Error'**: - 原因1:同上,弱模型遵循JSON格式能力有误。 - 原因2:对于敏感内容,LLM可能拒绝翻译。 解决方案:请检查 `output/gpt_log/error.json` 的 `response` 和 `msg` 字段。 -3. **'Retry Failed', 'SSL', 'Connection', 'Timeout'**: 通常是网络问题。解决方案:中国大陆用户请切换网络节点重试。 \ No newline at end of file +2. 
**'Retry Failed', 'SSL', 'Connection', 'Timeout'**: 通常是网络问题。解决方案:中国大陆用户请切换网络节点重试。 \ No newline at end of file diff --git a/i18n/README.zh.md b/i18n/README.zh.md index 8c6ced37..5ba89748 100644 --- a/i18n/README.zh.md +++ b/i18n/README.zh.md @@ -14,12 +14,12 @@ ## 🌟 项目简介 -VideoLingo 是一站式视频翻译本地化配音工具,能够一键生成 Netflix 级别的高质量字幕,告别生硬机翻,告别多行字幕,还能加上高质量的配音,让全世界的知识能够跨越语言的障碍共享。 +VideoLingo 是一站式视频翻译本地化配音工具,能够一键生成 Netflix 级别的高质量字幕,告别生硬机翻,告别多行字幕,还能加上高质量的克隆配音,让全世界的知识能够跨越语言的障碍共享。 主要特点和功能: - 🎥 使用 yt-dlp 从 Youtube 链接下载视频 -- 🎙️ 使用 WhisperX 进行单词级时间轴字幕识别 +- **🎙️ 使用 WhisperX 进行单词级时间轴字幕识别** - **📝 使用 NLP 和 GPT 根据句意进行字幕分割** @@ -29,15 +29,13 @@ VideoLingo 是一站式视频翻译本地化配音工具,能够一键生成 Ne - **✅ 按照 Netflix 标准检查单行长度,绝无双行字幕** -- **🗣️ 使用 GPT-SoVITS 等方法对齐配音** +- **🗣️ 使用 FishTTS 等方法对齐克隆配音** - 🚀 整合包一键启动,在 streamlit 中一键出片 - 📝 详细记录每步操作日志,支持随时中断和恢复进度 -- 🌐 全面的多语言支持,轻松实现跨语言视频本地化 - -与同类项目的主要区别:**绝无多行字幕,最佳的翻译质量** +与同类项目相比的优势:**绝无多行字幕,最佳的翻译质量,无缝的配音体验** ## 🎥 效果演示 @@ -80,26 +78,26 @@ https://github.com/user-attachments/assets/85c64f8c-06cf-4af9-b153-ee9d2897b768 | 意大利语 | 🤩 | [意转中](https://github.com/user-attachments/assets/f1f893eb-dad3-4460-aaf6-10cac999195e) | | 西班牙语 | 🤩 | [西转中](https://github.com/user-attachments/assets/c1d28f1c-83d2-4f13-a1a1-859bd6cc3553) | | 日语 | 😐 | [日转中](https://github.com/user-attachments/assets/856c3398-2da3-4e25-9c36-27ca2d1f68c2) | -| 中文* | 🤩 | [中转英](https://github.com/user-attachments/assets/48f746fe-96ff-47fd-bd23-59e9202b495c) | -> *中文需单独配置whisperX模型,仅适用于本地源码安装,配置过程见安装文档,并注意在网页侧边栏指定转录语言为zh +| 中文* | 😊 | [中转英](https://github.com/user-attachments/assets/48f746fe-96ff-47fd-bd23-59e9202b495c) | +> *中文需单独配置标点增强后的 whisper 模型,详见安装文档。但效果一般,因为 faster-whisper 加速的 whisper 失去了原有的好的断句,且识别得到的中文没有标点符号,难以断句。同样问题出现在日语上。 -翻译语言支持大模型会的所有语言,配音语言取决于选取的TTS方法。 +翻译语言支持所有语言,配音语言取决于选取的TTS。 ## 🚀 快速开始 ### 在线体验 -商业版提供免费的 20min 额度,请访问 [videolingo.io](https://videolingo.io) +商业版(beta)提供免费的 20min 额度,请访问 [videolingo.io](https://videolingo.io) ### Colab 运行 -只需 5 分钟即可在 Colab 中快速体验 VideoLingo: +只需 5 分钟的安装即可在 Colab 中快速体验 VideoLingo: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Huanshere/VideoLingo/blob/main/VideoLingo_colab.ipynb) ### 本地安装 -VideoLingo 支持所有硬件平台和操作系统,但在 GPU 加速下性能最佳。详细安装说明请参考文档:[English](/docs/pages/docs/start.en-US.md) | [简体中文](/docs/pages/docs/start.zh-CN.md) +VideoLingo 支持所有硬件平台和操作系统,但在 GPU 加速下性能最佳。文档:[English](/docs/pages/docs/start.en-US.md) | [简体中文](/docs/pages/docs/start.zh-CN.md) ### 使用Docker @@ -118,30 +116,23 @@ docker run -d -p 8501:8501 --gpus all videolingo 使用说明: [English](/batch/README.md) | [简体中文](/batch/README.zh.md) ## ⚠️ 当前限制 -1. 不同设备运行 whisperX 效果不同,v1.7 会先进行 demucs 人声分离,但可能会导致分离后转录效果不如分离前,原因是 whisper 本身是在带 bgm 的环境下训练的,分离前不会转录bgm的歌词,但是分离后可能会转录歌词。 - -2. **配音功能的质量可能不完美**,仍处于测试开发阶段,正在尝试接入 MascGCT。目前为获得最佳效果,建议根据原视频的语速和内容特点,选择相近语速的 TTS,效果见 [demo](https://www.bilibili.com/video/BV1mt1QYyERR/?share_source=copy_web&vd_source=fa92558c28cd668d33dabaddb17e2f9e)。 - -3. **多语言视频转录识别仅仅只会保留主要语言**,这是由于 whisperX 在强制对齐单词级字幕时使用的是针对单个语言的特化模型,会因为不认识另一种语言而删去。 +1. WhisperX 转录效果可能受到视频背景声影响,因为使用了 wav2vac 模型进行对齐,但尽管如此,WhisperX 已经能在 99% 情况下解决 Whisper 本身的幻觉问题。 -3. **多角色分别配音正在开发**,whisperX 具有 VAD 的潜力,但是具体需要一些施工,暂时没有支持此功能。 +2. 配音功能由于不同语言的语速和语调差异,还受到前置处理字幕的影响,可能不能 100% 完美,但本项目做了非常多的语速上的工程处理,尽可能保证配音效果。 -## 🚗 路线图 +3. **多语言视频转录识别仅仅只会保留主要语言**,这是由于 whisperX 在强制对齐单词级字幕时使用的是针对单个语言的特化模型,会因为不认识另一种语言而删去。有些商用api可以进行机器自动转换,但实测效果非常一般,因此这个问题目前只能依靠人为切段处理。 -- [x] SaaS 版本 at [videolingo.io](https://videolingo.io) -- [ ] VAD 区分说话人,多角色配音 -- [ ] 用户术语表 -- [ ] 配音视频唇形同步 +4. 
**多角色分别配音仍在开发**,whisperX 具有 VAD 的潜力(尽管官方承认效果一般),但是具体需要一些施工,暂时没有支持此功能。 ## 📄 许可证 -本项目采用 Apache 2.0 许可证,我们衷心感谢以下开源项目的贡献: +本项目采用 Apache 2.0 许可证,衷心感谢以下开源项目的贡献: -[whisperX](https://github.com/m-bain/whisperX) | [yt-dlp](https://github.com/yt-dlp/yt-dlp) | [json_repair](https://github.com/mangiucugna/json_repair) | [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) | [BELLE](https://github.com/LianjiaTech/BELLE) +[whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE) ## 📬 联系我们 -- 加入我们的 QQ 群:875297969 +- 加入我们的 QQ 群寻求解答:875297969 - 在 GitHub 上提交 [Issues](https://github.com/Huanshere/VideoLingo/issues) 或 [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) - 关注我的 Twitter:[@Huanshere](https://twitter.com/Huanshere) - 联系邮箱:team@videolingo.io diff --git "a/i18n/\344\270\255\346\226\207/config.yaml" "b/i18n/\344\270\255\346\226\207/config.yaml" index 590fdbc2..d05cfea8 100644 --- "a/i18n/\344\270\255\346\226\207/config.yaml" +++ "b/i18n/\344\270\255\346\226\207/config.yaml" @@ -1,30 +1,31 @@ # * 标有 * 的设置是高级设置,不会出现在 Streamlit 页面中,只能在 config.py 中手动修改 +version: "2.0.0" ## ======================== 基本设置 ======================== ## # API 设置 api: - key: 'YOUR_KEY' - base_url: 'https://yunwu.zeabur.app' - model: 'gemini-1.5-pro-002' + key: 'YOUR_API_KEY' + base_url: 'https://api.siliconflow.cn' + model: 'Qwen/Qwen2.5-72B-Instruct' # 语言设置,写入提示词,可以用自然语言描述 -target_language: 'Chinese' +target_language: '简体中文' # 是否在转录前进行人声分离,警告这可能会减慢过程并导致行缺失! demucs: false whisper: - # Whisper 设置 [whisperx, whisperxapi] - method: 'whisperx' - # Whisper 指定识别语言 [en, zh, auto] auto 为自动检测,en 为强制翻译为英语 + # ["medium", "large-v3", "large-v3-turbo"]. 注意:对于中文模型将强制使用 Belle/large-v3 + model: 'large-v3' + # Whisper 指定识别语言 [en, zh, ...] 
language: 'en' detected_language: 'en' # 视频分辨率 [0x0, 640x360, 1920x1080] 0x0 会生成一个 0 秒的黑色视频占位符 -resolution: '640x360' +resolution: '1920x1080' ## ======================== 高级设置 ======================== ## # *下载 YouTube 视频的默认分辨率 [360, 1080, best] -ytb_resolution: '360' +ytb_resolution: '1080' subtitle: # *每行字幕的最大字符长度 @@ -41,8 +42,20 @@ max_split_length: 20 pause_before_translate: false ## ======================== 配音设置 ======================== ## -# TTS 选择 [openai_tts, gpt_sovits, azure_tts, fish_tts] -tts_method: 'azure_tts' +# TTS 选择 [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts] +tts_method: 'sf_fish_tts' + +# SiliconFlow FishTTS +sf_fish_tts: + # SiliconFlow API key + api_key: 'YOUR_API_KEY' + # 仅用于 "preset" 模式 + voice: 'anna' + # *仅用于 "custom" 模式,不要手动设置 + custom_name: '' + voice_id: '' + # preset, custom, dynamic + mode: "preset" # OpenAI TTS-1 API 配置 openai_tts: @@ -72,15 +85,21 @@ fish_tts: # *音频速度范围 speed_factor: min: 1 + accept: 1.2 # 可以接受的最大速度 max: 1.4 - normal: 1.2 # *被认为是正常语速 # *合并音频配置 -min_subtitle_duration: 3 -min_trim_duration: 2.50 +min_subtitle_duration: 2.5 # 最小字幕出现时间 会强制扩展 +min_trim_duration: 3.5 # 小于这个值的字幕不会切割 +tolerance: 1.5 # 允许向后延申的时间 # 音量设置 -dub_volume: 1.3 # *配音音频音量(1.3 = 130%,大多数原始配音音频相对较安静) +dub_volume: 1.5 # *配音音频音量(1.5 = 150%,大多数原始配音音频相对较安静) + + + + + ## ======================== 附加设置 请勿修改 ======================== ## # Whisper 模型目录 @@ -96,7 +115,6 @@ allowed_video_formats: - 'wmv' - 'webm' -# 支持的音频格式 allowed_audio_formats: - 'wav' - 'mp3' @@ -112,6 +130,11 @@ llm_support_json: - 'gemini-1.5-pro-latest' - 'gemini-1.5-pro-002' +# 存在问题 +# - 'Qwen/Qwen2.5-72B-Instruct' +# - 'Qwen/Qwen2.5-Coder-32B-Instruct' +# - 'Qwen/Qwen2.5-Chat-72B-Instruct-128K' + # Spacy 模型 spacy_model_map: en: 'en_core_web_md' diff --git "a/i18n/\344\270\255\346\226\207/core/pypi_autochoose.py" "b/i18n/\344\270\255\346\226\207/core/pypi_autochoose.py" deleted file mode 100644 index 79d873ca..00000000 --- "a/i18n/\344\270\255\346\226\207/core/pypi_autochoose.py" +++ /dev/null @@ -1,140 +0,0 @@ -import subprocess -import time -import requests -import os -import concurrent.futures -from rich.console import Console -from rich.table import Table -from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn - -MIRRORS = { - "Alibaba Cloud": "https://mirrors.aliyun.com/pypi/simple", - "Tsinghua University": "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple", - "Huawei Cloud": "https://repo.huaweicloud.com/repository/pypi/simple", - "Tencent Cloud": "https://mirrors.cloud.tencent.com/pypi/simple", - "163 Cloud": "https://mirrors.163.com/pypi/simple", - "PyPI Official": "https://pypi.org/simple" -} - -console = Console() - -FAST_THRESHOLD = 1000 # ms -SLOW_THRESHOLD = 1500 # ms - -def get_optimal_thread_count(): - try: - cpu_count = os.cpu_count() - return max(cpu_count - 1, 1) - except: - return 2 - -def test_mirror_speed(name, url): - try: - start_time = time.time() - response = requests.get(url, timeout=5) - end_time = time.time() - if response.status_code == 200: - speed = (end_time - start_time) * 1000 - return name, speed - else: - return name, float('inf') - except requests.RequestException: - return name, float('inf') - -def set_pip_mirror(url, host): - try: - subprocess.run(["pip", "config", "set", "global.index-url", url], check=True, capture_output=True) - subprocess.run(["pip", "config", "set", "install.trusted-host", host], check=True, capture_output=True) - return True - except subprocess.CalledProcessError: - return False - -def get_current_pip_mirror(): - try: - 
result = subprocess.run(["pip", "config", "get", "global.index-url"], capture_output=True, text=True, check=True) - return result.stdout.strip() - except subprocess.CalledProcessError: - return None - -def main(): - console.print("[yellow]开始新的镜像速度测试[/yellow]") - - # First test PyPI official mirror - pypi_name = next(name for name, url in MIRRORS.items() if "pypi.org" in url) - pypi_url = MIRRORS[pypi_name] - console.print("[cyan]测试PyPI官方镜像...[/cyan]") - - optimal_thread_count = get_optimal_thread_count() - console.print(f"使用 {optimal_thread_count} 个线程进行测试") - - _, pypi_speed = test_mirror_speed(pypi_name, pypi_url) - - if pypi_speed < FAST_THRESHOLD: - console.print(f"PyPI官方镜像速度很快 ({pypi_speed:.2f} ms)。使用官方镜像。") - set_pip_mirror(pypi_url, "pypi.org") - return - elif pypi_speed < SLOW_THRESHOLD: - console.print(f"PyPI官方镜像速度可以接受 ({pypi_speed:.2f} ms)。您可以继续使用它。") - return - - console.print(f"PyPI官方镜像速度较慢 ({pypi_speed:.2f} ms)。测试其他镜像...") - - # Test other mirrors - speeds = {} - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - ) as progress: - task = progress.add_task("[cyan]测试镜像...", total=len(MIRRORS) - 1) # -1 because we already tested PyPI - - with concurrent.futures.ThreadPoolExecutor(max_workers=optimal_thread_count) as executor: - future_to_mirror = {executor.submit(test_mirror_speed, name, url): name for name, url in MIRRORS.items() if name != pypi_name} - for future in concurrent.futures.as_completed(future_to_mirror): - name = future_to_mirror[future] - try: - name, speed = future.result() - if speed != float('inf'): - speeds[name] = speed - except Exception as exc: - print(f'{name} 生成了一个异常: {exc}') - finally: - progress.update(task, advance=1) - - table = Table(title="镜像速度测试结果") - table.add_column("镜像", style="cyan") - table.add_column("响应时间 (ms)", justify="right", style="magenta") - - for name, speed in sorted(speeds.items(), key=lambda x: x[1]): - table.add_row(name, f"{speed:.2f}") - - console.print(table) - - if speeds: - fastest_mirror = min(speeds, key=speeds.get) - fastest_url = MIRRORS[fastest_mirror] - console.print(f"\n[green]最快的镜像: {fastest_mirror} ({fastest_url})[/green]") - console.print(f"[green]响应时间: {speeds[fastest_mirror]:.2f} ms[/green]") - - host = fastest_url.split("//")[1].split("/")[0] - if set_pip_mirror(fastest_url, host): - current_mirror = get_current_pip_mirror() - console.print(f"\n[yellow]当前pip源: {current_mirror}[/yellow]") - - if current_mirror == fastest_url: - console.print(f"[bold green]成功切换到 {fastest_mirror} 镜像。[/bold green]") - else: - console.print("[bold red]切换失败。当前pip源与预期不符。[/bold red]") - console.print(f"[yellow]预期的pip源: {fastest_url}[/yellow]") - console.print("[yellow]请手动检查配置或尝试以管理员权限运行此脚本。[/yellow]") - else: - console.print("[bold red]切换镜像失败,将继续使用当前源。[/bold red]") - current_mirror = get_current_pip_mirror() - console.print(f"[yellow]当前pip源: {current_mirror}[/yellow]") - console.print("[yellow]请检查是否有足够的权限修改pip配置。[/yellow]") - else: - console.print("[bold red]所有镜像都无法访问。请检查您的网络连接。[/bold red]") - -if __name__ == "__main__": - main() diff --git "a/i18n/\344\270\255\346\226\207/install.py" "b/i18n/\344\270\255\346\226\207/install.py" index 1db2ec3b..91d90f06 100644 --- "a/i18n/\344\270\255\346\226\207/install.py" +++ "b/i18n/\344\270\255\346\226\207/install.py" @@ -4,47 +4,62 @@ import sys import zipfile import shutil - sys.path.append(os.path.dirname(os.path.abspath(__file__))) +ascii_logo = """ +__ ___ _ _ _ +\ \ / (_) __| | 
___ ___ | | (_)_ __ __ _ ___ + \ \ / /| |/ _` |/ _ \/ _ \| | | | '_ \ / _` |/ _ \ + \ V / | | (_| | __/ (_) | |___| | | | | (_| | (_) | + \_/ |_|\__,_|\___|\___/|_____|_|_| |_|\__, |\___/ + |___/ +""" + def install_package(*packages): subprocess.check_call([sys.executable, "-m", "pip", "install", *packages]) -install_package("requests", "rich", "ruamel.yaml") -from pypi_autochoose import main as choose_mirror - def check_gpu(): - """检查是否有 NVIDIA GPU 可用""" try: - # 🔍 尝试运行 nvidia-smi 命令来检测 GPU subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) return True except (subprocess.CalledProcessError, FileNotFoundError): return False def main(): + install_package("requests", "rich", "ruamel.yaml") from rich.console import Console from rich.panel import Panel - + from rich.box import DOUBLE console = Console() + + width = max(len(line) for line in ascii_logo.splitlines()) + 4 + welcome_panel = Panel( + ascii_logo, + width=width, + box=DOUBLE, + title="[bold green]🌏[/bold green]", + border_style="bright_blue" + ) + console.print(welcome_panel) + console.print(Panel.fit("🚀 开始安装", style="bold magenta")) # 配置镜像源 - console.print(Panel("⚙️ 正在配置镜像源", style="bold yellow")) + from core.pypi_autochoose import main as choose_mirror choose_mirror() - # 检测系统和 GPU + # 检测系统和GPU if platform.system() == 'Darwin': - console.print(Panel("🍎 检测到 MacOS,正在安装 CPU 版本的 PyTorch... 但速度会慢很多", style="cyan")) - subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "torchaudio"]) + console.print(Panel("🍎 检测到 MacOS,正在安装 CPU 版本的 PyTorch... 但转写速度会慢很多", style="cyan")) + subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==2.1.2", "torchaudio==2.1.2"]) else: has_gpu = check_gpu() if has_gpu: console.print(Panel("🎮 检测到 NVIDIA GPU,正在安装 CUDA 版本的 PyTorch...", style="cyan")) subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==2.0.0", "torchaudio==2.0.0", "--index-url", "https://download.pytorch.org/whl/cu118"]) else: - console.print(Panel("💻 未检测到 NVIDIA GPU,正在安装 CPU 版本的 PyTorch... 但速度会慢很多", style="cyan")) - subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "torchaudio"]) + console.print(Panel("💻 未检测到 NVIDIA GPU,正在安装 CPU 版本的 PyTorch... 
但转写速度会慢很多", style="cyan")) + subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==2.1.2", "torchaudio==2.1.2"]) # 安装 WhisperX console.print(Panel("📦 正在安装 WhisperX...", style="cyan")) @@ -65,6 +80,10 @@ def install_requirements(): subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]) def download_and_extract_ffmpeg(): + # 需要同时安装 conda-ffmpeg 和 ffmpeg.exe + console.print(Panel("📦 正在通过 conda 安装 ffmpeg...", style="cyan")) + subprocess.check_call(["conda", "install", "-y", "ffmpeg"]) + import requests system = platform.system() if system == "Windows": @@ -83,15 +102,15 @@ def download_and_extract_ffmpeg(): print(f"{ffmpeg_exe} 已存在") return - print("正在下载 FFmpeg") + console.print(Panel("📦 正在下载 FFmpeg...", style="cyan")) response = requests.get(url) if response.status_code == 200: filename = "ffmpeg.zip" if system in ["Windows", "Darwin"] else "ffmpeg.tar.xz" with open(filename, 'wb') as f: f.write(response.content) - print(f"FFmpeg 下载完成: {filename}") + console.print(Panel(f"FFmpeg 下载完成: {filename}", style="cyan")) - print("正在解压 FFmpeg") + console.print(Panel("📦 正在解压 FFmpeg...", style="cyan")) if system == "Linux": import tarfile with tarfile.open(filename) as tar_ref: @@ -106,15 +125,15 @@ def download_and_extract_ffmpeg(): zip_ref.extract(file) shutil.move(os.path.join(*file.split('/')[:-1], os.path.basename(file)), os.path.basename(file)) - print("正在清理") + console.print(Panel("📦 正在清理...", style="cyan")) os.remove(filename) if system == "Windows": for item in os.listdir(): if os.path.isdir(item) and "ffmpeg" in item.lower(): shutil.rmtree(item) - print("FFmpeg 解压完成") + console.print(Panel("FFmpeg 解压完成", style="cyan")) else: - print("FFmpeg 下载失败") + console.print(Panel("❌ FFmpeg 下载失败", style="red")) def install_noto_font(): if platform.system() == 'Linux': diff --git "a/i18n/\344\270\255\346\226\207/st.py" "b/i18n/\344\270\255\346\226\207/st.py" index 2b3bb513..0172e680 100644 --- "a/i18n/\344\270\255\346\226\207/st.py" +++ "b/i18n/\344\270\255\346\226\207/st.py" @@ -10,6 +10,9 @@ st.set_page_config(page_title="VideoLingo", page_icon="docs/logo.svg") +SUB_VIDEO = "output/output_sub.mp4" +DUB_VIDEO = "output/output_dub.mp4" + def text_processing_section(): st.header("翻译和生成字幕") with st.container(border=True): @@ -25,16 +28,16 @@ def text_processing_section(): 6. 将字幕合并到视频中 """, unsafe_allow_html=True) - if not os.path.exists("output/output_video_with_subs.mp4"): + if not os.path.exists(SUB_VIDEO): if st.button("开始处理字幕", key="text_processing_button"): process_text() st.rerun() else: if load_key("resolution") != "0x0": - st.video("output/output_video_with_subs.mp4") + st.video(SUB_VIDEO) download_subtitle_zip_button(text="下载所有字幕") - if st.button("归档到'history'", key="cleanup_in_text_processing"): + if st.button("归档到'历史记录'", key="cleanup_in_text_processing"): cleanup() st.rerun() return True @@ -60,24 +63,25 @@ def process_text(): st.balloons() def audio_processing_section(): - st.header("配音(测试版)") + st.header("配音") with st.container(border=True): st.markdown("""

此阶段包含以下步骤:

- 1. 生成音频任务
- 2. 生成音频
- 3. 将音频合并到视频中
+ 1. 生成音频任务和分段
+ 2. 提取参考音频
+ 3. 生成和合并音频文件
+ 4. 将最终音频合并到视频中 """, unsafe_allow_html=True) - if not os.path.exists("output/output_video_with_audio.mp4"): + if not os.path.exists(DUB_VIDEO): if st.button("开始处理音频", key="audio_processing_button"): process_audio() st.rerun() else: st.success("音频处理完成!你可以在 `output` 文件夹中查看音频文件。") if load_key("resolution") != "0x0": - st.video("output/output_video_with_audio.mp4") + st.video(DUB_VIDEO) if st.button("删除配音文件", key="delete_dubbing_files"): delete_dubbing_files() st.rerun() @@ -87,13 +91,16 @@ def audio_processing_section(): def process_audio(): with st.spinner("生成音频任务中"): - step8_gen_audio_task.gen_audio_task_main() + step8_1_gen_audio_task.gen_audio_task_main() + step8_2_gen_dub_chunks.gen_dub_chunks() with st.spinner("提取参考音频中"): step9_extract_refer_audio.extract_refer_audio_main() - with st.spinner("生成音频中"): - step10_gen_audio.process_sovits_tasks() - with st.spinner("将音频合并到视频中"): - step11_merge_audio_to_vid.merge_main() + with st.spinner("生成所有音频中"): + step10_gen_audio.gen_audio() + with st.spinner("合并完整音频中"): + step11_merge_full_audio.merge_full_audio() + with st.spinner("将配音合并到视频中"): + step12_merge_dub_to_vid.merge_video_audio() st.success("音频处理完成!🎇") st.balloons() diff --git "a/i18n/\344\270\255\346\226\207/st_components/sidebar_setting.py" "b/i18n/\344\270\255\346\226\207/st_components/sidebar_setting.py" index 9b743bba..0f64c786 100644 --- "a/i18n/\344\270\255\346\226\207/st_components/sidebar_setting.py" +++ "b/i18n/\344\270\255\346\226\207/st_components/sidebar_setting.py" @@ -4,56 +4,54 @@ import streamlit as st from core.config_utils import update_key, load_key -def config_text_input(label, key, help=None): - """通用配置文本输入处理器""" - value = st.text_input(label, value=load_key(key), help=help) - if value != load_key(key): - update_key(key, value) - return value +def config_input(label, key, help=None): + """Generic config input handler""" + val = st.text_input(label, value=load_key(key), help=help) + if val != load_key(key): + update_key(key, val) + return val def page_setting(): with st.expander("LLM 配置", expanded=True): - config_text_input("API_KEY", "api.key") - config_text_input("BASE_URL", "api.base_url", help="API请求的基础URL") + config_input("API_KEY", "api.key") + config_input("BASE_URL", "api.base_url", help="API请求的基础URL") - col1, col2 = st.columns([4, 1]) - with col1: - config_text_input("模型", "api.model") - with col2: + c1, c2 = st.columns([4, 1]) + with c1: + config_input("模型", "api.model") + with c2: if st.button("📡", key="api"): - if valid_llm_api(): - st.toast("API 密钥有效", icon="✅") - else: - st.toast("API 密钥无效", icon="❌") + st.toast("API密钥有效" if check_api() else "API密钥无效", + icon="✅" if check_api() else "❌") with st.expander("转写和字幕设置", expanded=True): - col1, col2 = st.columns(2) - with col1: - whisper_language_options_dict = { - "🇺🇸 English": "en", - "🇨🇳 简体中文": "zh", - "🇪🇸 Español": "es", - "🇷🇺 Русский": "ru", - "🇫🇷 Français": "fr", - "🇩🇪 Deutsch": "de", - "🇮🇹 Italiano": "it", - "🇯🇵 日本語": "ja" + c1, c2 = st.columns(2) + with c1: + langs = { + "🇺🇸 English": "en", + "🇨🇳 简体中文": "zh", + "🇪🇸 Español": "es", + "🇷🇺 Русский": "ru", + "🇫🇷 Français": "fr", + "🇩🇪 Deutsch": "de", + "🇮🇹 Italiano": "it", + "🇯🇵 日本語": "ja" } - selected_whisper_language = st.selectbox( + lang = st.selectbox( "识别语言:", - options=list(whisper_language_options_dict.keys()), - index=list(whisper_language_options_dict.values()).index(load_key("whisper.language")) + options=list(langs.keys()), + index=list(langs.values()).index(load_key("whisper.language")) ) - if 
whisper_language_options_dict[selected_whisper_language] != load_key("whisper.language"): - update_key("whisper.language", whisper_language_options_dict[selected_whisper_language]) + if langs[lang] != load_key("whisper.language"): + update_key("whisper.language", langs[lang]) - with col2: + with c2: target_language = st.text_input("目标语言", value=load_key("target_language")) if target_language != load_key("target_language"): update_key("target_language", target_language) - col1, col2 = st.columns(2) - with col1: + c1, c2 = st.columns(2) + with c1: burn_subtitles = st.toggle("烧录字幕", value=load_key("resolution") != "0x0") resolution_options = { @@ -61,7 +59,7 @@ def page_setting(): "360p": "640x360" } - with col2: + with c2: if burn_subtitles: selected_resolution = st.selectbox( "视频分辨率", @@ -74,32 +72,53 @@ def page_setting(): if resolution != load_key("resolution"): update_key("resolution", resolution) - - with st.expander("配音设置", expanded=False): - tts_methods = ["openai_tts", "azure_tts", "gpt_sovits", "fish_tts"] - selected_tts_method = st.selectbox("TTS 方法", options=tts_methods, index=tts_methods.index(load_key("tts_method"))) + + with st.expander("配音设置", expanded=True): + tts_methods = ["sf_fish_tts", "openai_tts", "azure_tts", "gpt_sovits", "fish_tts"] + selected_tts_method = st.selectbox("TTS方法", options=tts_methods, index=tts_methods.index(load_key("tts_method"))) if selected_tts_method != load_key("tts_method"): update_key("tts_method", selected_tts_method) - if selected_tts_method == "openai_tts": - config_text_input("OpenAI 语音", "openai_tts.voice") - config_text_input("OpenAI TTS API 密钥", "openai_tts.api_key") - config_text_input("OpenAI TTS API 基础 URL", "openai_tts.base_url") + if selected_tts_method == "sf_fish_tts": + config_input("SiliconFlow API密钥", "sf_fish_tts.api_key") + + # Add mode selection dropdown + mode_options = { + "preset": "preset", + "custom": "clone(stable)", + "dynamic": "clone(dynamic)" + } + selected_mode = st.selectbox( + "模式选择", + options=list(mode_options.keys()), + format_func=lambda x: mode_options[x], + index=list(mode_options.keys()).index(load_key("sf_fish_tts.mode")) if load_key("sf_fish_tts.mode") in mode_options.keys() else 0 + ) + if selected_mode != load_key("sf_fish_tts.mode"): + update_key("sf_fish_tts.mode", selected_mode) + + if selected_mode == "preset": + config_input("语音", "sf_fish_tts.voice") + + elif selected_tts_method == "openai_tts": + config_input("OpenAI语音", "openai_tts.voice") + config_input("OpenAI TTS API密钥", "openai_tts.api_key") + config_input("OpenAI TTS API基础URL", "openai_tts.base_url") elif selected_tts_method == "fish_tts": - config_text_input("Fish TTS API 密钥", "fish_tts.api_key") - fish_tts_character = st.selectbox("Fish TTS 角色", options=list(load_key("fish_tts.character_id_dict").keys()), index=list(load_key("fish_tts.character_id_dict").keys()).index(load_key("fish_tts.character"))) + config_input("Fish TTS API密钥", "fish_tts.api_key") + fish_tts_character = st.selectbox("Fish TTS角色", options=list(load_key("fish_tts.character_id_dict").keys()), index=list(load_key("fish_tts.character_id_dict").keys()).index(load_key("fish_tts.character"))) if fish_tts_character != load_key("fish_tts.character"): update_key("fish_tts.character", fish_tts_character) elif selected_tts_method == "azure_tts": - config_text_input("Azure 密钥", "azure_tts.key") - config_text_input("Azure 区域", "azure_tts.region") - config_text_input("Azure 语音", "azure_tts.voice") + config_input("Azure密钥", "azure_tts.key") + config_input("Azure区域", 
"azure_tts.region") + config_input("Azure语音", "azure_tts.voice") elif selected_tts_method == "gpt_sovits": - st.info("配置 GPT_SoVITS,请参考 Github 主页") - config_text_input("SoVITS 角色", "gpt_sovits.character") + st.info("配置GPT_SoVITS,请参考Github主页") + config_input("SoVITS角色", "gpt_sovits.character") refer_mode_options = {1: "模式1:仅用提供的参考音频", 2: "模式2:仅用视频第1条语音做参考", 3: "模式3:使用视频每一条语音做参考"} selected_refer_mode = st.selectbox( @@ -112,9 +131,10 @@ def page_setting(): if selected_refer_mode != load_key("gpt_sovits.refer_mode"): update_key("gpt_sovits.refer_mode", selected_refer_mode) -def valid_llm_api(): +def check_api(): try: - response = ask_gpt("This is a test, response 'message':'success' in json format.", response_json=True, log_title='None') - return response.get('message') == 'success' + resp = ask_gpt("This is a test, response 'message':'success' in json format.", + response_json=True, log_title='None') + return resp.get('message') == 'success' except Exception: return False diff --git "a/i18n/\344\270\255\346\226\207/\344\270\200\351\224\256\345\220\257\345\212\250.bat" "b/i18n/\344\270\255\346\226\207/\344\270\200\351\224\256\345\220\257\345\212\250.bat" deleted file mode 100644 index 8fd86efa..00000000 --- "a/i18n/\344\270\255\346\226\207/\344\270\200\351\224\256\345\220\257\345\212\250.bat" +++ /dev/null @@ -1,13 +0,0 @@ -@echo off -cd /d %~dp0 -if exist runtime ( - echo 使用 runtime 文件夹... - runtime\python.exe -m streamlit run st.py -) else ( - echo 未找到 runtime 文件夹,使用 conda 环境,若启动失败说明 conda 不在系统环境中... - call activate videolingo - python -m streamlit run st.py - call deactivate -) - -pause diff --git a/install.py b/install.py index 90dc6a2b..5bb85a7d 100644 --- a/install.py +++ b/install.py @@ -4,33 +4,48 @@ import sys import zipfile import shutil - sys.path.append(os.path.dirname(os.path.abspath(__file__))) +ascii_logo = """ +__ ___ _ _ _ +\ \ / (_) __| | ___ ___ | | (_)_ __ __ _ ___ + \ \ / /| |/ _` |/ _ \/ _ \| | | | '_ \ / _` |/ _ \ + \ V / | | (_| | __/ (_) | |___| | | | | (_| | (_) | + \_/ |_|\__,_|\___|\___/|_____|_|_| |_|\__, |\___/ + |___/ +""" + def install_package(*packages): subprocess.check_call([sys.executable, "-m", "pip", "install", *packages]) -install_package("requests", "rich", "ruamel.yaml") -from pypi_autochoose import main as choose_mirror - def check_gpu(): - """Check if NVIDIA GPU is available""" try: - # 🔍 Try running nvidia-smi command to detect GPU subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) return True except (subprocess.CalledProcessError, FileNotFoundError): return False def main(): + install_package("requests", "rich", "ruamel.yaml") from rich.console import Console from rich.panel import Panel - + from rich.box import DOUBLE console = Console() + + width = max(len(line) for line in ascii_logo.splitlines()) + 4 + welcome_panel = Panel( + ascii_logo, + width=width, + box=DOUBLE, + title="[bold green]🌏[/bold green]", + border_style="bright_blue" + ) + console.print(welcome_panel) + console.print(Panel.fit("🚀 Starting Installation", style="bold magenta")) # Configure mirrors - console.print(Panel("⚙️ Configuring mirrors", style="bold yellow")) + from core.pypi_autochoose import main as choose_mirror choose_mirror() # Detect system and GPU @@ -65,6 +80,10 @@ def install_requirements(): subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]) def download_and_extract_ffmpeg(): + # requires both conda-ffmpeg and ffmpeg.exe + console.print(Panel("📦 Installing ffmpeg through conda...", 
style="cyan")) + subprocess.check_call(["conda", "install", "-y", "ffmpeg"]) + import requests system = platform.system() if system == "Windows": @@ -83,15 +102,15 @@ def download_and_extract_ffmpeg(): print(f"{ffmpeg_exe} already exists") return - print("Downloading FFmpeg") + console.print(Panel("📦 Downloading FFmpeg...", style="cyan")) response = requests.get(url) if response.status_code == 200: filename = "ffmpeg.zip" if system in ["Windows", "Darwin"] else "ffmpeg.tar.xz" with open(filename, 'wb') as f: f.write(response.content) - print(f"FFmpeg downloaded: {filename}") + console.print(Panel(f"FFmpeg downloaded: {filename}", style="cyan")) - print("Extracting FFmpeg") + console.print(Panel("📦 Extracting FFmpeg...", style="cyan")) if system == "Linux": import tarfile with tarfile.open(filename) as tar_ref: @@ -106,15 +125,15 @@ def download_and_extract_ffmpeg(): zip_ref.extract(file) shutil.move(os.path.join(*file.split('/')[:-1], os.path.basename(file)), os.path.basename(file)) - print("Cleaning up") + console.print(Panel("📦 Cleaning up...", style="cyan")) os.remove(filename) if system == "Windows": for item in os.listdir(): if os.path.isdir(item) and "ffmpeg" in item.lower(): shutil.rmtree(item) - print("FFmpeg extraction completed") + console.print(Panel("FFmpeg extraction completed", style="cyan")) else: - print("Failed to download FFmpeg") + console.print(Panel("❌ Failed to download FFmpeg", style="red")) def install_noto_font(): if platform.system() == 'Linux': diff --git a/pip_setup.py b/pip_setup.py new file mode 100644 index 00000000..42d5d06e --- /dev/null +++ b/pip_setup.py @@ -0,0 +1,70 @@ +import os +import subprocess +import sys + +script_dir = os.getcwd() + +def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, env=None): + # Use the conda environment + if environment: + conda_env_path = os.path.join(script_dir, "installer_files", "env") + if sys.platform.startswith("win"): + conda_bat_path = os.path.join(script_dir, "installer_files", "conda", "condabin", "conda.bat") + cmd = "\"" + conda_bat_path + "\" activate \"" + conda_env_path + "\" >nul && " + cmd + else: + conda_sh_path = os.path.join(script_dir, "installer_files", "conda", "etc", "profile.d", "conda.sh") + cmd = ". \"" + conda_sh_path + "\" && conda activate \"" + conda_env_path + "\" && " + cmd + + # Run shell commands + result = subprocess.run(cmd, shell=True, capture_output=capture_output, env=env) + + # Assert the command ran successfully + if assert_success and result.returncode != 0: + print("Command '" + cmd + "' failed with exit status code '" + str(result.returncode) + "'. Exiting...") + sys.exit() + return result + +def check_env(): + # If we have access to conda, we are probably in an environment + conda_exist = run_cmd("conda", environment=True, capture_output=True).returncode == 0 + if not conda_exist: + print("Conda is not installed. Exiting...") + sys.exit() + + # Ensure this is a new environment and not the base environment + if os.environ["CONDA_DEFAULT_ENV"] == "base": + print("Create an environment for this project and activate it. 
Exiting...") + sys.exit() + +def check_gpu_win(): + if not sys.platform.startswith('win'): + return + + CUDNN_PATH = "C:\\Program Files\\NVIDIA\\CUDNN\\v9.3\\bin\\12.6" + + def check_gpu(): + try: + subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) + return True + except (subprocess.CalledProcessError, FileNotFoundError): + return False + + if check_gpu(): + if CUDNN_PATH not in os.environ.get('PATH', ''): + print("🚨 Warning: CUDNN path not found in system environment!") + print(f"⚡ Please add the following path to system PATH:\n{CUDNN_PATH}") + sys.exit(1) + else: + print("✅ CUDNN found in system PATH - All good!") + +def install_dependencies(): + run_cmd("python install.py", assert_success=True, environment=True) + +def run_model(): + run_cmd(f"python -m streamlit run st.py", environment=True) + +if __name__ == "__main__": + check_env() + install_dependencies() + check_gpu_win() + run_model() diff --git a/pypi_autochoose.py b/pypi_autochoose.py deleted file mode 100644 index 0623508b..00000000 --- a/pypi_autochoose.py +++ /dev/null @@ -1,140 +0,0 @@ -import subprocess -import time -import requests -import os -import concurrent.futures -from rich.console import Console -from rich.table import Table -from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn -import sys - -MIRRORS = { - "Tsinghua University": "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple", - "PyPI Official": "https://pypi.org/simple" -} - -console = Console() - -FAST_THRESHOLD = 1000 # ms -SLOW_THRESHOLD = 1500 # ms - -def get_optimal_thread_count(): - try: - cpu_count = os.cpu_count() - return max(cpu_count - 1, 1) - except: - return 2 - -def test_mirror_speed(name, url): - try: - start_time = time.time() - response = requests.get(url, timeout=5) - end_time = time.time() - if response.status_code == 200: - speed = (end_time - start_time) * 1000 - return name, speed - else: - return name, float('inf') - except requests.RequestException: - return name, float('inf') - -def set_pip_mirror(url): - try: - subprocess.run([sys.executable, "-m", "pip", "config", "set", "global.index-url", url], - check=True, - capture_output=True) - return True - except subprocess.CalledProcessError as e: - print(f"Failed to set pip mirror: {e}") - return False - -def get_current_pip_mirror(): - try: - result = subprocess.run([sys.executable, "-m", "pip", "config", "get", "global.index-url"], - capture_output=True, text=True, check=True) - return result.stdout.strip() - except subprocess.CalledProcessError: - return None - -def main(): - console.print("[yellow]Starting new mirror speed test[/yellow]") - - # First test PyPI official mirror - pypi_name = next(name for name, url in MIRRORS.items() if "pypi.org" in url) - pypi_url = MIRRORS[pypi_name] - console.print("[cyan]Testing PyPI official mirror...[/cyan]") - - optimal_thread_count = get_optimal_thread_count() - console.print(f"Using {optimal_thread_count} threads for testing") - - _, pypi_speed = test_mirror_speed(pypi_name, pypi_url) - - if pypi_speed < FAST_THRESHOLD: - console.print(f"PyPI official mirror is fast ({pypi_speed:.2f} ms). Using the official mirror.") - set_pip_mirror(pypi_url) - return - elif pypi_speed < SLOW_THRESHOLD: - console.print(f"PyPI official mirror speed is acceptable ({pypi_speed:.2f} ms). You may continue using it.") - return - - console.print(f"PyPI official mirror is slow ({pypi_speed:.2f} ms). 
Testing other mirrors...") - - # Test other mirrors - speeds = {} - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - ) as progress: - task = progress.add_task("[cyan]Testing mirrors...", total=len(MIRRORS) - 1) # -1 because we already tested PyPI - - with concurrent.futures.ThreadPoolExecutor(max_workers=optimal_thread_count) as executor: - future_to_mirror = {executor.submit(test_mirror_speed, name, url): name for name, url in MIRRORS.items() if name != pypi_name} - for future in concurrent.futures.as_completed(future_to_mirror): - name = future_to_mirror[future] - try: - name, speed = future.result() - if speed != float('inf'): - speeds[name] = speed - except Exception as exc: - print(f'{name} generated an exception: {exc}') - finally: - progress.update(task, advance=1) - - table = Table(title="Mirror Speed Test Results") - table.add_column("Mirror", style="cyan") - table.add_column("Response Time (ms)", justify="right", style="magenta") - - for name, speed in sorted(speeds.items(), key=lambda x: x[1]): - table.add_row(name, f"{speed:.2f}") - - console.print(table) - - if speeds: - fastest_mirror = min(speeds, key=speeds.get) - fastest_url = MIRRORS[fastest_mirror] - console.print(f"\n[green]Fastest mirror: {fastest_mirror} ({fastest_url})[/green]") - console.print(f"[green]Response time: {speeds[fastest_mirror]:.2f} ms[/green]") - - host = fastest_url.split("//")[1].split("/")[0] - if set_pip_mirror(fastest_url): - current_mirror = get_current_pip_mirror() - console.print(f"\n[yellow]Current pip source: {current_mirror}[/yellow]") - - if current_mirror == fastest_url: - console.print(f"[bold green]Successfully switched to {fastest_mirror} mirror.[/bold green]") - else: - console.print("[bold red]Switch failed. Current pip source doesn't match the expected one.[/bold red]") - console.print(f"[yellow]Expected pip source: {fastest_url}[/yellow]") - console.print("[yellow]Please check the configuration manually or try running this script with administrator privileges.[/yellow]") - else: - console.print("[bold red]Failed to switch mirror, will continue using the current source.[/bold red]") - current_mirror = get_current_pip_mirror() - console.print(f"[yellow]Current pip source: {current_mirror}[/yellow]") - console.print("[yellow]Please check if you have sufficient permissions to modify pip configuration.[/yellow]") - else: - console.print("[bold red]All mirrors are unreachable. Please check your network connection.[/bold red]") - -if __name__ == "__main__": - main() diff --git a/requirements.txt b/requirements.txt index 600c1228..287d5daf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,8 @@ yt-dlp json-repair ruamel.yaml autocorrect-py -demucs[dev] @ git+https://github.com/adefossez/demucs \ No newline at end of file +demucs[dev] @ git+https://github.com/adefossez/demucs + +syllables +pypinyin +g2p-en diff --git a/st.py b/st.py index 5eaa2988..951ae1a5 100644 --- a/st.py +++ b/st.py @@ -10,6 +10,9 @@ st.set_page_config(page_title="VideoLingo", page_icon="docs/logo.svg") +SUB_VIDEO = "output/output_sub.mp4" +DUB_VIDEO = "output/output_dub.mp4" + def text_processing_section(): st.header("Translate and Generate Subtitles") with st.container(border=True): @@ -25,13 +28,13 @@ def text_processing_section(): 6. 
Merging subtitles into the video """, unsafe_allow_html=True) - if not os.path.exists("output/output_video_with_subs.mp4"): + if not os.path.exists(SUB_VIDEO): if st.button("Start Processing Subtitles", key="text_processing_button"): process_text() st.rerun() else: if load_key("resolution") != "0x0": - st.video("output/output_video_with_subs.mp4") + st.video(SUB_VIDEO) download_subtitle_zip_button(text="Download All Srt Files") if st.button("Archive to 'history'", key="cleanup_in_text_processing"): @@ -60,24 +63,25 @@ def process_text(): st.balloons() def audio_processing_section(): - st.header("Dubbing (beta)") + st.header("Dubbing") with st.container(border=True): st.markdown("""

This stage includes the following steps:

- 1. Generate audio tasks
- 2. Generate audio
- 3. Merge audio into the video
+ 1. Generate audio tasks and chunks
+ 2. Extract reference audio
+ 3. Generate and merge audio files
+ 4. Merge final audio into video """, unsafe_allow_html=True) - if not os.path.exists("output/output_video_with_audio.mp4"): + if not os.path.exists(DUB_VIDEO): if st.button("Start Audio Processing", key="audio_processing_button"): process_audio() st.rerun() else: st.success("Audio processing is complete! You can check the audio files in the `output` folder.") if load_key("resolution") != "0x0": - st.video("output/output_video_with_audio.mp4") + st.video(DUB_VIDEO) if st.button("Delete dubbing files", key="delete_dubbing_files"): delete_dubbing_files() st.rerun() @@ -87,13 +91,16 @@ def audio_processing_section(): def process_audio(): with st.spinner("Generate audio tasks"): - step8_gen_audio_task.gen_audio_task_main() + step8_1_gen_audio_task.gen_audio_task_main() + step8_2_gen_dub_chunks.gen_dub_chunks() with st.spinner("Extract refer audio"): step9_extract_refer_audio.extract_refer_audio_main() - with st.spinner("Generate audio"): - step10_gen_audio.process_sovits_tasks() - with st.spinner("Merge audio into the video"): - step11_merge_audio_to_vid.merge_main() + with st.spinner("Generate all audio"): + step10_gen_audio.gen_audio() + with st.spinner("Merge full audio"): + step11_merge_full_audio.merge_full_audio() + with st.spinner("Merge dubbing to the video"): + step12_merge_dub_to_vid.merge_video_audio() st.success("Audio processing complete! 🎇") st.balloons() diff --git a/st_components/icon.png b/st_components/icon.png deleted file mode 100644 index c26080fd..00000000 Binary files a/st_components/icon.png and /dev/null differ diff --git a/st_components/imports_and_utils.py b/st_components/imports_and_utils.py index 76985fc5..9a6fc4fa 100644 --- a/st_components/imports_and_utils.py +++ b/st_components/imports_and_utils.py @@ -1,8 +1,31 @@ import os, sys sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from core import step1_ytdlp, step2_whisperX, step3_1_spacy_split, step3_2_splitbymeaning, step9_extract_refer_audio -from core import step4_1_summarize, step4_2_translate_all, step5_splitforsub, step6_generate_final_timeline -from core import step7_merge_sub_to_vid, step8_gen_audio_task, step10_gen_audio, step11_merge_audio_to_vid +from core import ( + # Download & Transcribe 📥 + step11_merge_full_audio, + step1_ytdlp, + step2_whisperX, + + # Text Processing & Analysis 📝 + step3_1_spacy_split, + step3_2_splitbymeaning, + step4_1_summarize, + step4_2_translate_all, + step5_splitforsub, + + # Subtitle Timeline & Merging 🎬 + step6_generate_final_timeline, + step7_merge_sub_to_vid, + + # Audio Generation & Processing 🎵 + step8_1_gen_audio_task, + step8_2_gen_dub_chunks, + step9_extract_refer_audio, + step10_gen_audio, + + # Final Video Composition 🎥 + step12_merge_dub_to_vid +) from core.onekeycleanup import cleanup from core.delete_retry_dubbing import delete_dubbing_files from core.ask_gpt import ask_gpt diff --git a/st_components/sidebar_setting.py b/st_components/sidebar_setting.py index db371633..b0d88ec5 100644 --- a/st_components/sidebar_setting.py +++ b/st_components/sidebar_setting.py @@ -4,56 +4,54 @@ import streamlit as st from core.config_utils import update_key, load_key -def config_text_input(label, key, help=None): - """Generic config text input handler""" - value = st.text_input(label, value=load_key(key), help=help) - if value != load_key(key): - update_key(key, value) - return value +def config_input(label, key, help=None): + """Generic config input handler""" + val = st.text_input(label, value=load_key(key), help=help) + if val != 
load_key(key): + update_key(key, val) + return val def page_setting(): with st.expander("LLM Configuration", expanded=True): - config_text_input("API_KEY", "api.key") - config_text_input("BASE_URL", "api.base_url", help="Base URL for API requests") + config_input("API_KEY", "api.key") + config_input("BASE_URL", "api.base_url", help="Base URL for API requests") - col1, col2 = st.columns([4, 1]) - with col1: - config_text_input("MODEL", "api.model") - with col2: + c1, c2 = st.columns([4, 1]) + with c1: + config_input("MODEL", "api.model") + with c2: if st.button("📡", key="api"): - if valid_llm_api(): - st.toast("API Key is valid", icon="✅") - else: - st.toast("API Key is invalid", icon="❌") + st.toast("API Key is valid" if check_api() else "API Key is invalid", + icon="✅" if check_api() else "❌") with st.expander("Transcription and Subtitle Settings", expanded=True): - col1, col2 = st.columns(2) - with col1: - whisper_language_options_dict = { - "🇺🇸 English": "en", - "🇨🇳 简体中文": "zh", - "🇪🇸 Español": "es", - "🇷🇺 Русский": "ru", - "🇫🇷 Français": "fr", - "🇩🇪 Deutsch": "de", - "🇮🇹 Italiano": "it", - "🇯🇵 日本語": "ja" + c1, c2 = st.columns(2) + with c1: + langs = { + "🇺🇸 English": "en", + "🇨🇳 简体中文": "zh", + "🇪🇸 Español": "es", + "🇷🇺 Русский": "ru", + "🇫🇷 Français": "fr", + "🇩🇪 Deutsch": "de", + "🇮🇹 Italiano": "it", + "🇯🇵 日本語": "ja" } - selected_whisper_language = st.selectbox( + lang = st.selectbox( "Recognition Language:", - options=list(whisper_language_options_dict.keys()), - index=list(whisper_language_options_dict.values()).index(load_key("whisper.language")) + options=list(langs.keys()), + index=list(langs.values()).index(load_key("whisper.language")) ) - if whisper_language_options_dict[selected_whisper_language] != load_key("whisper.language"): - update_key("whisper.language", whisper_language_options_dict[selected_whisper_language]) + if langs[lang] != load_key("whisper.language"): + update_key("whisper.language", langs[lang]) - with col2: + with c2: target_language = st.text_input("Target Language", value=load_key("target_language")) if target_language != load_key("target_language"): update_key("target_language", target_language) - col1, col2 = st.columns(2) - with col1: + c1, c2 = st.columns(2) + with c1: burn_subtitles = st.toggle("Burn Subtitles", value=load_key("resolution") != "0x0") resolution_options = { @@ -61,7 +59,7 @@ def page_setting(): "360p": "640x360" } - with col2: + with c2: if burn_subtitles: selected_resolution = st.selectbox( "Video Resolution", @@ -75,31 +73,52 @@ def page_setting(): if resolution != load_key("resolution"): update_key("resolution", resolution) - with st.expander("Dubbing Settings", expanded=False): - tts_methods = ["openai_tts", "azure_tts", "gpt_sovits", "fish_tts"] + with st.expander("Dubbing Settings", expanded=True): + tts_methods = ["sf_fish_tts", "openai_tts", "azure_tts", "gpt_sovits", "fish_tts"] selected_tts_method = st.selectbox("TTS Method", options=tts_methods, index=tts_methods.index(load_key("tts_method"))) if selected_tts_method != load_key("tts_method"): update_key("tts_method", selected_tts_method) - if selected_tts_method == "openai_tts": - config_text_input("OpenAI Voice", "openai_tts.voice") - config_text_input("OpenAI TTS API Key", "openai_tts.api_key") - config_text_input("OpenAI TTS API Base URL", "openai_tts.base_url") + if selected_tts_method == "sf_fish_tts": + config_input("SiliconFlow API Key", "sf_fish_tts.api_key") + + # Add mode selection dropdown + mode_options = { + "preset": "Preset", + "custom": "Refer_stable", + 
"dynamic": "Refer_dynamic" + } + selected_mode = st.selectbox( + "Mode Selection", + options=list(mode_options.keys()), + format_func=lambda x: mode_options[x], + index=list(mode_options.keys()).index(load_key("sf_fish_tts.mode")) if load_key("sf_fish_tts.mode") in mode_options.keys() else 0 + ) + if selected_mode != load_key("sf_fish_tts.mode"): + update_key("sf_fish_tts.mode", selected_mode) + + if selected_mode == "preset": + config_input("Voice", "sf_fish_tts.voice") + + elif selected_tts_method == "openai_tts": + config_input("OpenAI Voice", "openai_tts.voice") + config_input("OpenAI TTS API Key", "openai_tts.api_key") + config_input("OpenAI TTS API Base URL", "openai_tts.base_url") elif selected_tts_method == "fish_tts": - config_text_input("Fish TTS API Key", "fish_tts.api_key") + config_input("Fish TTS API Key", "fish_tts.api_key") fish_tts_character = st.selectbox("Fish TTS Character", options=list(load_key("fish_tts.character_id_dict").keys()), index=list(load_key("fish_tts.character_id_dict").keys()).index(load_key("fish_tts.character"))) if fish_tts_character != load_key("fish_tts.character"): update_key("fish_tts.character", fish_tts_character) elif selected_tts_method == "azure_tts": - config_text_input("Azure Key", "azure_tts.key") - config_text_input("Azure Region", "azure_tts.region") - config_text_input("Azure Voice", "azure_tts.voice") + config_input("Azure Key", "azure_tts.key") + config_input("Azure Region", "azure_tts.region") + config_input("Azure Voice", "azure_tts.voice") elif selected_tts_method == "gpt_sovits": st.info("配置GPT_SoVITS,请参考Github主页") - config_text_input("SoVITS Character", "gpt_sovits.character") + config_input("SoVITS Character", "gpt_sovits.character") refer_mode_options = {1: "模式1:仅用提供的参考音频", 2: "模式2:仅用视频第1条语音做参考", 3: "模式3:使用视频每一条语音做参考"} selected_refer_mode = st.selectbox( @@ -112,9 +131,10 @@ def page_setting(): if selected_refer_mode != load_key("gpt_sovits.refer_mode"): update_key("gpt_sovits.refer_mode", selected_refer_mode) -def valid_llm_api(): +def check_api(): try: - response = ask_gpt("This is a test, response 'message':'success' in json format.", response_json=True, log_title='None') - return response.get('message') == 'success' + resp = ask_gpt("This is a test, response 'message':'success' in json format.", + response_json=True, log_title='None') + return resp.get('message') == 'success' except Exception: return False \ No newline at end of file