diff --git a/.github/ISSUE_TEMPLATE/issue-report.yml b/.github/ISSUE_TEMPLATE/issue-report.yml deleted file mode 100644 index d9d64089..00000000 --- a/.github/ISSUE_TEMPLATE/issue-report.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: 使用问题报告 / Usage Problem Report -description: 报告在使用过程中遇到的问题 / Report issues encountered while using -labels: ["bug"] -body: - - type: markdown - attributes: - value: | - 感谢您花时间填写这份问题报告!在提交之前,请先搜索是否已存在类似的 issue。 - Thank you for taking the time to fill out this problem report! Before submitting, please search to see if a similar issue already exists. - - - type: checkboxes - id: search - attributes: - label: 搜索现有 issues / Search existing issues - description: 请确保没有重复的 issue。/ Please make sure there are no duplicate issues. - options: - - label: 我已经搜索了现有的 issues / I have searched the existing issues - required: true - - - type: input - id: llm-model - attributes: - label: 使用的 LLM 模型 / LLM Model Used - description: 请指明您使用的是哪个 LLM 模型 / Please specify which LLM model you are using - placeholder: 例如:GPT-3.5-turbo, BERT, etc. / For example: GPT-3.5-turbo, BERT, etc. - validations: - required: true - - - type: textarea - id: problem-step - attributes: - label: 问题发生的步骤 / Steps Where the Problem Occurred - description: 请详细描述在哪个步骤遇到了问题 / Please describe in detail at which step you encountered the problem - placeholder: | - 1. 首先我... / First, I... - 2. 然后我... / Then, I... - 3. 接着出现了... / After that... - validations: - required: true - - - type: textarea - id: command-screenshot - attributes: - label: 命令行截图 / Command Line Screenshot - description: 请提供包含完整代码的命令行截图 / Please provide a screenshot of the command line including the full code - placeholder: 请在此处粘贴您的截图 / Please paste your screenshot here - validations: - required: true - - - type: textarea - id: additional-info - attributes: - label: 其他信息 / Additional Information - description: 还有什么其他相关信息可以提供吗?/ Is there any other relevant information you can provide? - placeholder: 任何您认为可能有帮助的额外信息 / Any additional information you think might be helpful diff --git a/.gitignore b/.gitignore index 7dce1fb3..ca46faeb 100644 --- a/.gitignore +++ b/.gitignore @@ -170,4 +170,5 @@ config.backup.yaml # runtime runtime/ -dev/ \ No newline at end of file +dev/ +installer_files/ \ No newline at end of file diff --git a/OneKeyInstall&Start.bat b/OneKeyInstall&Start.bat new file mode 100644 index 00000000..d1a4e77f --- /dev/null +++ b/OneKeyInstall&Start.bat @@ -0,0 +1,67 @@ +@echo off + +cd /D "%~dp0" + +set PATH=%PATH%;%SystemRoot%\system32 + +echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. 
&& goto end + +@rem fix failed install when installing to a separate drive +set TMP=%cd%\installer_files +set TEMP=%cd%\installer_files + +@rem config +set INSTALL_DIR=%cd%\installer_files +set CONDA_ROOT_PREFIX=%cd%\installer_files\conda +set INSTALL_ENV_DIR=%cd%\installer_files\env +set MINICONDA_DOWNLOAD_URL=https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Windows-x86_64.exe +set conda_exists=F + +@rem figure out whether git and conda needs to be installed +call "%CONDA_ROOT_PREFIX%\_conda.exe" --version >nul 2>&1 +if "%ERRORLEVEL%" EQU "0" set conda_exists=T + +@rem (if necessary) install git and conda into a contained environment +@rem download conda +if "%conda_exists%" == "F" ( + echo Downloading Miniconda from %MINICONDA_DOWNLOAD_URL% to %INSTALL_DIR%\miniconda_installer.exe + + mkdir "%INSTALL_DIR%" + call curl -Lk "%MINICONDA_DOWNLOAD_URL%" > "%INSTALL_DIR%\miniconda_installer.exe" || ( echo. && echo Miniconda failed to download. && goto end ) + + echo Installing Miniconda to %CONDA_ROOT_PREFIX% + start /wait "" "%INSTALL_DIR%\miniconda_installer.exe" /InstallationType=JustMe /NoShortcuts=1 /AddToPath=0 /RegisterPython=0 /NoRegistry=1 /S /D=%CONDA_ROOT_PREFIX% + + @rem test the conda binary + echo Miniconda version: + call "%CONDA_ROOT_PREFIX%\_conda.exe" --version || ( echo. && echo Miniconda not found. && goto end ) +) + +@rem create the installer env +if not exist "%INSTALL_ENV_DIR%" ( + echo Packages to install: python=3.10.0 requests rich ruamel.yaml + call "%CONDA_ROOT_PREFIX%\_conda.exe" create --no-shortcuts -y -k --prefix "%INSTALL_ENV_DIR%" python=3.10.0 requests rich "ruamel.yaml" || ( echo. && echo Conda environment creation failed. && goto end ) +) + +@rem check if conda environment was actually created +if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end ) + +@rem environment isolation +set PYTHONNOUSERSITE=1 +set PYTHONPATH= +set PYTHONHOME= +@rem ! may cause error if we use cudnn on windows +set "CUDA_PATH=%INSTALL_ENV_DIR%" +set "CUDA_HOME=%CUDA_PATH%" + +@rem activate installer env +call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Miniconda hook not found. && goto end ) + +@rem Run pip setup +call python pip_setup.py + +echo. +echo Done! + +:end +pause diff --git a/OneKeyStart.bat b/OneKeyStart.bat deleted file mode 100644 index 29ea8efc..00000000 --- a/OneKeyStart.bat +++ /dev/null @@ -1,13 +0,0 @@ -@echo off -cd /d %~dp0 -if exist runtime ( - echo Using runtime folder... - runtime\python.exe -m streamlit run st.py -) else ( - echo Runtime folder not found. Using conda environment... - call activate videolingo - python -m streamlit run st.py - call deactivate -) - -pause \ No newline at end of file diff --git a/batch/OneKeyBatch.bat b/batch/OneKeyBatch.bat index 38086bfd..24141944 100644 --- a/batch/OneKeyBatch.bat +++ b/batch/OneKeyBatch.bat @@ -1,14 +1,24 @@ @echo off -cd /d %~dp0.. - -if exist runtime ( - echo Using runtime folder... - runtime\python.exe batch\utils\batch_processor.py -) else ( - echo Runtime folder not found. Using conda environment... - call conda activate videolingo - python batch\utils\batch_processor.py - call conda deactivate -) +cd /D "%~dp0" +cd .. 
+@rem 设置环境变量 +set INSTALL_DIR=%cd%\installer_files +set CONDA_ROOT_PREFIX=%cd%\installer_files\conda +set INSTALL_ENV_DIR=%cd%\installer_files\env + +@rem 环境隔离设置 +set PYTHONNOUSERSITE=1 +set PYTHONPATH= +set PYTHONHOME= +set "CUDA_PATH=%INSTALL_ENV_DIR%" +set "CUDA_HOME=%CUDA_PATH%" + +@rem 激活conda环境 +call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Conda environment not found && goto end ) + +@rem 运行批处理脚本 +call python batch\utils\batch_processor.py + +:end pause diff --git a/batch/README.md b/batch/README.md index bc79c3e8..d0977406 100644 --- a/batch/README.md +++ b/batch/README.md @@ -2,29 +2,27 @@ [English](./README.md) | [简体中文](./README.zh.md) -Before utilizing the batch mode, ensure you have familiarized yourself with the Streamlit mode and properly configured the parameters in `config.yaml`. +Before utilizing the batch mode, ensure you have used the Streamlit mode and properly configured the parameters in `config.yaml`. ## Usage Guide -> Note: All referenced files, with the exception of `config.yaml`, are located within the `batch` folder. - ### 1. Video File Preparation -- Upload your video files for processing to the `input` folder -- YouTube links can be specified in the subsequent step +- Place your video files in the `input` folder +- YouTube links can be specified in the next step ### 2. Task Configuration -Modify the `tasks_setting.xlsx` file as follows: +Edit the `tasks_setting.xlsx` file: | Field | Description | Acceptable Values | |-------|-------------|-------------------| -| Video File | Video filename (excluding `input/` prefix) or YouTube URL | - | -| Source Language | Original language of the video | 'en', 'zh', 'auto', or leave empty for default | -| Target Language | Desired translation language | Use natural language description, or leave empty for default | -| Dubbing | Enable or disable dubbing | 0 or empty: no dubbing; 1: enable dubbing | +| Video File | Video filename (without `input/` prefix) or YouTube URL | - | +| Source Language | Source language | 'en', 'zh', ... or leave empty for default | +| Target Language | Translation language | Use natural language description, or leave empty for default | +| Dubbing | Enable dubbing | 0 or empty: no dubbing; 1: enable dubbing | -Example configuration: +Example: | Video File | Source Language | Target Language | Dubbing | |------------|-----------------|-----------------|---------| @@ -33,24 +31,23 @@ Example configuration: ### 3. Executing Batch Processing -1. Launch `OneKeyBatch.bat` with a double-click -2. Processed files will be stored in the `output` folder -3. Monitor task progress in the `Status` column of `tasks_setting.xlsx` +1. Double-click to run `OneKeyBatch.bat` +2. Output files will be saved in the `output` folder +3. Task status can be monitored in the `Status` column of `tasks_setting.xlsx` > Note: Keep `tasks_setting.xlsx` closed during execution to prevent interruptions due to file access conflicts. - ## Important Considerations ### Handling Interruptions -In the event of an unexpected command line closure, language settings in `config.yaml` may be altered. Verify these settings before attempting to resume processing. +If the command line is closed unexpectedly, language settings in `config.yaml` may be altered. Check settings before retrying. 
### Error Management -- Files that fail to process will be relocated to the `output/ERROR` folder -- Detailed error messages are logged in the `Status` column of `tasks_setting.xlsx` -- To reattempt processing: - 1. Transfer the specific video folder from `ERROR` to the root directory - 2. Rename this folder to `output` - 3. Utilize the Streamlit mode to reinitiate processing +- Failed files will be moved to the `output/ERROR` folder +- Error messages are recorded in the `Status` column of `tasks_setting.xlsx` +- To retry: + 1. Move the single video folder from `ERROR` to the root directory + 2. Rename it to `output` + 3. Use Streamlit mode to process again diff --git a/batch/README.zh.md b/batch/README.zh.md index 8a0ed089..73103140 100644 --- a/batch/README.zh.md +++ b/batch/README.zh.md @@ -6,8 +6,6 @@ ## 使用方法 -> 注:以下所说文件除了 `config.yaml` 以外都在 `batch` 文件夹下。 - ### 1. 准备视频文件 - 将要处理的视频文件放入 `input` 文件夹 @@ -20,7 +18,7 @@ | 字段 | 说明 | 可选值 | |------|------|--------| | Video File | 视频文件名(无需 `input/` 前缀)或 YouTube 链接 | - | -| Source Language | 源语言 | 'en', 'zh', 'auto',或留空使用默认设置 | +| Source Language | 源语言 | 'en', 'zh', ... 或留空使用默认设置 | | Target Language | 翻译语言 | 使用自然语言描述,或留空使用默认设置 | | Dubbing | 是否配音 | 0 或留空:不配音;1:配音 | diff --git a/batch/utils/settings_check.py b/batch/utils/settings_check.py index a90fc53e..5ea05612 100644 --- a/batch/utils/settings_check.py +++ b/batch/utils/settings_check.py @@ -4,11 +4,17 @@ from rich.console import Console from rich.panel import Panel +# Constants +SETTINGS_FILE = 'batch/tasks_setting.xlsx' +INPUT_FOLDER = os.path.join('batch', 'input') +VALID_DUBBING_VALUES = [0, 1] + console = Console() def check_settings(): - df = pd.read_excel('batch/tasks_setting.xlsx') - input_files = set(os.listdir(os.path.join('batch', 'input'))) + os.makedirs(INPUT_FOLDER, exist_ok=True) + df = pd.read_excel(SETTINGS_FILE) + input_files = set(os.listdir(INPUT_FOLDER)) excel_files = set(df['Video File'].tolist()) files_not_in_excel = input_files - excel_files @@ -31,19 +37,14 @@ def check_settings(): if video_file.startswith('http'): url_tasks += 1 - elif os.path.isfile(os.path.join('batch', 'input', video_file)): + elif os.path.isfile(os.path.join(INPUT_FOLDER, video_file)): local_video_tasks += 1 else: console.print(Panel(f"Invalid video file or URL 「{video_file}」", title=f"[bold red]Error in row {index + 2}", expand=False)) all_passed = False - if not pd.isna(source_language): - if source_language.lower() not in ['en', 'zh', 'auto']: - console.print(Panel(f"Invalid source language 「{source_language}」", title=f"[bold red]Error in row {index + 2}", expand=False)) - all_passed = False - if not pd.isna(dubbing): - if int(dubbing) not in [0, 1]: + if int(dubbing) not in VALID_DUBBING_VALUES: console.print(Panel(f"Invalid dubbing value 「{dubbing}」", title=f"[bold red]Error in row {index + 2}", expand=False)) all_passed = False diff --git a/batch/utils/video_processor.py b/batch/utils/video_processor.py index 342e5228..3d5cb12f 100644 --- a/batch/utils/video_processor.py +++ b/batch/utils/video_processor.py @@ -1,53 +1,74 @@ import os, sys sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) -from core import step1_ytdlp, step2_whisperX, step3_1_spacy_split, step3_2_splitbymeaning -from core import step4_1_summarize, step4_2_translate_all, step5_splitforsub, step6_generate_final_timeline -from core import step7_merge_sub_to_vid, step8_gen_audio_task, step10_gen_audio, step11_merge_audio_to_vid +from st_components.imports_and_utils import * from core.onekeycleanup import 
cleanup from core.config_utils import load_key import shutil from functools import partial +from rich.panel import Panel +from rich.console import Console + +console = Console() + +INPUT_DIR = 'batch/input' +OUTPUT_DIR = 'output' +SAVE_DIR = 'batch/output' +ERROR_OUTPUT_DIR = 'batch/output/ERROR' +YTB_RESOLUTION_KEY = "ytb_resolution" def process_video(file, dubbing=False, is_retry=False): if not is_retry: - prepare_output_folder('output') + prepare_output_folder(OUTPUT_DIR) - steps = [ - ("Processing input file", partial(process_input_file, file)), - ("Transcribing with Whisper", partial(step2_whisperX.transcribe)), - ("Splitting sentences", split_sentences), - ("Summarizing and translating", summarize_and_translate), - ("Processing and aligning subtitles", process_and_align_subtitles), - ("Merging subtitles to video", step7_merge_sub_to_vid.merge_subtitles_to_video), + text_steps = [ + ("🎥 Processing input file", partial(process_input_file, file)), + ("🎙️ Transcribing with Whisper", partial(step2_whisperX.transcribe)), + ("✂️ Splitting sentences", split_sentences), + ("📝 Summarizing and translating", summarize_and_translate), + ("⚡ Processing and aligning subtitles", process_and_align_subtitles), + ("🎬 Merging subtitles to video", step7_merge_sub_to_vid.merge_subtitles_to_video), ] if dubbing: - steps.extend([ - ("Generating audio tasks", step8_gen_audio_task.gen_audio_task_main), - ("Generating audio using SoVITS", step10_gen_audio.process_sovits_tasks), - ("Merging generated audio with video", step11_merge_audio_to_vid.merge_main), - ]) + dubbing_steps = [ + ("🔊 Generating audio tasks", gen_audio_tasks), + ("🎵 Extracting reference audio", step9_extract_refer_audio.extract_refer_audio_main), + ("🗣️ Generating audio", step10_gen_audio.gen_audio), + ("🔄 Merging full audio", step11_merge_full_audio.merge_full_audio), + ("🎞️ Merging dubbing to video", step12_merge_dub_to_vid.merge_video_audio), + ] + text_steps.extend(dubbing_steps) current_step = "" - for step_name, step_func in steps: + for step_name, step_func in text_steps: current_step = step_name for attempt in range(3): try: - print(f"Executing: {step_name}...") + console.print(Panel( + f"[bold green]{step_name}[/]", + subtitle=f"Attempt {attempt + 1}/3" if attempt > 0 else None, + border_style="blue" + )) result = step_func() if result is not None: globals().update(result) break except Exception as e: if attempt == 2: - error_message = f"Error in step '{current_step}': {str(e)}" - print(error_message) - cleanup("batch/output/ERROR") - return False, current_step, error_message - print(f"Attempt {attempt + 1} failed. Retrying...") + error_panel = Panel( + f"[bold red]Error in step '{current_step}':[/]\n{str(e)}", + border_style="red" + ) + console.print(error_panel) + cleanup(ERROR_OUTPUT_DIR) + return False, current_step, str(e) + console.print(Panel( + f"[yellow]Attempt {attempt + 1} failed. Retrying...[/]", + border_style="yellow" + )) - print("All steps completed successfully!") - cleanup("batch/output") + console.print(Panel("[bold green]All steps completed successfully! 
🎉[/]", border_style="green")) + cleanup(SAVE_DIR) return True, "", "" def prepare_output_folder(output_folder): @@ -57,11 +78,11 @@ def prepare_output_folder(output_folder): def process_input_file(file): if file.startswith('http'): - step1_ytdlp.download_video_ytdlp(file, resolution=load_key("ytb_resolution"), cutoff_time=None) + step1_ytdlp.download_video_ytdlp(file, resolution=load_key(YTB_RESOLUTION_KEY), cutoff_time=None) video_file = step1_ytdlp.find_video_files() else: input_file = os.path.join('batch', 'input', file) - output_file = os.path.join('output', file) + output_file = os.path.join(OUTPUT_DIR, file) shutil.copy(input_file, output_file) video_file = output_file return {'video_file': video_file} @@ -77,3 +98,7 @@ def summarize_and_translate(): def process_and_align_subtitles(): step5_splitforsub.split_for_sub_main() step6_generate_final_timeline.align_timestamp_main() + +def gen_audio_tasks(): + step8_1_gen_audio_task.gen_audio_task_main() + step8_2_gen_dub_chunks.gen_dub_chunks() diff --git a/config.yaml b/config.yaml index cdc1ec8e..daabb838 100644 --- a/config.yaml +++ b/config.yaml @@ -1,10 +1,11 @@ # * Settings marked with * are advanced settings that won't appear in the Streamlit page and can only be modified manually in config.py +version: "2.0.0" ## ======================== Basic Settings ======================== ## # API settings api: - key: 'YOUR_KEY' - base_url: 'https://yunwu.zeabur.app' - model: 'gemini-1.5-pro-002' + key: 'YOUR_API_KEY' + base_url: 'https://api.siliconflow.cn' + model: 'Qwen/Qwen2.5-72B-Instruct' # Language settings, written into the prompt, can be described in natural language target_language: '简体中文' @@ -13,7 +14,9 @@ target_language: '简体中文' demucs: false whisper: - # Whisper specified recognition language [en, zh, auto] auto for automatic detection, en for forced translation to English + # ["medium", "large-v3", "large-v3-turbo"]. Note: for zh model will force to use Belle/large-v3 + model: 'large-v3' + # Whisper specified recognition language [en, zh, ...] 
language: 'en' detected_language: 'en' @@ -22,7 +25,7 @@ resolution: '1920x1080' ## ======================== Advanced Settings ======================== ## # *Default resolution for downloading YouTube videos [360, 1080, best] -ytb_resolution: '360' +ytb_resolution: '1080' subtitle: # *Maximum length of each subtitle line in characters @@ -39,8 +42,20 @@ max_split_length: 20 pause_before_translate: false ## ======================== Dubbing Settings ======================== ## -# TTS selection [openai_tts, gpt_sovits, azure_tts, fish_tts] -tts_method: 'openai_tts' +# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts] +tts_method: 'sf_fish_tts' + +# SiliconFlow FishTTS +sf_fish_tts: + # SiliconFlow API key + api_key: 'YOUR_API_KEY' + # only for mode "preset" + voice: 'anna' + # *only for mode "custom", dont set manually + custom_name: '' + voice_id: '' + # preset, custom, dynamic + mode: "preset" # OpenAI TTS-1 API configuration openai_tts: @@ -70,15 +85,16 @@ fish_tts: # *Audio speed range speed_factor: min: 1 + accept: 1.2 # 可以接受的最大速度 max: 1.4 - normal: 1.2 # *Considered normal speech rate # *Merge audio configuration -min_subtitle_duration: 3 -min_trim_duration: 2.50 +min_subtitle_duration: 2.5 # 最小字幕出现时间 会强制扩展 +min_trim_duration: 3.5 # 小于这个值的字幕不会切割 +tolerance: 1.5 # 允许向后延申的时间 # Volume settings -dub_volume: 1.3 # *Dubbed audio volume (1.3 = 130%, most original dubbing audio is relatively quiet) +dub_volume: 1.5 # *Dubbed audio volume (1.5 = 150%, most original dubbing audio is relatively quiet) @@ -114,6 +130,11 @@ llm_support_json: - 'gemini-1.5-pro-latest' - 'gemini-1.5-pro-002' +# have problems +# - 'Qwen/Qwen2.5-72B-Instruct' +# - 'Qwen/Qwen2.5-Coder-32B-Instruct' +# - 'Qwen/Qwen2.5-Chat-72B-Instruct-128K' + # Spacy models spacy_model_map: en: 'en_core_web_md' @@ -137,4 +158,4 @@ language_split_with_space: # Languages that do not use space as separator language_split_without_space: - 'zh' -- 'ja' \ No newline at end of file +- 'ja' diff --git a/core/all_tts_functions/estimate_duration.py b/core/all_tts_functions/estimate_duration.py new file mode 100644 index 00000000..c4f7e807 --- /dev/null +++ b/core/all_tts_functions/estimate_duration.py @@ -0,0 +1,128 @@ +import syllables +from pypinyin import pinyin, Style +from g2p_en import G2p +from typing import Optional +import re + +class AdvancedSyllableEstimator: + def __init__(self): + self.g2p_en = G2p() + self.duration_params = {'en': 0.225, 'zh': 0.21, 'ja': 0.21, 'fr': 0.22, 'es': 0.22, 'ko': 0.21, 'default': 0.22} + self.lang_patterns = { + 'zh': r'[\u4e00-\u9fff]', 'ja': r'[\u3040-\u309f\u30a0-\u30ff]', + 'fr': r'[àâçéèêëîïôùûüÿœæ]', 'es': r'[áéíóúñ¿¡]', 'en': r'[a-zA-Z]+', 'ko': r'[\uac00-\ud7af\u1100-\u11ff]'} + self.lang_joiners = {'zh': '', 'ja': '', 'en': ' ', 'fr': ' ', 'es': ' ', 'ko': ' '} + self.punctuation = { + 'mid': r'[,;:,;、]+', 'end': r'[。!?.!?]+', 'space': r'\s+', + 'pause': {'space': 0.15, 'default': 0.1} + } + + def estimate_duration(self, text: str, lang: Optional[str] = None) -> float: + syllable_count = self.count_syllables(text, lang) + return syllable_count * self.duration_params.get(lang or 'default') + + def count_syllables(self, text: str, lang: Optional[str] = None) -> int: + if not text.strip(): return 0 + lang = lang or self._detect_language(text) + + vowels_map = { + 'fr': 'aeiouyàâéèêëîïôùûüÿœæ', + 'es': 'aeiouáéíóúü' + } + + if lang == 'en': + return self._count_english_syllables(text) + elif lang == 'zh': + text = re.sub(r'[^\u4e00-\u9fff]', '', text) + return len(pinyin(text, 
style=Style.NORMAL)) + elif lang == 'ja': + text = re.sub(r'[きぎしじちぢにひびぴみり][ょゅゃ]', 'X', text) + text = re.sub(r'[っー]', '', text) + return len(re.findall(r'[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9fff]', text)) + elif lang in ('fr', 'es'): + text = re.sub(r'e\b', '', text.lower()) if lang == 'fr' else text.lower() + return max(1, len(re.findall(f'[{vowels_map[lang]}]+', text))) + elif lang == 'ko': + return len(re.findall(r'[\uac00-\ud7af]', text)) + return len(text.split()) + + def _count_english_syllables(self, text: str) -> int: + total = 0 + for word in text.strip().split(): + try: + total += syllables.estimate(word) + except: + phones = self.g2p_en(word) + total += max(1, len([p for p in phones if any(c in p for c in 'aeiou')])) + return max(1, total) + + def _detect_language(self, text: str) -> str: + for lang, pattern in self.lang_patterns.items(): + if re.search(pattern, text): return lang + return 'en' + + def process_mixed_text(self, text: str) -> dict: + result = {'language_breakdown': {}, 'total_syllables': 0, 'punctuation': [], 'spaces': []} + segments = re.split(f"({self.punctuation['space']}|{self.punctuation['mid']}|{self.punctuation['end']})", text) + total_duration = 0 + + for i, segment in enumerate(segments): + if not segment: continue + + if re.match(self.punctuation['space'], segment): + prev_lang = self._detect_language(segments[i-1]) if i > 0 else None + next_lang = self._detect_language(segments[i+1]) if i < len(segments)-1 else None + if prev_lang and next_lang and (self.lang_joiners[prev_lang] == '' or self.lang_joiners[next_lang] == ''): + result['spaces'].append(segment) + total_duration += self.punctuation['pause']['space'] + elif re.match(f"{self.punctuation['mid']}|{self.punctuation['end']}", segment): + result['punctuation'].append(segment) + total_duration += self.punctuation['pause']['default'] + else: + lang = self._detect_language(segment) + if lang: + syllables = self.count_syllables(segment, lang) + if lang not in result['language_breakdown']: + result['language_breakdown'][lang] = {'syllables': 0, 'text': ''} + result['language_breakdown'][lang]['syllables'] += syllables + result['language_breakdown'][lang]['text'] += (self.lang_joiners[lang] + segment + if result['language_breakdown'][lang]['text'] else segment) + result['total_syllables'] += syllables + total_duration += syllables * self.duration_params.get(lang, self.duration_params['default']) + + result['estimated_duration'] = total_duration + + return result + +def init_estimator(): + return AdvancedSyllableEstimator() + +def estimate_duration(text: str, estimator: AdvancedSyllableEstimator): + return estimator.process_mixed_text(text)['estimated_duration'] + +# 使用示例 +if __name__ == "__main__": + estimator = init_estimator() + print(estimate_duration('你好', estimator)) + + # 测试用例 + test_cases = [ + # "Hello world this is a test", # 纯英文 + # "你好世界 这是一个测试", # 中文带空格 + # "Hello 你好 world 世界", # 中英混合 + # "The weather is nice 所以我们去公园", # 中英混合带空格 + # "我们需要在输出中体现空格的停顿时间", + # "I couldn't help but notice the vibrant colors of the autumn leaves cascading gently from the trees" + "가을 나뭇잎이 부드럽게 떨어지는 생생한 색깔을 주목하지 않을 수 없었다" + ] + + for text in test_cases: + result = estimator.process_mixed_text(text) + print(f"\nText: {text}") + print(f"Total syllables: {result['total_syllables']}") + print(f"Estimated duration: {result['estimated_duration']:.2f}s") + print("Language breakdown:") + for lang, info in result['language_breakdown'].items(): + print(f"- {lang}: {info['syllables']} syllables ({info['text']})") + 
print(f"Punctuation: {result['punctuation']}") + print(f"Spaces: {result['spaces']}") \ No newline at end of file diff --git a/core/all_tts_functions/siliconflow_fish_tts.py b/core/all_tts_functions/siliconflow_fish_tts.py new file mode 100644 index 00000000..ce54e68e --- /dev/null +++ b/core/all_tts_functions/siliconflow_fish_tts.py @@ -0,0 +1,250 @@ +import requests +from pathlib import Path +import os, sys +import base64 +import uuid +from typing import List, Tuple +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) +from core.config_utils import load_key, update_key +from core.step1_ytdlp import find_video_files +from core.all_whisper_methods.whisperX_utils import get_audio_duration +import hashlib +from rich import print as rprint +from pydub import AudioSegment +import time +from rich.panel import Panel +from rich.text import Text + +API_URL_SPEECH = "https://api.siliconflow.cn/v1/audio/speech" +API_URL_VOICE = "https://api.siliconflow.cn/v1/uploads/audio/voice" + +AUDIO_REFERS_DIR = "output/audio/refers" +MODEL_NAME = "fishaudio/fish-speech-1.4" + +def _get_headers(): + return {"Authorization": f'Bearer {load_key("sf_fish_tts.api_key")}', "Content-Type": "application/json"} + +def siliconflow_fish_tts(text, save_path, mode="preset", voice_id=None, ref_audio=None, ref_text=None, check_duration=False): + sf_fish_set, headers = load_key("sf_fish_tts"), _get_headers() + payload = {"model": MODEL_NAME, "response_format": "wav", "stream": False, "input": text} + + if mode == "preset": + payload["voice"] = f"fishaudio/fish-speech-1.4:{sf_fish_set['voice']}" + elif mode == "custom": + if not voice_id: + raise ValueError("custom mode requires voice_id") + payload["voice"] = voice_id + elif mode == "dynamic": + if not ref_audio or not ref_text: + raise ValueError("dynamic mode requires ref_audio and ref_text") + with open(ref_audio, 'rb') as f: + audio_base64 = base64.b64encode(f.read()).decode('utf-8') + payload = { + "model": MODEL_NAME, + "response_format": "wav", + "stream": False, + "input": text, + "voice": None, + "references": [{ + "audio": f"data:audio/wav;base64,{audio_base64}", + "text": ref_text + }] + } + else: raise ValueError("Invalid mode") + + max_retries = 2 + retry_delay = 1 + + for attempt in range(max_retries): + response = requests.post(API_URL_SPEECH, json=payload, headers=headers) + if response.status_code == 200: + wav_file_path = Path(save_path).with_suffix('.wav') + wav_file_path.parent.mkdir(parents=True, exist_ok=True) + with open(wav_file_path, 'wb') as f: f.write(response.content) + + if check_duration: + duration = get_audio_duration(wav_file_path) + rprint(f"[blue]Audio Duration: {duration:.2f} seconds") + + rprint(f"[green]Successfully generated audio file: {wav_file_path}") + return True + + error_msg = response.json() + rprint(f"[red]Failed to generate audio | HTTP {response.status_code} (Attempt {attempt + 1}/{max_retries})") + rprint(f"[red]Text: {text}") + rprint(f"[red]Error details: {error_msg}") + + if attempt < max_retries - 1: + time.sleep(retry_delay) + rprint(f"[yellow]Retrying in {retry_delay} second...") + + return False + +def create_custom_voice(audio_path, text, custom_name=None): + if not Path(audio_path).exists(): + raise FileNotFoundError(f"Audio file not found at {audio_path}") + + try: + audio_base64 = f"data:audio/wav;base64,{base64.b64encode(open(audio_path, 'rb').read()).decode('utf-8')}" + rprint(f"[yellow]✅ Successfully encoded audio file") + except Exception as e: + rprint(f"[red]❌ Error reading file: {str(e)}") + 
raise + + payload = { + "audio": audio_base64, + "model": MODEL_NAME, + "customName": custom_name or str(uuid.uuid4())[:8], + "text": text + } + + rprint(f"[yellow]🚀 Sending request to create voice...") + response = requests.post(API_URL_VOICE, json=payload, headers=_get_headers()) + response_json = response.json() + + if response.status_code == 200: + voice_id = response_json.get('uri') + status_text = Text() + status_text.append("✨ Successfully created custom voice!\n", style="green") + status_text.append(f"🎙️ Voice ID: {voice_id}\n", style="green") + status_text.append(f"⌛ Creation Time: {time.strftime('%Y-%m-%d %H:%M:%S')}", style="green") + rprint(Panel(status_text, title="Voice Creation Status")) + return voice_id + + error_text = Text() + error_text.append("❌ Failed to create custom voice\n", style="red") + error_text.append(f"⚠️ HTTP Status: {response.status_code}\n", style="red") + error_text.append(f"💬 Error Details: {response_json}", style="red") + rprint(Panel(error_text, title="Error", border_style="red")) + raise ValueError(f"Failed to create custom voice 🚫 HTTP {response.status_code}, Error details: {response_json}") + +def merge_audio(files: List[str], output: str) -> bool: + """Merge audio files, add a brief silence""" + try: + # Create an empty audio segment + combined = AudioSegment.empty() + silence = AudioSegment.silent(duration=100) # 100ms silence + + # Add audio files one by one + for file in files: + audio = AudioSegment.from_wav(file) + combined += audio + silence + + # Export the combined file + combined.export(output, format="wav", parameters=[ + "-acodec", "pcm_s16le", + "-ar", "44100", + "-ac", "1" + ]) + + if os.path.getsize(output) == 0: + rprint(f"[red]Output file size is 0") + return False + + rprint(f"[green]Successfully merged audio files") + return True + + except Exception as e: + rprint(f"[red]Failed to merge audio: {str(e)}") + return False + +def get_ref_audio(task_df) -> Tuple[str, str]: + """Get reference audio and text, ensuring the combined text length does not exceed 100 characters""" + rprint(f"[blue]🎯 Starting reference audio selection process...") + + duration = 0 + selected = [] + combined_text = "" + found_first = False + + for _, row in task_df.iterrows(): + current_text = row['origin'] + + # If no valid record has been found yet + if not found_first: + if len(current_text) <= 100: + selected.append(row) + combined_text = current_text + duration += row['duration'] + found_first = True + rprint(f"[yellow]📝 Found first valid row: {current_text[:50]}...") + else: + rprint(f"[yellow]⏭️ Skipping long row: {current_text[:50]}... 
({len(current_text)} chars)") + continue + + # Check subsequent rows + new_text = combined_text + " " + current_text + if len(new_text) > 100: + break + + selected.append(row) + combined_text = new_text + duration += row['duration'] + rprint(f"[yellow]📝 Added row: {current_text[:50]}...") + + if duration > 10: + break + + if not selected: + rprint(f"[red]❌ No valid segments found (all texts exceed 100 characters)") + return None, None + + rprint(f"[blue]📊 Selected {len(selected)} segments, total duration: {duration:.2f}s") + + audio_files = [f"{AUDIO_REFERS_DIR}/{row['number']}.wav" for row in selected] + rprint(f"[yellow]🎵 Audio files to merge: {audio_files}") + + combined_audio = f"{AUDIO_REFERS_DIR}/combined_reference.wav" + success = merge_audio(audio_files, combined_audio) + + if not success: + rprint(f"[red]❌ Error: Failed to merge audio files") + return None, None + + rprint(f"[green]✅ Successfully created combined audio: {combined_audio}") + rprint(f"[green]📝 Final combined text: {combined_text} | Length: {len(combined_text)}") + + return combined_audio, combined_text + +def siliconflow_fish_tts_for_videolingo(text, save_as, number, task_df): + sf_fish_set = load_key("sf_fish_tts") + MODE = sf_fish_set["mode"] + + if MODE == "preset": + return siliconflow_fish_tts(text, save_as, mode="preset") + elif MODE == "custom": + video_file = find_video_files() + custom_name = hashlib.md5(video_file.encode()).hexdigest()[:8] + rprint(f"[yellow]Using custom name: {custom_name}") + log_name = load_key("sf_fish_tts.custom_name") + + if log_name != custom_name: + # Get the merged reference audio and text + ref_audio, ref_text = get_ref_audio(task_df) + if ref_audio is None or ref_text is None: + rprint(f"[red]Failed to get reference audio and text, falling back to preset mode") + return siliconflow_fish_tts(text, save_as, mode="preset") + + voice_id = create_custom_voice(ref_audio, ref_text, custom_name) + update_key("sf_fish_tts.voice_id", voice_id) + update_key("sf_fish_tts.custom_name", custom_name) + else: + voice_id = load_key("sf_fish_tts.voice_id") + return siliconflow_fish_tts(text=text, save_path=save_as, mode="custom", voice_id=voice_id) + elif MODE == "dynamic": + ref_audio_path = f"{AUDIO_REFERS_DIR}/{number}.wav" + if not Path(ref_audio_path).exists(): + rprint(f"[red]Reference audio not found: {ref_audio_path}, falling back to preset mode") + return siliconflow_fish_tts(text, save_as, mode="preset") + + ref_text = task_df[task_df['number'] == number]['origin'].iloc[0] + return siliconflow_fish_tts(text=text, save_path=save_as, mode="dynamic", ref_audio=str(ref_audio_path), ref_text=ref_text) + else: + raise ValueError("Invalid mode. Choose 'preset', 'custom', or 'dynamic'") + +if __name__ == '__main__': + pass + # create_custom_voice("output/audio/refers/1.wav", "Okay folks, welcome back. This is price action model number four, position trading.") + siliconflow_fish_tts("가을 나뭇잎이 부드럽게 떨어지는 생생한 색깔을 주목하지 않을 수 없었다", "preset_test.wav", mode="preset", check_duration=True) + # siliconflow_fish_tts("使用客制化音色测试", "custom_test.wav", mode="custom", voice_id="speech:your-voice-name:cm04pf7az00061413w7kz5qxs:mjtkgbyuunvtybnsvbxd") + # siliconflow_fish_tts("使用动态音色测试", "dynamic_test.wav", mode="dynamic", ref_audio="output/audio/refers/1.wav", ref_text="Okay folks, welcome back. 
This is price action model number four, position trading.") \ No newline at end of file diff --git a/core/all_tts_functions/tts_main.py b/core/all_tts_functions/tts_main.py new file mode 100644 index 00000000..91b4f2fe --- /dev/null +++ b/core/all_tts_functions/tts_main.py @@ -0,0 +1,57 @@ +import os, sys +import re +from rich import print as rprint +from pydub import AudioSegment + +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) +from core.config_utils import load_key +from core.all_whisper_methods.whisperX_utils import get_audio_duration +from core.all_tts_functions.gpt_sovits_tts import gpt_sovits_tts_for_videolingo +from core.all_tts_functions.siliconflow_fish_tts import siliconflow_fish_tts_for_videolingo +from core.all_tts_functions.openai_tts import openai_tts +from core.all_tts_functions.fish_tts import fish_tts +from core.all_tts_functions.azure_tts import azure_tts + +def tts_main(text, save_as, number, task_df): + # 检查文本是否为空或单字符,单字符配音容易触发bug + cleaned_text = re.sub(r'[^\w\s]', '', text).strip() + if not cleaned_text or len(cleaned_text) <= 1: + silence = AudioSegment.silent(duration=100) # 100ms = 0.1s + silence.export(save_as, format="wav") + rprint(f"Created silent audio for empty/single-char text: {save_as}") + return + + # 如果文件存在,跳过 + if os.path.exists(save_as): + return + + print(f"Generating <{text}...>") + TTS_METHOD = load_key("tts_method") + + max_retries = 3 + for attempt in range(max_retries): + try: + if TTS_METHOD == 'openai_tts': + openai_tts(text, save_as) + elif TTS_METHOD == 'gpt_sovits': + gpt_sovits_tts_for_videolingo(text, save_as, number, task_df) + elif TTS_METHOD == 'fish_tts': + fish_tts(text, save_as) + elif TTS_METHOD == 'azure_tts': + azure_tts(text, save_as) + elif TTS_METHOD == 'sf_fish_tts': + siliconflow_fish_tts_for_videolingo(text, save_as, number, task_df) + + # 检查生成的音频时长 + duration = get_audio_duration(save_as) + if duration > 0: + break + else: + if os.path.exists(save_as): + os.remove(save_as) + raise Exception("Generated audio duration is 0") + + except Exception as e: + if attempt == max_retries - 1: + raise Exception(f"Failed to generate audio after {max_retries} attempts: {str(e)}") + print(f"Attempt {attempt + 1} failed, retrying...") \ No newline at end of file diff --git a/core/all_whisper_methods/whisperX_utils.py b/core/all_whisper_methods/whisperX_utils.py index 2e8ed8a5..e599b930 100644 --- a/core/all_whisper_methods/whisperX_utils.py +++ b/core/all_whisper_methods/whisperX_utils.py @@ -4,20 +4,36 @@ from rich import print sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from core.config_utils import update_key -from core.all_whisper_methods.demucs_vl import RAW_AUDIO_FILE, AUDIO_DIR -def convert_video_to_audio(input_file: str) -> str: - os.makedirs(AUDIO_DIR, exist_ok=True) - if not os.path.exists(RAW_AUDIO_FILE): - print(f"🎬➡️🎵 Converting to audio with FFmpeg ......") +AUDIO_DIR = "output/audio" +RAW_AUDIO_FILE = "output/audio/raw.mp3" +CLEANED_CHUNKS_EXCEL_PATH = "output/log/cleaned_chunks.xlsx" + +def compress_audio(input_file: str, output_file: str): + """将输入音频文件压缩为低质量音频文件,用于转录""" + if not os.path.exists(output_file): + print(f"🗜️ Converting to low quality audio with FFmpeg ......") + # 16000 Hz, 1 channel, (Whisper default) , 96kbps to keep more details as well as smaller file size subprocess.run([ - 'ffmpeg', '-y', '-i', input_file, '-vn', '-b:a', '64k', + 'ffmpeg', '-y', '-i', input_file, '-vn', '-b:a', '96k', '-ar', '16000', '-ac', '1', '-metadata', 
'encoding=UTF-8', - '-f', 'mp3', RAW_AUDIO_FILE + '-f', 'mp3', output_file ], check=True, stderr=subprocess.PIPE) - print(f"🎬➡️🎵 Converted <{input_file}> to <{RAW_AUDIO_FILE}> with FFmpeg\n") + print(f"🗜️ Converted <{input_file}> to <{output_file}> with FFmpeg") + return output_file - return RAW_AUDIO_FILE +def convert_video_to_audio(video_file: str): + os.makedirs(AUDIO_DIR, exist_ok=True) + if not os.path.exists(RAW_AUDIO_FILE): + print(f"🎬➡️🎵 Converting to high quality audio with FFmpeg ......") + subprocess.run([ + 'ffmpeg', '-y', '-i', video_file, '-vn', + '-c:a', 'libmp3lame', '-b:a', '128k', + '-ar', '32000', + '-ac', '1', + '-metadata', 'encoding=UTF-8', RAW_AUDIO_FILE + ], check=True, stderr=subprocess.PIPE) + print(f"🎬➡️🎵 Converted <{video_file}> to <{RAW_AUDIO_FILE}> with FFmpeg\n") def _detect_silence(audio_file: str, start: float, end: float) -> List[float]: """Detect silence points in the given audio segment""" @@ -40,13 +56,17 @@ def get_audio_duration(audio_file: str) -> float: _, stderr = process.communicate() output = stderr.decode('utf-8', errors='ignore') - duration_str = [line for line in output.split('\n') if 'Duration' in line][0] - duration_parts = duration_str.split('Duration: ')[1].split(',')[0].split(':') - duration = float(duration_parts[0])*3600 + float(duration_parts[1])*60 + float(duration_parts[2]) - print(f"🔪 Audio duration: {duration:.2f}s") + try: + duration_str = [line for line in output.split('\n') if 'Duration' in line][0] + duration_parts = duration_str.split('Duration: ')[1].split(',')[0].split(':') + duration = float(duration_parts[0])*3600 + float(duration_parts[1])*60 + float(duration_parts[2]) + except Exception as e: + print(f"[red]❌ Error: Failed to get audio duration: {e}[/red]") + duration = 0 return duration -def split_audio(audio_file: str, target_len: int = 50*60, win: int = 60) -> List[Tuple[float, float]]: +def split_audio(audio_file: str, target_len: int = 30*60, win: int = 60) -> List[Tuple[float, float]]: + # 30 min 16000 Hz 96kbps ~ 22MB < 25MB required by whisper print("[bold blue]🔪 Starting audio segmentation...[/]") duration = get_audio_duration(audio_file) @@ -121,8 +141,7 @@ def process_transcription(result: Dict) -> pd.DataFrame: def save_results(df: pd.DataFrame): os.makedirs('output/log', exist_ok=True) - excel_path = os.path.join('output/log', "cleaned_chunks.xlsx") - + # Remove rows where 'text' is empty initial_rows = len(df) df = df[df['text'].str.len() > 0] @@ -137,8 +156,8 @@ def save_results(df: pd.DataFrame): df = df[df['text'].str.len() <= 20] df['text'] = df['text'].apply(lambda x: f'"{x}"') - df.to_excel(excel_path, index=False) - print(f"📊 Excel file saved to {excel_path}") + df.to_excel(CLEANED_CHUNKS_EXCEL_PATH, index=False) + print(f"📊 Excel file saved to {CLEANED_CHUNKS_EXCEL_PATH}") def save_language(language: str): update_key("whisper.detected_language", language) \ No newline at end of file diff --git a/core/delete_retry_dubbing.py b/core/delete_retry_dubbing.py index 29f5013b..cfb81749 100644 --- a/core/delete_retry_dubbing.py +++ b/core/delete_retry_dubbing.py @@ -4,8 +4,8 @@ def delete_dubbing_files(): files_to_delete = [ - os.path.join("output", "trans_vocal_total.wav"), - os.path.join("output", "output_video_with_audio.mp4") + os.path.join("output", "dub.wav"), + os.path.join("output", "output_dub.mp4") ] for file_path in files_to_delete: diff --git a/core/pypi_autochoose.py b/core/pypi_autochoose.py new file mode 100644 index 00000000..0d3c4d25 --- /dev/null +++ b/core/pypi_autochoose.py @@ -0,0 
+1,110 @@ +import subprocess +import time +import requests +import os +import concurrent.futures +from rich.console import Console +from rich.table import Table +from rich.progress import Progress, SpinnerColumn, TextColumn +from rich.panel import Panel +import sys + +MIRRORS = { + "Tsinghua Mirror": "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple", + "PyPI Official": "https://pypi.org/simple" +} + +console = Console() + +FAST_THRESHOLD = 1000 # ms +SLOW_THRESHOLD = 1500 # ms + +def get_optimal_thread_count(): + try: + cpu_count = os.cpu_count() + return max(cpu_count - 1, 1) + except: + return 2 + +def test_mirror_speed(name, url): + try: + start_time = time.time() + response = requests.get(url, timeout=5) + end_time = time.time() + if response.status_code == 200: + speed = (end_time - start_time) * 1000 + return name, speed + else: + return name, float('inf') + except requests.RequestException: + return name, float('inf') + +def set_pip_mirror(url): + try: + subprocess.run([sys.executable, "-m", "pip", "config", "set", "global.index-url", url], + check=True, + capture_output=True) + return True + except subprocess.CalledProcessError as e: + print(f"Failed to set pip mirror: {e}") + return False + +def get_current_pip_mirror(): + try: + result = subprocess.run([sys.executable, "-m", "pip", "config", "get", "global.index-url"], + capture_output=True, text=True, check=True) + return result.stdout.strip() + except subprocess.CalledProcessError: + return None + +def main(): + console.print(Panel.fit("🚀 PyPI Mirror Speed Test", style="bold cyan")) + + # Test all mirrors simultaneously + speeds = {} + with Progress( + SpinnerColumn(), + TextColumn("[cyan]Testing mirrors...[/cyan]"), + ) as progress: + progress.add_task("", total=None) # Indeterminate spinner + + with concurrent.futures.ThreadPoolExecutor(max_workers=get_optimal_thread_count()) as executor: + future_to_mirror = {executor.submit(test_mirror_speed, name, url): name + for name, url in MIRRORS.items()} + + for future in concurrent.futures.as_completed(future_to_mirror): + name = future_to_mirror[future] + try: + name, speed = future.result() + if speed != float('inf'): + speeds[name] = speed + except Exception as exc: + print(f'{name} generated an exception: {exc}') + + # Results display + table = Table(show_header=False) + table.add_column(style="cyan") + table.add_column(justify="right", style="magenta") + + for name, speed in sorted(speeds.items(), key=lambda x: x[1]): + table.add_row(name, f"{speed:.0f}ms") + + console.print(table) + + if speeds: + fastest_mirror = min(speeds, key=speeds.get) + fastest_url = MIRRORS[fastest_mirror] + + if set_pip_mirror(fastest_url): + current_mirror = get_current_pip_mirror() + if current_mirror == fastest_url: + console.print(f"✅ Switched to {fastest_mirror}\n🔗 {fastest_url}", style="green") + else: + console.print(f"❌ Switch failed\nExpected: {fastest_url}\nCurrent: {current_mirror}\n💡 Try running with admin privileges", style="red") + else: + console.print(f"❌ Failed to switch mirror\n💡 Check permissions and try again", style="red") + else: + console.print("❌ All mirrors unreachable\n💡 Check network connection", style="red") + +if __name__ == "__main__": + main() diff --git a/core/step10_gen_audio.py b/core/step10_gen_audio.py index a56a096b..ac85f45c 100644 --- a/core/step10_gen_audio.py +++ b/core/step10_gen_audio.py @@ -1,173 +1,215 @@ -import os, sys -import pandas as pd -from tqdm import tqdm -import soundfile as sf +import os +import sys +import time +import shutil import 
subprocess +from typing import Tuple + +import pandas as pd +from pydub import AudioSegment from rich import print as rprint -from rich.panel import Panel from rich.console import Console -import time +from rich.progress import Progress +from concurrent.futures import ThreadPoolExecutor, as_completed + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from core.all_tts_functions.gpt_sovits_tts import gpt_sovits_tts_for_videolingo -from core.all_tts_functions.openai_tts import openai_tts -from core.all_tts_functions.fish_tts import fish_tts -from core.all_tts_functions.azure_tts import azure_tts -from core.prompts_storage import get_subtitle_trim_prompt -from core.ask_gpt import ask_gpt from core.config_utils import load_key +from core.all_whisper_methods.whisperX_utils import get_audio_duration +from core.all_tts_functions.tts_main import tts_main console = Console() TEMP_DIR = 'output/audio/tmp' SEGS_DIR = 'output/audio/segs' -TASKS_FILE = "output/audio/sovits_tasks.xlsx" +TASKS_FILE = "output/audio/tts_tasks.xlsx" +OUTPUT_FILE = "output/audio/tts_tasks.xlsx" TEMP_FILE_TEMPLATE = f"{TEMP_DIR}/{{}}_temp.wav" OUTPUT_FILE_TEMPLATE = f"{SEGS_DIR}/{{}}.wav" +WARMUP_SIZE = 5 -def check_wav_duration(file_path): - try: - audio_info = sf.info(file_path) - return audio_info.duration - except Exception as e: - raise Exception(f"Error checking duration: {str(e)}") - -def parse_srt_time(time_str): +def parse_df_srt_time(time_str: str) -> float: + """Convert SRT time format to seconds""" hours, minutes, seconds = time_str.strip().split(':') - seconds, milliseconds = seconds.split(',') + seconds, milliseconds = seconds.split('.') return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(milliseconds) / 1000 -def tts_main(text, save_as, number, task_df): - TTS_METHOD = load_key("tts_method") - if TTS_METHOD == 'openai_tts': - openai_tts(text, save_as) - elif TTS_METHOD == 'gpt_sovits': - #! 
注意 gpt_sovits_tts 只支持输出中文,输入中文或英文 - gpt_sovits_tts_for_videolingo(text, save_as, number, task_df) - elif TTS_METHOD == 'fish_tts': - fish_tts(text, save_as) - elif TTS_METHOD == 'azure_tts': - azure_tts(text, save_as) - -def generate_audio(text, target_duration, save_as, number, task_df): - MIN_SPEED = load_key("speed_factor.min") - MAX_SPEED = load_key("speed_factor.max") - os.makedirs(TEMP_DIR, exist_ok=True) - temp_file = TEMP_FILE_TEMPLATE.format(number) - - # handle empty text or nan - if pd.isna(text) or not str(text).strip(): - # generate silent audio - cmd = ['ffmpeg', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=mono', '-t', '0.1', '-q:a', '0', '-y', save_as] - subprocess.run(cmd, check=True, stderr=subprocess.PIPE) - rprint(f"ℹ️ {number} Generated silent audio for empty text: {save_as}") +def adjust_audio_speed(input_file: str, output_file: str, speed_factor: float) -> None: + """Adjust audio speed and handle edge cases""" + # If the speed factor is close to 1, directly copy the file + if abs(speed_factor - 1.0) < 0.001: + shutil.copy2(input_file, output_file) return - - tts_main(text, temp_file, number, task_df) - - original_duration = check_wav_duration(temp_file) - # -0.03 to avoid the duration is too close to the target_duration - speed_factor = original_duration / (target_duration-0.03) - - # Check speed factor and adjust audio speed - if MIN_SPEED <= speed_factor <= MAX_SPEED: - change_audio_speed(temp_file, save_as, speed_factor) - final_duration = check_wav_duration(save_as) - rprint(f"✅ {number} Adjusted audio: {save_as} | Duration: {final_duration:.2f}s | Required: {target_duration:.2f}s | Speed factor: {speed_factor:.2f}") - elif speed_factor < MIN_SPEED: - change_audio_speed(temp_file, save_as, MIN_SPEED) - final_duration = check_wav_duration(save_as) - rprint(f"⚠️ {number} Adjusted audio: {save_as} | Duration: {final_duration:.2f}s | Required: {target_duration:.2f}s | Speed factor: {MIN_SPEED}") - else: # speed_factor > MAX_SPEED - rprint(f"🚨 {number} Speed factor out of range: {speed_factor:.2f}, attempting to simplify subtitle...") - original_text = text - prompt = get_subtitle_trim_prompt(text, target_duration) - response = ask_gpt(prompt, response_json=True, log_title='subtitle_trim') - shortened_text = response['result'] - - rprint(f"Original subtitle: {original_text} | Simplified subtitle: {shortened_text}") - - tts_main(shortened_text, temp_file, number, task_df) - new_original_duration = check_wav_duration(temp_file) - new_speed_factor = new_original_duration / (target_duration-0.03) - - if MIN_SPEED <= new_speed_factor <= MAX_SPEED: - change_audio_speed(temp_file, save_as, new_speed_factor) - final_duration = check_wav_duration(save_as) - rprint(f"✅ {number} Adjusted audio: {save_as} | Duration: {final_duration:.2f}s | Required: {target_duration:.2f}s | Speed factor: {new_speed_factor:.2f}") - elif new_speed_factor > MAX_SPEED: - rprint(f"🚔 {number} Speed factor still out of range after simplification: {new_speed_factor:.2f}") - change_audio_speed(temp_file, save_as, new_speed_factor) #! 
force adjust - final_duration = check_wav_duration(save_as) - rprint(f"🚔 {number} Forced adjustment: {save_as} | Duration: {final_duration:.2f}s | Required: {target_duration:.2f}s | Speed factor: {new_speed_factor}") - elif new_speed_factor < MIN_SPEED: - rprint(f"⚠️ {number} Speed factor too low after simplification: {new_speed_factor:.2f}") - change_audio_speed(temp_file, save_as, MIN_SPEED) - final_duration = check_wav_duration(save_as) - rprint(f"⚠️ {number} Forced adjustment: {save_as} | Duration: {final_duration:.2f}s | Required: {target_duration:.2f}s | Speed factor: {MIN_SPEED}") - - #! check duration for safety - if final_duration > target_duration: - rprint(f"❎ {number} Final duration is longer than target duration: {final_duration:.2f}s | Required: {target_duration:.2f}s. This is a bug, please report it.") - raise Exception() - - if os.path.exists(temp_file): - os.remove(temp_file) - -def change_audio_speed(input_file, output_file, speed_factor): atempo = speed_factor cmd = ['ffmpeg', '-i', input_file, '-filter:a', f'atempo={atempo}', '-y', output_file] - - max_retries = 3 + input_duration = get_audio_duration(input_file) + max_retries = 2 for attempt in range(max_retries): try: subprocess.run(cmd, check=True, stderr=subprocess.PIPE) - return # Success, exit the function + output_duration = get_audio_duration(output_file) + expected_duration = input_duration / speed_factor + diff = output_duration - expected_duration + # If the output duration exceeds the expected duration, but the input audio is less than 3 seconds, and the error is within 0.1 seconds, truncate to the expected length + if output_duration >= expected_duration * 1.01 and input_duration < 3 and diff <= 0.1: + audio = AudioSegment.from_wav(output_file) + trimmed_audio = audio[:(expected_duration * 1000)] # pydub uses milliseconds + trimmed_audio.export(output_file, format="wav") + print(f"✂️ Trimmed to expected duration: {expected_duration:.2f} seconds") + return + elif output_duration >= expected_duration * 1.01: + raise Exception(f"Audio duration abnormal: input file={input_file}, output file={output_file}, speed factor={speed_factor}, input duration={input_duration:.2f}s, output duration={output_duration:.2f}s") + return except subprocess.CalledProcessError as e: - if attempt < max_retries - 1: # If it's not the last attempt - rprint(f"[yellow]Warning: Failed to change audio speed, retrying in 1 second (Attempt {attempt + 1}/{max_retries})[/yellow]") + if attempt < max_retries - 1: + rprint(f"[yellow]⚠️ Audio speed adjustment failed, retrying in 1s ({attempt + 1}/{max_retries})[/yellow]") time.sleep(1) else: - rprint(f"[red]Error: Failed to change audio speed, maximum retry attempts reached ({max_retries})[/red]") - raise e # Re-raise the exception if all retries failed - -def process_sovits_tasks(): - tasks_df = pd.read_excel(TASKS_FILE) - errors = [] - os.makedirs(SEGS_DIR, exist_ok=True) - - with console.status("[bold green]Processing tasks...") as status: - for _, row in tqdm(tasks_df.iterrows(), total=len(tasks_df)): - out_file = OUTPUT_FILE_TEMPLATE.format(row["number"]) - if os.path.exists(out_file): - rprint(f"[yellow]File {out_file} already exists, skipping[/yellow]") - continue - try: - generate_audio(row['text'], float(row['duration']), out_file, row['number'], tasks_df) - except Exception as e: - errors.append(row['number']) - rprint(Panel(f"Error processing task {row['number']}: {str(e)}", title="Error", border_style="red")) - - if errors: - # Retry once, sometimes there might be network issues or 
file I/O errors - rprint(Panel(f"The following tasks encountered errors, retrying: {', '.join(map(str, errors))}", title="Retry", border_style="yellow")) - retry_tasks = errors.copy() - errors.clear() - for task_number in retry_tasks: - row = tasks_df[tasks_df['number'] == task_number].iloc[0] - out_file = OUTPUT_FILE_TEMPLATE.format(row["number"]) + rprint(f"[red]❌ Audio speed adjustment failed, max retries reached ({max_retries})[/red]") + raise e + +def process_row(row: pd.Series, tasks_df: pd.DataFrame) -> Tuple[int, float]: + """Helper function for processing single row data""" + number = row['number'] + lines = eval(row['lines']) if isinstance(row['lines'], str) else row['lines'] + real_dur = 0 + for line_index, line in enumerate(lines): + temp_file = TEMP_FILE_TEMPLATE.format(f"{number}_{line_index}") + tts_main(line, temp_file, number, tasks_df) + real_dur += get_audio_duration(temp_file) + return number, real_dur + +def generate_tts_audio(tasks_df: pd.DataFrame) -> pd.DataFrame: + """Generate TTS audio sequentially and calculate actual duration""" + tasks_df['real_dur'] = 0 + rprint("[bold green]🎯 Starting TTS audio generation...[/bold green]") + + with Progress() as progress: + task = progress.add_task("[cyan]🔄 Generating TTS audio...", total=len(tasks_df)) + + # warm up for first 5 rows + warmup_size = min(WARMUP_SIZE, len(tasks_df)) + for _, row in tasks_df.head(warmup_size).iterrows(): try: - generate_audio(row['text'], float(row['duration']), out_file, row['number'], tasks_df) + number, real_dur = process_row(row, tasks_df) + tasks_df.loc[tasks_df['number'] == number, 'real_dur'] = real_dur + progress.advance(task) except Exception as e: - errors.append(row['number']) - rprint(Panel(f"Error retrying task {row['number']}: {str(e)}", title="Error", border_style="red")) + rprint(f"[red]❌ Error in warmup: {str(e)}[/red]") + raise e + + # parallel processing for remaining tasks + if len(tasks_df) > warmup_size: + remaining_tasks = tasks_df.iloc[warmup_size:].copy() + with ThreadPoolExecutor() as executor: + futures = [ + executor.submit(process_row, row, tasks_df.copy()) + for _, row in remaining_tasks.iterrows() + ] + + for future in as_completed(futures): + try: + number, real_dur = future.result() + tasks_df.loc[tasks_df['number'] == number, 'real_dur'] = real_dur + progress.advance(task) + except Exception as e: + rprint(f"[red]❌ Error: {str(e)}[/red]") + raise e + + rprint("[bold green]✨ TTS audio generation completed![/bold green]") + return tasks_df + +def process_chunk(chunk_df: pd.DataFrame, accept: float, min_speed: float) -> tuple[float, bool]: + """Process audio chunk and calculate speed factor""" + chunk_durs = chunk_df['real_dur'].sum() + tol_durs = chunk_df['tol_dur'].sum() + durations = tol_durs - chunk_df.iloc[-1]['tolerance'] + all_gaps = chunk_df['gap'].sum() - chunk_df.iloc[-1]['gap'] + + keep_gaps = True + speed_var_error = 0.1 + + if (chunk_durs + all_gaps) / accept < durations: + speed_factor = max(min_speed, (chunk_durs + all_gaps) / (durations-speed_var_error)) + elif chunk_durs / accept < durations: + speed_factor = max(min_speed, chunk_durs / (durations-speed_var_error)) + keep_gaps = False + elif (chunk_durs + all_gaps) / accept < tol_durs: + speed_factor = max(min_speed, (chunk_durs + all_gaps) / (tol_durs-speed_var_error)) + else: + speed_factor = chunk_durs / (tol_durs-speed_var_error) + keep_gaps = False + + return round(speed_factor, 3), keep_gaps + +def merge_chunks(tasks_df: pd.DataFrame) -> pd.DataFrame: + """Merge audio chunks and adjust 
timeline""" + rprint("[bold blue]🔄 Starting audio chunks processing...[/bold blue]") + accept = load_key("speed_factor.accept") + min_speed = load_key("speed_factor.min") + chunk_start = 0 + + tasks_df['new_sub_times'] = None + + for index, row in tasks_df.iterrows(): + if row['cut_off'] == 1: + chunk_df = tasks_df.iloc[chunk_start:index+1].reset_index(drop=True) + speed_factor, keep_gaps = process_chunk(chunk_df, accept, min_speed) + + # 🎯 Step1: Start processing new timeline + chunk_start_time = parse_df_srt_time(chunk_df.iloc[0]['start_time']) + chunk_end_time = parse_df_srt_time(chunk_df.iloc[-1]['end_time']) + chunk_df.iloc[-1]['tolerance'] # 加上tolerance才是这一块的结束 + cur_time = chunk_start_time + for i, row in chunk_df.iterrows(): + # If i is not 0, which is not the first row of the chunk, cur_time needs to be added with the gap of the previous row, remember to divide by speed_factor + if i != 0 and keep_gaps: + cur_time += chunk_df.iloc[i-1]['gap']/speed_factor + new_sub_times = [] + number = row['number'] + lines = eval(row['lines']) if isinstance(row['lines'], str) else row['lines'] + for line_index, line in enumerate(lines): + # 🔄 Step2: Start speed change and save as OUTPUT_FILE_TEMPLATE + temp_file = TEMP_FILE_TEMPLATE.format(f"{number}_{line_index}") + output_file = OUTPUT_FILE_TEMPLATE.format(f"{number}_{line_index}") + adjust_audio_speed(temp_file, output_file, speed_factor) + ad_dur = get_audio_duration(output_file) + new_sub_times.append([cur_time, cur_time+ad_dur]) + cur_time += ad_dur + # 🔄 Step3: Find corresponding main DataFrame index and update new_sub_times + main_df_idx = tasks_df[tasks_df['number'] == row['number']].index[0] + tasks_df.at[main_df_idx, 'new_sub_times'] = new_sub_times + # 🎯 Step4: Choose emoji based on speed_factor and accept comparison + emoji = "⚡" if speed_factor <= accept else "⚠️" + rprint(f"[cyan]{emoji} Processed chunk {chunk_start} to {index} with speed factor {speed_factor}[/cyan]") + # 🔄 Step5: Check if the last row exceeds the range + if cur_time > chunk_end_time: + raise Exception(f"Chunk {chunk_start} to {index} exceeds the chunk end time {chunk_end_time:.2f} seconds with current time {cur_time:.2f} seconds") + chunk_start = index+1 + + rprint("[bold green]✅ Audio chunks processing completed![/bold green]") + return tasks_df - if errors: - error_msg = f"The following tasks failed to process: {', '.join(map(str, errors))}" - rprint(Panel(error_msg, title="Failed Tasks", border_style="red")) - raise Exception("tasks failed to process, please check cli output for details") +def gen_audio() -> None: + """Main function: Generate audio and process timeline""" + rprint("[bold magenta]🚀 Starting audio generation process...[/bold magenta]") + + # 🎯 Step1: Create necessary directories + os.makedirs(TEMP_DIR, exist_ok=True) + os.makedirs(SEGS_DIR, exist_ok=True) + + # 📝 Step2: Load task file + tasks_df = pd.read_excel(TASKS_FILE) + rprint("[green]📊 Loaded task file successfully[/green]") + + # 🔊 Step3: Generate TTS audio + tasks_df = generate_tts_audio(tasks_df) + + # 🔄 Step4: Merge audio chunks + tasks_df = merge_chunks(tasks_df) - rprint(Panel("Task processing completed", title="Success", border_style="green")) + # 💾 Step5: Save results + tasks_df.to_excel(OUTPUT_FILE, index=False) + rprint("[bold green]🎉 Audio generation completed successfully![/bold green]") if __name__ == "__main__": - process_sovits_tasks() \ No newline at end of file + gen_audio() diff --git a/core/step11_merge_audio_to_vid.py b/core/step11_merge_audio_to_vid.py deleted file 
mode 100644 index 04310dde..00000000 --- a/core/step11_merge_audio_to_vid.py +++ /dev/null @@ -1,110 +0,0 @@ -import sys, os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from core.config_utils import load_key -from datetime import datetime -import pandas as pd -import subprocess -from pydub import AudioSegment -from rich import print as rprint -import numpy as np -import soundfile as sf -import cv2 -from core.all_whisper_methods.demucs_vl import BACKGROUND_AUDIO_FILE -from core.step7_merge_sub_to_vid import check_gpu_available - -INPUT_EXCEL = 'output/audio/sovits_tasks.xlsx' -OUTPUT_AUDIO = 'output/trans_vocal_total.wav' -VIDEO_FILE = "output/output_video_with_subs.mp4" -OUTPUT_VIDEO = "output/output_video_with_audio.mp4" - -def time_to_datetime(time_str): - return datetime.strptime(time_str, '%H:%M:%S.%f') - -def create_silence(duration, output_file): - sample_rate = 32000 - num_samples = int(duration * sample_rate) - silence = np.zeros(num_samples, dtype=np.float32) - sf.write(output_file, silence, sample_rate) - -def merge_all_audio(): - # Define input and output paths - input_excel = INPUT_EXCEL - output_audio = OUTPUT_AUDIO - - df = pd.read_excel(input_excel) - - # Get the sample rate of the first audio file - first_audio = f'output/audio/segs/{df.iloc[0]["number"]}.wav' - sample_rate = AudioSegment.from_wav(first_audio).frame_rate - - # Create an empty AudioSegment object - merged_audio = AudioSegment.silent(duration=0, frame_rate=sample_rate) - - prev_target_start_time = None - prev_actual_duration = 0 - - for index, row in df.iterrows(): - number = row['number'] - start_time = row['start_time'] - input_audio = f'output/audio/segs/{number}.wav' - - if not os.path.exists(input_audio): - rprint(f"[bold yellow]Warning: File {input_audio} does not exist, skipping this file.[/bold yellow]") - continue - - audio_segment = AudioSegment.from_wav(input_audio) - actual_duration = len(audio_segment) / 1000 # Convert to seconds - target_start_time = time_to_datetime(start_time) - - silence_duration = (target_start_time - datetime(1900, 1, 1)).total_seconds() if prev_target_start_time is None else (target_start_time - prev_target_start_time).total_seconds() - prev_actual_duration - - if silence_duration > 0: - silence = AudioSegment.silent(duration=int(silence_duration * 1000), frame_rate=sample_rate) - merged_audio += silence - - merged_audio += audio_segment - - prev_target_start_time = target_start_time - prev_actual_duration = actual_duration - - # Export the merged audio - merged_audio.export(output_audio, format="wav") - rprint(f"[bold green]Audio file successfully merged, output file: {output_audio}[/bold green]") - -def merge_video_audio(): - """Merge video and audio, and reduce video volume""" - background_file = BACKGROUND_AUDIO_FILE - - if load_key("resolution") == '0x0': - rprint("[bold yellow]Warning: A 0-second black video will be generated as a placeholder as Resolution is set to 0x0.[/bold yellow]") - - # Create a black frame - frame = np.zeros((1080, 1920, 3), dtype=np.uint8) - fourcc = cv2.VideoWriter_fourcc(*'mp4v') - out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, 1, (1920, 1080)) - out.write(frame) - out.release() - - rprint("[bold green]Placeholder video has been generated.[/bold green]") - return - - # Merge video and audio - dub_volume = load_key("dub_volume") - cmd = ['ffmpeg', '-y', '-i', VIDEO_FILE, '-i', background_file, '-i', OUTPUT_AUDIO, - '-filter_complex', 
f'[1:a]volume=1[a1];[2:a]volume={dub_volume}[a2];[a1][a2]amix=inputs=2:duration=first:dropout_transition=3[a]'] - - if check_gpu_available(): - rprint("[bold green]Using GPU acceleration...[/bold green]") - cmd.extend(['-c:v', 'h264_nvenc']) - - cmd.extend(['-map', '0:v', '-map', '[a]', '-c:a', 'aac', '-b:a', '192k', OUTPUT_VIDEO]) - - subprocess.run(cmd) - rprint(f"[bold green]Video and audio successfully merged into {OUTPUT_VIDEO}[/bold green]") - -def merge_main(): - merge_all_audio() - merge_video_audio() - -if __name__ == "__main__": - merge_main() \ No newline at end of file diff --git a/core/step11_merge_full_audio.py b/core/step11_merge_full_audio.py new file mode 100644 index 00000000..384937d5 --- /dev/null +++ b/core/step11_merge_full_audio.py @@ -0,0 +1,144 @@ +import sys, os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import pandas as pd +import subprocess +from pydub import AudioSegment +from rich import print as rprint +from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn +from rich.console import Console +console = Console() + +INPUT_EXCEL = 'output/audio/tts_tasks.xlsx' +DUB_VOCAL_FILE = 'output/dub.mp3' + +DUB_SUB_FILE = 'output/dub.srt' +SEGS_DIR = 'output/audio/segs' +OUTPUT_FILE_TEMPLATE = f"{SEGS_DIR}/{{}}.wav" + +def load_and_flatten_data(excel_file): + """Load and flatten Excel data""" + df = pd.read_excel(excel_file) + lines = [eval(line) if isinstance(line, str) else line for line in df['lines'].tolist()] + lines = [item for sublist in lines for item in sublist] + + new_sub_times = [eval(time) if isinstance(time, str) else time for time in df['new_sub_times'].tolist()] + new_sub_times = [item for sublist in new_sub_times for item in sublist] + + return df, lines, new_sub_times + +def get_audio_files(df): + """Generate a list of audio file paths""" + audios = [] + for index, row in df.iterrows(): + number = row['number'] + line_count = len(eval(row['lines']) if isinstance(row['lines'], str) else row['lines']) + for line_index in range(line_count): + temp_file = OUTPUT_FILE_TEMPLATE.format(f"{number}_{line_index}") + audios.append(temp_file) + return audios + +def process_audio_segment(audio_file): + """Process a single audio segment with MP3 compression""" + temp_file = f"{audio_file}_temp.mp3" + ffmpeg_cmd = [ + 'ffmpeg', '-y', + '-i', audio_file, + '-ar', '16000', # 固定采样率为16kHz + '-ac', '1', # 单声道 + '-b:a', '64k', # 比特率64kbps + temp_file + ] + subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + audio_segment = AudioSegment.from_mp3(temp_file) + os.remove(temp_file) + return audio_segment + +def merge_audio_segments(audios, new_sub_times, sample_rate): + merged_audio = AudioSegment.silent(duration=0, frame_rate=sample_rate) + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + ) as progress: + merge_task = progress.add_task("🎵 Merging audio segments...", total=len(audios)) + + for i, (audio_file, time_range) in enumerate(zip(audios, new_sub_times)): + if not os.path.exists(audio_file): + console.print(f"[bold yellow]⚠️ Warning: File {audio_file} does not exist, skipping...[/bold yellow]") + progress.advance(merge_task) + continue + + audio_segment = process_audio_segment(audio_file) + start_time, end_time = time_range + + # Add silence segment + if i > 0: + prev_end = new_sub_times[i-1][1] + silence_duration = start_time - prev_end + if silence_duration > 0: + silence = 
AudioSegment.silent(duration=int(silence_duration * 1000), frame_rate=sample_rate) + merged_audio += silence + elif start_time > 0: + silence = AudioSegment.silent(duration=int(start_time * 1000), frame_rate=sample_rate) + merged_audio += silence + + merged_audio += audio_segment + progress.advance(merge_task) + + return merged_audio + +def create_srt_subtitle(): + df, lines, new_sub_times = load_and_flatten_data(INPUT_EXCEL) + + with open(DUB_SUB_FILE, 'w', encoding='utf-8') as f: + for i, ((start_time, end_time), line) in enumerate(zip(new_sub_times, lines), 1): + start_str = f"{int(start_time//3600):02d}:{int((start_time%3600)//60):02d}:{int(start_time%60):02d},{int((start_time*1000)%1000):03d}" + end_str = f"{int(end_time//3600):02d}:{int((end_time%3600)//60):02d}:{int(end_time%60):02d},{int((end_time*1000)%1000):03d}" + + f.write(f"{i}\n") + f.write(f"{start_str} --> {end_str}\n") + f.write(f"{line}\n\n") + + rprint(f"[bold green]✅ Subtitle file created: {DUB_SUB_FILE}[/bold green]") + +def merge_full_audio(): + """Main function: Process the complete audio merging process""" + console.print("\n[bold cyan]🎬 Starting audio merging process...[/bold cyan]") + + with console.status("[bold cyan]📊 Loading data from Excel...[/bold cyan]"): + df, lines, new_sub_times = load_and_flatten_data(INPUT_EXCEL) + console.print("[bold green]✅ Data loaded successfully[/bold green]") + + with console.status("[bold cyan]🔍 Getting audio file list...[/bold cyan]"): + audios = get_audio_files(df) + console.print(f"[bold green]✅ Found {len(audios)} audio segments[/bold green]") + + with console.status("[bold cyan]📝 Generating subtitle file...[/bold cyan]"): + create_srt_subtitle() + + if not os.path.exists(audios[0]): + console.print(f"[bold red]❌ Error: First audio file {audios[0]} does not exist![/bold red]") + return + + with console.status("[bold cyan]🎚️ Getting sample rate...[/bold cyan]"): + detected_rate = AudioSegment.from_wav(audios[0]).frame_rate + sample_rate = min(16000, detected_rate) + console.print(f"[bold green]✅ Sample rate: {sample_rate}Hz (detected: {detected_rate}Hz)[/bold green]") + + console.print("[bold cyan]🔄 Starting audio merge process...[/bold cyan]") + merged_audio = merge_audio_segments(audios, new_sub_times, sample_rate) + + with console.status("[bold cyan]💾 Exporting final audio file...[/bold cyan]"): + merged_audio = merged_audio.set_frame_rate(16000).set_channels(1) + merged_audio.export( + DUB_VOCAL_FILE, + format="mp3", + parameters=["-b:a", "64k"] + ) + console.print(f"[bold green]✅ Audio file successfully merged![/bold green]") + console.print(f"[bold green]📁 Output file: {DUB_VOCAL_FILE}[/bold green]") + +if __name__ == "__main__": + merge_full_audio() \ No newline at end of file diff --git a/core/step12_merge_dub_to_vid.py b/core/step12_merge_dub_to_vid.py new file mode 100644 index 00000000..f261b97f --- /dev/null +++ b/core/step12_merge_dub_to_vid.py @@ -0,0 +1,82 @@ +import os +import sys +import platform +import subprocess + +import numpy as np +import cv2 +from rich import print as rprint + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from core.all_whisper_methods.demucs_vl import BACKGROUND_AUDIO_FILE +from core.step7_merge_sub_to_vid import check_gpu_available +from core.config_utils import load_key +from core.step1_ytdlp import find_video_files + +DUB_VIDEO = "output/output_dub.mp4" +DUB_SUB_FILE = 'output/dub.srt' +DUB_AUDIO = 'output/dub.mp3' + +TRANS_FONT_SIZE = 20 +TRANS_FONT_NAME = 'Arial' +if platform.system() == 'Linux': + 
TRANS_FONT_NAME = 'NotoSansCJK-Regular' + +TRANS_FONT_COLOR = '&H00FFFF' +TRANS_OUTLINE_COLOR = '&H000000' +TRANS_OUTLINE_WIDTH = 1 +TRANS_BACK_COLOR = '&H33000000' + +def merge_video_audio(): + """Merge video and audio, and reduce video volume""" + VIDEO_FILE = find_video_files() + background_file = BACKGROUND_AUDIO_FILE + + if load_key("resolution") == '0x0': + rprint("[bold yellow]Warning: A 0-second black video will be generated as a placeholder as Resolution is set to 0x0.[/bold yellow]") + + # Create a black frame + frame = np.zeros((1080, 1920, 3), dtype=np.uint8) + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter(DUB_VIDEO, fourcc, 1, (1920, 1080)) + out.write(frame) + out.release() + + rprint("[bold green]Placeholder video has been generated.[/bold green]") + return + + # Merge video and audio with translated subtitles + dub_volume = load_key("dub_volume") + resolution = load_key("resolution") + target_width, target_height = resolution.split('x') + + subtitle_filter = ( + f"subtitles={DUB_SUB_FILE}:force_style='FontSize={TRANS_FONT_SIZE}," + f"FontName={TRANS_FONT_NAME},PrimaryColour={TRANS_FONT_COLOR}," + f"OutlineColour={TRANS_OUTLINE_COLOR},OutlineWidth={TRANS_OUTLINE_WIDTH}," + f"BackColour={TRANS_BACK_COLOR},Alignment=2,MarginV=27,BorderStyle=4'" + ) + + cmd = [ + 'ffmpeg', '-y', '-i', VIDEO_FILE, '-i', background_file, '-i', DUB_AUDIO, + '-filter_complex', + f'[0:v]scale={target_width}:{target_height}:force_original_aspect_ratio=decrease,' + f'pad={target_width}:{target_height}:(ow-iw)/2:(oh-ih)/2,' + f'{subtitle_filter}[v];' + f'[1:a]volume=1[a1];[2:a]volume={dub_volume}[a2];' + f'[a1][a2]amix=inputs=2:duration=first:dropout_transition=3[a]' + ] + + if check_gpu_available(): + rprint("[bold green]Using GPU acceleration...[/bold green]") + cmd.extend(['-map', '[v]', '-map', '[a]', '-c:v', 'h264_nvenc']) + else: + cmd.extend(['-map', '[v]', '-map', '[a]']) + + cmd.extend(['-c:a', 'aac', '-b:a', '192k', DUB_VIDEO]) + + subprocess.run(cmd) + rprint(f"[bold green]Video and audio successfully merged into {DUB_VIDEO}[/bold green]") + +if __name__ == '__main__': + merge_video_audio() diff --git a/core/step2_whisperX.py b/core/step2_whisperX.py index c0a4d6f3..064b7750 100644 --- a/core/step2_whisperX.py +++ b/core/step2_whisperX.py @@ -11,15 +11,48 @@ from rich import print as rprint import subprocess import tempfile +import time from core.config_utils import load_key from core.all_whisper_methods.demucs_vl import demucs_main, RAW_AUDIO_FILE, VOCAL_AUDIO_FILE -from core.all_whisper_methods.whisperX_utils import process_transcription, convert_video_to_audio, split_audio, save_results, save_language +from core.all_whisper_methods.whisperX_utils import process_transcription, convert_video_to_audio, split_audio, save_results, save_language, compress_audio, CLEANED_CHUNKS_EXCEL_PATH from core.step1_ytdlp import find_video_files MODEL_DIR = load_key("model_dir") +WHISPER_FILE = "output/audio/for_whisper.mp3" + +def check_hf_mirror() -> str: + """Check and return the fastest HF mirror""" + mirrors = { + 'Official': 'huggingface.co', + 'Mirror': 'hf-mirror.com' + } + fastest_url = f"https://{mirrors['Official']}" + best_time = float('inf') + rprint("[cyan]🔍 Checking HuggingFace mirrors...[/cyan]") + for name, domain in mirrors.items(): + try: + if os.name == 'nt': + cmd = ['ping', '-n', '1', '-w', '3000', domain] + else: + cmd = ['ping', '-c', '1', '-W', '3', domain] + start = time.time() + result = subprocess.run(cmd, capture_output=True, text=True) + response_time = 
time.time() - start + if result.returncode == 0: + if response_time < best_time: + best_time = response_time + fastest_url = f"https://{domain}" + rprint(f"[green]✓ {name}:[/green] {response_time:.2f}s") + except: + rprint(f"[red]✗ {name}:[/red] Failed to connect") + if best_time == float('inf'): + rprint("[yellow]⚠️ All mirrors failed, using default[/yellow]") + rprint(f"[cyan]🚀 Selected mirror:[/cyan] {fastest_url} ({best_time:.2f}s)") + return fastest_url def transcribe_audio(audio_file: str, start: float, end: float) -> Dict: + os.environ['HF_ENDPOINT'] = check_hf_mirror() #? don't know if it's working... WHISPER_LANGUAGE = load_key("whisper.language") device = "cuda" if torch.cuda.is_available() else "cpu" rprint(f"🚀 Starting WhisperX using device: {device} ...") @@ -40,8 +73,8 @@ def transcribe_audio(audio_file: str, start: float, end: float) -> Dict: model_name = "Huan69/Belle-whisper-large-v3-zh-punct-fasterwhisper" local_model = os.path.join(MODEL_DIR, "Belle-whisper-large-v3-zh-punct-fasterwhisper") else: - model_name = "large-v3" - local_model = os.path.join(MODEL_DIR, "large-v3") + model_name = load_key("whisper.model") + local_model = os.path.join(MODEL_DIR, model_name) if os.path.exists(local_model): rprint(f"[green]📥 Loading local WHISPER model:[/green] {local_model} ...") @@ -49,14 +82,8 @@ def transcribe_audio(audio_file: str, start: float, end: float) -> Dict: else: rprint(f"[green]📥 Using WHISPER model from HuggingFace:[/green] {model_name} ...") - vad_options = { - "vad_onset": 0.500, - "vad_offset": 0.363 - } - asr_options = { - "temperatures": [0], - "initial_prompt": "", - } + vad_options = {"vad_onset": 0.500,"vad_offset": 0.363} + asr_options = {"temperatures": [0],"initial_prompt": "",} whisper_language = None if 'auto' in WHISPER_LANGUAGE else WHISPER_LANGUAGE rprint("[bold yellow]**You can ignore warning of `Model was trained with torch 1.10.0+cu102, yours is 2.0.0+cu118...`**[/bold yellow]") model = whisperx.load_model(model_name, device, compute_type=compute_type, language=whisper_language, vad_options=vad_options, asr_options=asr_options, download_root=MODEL_DIR) @@ -108,7 +135,7 @@ def transcribe_audio(audio_file: str, start: float, end: float) -> Dict: raise def transcribe(): - if os.path.exists("output/log/cleaned_chunks.xlsx"): + if os.path.exists(CLEANED_CHUNKS_EXCEL_PATH): rprint("[yellow]⚠️ Transcription results already exist, skipping transcription step.[/yellow]") return @@ -120,22 +147,25 @@ def transcribe(): if load_key("demucs"): demucs_main() - whisper_file = VOCAL_AUDIO_FILE if load_key("demucs") else RAW_AUDIO_FILE + # step2 Compress audio + choose_audio = VOCAL_AUDIO_FILE if load_key("demucs") else RAW_AUDIO_FILE + whisper_audio = compress_audio(choose_audio, WHISPER_FILE) - # step2 Extract audio - segments = split_audio(whisper_file) + # step3 Extract audio + segments = split_audio(whisper_audio) - # step3 Transcribe audio + # step4 Transcribe audio all_results = [] for start, end in segments: - result = transcribe_audio(whisper_file, start, end) + result = transcribe_audio(whisper_audio, start, end) all_results.append(result) - # step4 Combine results + # step5 Combine results combined_result = {'segments': []} for result in all_results: combined_result['segments'].extend(result['segments']) + # step6 Process df df = process_transcription(combined_result) save_results(df) diff --git a/core/step4_1_summarize.py b/core/step4_1_summarize.py index d9604ddb..6a9de493 100644 --- a/core/step4_1_summarize.py +++ b/core/step4_1_summarize.py @@ -12,7 
+12,7 @@ def combine_chunks(): sentences = file.readlines() cleaned_sentences = [line.strip() for line in sentences] combined_text = ' '.join(cleaned_sentences) - return combined_text[:16000] #! Return only the first 16000 characters + return combined_text[:32000] #! Return only the first 32000 characters def search_things_to_note_in_prompt(sentence): """Search for terms to note in the given sentence""" diff --git a/core/step4_2_translate_all.py b/core/step4_2_translate_all.py index aa0fcdd0..8c9ac707 100644 --- a/core/step4_2_translate_all.py +++ b/core/step4_2_translate_all.py @@ -5,7 +5,7 @@ import concurrent.futures from core.translate_once import translate_lines from core.step4_1_summarize import search_things_to_note_in_prompt -from core.step8_gen_audio_task import check_len_then_trim +from core.step8_1_gen_audio_task import check_len_then_trim from core.step6_generate_final_timeline import align_timestamp from core.config_utils import load_key from rich.console import Console diff --git a/core/step5_splitforsub.py b/core/step5_splitforsub.py index b567c8f1..212b31b3 100644 --- a/core/step5_splitforsub.py +++ b/core/step5_splitforsub.py @@ -67,43 +67,37 @@ def valid_align(response_data): return src_parts, tr_parts, tr_remerged -def split_align_subs(src_lines: List[str], tr_lines: List[str], max_retry=5) -> Tuple[List[str], List[str], List[str]]: +def split_align_subs(src_lines: List[str], tr_lines: List[str]) -> Tuple[List[str], List[str], List[str]]: subtitle_set = load_key("subtitle") MAX_SUB_LENGTH = subtitle_set["max_length"] TARGET_SUB_MULTIPLIER = subtitle_set["target_multiplier"] remerged_tr_lines = tr_lines.copy() - for attempt in range(max_retry): - console.print(Panel(f"🔄 Split attempt {attempt + 1}", expand=False)) - to_split = [] - - for i, (src, tr) in enumerate(zip(src_lines, tr_lines)): - src, tr = str(src), str(tr) - if len(src) > MAX_SUB_LENGTH or calc_len(tr) * TARGET_SUB_MULTIPLIER > MAX_SUB_LENGTH: - to_split.append(i) - table = Table(title=f"📏 Line {i} needs to be split") - table.add_column("Type", style="cyan") - table.add_column("Content", style="magenta") - table.add_row("Source Line", src) - table.add_row("Target Line", tr) - console.print(table) - - def process(i): - split_src = split_sentence(src_lines[i], num_parts=2).strip() - src_parts, tr_parts, tr_remerged = align_subs(src_lines[i], tr_lines[i], split_src) - src_lines[i] = src_parts - tr_lines[i] = tr_parts - remerged_tr_lines[i] = tr_remerged - - with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor: - executor.map(process, to_split) - - # Flatten `src_lines` and `tr_lines` - src_lines = [item for sublist in src_lines for item in (sublist if isinstance(sublist, list) else [sublist])] - tr_lines = [item for sublist in tr_lines for item in (sublist if isinstance(sublist, list) else [sublist])] - - if all(len(src) <= MAX_SUB_LENGTH for src in src_lines) and all(calc_len(tr) * TARGET_SUB_MULTIPLIER <= MAX_SUB_LENGTH for tr in tr_lines): - break + to_split = [] + for i, (src, tr) in enumerate(zip(src_lines, tr_lines)): + src, tr = str(src), str(tr) + if len(src) > MAX_SUB_LENGTH or calc_len(tr) * TARGET_SUB_MULTIPLIER > MAX_SUB_LENGTH: + to_split.append(i) + table = Table(title=f"📏 Line {i} needs to be split") + table.add_column("Type", style="cyan") + table.add_column("Content", style="magenta") + table.add_row("Source Line", src) + table.add_row("Target Line", tr) + console.print(table) + + def process(i): + split_src = split_sentence(src_lines[i], 
num_parts=2).strip() + src_parts, tr_parts, tr_remerged = align_subs(src_lines[i], tr_lines[i], split_src) + src_lines[i] = src_parts + tr_lines[i] = tr_parts + remerged_tr_lines[i] = tr_remerged + + with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor: + executor.map(process, to_split) + + # Flatten `src_lines` and `tr_lines` + src_lines = [item for sublist in src_lines for item in (sublist if isinstance(sublist, list) else [sublist])] + tr_lines = [item for sublist in tr_lines for item in (sublist if isinstance(sublist, list) else [sublist])] return src_lines, tr_lines, remerged_tr_lines @@ -114,12 +108,25 @@ def split_for_sub_main(): src = df['Source'].tolist() trans = df['Translation'].tolist() - split_src, split_trans, remerged = split_align_subs(src.copy(), trans, max_retry=3) + subtitle_set = load_key("subtitle") + MAX_SUB_LENGTH = subtitle_set["max_length"] + TARGET_SUB_MULTIPLIER = subtitle_set["target_multiplier"] + for attempt in range(3): # 使用固定的3次重试 + console.print(Panel(f"🔄 Split attempt {attempt + 1}", expand=False)) + split_src, split_trans, remerged = split_align_subs(src.copy(), trans) + + # 检查是否所有字幕都符合长度要求 + if all(len(src) <= MAX_SUB_LENGTH for src in split_src) and \ + all(calc_len(tr) * TARGET_SUB_MULTIPLIER <= MAX_SUB_LENGTH for tr in split_trans): + break + + # 更新源数据继续下一轮分割 + src = split_src + trans = split_trans + pd.DataFrame({'Source': split_src, 'Translation': split_trans}).to_excel(OUTPUT_SPLIT_FILE, index=False) pd.DataFrame({'Source': src, 'Translation': remerged}).to_excel(OUTPUT_REMERGED_FILE, index=False) - - console.print("[bold green]✅ Subtitles splitting and remerging completed![/bold green]") if __name__ == '__main__': split_for_sub_main() diff --git a/core/step7_merge_sub_to_vid.py b/core/step7_merge_sub_to_vid.py index 07641bb1..dd9bd7ac 100644 --- a/core/step7_merge_sub_to_vid.py +++ b/core/step7_merge_sub_to_vid.py @@ -27,7 +27,7 @@ TRANS_BACK_COLOR = '&H33000000' OUTPUT_DIR = "output" -OUTPUT_VIDEO = f"{OUTPUT_DIR}/output_video_with_subs.mp4" +OUTPUT_VIDEO = f"{OUTPUT_DIR}/output_sub.mp4" SRC_SRT = f"{OUTPUT_DIR}/src.srt" TRANS_SRT = f"{OUTPUT_DIR}/trans.srt" diff --git a/core/step8_gen_audio_task.py b/core/step8_1_gen_audio_task.py similarity index 63% rename from core/step8_gen_audio_task.py rename to core/step8_1_gen_audio_task.py index 494de96f..3b63499c 100644 --- a/core/step8_gen_audio_task.py +++ b/core/step8_1_gen_audio_task.py @@ -9,38 +9,23 @@ from rich.panel import Panel from rich.console import Console from core.config_utils import load_key +from core.all_tts_functions.estimate_duration import init_estimator, estimate_duration console = Console() speed_factor = load_key("speed_factor") TRANS_SUBS_FOR_AUDIO_FILE = 'output/audio/trans_subs_for_audio.srt' SRC_SUBS_FOR_AUDIO_FILE = 'output/audio/src_subs_for_audio.srt' -SOVITS_TASKS_FILE = 'output/audio/sovits_tasks.xlsx' +SOVITS_TASKS_FILE = 'output/audio/tts_tasks.xlsx' +ESTIMATOR = None def check_len_then_trim(text, duration): - multiplier = speed_factor['normal'] * speed_factor['max'] - # Define speech speed: characters/second or words/second, punctuation/second - speed_zh_ja = 4 * multiplier # Chinese and Japanese characters per second - speed_en_and_others = 5 * multiplier # Words per second for English and other languages - speed_punctuation = 4 * multiplier # Punctuation marks per second - - # Count characters, words, and punctuation for each language - chinese_japanese_chars = 
len(re.findall(r'[\u4e00-\u9fff\u3040-\u30ff\u3400-\u4dbf\uf900-\ufaff\uff66-\uff9f]', text)) - en_and_others_words = len(re.findall(r'\b[a-zA-ZàâçéèêëîïôûùüÿñæœáéíóúüñÁÉÍÓÚÜÑàèéìíîòóùúÀÈÉÌÍÎÒÓÙÚäöüßÄÖÜа-яА-Я]+\b', text)) - punctuation_count = len(re.findall(r'[,.!?;:,。!?;:](?=.)', text)) - - # Estimate duration for each language part and punctuation - chinese_japanese_duration = chinese_japanese_chars / speed_zh_ja - en_and_others_duration = en_and_others_words / speed_en_and_others - punctuation_duration = punctuation_count / speed_punctuation - - # Total estimated duration - estimated_duration = chinese_japanese_duration + en_and_others_duration + punctuation_duration + global ESTIMATOR + if ESTIMATOR is None: + ESTIMATOR = init_estimator() + estimated_duration = estimate_duration(text, ESTIMATOR) / speed_factor['max'] console.print(f"Subtitle text: {text}, " - f"Subtitle info: Chinese/Japanese chars: {chinese_japanese_chars}, " - f"English and other language words: {en_and_others_words}, " - f"Punctuation marks: {punctuation_count}, " f"[bold green]Estimated reading duration: {estimated_duration:.2f} seconds[/bold green]") if estimated_duration > duration: @@ -62,8 +47,15 @@ def valid_trim(response): else: return text +def time_diff_seconds(t1, t2, base_date): + """Calculate the difference in seconds between two time objects""" + dt1 = datetime.datetime.combine(base_date, t1) + dt2 = datetime.datetime.combine(base_date, t2) + return (dt2 - dt1).total_seconds() + def process_srt(): """Process srt file, generate audio tasks""" + with open(TRANS_SUBS_FOR_AUDIO_FILE, 'r', encoding='utf-8') as file: content = file.read() @@ -92,8 +84,7 @@ def process_srt(): start_time, end_time = lines[1].split(' --> ') start_time = datetime.datetime.strptime(start_time, '%H:%M:%S,%f').time() end_time = datetime.datetime.strptime(end_time, '%H:%M:%S,%f').time() - duration = (datetime.datetime.combine(datetime.date.today(), end_time) - - datetime.datetime.combine(datetime.date.today(), start_time)).total_seconds() + duration = time_diff_seconds(start_time, end_time, datetime.date.today()) text = ' '.join(lines[2:]) # Remove content within parentheses (including English and Chinese parentheses) text = re.sub(r'\([^)]*\)', '', text).strip() @@ -108,48 +99,40 @@ def process_srt(): rprint(Panel(f"Unable to parse subtitle block '{block}', error: {str(e)}, skipping this subtitle block.", title="Error", border_style="red")) continue - subtitles.append({ - 'number': number, - 'start_time': start_time, - 'end_time': end_time, - 'duration': duration, - 'text': text, - 'origin': origin - }) + subtitles.append({'number': number, 'start_time': start_time, 'end_time': end_time, 'duration': duration, 'text': text, 'origin': origin}) df = pd.DataFrame(subtitles) i = 0 - MIN_SUBTITLE_DURATION = load_key("min_subtitle_duration") + MIN_SUB_DUR = load_key("min_subtitle_duration") while i < len(df): - if df.loc[i, 'duration'] < MIN_SUBTITLE_DURATION: - if i < len(df) - 1 and (datetime.datetime.combine(datetime.date.today(), df.loc[i+1, 'start_time']) - - datetime.datetime.combine(datetime.date.today(), df.loc[i, 'start_time'])).total_seconds() < MIN_SUBTITLE_DURATION: + today = datetime.date.today() + if df.loc[i, 'duration'] < MIN_SUB_DUR: + if i < len(df) - 1 and time_diff_seconds(df.loc[i, 'start_time'],df.loc[i+1, 'start_time'],today) < MIN_SUB_DUR: rprint(f"[bold yellow]Merging subtitles {i+1} and {i+2}[/bold yellow]") df.loc[i, 'text'] += ' ' + df.loc[i+1, 'text'] df.loc[i, 'origin'] += ' ' + df.loc[i+1, 'origin'] 
df.loc[i, 'end_time'] = df.loc[i+1, 'end_time'] - df.loc[i, 'duration'] = (datetime.datetime.combine(datetime.date.today(), df.loc[i, 'end_time']) - - datetime.datetime.combine(datetime.date.today(), df.loc[i, 'start_time'])).total_seconds() + df.loc[i, 'duration'] = time_diff_seconds(df.loc[i, 'start_time'],df.loc[i, 'end_time'],today) df = df.drop(i+1).reset_index(drop=True) else: if i < len(df) - 1: # Not the last audio - rprint(f"[bold blue]Extending subtitle {i+1} duration to {MIN_SUBTITLE_DURATION} seconds[/bold blue]") - df.loc[i, 'end_time'] = (datetime.datetime.combine(datetime.date.today(), df.loc[i, 'start_time']) + - datetime.timedelta(seconds=MIN_SUBTITLE_DURATION)).time() - df.loc[i, 'duration'] = MIN_SUBTITLE_DURATION + rprint(f"[bold blue]Extending subtitle {i+1} duration to {MIN_SUB_DUR} seconds[/bold blue]") + df.loc[i, 'end_time'] = (datetime.datetime.combine(today, df.loc[i, 'start_time']) + + datetime.timedelta(seconds=MIN_SUB_DUR)).time() + df.loc[i, 'duration'] = MIN_SUB_DUR else: - rprint(f"[bold red]The last subtitle {i+1} duration is less than {MIN_SUBTITLE_DURATION} seconds, but not extending[/bold red]") + rprint(f"[bold red]The last subtitle {i+1} duration is less than {MIN_SUB_DUR} seconds, but not extending[/bold red]") i += 1 else: i += 1 df['start_time'] = df['start_time'].apply(lambda x: x.strftime('%H:%M:%S.%f')[:-3]) df['end_time'] = df['end_time'].apply(lambda x: x.strftime('%H:%M:%S.%f')[:-3]) - - # check and trim subtitle length, for twice to ensure the subtitle length is within the limit - for _ in range(2): - df['text'] = df.apply(lambda x: check_len_then_trim(x['text'], x['duration']), axis=1) + + ##! No longer perform secondary trim + # check and trim subtitle length, for twice to ensure the subtitle length is within the limit, 允许tolerance + # df['text'] = df.apply(lambda x: check_len_then_trim(x['text'], x['duration']+x['tolerance']), axis=1) return df diff --git a/core/step8_2_gen_dub_chunks.py b/core/step8_2_gen_dub_chunks.py new file mode 100644 index 00000000..bc92a80a --- /dev/null +++ b/core/step8_2_gen_dub_chunks.py @@ -0,0 +1,188 @@ +import pandas as pd +import os, sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from core.config_utils import load_key +from core.all_whisper_methods.whisperX_utils import get_audio_duration +from core.step8_1_gen_audio_task import time_diff_seconds +import datetime +import re +from core.all_tts_functions.estimate_duration import init_estimator, estimate_duration +from rich import print as rprint + +INPUT_EXCEL = "output/audio/tts_tasks.xlsx" +OUTPUT_EXCEL = "output/audio/tts_tasks.xlsx" +TRANSCRIPT_FILE = "output/trans.srt" +MAX_MERGE_COUNT = 5 +AUDIO_FILE = 'output/audio/raw.mp3' +ESTIMATOR = None + +def calc_if_too_fast(est_dur, tol_dur, duration, tolerance): + accept = load_key("speed_factor.accept") # Maximum acceptable speed factor + if est_dur / accept > tol_dur: # Even max speed factor cannot adapt + return 2 + elif est_dur > tol_dur: # Speed adjustment needed within acceptable range + return 1 + elif est_dur < duration - tolerance: # Speaking speed too slow + return -1 + else: # Normal speaking speed + return 0 + +def merge_rows(df, start_idx, merge_count): + """Merge multiple rows and calculate cumulative values""" + merged = { + 'est_dur': df.iloc[start_idx]['est_dur'], + 'tol_dur': df.iloc[start_idx]['tol_dur'], + 'duration': df.iloc[start_idx]['duration'] + } + + while merge_count < MAX_MERGE_COUNT and (start_idx + merge_count) < len(df): + next_row = 
df.iloc[start_idx + merge_count] + merged['est_dur'] += next_row['est_dur'] + merged['tol_dur'] += next_row['tol_dur'] + merged['duration'] += next_row['duration'] + + speed_flag = calc_if_too_fast( + merged['est_dur'], + merged['tol_dur'], + merged['duration'], + df.iloc[start_idx + merge_count]['tolerance'] + ) + + if speed_flag <= 0 or merge_count == 2: + df.at[start_idx + merge_count, 'cut_off'] = 1 + return merge_count + 1 + + merge_count += 1 + + # If no suitable merge point is found + if merge_count >= MAX_MERGE_COUNT or (start_idx + merge_count) >= len(df): + df.at[start_idx + merge_count - 1, 'cut_off'] = 1 + return merge_count + +def analyze_subtitle_timing_and_speed(df): + rprint("[🔍 Analyzing] Calculating subtitle timing and speed...") + global ESTIMATOR + if ESTIMATOR is None: + ESTIMATOR = init_estimator() + TOLERANCE = load_key("tolerance") + whole_dur = get_audio_duration(AUDIO_FILE) + df['gap'] = 0.0 # Initialize gap column + for i in range(len(df) - 1): + current_end = datetime.datetime.strptime(df.loc[i, 'end_time'], '%H:%M:%S.%f').time() + next_start = datetime.datetime.strptime(df.loc[i + 1, 'start_time'], '%H:%M:%S.%f').time() + df.loc[i, 'gap'] = time_diff_seconds(current_end, next_start, datetime.date.today()) + + # Set the gap for the last line + last_end = datetime.datetime.strptime(df.iloc[-1]['end_time'], '%H:%M:%S.%f').time() + last_end_seconds = (last_end.hour * 3600 + last_end.minute * 60 + + last_end.second + last_end.microsecond / 1000000) + df.iloc[-1, df.columns.get_loc('gap')] = whole_dur - last_end_seconds + + df['tolerance'] = df['gap'].apply(lambda x: TOLERANCE if x > TOLERANCE else x) + df['tol_dur'] = df['duration'] + df['tolerance'] + df['est_dur'] = df.apply(lambda x: estimate_duration(x['text'], ESTIMATOR), axis=1) + + ## Calculate speed indicators + accept = load_key("speed_factor.accept") # Maximum acceptable speed factor + def calc_if_too_fast(row): + est_dur = row['est_dur'] + tol_dur = row['tol_dur'] + duration = row['duration'] + tolerance = row['tolerance'] + + if est_dur / accept > tol_dur: # Even max speed factor cannot adapt + return 2 + elif est_dur > tol_dur: # Speed adjustment needed within acceptable range + return 1 + elif est_dur < duration - tolerance: # Speaking speed too slow + return -1 + else: # Normal speaking speed + return 0 + + df['if_too_fast'] = df.apply(calc_if_too_fast, axis=1) + return df + +def process_cutoffs(df): + rprint("[✂️ Processing] Generating cutoff points...") + df['cut_off'] = 0 # Initialize cut_off column + df.loc[df['gap'] >= load_key("tolerance"), 'cut_off'] = 1 # Set to 1 when gap is greater than TOLERANCE + idx = 0 + while idx < len(df): + # Process marked split points + if df.iloc[idx]['cut_off'] == 1: + if df.iloc[idx]['if_too_fast'] == 2: + rprint(f"[⚠️ Warning] Line {idx} is too fast and cannot be fixed by speed adjustment") + idx += 1 + continue + + # Process the last line + if idx + 1 >= len(df): + df.at[idx, 'cut_off'] = 1 + break + + # Process normal or slow lines + if df.iloc[idx]['if_too_fast'] <= 0: + if df.iloc[idx + 1]['if_too_fast'] <= 0: + df.at[idx, 'cut_off'] = 1 + idx += 1 + else: + idx += merge_rows(df, idx, 1) + # Process fast lines + else: + idx += merge_rows(df, idx, 1) + + return df + +def gen_dub_chunks(): + rprint("[🎬 Starting] Generating dubbing chunks...") + df = pd.read_excel(INPUT_EXCEL) + + rprint("[📊 Processing] Analyzing timing and speed...") + df = analyze_subtitle_timing_and_speed(df) + + rprint("[✂️ Processing] Processing cutoffs...") + df = process_cutoffs(df) + + 
rprint("[📝 Reading] Loading transcript file...") + content = open(TRANSCRIPT_FILE, "r", encoding="utf-8").read() + + # Process subtitle content + content_lines = [] + for block in content.strip().split('\n\n'): + lines = [line.strip() for line in block.split('\n') if line.strip()] + if len(lines) >= 3: + text = ' '.join(lines[2:]) + # Clean text + text = re.sub(r'\([^)]*\)|([^)]*)', '', text).strip().replace('-', '') + content_lines.append(text) + + # Match processing + df['lines'] = None + last_idx = 0 + + for idx, row in df.iterrows(): + target = row['text'].replace(' ', '') + matches = [] + current = '' + + for i in range(last_idx, len(content_lines)): + line = content_lines[i].replace(' ', '') + current += line + matches.append(content_lines[i]) + + if current == target: + df.at[idx, 'lines'] = matches + last_idx = i + 1 + break + else: # If no match is found + rprint(f"[❌ Error] Matching failed at line {idx}:") + rprint(f"Target: '{target}'") + rprint(f"Current: '{current}'") + raise ValueError("Matching failed") + + # Save results + df.to_excel(OUTPUT_EXCEL, index=False) + rprint("[✅ Complete] Matching completed successfully!") + +if __name__ == "__main__": + gen_dub_chunks() \ No newline at end of file diff --git a/core/step9_extract_refer_audio.py b/core/step9_extract_refer_audio.py index 1cef1149..74777404 100644 --- a/core/step9_extract_refer_audio.py +++ b/core/step9_extract_refer_audio.py @@ -12,7 +12,7 @@ # Simplified path definitions REF_DIR = 'output/audio/refers' SEG_DIR = 'output/audio/segs' -TASKS_FILE = 'output/audio/sovits_tasks.xlsx' +TASKS_FILE = 'output/audio/tts_tasks.xlsx' def time_to_samples(time_str, sr): """Unified time conversion function""" diff --git a/docs/pages/docs/start.en-US.md b/docs/pages/docs/start.en-US.md index 5c68828a..de78adcc 100644 --- a/docs/pages/docs/start.en-US.md +++ b/docs/pages/docs/start.en-US.md @@ -1,49 +1,43 @@ # 🚀 Getting Started ## 📋 API Configuration -This project requires Large Language Models and TTS. Multiple options are provided for each component. **Please read the configuration guide carefully 😊** +This project requires Large Language Models and TTS. **Recommended to use [SiliconFlow](https://cloud.siliconflow.cn/i/ttKDEsxE)**, which offers free credits upon registration and only needs one key for all features. ### 1. **Get API_KEY for Large Language Models**: | Recommended Model | Recommended Provider | base_url | Price | Effect | |:-----|:---------|:---------|:-----|:---------| -| gemini-1.5-pro-002 | [Yunwu API](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | $0.70 / 1M tokens | 🤩 | -| claude-3-5-sonnet-20240620 | [Yunwu API](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | $1.40 / 1M tokens | 🤩 | -| gpt-4o | [Yunwu API](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | $0.70 / 1M tokens | 😃 | +| Qwen/Qwen2.5-72B-Instruct | [SiliconFlow](https://cloud.siliconflow.cn/i/ttKDEsxE) | https://api.siliconflow.cn | ¥4 / 1M tokens | 😃 | +| claude-3-5-sonnet-20240620 | / | / | $15 / 1M tokens | 🤩 | -⚠️ Warning: The prompts involve multi-step reasoning chains and complex JSON formats. Weak models are prone to errors. An 1-hour video costs about $1.40 using Claude, and about $0.70 with other models. If using the official Grok API, please note to adjust max_workers to 1 in the config. - -> Note: Yunwu API also supports OpenAI's tts-1 interface, which can be used in the dubbing step. - -
-How to get API key from Yunwu API? - -1. Go to [Yunwu API website](https://yunwu.zeabur.app/register?aff=TXMB) -2. Register an account and top up -3. Create a new key on the API key page -4. Make sure to check `Unlimited quota`, recommended channel is `Pure AZ 1.5x` -
- -
-Can I use other models? - -- ✅ Supports OAI-Like API interfaces, you can change in the Streamlit sidebar. -- ⚠️ However, other models (especially smaller ones) have weaker instruction following capabilities and are very likely to error during translation. Strongly not recommended. If errors occur, please switch models. -
+Note: Any OpenAI-compatible interface is supported, so you can try different models. However, the process involves multi-step reasoning chains and complex JSON formats, so **models smaller than 30B are not recommended**. ### 2. **TTS API** VideoLingo provides multiple TTS integration methods. Here's a comparison (skip if only using translation without dubbing) | TTS Solution | Pros | Cons | Chinese Effect | Non-Chinese Effect | |:---------|:-----|:-----|:---------|:-----------| +| 🎙️ SiliconFlow FishTTS (Recommended) | Supports cloning | Not the best | 😃 | 😃 | | 🎙️ OpenAI TTS | Realistic emotions | Chinese sounds foreign | 😕 | 🤩 | -| 🔊 Azure TTS (Recommended) | Natural effect | Difficult to top up | 🤩 | 😃 | -| 🎤 Fish TTS | Authentic native speaker | Limited official models | 😂 | 😂 | -| 🗣️ GPT-SoVITS (Testing) | Best voice cloning | Currently only supports Chinese/English, requires NVIDIA GPU for inference, configuration requires relevant knowledge | 🏆 | 🚫 | +| 🔊 Azure TTS | Natural effect | Limited emotions | 🤩 | 😃 | +| 🎤 Fish TTS | Authentic native speaker | Limited official models | 😂 | 😂 | +| 🗣️ GPT-SoVITS | Best voice cloning | Only supports Chinese/English, requires local inference, complex setup | 🏆 | 🚫 | -- For OpenAI TTS, recommended to use [Yunwu API](https://yunwu.zeabur.app/register?aff=TXMB), make sure to select `tts-1` for the model; -- For Azure TTS, register and top up on the [official website](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/get-started-text-to-speech?tabs=windows%2Cterminal&pivots=programming-language-python) (has free quota); -- For Fish TTS, register on the [official website](https://fish.audio/en/go-api/) (comes with $10 credit) +- For SiliconFlow FishTTS, get a key from [SiliconFlow](https://cloud.siliconflow.cn/i/ttKDEsxE); note that the cloning feature requires paid credits; +- For OpenAI TTS, [Yunwu API](https://yunwu.zeabur.app/register?aff=TXMB) is recommended; +- For Azure TTS, register on the official website or purchase a key from a third party; +- For Fish TTS, register on the [official website](https://fish.audio/en/go-api/) (comes with $10 of credit) + +
+SiliconFlow FishTTS Tutorial + +Three modes are currently supported: + +1. `preset`: Uses a fixed voice; you can preview the options on the [Official Playground](https://cloud.siliconflow.cn/playground/text-to-speech/17885302608). The default is `anna`. +2. `clone(stable)`: Corresponds to the FishTTS API's `custom` mode; it clones the voice from an uploaded clip, automatically sampling the first 10 seconds of the video's audio, and is more stable. +3. `clone(dynamic)`: Corresponds to the FishTTS API's `dynamic` mode; it uses each sentence as the reference audio during TTS, so the voice may vary between lines, but the overall effect is often better. + +
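If you would rather switch modes programmatically than through the Streamlit sidebar, a minimal sketch is shown below. It assumes the `sf_fish_tts` block added in this PR's `config.yaml` (keys `api_key`, `voice`, `mode`) and the `preset` / `custom` / `dynamic` values noted next to `mode` there; the helper name is illustrative only.

```python
# Hedged sketch: switch the SiliconFlow FishTTS mode in config.yaml.
# Assumes the sf_fish_tts block introduced in this PR; "preset", "custom"
# and "dynamic" are the values listed next to `mode` in config.yaml.
from ruamel.yaml import YAML

def set_sf_fish_tts_mode(mode: str, config_path: str = "config.yaml") -> None:
    assert mode in ("preset", "custom", "dynamic"), f"unknown sf_fish_tts mode: {mode}"
    yaml = YAML()  # round-trip mode keeps the comments in config.yaml
    yaml.preserve_quotes = True
    with open(config_path, "r", encoding="utf-8") as f:
        cfg = yaml.load(f)
    cfg["sf_fish_tts"]["mode"] = mode
    with open(config_path, "w", encoding="utf-8") as f:
        yaml.dump(cfg, f)

if __name__ == "__main__":
    set_sf_fish_tts_mode("custom")  # "clone(stable)" in the sidebar maps to "custom"
```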
How to choose OpenAI voices? @@ -118,12 +112,22 @@ After configuration, select `Reference Audio Mode` in the sidebar (see Yuque doc ## 🛠️ Quick Start -VideoLingo supports Windows, macOS and Linux systems, and can run on CPU or GPU. For GPU acceleration on Windows, install these dependencies: +VideoLingo supports Windows, macOS and Linux systems, and can run on CPU or GPU. + +For GPU acceleration on Windows, install these dependencies: - [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) - [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) -> Note: After installing CUDA and CUDNN, check if they're added to system path and restart computer 🔄 +> Note: After installing, add `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` to system path and restart computer 🔄 + +### Windows One-Click Install + +Make sure [Git](https://git-scm.com/downloads) is installed, + +1. Download source code locally + +2. Double click `OneKeyInstall&Start.bat` to complete installation and launch webpage ### Source Installation @@ -157,26 +161,24 @@ Basic Python knowledge required. For any issues, ask the AI assistant at [videol ``` Script will automatically install appropriate torch version -5. 🎉 Enter command or click `OneKeyStart.bat` to launch Streamlit app: +5. 🎉 Enter command to launch Streamlit app: ```bash streamlit run st.py ``` -6. Set key in sidebar of popup webpage, and note whisper method and transcription language selection +6. Set key in sidebar of popup webpage and start using~ - ![en_set](https://github.com/user-attachments/assets/2f32f49b-0b7a-4ff4-930f-4e5f9bac9002) + ![zh_set](https://github.com/user-attachments/assets/bb9381d0-8d99-4d8b-aaff-9846076fc7a3) -7. Whisper transcription will automatically download models, but for users who cannot access Huggingface through command line, you can manually download whisper models and place them in the root directory: [Baidu Drive](https://pan.baidu.com/s/1Igo_FvFV4Xcb8tSYT0ktpA?pwd=e1c7) +7. Transcription step will automatically download models from huggingface, or you can download manually and place `_model_cache` folder in VideoLingo directory: [Baidu Drive](https://pan.baidu.com/s/1Igo_FvFV4Xcb8tSYT0ktpA?pwd=e1c7) -8. More settings can be manually modified in `config.yaml`, watch command line output during operation +8. (Optional) More settings can be manually modified in `config.yaml`, watch command line output during operation ## 🚨 Common Errors -1. **'Empty Translation Line'**: This occurs when using a less capable LLM that omits short phrases during translation. Solution: Please retry with Claude 3.5 Sonnet. - -2. **'Key Error' during translation**: +1. **'Key Error' during translation**: - Reason 1: Same as above, weaker models have poor JSON format compliance. - Reason 2: LLM may refuse to translate sensitive content. Solution: Check `response` and `msg` fields in `output/gpt_log/error.json`. -3. **'Retry Failed', 'SSL', 'Connection', 'Timeout'**: Usually network issues. Solution: Users in mainland China please switch network nodes and retry. +2. **'Retry Failed', 'SSL', 'Connection', 'Timeout'**: Usually network issues. Solution: Users in mainland China please switch network nodes and retry. 
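For the 'Key Error' case above, a quick way to inspect what the LLM actually returned is sketched below. It assumes `output/gpt_log/error.json` holds either a list of error records or a single record with `response` and `msg` fields, as the error note suggests; adjust the loading step if the real layout differs.

```python
# Hedged sketch: print the `msg` and `response` fields from the GPT error log.
# The exact structure of output/gpt_log/error.json is assumed, not confirmed.
import json

LOG_FILE = "output/gpt_log/error.json"

def show_llm_errors(path: str = LOG_FILE) -> None:
    with open(path, "r", encoding="utf-8") as f:
        records = json.load(f)
    if isinstance(records, dict):  # tolerate a single record instead of a list
        records = [records]
    for i, rec in enumerate(records, 1):
        print(f"--- error {i} ---")
        print("msg:     ", rec.get("msg"))
        print("response:", rec.get("response"))

if __name__ == "__main__":
    show_llm_errors()
```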
diff --git a/docs/pages/docs/start.zh-CN.md b/docs/pages/docs/start.zh-CN.md index a972219c..202c2daf 100644 --- a/docs/pages/docs/start.zh-CN.md +++ b/docs/pages/docs/start.zh-CN.md @@ -1,34 +1,16 @@ # 🚀 开始使用 -## 📋 API 配置准备 -本项目需使用大模型 和 TTS ,每个环节都提供了多种选择,**请仔细阅读配置指南😊** +## 📋 API 配置指南 +本项目需使用大模型 和 TTS ,**推荐使用 [硅基流动](https://cloud.siliconflow.cn/i/ttKDEsxE)**,注册送积分,只需要一个 Key 即可体验全部功能。 + ### 1. **获取大模型的 API_KEY**: | 推荐模型 | 推荐提供商 | base_url | 价格 | 效果 | |:-----|:---------|:---------|:-----|:---------| -| gemini-1.5-pro-002 | [云雾 api](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | ¥7 / 1M tokens | 🤩 | -| claude-3-5-sonnet-20240620 | [云雾 api](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | ¥10 / 1M tokens | 🤩 | -| gpt-4o | [云雾 api](https://yunwu.zeabur.app/register?aff=TXMB) | https://yunwu.zeabur.app | ¥7 / 1M tokens | 😃 | - -⚠️ 警告:prompt 涉及多步思维链和复杂的json格式,弱模型容易出错。1h 视频使用 claude 花费约 10 元。 - -> 注:云雾api 还支持 openai 的 tts-1 接口,可在配音步骤选用。 - -
-云雾api 如何获取 api key? - -1. 前往 [云雾 api 官网](https://yunwu.zeabur.app/register?aff=TXMB) -2. 注册账户并充值 -3. 在 api key 页面新建一个 key -4. 注意勾选 `无限额度` ,渠道建议选 `纯AZ 1.5倍` -
- -
-能用别的模型吗? +| Qwen/Qwen2.5-72B-Instruct | [硅基流动](https://cloud.siliconflow.cn/i/ttKDEsxE) | https://api.siliconflow.cn | ¥4 / 1M tokens | 😃 | +| claude-3-5-sonnet-20240620 | / | / | $15 / 1M tokens | 🤩 | -- ✅ 支持 OAI-Like 的 API 接口,需要自行在 streamlit 侧边栏更换。 -- ⚠️ 但其他模型(尤其是小模型)遵循指令要求能力弱,非常容易在翻译过程报错,强烈不推荐,遇到报错请更换模型。 -
+注:支持 Openai 接口,可自行尝试不同模型。但处理过程涉及多步思维链和复杂的json格式,**不建议使用小于 30B 的模型**。 ### 2. **TTS 的 API** @@ -36,14 +18,27 @@ VideoLingo提供了多种 tts 接入方式,以下是对比(如不使用配 | TTS 方案 | 优点 | 缺点 | 中文效果 | 非中文效果 | |:---------|:-----|:-----|:---------|:-----------| +| 🎙️ SiliconFlow FishTTS (推荐) | 支持克隆 | 不是最好 | 😃 | 😃 | | 🎙️ OpenAI TTS | 情感真实 | 中文听起来像外国人 | 😕 | 🤩 | -| 🔊 Azure TTS (推荐) | 效果自然 | 充值不方便 | 🤩 | 😃 | +| 🔊 Azure TTS | 效果自然 | 情感不够丰富 | 🤩 | 😃 | | 🎤 Fish TTS | 真是本地人 | 官方模型有限 | 😂 | 😂 | -| 🗣️ GPT-SoVITS (测试) | 最强语音克隆 | 目前只支持中英文,需要N卡推理模型,配置需要相关知识 | 🏆 | 🚫 | +| 🗣️ GPT-SoVITS | 最强语音克隆 | 只支持中英文,需要本地推理,配置麻烦 | 🏆 | 🚫 | + +- SiliconFlow FishTTS 请在 [硅基流动](https://cloud.siliconflow.cn/i/ttKDEsxE) 获取key,注意克隆功能需要付费充值积分; +- OpenAI TTS,推荐使用 [云雾 api](https://yunwu.zeabur.app/register?aff=TXMB); +- Azure TTS 可以在官网注册获取key,也可以淘宝购买; +- Fish TTS 可以在 [官网](https://fish.audio/zh-CN/go-api/) 注册(送10刀额度) + +
+SiliconFlow FishTTS 使用教程 + +目前支持 3 种模式: -- OpenAI TTS,推荐使用 [云雾 api](https://yunwu.zeabur.app/register?aff=TXMB),注意在模型处勾选 `tts-1`; -- Azure TTS 在 [官网](https://learn.microsoft.com/zh-cn/azure/ai-services/speech-service/get-started-text-to-speech?tabs=windows%2Cterminal&pivots=programming-language-python) 注册充值(有免费额度); -- Fish TTS 请自行在 [官网](https://fish.audio/zh-CN/go-api/) 注册(送10刀额度) +1. `preset`: 使用固定音色,可以在 [官网Playground](https://cloud.siliconflow.cn/playground/text-to-speech/17885302608) 试听,默认 `anna`。 +2. `clone(stable)`: 对应 fishtts api 的 `custom`,使用一段上传音频的音色,会自动采集视频前十秒声音作为音色使用,比较稳定。 +3. `clone(dynamic)`: 对应 fishtts api 的 `dynamic`,在 tts 过程使用每一句作为参考音频,可能出现音色不一致,但效果更好。 + +
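As a rough illustration of the `clone(stable)` sampling described above (the pipeline performs this step itself), the sketch below cuts the first 10 seconds of the extracted audio track as a reference clip. The paths reuse `output/audio/raw.mp3` and `output/audio/refers` from elsewhere in this PR, but the output file name is a placeholder.

```python
# Illustrative sketch of the reference-audio sampling used by clone(stable):
# take the first 10 seconds of the extracted audio as the voice reference.
# Paths follow this PR's layout; the real pipeline handles this automatically.
import os
from pydub import AudioSegment

def sample_reference_clip(src_audio: str = "output/audio/raw.mp3",
                          ref_out: str = "output/audio/refers/ref_10s.wav",
                          seconds: int = 10) -> str:
    os.makedirs(os.path.dirname(ref_out), exist_ok=True)
    audio = AudioSegment.from_file(src_audio)
    audio[: seconds * 1000].export(ref_out, format="wav")  # pydub slices in milliseconds
    return ref_out

if __name__ == "__main__":
    print(sample_reference_clip())
```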
OpenAI 声音怎么选? @@ -61,7 +56,7 @@ VideoLingo提供了多种 tts 接入方式,以下是对比(如不使用配
Fish TTS 声音怎么选? -前往 [官网](https://fish.audio/zh-CN/) 中试听��择你想要的声音,��� URL 中可以找到该声音对应的代号,例如丁真是 `54a5170264694bfc8e9ad98df7bd89c3`,热门的几种声音已添加在 `config.yaml` 中。如需使用其他声音,请在 `config.yaml` 中修改 `fish_tts.character_id_dict` 字典。 +前往 [官网](https://fish.audio/zh-CN/) 中试听选择你想要的声音,在 URL 中可以找到该声音对应的代号,例如丁真是 `54a5170264694bfc8e9ad98df7bd89c3`,热门的几种声音已添加在 `config.yaml` 中。如需使用其他声音,请在 `config.yaml` 中修改 `fish_tts.character_id_dict` 字典。
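To wire a newly found voice code into the pipeline, the lookup below mirrors how other nested keys are read in this PR. It assumes `load_key` resolves `fish_tts.character_id_dict` the same way it resolves keys such as `speed_factor.accept`; the helper and the example entry (丁真's id, quoted from the paragraph above) are for illustration only.

```python
# Hedged sketch: read a Fish TTS character id from config.yaml via load_key,
# assuming nested-key lookup works as it does elsewhere in this PR.
from core.config_utils import load_key

def get_fish_character_id(name: str) -> str:
    id_dict = load_key("fish_tts.character_id_dict")
    if name not in id_dict:
        raise KeyError(f"'{name}' is not in fish_tts.character_id_dict; add it to config.yaml first")
    return id_dict[name]

if __name__ == "__main__":
    print(get_fish_character_id("丁真"))  # expected: 54a5170264694bfc8e9ad98df7bd89c3
```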
@@ -118,12 +113,27 @@ VideoLingo提供了多种 tts 接入方式,以下是对比(如不使用配 ## 🛠️ 快速上手 -VideoLingo 支持 Windows、macOS 和 Linux 系统,可使用 CPU 或 GPU 运行。对于 Windows 系统使用 GPU 加速,需要安装以下依赖: +VideoLingo 支持 Windows、macOS 和 Linux 系统,可使用 CPU 或 GPU 运行。 + +对于 Windows 系统使用 GPU 加速,需要安装以下依赖: - [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) - [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) -> 注意:安装完 CUDA 和 CUDNN 后需要检查是否添加到了系统路径,并重启计算机 🔄 +> 注意:安装后需要将 `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` 添加至系统环境变量,并重启计算机 🔄 + +### Windows 一键安装 + +请确保已安装 [Git](https://git-scm.com/downloads), + +1. 下载源码到本地 + +2. (可选)应用汉化补丁: + - 打开项目根目录下的 `i18n/中文` 文件夹 + - 将该文件夹中的所有内容复制到项目根目录 + - 在弹出的提示中选择"替换目标中的文件" + +3. 双击 `OneKeyInstall&Start.bat` 即可完成安装并启动网页 ### 源码安装 @@ -134,7 +144,7 @@ VideoLingo 支持 Windows、macOS 和 Linux 系统,可使用 CPU 或 GPU 运 需要一定的 python 基础,遇到任何问题可以询问官方网站 [videolingo.io](https://videolingo.io) 右下角的AI助手~ -1. 打开 Anaconda Prompt 并切换到你想安装的目录,例如桌面: +1. 打开 `Anaconda Prompt` 并切换到你想安装的目录,例如桌面: ```bash cd desktop ``` @@ -151,13 +161,11 @@ VideoLingo 支持 Windows、macOS 和 Linux 系统,可使用 CPU 或 GPU 运 conda activate videolingo ``` -4. 应用汉化补丁: - - 打开项目根目录下的 `i18n/中文` 文件夹 - - 将该文件夹中的所有内容复制到项目根目录 - - 在弹出的提示中选择"替换目标中的文件" - (注意:Mac系统会删除整个目标文件夹后再复制,而Windows只会替换重复的文件。Mac用户建议手动将文件逐个移动到目标位置) +4. (可选)应用汉化补丁: + + 参照 **一键安装** 中的说明 - 完成以上步骤后,界面将切换为中文显示。 + (注意:Mac系统会删除整个目标文件夹后再复制,而Windows只会替换重复的文件。Mac用户建议手动将文件逐个移动到目标位置) 5. 运行安装脚本: ```bash @@ -174,17 +182,15 @@ VideoLingo 支持 Windows、macOS 和 Linux 系统,可使用 CPU 或 GPU 运 ![zh_set](https://github.com/user-attachments/assets/bb9381d0-8d99-4d8b-aaff-9846076fc7a3) -8. whisper 转录步骤会自动下载模型,但是对于命令行无法访问 huggingface 的用户,也可以手动下载 whisper 模型放置在根目录下:[百度网盘](https://pan.baidu.com/s/1Igo_FvFV4Xcb8tSYT0ktpA?pwd=e1c7) +8. 转录步骤会自动从 huggingface 下载模型,也可以手动下载,将 `_model_cache` 文件夹放置在 VideoLingo 目录下:[百度网盘](https://pan.baidu.com/s/1Igo_FvFV4Xcb8tSYT0ktpA?pwd=e1c7) -9. 更多设置可以在 `config.yaml` 中手动修改,运行过程请注意命令行输出 +9. (可选)更多设置可以在 `config.yaml` 中手动修改,运行过程请注意命令行输出 ## 🚨 常见报错 -1. **'Empty Translation Line'**: 这是由于选用了较笨的LLM,在翻译时把一些短语句直接省略了。解决方案:请换用Claude 3.5 Sonnet重试。 - -2. **翻译过程的 'Key Error'**: +1. **翻译过程的 'Key Error'**: - 原因1:同上,弱模型遵循JSON格式能力有误。 - 原因2:对于敏感内容,LLM可能拒绝翻译。 解决方案:请检查 `output/gpt_log/error.json` 的 `response` 和 `msg` 字段。 -3. **'Retry Failed', 'SSL', 'Connection', 'Timeout'**: 通常是网络问题。解决方案:中国大陆用户请切换网络节点重试。 \ No newline at end of file +2. 
**'Retry Failed', 'SSL', 'Connection', 'Timeout'**: 通常是网络问题。解决方案:中国大陆用户请切换网络节点重试。 \ No newline at end of file diff --git a/i18n/README.zh.md b/i18n/README.zh.md index 8c6ced37..5ba89748 100644 --- a/i18n/README.zh.md +++ b/i18n/README.zh.md @@ -14,12 +14,12 @@ ## 🌟 项目简介 -VideoLingo 是一站式视频翻译本地化配音工具,能够一键生成 Netflix 级别的高质量字幕,告别生硬机翻,告别多行字幕,还能加上高质量的配音,让全世界的知识能够跨越语言的障碍共享。 +VideoLingo 是一站式视频翻译本地化配音工具,能够一键生成 Netflix 级别的高质量字幕,告别生硬机翻,告别多行字幕,还能加上高质量的克隆配音,让全世界的知识能够跨越语言的障碍共享。 主要特点和功能: - 🎥 使用 yt-dlp 从 Youtube 链接下载视频 -- 🎙️ 使用 WhisperX 进行单词级时间轴字幕识别 +- **🎙️ 使用 WhisperX 进行单词级时间轴字幕识别** - **📝 使用 NLP 和 GPT 根据句意进行字幕分割** @@ -29,15 +29,13 @@ VideoLingo 是一站式视频翻译本地化配音工具,能够一键生成 Ne - **✅ 按照 Netflix 标准检查单行长度,绝无双行字幕** -- **🗣️ 使用 GPT-SoVITS 等方法对齐配音** +- **🗣️ 使用 FishTTS 等方法对齐克隆配音** - 🚀 整合包一键启动,在 streamlit 中一键出片 - 📝 详细记录每步操作日志,支持随时中断和恢复进度 -- 🌐 全面的多语言支持,轻松实现跨语言视频本地化 - -与同类项目的主要区别:**绝无多行字幕,最佳的翻译质量** +与同类项目相比的优势:**绝无多行字幕,最佳的翻译质量,无缝的配音体验** ## 🎥 效果演示 @@ -80,26 +78,26 @@ https://github.com/user-attachments/assets/85c64f8c-06cf-4af9-b153-ee9d2897b768 | 意大利语 | 🤩 | [意转中](https://github.com/user-attachments/assets/f1f893eb-dad3-4460-aaf6-10cac999195e) | | 西班牙语 | 🤩 | [西转中](https://github.com/user-attachments/assets/c1d28f1c-83d2-4f13-a1a1-859bd6cc3553) | | 日语 | 😐 | [日转中](https://github.com/user-attachments/assets/856c3398-2da3-4e25-9c36-27ca2d1f68c2) | -| 中文* | 🤩 | [中转英](https://github.com/user-attachments/assets/48f746fe-96ff-47fd-bd23-59e9202b495c) | -> *中文需单独配置whisperX模型,仅适用于本地源码安装,配置过程见安装文档,并注意在网页侧边栏指定转录语言为zh +| 中文* | 😊 | [中转英](https://github.com/user-attachments/assets/48f746fe-96ff-47fd-bd23-59e9202b495c) | +> *中文需单独配置标点增强后的 whisper 模型,详见安装文档。但效果一般,因为 faster-whisper 加速的 whisper 失去了原有的好的断句,且识别得到的中文没有标点符号,难以断句。同样问题出现在日语上。 -翻译语言支持大模型会的所有语言,配音语言取决于选取的TTS方法。 +翻译语言支持所有语言,配音语言取决于选取的TTS。 ## 🚀 快速开始 ### 在线体验 -商业版提供免费的 20min 额度,请访问 [videolingo.io](https://videolingo.io) +商业版(beta)提供免费的 20min 额度,请访问 [videolingo.io](https://videolingo.io) ### Colab 运行 -只需 5 分钟即可在 Colab 中快速体验 VideoLingo: +只需 5 分钟的安装即可在 Colab 中快速体验 VideoLingo: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Huanshere/VideoLingo/blob/main/VideoLingo_colab.ipynb) ### 本地安装 -VideoLingo 支持所有硬件平台和操作系统,但在 GPU 加速下性能最佳。详细安装说明请参考文档:[English](/docs/pages/docs/start.en-US.md) | [简体中文](/docs/pages/docs/start.zh-CN.md) +VideoLingo 支持所有硬件平台和操作系统,但在 GPU 加速下性能最佳。文档:[English](/docs/pages/docs/start.en-US.md) | [简体中文](/docs/pages/docs/start.zh-CN.md) ### 使用Docker @@ -118,30 +116,23 @@ docker run -d -p 8501:8501 --gpus all videolingo 使用说明: [English](/batch/README.md) | [简体中文](/batch/README.zh.md) ## ⚠️ 当前限制 -1. 不同设备运行 whisperX 效果不同,v1.7 会先进行 demucs 人声分离,但可能会导致分离后转录效果不如分离前,原因是 whisper 本身是在带 bgm 的环境下训练的,分离前不会转录bgm的歌词,但是分离后可能会转录歌词。 - -2. **配音功能的质量可能不完美**,仍处于测试开发阶段,正在尝试接入 MascGCT。目前为获得最佳效果,建议根据原视频的语速和内容特点,选择相近语速的 TTS,效果见 [demo](https://www.bilibili.com/video/BV1mt1QYyERR/?share_source=copy_web&vd_source=fa92558c28cd668d33dabaddb17e2f9e)。 - -3. **多语言视频转录识别仅仅只会保留主要语言**,这是由于 whisperX 在强制对齐单词级字幕时使用的是针对单个语言的特化模型,会因为不认识另一种语言而删去。 +1. WhisperX 转录效果可能受到视频背景声影响,因为使用了 wav2vac 模型进行对齐,但尽管如此,WhisperX 已经能在 99% 情况下解决 Whisper 本身的幻觉问题。 -3. **多角色分别配音正在开发**,whisperX 具有 VAD 的潜力,但是具体需要一些施工,暂时没有支持此功能。 +2. 配音功能由于不同语言的语速和语调差异,还受到前置处理字幕的影响,可能不能 100% 完美,但本项目做了非常多的语速上的工程处理,尽可能保证配音效果。 -## 🚗 路线图 +3. **多语言视频转录识别仅仅只会保留主要语言**,这是由于 whisperX 在强制对齐单词级字幕时使用的是针对单个语言的特化模型,会因为不认识另一种语言而删去。有些商用api可以进行机器自动转换,但实测效果非常一般,因此这个问题目前只能依靠人为切段处理。 -- [x] SaaS 版本 at [videolingo.io](https://videolingo.io) -- [ ] VAD 区分说话人,多角色配音 -- [ ] 用户术语表 -- [ ] 配音视频唇形同步 +4. 
**多角色分别配音仍在开发**,whisperX 具有 VAD 的潜力(尽管官方承认效果一般),但是具体需要一些施工,暂时没有支持此功能。 ## 📄 许可证 -本项目采用 Apache 2.0 许可证,我们衷心感谢以下开源项目的贡献: +本项目采用 Apache 2.0 许可证,衷心感谢以下开源项目的贡献: -[whisperX](https://github.com/m-bain/whisperX) | [yt-dlp](https://github.com/yt-dlp/yt-dlp) | [json_repair](https://github.com/mangiucugna/json_repair) | [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) | [BELLE](https://github.com/LianjiaTech/BELLE) +[whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE) ## 📬 联系我们 -- 加入我们的 QQ 群:875297969 +- 加入我们的 QQ 群寻求解答:875297969 - 在 GitHub 上提交 [Issues](https://github.com/Huanshere/VideoLingo/issues) 或 [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) - 关注我的 Twitter:[@Huanshere](https://twitter.com/Huanshere) - 联系邮箱:team@videolingo.io diff --git "a/i18n/\344\270\255\346\226\207/config.yaml" "b/i18n/\344\270\255\346\226\207/config.yaml" index 590fdbc2..d05cfea8 100644 --- "a/i18n/\344\270\255\346\226\207/config.yaml" +++ "b/i18n/\344\270\255\346\226\207/config.yaml" @@ -1,30 +1,31 @@ # * 标有 * 的设置是高级设置,不会出现在 Streamlit 页面中,只能在 config.py 中手动修改 +version: "2.0.0" ## ======================== 基本设置 ======================== ## # API 设置 api: - key: 'YOUR_KEY' - base_url: 'https://yunwu.zeabur.app' - model: 'gemini-1.5-pro-002' + key: 'YOUR_API_KEY' + base_url: 'https://api.siliconflow.cn' + model: 'Qwen/Qwen2.5-72B-Instruct' # 语言设置,写入提示词,可以用自然语言描述 -target_language: 'Chinese' +target_language: '简体中文' # 是否在转录前进行人声分离,警告这可能会减慢过程并导致行缺失! demucs: false whisper: - # Whisper 设置 [whisperx, whisperxapi] - method: 'whisperx' - # Whisper 指定识别语言 [en, zh, auto] auto 为自动检测,en 为强制翻译为英语 + # ["medium", "large-v3", "large-v3-turbo"]. 注意:对于中文模型将强制使用 Belle/large-v3 + model: 'large-v3' + # Whisper 指定识别语言 [en, zh, ...] 
language: 'en' detected_language: 'en' # 视频分辨率 [0x0, 640x360, 1920x1080] 0x0 会生成一个 0 秒的黑色视频占位符 -resolution: '640x360' +resolution: '1920x1080' ## ======================== 高级设置 ======================== ## # *下载 YouTube 视频的默认分辨率 [360, 1080, best] -ytb_resolution: '360' +ytb_resolution: '1080' subtitle: # *每行字幕的最大字符长度 @@ -41,8 +42,20 @@ max_split_length: 20 pause_before_translate: false ## ======================== 配音设置 ======================== ## -# TTS 选择 [openai_tts, gpt_sovits, azure_tts, fish_tts] -tts_method: 'azure_tts' +# TTS 选择 [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts] +tts_method: 'sf_fish_tts' + +# SiliconFlow FishTTS +sf_fish_tts: + # SiliconFlow API key + api_key: 'YOUR_API_KEY' + # 仅用于 "preset" 模式 + voice: 'anna' + # *仅用于 "custom" 模式,不要手动设置 + custom_name: '' + voice_id: '' + # preset, custom, dynamic + mode: "preset" # OpenAI TTS-1 API 配置 openai_tts: @@ -72,15 +85,21 @@ fish_tts: # *音频速度范围 speed_factor: min: 1 + accept: 1.2 # 可以接受的最大速度 max: 1.4 - normal: 1.2 # *被认为是正常语速 # *合并音频配置 -min_subtitle_duration: 3 -min_trim_duration: 2.50 +min_subtitle_duration: 2.5 # 最小字幕出现时间 会强制扩展 +min_trim_duration: 3.5 # 小于这个值的字幕不会切割 +tolerance: 1.5 # 允许向后延申的时间 # 音量设置 -dub_volume: 1.3 # *配音音频音量(1.3 = 130%,大多数原始配音音频相对较安静) +dub_volume: 1.5 # *配音音频音量(1.5 = 150%,大多数原始配音音频相对较安静) + + + + + ## ======================== 附加设置 请勿修改 ======================== ## # Whisper 模型目录 @@ -96,7 +115,6 @@ allowed_video_formats: - 'wmv' - 'webm' -# 支持的音频格式 allowed_audio_formats: - 'wav' - 'mp3' @@ -112,6 +130,11 @@ llm_support_json: - 'gemini-1.5-pro-latest' - 'gemini-1.5-pro-002' +# 存在问题 +# - 'Qwen/Qwen2.5-72B-Instruct' +# - 'Qwen/Qwen2.5-Coder-32B-Instruct' +# - 'Qwen/Qwen2.5-Chat-72B-Instruct-128K' + # Spacy 模型 spacy_model_map: en: 'en_core_web_md' diff --git "a/i18n/\344\270\255\346\226\207/core/pypi_autochoose.py" "b/i18n/\344\270\255\346\226\207/core/pypi_autochoose.py" deleted file mode 100644 index 79d873ca..00000000 --- "a/i18n/\344\270\255\346\226\207/core/pypi_autochoose.py" +++ /dev/null @@ -1,140 +0,0 @@ -import subprocess -import time -import requests -import os -import concurrent.futures -from rich.console import Console -from rich.table import Table -from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn - -MIRRORS = { - "Alibaba Cloud": "https://mirrors.aliyun.com/pypi/simple", - "Tsinghua University": "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple", - "Huawei Cloud": "https://repo.huaweicloud.com/repository/pypi/simple", - "Tencent Cloud": "https://mirrors.cloud.tencent.com/pypi/simple", - "163 Cloud": "https://mirrors.163.com/pypi/simple", - "PyPI Official": "https://pypi.org/simple" -} - -console = Console() - -FAST_THRESHOLD = 1000 # ms -SLOW_THRESHOLD = 1500 # ms - -def get_optimal_thread_count(): - try: - cpu_count = os.cpu_count() - return max(cpu_count - 1, 1) - except: - return 2 - -def test_mirror_speed(name, url): - try: - start_time = time.time() - response = requests.get(url, timeout=5) - end_time = time.time() - if response.status_code == 200: - speed = (end_time - start_time) * 1000 - return name, speed - else: - return name, float('inf') - except requests.RequestException: - return name, float('inf') - -def set_pip_mirror(url, host): - try: - subprocess.run(["pip", "config", "set", "global.index-url", url], check=True, capture_output=True) - subprocess.run(["pip", "config", "set", "install.trusted-host", host], check=True, capture_output=True) - return True - except subprocess.CalledProcessError: - return False - -def get_current_pip_mirror(): - try: - 
result = subprocess.run(["pip", "config", "get", "global.index-url"], capture_output=True, text=True, check=True) - return result.stdout.strip() - except subprocess.CalledProcessError: - return None - -def main(): - console.print("[yellow]开始新的镜像速度测试[/yellow]") - - # First test PyPI official mirror - pypi_name = next(name for name, url in MIRRORS.items() if "pypi.org" in url) - pypi_url = MIRRORS[pypi_name] - console.print("[cyan]测试PyPI官方镜像...[/cyan]") - - optimal_thread_count = get_optimal_thread_count() - console.print(f"使用 {optimal_thread_count} 个线程进行测试") - - _, pypi_speed = test_mirror_speed(pypi_name, pypi_url) - - if pypi_speed < FAST_THRESHOLD: - console.print(f"PyPI官方镜像速度很快 ({pypi_speed:.2f} ms)。使用官方镜像。") - set_pip_mirror(pypi_url, "pypi.org") - return - elif pypi_speed < SLOW_THRESHOLD: - console.print(f"PyPI官方镜像速度可以接受 ({pypi_speed:.2f} ms)。您可以继续使用它。") - return - - console.print(f"PyPI官方镜像速度较慢 ({pypi_speed:.2f} ms)。测试其他镜像...") - - # Test other mirrors - speeds = {} - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - ) as progress: - task = progress.add_task("[cyan]测试镜像...", total=len(MIRRORS) - 1) # -1 because we already tested PyPI - - with concurrent.futures.ThreadPoolExecutor(max_workers=optimal_thread_count) as executor: - future_to_mirror = {executor.submit(test_mirror_speed, name, url): name for name, url in MIRRORS.items() if name != pypi_name} - for future in concurrent.futures.as_completed(future_to_mirror): - name = future_to_mirror[future] - try: - name, speed = future.result() - if speed != float('inf'): - speeds[name] = speed - except Exception as exc: - print(f'{name} 生成了一个异常: {exc}') - finally: - progress.update(task, advance=1) - - table = Table(title="镜像速度测试结果") - table.add_column("镜像", style="cyan") - table.add_column("响应时间 (ms)", justify="right", style="magenta") - - for name, speed in sorted(speeds.items(), key=lambda x: x[1]): - table.add_row(name, f"{speed:.2f}") - - console.print(table) - - if speeds: - fastest_mirror = min(speeds, key=speeds.get) - fastest_url = MIRRORS[fastest_mirror] - console.print(f"\n[green]最快的镜像: {fastest_mirror} ({fastest_url})[/green]") - console.print(f"[green]响应时间: {speeds[fastest_mirror]:.2f} ms[/green]") - - host = fastest_url.split("//")[1].split("/")[0] - if set_pip_mirror(fastest_url, host): - current_mirror = get_current_pip_mirror() - console.print(f"\n[yellow]当前pip源: {current_mirror}[/yellow]") - - if current_mirror == fastest_url: - console.print(f"[bold green]成功切换到 {fastest_mirror} 镜像。[/bold green]") - else: - console.print("[bold red]切换失败。当前pip源与预期不符。[/bold red]") - console.print(f"[yellow]预期的pip源: {fastest_url}[/yellow]") - console.print("[yellow]请手动检查配置或尝试以管理员权限运行此脚本。[/yellow]") - else: - console.print("[bold red]切换镜像失败,将继续使用当前源。[/bold red]") - current_mirror = get_current_pip_mirror() - console.print(f"[yellow]当前pip源: {current_mirror}[/yellow]") - console.print("[yellow]请检查是否有足够的权限修改pip配置。[/yellow]") - else: - console.print("[bold red]所有镜像都无法访问。请检查您的网络连接。[/bold red]") - -if __name__ == "__main__": - main() diff --git "a/i18n/\344\270\255\346\226\207/install.py" "b/i18n/\344\270\255\346\226\207/install.py" index 1db2ec3b..91d90f06 100644 --- "a/i18n/\344\270\255\346\226\207/install.py" +++ "b/i18n/\344\270\255\346\226\207/install.py" @@ -4,47 +4,62 @@ import sys import zipfile import shutil - sys.path.append(os.path.dirname(os.path.abspath(__file__))) +ascii_logo = """ +__ ___ _ _ _ +\ \ / (_) __| | 
___ ___ | | (_)_ __ __ _ ___ + \ \ / /| |/ _` |/ _ \/ _ \| | | | '_ \ / _` |/ _ \ + \ V / | | (_| | __/ (_) | |___| | | | | (_| | (_) | + \_/ |_|\__,_|\___|\___/|_____|_|_| |_|\__, |\___/ + |___/ +""" + def install_package(*packages): subprocess.check_call([sys.executable, "-m", "pip", "install", *packages]) -install_package("requests", "rich", "ruamel.yaml") -from pypi_autochoose import main as choose_mirror - def check_gpu(): - """检查是否有 NVIDIA GPU 可用""" try: - # 🔍 尝试运行 nvidia-smi 命令来检测 GPU subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) return True except (subprocess.CalledProcessError, FileNotFoundError): return False def main(): + install_package("requests", "rich", "ruamel.yaml") from rich.console import Console from rich.panel import Panel - + from rich.box import DOUBLE console = Console() + + width = max(len(line) for line in ascii_logo.splitlines()) + 4 + welcome_panel = Panel( + ascii_logo, + width=width, + box=DOUBLE, + title="[bold green]🌏[/bold green]", + border_style="bright_blue" + ) + console.print(welcome_panel) + console.print(Panel.fit("🚀 开始安装", style="bold magenta")) # 配置镜像源 - console.print(Panel("⚙️ 正在配置镜像源", style="bold yellow")) + from core.pypi_autochoose import main as choose_mirror choose_mirror() - # 检测系统和 GPU + # 检测系统和GPU if platform.system() == 'Darwin': - console.print(Panel("🍎 检测到 MacOS,正在安装 CPU 版本的 PyTorch... 但速度会慢很多", style="cyan")) - subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "torchaudio"]) + console.print(Panel("🍎 检测到 MacOS,正在安装 CPU 版本的 PyTorch... 但转写速度会慢很多", style="cyan")) + subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==2.1.2", "torchaudio==2.1.2"]) else: has_gpu = check_gpu() if has_gpu: console.print(Panel("🎮 检测到 NVIDIA GPU,正在安装 CUDA 版本的 PyTorch...", style="cyan")) subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==2.0.0", "torchaudio==2.0.0", "--index-url", "https://download.pytorch.org/whl/cu118"]) else: - console.print(Panel("💻 未检测到 NVIDIA GPU,正在安装 CPU 版本的 PyTorch... 但速度会慢很多", style="cyan")) - subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "torchaudio"]) + console.print(Panel("💻 未检测到 NVIDIA GPU,正在安装 CPU 版本的 PyTorch... 
但转写速度会慢很多", style="cyan")) + subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==2.1.2", "torchaudio==2.1.2"]) # 安装 WhisperX console.print(Panel("📦 正在安装 WhisperX...", style="cyan")) @@ -65,6 +80,10 @@ def install_requirements(): subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]) def download_and_extract_ffmpeg(): + # 需要同时安装 conda-ffmpeg 和 ffmpeg.exe + console.print(Panel("📦 正在通过 conda 安装 ffmpeg...", style="cyan")) + subprocess.check_call(["conda", "install", "-y", "ffmpeg"]) + import requests system = platform.system() if system == "Windows": @@ -83,15 +102,15 @@ def download_and_extract_ffmpeg(): print(f"{ffmpeg_exe} 已存在") return - print("正在下载 FFmpeg") + console.print(Panel("📦 正在下载 FFmpeg...", style="cyan")) response = requests.get(url) if response.status_code == 200: filename = "ffmpeg.zip" if system in ["Windows", "Darwin"] else "ffmpeg.tar.xz" with open(filename, 'wb') as f: f.write(response.content) - print(f"FFmpeg 下载完成: {filename}") + console.print(Panel(f"FFmpeg 下载完成: {filename}", style="cyan")) - print("正在解压 FFmpeg") + console.print(Panel("📦 正在解压 FFmpeg...", style="cyan")) if system == "Linux": import tarfile with tarfile.open(filename) as tar_ref: @@ -106,15 +125,15 @@ def download_and_extract_ffmpeg(): zip_ref.extract(file) shutil.move(os.path.join(*file.split('/')[:-1], os.path.basename(file)), os.path.basename(file)) - print("正在清理") + console.print(Panel("📦 正在清理...", style="cyan")) os.remove(filename) if system == "Windows": for item in os.listdir(): if os.path.isdir(item) and "ffmpeg" in item.lower(): shutil.rmtree(item) - print("FFmpeg 解压完成") + console.print(Panel("FFmpeg 解压完成", style="cyan")) else: - print("FFmpeg 下载失败") + console.print(Panel("❌ FFmpeg 下载失败", style="red")) def install_noto_font(): if platform.system() == 'Linux': diff --git "a/i18n/\344\270\255\346\226\207/st.py" "b/i18n/\344\270\255\346\226\207/st.py" index 2b3bb513..0172e680 100644 --- "a/i18n/\344\270\255\346\226\207/st.py" +++ "b/i18n/\344\270\255\346\226\207/st.py" @@ -10,6 +10,9 @@ st.set_page_config(page_title="VideoLingo", page_icon="docs/logo.svg") +SUB_VIDEO = "output/output_sub.mp4" +DUB_VIDEO = "output/output_dub.mp4" + def text_processing_section(): st.header("翻译和生成字幕") with st.container(border=True): @@ -25,16 +28,16 @@ def text_processing_section(): 6. 将字幕合并到视频中 """, unsafe_allow_html=True) - if not os.path.exists("output/output_video_with_subs.mp4"): + if not os.path.exists(SUB_VIDEO): if st.button("开始处理字幕", key="text_processing_button"): process_text() st.rerun() else: if load_key("resolution") != "0x0": - st.video("output/output_video_with_subs.mp4") + st.video(SUB_VIDEO) download_subtitle_zip_button(text="下载所有字幕") - if st.button("归档到'history'", key="cleanup_in_text_processing"): + if st.button("归档到'历史记录'", key="cleanup_in_text_processing"): cleanup() st.rerun() return True @@ -60,24 +63,25 @@ def process_text(): st.balloons() def audio_processing_section(): - st.header("配音(测试版)") + st.header("配音") with st.container(border=True): st.markdown("""

此阶段包含以下步骤:

- 1. 生成音频任务
- 2. 生成音频
- 3. 将音频合并到视频中
+ 1. 生成音频任务和分段
+ 2. 提取参考音频
+ 3. 生成和合并音频文件
+ 4. 将最终音频合并到视频中 """, unsafe_allow_html=True) - if not os.path.exists("output/output_video_with_audio.mp4"): + if not os.path.exists(DUB_VIDEO): if st.button("开始处理音频", key="audio_processing_button"): process_audio() st.rerun() else: st.success("音频处理完成!你可以在 `output` 文件夹中查看音频文件。") if load_key("resolution") != "0x0": - st.video("output/output_video_with_audio.mp4") + st.video(DUB_VIDEO) if st.button("删除配音文件", key="delete_dubbing_files"): delete_dubbing_files() st.rerun() @@ -87,13 +91,16 @@ def audio_processing_section(): def process_audio(): with st.spinner("生成音频任务中"): - step8_gen_audio_task.gen_audio_task_main() + step8_1_gen_audio_task.gen_audio_task_main() + step8_2_gen_dub_chunks.gen_dub_chunks() with st.spinner("提取参考音频中"): step9_extract_refer_audio.extract_refer_audio_main() - with st.spinner("生成音频中"): - step10_gen_audio.process_sovits_tasks() - with st.spinner("将音频合并到视频中"): - step11_merge_audio_to_vid.merge_main() + with st.spinner("生成所有音频中"): + step10_gen_audio.gen_audio() + with st.spinner("合并完整音频中"): + step11_merge_full_audio.merge_full_audio() + with st.spinner("将配音合并到视频中"): + step12_merge_dub_to_vid.merge_video_audio() st.success("音频处理完成!🎇") st.balloons() diff --git "a/i18n/\344\270\255\346\226\207/st_components/sidebar_setting.py" "b/i18n/\344\270\255\346\226\207/st_components/sidebar_setting.py" index 9b743bba..0f64c786 100644 --- "a/i18n/\344\270\255\346\226\207/st_components/sidebar_setting.py" +++ "b/i18n/\344\270\255\346\226\207/st_components/sidebar_setting.py" @@ -4,56 +4,54 @@ import streamlit as st from core.config_utils import update_key, load_key -def config_text_input(label, key, help=None): - """通用配置文本输入处理器""" - value = st.text_input(label, value=load_key(key), help=help) - if value != load_key(key): - update_key(key, value) - return value +def config_input(label, key, help=None): + """Generic config input handler""" + val = st.text_input(label, value=load_key(key), help=help) + if val != load_key(key): + update_key(key, val) + return val def page_setting(): with st.expander("LLM 配置", expanded=True): - config_text_input("API_KEY", "api.key") - config_text_input("BASE_URL", "api.base_url", help="API请求的基础URL") + config_input("API_KEY", "api.key") + config_input("BASE_URL", "api.base_url", help="API请求的基础URL") - col1, col2 = st.columns([4, 1]) - with col1: - config_text_input("模型", "api.model") - with col2: + c1, c2 = st.columns([4, 1]) + with c1: + config_input("模型", "api.model") + with c2: if st.button("📡", key="api"): - if valid_llm_api(): - st.toast("API 密钥有效", icon="✅") - else: - st.toast("API 密钥无效", icon="❌") + st.toast("API密钥有效" if check_api() else "API密钥无效", + icon="✅" if check_api() else "❌") with st.expander("转写和字幕设置", expanded=True): - col1, col2 = st.columns(2) - with col1: - whisper_language_options_dict = { - "🇺🇸 English": "en", - "🇨🇳 简体中文": "zh", - "🇪🇸 Español": "es", - "🇷🇺 Русский": "ru", - "🇫🇷 Français": "fr", - "🇩🇪 Deutsch": "de", - "🇮🇹 Italiano": "it", - "🇯🇵 日本語": "ja" + c1, c2 = st.columns(2) + with c1: + langs = { + "🇺🇸 English": "en", + "🇨🇳 简体中文": "zh", + "🇪🇸 Español": "es", + "🇷🇺 Русский": "ru", + "🇫🇷 Français": "fr", + "🇩🇪 Deutsch": "de", + "🇮🇹 Italiano": "it", + "🇯🇵 日本語": "ja" } - selected_whisper_language = st.selectbox( + lang = st.selectbox( "识别语言:", - options=list(whisper_language_options_dict.keys()), - index=list(whisper_language_options_dict.values()).index(load_key("whisper.language")) + options=list(langs.keys()), + index=list(langs.values()).index(load_key("whisper.language")) ) - if 
whisper_language_options_dict[selected_whisper_language] != load_key("whisper.language"): - update_key("whisper.language", whisper_language_options_dict[selected_whisper_language]) + if langs[lang] != load_key("whisper.language"): + update_key("whisper.language", langs[lang]) - with col2: + with c2: target_language = st.text_input("目标语言", value=load_key("target_language")) if target_language != load_key("target_language"): update_key("target_language", target_language) - col1, col2 = st.columns(2) - with col1: + c1, c2 = st.columns(2) + with c1: burn_subtitles = st.toggle("烧录字幕", value=load_key("resolution") != "0x0") resolution_options = { @@ -61,7 +59,7 @@ def page_setting(): "360p": "640x360" } - with col2: + with c2: if burn_subtitles: selected_resolution = st.selectbox( "视频分辨率", @@ -74,32 +72,53 @@ def page_setting(): if resolution != load_key("resolution"): update_key("resolution", resolution) - - with st.expander("配音设置", expanded=False): - tts_methods = ["openai_tts", "azure_tts", "gpt_sovits", "fish_tts"] - selected_tts_method = st.selectbox("TTS 方法", options=tts_methods, index=tts_methods.index(load_key("tts_method"))) + + with st.expander("配音设置", expanded=True): + tts_methods = ["sf_fish_tts", "openai_tts", "azure_tts", "gpt_sovits", "fish_tts"] + selected_tts_method = st.selectbox("TTS方法", options=tts_methods, index=tts_methods.index(load_key("tts_method"))) if selected_tts_method != load_key("tts_method"): update_key("tts_method", selected_tts_method) - if selected_tts_method == "openai_tts": - config_text_input("OpenAI 语音", "openai_tts.voice") - config_text_input("OpenAI TTS API 密钥", "openai_tts.api_key") - config_text_input("OpenAI TTS API 基础 URL", "openai_tts.base_url") + if selected_tts_method == "sf_fish_tts": + config_input("SiliconFlow API密钥", "sf_fish_tts.api_key") + + # Add mode selection dropdown + mode_options = { + "preset": "preset", + "custom": "clone(stable)", + "dynamic": "clone(dynamic)" + } + selected_mode = st.selectbox( + "模式选择", + options=list(mode_options.keys()), + format_func=lambda x: mode_options[x], + index=list(mode_options.keys()).index(load_key("sf_fish_tts.mode")) if load_key("sf_fish_tts.mode") in mode_options.keys() else 0 + ) + if selected_mode != load_key("sf_fish_tts.mode"): + update_key("sf_fish_tts.mode", selected_mode) + + if selected_mode == "preset": + config_input("语音", "sf_fish_tts.voice") + + elif selected_tts_method == "openai_tts": + config_input("OpenAI语音", "openai_tts.voice") + config_input("OpenAI TTS API密钥", "openai_tts.api_key") + config_input("OpenAI TTS API基础URL", "openai_tts.base_url") elif selected_tts_method == "fish_tts": - config_text_input("Fish TTS API 密钥", "fish_tts.api_key") - fish_tts_character = st.selectbox("Fish TTS 角色", options=list(load_key("fish_tts.character_id_dict").keys()), index=list(load_key("fish_tts.character_id_dict").keys()).index(load_key("fish_tts.character"))) + config_input("Fish TTS API密钥", "fish_tts.api_key") + fish_tts_character = st.selectbox("Fish TTS角色", options=list(load_key("fish_tts.character_id_dict").keys()), index=list(load_key("fish_tts.character_id_dict").keys()).index(load_key("fish_tts.character"))) if fish_tts_character != load_key("fish_tts.character"): update_key("fish_tts.character", fish_tts_character) elif selected_tts_method == "azure_tts": - config_text_input("Azure 密钥", "azure_tts.key") - config_text_input("Azure 区域", "azure_tts.region") - config_text_input("Azure 语音", "azure_tts.voice") + config_input("Azure密钥", "azure_tts.key") + config_input("Azure区域", 
"azure_tts.region") + config_input("Azure语音", "azure_tts.voice") elif selected_tts_method == "gpt_sovits": - st.info("配置 GPT_SoVITS,请参考 Github 主页") - config_text_input("SoVITS 角色", "gpt_sovits.character") + st.info("配置GPT_SoVITS,请参考Github主页") + config_input("SoVITS角色", "gpt_sovits.character") refer_mode_options = {1: "模式1:仅用提供的参考音频", 2: "模式2:仅用视频第1条语音做参考", 3: "模式3:使用视频每一条语音做参考"} selected_refer_mode = st.selectbox( @@ -112,9 +131,10 @@ def page_setting(): if selected_refer_mode != load_key("gpt_sovits.refer_mode"): update_key("gpt_sovits.refer_mode", selected_refer_mode) -def valid_llm_api(): +def check_api(): try: - response = ask_gpt("This is a test, response 'message':'success' in json format.", response_json=True, log_title='None') - return response.get('message') == 'success' + resp = ask_gpt("This is a test, response 'message':'success' in json format.", + response_json=True, log_title='None') + return resp.get('message') == 'success' except Exception: return False diff --git "a/i18n/\344\270\255\346\226\207/\344\270\200\351\224\256\345\220\257\345\212\250.bat" "b/i18n/\344\270\255\346\226\207/\344\270\200\351\224\256\345\220\257\345\212\250.bat" deleted file mode 100644 index 8fd86efa..00000000 --- "a/i18n/\344\270\255\346\226\207/\344\270\200\351\224\256\345\220\257\345\212\250.bat" +++ /dev/null @@ -1,13 +0,0 @@ -@echo off -cd /d %~dp0 -if exist runtime ( - echo 使用 runtime 文件夹... - runtime\python.exe -m streamlit run st.py -) else ( - echo 未找到 runtime 文件夹,使用 conda 环境,若启动失败说明 conda 不在系统环境中... - call activate videolingo - python -m streamlit run st.py - call deactivate -) - -pause diff --git a/install.py b/install.py index 90dc6a2b..5bb85a7d 100644 --- a/install.py +++ b/install.py @@ -4,33 +4,48 @@ import sys import zipfile import shutil - sys.path.append(os.path.dirname(os.path.abspath(__file__))) +ascii_logo = """ +__ ___ _ _ _ +\ \ / (_) __| | ___ ___ | | (_)_ __ __ _ ___ + \ \ / /| |/ _` |/ _ \/ _ \| | | | '_ \ / _` |/ _ \ + \ V / | | (_| | __/ (_) | |___| | | | | (_| | (_) | + \_/ |_|\__,_|\___|\___/|_____|_|_| |_|\__, |\___/ + |___/ +""" + def install_package(*packages): subprocess.check_call([sys.executable, "-m", "pip", "install", *packages]) -install_package("requests", "rich", "ruamel.yaml") -from pypi_autochoose import main as choose_mirror - def check_gpu(): - """Check if NVIDIA GPU is available""" try: - # 🔍 Try running nvidia-smi command to detect GPU subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) return True except (subprocess.CalledProcessError, FileNotFoundError): return False def main(): + install_package("requests", "rich", "ruamel.yaml") from rich.console import Console from rich.panel import Panel - + from rich.box import DOUBLE console = Console() + + width = max(len(line) for line in ascii_logo.splitlines()) + 4 + welcome_panel = Panel( + ascii_logo, + width=width, + box=DOUBLE, + title="[bold green]🌏[/bold green]", + border_style="bright_blue" + ) + console.print(welcome_panel) + console.print(Panel.fit("🚀 Starting Installation", style="bold magenta")) # Configure mirrors - console.print(Panel("⚙️ Configuring mirrors", style="bold yellow")) + from core.pypi_autochoose import main as choose_mirror choose_mirror() # Detect system and GPU @@ -65,6 +80,10 @@ def install_requirements(): subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]) def download_and_extract_ffmpeg(): + # requires both conda-ffmpeg and ffmpeg.exe + console.print(Panel("📦 Installing ffmpeg through conda...", 
style="cyan")) + subprocess.check_call(["conda", "install", "-y", "ffmpeg"]) + import requests system = platform.system() if system == "Windows": @@ -83,15 +102,15 @@ def download_and_extract_ffmpeg(): print(f"{ffmpeg_exe} already exists") return - print("Downloading FFmpeg") + console.print(Panel("📦 Downloading FFmpeg...", style="cyan")) response = requests.get(url) if response.status_code == 200: filename = "ffmpeg.zip" if system in ["Windows", "Darwin"] else "ffmpeg.tar.xz" with open(filename, 'wb') as f: f.write(response.content) - print(f"FFmpeg downloaded: {filename}") + console.print(Panel(f"FFmpeg downloaded: {filename}", style="cyan")) - print("Extracting FFmpeg") + console.print(Panel("📦 Extracting FFmpeg...", style="cyan")) if system == "Linux": import tarfile with tarfile.open(filename) as tar_ref: @@ -106,15 +125,15 @@ def download_and_extract_ffmpeg(): zip_ref.extract(file) shutil.move(os.path.join(*file.split('/')[:-1], os.path.basename(file)), os.path.basename(file)) - print("Cleaning up") + console.print(Panel("📦 Cleaning up...", style="cyan")) os.remove(filename) if system == "Windows": for item in os.listdir(): if os.path.isdir(item) and "ffmpeg" in item.lower(): shutil.rmtree(item) - print("FFmpeg extraction completed") + console.print(Panel("FFmpeg extraction completed", style="cyan")) else: - print("Failed to download FFmpeg") + console.print(Panel("❌ Failed to download FFmpeg", style="red")) def install_noto_font(): if platform.system() == 'Linux': diff --git a/pip_setup.py b/pip_setup.py new file mode 100644 index 00000000..42d5d06e --- /dev/null +++ b/pip_setup.py @@ -0,0 +1,70 @@ +import os +import subprocess +import sys + +script_dir = os.getcwd() + +def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, env=None): + # Use the conda environment + if environment: + conda_env_path = os.path.join(script_dir, "installer_files", "env") + if sys.platform.startswith("win"): + conda_bat_path = os.path.join(script_dir, "installer_files", "conda", "condabin", "conda.bat") + cmd = "\"" + conda_bat_path + "\" activate \"" + conda_env_path + "\" >nul && " + cmd + else: + conda_sh_path = os.path.join(script_dir, "installer_files", "conda", "etc", "profile.d", "conda.sh") + cmd = ". \"" + conda_sh_path + "\" && conda activate \"" + conda_env_path + "\" && " + cmd + + # Run shell commands + result = subprocess.run(cmd, shell=True, capture_output=capture_output, env=env) + + # Assert the command ran successfully + if assert_success and result.returncode != 0: + print("Command '" + cmd + "' failed with exit status code '" + str(result.returncode) + "'. Exiting...") + sys.exit() + return result + +def check_env(): + # If we have access to conda, we are probably in an environment + conda_exist = run_cmd("conda", environment=True, capture_output=True).returncode == 0 + if not conda_exist: + print("Conda is not installed. Exiting...") + sys.exit() + + # Ensure this is a new environment and not the base environment + if os.environ["CONDA_DEFAULT_ENV"] == "base": + print("Create an environment for this project and activate it. 
Exiting...") + sys.exit() + +def check_gpu_win(): + if not sys.platform.startswith('win'): + return + + CUDNN_PATH = "C:\\Program Files\\NVIDIA\\CUDNN\\v9.3\\bin\\12.6" + + def check_gpu(): + try: + subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) + return True + except (subprocess.CalledProcessError, FileNotFoundError): + return False + + if check_gpu(): + if CUDNN_PATH not in os.environ.get('PATH', ''): + print("🚨 Warning: CUDNN path not found in system environment!") + print(f"⚡ Please add the following path to system PATH:\n{CUDNN_PATH}") + sys.exit(1) + else: + print("✅ CUDNN found in system PATH - All good!") + +def install_dependencies(): + run_cmd("python install.py", assert_success=True, environment=True) + +def run_model(): + run_cmd(f"python -m streamlit run st.py", environment=True) + +if __name__ == "__main__": + check_env() + install_dependencies() + check_gpu_win() + run_model() diff --git a/pypi_autochoose.py b/pypi_autochoose.py deleted file mode 100644 index 0623508b..00000000 --- a/pypi_autochoose.py +++ /dev/null @@ -1,140 +0,0 @@ -import subprocess -import time -import requests -import os -import concurrent.futures -from rich.console import Console -from rich.table import Table -from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn -import sys - -MIRRORS = { - "Tsinghua University": "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple", - "PyPI Official": "https://pypi.org/simple" -} - -console = Console() - -FAST_THRESHOLD = 1000 # ms -SLOW_THRESHOLD = 1500 # ms - -def get_optimal_thread_count(): - try: - cpu_count = os.cpu_count() - return max(cpu_count - 1, 1) - except: - return 2 - -def test_mirror_speed(name, url): - try: - start_time = time.time() - response = requests.get(url, timeout=5) - end_time = time.time() - if response.status_code == 200: - speed = (end_time - start_time) * 1000 - return name, speed - else: - return name, float('inf') - except requests.RequestException: - return name, float('inf') - -def set_pip_mirror(url): - try: - subprocess.run([sys.executable, "-m", "pip", "config", "set", "global.index-url", url], - check=True, - capture_output=True) - return True - except subprocess.CalledProcessError as e: - print(f"Failed to set pip mirror: {e}") - return False - -def get_current_pip_mirror(): - try: - result = subprocess.run([sys.executable, "-m", "pip", "config", "get", "global.index-url"], - capture_output=True, text=True, check=True) - return result.stdout.strip() - except subprocess.CalledProcessError: - return None - -def main(): - console.print("[yellow]Starting new mirror speed test[/yellow]") - - # First test PyPI official mirror - pypi_name = next(name for name, url in MIRRORS.items() if "pypi.org" in url) - pypi_url = MIRRORS[pypi_name] - console.print("[cyan]Testing PyPI official mirror...[/cyan]") - - optimal_thread_count = get_optimal_thread_count() - console.print(f"Using {optimal_thread_count} threads for testing") - - _, pypi_speed = test_mirror_speed(pypi_name, pypi_url) - - if pypi_speed < FAST_THRESHOLD: - console.print(f"PyPI official mirror is fast ({pypi_speed:.2f} ms). Using the official mirror.") - set_pip_mirror(pypi_url) - return - elif pypi_speed < SLOW_THRESHOLD: - console.print(f"PyPI official mirror speed is acceptable ({pypi_speed:.2f} ms). You may continue using it.") - return - - console.print(f"PyPI official mirror is slow ({pypi_speed:.2f} ms). 
Testing other mirrors...") - - # Test other mirrors - speeds = {} - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - ) as progress: - task = progress.add_task("[cyan]Testing mirrors...", total=len(MIRRORS) - 1) # -1 because we already tested PyPI - - with concurrent.futures.ThreadPoolExecutor(max_workers=optimal_thread_count) as executor: - future_to_mirror = {executor.submit(test_mirror_speed, name, url): name for name, url in MIRRORS.items() if name != pypi_name} - for future in concurrent.futures.as_completed(future_to_mirror): - name = future_to_mirror[future] - try: - name, speed = future.result() - if speed != float('inf'): - speeds[name] = speed - except Exception as exc: - print(f'{name} generated an exception: {exc}') - finally: - progress.update(task, advance=1) - - table = Table(title="Mirror Speed Test Results") - table.add_column("Mirror", style="cyan") - table.add_column("Response Time (ms)", justify="right", style="magenta") - - for name, speed in sorted(speeds.items(), key=lambda x: x[1]): - table.add_row(name, f"{speed:.2f}") - - console.print(table) - - if speeds: - fastest_mirror = min(speeds, key=speeds.get) - fastest_url = MIRRORS[fastest_mirror] - console.print(f"\n[green]Fastest mirror: {fastest_mirror} ({fastest_url})[/green]") - console.print(f"[green]Response time: {speeds[fastest_mirror]:.2f} ms[/green]") - - host = fastest_url.split("//")[1].split("/")[0] - if set_pip_mirror(fastest_url): - current_mirror = get_current_pip_mirror() - console.print(f"\n[yellow]Current pip source: {current_mirror}[/yellow]") - - if current_mirror == fastest_url: - console.print(f"[bold green]Successfully switched to {fastest_mirror} mirror.[/bold green]") - else: - console.print("[bold red]Switch failed. Current pip source doesn't match the expected one.[/bold red]") - console.print(f"[yellow]Expected pip source: {fastest_url}[/yellow]") - console.print("[yellow]Please check the configuration manually or try running this script with administrator privileges.[/yellow]") - else: - console.print("[bold red]Failed to switch mirror, will continue using the current source.[/bold red]") - current_mirror = get_current_pip_mirror() - console.print(f"[yellow]Current pip source: {current_mirror}[/yellow]") - console.print("[yellow]Please check if you have sufficient permissions to modify pip configuration.[/yellow]") - else: - console.print("[bold red]All mirrors are unreachable. Please check your network connection.[/bold red]") - -if __name__ == "__main__": - main() diff --git a/requirements.txt b/requirements.txt index 600c1228..287d5daf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,8 @@ yt-dlp json-repair ruamel.yaml autocorrect-py -demucs[dev] @ git+https://github.com/adefossez/demucs \ No newline at end of file +demucs[dev] @ git+https://github.com/adefossez/demucs + +syllables +pypinyin +g2p-en diff --git a/st.py b/st.py index 5eaa2988..951ae1a5 100644 --- a/st.py +++ b/st.py @@ -10,6 +10,9 @@ st.set_page_config(page_title="VideoLingo", page_icon="docs/logo.svg") +SUB_VIDEO = "output/output_sub.mp4" +DUB_VIDEO = "output/output_dub.mp4" + def text_processing_section(): st.header("Translate and Generate Subtitles") with st.container(border=True): @@ -25,13 +28,13 @@ def text_processing_section(): 6. 
Merging subtitles into the video """, unsafe_allow_html=True) - if not os.path.exists("output/output_video_with_subs.mp4"): + if not os.path.exists(SUB_VIDEO): if st.button("Start Processing Subtitles", key="text_processing_button"): process_text() st.rerun() else: if load_key("resolution") != "0x0": - st.video("output/output_video_with_subs.mp4") + st.video(SUB_VIDEO) download_subtitle_zip_button(text="Download All Srt Files") if st.button("Archive to 'history'", key="cleanup_in_text_processing"): @@ -60,24 +63,25 @@ def process_text(): st.balloons() def audio_processing_section(): - st.header("Dubbing (beta)") + st.header("Dubbing") with st.container(border=True): st.markdown("""

This stage includes the following steps:

- 1. Generate audio tasks
- 2. Generate audio
- 3. Merge audio into the video
+ 1. Generate audio tasks and chunks
+ 2. Extract reference audio
+ 3. Generate and merge audio files
+ 4. Merge final audio into video """, unsafe_allow_html=True) - if not os.path.exists("output/output_video_with_audio.mp4"): + if not os.path.exists(DUB_VIDEO): if st.button("Start Audio Processing", key="audio_processing_button"): process_audio() st.rerun() else: st.success("Audio processing is complete! You can check the audio files in the `output` folder.") if load_key("resolution") != "0x0": - st.video("output/output_video_with_audio.mp4") + st.video(DUB_VIDEO) if st.button("Delete dubbing files", key="delete_dubbing_files"): delete_dubbing_files() st.rerun() @@ -87,13 +91,16 @@ def audio_processing_section(): def process_audio(): with st.spinner("Generate audio tasks"): - step8_gen_audio_task.gen_audio_task_main() + step8_1_gen_audio_task.gen_audio_task_main() + step8_2_gen_dub_chunks.gen_dub_chunks() with st.spinner("Extract refer audio"): step9_extract_refer_audio.extract_refer_audio_main() - with st.spinner("Generate audio"): - step10_gen_audio.process_sovits_tasks() - with st.spinner("Merge audio into the video"): - step11_merge_audio_to_vid.merge_main() + with st.spinner("Generate all audio"): + step10_gen_audio.gen_audio() + with st.spinner("Merge full audio"): + step11_merge_full_audio.merge_full_audio() + with st.spinner("Merge dubbing to the video"): + step12_merge_dub_to_vid.merge_video_audio() st.success("Audio processing complete! 🎇") st.balloons() diff --git a/st_components/icon.png b/st_components/icon.png deleted file mode 100644 index c26080fd..00000000 Binary files a/st_components/icon.png and /dev/null differ diff --git a/st_components/imports_and_utils.py b/st_components/imports_and_utils.py index 76985fc5..9a6fc4fa 100644 --- a/st_components/imports_and_utils.py +++ b/st_components/imports_and_utils.py @@ -1,8 +1,31 @@ import os, sys sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from core import step1_ytdlp, step2_whisperX, step3_1_spacy_split, step3_2_splitbymeaning, step9_extract_refer_audio -from core import step4_1_summarize, step4_2_translate_all, step5_splitforsub, step6_generate_final_timeline -from core import step7_merge_sub_to_vid, step8_gen_audio_task, step10_gen_audio, step11_merge_audio_to_vid +from core import ( + # Download & Transcribe 📥 + step11_merge_full_audio, + step1_ytdlp, + step2_whisperX, + + # Text Processing & Analysis 📝 + step3_1_spacy_split, + step3_2_splitbymeaning, + step4_1_summarize, + step4_2_translate_all, + step5_splitforsub, + + # Subtitle Timeline & Merging 🎬 + step6_generate_final_timeline, + step7_merge_sub_to_vid, + + # Audio Generation & Processing 🎵 + step8_1_gen_audio_task, + step8_2_gen_dub_chunks, + step9_extract_refer_audio, + step10_gen_audio, + + # Final Video Composition 🎥 + step12_merge_dub_to_vid +) from core.onekeycleanup import cleanup from core.delete_retry_dubbing import delete_dubbing_files from core.ask_gpt import ask_gpt diff --git a/st_components/sidebar_setting.py b/st_components/sidebar_setting.py index db371633..b0d88ec5 100644 --- a/st_components/sidebar_setting.py +++ b/st_components/sidebar_setting.py @@ -4,56 +4,54 @@ import streamlit as st from core.config_utils import update_key, load_key -def config_text_input(label, key, help=None): - """Generic config text input handler""" - value = st.text_input(label, value=load_key(key), help=help) - if value != load_key(key): - update_key(key, value) - return value +def config_input(label, key, help=None): + """Generic config input handler""" + val = st.text_input(label, value=load_key(key), help=help) + if val != 
load_key(key): + update_key(key, val) + return val def page_setting(): with st.expander("LLM Configuration", expanded=True): - config_text_input("API_KEY", "api.key") - config_text_input("BASE_URL", "api.base_url", help="Base URL for API requests") + config_input("API_KEY", "api.key") + config_input("BASE_URL", "api.base_url", help="Base URL for API requests") - col1, col2 = st.columns([4, 1]) - with col1: - config_text_input("MODEL", "api.model") - with col2: + c1, c2 = st.columns([4, 1]) + with c1: + config_input("MODEL", "api.model") + with c2: if st.button("📡", key="api"): - if valid_llm_api(): - st.toast("API Key is valid", icon="✅") - else: - st.toast("API Key is invalid", icon="❌") + st.toast("API Key is valid" if check_api() else "API Key is invalid", + icon="✅" if check_api() else "❌") with st.expander("Transcription and Subtitle Settings", expanded=True): - col1, col2 = st.columns(2) - with col1: - whisper_language_options_dict = { - "🇺🇸 English": "en", - "🇨🇳 简体中文": "zh", - "🇪🇸 Español": "es", - "🇷🇺 Русский": "ru", - "🇫🇷 Français": "fr", - "🇩🇪 Deutsch": "de", - "🇮🇹 Italiano": "it", - "🇯🇵 日本語": "ja" + c1, c2 = st.columns(2) + with c1: + langs = { + "🇺🇸 English": "en", + "🇨🇳 简体中文": "zh", + "🇪🇸 Español": "es", + "🇷🇺 Русский": "ru", + "🇫🇷 Français": "fr", + "🇩🇪 Deutsch": "de", + "🇮🇹 Italiano": "it", + "🇯🇵 日本語": "ja" } - selected_whisper_language = st.selectbox( + lang = st.selectbox( "Recognition Language:", - options=list(whisper_language_options_dict.keys()), - index=list(whisper_language_options_dict.values()).index(load_key("whisper.language")) + options=list(langs.keys()), + index=list(langs.values()).index(load_key("whisper.language")) ) - if whisper_language_options_dict[selected_whisper_language] != load_key("whisper.language"): - update_key("whisper.language", whisper_language_options_dict[selected_whisper_language]) + if langs[lang] != load_key("whisper.language"): + update_key("whisper.language", langs[lang]) - with col2: + with c2: target_language = st.text_input("Target Language", value=load_key("target_language")) if target_language != load_key("target_language"): update_key("target_language", target_language) - col1, col2 = st.columns(2) - with col1: + c1, c2 = st.columns(2) + with c1: burn_subtitles = st.toggle("Burn Subtitles", value=load_key("resolution") != "0x0") resolution_options = { @@ -61,7 +59,7 @@ def page_setting(): "360p": "640x360" } - with col2: + with c2: if burn_subtitles: selected_resolution = st.selectbox( "Video Resolution", @@ -75,31 +73,52 @@ def page_setting(): if resolution != load_key("resolution"): update_key("resolution", resolution) - with st.expander("Dubbing Settings", expanded=False): - tts_methods = ["openai_tts", "azure_tts", "gpt_sovits", "fish_tts"] + with st.expander("Dubbing Settings", expanded=True): + tts_methods = ["sf_fish_tts", "openai_tts", "azure_tts", "gpt_sovits", "fish_tts"] selected_tts_method = st.selectbox("TTS Method", options=tts_methods, index=tts_methods.index(load_key("tts_method"))) if selected_tts_method != load_key("tts_method"): update_key("tts_method", selected_tts_method) - if selected_tts_method == "openai_tts": - config_text_input("OpenAI Voice", "openai_tts.voice") - config_text_input("OpenAI TTS API Key", "openai_tts.api_key") - config_text_input("OpenAI TTS API Base URL", "openai_tts.base_url") + if selected_tts_method == "sf_fish_tts": + config_input("SiliconFlow API Key", "sf_fish_tts.api_key") + + # Add mode selection dropdown + mode_options = { + "preset": "Preset", + "custom": "Refer_stable", + 
"dynamic": "Refer_dynamic" + } + selected_mode = st.selectbox( + "Mode Selection", + options=list(mode_options.keys()), + format_func=lambda x: mode_options[x], + index=list(mode_options.keys()).index(load_key("sf_fish_tts.mode")) if load_key("sf_fish_tts.mode") in mode_options.keys() else 0 + ) + if selected_mode != load_key("sf_fish_tts.mode"): + update_key("sf_fish_tts.mode", selected_mode) + + if selected_mode == "preset": + config_input("Voice", "sf_fish_tts.voice") + + elif selected_tts_method == "openai_tts": + config_input("OpenAI Voice", "openai_tts.voice") + config_input("OpenAI TTS API Key", "openai_tts.api_key") + config_input("OpenAI TTS API Base URL", "openai_tts.base_url") elif selected_tts_method == "fish_tts": - config_text_input("Fish TTS API Key", "fish_tts.api_key") + config_input("Fish TTS API Key", "fish_tts.api_key") fish_tts_character = st.selectbox("Fish TTS Character", options=list(load_key("fish_tts.character_id_dict").keys()), index=list(load_key("fish_tts.character_id_dict").keys()).index(load_key("fish_tts.character"))) if fish_tts_character != load_key("fish_tts.character"): update_key("fish_tts.character", fish_tts_character) elif selected_tts_method == "azure_tts": - config_text_input("Azure Key", "azure_tts.key") - config_text_input("Azure Region", "azure_tts.region") - config_text_input("Azure Voice", "azure_tts.voice") + config_input("Azure Key", "azure_tts.key") + config_input("Azure Region", "azure_tts.region") + config_input("Azure Voice", "azure_tts.voice") elif selected_tts_method == "gpt_sovits": st.info("配置GPT_SoVITS,请参考Github主页") - config_text_input("SoVITS Character", "gpt_sovits.character") + config_input("SoVITS Character", "gpt_sovits.character") refer_mode_options = {1: "模式1:仅用提供的参考音频", 2: "模式2:仅用视频第1条语音做参考", 3: "模式3:使用视频每一条语音做参考"} selected_refer_mode = st.selectbox( @@ -112,9 +131,10 @@ def page_setting(): if selected_refer_mode != load_key("gpt_sovits.refer_mode"): update_key("gpt_sovits.refer_mode", selected_refer_mode) -def valid_llm_api(): +def check_api(): try: - response = ask_gpt("This is a test, response 'message':'success' in json format.", response_json=True, log_title='None') - return response.get('message') == 'success' + resp = ask_gpt("This is a test, response 'message':'success' in json format.", + response_json=True, log_title='None') + return resp.get('message') == 'success' except Exception: return False \ No newline at end of file