From f75a3c30f5b0efddd54d0bacd933bd66e4128981 Mon Sep 17 00:00:00 2001 From: HUANYU XU <147359634+Huanshere@users.noreply.github.com> Date: Thu, 5 Sep 2024 21:01:08 +0800 Subject: [PATCH 1/5] update not ready --- ...ep2_whisperapi.py => _step2_whisperapi.py} | 0 core/prompts_storage.py | 2 +- core/spacy_utils/load_nlp_model.py | 54 +++++----- core/spacy_utils/split_by_mark.py | 20 +++- core/step2_whisper_stamped.py | 101 ++++++++++++++++++ core/step6_generate_final_timeline.py | 2 +- requirements.txt | 3 + st.py | 3 +- st_components/imports_and_utils.py | 2 +- 9 files changed, 153 insertions(+), 34 deletions(-) rename core/{step2_whisperapi.py => _step2_whisperapi.py} (100%) create mode 100644 core/step2_whisper_stamped.py diff --git a/core/step2_whisperapi.py b/core/_step2_whisperapi.py similarity index 100% rename from core/step2_whisperapi.py rename to core/_step2_whisperapi.py diff --git a/core/prompts_storage.py b/core/prompts_storage.py index 7f71ed7a..f19d6dbc 100644 --- a/core/prompts_storage.py +++ b/core/prompts_storage.py @@ -1,6 +1,6 @@ import os,sys,json sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from core.step2_whisperapi import get_whisper_language +from core.step2_whisper_stamped import get_whisper_language ## ================================================================ # @ step4_splitbymeaning.py def get_split_prompt(sentence, num_parts = 2, word_limit = 20): diff --git a/core/spacy_utils/load_nlp_model.py b/core/spacy_utils/load_nlp_model.py index 02db317a..7a1458b9 100644 --- a/core/spacy_utils/load_nlp_model.py +++ b/core/spacy_utils/load_nlp_model.py @@ -2,35 +2,35 @@ import spacy from spacy.cli import download sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) -from core.step2_whisperapi import get_whisper_language +from core.step2_whisper_stamped import get_whisper_language def get_spacy_model(language: str): language_map = { - "english": "en_core_web_sm", - "chinese": "zh_core_web_sm", - "spanish": "es_core_news_sm", - "french": "fr_core_news_sm", - "german": "de_core_news_sm", - "italian": "it_core_news_sm", - "japanese": "ja_core_news_sm", - "portuguese": "pt_core_news_sm", - "dutch": "nl_core_news_sm", - "greek": "el_core_news_sm", - "russian": "ru_core_news_sm", - "arabic": "ar_core_news_sm", - "hindi": "hi_core_news_sm", - "korean": "ko_core_news_sm", - "polish": "pl_core_news_sm", - "ukrainian": "uk_core_news_sm", - "vietnamese": "vi_core_news_sm", - "turkish": "tr_core_news_sm", - "thai": "th_core_news_sm", - "romanian": "ro_core_news_sm", - "danish": "da_core_news_sm", - "finnish": "fi_core_news_sm", - "hungarian": "hu_core_news_sm", - "norwegian": "nb_core_news_sm", - "swedish": "sv_core_news_sm" + "en": "en_core_web_sm", + "zh": "zh_core_web_sm", + "es": "es_core_news_sm", + "fr": "fr_core_news_sm", + "de": "de_core_news_sm", + "it": "it_core_news_sm", + "ja": "ja_core_news_sm", + "pt": "pt_core_news_sm", + "nl": "nl_core_news_sm", + "el": "el_core_news_sm", + "ru": "ru_core_news_sm", + "ar": "ar_core_news_sm", + "hi": "hi_core_news_sm", + "ko": "ko_core_news_sm", + "pl": "pl_core_news_sm", + "uk": "uk_core_news_sm", + "vi": "vi_core_news_sm", + "tr": "tr_core_news_sm", + "th": "th_core_news_sm", + "ro": "ro_core_news_sm", + "da": "da_core_news_sm", + "fi": "fi_core_news_sm", + "hu": "hu_core_news_sm", + "nb": "nb_core_news_sm", + "sv": "sv_core_news_sm" } model = language_map.get(language.lower(), "en_core_web_sm") @@ -42,7 +42,7 @@ def init_nlp(): try: from config import WHISPER_LANGUAGE if WHISPER_LANGUAGE == "en": - language = "english" + language = "en" else: language = get_whisper_language() model = get_spacy_model(language) diff --git a/core/spacy_utils/split_by_mark.py b/core/spacy_utils/split_by_mark.py index 430314a9..a31e529d 100644 --- a/core/spacy_utils/split_by_mark.py +++ b/core/spacy_utils/split_by_mark.py @@ -1,12 +1,28 @@ import warnings warnings.filterwarnings("ignore", category=FutureWarning) import os,sys -sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from load_nlp_model import init_nlp +import pandas as pd +from step2_whisper_stamped import get_whisper_language def split_by_mark(): + language = get_whisper_language() + # 支持的语言代码列表 + supported_languages = ['en', 'zh', 'es', 'fr', 'de', 'it', 'ja', 'pt', 'nl', 'el', 'ru', 'ar', 'hi', 'ko', 'pl', 'uk', 'vi', 'tr', 'th', 'ro', 'da', 'fi', 'hu', 'nb', 'sv'] + + # 检查输入的语言是否支持 + if language not in supported_languages: + raise ValueError(f"不支持的语言代码: {language}。支持的语言代码为: {', '.join(supported_languages)}") + nlp = init_nlp() - input_text = open("output/log/raw_transcript.txt", "r", encoding="utf-8").read() + chunks = pd.read_excel("output/log/cleaned_chunks.xlsx") + chunks.text = chunks.text.apply(lambda x: x.strip('"')) + + # 定义需要空格拼接的语言列表 + space_join_languages = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'el', 'ru', 'pl', 'uk', 'ro', 'da', 'fi', 'hu', 'nb', 'sv'] + input_text = " ".join(chunks.text.to_list()) if language in space_join_languages else "".join(chunks.text.to_list()) + doc = nlp(input_text) assert doc.has_annotation("SENT_START") diff --git a/core/step2_whisper_stamped.py b/core/step2_whisper_stamped.py new file mode 100644 index 00000000..4c63b9c6 --- /dev/null +++ b/core/step2_whisper_stamped.py @@ -0,0 +1,101 @@ +import os,sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import subprocess +import whisper_timestamped as whisper +import torch +import pandas as pd +from typing import List, Dict +import warnings +warnings.filterwarnings("ignore") +import json + +def convert_video_to_audio_and_transcribe(input_file: str): + from config import WHISPER_MODEL, MODEL_DIR, WHISPER_LANGUAGE + # 🎬➡️🎵➡️📊 Convert video to audio and transcribe + # audio_file = os.path.splitext(input_file)[0] + '_temp.mp3' + os.makedirs('output/audio', exist_ok=True) + audio_file = 'output/audio/raw_full_audio.wav' + + if not os.path.exists(audio_file): + # Convert video to audio + ffmpeg_cmd = [ + 'ffmpeg', + '-i', input_file, + '-vn', + '-acodec', 'libmp3lame', + '-ar', '16000', + '-b:a', '64k', + audio_file + ] + print(f"🎬➡️🎵 正在转换为音频......") + subprocess.run(ffmpeg_cmd, check=True, stderr=subprocess.PIPE) + print(f"🎬➡️🎵 已将 <{input_file}> 转换为 <{audio_file}>\n") + + # Check file size + if os.path.getsize(audio_file) > 25 * 1024 * 1024: + print("⚠️ 文件大小超过25MB。请使用更小的文件。") + return None + + # Transcribe audio + device = 'cuda:0' if torch.cuda.is_available() else 'cpu' + print(f"🚀 正在启动Whisper...\n🖥️ ASR设备: {device}") + print("此步骤会花费很长时间,尤其会在100%后仍然处理很长时间...") + + audio = whisper.load_audio(audio_file) + os.makedirs(MODEL_DIR, exist_ok=True) + model = whisper.load_model(WHISPER_MODEL, device=device, download_root=MODEL_DIR) + if WHISPER_LANGUAGE == 'auto': + # result = whisper.transcribe(model, audio, beam_size=5, best_of=5, detect_disfluencies=True, vad=True, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)) + result = whisper.transcribe(model, audio, beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)) + else: + result = whisper.transcribe(model, audio, beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0), language=WHISPER_LANGUAGE) + + # 将 result['language'] 保存到 output\log\transcript_language.json,格式如 {"language": "japanese"} + os.makedirs('output/log', exist_ok=True) + with open('output/log/transcript_language.json', 'w', encoding='utf-8') as f: + json.dump({"language": result['language']}, f, ensure_ascii=False, indent=4) + print(f"📝 已将识别到的语言保存到 output/log/transcript_language.json") + + # Process transcription results + all_words: List[Dict[str, float]] = [ + {'text': f"{word['text']}", 'start': word['start'], 'end': word['end']} + for segment in result['segments'] + for word in segment['words'] + ] + + df = pd.DataFrame(all_words) + return df + +def save_results(df: pd.DataFrame): + # 💾 Save transcription results as Excel and text files + os.makedirs('output', exist_ok=True) + os.makedirs('output/log', exist_ok=True) + excel_path = os.path.join('output/log', "cleaned_chunks.xlsx") + # 给df[text]列都加上"",防止数字被excel自动转换为数字 + df['text'] = df['text'].apply(lambda x: f'"{x}"') + df.to_excel(excel_path, index=False) + print(f"📊 Excel文件已保存到 {excel_path}") + +def get_whisper_language(): + try: + with open("output/log/transcript_language.json", "r", encoding='utf-8') as f: + language = json.load(f)["language"] + return language + except: + print("无法读取语言信息") + return None + +def transcribe(video_file: StopIteration): + if not os.path.exists("output/log/cleaned_chunks.xlsx"): + # 🎥➡️📝 Transcribe video to text + df = convert_video_to_audio_and_transcribe(video_file) + if df is not None: + save_results(df) + else: + print("📊 转录结果已存在,跳过转录步骤。") + +if __name__ == "__main__": + from core.step1_ytdlp import find_video_files + video_file = find_video_files() + print(f"🎬 找到的视频文件: {video_file}, 开始转录...") + transcribe(video_file) \ No newline at end of file diff --git a/core/step6_generate_final_timeline.py b/core/step6_generate_final_timeline.py index 77298187..84378f5e 100644 --- a/core/step6_generate_final_timeline.py +++ b/core/step6_generate_final_timeline.py @@ -54,7 +54,7 @@ def get_sentence_timestamps(df_words, df_sentences): break word_index += 1 - if best_match['score'] > 0: + if best_match['score'] <0.9: print("原句:", sentence) print("匹配:", best_match['phrase']) print("相似度:{:.2f}".format(best_match['score'])) diff --git a/requirements.txt b/requirements.txt index a3acb3e7..2cce0e89 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,9 @@ json-repair resampy streamlit +whisper-timestamped +onnxruntime + ##! gpt-sovits requirements # pydantic # soundfile diff --git a/st.py b/st.py index d40e283e..21870bc9 100644 --- a/st.py +++ b/st.py @@ -45,8 +45,7 @@ def process_text(): video_file = step1_ytdlp.find_video_files() with st.spinner("使用Whisper进行转录..."): - # step2_whisper_stamped.transcript(video_file) - step2_whisperapi.transcribe(video_file) + step2_whisper_stamped.transcribe(video_file) with st.spinner("分割长句..."): step3_1_spacy_split.split_by_spacy() step3_2_splitbymeaning.split_sentences_by_meaning() diff --git a/st_components/imports_and_utils.py b/st_components/imports_and_utils.py index 57667940..dde90089 100644 --- a/st_components/imports_and_utils.py +++ b/st_components/imports_and_utils.py @@ -1,6 +1,6 @@ import os, sys sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from core import step1_ytdlp, step2_whisperapi, step3_1_spacy_split, step3_2_splitbymeaning +from core import step2_whisper_stamped, step1_ytdlp, step3_1_spacy_split, step3_2_splitbymeaning from core import step4_1_summarize, step4_2_translate_all, step5_splitforsub, step6_generate_final_timeline from core import step7_merge_sub_to_vid, step8_extract_refer_audio, step9_generate_audio_task from core import step10_generate_audio, step11_merge_audio_to_vid From 0cd00c89c58634244a995e00a9742afcd536c14e Mon Sep 17 00:00:00 2001 From: HUANYU XU Date: Fri, 6 Sep 2024 00:11:13 +0800 Subject: [PATCH 2/5] update add joiner func --- config.example.py | 36 +++++++++++++++++++++++- core/spacy_utils/load_nlp_model.py | 38 +++----------------------- core/spacy_utils/split_by_connector.py | 2 +- core/spacy_utils/split_by_mark.py | 23 ++++++---------- core/step2_whisper_stamped.py | 12 ++++---- core/step3_2_splitbymeaning.py | 31 +++++++-------------- core/step6_generate_final_timeline.py | 16 +++++++---- 7 files changed, 74 insertions(+), 84 deletions(-) diff --git a/config.example.py b/config.example.py index 49c15c78..d6d56863 100644 --- a/config.example.py +++ b/config.example.py @@ -26,6 +26,7 @@ ## ======================== 进阶设置设置 ======================== ## # Whisper 指定识别语言 WHISPER_LANGUAGE = 'auto' +WHISPER_MODEL = 'large-v2' # 支持视频格式 ALLOWED_VIDEO_FORMATS = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm'] @@ -60,4 +61,37 @@ ORIGINAL_VOLUME = 0.1 # 第一次粗切单词数,18以下会切太碎影响翻译,22 以上太长会导致后续为字幕切分难以对齐 -MAX_SPLIT_LENGTH = 20 \ No newline at end of file +MAX_SPLIT_LENGTH = 20 + +## ======================== 语言模型 ======================== ## +# Spacy model +SPACY_MODEL_MAP = { + "en": "en_core_web_sm", + "zh": "zh_core_web_sm", + "es": "es_core_news_sm", + "fr": "fr_core_news_sm", + "de": "de_core_news_sm", + "it": "it_core_news_sm", + "ja": "ja_core_news_sm", + "pt": "pt_core_news_sm", + "nl": "nl_core_news_sm", + "el": "el_core_news_sm", + "ru": "ru_core_news_sm", + "ar": "ar_core_news_sm", + "hi": "hi_core_news_sm", + "ko": "ko_core_news_sm", + "pl": "pl_core_news_sm", + "uk": "uk_core_news_sm", + "vi": "vi_core_news_sm", + "tr": "tr_core_news_sm", + "th": "th_core_news_sm", + "ro": "ro_core_news_sm", + "da": "da_core_news_sm", + "fi": "fi_core_news_sm", + "hu": "hu_core_news_sm", + "nb": "nb_core_news_sm", + "sv": "sv_core_news_sm" +} + +LANGUAGE_SPLIT_WITH_SPACE = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'el', 'ru', 'ar', 'hi', 'pl', 'uk', 'vi', 'tr', 'ro', 'da', 'fi', 'hu', 'nb', 'sv'] +LANGUAGE_SPLIT_WITHOUT_SPACE = ['zh', 'ja', 'th', 'ko'] \ No newline at end of file diff --git a/core/spacy_utils/load_nlp_model.py b/core/spacy_utils/load_nlp_model.py index 7a1458b9..005b0636 100644 --- a/core/spacy_utils/load_nlp_model.py +++ b/core/spacy_utils/load_nlp_model.py @@ -3,48 +3,18 @@ from spacy.cli import download sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from core.step2_whisper_stamped import get_whisper_language +from config import SPACY_MODEL_MAP def get_spacy_model(language: str): - language_map = { - "en": "en_core_web_sm", - "zh": "zh_core_web_sm", - "es": "es_core_news_sm", - "fr": "fr_core_news_sm", - "de": "de_core_news_sm", - "it": "it_core_news_sm", - "ja": "ja_core_news_sm", - "pt": "pt_core_news_sm", - "nl": "nl_core_news_sm", - "el": "el_core_news_sm", - "ru": "ru_core_news_sm", - "ar": "ar_core_news_sm", - "hi": "hi_core_news_sm", - "ko": "ko_core_news_sm", - "pl": "pl_core_news_sm", - "uk": "uk_core_news_sm", - "vi": "vi_core_news_sm", - "tr": "tr_core_news_sm", - "th": "th_core_news_sm", - "ro": "ro_core_news_sm", - "da": "da_core_news_sm", - "fi": "fi_core_news_sm", - "hu": "hu_core_news_sm", - "nb": "nb_core_news_sm", - "sv": "sv_core_news_sm" - } - - model = language_map.get(language.lower(), "en_core_web_sm") - if language not in language_map: + model = SPACY_MODEL_MAP.get(language.lower(), "en_core_web_sm") + if language not in SPACY_MODEL_MAP: print(f"Spacy 模型不支持'{language}',使用 en_core_web_sm 模型作为后备选项...") return model def init_nlp(): try: from config import WHISPER_LANGUAGE - if WHISPER_LANGUAGE == "en": - language = "en" - else: - language = get_whisper_language() + language = "en" if WHISPER_LANGUAGE == "en" else get_whisper_language() model = get_spacy_model(language) print(f"⏳ 正在加载 NLP Spacy 模型: <{model}> ...") try: diff --git a/core/spacy_utils/split_by_connector.py b/core/spacy_utils/split_by_connector.py index db1ca60c..63f46556 100644 --- a/core/spacy_utils/split_by_connector.py +++ b/core/spacy_utils/split_by_connector.py @@ -13,7 +13,7 @@ def analyze_connectors(doc, token): 2. For 'that', check if it's part of a contraction (e.g., that's, that'll). 3. For all connectors, check if they function as a 'mark' dependent of a verb. 4. For 'which', 'where', 'when', check if they function as determiners or pronouns - for nouns or proper nouns. + for nouns or proper nouns. 5. Default to splitting for 'which', 'where', 'when' if no other conditions are met. 6. For 'and', 'or', 'but', check if they connect two independent clauses. """ diff --git a/core/spacy_utils/split_by_mark.py b/core/spacy_utils/split_by_mark.py index a31e529d..0fc9b841 100644 --- a/core/spacy_utils/split_by_mark.py +++ b/core/spacy_utils/split_by_mark.py @@ -1,27 +1,22 @@ import warnings warnings.filterwarnings("ignore", category=FutureWarning) import os,sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from load_nlp_model import init_nlp import pandas as pd -from step2_whisper_stamped import get_whisper_language +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) +from core.spacy_utils.load_nlp_model import init_nlp +from core.step2_whisper_stamped import get_whisper_language +from config import get_joiner, WHISPER_LANGUAGE def split_by_mark(): - language = get_whisper_language() - # 支持的语言代码列表 - supported_languages = ['en', 'zh', 'es', 'fr', 'de', 'it', 'ja', 'pt', 'nl', 'el', 'ru', 'ar', 'hi', 'ko', 'pl', 'uk', 'vi', 'tr', 'th', 'ro', 'da', 'fi', 'hu', 'nb', 'sv'] - - # 检查输入的语言是否支持 - if language not in supported_languages: - raise ValueError(f"不支持的语言代码: {language}。支持的语言代码为: {', '.join(supported_languages)}") - + language = get_whisper_language() if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE # 考虑强制英文的情况 + joiner = get_joiner(language) + print(f"🔍 正在使用 {language} 语言的拼接方式: '{joiner}'") s nlp = init_nlp() chunks = pd.read_excel("output/log/cleaned_chunks.xlsx") chunks.text = chunks.text.apply(lambda x: x.strip('"')) - # 定义需要空格拼接的语言列表 - space_join_languages = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'el', 'ru', 'pl', 'uk', 'ro', 'da', 'fi', 'hu', 'nb', 'sv'] - input_text = " ".join(chunks.text.to_list()) if language in space_join_languages else "".join(chunks.text.to_list()) + # 用 joiner 拼接 + input_text = joiner.join(chunks.text.to_list()) doc = nlp(input_text) assert doc.has_annotation("SENT_START") diff --git a/core/step2_whisper_stamped.py b/core/step2_whisper_stamped.py index 4c63b9c6..ded9aaf9 100644 --- a/core/step2_whisper_stamped.py +++ b/core/step2_whisper_stamped.py @@ -12,7 +12,6 @@ def convert_video_to_audio_and_transcribe(input_file: str): from config import WHISPER_MODEL, MODEL_DIR, WHISPER_LANGUAGE # 🎬➡️🎵➡️📊 Convert video to audio and transcribe - # audio_file = os.path.splitext(input_file)[0] + '_temp.mp3' os.makedirs('output/audio', exist_ok=True) audio_file = 'output/audio/raw_full_audio.wav' @@ -44,13 +43,12 @@ def convert_video_to_audio_and_transcribe(input_file: str): audio = whisper.load_audio(audio_file) os.makedirs(MODEL_DIR, exist_ok=True) model = whisper.load_model(WHISPER_MODEL, device=device, download_root=MODEL_DIR) - if WHISPER_LANGUAGE == 'auto': - # result = whisper.transcribe(model, audio, beam_size=5, best_of=5, detect_disfluencies=True, vad=True, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)) - result = whisper.transcribe(model, audio, beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)) - else: - result = whisper.transcribe(model, audio, beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0), language=WHISPER_LANGUAGE) - # 将 result['language'] 保存到 output\log\transcript_language.json,格式如 {"language": "japanese"} + transcribe_params = {'model': model, 'audio': audio, 'beam_size': 5, 'best_of': 5, 'temperature': (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)} + if WHISPER_LANGUAGE != 'auto': + transcribe_params['language'] = WHISPER_LANGUAGE + result = whisper.transcribe(**transcribe_params) + os.makedirs('output/log', exist_ok=True) with open('output/log/transcript_language.json', 'w', encoding='utf-8') as f: json.dump({"language": result['language']}, f, ensure_ascii=False, indent=4) diff --git a/core/step3_2_splitbymeaning.py b/core/step3_2_splitbymeaning.py index 33dffec9..52bed971 100644 --- a/core/step3_2_splitbymeaning.py +++ b/core/step3_2_splitbymeaning.py @@ -6,6 +6,8 @@ from difflib import SequenceMatcher import math from core.spacy_utils.load_nlp_model import init_nlp +from config import get_joiner, WHISPER_LANGUAGE +from core.step2_whisper_stamped import get_whisper_language def tokenize_sentence(sentence, nlp): # 分词器 统计句子单词数量 @@ -16,17 +18,16 @@ def find_split_positions(original, modified): split_positions = [] parts = modified.split('[br]') start = 0 - - # 移除原始句子中的所有空格 - original_no_space = ''.join(original.split()) + language = get_whisper_language() if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE + joiner = get_joiner(language) for i in range(len(parts) - 1): max_similarity = 0 best_split = None - for j in range(start, len(original_no_space)): - original_left = original_no_space[start:j] - modified_left = ''.join(parts[i].split()) # 移除修改后部分的空格 + for j in range(start, len(original)): + original_left = original[start:j] + modified_left = joiner.join(parts[i].split()) left_similarity = SequenceMatcher(None, original_left, modified_left).ratio() @@ -37,25 +38,13 @@ def find_split_positions(original, modified): if max_similarity < 0.9: print(f"警告:找到的最佳分割点相似度较低 {max_similarity}") if best_split is not None: - # 将无空格版本的分割点映射回原始句子 - original_split = map_no_space_to_original(original, best_split) - split_positions.append(original_split) - start = best_split + 1 + split_positions.append(best_split) + start = best_split else: print(f"警告:无法为第 {i+1} 部分找到合适的分割点。") return split_positions - -def map_no_space_to_original(original, no_space_index): - # 将无空格版本的索引映射回原始句子 - space_count = 0 - for i, char in enumerate(original): - if char.isspace(): - space_count += 1 - elif i - space_count == no_space_index: - return i - return len(original) # 如果没有找到匹配,返回原始句子的长度 - +o def split_sentence(sentence, num_parts, word_limit=18, index=-1, retry_attempt=0): """Split a long sentence using GPT and return the result as a string.""" split_prompt = get_split_prompt(sentence, num_parts, word_limit) diff --git a/core/step6_generate_final_timeline.py b/core/step6_generate_final_timeline.py index 84378f5e..de3f3a33 100644 --- a/core/step6_generate_final_timeline.py +++ b/core/step6_generate_final_timeline.py @@ -1,7 +1,10 @@ import pandas as pd -import os +import os, sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from difflib import SequenceMatcher import re +from config import get_joiner, WHISPER_LANGUAGE +from core.step2_whisper_stamped import get_whisper_language def convert_to_srt_format(start_time, end_time): """Convert time (in seconds) to the format: hours:minutes:seconds,milliseconds""" @@ -22,7 +25,9 @@ def remove_punctuation(text): def get_sentence_timestamps(df_words, df_sentences): time_stamp_list = [] word_index = 0 - + language = get_whisper_language() if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE + joiner = get_joiner(language) + for idx,sentence in df_sentences['Source'].items(): sentence = remove_punctuation(sentence.lower()) best_match = {'score': 0, 'start': 0, 'end': 0, 'word_count': 0} @@ -33,15 +38,14 @@ def get_sentence_timestamps(df_words, df_sentences): while word_index < len(df_words): word = remove_punctuation(df_words['text'][word_index].lower()) - #! 去掉空格, 这样是为了支持中文和日文这样的不用空格隔开的语言 - sentence = sentence.replace(" ", "") - current_phrase += word + '' + #! user joiner to join the sentence + current_phrase += word + joiner similarity = SequenceMatcher(None, sentence, current_phrase.strip()).ratio() if similarity > best_match['score']: best_match = { 'score': similarity, - 'start': df_words['start'][start_index], # 使用start_index + 'start': df_words['start'][start_index], 'end': df_words['end'][word_index], 'word_count': word_index - start_index + 1, 'phrase': current_phrase From be54d2fa3afb34c82d31efdc774be990f7663692 Mon Sep 17 00:00:00 2001 From: HUANYU XU Date: Fri, 6 Sep 2024 00:21:04 +0800 Subject: [PATCH 3/5] fix fix bug --- core/spacy_utils/split_by_mark.py | 2 +- core/step3_2_splitbymeaning.py | 2 +- core/step6_generate_final_timeline.py | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/core/spacy_utils/split_by_mark.py b/core/spacy_utils/split_by_mark.py index 0fc9b841..c56965e1 100644 --- a/core/spacy_utils/split_by_mark.py +++ b/core/spacy_utils/split_by_mark.py @@ -10,7 +10,7 @@ def split_by_mark(): language = get_whisper_language() if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE # 考虑强制英文的情况 joiner = get_joiner(language) - print(f"🔍 正在使用 {language} 语言的拼接方式: '{joiner}'") s + print(f"🔍 正在使用 {language} 语言的拼接方式: '{joiner}'") nlp = init_nlp() chunks = pd.read_excel("output/log/cleaned_chunks.xlsx") chunks.text = chunks.text.apply(lambda x: x.strip('"')) diff --git a/core/step3_2_splitbymeaning.py b/core/step3_2_splitbymeaning.py index 52bed971..cf6b7b9e 100644 --- a/core/step3_2_splitbymeaning.py +++ b/core/step3_2_splitbymeaning.py @@ -44,7 +44,7 @@ def find_split_positions(original, modified): print(f"警告:无法为第 {i+1} 部分找到合适的分割点。") return split_positions -o + def split_sentence(sentence, num_parts, word_limit=18, index=-1, retry_attempt=0): """Split a long sentence using GPT and return the result as a string.""" split_prompt = get_split_prompt(sentence, num_parts, word_limit) diff --git a/core/step6_generate_final_timeline.py b/core/step6_generate_final_timeline.py index de3f3a33..ea5b81fa 100644 --- a/core/step6_generate_final_timeline.py +++ b/core/step6_generate_final_timeline.py @@ -53,20 +53,20 @@ def get_sentence_timestamps(df_words, df_sentences): decreasing_count = 0 else: decreasing_count += 1 - # 如果连续 3 个词都没有匹配,则跳出循环 - if decreasing_count >= 3: + # 如果连续 5 个词都没有匹配,则跳出循环 + if decreasing_count >= 5: break word_index += 1 - if best_match['score'] <0.9: - print("原句:", sentence) - print("匹配:", best_match['phrase']) - print("相似度:{:.2f}".format(best_match['score'])) - print("-" * 50) + if best_match['score'] > 0.9: time_stamp_list.append((float(best_match['start']), float(best_match['end']))) word_index = start_index + best_match['word_count'] # 更新word_index到下一个句子的开始 else: print(f"警告:无法为句子找到匹配: {sentence}") + print("原句:", sentence) + print("匹配:", best_match['phrase']) + print("相似度:{:.2f}".format(best_match['score'])) + print("-" * 50) start_index = word_index # 为下一个句子更新start_index From 769defa6084c8fb0f3983329944548afc0050c44 Mon Sep 17 00:00:00 2001 From: HUANYU XU Date: Fri, 6 Sep 2024 08:17:14 +0800 Subject: [PATCH 4/5] optimize --- core/_step2_whisperapi.py | 1 - core/step4_2_translate_all.py | 2 +- core/{step4_2_translate_once.py => translate_once.py} | 0 3 files changed, 1 insertion(+), 2 deletions(-) rename core/{step4_2_translate_once.py => translate_once.py} (100%) diff --git a/core/_step2_whisperapi.py b/core/_step2_whisperapi.py index 5568bb23..82183b23 100644 --- a/core/_step2_whisperapi.py +++ b/core/_step2_whisperapi.py @@ -11,7 +11,6 @@ def convert_video_to_audio(input_file: str): # 🎬➡️🎵 Convert video to audio - # audio_file = os.path.splitext(input_file)[0] + '_temp.mp3' os.makedirs('output/audio', exist_ok=True) audio_file = 'output/audio/raw_full_audio.wav' diff --git a/core/step4_2_translate_all.py b/core/step4_2_translate_all.py index fb87420f..9a0eb9a1 100644 --- a/core/step4_2_translate_all.py +++ b/core/step4_2_translate_all.py @@ -1,9 +1,9 @@ import sys, os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from core.step4_2_translate_once import translate_lines import pandas as pd import json import concurrent.futures +from core.translate_once import translate_lines from core.step4_1_summarize import search_things_to_note_in_prompt # Function to split text into chunks diff --git a/core/step4_2_translate_once.py b/core/translate_once.py similarity index 100% rename from core/step4_2_translate_once.py rename to core/translate_once.py From 6d2b198fcac9b84a444dea582832bf5b47abd687 Mon Sep 17 00:00:00 2001 From: HUANYU XU Date: Fri, 6 Sep 2024 09:33:50 +0800 Subject: [PATCH 5/5] update --- config.example.py | 15 ++++++++++++--- core/step2_whisper_stamped.py | 2 +- st_components/sidebar_setting.py | 19 +++++++++++++------ 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/config.example.py b/config.example.py index d6d56863..114cb3ee 100644 --- a/config.example.py +++ b/config.example.py @@ -23,16 +23,17 @@ # 视频分辨率 RESOLUTIOM = '854x480' +# 指定Whisper模型,对于英文视频 medium 足够,对于亚洲语言必须使用 large-v2,v0.4 进行精细识别,所需时间非常长,遇到问题请反馈谢谢~ +WHISPER_MODEL = 'large-v2' ## ======================== 进阶设置设置 ======================== ## # Whisper 指定识别语言 WHISPER_LANGUAGE = 'auto' -WHISPER_MODEL = 'large-v2' # 支持视频格式 ALLOWED_VIDEO_FORMATS = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm'] # gpt多线程数量 -MAX_WORKERS = 5 +MAX_WORKERS = 6 # 每一步的 LLM 模型选择,其中 3_2 和 5 只建议 sonnet,换模型会不稳定报错 step3_2_split_model = MODEL[0] @@ -94,4 +95,12 @@ } LANGUAGE_SPLIT_WITH_SPACE = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'el', 'ru', 'ar', 'hi', 'pl', 'uk', 'vi', 'tr', 'ro', 'da', 'fi', 'hu', 'nb', 'sv'] -LANGUAGE_SPLIT_WITHOUT_SPACE = ['zh', 'ja', 'th', 'ko'] \ No newline at end of file +LANGUAGE_SPLIT_WITHOUT_SPACE = ['zh', 'ja', 'th', 'ko'] + +def get_joiner(language): + if language in LANGUAGE_SPLIT_WITH_SPACE: + return " " + elif language in LANGUAGE_SPLIT_WITHOUT_SPACE: + return "" + else: + raise ValueError(f"不支持的语言代码: {language}") \ No newline at end of file diff --git a/core/step2_whisper_stamped.py b/core/step2_whisper_stamped.py index ded9aaf9..fc88e3a5 100644 --- a/core/step2_whisper_stamped.py +++ b/core/step2_whisper_stamped.py @@ -44,7 +44,7 @@ def convert_video_to_audio_and_transcribe(input_file: str): os.makedirs(MODEL_DIR, exist_ok=True) model = whisper.load_model(WHISPER_MODEL, device=device, download_root=MODEL_DIR) - transcribe_params = {'model': model, 'audio': audio, 'beam_size': 5, 'best_of': 5, 'temperature': (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)} + transcribe_params = {'model': model, 'audio': audio, 'beam_size': 3, 'best_of': 3, 'temperature': (0.0, 0.4, 0.8)} if WHISPER_LANGUAGE != 'auto': transcribe_params['language'] = WHISPER_LANGUAGE result = whisper.transcribe(**transcribe_params) diff --git a/st_components/sidebar_setting.py b/st_components/sidebar_setting.py index b8377c7a..cbf47315 100644 --- a/st_components/sidebar_setting.py +++ b/st_components/sidebar_setting.py @@ -36,15 +36,22 @@ def page_setting(): st.header("字幕设置") lang_cols = st.columns(2) - with lang_cols[1]: - target_language = st.text_input("翻译目标语言:", value=config.TARGET_LANGUAGE) - if target_language != config.TARGET_LANGUAGE: - changes["TARGET_LANGUAGE"] = target_language with lang_cols[0]: + whisper_model_options = ["medium", "large-v2"] + selected_whisper_model = st.selectbox("Whisper模型:", options=whisper_model_options, index=whisper_model_options.index(config.WHISPER_MODEL) if config.WHISPER_MODEL in whisper_model_options else 0, help="对于英文视频 medium 足够,对于亚洲语言必须使用 large-v2,v0.4 进行精细识别,所需时间非常长,遇到问题请反馈谢谢~") + if selected_whisper_model != config.WHISPER_MODEL: + changes["WHISPER_MODEL"] = selected_whisper_model + with lang_cols[1]: whisper_language_options = ["auto", "en"] selected_whisper_language = st.selectbox("Whisper识别语言:", options=whisper_language_options, index=whisper_language_options.index(config.WHISPER_LANGUAGE) if config.WHISPER_LANGUAGE in whisper_language_options else 0) if selected_whisper_language != config.WHISPER_LANGUAGE: changes["WHISPER_LANGUAGE"] = selected_whisper_language + + + target_language = st.text_input("翻译目标语言:", value=config.TARGET_LANGUAGE) + if target_language != config.TARGET_LANGUAGE: + changes["TARGET_LANGUAGE"] = target_language + st.write("每行字幕最大字符数:") max_length_cols = st.columns(2) with max_length_cols[0]: @@ -65,7 +72,6 @@ def page_setting(): if resolution != config.RESOLUTIOM: changes["RESOLUTIOM"] = resolution - #! 配音功能仍在开发中,暂已停用,感谢理解! # st.header("SoVITS 角色配置") # dubbing_character = st.text_input("配音角色:", value=config.DUBBNING_CHARACTER) @@ -93,4 +99,5 @@ def page_setting(): else: st.toast("验证失败, 请检查 API_KEY 和 BASE_URL 是否正确", icon="❌") except Exception as e: - st.toast(f"访问失败 {e}", icon="❌") \ No newline at end of file + st.toast(f"访问失败 {e}", icon="❌") + st.warning("当前版本为 v0.4.0, whisper 本地识别非常耗时,请耐心等待...遇到问题可在Q群反馈") \ No newline at end of file