Merge pull request #41 from Huanshere/dev_v4
DEV v0.4.0
Huanshere authored Sep 6, 2024
2 parents adc8e00 + 6d2b198 commit 3832018
Showing 15 changed files with 210 additions and 86 deletions.
47 changes: 45 additions & 2 deletions config.example.py
@@ -23,6 +23,8 @@
# Video resolution
RESOLUTION = '854x480'

# Whisper model: 'medium' is enough for English videos, but Asian languages need 'large-v2' (v0.4) for fine-grained recognition. This takes a very long time; please report any issues, thanks!
WHISPER_MODEL = 'large-v2'
## ======================== Advanced Settings ======================== ##
# Force a specific recognition language for Whisper
WHISPER_LANGUAGE = 'auto'
@@ -31,7 +33,7 @@
ALLOWED_VIDEO_FORMATS = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm']

# Number of GPT worker threads
MAX_WORKERS = 5
MAX_WORKERS = 6

# LLM model for each step; for steps 3_2 and 5 only sonnet is recommended, switching models causes unstable errors
step3_2_split_model = MODEL[0]
@@ -60,4 +62,45 @@
ORIGINAL_VOLUME = 0.1

# Word count for the first rough split; below 18 the pieces are too fragmented and hurt translation, above 22 they are too long and make later subtitle alignment difficult
MAX_SPLIT_LENGTH = 20
MAX_SPLIT_LENGTH = 20

## ======================== Language Models ======================== ##
# spaCy model map
SPACY_MODEL_MAP = {
"en": "en_core_web_sm",
"zh": "zh_core_web_sm",
"es": "es_core_news_sm",
"fr": "fr_core_news_sm",
"de": "de_core_news_sm",
"it": "it_core_news_sm",
"ja": "ja_core_news_sm",
"pt": "pt_core_news_sm",
"nl": "nl_core_news_sm",
"el": "el_core_news_sm",
"ru": "ru_core_news_sm",
"ar": "ar_core_news_sm",
"hi": "hi_core_news_sm",
"ko": "ko_core_news_sm",
"pl": "pl_core_news_sm",
"uk": "uk_core_news_sm",
"vi": "vi_core_news_sm",
"tr": "tr_core_news_sm",
"th": "th_core_news_sm",
"ro": "ro_core_news_sm",
"da": "da_core_news_sm",
"fi": "fi_core_news_sm",
"hu": "hu_core_news_sm",
"nb": "nb_core_news_sm",
"sv": "sv_core_news_sm"
}

LANGUAGE_SPLIT_WITH_SPACE = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'el', 'ru', 'ar', 'hi', 'pl', 'uk', 'vi', 'tr', 'ro', 'da', 'fi', 'hu', 'nb', 'sv']
LANGUAGE_SPLIT_WITHOUT_SPACE = ['zh', 'ja', 'th', 'ko']

def get_joiner(language):
if language in LANGUAGE_SPLIT_WITH_SPACE:
return " "
elif language in LANGUAGE_SPLIT_WITHOUT_SPACE:
return ""
else:
raise ValueError(f"Unsupported language code: {language}")
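
The new get_joiner helper decides how word chunks are concatenated for a given language. A minimal usage sketch (the chunk lists below are made up for illustration):

from config import get_joiner

chunks_en = ["Hello", "world"]            # space-delimited language
chunks_zh = ["你好", "世界"]               # no-space language
print(get_joiner("en").join(chunks_en))   # -> Hello world
print(get_joiner("zh").join(chunks_zh))   # -> 你好世界
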
1 change: 0 additions & 1 deletion core/step2_whisperapi.py → core/_step2_whisperapi.py
@@ -11,7 +11,6 @@

def convert_video_to_audio(input_file: str):
# 🎬➡️🎵 Convert video to audio
# audio_file = os.path.splitext(input_file)[0] + '_temp.mp3'
os.makedirs('output/audio', exist_ok=True)
audio_file = 'output/audio/raw_full_audio.wav'

2 changes: 1 addition & 1 deletion core/prompts_storage.py
@@ -1,6 +1,6 @@
import os,sys,json
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from core.step2_whisperapi import get_whisper_language
from core.step2_whisper_stamped import get_whisper_language
## ================================================================
# @ step4_splitbymeaning.py
def get_split_prompt(sentence, num_parts = 2, word_limit = 20):
40 changes: 5 additions & 35 deletions core/spacy_utils/load_nlp_model.py
@@ -2,49 +2,19 @@
import spacy
from spacy.cli import download
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from core.step2_whisperapi import get_whisper_language
from core.step2_whisper_stamped import get_whisper_language
from config import SPACY_MODEL_MAP

def get_spacy_model(language: str):
language_map = {
"english": "en_core_web_sm",
"chinese": "zh_core_web_sm",
"spanish": "es_core_news_sm",
"french": "fr_core_news_sm",
"german": "de_core_news_sm",
"italian": "it_core_news_sm",
"japanese": "ja_core_news_sm",
"portuguese": "pt_core_news_sm",
"dutch": "nl_core_news_sm",
"greek": "el_core_news_sm",
"russian": "ru_core_news_sm",
"arabic": "ar_core_news_sm",
"hindi": "hi_core_news_sm",
"korean": "ko_core_news_sm",
"polish": "pl_core_news_sm",
"ukrainian": "uk_core_news_sm",
"vietnamese": "vi_core_news_sm",
"turkish": "tr_core_news_sm",
"thai": "th_core_news_sm",
"romanian": "ro_core_news_sm",
"danish": "da_core_news_sm",
"finnish": "fi_core_news_sm",
"hungarian": "hu_core_news_sm",
"norwegian": "nb_core_news_sm",
"swedish": "sv_core_news_sm"
}

model = language_map.get(language.lower(), "en_core_web_sm")
if language not in language_map:
model = SPACY_MODEL_MAP.get(language.lower(), "en_core_web_sm")
if language not in SPACY_MODEL_MAP:
print(f"Spacy 模型不支持'{language}',使用 en_core_web_sm 模型作为后备选项...")
return model

def init_nlp():
try:
from config import WHISPER_LANGUAGE
if WHISPER_LANGUAGE == "en":
language = "english"
else:
language = get_whisper_language()
language = "en" if WHISPER_LANGUAGE == "en" else get_whisper_language()
model = get_spacy_model(language)
print(f"⏳ 正在加载 NLP Spacy 模型: <{model}> ...")
try:
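
This refactor swaps the inline language table for the shared config.SPACY_MODEL_MAP, so model selection becomes a plain dict lookup with an English fallback. A rough sketch of the equivalent load-with-download pattern, assuming the standard spaCy API (the function name is hypothetical):

import spacy
from spacy.cli import download
from config import SPACY_MODEL_MAP

def load_spacy_with_fallback(language: str):
    # Unknown language codes fall back to the small English model
    model_name = SPACY_MODEL_MAP.get(language.lower(), "en_core_web_sm")
    try:
        return spacy.load(model_name)
    except OSError:  # model not downloaded yet
        download(model_name)
        return spacy.load(model_name)
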
2 changes: 1 addition & 1 deletion core/spacy_utils/split_by_connector.py
@@ -13,7 +13,7 @@ def analyze_connectors(doc, token):
2. For 'that', check if it's part of a contraction (e.g., that's, that'll).
3. For all connectors, check if they function as a 'mark' dependent of a verb.
4. For 'which', 'where', 'when', check if they function as determiners or pronouns
for nouns or proper nouns.
for nouns or proper nouns.
5. Default to splitting for 'which', 'where', 'when' if no other conditions are met.
6. For 'and', 'or', 'but', check if they connect two independent clauses.
"""
17 changes: 14 additions & 3 deletions core/spacy_utils/split_by_mark.py
@@ -1,12 +1,23 @@
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import os,sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from load_nlp_model import init_nlp
import pandas as pd
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from core.spacy_utils.load_nlp_model import init_nlp
from core.step2_whisper_stamped import get_whisper_language
from config import get_joiner, WHISPER_LANGUAGE

def split_by_mark():
language = get_whisper_language() if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE # handle the case where English is forced
joiner = get_joiner(language)
print(f"🔍 正在使用 {language} 语言的拼接方式: '{joiner}'")
nlp = init_nlp()
input_text = open("output/log/raw_transcript.txt", "r", encoding="utf-8").read()
chunks = pd.read_excel("output/log/cleaned_chunks.xlsx")
chunks.text = chunks.text.apply(lambda x: x.strip('"'))

# Join the chunks with the language-specific joiner
input_text = joiner.join(chunks.text.to_list())

doc = nlp(input_text)
assert doc.has_annotation("SENT_START")

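
split_by_mark now rebuilds the transcript by joining the cleaned word chunks with the language-appropriate joiner instead of reading raw_transcript.txt. A condensed sketch of that flow (paths and column names as in the diff; the wrapper function is illustrative):

import pandas as pd
from core.spacy_utils.load_nlp_model import init_nlp
from config import get_joiner

def sentences_from_chunks(language: str):
    chunks = pd.read_excel("output/log/cleaned_chunks.xlsx")
    chunks.text = chunks.text.apply(lambda x: x.strip('"'))  # drop the protective quotes
    text = get_joiner(language).join(chunks.text.to_list())
    doc = init_nlp()(text)
    return [sent.text for sent in doc.sents]  # sentence-level splits
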
99 changes: 99 additions & 0 deletions core/step2_whisper_stamped.py
@@ -0,0 +1,99 @@
import os,sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import subprocess
import whisper_timestamped as whisper
import torch
import pandas as pd
from typing import List, Dict
import warnings
warnings.filterwarnings("ignore")
import json

def convert_video_to_audio_and_transcribe(input_file: str):
from config import WHISPER_MODEL, MODEL_DIR, WHISPER_LANGUAGE
# 🎬➡️🎵➡️📊 Convert video to audio and transcribe
os.makedirs('output/audio', exist_ok=True)
audio_file = 'output/audio/raw_full_audio.wav'

if not os.path.exists(audio_file):
# Convert video to audio
ffmpeg_cmd = [
'ffmpeg',
'-i', input_file,
'-vn',
'-acodec', 'libmp3lame',
'-ar', '16000',
'-b:a', '64k',
audio_file
]
print(f"🎬➡️🎵 正在转换为音频......")
subprocess.run(ffmpeg_cmd, check=True, stderr=subprocess.PIPE)
print(f"🎬➡️🎵 已将 <{input_file}> 转换为 <{audio_file}>\n")

# Check file size
if os.path.getsize(audio_file) > 25 * 1024 * 1024:
print("⚠️ 文件大小超过25MB。请使用更小的文件。")
return None

# Transcribe audio
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f"🚀 正在启动Whisper...\n🖥️ ASR设备: {device}")
print("此步骤会花费很长时间,尤其会在100%后仍然处理很长时间...")

audio = whisper.load_audio(audio_file)
os.makedirs(MODEL_DIR, exist_ok=True)
model = whisper.load_model(WHISPER_MODEL, device=device, download_root=MODEL_DIR)

transcribe_params = {'model': model, 'audio': audio, 'beam_size': 3, 'best_of': 3, 'temperature': (0.0, 0.4, 0.8)}
if WHISPER_LANGUAGE != 'auto':
transcribe_params['language'] = WHISPER_LANGUAGE
result = whisper.transcribe(**transcribe_params)

os.makedirs('output/log', exist_ok=True)
with open('output/log/transcript_language.json', 'w', encoding='utf-8') as f:
json.dump({"language": result['language']}, f, ensure_ascii=False, indent=4)
print(f"📝 已将识别到的语言保存到 output/log/transcript_language.json")

# Process transcription results
all_words: List[Dict[str, float]] = [
{'text': f"{word['text']}", 'start': word['start'], 'end': word['end']}
for segment in result['segments']
for word in segment['words']
]

df = pd.DataFrame(all_words)
return df

def save_results(df: pd.DataFrame):
# 💾 Save transcription results as Excel and text files
os.makedirs('output', exist_ok=True)
os.makedirs('output/log', exist_ok=True)
excel_path = os.path.join('output/log', "cleaned_chunks.xlsx")
# Quote every df['text'] value so Excel does not coerce numeric-looking strings into numbers
df['text'] = df['text'].apply(lambda x: f'"{x}"')
df.to_excel(excel_path, index=False)
print(f"📊 Excel文件已保存到 {excel_path}")

def get_whisper_language():
try:
with open("output/log/transcript_language.json", "r", encoding='utf-8') as f:
language = json.load(f)["language"]
return language
except Exception:
print("Could not read language info")
return None

def transcribe(video_file: str):
if not os.path.exists("output/log/cleaned_chunks.xlsx"):
# 🎥➡️📝 Transcribe video to text
df = convert_video_to_audio_and_transcribe(video_file)
if df is not None:
save_results(df)
else:
print("📊 转录结果已存在,跳过转录步骤。")

if __name__ == "__main__":
from core.step1_ytdlp import find_video_files
video_file = find_video_files()
print(f"🎬 找到的视频文件: {video_file}, 开始转录...")
transcribe(video_file)
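
The key reshaping in this new file flattens whisper_timestamped's nested segments -> words structure into one row per word. A self-contained sketch on a hand-made result dict (the data is fabricated for illustration):

import pandas as pd

result = {
    "language": "en",
    "segments": [
        {"words": [{"text": "Hello", "start": 0.0, "end": 0.4},
                   {"text": "world", "start": 0.5, "end": 0.9}]},
    ],
}
all_words = [
    {"text": w["text"], "start": w["start"], "end": w["end"]}
    for seg in result["segments"]
    for w in seg["words"]
]
df = pd.DataFrame(all_words)  # one row per word with start/end timestamps
print(df)
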
29 changes: 9 additions & 20 deletions core/step3_2_splitbymeaning.py
@@ -6,6 +6,8 @@
from difflib import SequenceMatcher
import math
from core.spacy_utils.load_nlp_model import init_nlp
from config import get_joiner, WHISPER_LANGUAGE
from core.step2_whisper_stamped import get_whisper_language

def tokenize_sentence(sentence, nlp):
# Tokenizer: count the number of words in the sentence
@@ -16,17 +18,16 @@ def find_split_positions(original, modified):
split_positions = []
parts = modified.split('[br]')
start = 0

# Remove all spaces from the original sentence
original_no_space = ''.join(original.split())
language = get_whisper_language() if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE
joiner = get_joiner(language)

for i in range(len(parts) - 1):
max_similarity = 0
best_split = None

for j in range(start, len(original_no_space)):
original_left = original_no_space[start:j]
modified_left = ''.join(parts[i].split()) # remove spaces from the modified part
for j in range(start, len(original)):
original_left = original[start:j]
modified_left = joiner.join(parts[i].split())

left_similarity = SequenceMatcher(None, original_left, modified_left).ratio()

Expand All @@ -37,25 +38,13 @@ def find_split_positions(original, modified):
if max_similarity < 0.9:
print(f"警告:找到的最佳分割点相似度较低 {max_similarity}")
if best_split is not None:
# Map the split point in the no-space version back to the original sentence
original_split = map_no_space_to_original(original, best_split)
split_positions.append(original_split)
start = best_split + 1
split_positions.append(best_split)
start = best_split
else:
print(f"警告:无法为第 {i+1} 部分找到合适的分割点。")

return split_positions

def map_no_space_to_original(original, no_space_index):
# Map an index in the no-space version back to the original sentence
space_count = 0
for i, char in enumerate(original):
if char.isspace():
space_count += 1
elif i - space_count == no_space_index:
return i
return len(original) # if no match is found, return the length of the original sentence

def split_sentence(sentence, num_parts, word_limit=18, index=-1, retry_attempt=0):
"""Split a long sentence using GPT and return the result as a string."""
split_prompt = get_split_prompt(sentence, num_parts, word_limit)
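
The rewritten find_split_positions scans the original string directly and scores candidate cut points with difflib.SequenceMatcher, which is what made the old map_no_space_to_original helper unnecessary. The matching idea in isolation (the strings are made up):

from difflib import SequenceMatcher

original = "the quick brown fox jumps"
left_part = "the quick brown"  # text before a [br] marker
best_split, best_ratio = None, 0.0
for j in range(len(original) + 1):
    ratio = SequenceMatcher(None, original[:j], left_part).ratio()
    if ratio > best_ratio:
        best_ratio, best_split = ratio, j
print(best_split, best_ratio)  # highest-similarity cut index in the original
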
2 changes: 1 addition & 1 deletion core/step4_2_translate_all.py
@@ -1,9 +1,9 @@
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from core.step4_2_translate_once import translate_lines
import pandas as pd
import json
import concurrent.futures
from core.translate_once import translate_lines
from core.step4_1_summarize import search_things_to_note_in_prompt

# Function to split text into chunks
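
This file fans translate_lines out over text chunks with a thread pool sized by the MAX_WORKERS setting raised above. A hedged sketch of that pattern (translate_chunk is a stand-in, not the project's real call):

import concurrent.futures

MAX_WORKERS = 6  # mirrors the config value

def translate_chunk(chunk: str) -> str:
    return chunk.upper()  # stand-in for the real LLM translation

chunks = ["first chunk", "second chunk", "third chunk"]
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
    results = list(pool.map(translate_chunk, chunks))  # preserves input order
print(results)
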
