Merge pull request #41 from Huanshere/dev_v4
DEV v0.4.0
Huanshere authored Sep 6, 2024
2 parents adc8e00 + 6d2b198 commit 3832018
Showing 15 changed files with 210 additions and 86 deletions.
47 changes: 45 additions & 2 deletions config.example.py
@@ -23,6 +23,8 @@
# Video resolution
RESOLUTION = '854x480'

# Whisper model: 'medium' is enough for English videos, but Asian languages need 'large-v2' (v0.4) for fine-grained recognition. This takes a very long time; please report any issues, thanks!
WHISPER_MODEL = 'large-v2'
## ======================== Advanced Settings ======================== ##
# Force a specific recognition language for Whisper
WHISPER_LANGUAGE = 'auto'
@@ -31,7 +33,7 @@
ALLOWED_VIDEO_FORMATS = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm']

# Number of GPT worker threads
MAX_WORKERS = 5
MAX_WORKERS = 6

# LLM model for each step; for steps 3_2 and 5 only sonnet is recommended, switching models causes unstable errors
step3_2_split_model = MODEL[0]
@@ -60,4 +62,45 @@
ORIGINAL_VOLUME = 0.1

# Word count for the first rough split; below 18 the pieces are too fragmented and hurt translation, above 22 they are too long and make later subtitle alignment difficult
MAX_SPLIT_LENGTH = 20
MAX_SPLIT_LENGTH = 20

## ======================== Language Models ======================== ##
# spaCy model map
SPACY_MODEL_MAP = {
"en": "en_core_web_sm",
"zh": "zh_core_web_sm",
"es": "es_core_news_sm",
"fr": "fr_core_news_sm",
"de": "de_core_news_sm",
"it": "it_core_news_sm",
"ja": "ja_core_news_sm",
"pt": "pt_core_news_sm",
"nl": "nl_core_news_sm",
"el": "el_core_news_sm",
"ru": "ru_core_news_sm",
"ar": "ar_core_news_sm",
"hi": "hi_core_news_sm",
"ko": "ko_core_news_sm",
"pl": "pl_core_news_sm",
"uk": "uk_core_news_sm",
"vi": "vi_core_news_sm",
"tr": "tr_core_news_sm",
"th": "th_core_news_sm",
"ro": "ro_core_news_sm",
"da": "da_core_news_sm",
"fi": "fi_core_news_sm",
"hu": "hu_core_news_sm",
"nb": "nb_core_news_sm",
"sv": "sv_core_news_sm"
}

LANGUAGE_SPLIT_WITH_SPACE = ['en', 'es', 'fr', 'de', 'it', 'pt', 'nl', 'el', 'ru', 'ar', 'hi', 'pl', 'uk', 'vi', 'tr', 'ro', 'da', 'fi', 'hu', 'nb', 'sv']
LANGUAGE_SPLIT_WITHOUT_SPACE = ['zh', 'ja', 'th', 'ko']

def get_joiner(language):
if language in LANGUAGE_SPLIT_WITH_SPACE:
return " "
elif language in LANGUAGE_SPLIT_WITHOUT_SPACE:
return ""
else:
raise ValueError(f"Unsupported language code: {language}")
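
The new get_joiner helper decides how word chunks are concatenated for a given language. A minimal usage sketch (the chunk lists below are made up for illustration):

from config import get_joiner

chunks_en = ["Hello", "world"]            # space-delimited language
chunks_zh = ["你好", "世界"]               # no-space language
print(get_joiner("en").join(chunks_en))   # -> Hello world
print(get_joiner("zh").join(chunks_zh))   # -> 你好世界
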
1 change: 0 additions & 1 deletion core/step2_whisperapi.py → core/_step2_whisperapi.py
@@ -11,7 +11,6 @@

def convert_video_to_audio(input_file: str):
# 🎬➡️🎵 Convert video to audio
# audio_file = os.path.splitext(input_file)[0] + '_temp.mp3'
os.makedirs('output/audio', exist_ok=True)
audio_file = 'output/audio/raw_full_audio.wav'

2 changes: 1 addition & 1 deletion core/prompts_storage.py
@@ -1,6 +1,6 @@
import os,sys,json
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from core.step2_whisperapi import get_whisper_language
from core.step2_whisper_stamped import get_whisper_language
## ================================================================
# @ step4_splitbymeaning.py
def get_split_prompt(sentence, num_parts = 2, word_limit = 20):
40 changes: 5 additions & 35 deletions core/spacy_utils/load_nlp_model.py
@@ -2,49 +2,19 @@
import spacy
from spacy.cli import download
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from core.step2_whisperapi import get_whisper_language
from core.step2_whisper_stamped import get_whisper_language
from config import SPACY_MODEL_MAP

def get_spacy_model(language: str):
language_map = {
"english": "en_core_web_sm",
"chinese": "zh_core_web_sm",
"spanish": "es_core_news_sm",
"french": "fr_core_news_sm",
"german": "de_core_news_sm",
"italian": "it_core_news_sm",
"japanese": "ja_core_news_sm",
"portuguese": "pt_core_news_sm",
"dutch": "nl_core_news_sm",
"greek": "el_core_news_sm",
"russian": "ru_core_news_sm",
"arabic": "ar_core_news_sm",
"hindi": "hi_core_news_sm",
"korean": "ko_core_news_sm",
"polish": "pl_core_news_sm",
"ukrainian": "uk_core_news_sm",
"vietnamese": "vi_core_news_sm",
"turkish": "tr_core_news_sm",
"thai": "th_core_news_sm",
"romanian": "ro_core_news_sm",
"danish": "da_core_news_sm",
"finnish": "fi_core_news_sm",
"hungarian": "hu_core_news_sm",
"norwegian": "nb_core_news_sm",
"swedish": "sv_core_news_sm"
}

model = language_map.get(language.lower(), "en_core_web_sm")
if language not in language_map:
model = SPACY_MODEL_MAP.get(language.lower(), "en_core_web_sm")
if language not in SPACY_MODEL_MAP:
print(f"Spacy 模型不支持'{language}',使用 en_core_web_sm 模型作为后备选项...")
return model

def init_nlp():
try:
from config import WHISPER_LANGUAGE
if WHISPER_LANGUAGE == "en":
language = "english"
else:
language = get_whisper_language()
language = "en" if WHISPER_LANGUAGE == "en" else get_whisper_language()
model = get_spacy_model(language)
print(f"⏳ 正在加载 NLP Spacy 模型: <{model}> ...")
try:
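
This refactor swaps the inline language table for the shared config.SPACY_MODEL_MAP, so model selection becomes a plain dict lookup with an English fallback. A rough sketch of the equivalent load-with-download pattern, assuming the standard spaCy API (the function name is hypothetical):

import spacy
from spacy.cli import download
from config import SPACY_MODEL_MAP

def load_spacy_with_fallback(language: str):
    # Unknown language codes fall back to the small English model
    model_name = SPACY_MODEL_MAP.get(language.lower(), "en_core_web_sm")
    try:
        return spacy.load(model_name)
    except OSError:  # model not downloaded yet
        download(model_name)
        return spacy.load(model_name)
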
2 changes: 1 addition & 1 deletion core/spacy_utils/split_by_connector.py
@@ -13,7 +13,7 @@ def analyze_connectors(doc, token):
2. For 'that', check if it's part of a contraction (e.g., that's, that'll).
3. For all connectors, check if they function as a 'mark' dependent of a verb.
4. For 'which', 'where', 'when', check if they function as determiners or pronouns
for nouns or proper nouns.
for nouns or proper nouns.
5. Default to splitting for 'which', 'where', 'when' if no other conditions are met.
6. For 'and', 'or', 'but', check if they connect two independent clauses.
"""
17 changes: 14 additions & 3 deletions core/spacy_utils/split_by_mark.py
@@ -1,12 +1,23 @@
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import os,sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from load_nlp_model import init_nlp
import pandas as pd
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from core.spacy_utils.load_nlp_model import init_nlp
from core.step2_whisper_stamped import get_whisper_language
from config import get_joiner, WHISPER_LANGUAGE

def split_by_mark():
language = get_whisper_language() if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE # handle the case where English is forced
joiner = get_joiner(language)
print(f"🔍 正在使用 {language} 语言的拼接方式: '{joiner}'")
nlp = init_nlp()
input_text = open("output/log/raw_transcript.txt", "r", encoding="utf-8").read()
chunks = pd.read_excel("output/log/cleaned_chunks.xlsx")
chunks.text = chunks.text.apply(lambda x: x.strip('"'))

# Join the chunks with the language-specific joiner
input_text = joiner.join(chunks.text.to_list())

doc = nlp(input_text)
assert doc.has_annotation("SENT_START")

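
split_by_mark now rebuilds the transcript by joining the cleaned word chunks with the language-appropriate joiner instead of reading raw_transcript.txt. A condensed sketch of that flow (paths and column names as in the diff; the wrapper function is illustrative):

import pandas as pd
from core.spacy_utils.load_nlp_model import init_nlp
from config import get_joiner

def sentences_from_chunks(language: str):
    chunks = pd.read_excel("output/log/cleaned_chunks.xlsx")
    chunks.text = chunks.text.apply(lambda x: x.strip('"'))  # drop the protective quotes
    text = get_joiner(language).join(chunks.text.to_list())
    doc = init_nlp()(text)
    return [sent.text for sent in doc.sents]  # sentence-level splits
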
99 changes: 99 additions & 0 deletions core/step2_whisper_stamped.py
@@ -0,0 +1,99 @@
import os,sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import subprocess
import whisper_timestamped as whisper
import torch
import pandas as pd
from typing import List, Dict
import warnings
warnings.filterwarnings("ignore")
import json

def convert_video_to_audio_and_transcribe(input_file: str):
from config import WHISPER_MODEL, MODEL_DIR, WHISPER_LANGUAGE
# 🎬➡️🎵➡️📊 Convert video to audio and transcribe
os.makedirs('output/audio', exist_ok=True)
audio_file = 'output/audio/raw_full_audio.wav'

if not os.path.exists(audio_file):
# Convert video to audio
ffmpeg_cmd = [
'ffmpeg',
'-i', input_file,
'-vn',
'-acodec', 'libmp3lame',
'-ar', '16000',
'-b:a', '64k',
audio_file
]
print(f"🎬➡️🎵 正在转换为音频......")
subprocess.run(ffmpeg_cmd, check=True, stderr=subprocess.PIPE)
print(f"🎬➡️🎵 已将 <{input_file}> 转换为 <{audio_file}>\n")

# Check file size
if os.path.getsize(audio_file) > 25 * 1024 * 1024:
print("⚠️ 文件大小超过25MB。请使用更小的文件。")
return None

# Transcribe audio
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f"🚀 正在启动Whisper...\n🖥️ ASR设备: {device}")
print("此步骤会花费很长时间,尤其会在100%后仍然处理很长时间...")

audio = whisper.load_audio(audio_file)
os.makedirs(MODEL_DIR, exist_ok=True)
model = whisper.load_model(WHISPER_MODEL, device=device, download_root=MODEL_DIR)

transcribe_params = {'model': model, 'audio': audio, 'beam_size': 3, 'best_of': 3, 'temperature': (0.0, 0.4, 0.8)}
if WHISPER_LANGUAGE != 'auto':
transcribe_params['language'] = WHISPER_LANGUAGE
result = whisper.transcribe(**transcribe_params)

os.makedirs('output/log', exist_ok=True)
with open('output/log/transcript_language.json', 'w', encoding='utf-8') as f:
json.dump({"language": result['language']}, f, ensure_ascii=False, indent=4)
print(f"📝 已将识别到的语言保存到 output/log/transcript_language.json")

# Process transcription results
all_words: List[Dict[str, float]] = [
{'text': f"{word['text']}", 'start': word['start'], 'end': word['end']}
for segment in result['segments']
for word in segment['words']
]

df = pd.DataFrame(all_words)
return df

def save_results(df: pd.DataFrame):
# 💾 Save transcription results as Excel and text files
os.makedirs('output', exist_ok=True)
os.makedirs('output/log', exist_ok=True)
excel_path = os.path.join('output/log', "cleaned_chunks.xlsx")
# Quote every df['text'] value so Excel does not coerce numeric-looking strings into numbers
df['text'] = df['text'].apply(lambda x: f'"{x}"')
df.to_excel(excel_path, index=False)
print(f"📊 Excel文件已保存到 {excel_path}")

def get_whisper_language():
try:
with open("output/log/transcript_language.json", "r", encoding='utf-8') as f:
language = json.load(f)["language"]
return language
except Exception:
print("Could not read language info")
return None

def transcribe(video_file: str):
if not os.path.exists("output/log/cleaned_chunks.xlsx"):
# 🎥➡️📝 Transcribe video to text
df = convert_video_to_audio_and_transcribe(video_file)
if df is not None:
save_results(df)
else:
print("📊 转录结果已存在,跳过转录步骤。")

if __name__ == "__main__":
from core.step1_ytdlp import find_video_files
video_file = find_video_files()
print(f"🎬 找到的视频文件: {video_file}, 开始转录...")
transcribe(video_file)
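
The key reshaping in this new file flattens whisper_timestamped's nested segments -> words structure into one row per word. A self-contained sketch on a hand-made result dict (the data is fabricated for illustration):

import pandas as pd

result = {
    "language": "en",
    "segments": [
        {"words": [{"text": "Hello", "start": 0.0, "end": 0.4},
                   {"text": "world", "start": 0.5, "end": 0.9}]},
    ],
}
all_words = [
    {"text": w["text"], "start": w["start"], "end": w["end"]}
    for seg in result["segments"]
    for w in seg["words"]
]
df = pd.DataFrame(all_words)  # one row per word with start/end timestamps
print(df)
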
29 changes: 9 additions & 20 deletions core/step3_2_splitbymeaning.py
@@ -6,6 +6,8 @@
from difflib import SequenceMatcher
import math
from core.spacy_utils.load_nlp_model import init_nlp
from config import get_joiner, WHISPER_LANGUAGE
from core.step2_whisper_stamped import get_whisper_language

def tokenize_sentence(sentence, nlp):
# Tokenizer: count the number of words in the sentence
@@ -16,17 +18,16 @@ def find_split_positions(original, modified):
split_positions = []
parts = modified.split('[br]')
start = 0

# Remove all spaces from the original sentence
original_no_space = ''.join(original.split())
language = get_whisper_language() if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE
joiner = get_joiner(language)

for i in range(len(parts) - 1):
max_similarity = 0
best_split = None

for j in range(start, len(original_no_space)):
original_left = original_no_space[start:j]
modified_left = ''.join(parts[i].split()) # remove spaces from the modified part
for j in range(start, len(original)):
original_left = original[start:j]
modified_left = joiner.join(parts[i].split())

left_similarity = SequenceMatcher(None, original_left, modified_left).ratio()

Expand All @@ -37,25 +38,13 @@ def find_split_positions(original, modified):
if max_similarity < 0.9:
print(f"警告:找到的最佳分割点相似度较低 {max_similarity}")
if best_split is not None:
# Map the split point in the no-space version back to the original sentence
original_split = map_no_space_to_original(original, best_split)
split_positions.append(original_split)
start = best_split + 1
split_positions.append(best_split)
start = best_split
else:
print(f"警告:无法为第 {i+1} 部分找到合适的分割点。")

return split_positions

def map_no_space_to_original(original, no_space_index):
# Map an index in the no-space version back to the original sentence
space_count = 0
for i, char in enumerate(original):
if char.isspace():
space_count += 1
elif i - space_count == no_space_index:
return i
return len(original) # if no match is found, return the length of the original sentence

def split_sentence(sentence, num_parts, word_limit=18, index=-1, retry_attempt=0):
"""Split a long sentence using GPT and return the result as a string."""
split_prompt = get_split_prompt(sentence, num_parts, word_limit)
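
The rewritten find_split_positions scans the original string directly and scores candidate cut points with difflib.SequenceMatcher, which is what made the old map_no_space_to_original helper unnecessary. The matching idea in isolation (the strings are made up):

from difflib import SequenceMatcher

original = "the quick brown fox jumps"
left_part = "the quick brown"  # text before a [br] marker
best_split, best_ratio = None, 0.0
for j in range(len(original) + 1):
    ratio = SequenceMatcher(None, original[:j], left_part).ratio()
    if ratio > best_ratio:
        best_ratio, best_split = ratio, j
print(best_split, best_ratio)  # highest-similarity cut index in the original
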
2 changes: 1 addition & 1 deletion core/step4_2_translate_all.py
@@ -1,9 +1,9 @@
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from core.step4_2_translate_once import translate_lines
import pandas as pd
import json
import concurrent.futures
from core.translate_once import translate_lines
from core.step4_1_summarize import search_things_to_note_in_prompt

# Function to split text into chunks
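
This file fans translate_lines out over text chunks with a thread pool sized by the MAX_WORKERS setting raised above. A hedged sketch of that pattern (translate_chunk is a stand-in, not the project's real call):

import concurrent.futures

MAX_WORKERS = 6  # mirrors the config value

def translate_chunk(chunk: str) -> str:
    return chunk.upper()  # stand-in for the real LLM translation

chunks = ["first chunk", "second chunk", "third chunk"]
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
    results = list(pool.map(translate_chunk, chunks))  # preserves input order
print(results)
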
