修改翻译效果
This commit is contained in:
parent
e0ae9d826f
commit
45174f2082
@ -53,7 +53,7 @@ ytb_resolution: '1080'
|
|||||||
|
|
||||||
subtitle:
|
subtitle:
|
||||||
# *Maximum length of each subtitle line in characters
|
# *Maximum length of each subtitle line in characters
|
||||||
max_length: 75
|
max_length: 40
|
||||||
# *Translated subtitles are slightly larger than source subtitles, affecting the reference length for subtitle splitting
|
# *Translated subtitles are slightly larger than source subtitles, affecting the reference length for subtitle splitting
|
||||||
target_multiplier: 1.2
|
target_multiplier: 1.2
|
||||||
|
|
||||||
@ -122,11 +122,11 @@ f5tts:
|
|||||||
# *Audio speed range
|
# *Audio speed range
|
||||||
speed_factor:
|
speed_factor:
|
||||||
min: 1
|
min: 1
|
||||||
accept: 1.2 # Maximum acceptable speed
|
accept: 4 # Maximum acceptable speed
|
||||||
max: 1.4
|
max: 5
|
||||||
|
|
||||||
# *Merge audio configuration
|
# *Merge audio configuration
|
||||||
min_subtitle_duration: 2.5 # Minimum subtitle duration, will be forcibly extended
|
min_subtitle_duration: 0.2 # Minimum subtitle duration, will be forcibly extended
|
||||||
min_trim_duration: 3.5 # Subtitles shorter than this value won't be split
|
min_trim_duration: 3.5 # Subtitles shorter than this value won't be split
|
||||||
tolerance: 1.5 # Allowed extension time to the next subtitle
|
tolerance: 1.5 # Allowed extension time to the next subtitle
|
||||||
|
|
||||||
|
|||||||
@ -116,91 +116,138 @@ def generate_tts_audio(tasks_df: pd.DataFrame) -> pd.DataFrame:
|
|||||||
return tasks_df
|
return tasks_df
|
||||||
|
|
||||||
def process_chunk(chunk_df: pd.DataFrame, accept: float, min_speed: float) -> tuple[float, bool]:
|
def process_chunk(chunk_df: pd.DataFrame, accept: float, min_speed: float) -> tuple[float, bool]:
|
||||||
"""Process audio chunk and calculate speed factor"""
|
"""处理音频块并计算速度调整因子"""
|
||||||
|
# 计算当前音频块中所有音频的实际总时长
|
||||||
chunk_durs = chunk_df['real_dur'].sum()
|
chunk_durs = chunk_df['real_dur'].sum()
|
||||||
|
# 计算当前音频块的容忍总时长(包含tolerance的总时长)
|
||||||
tol_durs = chunk_df['tol_dur'].sum()
|
tol_durs = chunk_df['tol_dur'].sum()
|
||||||
|
# 计算可用的时长(总容忍时长减去最后一行的tolerance,因为最后一行的tolerance不应计入可用时长)
|
||||||
durations = tol_durs - chunk_df.iloc[-1]['tolerance']
|
durations = tol_durs - chunk_df.iloc[-1]['tolerance']
|
||||||
|
# 计算所有间隙的总时长(减去最后一行的gap,因为最后一行后面没有间隙)
|
||||||
all_gaps = chunk_df['gap'].sum() - chunk_df.iloc[-1]['gap']
|
all_gaps = chunk_df['gap'].sum() - chunk_df.iloc[-1]['gap']
|
||||||
|
|
||||||
|
# 默认保留音频间隙
|
||||||
keep_gaps = True
|
keep_gaps = True
|
||||||
|
# 速度调整的容错范围,预留0.1秒的缓冲时间
|
||||||
speed_var_error = 0.1
|
speed_var_error = 0.1
|
||||||
|
|
||||||
|
# 情况1:音频时长+间隙在可接受速度下仍小于可用时长
|
||||||
|
# 此时可以保留间隙,速度调整幅度相对较小
|
||||||
if (chunk_durs + all_gaps) / accept < durations:
|
if (chunk_durs + all_gaps) / accept < durations:
|
||||||
|
# 计算速度因子:总时长除以(可用时长减去容错时间),但不能低于最小速度
|
||||||
speed_factor = max(min_speed, (chunk_durs + all_gaps) / (durations-speed_var_error))
|
speed_factor = max(min_speed, (chunk_durs + all_gaps) / (durations-speed_var_error))
|
||||||
|
# 情况2:仅音频时长在可接受速度下小于可用时长,但加上间隙就超了
|
||||||
|
# 此时需要去掉间隙,只考虑音频本身的时长
|
||||||
elif chunk_durs / accept < durations:
|
elif chunk_durs / accept < durations:
|
||||||
|
# 计算速度因子:仅考虑音频时长,不包含间隙
|
||||||
speed_factor = max(min_speed, chunk_durs / (durations-speed_var_error))
|
speed_factor = max(min_speed, chunk_durs / (durations-speed_var_error))
|
||||||
|
# 标记不保留间隙
|
||||||
keep_gaps = False
|
keep_gaps = False
|
||||||
|
# 情况3:音频时长+间隙在可接受速度下小于总容忍时长
|
||||||
|
# 此时可以保留间隙,但需要使用总容忍时长作为目标
|
||||||
elif (chunk_durs + all_gaps) / accept < tol_durs:
|
elif (chunk_durs + all_gaps) / accept < tol_durs:
|
||||||
|
# 计算速度因子:使用总容忍时长作为目标时长
|
||||||
speed_factor = max(min_speed, (chunk_durs + all_gaps) / (tol_durs-speed_var_error))
|
speed_factor = max(min_speed, (chunk_durs + all_gaps) / (tol_durs-speed_var_error))
|
||||||
|
# 情况4:其他情况(最严格的情况)
|
||||||
|
# 音频时长即使在可接受速度下也超过了可用时间
|
||||||
else:
|
else:
|
||||||
|
# 使用总容忍时长作为目标,去掉间隙以节省时间
|
||||||
speed_factor = chunk_durs / (tol_durs-speed_var_error)
|
speed_factor = chunk_durs / (tol_durs-speed_var_error)
|
||||||
|
# 标记不保留间隙
|
||||||
keep_gaps = False
|
keep_gaps = False
|
||||||
|
|
||||||
|
# 返回四舍五入到3位小数的速度因子和是否保留间隙的标志
|
||||||
return round(speed_factor, 3), keep_gaps
|
return round(speed_factor, 3), keep_gaps
|
||||||
|
|
||||||
def merge_chunks(tasks_df: pd.DataFrame) -> pd.DataFrame:
|
def merge_chunks(tasks_df: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""Merge audio chunks and adjust timeline"""
|
"""合并音频块并调整时间轴"""
|
||||||
rprint("[bold blue]🔄 Starting audio chunks processing...[/bold blue]")
|
rprint("[bold blue]🔄 Starting audio chunks processing...[/bold blue]")
|
||||||
|
# 从配置中加载可接受的速度因子和最小速度限制
|
||||||
accept = load_key("speed_factor.accept")
|
accept = load_key("speed_factor.accept")
|
||||||
min_speed = load_key("speed_factor.min")
|
min_speed = load_key("speed_factor.min")
|
||||||
chunk_start = 0
|
chunk_start = 0 # 初始化当前音频块的起始索引
|
||||||
|
|
||||||
|
# 为每行数据添加新的字幕时间列,用于存储调整后的时间戳
|
||||||
tasks_df['new_sub_times'] = None
|
tasks_df['new_sub_times'] = None
|
||||||
|
|
||||||
|
# 遍历所有任务行,查找音频块分割点(cut_off==1表示该行是某个块的结束)
|
||||||
for index, row in tasks_df.iterrows():
|
for index, row in tasks_df.iterrows():
|
||||||
if row['cut_off'] == 1:
|
if row['cut_off'] == 1: # 如果当前行标记为分割点
|
||||||
|
# 提取当前音频块的数据(从chunk_start到当前index+1)
|
||||||
chunk_df = tasks_df.iloc[chunk_start:index+1].reset_index(drop=True)
|
chunk_df = tasks_df.iloc[chunk_start:index+1].reset_index(drop=True)
|
||||||
|
# 计算该音频块的速度调整因子和是否保留间隙
|
||||||
speed_factor, keep_gaps = process_chunk(chunk_df, accept, min_speed)
|
speed_factor, keep_gaps = process_chunk(chunk_df, accept, min_speed)
|
||||||
|
|
||||||
# 🎯 Step1: Start processing new timeline
|
# 🎯 步骤1: 开始处理新的时间轴
|
||||||
chunk_start_time = parse_df_srt_time(chunk_df.iloc[0]['start_time'])
|
# 解析块的开始时间和结束时间
|
||||||
chunk_end_time = parse_df_srt_time(chunk_df.iloc[-1]['end_time']) + chunk_df.iloc[-1]['tolerance'] # 加上tolerance才是这一块的结束
|
chunk_start_time = parse_df_srt_time(chunk_df.iloc[0]['start_time']) # 块的开始时间
|
||||||
cur_time = chunk_start_time
|
chunk_end_time = parse_df_srt_time(chunk_df.iloc[-1]['end_time']) + chunk_df.iloc[-1]['tolerance'] # 块的结束时间(加上容忍度)
|
||||||
|
cur_time = chunk_start_time # 当前时间指针,从块开始时间开始
|
||||||
|
|
||||||
|
# 遍历当前块中的每一行数据
|
||||||
for i, row in chunk_df.iterrows():
|
for i, row in chunk_df.iterrows():
|
||||||
# If i is not 0, which is not the first row of the chunk, cur_time needs to be added with the gap of the previous row, remember to divide by speed_factor
|
# 如果不是块的第一行,且需要保留间隙,则添加前一行的间隙时间(除以速度因子)
|
||||||
if i != 0 and keep_gaps:
|
if i != 0 and keep_gaps:
|
||||||
cur_time += chunk_df.iloc[i-1]['gap']/speed_factor
|
cur_time += chunk_df.iloc[i-1]['gap']/speed_factor
|
||||||
new_sub_times = []
|
|
||||||
number = row['number']
|
new_sub_times = [] # 存储当前行的新字幕时间
|
||||||
|
number = row['number'] # 获取当前行的编号
|
||||||
|
# 解析行数据,如果是字符串则eval,否则直接使用
|
||||||
lines = eval(row['lines']) if isinstance(row['lines'], str) else row['lines']
|
lines = eval(row['lines']) if isinstance(row['lines'], str) else row['lines']
|
||||||
|
|
||||||
|
# 处理该行中的每一句话
|
||||||
for line_index, line in enumerate(lines):
|
for line_index, line in enumerate(lines):
|
||||||
# 🔄 Step2: Start speed change and save as OUTPUT_FILE_TEMPLATE
|
# 🔄 步骤2: 开始速度调整并保存为输出文件
|
||||||
|
# 构建临时文件和输出文件路径
|
||||||
temp_file = TEMP_FILE_TEMPLATE.format(f"{number}_{line_index}")
|
temp_file = TEMP_FILE_TEMPLATE.format(f"{number}_{line_index}")
|
||||||
output_file = OUTPUT_FILE_TEMPLATE.format(f"{number}_{line_index}")
|
output_file = OUTPUT_FILE_TEMPLATE.format(f"{number}_{line_index}")
|
||||||
|
# 调整音频速度
|
||||||
adjust_audio_speed(temp_file, output_file, speed_factor)
|
adjust_audio_speed(temp_file, output_file, speed_factor)
|
||||||
|
# 获取调整后音频的时长
|
||||||
ad_dur = get_audio_duration(output_file)
|
ad_dur = get_audio_duration(output_file)
|
||||||
|
# 记录新的字幕时间(开始时间和结束时间)
|
||||||
new_sub_times.append([cur_time, cur_time+ad_dur])
|
new_sub_times.append([cur_time, cur_time+ad_dur])
|
||||||
|
# 更新当前时间指针
|
||||||
cur_time += ad_dur
|
cur_time += ad_dur
|
||||||
# 🔄 Step3: Find corresponding main DataFrame index and update new_sub_times
|
|
||||||
|
# 🔄 步骤3: 找到对应的主DataFrame索引并更新new_sub_times
|
||||||
|
# 在主DataFrame中找到对应行的索引
|
||||||
main_df_idx = tasks_df[tasks_df['number'] == row['number']].index[0]
|
main_df_idx = tasks_df[tasks_df['number'] == row['number']].index[0]
|
||||||
|
# 更新主DataFrame中该行的新字幕时间
|
||||||
tasks_df.at[main_df_idx, 'new_sub_times'] = new_sub_times
|
tasks_df.at[main_df_idx, 'new_sub_times'] = new_sub_times
|
||||||
# 🎯 Step4: Choose emoji based on speed_factor and accept comparison
|
|
||||||
emoji = "⚡" if speed_factor <= accept else "⚠️"
|
# 🎯 步骤4: 根据速度因子与可接受值的比较选择表情符号
|
||||||
|
emoji = "⚡" if speed_factor <= accept else "⚠️" # 速度正常用闪电,超速用警告
|
||||||
rprint(f"[cyan]{emoji} Processed chunk {chunk_start} to {index} with speed factor {speed_factor}[/cyan]")
|
rprint(f"[cyan]{emoji} Processed chunk {chunk_start} to {index} with speed factor {speed_factor}[/cyan]")
|
||||||
# 🔄 Step5: Check if the last row exceeds the range
|
|
||||||
if cur_time > chunk_end_time:
|
# 🔄 步骤5: 检查最后一行是否超出时间范围
|
||||||
time_diff = cur_time - chunk_end_time
|
if cur_time > chunk_end_time: # 如果当前时间超过了块的结束时间
|
||||||
if time_diff <= 0.6: # If exceeding time is within 0.6 seconds, truncate the last audio
|
time_diff = cur_time - chunk_end_time # 计算超出的时间
|
||||||
|
if time_diff <= 0.6: # 如果超出时间在0.6秒以内,裁剪最后的音频
|
||||||
rprint(f"[yellow]⚠️ Chunk {chunk_start} to {index} exceeds by {time_diff:.3f}s, truncating last audio[/yellow]")
|
rprint(f"[yellow]⚠️ Chunk {chunk_start} to {index} exceeds by {time_diff:.3f}s, truncating last audio[/yellow]")
|
||||||
# Get the last audio file
|
|
||||||
last_number = tasks_df.iloc[index]['number']
|
# 获取最后一个音频文件的信息
|
||||||
|
last_number = tasks_df.iloc[index]['number'] # 最后一行的编号
|
||||||
last_lines = eval(tasks_df.iloc[index]['lines']) if isinstance(tasks_df.iloc[index]['lines'], str) else tasks_df.iloc[index]['lines']
|
last_lines = eval(tasks_df.iloc[index]['lines']) if isinstance(tasks_df.iloc[index]['lines'], str) else tasks_df.iloc[index]['lines']
|
||||||
last_line_index = len(last_lines) - 1
|
last_line_index = len(last_lines) - 1 # 最后一句话的索引
|
||||||
last_file = OUTPUT_FILE_TEMPLATE.format(f"{last_number}_{last_line_index}")
|
last_file = OUTPUT_FILE_TEMPLATE.format(f"{last_number}_{last_line_index}") # 最后一个音频文件路径
|
||||||
|
|
||||||
# Calculate the duration to keep
|
# 计算需要保留的音频时长
|
||||||
audio = AudioSegment.from_wav(last_file)
|
audio = AudioSegment.from_wav(last_file) # 加载音频文件
|
||||||
original_duration = len(audio) / 1000 # Convert to seconds
|
original_duration = len(audio) / 1000 # 原始时长(转换为秒)
|
||||||
new_duration = original_duration - time_diff
|
new_duration = original_duration - time_diff # 新的时长(减去超出部分)
|
||||||
trimmed_audio = audio[:(new_duration * 1000)] # pydub uses milliseconds
|
trimmed_audio = audio[:(new_duration * 1000)] # 裁剪音频(pydub使用毫秒)
|
||||||
trimmed_audio.export(last_file, format="wav")
|
trimmed_audio.export(last_file, format="wav") # 导出裁剪后的音频
|
||||||
|
|
||||||
# Update the last timestamp
|
# 更新最后的时间戳
|
||||||
last_times = tasks_df.at[index, 'new_sub_times']
|
last_times = tasks_df.at[index, 'new_sub_times'] # 获取最后的时间数据
|
||||||
last_times[-1][1] = chunk_end_time
|
last_times[-1][1] = chunk_end_time # 将最后一句的结束时间设为块的结束时间
|
||||||
tasks_df.at[index, 'new_sub_times'] = last_times
|
tasks_df.at[index, 'new_sub_times'] = last_times # 更新到DataFrame中
|
||||||
else:
|
else:
|
||||||
|
# 如果超出时间过多,抛出异常
|
||||||
raise Exception(f"Chunk {chunk_start} to {index} exceeds the chunk end time {chunk_end_time:.2f} seconds with current time {cur_time:.2f} seconds")
|
raise Exception(f"Chunk {chunk_start} to {index} exceeds the chunk end time {chunk_end_time:.2f} seconds with current time {cur_time:.2f} seconds")
|
||||||
|
|
||||||
|
# 更新下一个块的起始索引
|
||||||
chunk_start = index+1
|
chunk_start = index+1
|
||||||
|
|
||||||
rprint("[bold green]✅ Audio chunks processing completed![/bold green]")
|
rprint("[bold green]✅ Audio chunks processing completed![/bold green]")
|
||||||
|
|||||||
@ -16,7 +16,7 @@ DUB_VIDEO = "output/output_dub.mp4"
|
|||||||
DUB_SUB_FILE = 'output/dub.srt'
|
DUB_SUB_FILE = 'output/dub.srt'
|
||||||
DUB_AUDIO = 'output/dub.mp3'
|
DUB_AUDIO = 'output/dub.mp3'
|
||||||
|
|
||||||
TRANS_FONT_SIZE = 17
|
TRANS_FONT_SIZE = 12
|
||||||
TRANS_FONT_NAME = 'Arial'
|
TRANS_FONT_NAME = 'Arial'
|
||||||
if platform.system() == 'Linux':
|
if platform.system() == 'Linux':
|
||||||
TRANS_FONT_NAME = 'NotoSansCJK-Regular'
|
TRANS_FONT_NAME = 'NotoSansCJK-Regular'
|
||||||
|
|||||||
@ -6,7 +6,8 @@ from core.utils import check_file_exists
|
|||||||
def split_by_spacy():
|
def split_by_spacy():
|
||||||
nlp = init_nlp()
|
nlp = init_nlp()
|
||||||
split_by_mark(nlp)
|
split_by_mark(nlp)
|
||||||
split_by_comma_main(nlp)
|
# 使用新的强制逗号切分函数,替换原有的 split_by_comma_main(nlp)
|
||||||
|
force_split_by_comma_main()
|
||||||
split_sentences_main(nlp)
|
split_sentences_main(nlp)
|
||||||
split_long_by_root_main(nlp)
|
split_long_by_root_main(nlp)
|
||||||
return
|
return
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import re
|
|||||||
from rich.panel import Panel
|
from rich.panel import Panel
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
import autocorrect_py as autocorrect
|
import autocorrect_py as autocorrect
|
||||||
|
from thefuzz import fuzz
|
||||||
from core.utils import *
|
from core.utils import *
|
||||||
from core.utils.models import *
|
from core.utils.models import *
|
||||||
console = Console()
|
console = Console()
|
||||||
@ -56,7 +57,116 @@ def show_difference(str1, str2):
|
|||||||
print("Position markers: " + "".join("^" if i in diff_positions else " " for i in range(max(len(str1), len(str2)))))
|
print("Position markers: " + "".join("^" if i in diff_positions else " " for i in range(max(len(str1), len(str2)))))
|
||||||
print(f"Difference indices: {diff_positions}")
|
print(f"Difference indices: {diff_positions}")
|
||||||
|
|
||||||
|
def get_sentence_timestamps_fuzzy(df_words, df_sentences):
|
||||||
|
"""使用模糊匹配算法获取句子时间戳,基于thefuzz词袋模型"""
|
||||||
|
time_stamp_list = []
|
||||||
|
|
||||||
|
# 准备原始单词列表,保留原始单词和其时间戳的对应关系
|
||||||
|
words_list = []
|
||||||
|
for idx, row in df_words.iterrows():
|
||||||
|
word_text = str(row['text']).strip()
|
||||||
|
if word_text: # 过滤空词
|
||||||
|
words_list.append({
|
||||||
|
'text': word_text,
|
||||||
|
'start': float(row['start']),
|
||||||
|
'end': float(row['end']),
|
||||||
|
'index': idx
|
||||||
|
})
|
||||||
|
|
||||||
|
console.print(f"📊 总共 {len(words_list)} 个词需要匹配")
|
||||||
|
|
||||||
|
# 用于跟踪已使用的词的索引,避免重复匹配
|
||||||
|
used_word_indices = set()
|
||||||
|
|
||||||
|
for sentence_idx, sentence in df_sentences['Source'].items():
|
||||||
|
clean_sentence = remove_punctuation(sentence.lower()).replace(" ", "")
|
||||||
|
console.print(f"🔍 匹配句子 {sentence_idx}: {sentence}")
|
||||||
|
console.print(f" 清理后: {clean_sentence}")
|
||||||
|
|
||||||
|
# 动态窗口设置
|
||||||
|
sentence_length = len(clean_sentence)
|
||||||
|
min_window_size = max(1, int(sentence_length * 0.6)) # 最小60%
|
||||||
|
max_window_size = int(sentence_length * 1.5) # 最大150%
|
||||||
|
|
||||||
|
best_match_score = 0
|
||||||
|
best_match_start = 0
|
||||||
|
best_match_end = 0
|
||||||
|
|
||||||
|
# 滑动窗口搜索
|
||||||
|
for start_idx in range(len(words_list)):
|
||||||
|
if start_idx in used_word_indices:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for window_size in range(min_window_size, min(max_window_size + 1, len(words_list) - start_idx + 1)):
|
||||||
|
end_idx = start_idx + window_size - 1
|
||||||
|
|
||||||
|
# 检查窗口内是否有已使用的词
|
||||||
|
if any(i in used_word_indices for i in range(start_idx, end_idx + 1)):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 构建窗口内的文本
|
||||||
|
window_text = ''.join([remove_punctuation(words_list[i]['text'].lower())
|
||||||
|
for i in range(start_idx, end_idx + 1)])
|
||||||
|
|
||||||
|
# 使用thefuzz计算相似度(词袋模型,对词序不敏感)
|
||||||
|
similarity = fuzz.token_set_ratio(clean_sentence, window_text)
|
||||||
|
|
||||||
|
# 如果找到更好的匹配
|
||||||
|
if similarity > best_match_score:
|
||||||
|
best_match_score = similarity
|
||||||
|
best_match_start = start_idx
|
||||||
|
best_match_end = end_idx
|
||||||
|
|
||||||
|
# 评估匹配结果
|
||||||
|
if best_match_score >= 70: # 相似度阈值设为70%
|
||||||
|
# 标记使用的词索引
|
||||||
|
for i in range(best_match_start, best_match_end + 1):
|
||||||
|
used_word_indices.add(i)
|
||||||
|
|
||||||
|
# 获取时间戳
|
||||||
|
start_time = words_list[best_match_start]['start']
|
||||||
|
end_time = words_list[best_match_end]['end']
|
||||||
|
|
||||||
|
matched_text = ''.join([words_list[i]['text'] for i in range(best_match_start, best_match_end + 1)])
|
||||||
|
|
||||||
|
console.print(f" ✅ 匹配成功! 相似度: {best_match_score}%")
|
||||||
|
console.print(f" 匹配文本: {matched_text}")
|
||||||
|
console.print(f" 时间范围: {start_time:.2f}s - {end_time:.2f}s")
|
||||||
|
|
||||||
|
time_stamp_list.append((start_time, end_time))
|
||||||
|
else:
|
||||||
|
console.print(f" ❌ 匹配失败! 最高相似度: {best_match_score}%")
|
||||||
|
console.print(f" 句子: {sentence}")
|
||||||
|
|
||||||
|
# 降级处理:如果相似度太低,尝试按比例分配时间
|
||||||
|
if len(time_stamp_list) > 0:
|
||||||
|
# 基于之前的句子推算时间
|
||||||
|
prev_end = time_stamp_list[-1][1]
|
||||||
|
remaining_duration = words_list[-1]['end'] - prev_end
|
||||||
|
estimated_duration = max(2.0, remaining_duration / (len(df_sentences) - sentence_idx))
|
||||||
|
start_time = prev_end
|
||||||
|
end_time = start_time + estimated_duration
|
||||||
|
console.print(f" 📍 使用估算时间: {start_time:.2f}s - {end_time:.2f}s")
|
||||||
|
time_stamp_list.append((start_time, end_time))
|
||||||
|
else:
|
||||||
|
# 如果是第一句,使用视频开头
|
||||||
|
start_time = 0.0
|
||||||
|
end_time = max(3.0, words_list[0]['end'] if words_list else 3.0)
|
||||||
|
console.print(f" 📍 使用默认开头时间: {start_time:.2f}s - {end_time:.2f}s")
|
||||||
|
time_stamp_list.append((start_time, end_time))
|
||||||
|
|
||||||
|
console.print(f"🎯 完成句子时间戳匹配,共处理 {len(time_stamp_list)} 句")
|
||||||
|
return time_stamp_list
|
||||||
|
|
||||||
def get_sentence_timestamps(df_words, df_sentences):
|
def get_sentence_timestamps(df_words, df_sentences):
|
||||||
|
"""时间戳获取函数的主入口,优先使用模糊匹配"""
|
||||||
|
try:
|
||||||
|
return get_sentence_timestamps_fuzzy(df_words, df_sentences)
|
||||||
|
except Exception as e:
|
||||||
|
console.print(f"❌ 模糊匹配失败: {e}")
|
||||||
|
console.print("🔄 尝试回退到精确匹配...")
|
||||||
|
|
||||||
|
# 原有的精确匹配代码作为备选
|
||||||
time_stamp_list = []
|
time_stamp_list = []
|
||||||
|
|
||||||
# Build complete string and position mapping
|
# Build complete string and position mapping
|
||||||
@ -157,7 +267,7 @@ def align_timestamp_main():
|
|||||||
console.print(Panel("[bold green]🎉📝 Subtitles generation completed! Please check in the `output` folder 👀[/bold green]"))
|
console.print(Panel("[bold green]🎉📝 Subtitles generation completed! Please check in the `output` folder 👀[/bold green]"))
|
||||||
|
|
||||||
# for audio
|
# for audio
|
||||||
df_translate_for_audio = pd.read_excel(_5_REMERGED) # use remerged file to avoid unmatched lines when dubbing
|
df_translate_for_audio = pd.read_excel(_5_SPLIT_SUB) # use remerged file to avoid unmatched lines when dubbing
|
||||||
df_translate_for_audio['Translation'] = df_translate_for_audio['Translation'].apply(clean_translation)
|
df_translate_for_audio['Translation'] = df_translate_for_audio['Translation'].apply(clean_translation)
|
||||||
|
|
||||||
align_timestamp(df_text, df_translate_for_audio, AUDIO_SUBTITLE_OUTPUT_CONFIGS, _AUDIO_DIR)
|
align_timestamp(df_text, df_translate_for_audio, AUDIO_SUBTITLE_OUTPUT_CONFIGS, _AUDIO_DIR)
|
||||||
|
|||||||
@ -6,7 +6,7 @@ import platform
|
|||||||
from core.utils import *
|
from core.utils import *
|
||||||
|
|
||||||
SRC_FONT_SIZE = 15
|
SRC_FONT_SIZE = 15
|
||||||
TRANS_FONT_SIZE = 17
|
TRANS_FONT_SIZE = 12
|
||||||
FONT_NAME = 'Arial'
|
FONT_NAME = 'Arial'
|
||||||
TRANS_FONT_NAME = 'Arial'
|
TRANS_FONT_NAME = 'Arial'
|
||||||
|
|
||||||
|
|||||||
@ -9,7 +9,7 @@ from pydub.silence import detect_silence
|
|||||||
from pydub.utils import mediainfo
|
from pydub.utils import mediainfo
|
||||||
from rich import print as rprint
|
from rich import print as rprint
|
||||||
|
|
||||||
def normalize_audio_volume(audio_path, output_path, target_db = -20.0, format = "wav"):
|
def normalize_audio_volume(audio_path, output_path, target_db = -15.0, format = "wav"):
|
||||||
audio = AudioSegment.from_file(audio_path)
|
audio = AudioSegment.from_file(audio_path)
|
||||||
change_in_dBFS = target_db - audio.dBFS
|
change_in_dBFS = target_db - audio.dBFS
|
||||||
normalized_audio = audio.apply_gain(change_in_dBFS)
|
normalized_audio = audio.apply_gain(change_in_dBFS)
|
||||||
|
|||||||
@ -204,17 +204,21 @@ def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt):
|
|||||||
prompt_expressiveness = f'''
|
prompt_expressiveness = f'''
|
||||||
## Role
|
## Role
|
||||||
You are a professional Netflix subtitle translator and language consultant.
|
You are a professional Netflix subtitle translator and language consultant.
|
||||||
Your expertise lies not only in accurately understanding the original {src_language} but also in optimizing the {TARGET_LANGUAGE} translation to better suit the target language's expression habits and cultural background.
|
Your expertise lies not only in accurately understanding the original {src_language} but also in optimizing the {TARGET_LANGUAGE} translation to better suit the target language's expression habits and cultural background, with a strong focus on creating subtitles suitable for video.
|
||||||
|
|
||||||
## Task
|
## Task
|
||||||
We already have a direct translation version of the original {src_language} subtitles.
|
We already have a direct translation version of the original {src_language} subtitles.
|
||||||
Your task is to reflect on and improve these direct translations to create more natural and fluent {TARGET_LANGUAGE} subtitles.
|
Your task is to reflect on and improve these direct translations to create more natural, fluent, and concise {TARGET_LANGUAGE} subtitles for video voice-overs and on-screen text.
|
||||||
|
|
||||||
1. Analyze the direct translation results line by line, pointing out existing issues
|
## Key Objective: Brevity and Natural Pacing
|
||||||
2. Provide detailed modification suggestions
|
A primary challenge is that the source language, Chinese, has a higher information density than English. Direct translations often become too long and wordy, forcing a rapid speaking pace for voice-overs and creating subtitles that are too long for comfortable reading.
|
||||||
3. Perform free translation based on your analysis
|
Your main goal is to significantly shorten the translated sentences to match the rhythm and information delivery speed of the original language, while preserving the core meaning and intent. This involves active simplification, rephrasing, and using more idiomatic expressions.
|
||||||
4. Do not add comments or explanations in the translation, as the subtitles are for the audience to read
|
|
||||||
5. Do not leave empty lines in the free translation, as the subtitles are for the audience to read
|
1. Analyze the direct translation results line by line, pointing out existing issues, especially regarding length and awkward phrasing.
|
||||||
|
2. Provide detailed modification suggestions focused on making the text shorter and more natural.
|
||||||
|
3. Perform a free translation based on your analysis that is optimized for speaking and reading on screen.
|
||||||
|
4. Do not add comments or explanations in the translation, as the subtitles are for the audience to read.
|
||||||
|
5. Do not leave empty lines in the free translation, as the subtitles are for the audience to read.
|
||||||
|
|
||||||
{shared_prompt}
|
{shared_prompt}
|
||||||
|
|
||||||
@ -222,14 +226,15 @@ Your task is to reflect on and improve these direct translations to create more
|
|||||||
Please use a two-step thinking process to handle the text line by line:
|
Please use a two-step thinking process to handle the text line by line:
|
||||||
|
|
||||||
1. Direct Translation Reflection:
|
1. Direct Translation Reflection:
|
||||||
- Evaluate language fluency
|
- Prioritize Brevity and Pacing: Is the translation too wordy compared to the likely original? Identify redundant words, overly complex sentence structures, and literal translations that make the sentence unnecessarily long. This is the most critical check.
|
||||||
- Check if the language style is consistent with the original text
|
- Evaluate Language Fluency: Does the sentence sound natural or awkward?
|
||||||
- Check the conciseness of the subtitles, point out where the translation is too wordy
|
- Check Language Style Consistency: Is the style consistent with the original text (e.g., casual, technical, formal)?
|
||||||
|
|
||||||
2. {TARGET_LANGUAGE} Free Translation:
|
2. {TARGET_LANGUAGE} Free Translation:
|
||||||
- Aim for contextual smoothness and naturalness, conforming to {TARGET_LANGUAGE} expression habits
|
- Achieve Conciseness and Impact: Actively shorten the sentence. Use stronger verbs, remove filler words, and rephrase to convey the same meaning with fewer words. The goal is a punchier, clearer line that's easy to say and read quickly.
|
||||||
- Ensure it's easy for {TARGET_LANGUAGE} audience to understand and accept
|
- Embrace Localization and Colloquial Language: Aim for contextual smoothness that conforms to modern {TARGET_LANGUAGE} speaking habits. Use everyday language and idioms that a native speaker would naturally use.
|
||||||
- Adapt the language style to match the theme (e.g., use casual language for tutorials, professional terminology for technical content, formal language for documentaries)
|
- Ensure Clarity and Accessibility: The final translation must be easy for the target audience to understand instantly.
|
||||||
|
- Adapt the Language Style: Match the theme (e.g., use casual language for tutorials, professional terminology for technical content, formal language for documentaries).
|
||||||
</Translation Analysis Steps>
|
</Translation Analysis Steps>
|
||||||
|
|
||||||
## INPUT
|
## INPUT
|
||||||
@ -244,6 +249,7 @@ Please use a two-step thinking process to handle the text line by line:
|
|||||||
|
|
||||||
Note: Start you answer with ```json and end with ```, do not add any other text.
|
Note: Start you answer with ```json and end with ```, do not add any other text.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
return prompt_expressiveness.strip()
|
return prompt_expressiveness.strip()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -6,24 +6,15 @@ from core.spacy_utils.load_nlp_model import init_nlp, SPLIT_BY_COMMA_FILE, SPLIT
|
|||||||
|
|
||||||
warnings.filterwarnings("ignore", category=FutureWarning)
|
warnings.filterwarnings("ignore", category=FutureWarning)
|
||||||
|
|
||||||
def is_valid_phrase(phrase):
|
|
||||||
# 🔍 Check for subject and verb
|
|
||||||
has_subject = any(token.dep_ in ["nsubj", "nsubjpass"] or token.pos_ == "PRON" for token in phrase)
|
|
||||||
has_verb = any((token.pos_ == "VERB" or token.pos_ == 'AUX') for token in phrase)
|
|
||||||
return (has_subject and has_verb)
|
|
||||||
|
|
||||||
def analyze_comma(start, doc, token):
|
def analyze_comma(start, doc, token):
|
||||||
left_phrase = doc[max(start, token.i - 9):token.i]
|
left_phrase = doc[max(start, token.i - 9):token.i]
|
||||||
right_phrase = doc[token.i + 1:min(len(doc), token.i + 10)]
|
right_phrase = doc[token.i + 1:min(len(doc), token.i + 10)]
|
||||||
|
|
||||||
suitable_for_splitting = is_valid_phrase(right_phrase) # and is_valid_phrase(left_phrase) # ! no need to chekc left phrase
|
# 基于字符数判定(去除标点),左右各 ≥6 字符则切分
|
||||||
|
left_text = ''.join([t.text for t in left_phrase if not t.is_punct]).strip()
|
||||||
|
right_text = ''.join([t.text for t in right_phrase if not t.is_punct]).strip()
|
||||||
|
|
||||||
# 🚫 Remove punctuation and check word count
|
suitable_for_splitting = len(left_text) >= 6 and len(right_text) >= 6
|
||||||
left_words = [t for t in left_phrase if not t.is_punct]
|
|
||||||
right_words = list(itertools.takewhile(lambda t: not t.is_punct, right_phrase)) # ! only check the first part of the right phrase
|
|
||||||
|
|
||||||
if len(left_words) <= 3 or len(right_words) <= 3:
|
|
||||||
suitable_for_splitting = False
|
|
||||||
|
|
||||||
return suitable_for_splitting
|
return suitable_for_splitting
|
||||||
|
|
||||||
@ -63,6 +54,27 @@ def split_by_comma_main(nlp):
|
|||||||
|
|
||||||
rprint(f"[green]💾 Sentences split by commas saved to → `{SPLIT_BY_COMMA_FILE}`[/green]")
|
rprint(f"[green]💾 Sentences split by commas saved to → `{SPLIT_BY_COMMA_FILE}`[/green]")
|
||||||
|
|
||||||
|
def force_split_by_comma_main():
|
||||||
|
"""
|
||||||
|
强制在每个中文逗号处切分句子,不使用复杂的NLP判断。
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
with open(SPLIT_BY_MARK_FILE, "r", encoding="utf-8") as input_file:
|
||||||
|
content = input_file.read()
|
||||||
|
|
||||||
|
# 将所有中文逗号替换为换行符,实现强制切分
|
||||||
|
content_split = content.replace(',', '\n')
|
||||||
|
|
||||||
|
with open(SPLIT_BY_COMMA_FILE, "w", encoding="utf-8") as output_file:
|
||||||
|
output_file.write(content_split)
|
||||||
|
|
||||||
|
os.remove(SPLIT_BY_MARK_FILE)
|
||||||
|
|
||||||
|
rprint(f"[green]💾 Sentences forcefully split by Chinese commas saved to → `{SPLIT_BY_COMMA_FILE}`[/green]")
|
||||||
|
except Exception as e:
|
||||||
|
rprint(f"[red]Error in force_split_by_comma_main: {e}[/red]")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
nlp = init_nlp()
|
nlp = init_nlp()
|
||||||
split_by_comma_main(nlp)
|
split_by_comma_main(nlp)
|
||||||
|
|||||||
1
run.py
1
run.py
@ -206,6 +206,7 @@ def main():
|
|||||||
update_key("target_language", "English") # 目标语言为英文
|
update_key("target_language", "English") # 目标语言为英文
|
||||||
update_key("tts_method", "edge_tts") # 使用 Edge TTS
|
update_key("tts_method", "edge_tts") # 使用 Edge TTS
|
||||||
update_key("burn_subtitles", True) # 烧录字幕到视频
|
update_key("burn_subtitles", True) # 烧录字幕到视频
|
||||||
|
update_key("reflect_translate", True) # 启用 Expressiveness 阶段进行翻译润色
|
||||||
|
|
||||||
# ===================================================
|
# ===================================================
|
||||||
|
|
||||||
|
|||||||
235
run_backup.py
Normal file
235
run_backup.py
Normal file
@ -0,0 +1,235 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
VideoLingo 非图形界面版本
|
||||||
|
支持通过参数赋值的方式处理视频
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# 设置路径
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
os.environ['PATH'] += os.pathsep + current_dir
|
||||||
|
sys.path.append(current_dir)
|
||||||
|
|
||||||
|
# 导入核心模块
|
||||||
|
from core.utils.config_utils import load_key, update_key
|
||||||
|
from core.utils.onekeycleanup import cleanup
|
||||||
|
from core.utils.delete_retry_dubbing import delete_dubbing_files
|
||||||
|
from translations.translations import translate as t
|
||||||
|
from core import (
|
||||||
|
_2_asr, _3_1_split_nlp, _3_2_split_meaning,
|
||||||
|
_4_1_summarize, _4_2_translate, _5_split_sub,
|
||||||
|
_6_gen_sub, _7_sub_into_vid, _8_1_audio_task,
|
||||||
|
_8_2_dub_chunks, _9_refer_audio, _10_gen_audio,
|
||||||
|
_11_merge_audio, _12_dub_to_vid
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class VideoLingoProcessor:
    """Headless VideoLingo pipeline driver for a single input video.

    Wraps the subtitle stage (ASR -> split -> translate -> align -> burn)
    and the dubbing stage (audio tasks -> TTS -> merge) behind simple
    methods, with per-stage success/failure reporting.
    """

    def __init__(self, input_path, output_dir="output"):
        """Initialize the processor.

        Args:
            input_path (str): Path to the input video file.
            output_dir (str): Directory for final outputs, defaults to "output".

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
        """
        self.input_path = Path(input_path)
        self.output_dir = Path(output_dir)

        # Validate the input file before touching the filesystem.
        if not self.input_path.exists():
            raise FileNotFoundError(f"输入文件不存在: {self.input_path}")

        # Create the output directory. parents=True so a nested path such as
        # "runs/2024/out" also works (the original raised FileNotFoundError
        # when intermediate directories were missing).
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Expected locations of the final rendered videos.
        self.sub_video = self.output_dir / "output_sub.mp4"
        self.dub_video = self.output_dir / "output_dub.mp4"

        print(f"📹 输入视频: {self.input_path}")
        print(f"📁 输出目录: {self.output_dir}")

    def setup_video_file(self):
        """Copy the input video into the pipeline's working directory.

        Returns:
            Path: Location of the staged copy inside "output".

        NOTE(review): the core pipeline reads its input from the hard-coded
        "output" directory, independent of ``self.output_dir`` — confirm this
        coupling is intentional before unifying the two paths.
        """
        staging_dir = Path("output")
        staging_dir.mkdir(exist_ok=True)

        target_video = staging_dir / self.input_path.name
        if not target_video.exists():
            print(f"📋 复制视频文件到: {target_video}")
            # copy2 preserves timestamps/metadata along with the content.
            shutil.copy2(self.input_path, target_video)

        return target_video

    def process_subtitles(self):
        """Run the subtitle stage.

        Returns:
            bool: True on success, False if any step raised.
        """
        print("\n🎬 开始字幕处理流程...")

        try:
            print("🎤 使用 Whisper 进行语音转录...")
            _2_asr.transcribe()

            print("✂️ 拆分长句子...")
            _3_1_split_nlp.split_by_spacy()
            _3_2_split_meaning.split_sentences_by_meaning()

            print("📝 总结和翻译...")
            _4_1_summarize.get_summary()

            # Optionally pause so the user can hand-edit the terminology file
            # before translation begins.
            if load_key("pause_before_translate"):
                input("⚠️ 暂停以便编辑术语。请前往 'output/log/terminology.json' 编辑术语表,然后按回车继续...")

            _4_2_translate.translate_all()

            print("⏱️ 处理和对齐字幕...")
            _5_split_sub.split_for_sub_main()
            _6_gen_sub.align_timestamp_main()

            print("🎞️ 将字幕合并到视频...")
            _7_sub_into_vid.merge_subtitles_to_video()

            print("✅ 字幕处理完成! 🎉")
            return True

        except Exception as e:
            print(f"❌ 字幕处理失败: {str(e)}")
            return False

    def process_dubbing(self):
        """Run the dubbing stage.

        Returns:
            bool: True on success, False if any step raised.
        """
        print("\n🎙️ 开始配音处理流程...")

        try:
            print("📋 生成音频任务...")
            _8_1_audio_task.gen_audio_task_main()
            _8_2_dub_chunks.gen_dub_chunks()

            print("🎵 提取参考音频...")
            _9_refer_audio.extract_refer_audio_main()

            print("🔊 生成所有音频...")
            _10_gen_audio.gen_audio()

            print("🎶 合并完整音频...")
            _11_merge_audio.merge_full_audio()

            print("🎬 将配音合并到视频...")
            _12_dub_to_vid.merge_video_audio()

            print("✅ 配音处理完成! 🎇")
            return True

        except Exception as e:
            print(f"❌ 配音处理失败: {str(e)}")
            return False

    def process_all(self, include_dubbing=True):
        """Run the complete pipeline: staging, subtitles, and optionally dubbing.

        Args:
            include_dubbing (bool): Also run the dubbing stage, default True.

        Returns:
            bool: True if every requested stage succeeded.
        """
        print("🚀 开始 VideoLingo 处理流程")
        print("=" * 50)

        # Stage the input video where the core pipeline expects it.
        self.setup_video_file()

        if not self.process_subtitles():
            print("❌ 字幕处理失败,停止执行")
            return False

        # Dubbing only runs after subtitles succeeded.
        if include_dubbing and not self.process_dubbing():
            print("❌ 配音处理失败")
            return False

        print("\n🎊 所有处理完成!")
        print(f"📁 输出文件位于: {self.output_dir.absolute()}")

        # Report which final artifacts actually exist.
        if self.sub_video.exists():
            print(f" 🎬 字幕视频: {self.sub_video}")
        if include_dubbing and self.dub_video.exists():
            print(f" 🎙️ 配音视频: {self.dub_video}")

        return True

    def cleanup_files(self):
        """Remove intermediate files via the shared cleanup helper."""
        print("🧹 清理临时文件...")
        cleanup()
        print("✅ 清理完成")

    def delete_dubbing_files(self):
        """Delete dubbing artifacts so the dubbing stage can be retried."""
        print("🗑️ 删除配音文件...")
        # Calls the module-level helper imported from
        # core.utils.delete_retry_dubbing, not this method.
        delete_dubbing_files()
        print("✅ 配音文件删除完成")
def main():
    """Example entry point: configure and run one zh -> en processing job."""
    # ==================== Job parameters ====================
    # Edit these values before running.

    # Input video path (required).
    INPUT_VIDEO_PATH = "/root/autodl-tmp/output/Dajing_Gate/video_gen_20250721_145356/videos/composed_video_20250721_145724.mp4"

    # Output directory (optional, defaults to "output").
    OUTPUT_DIR = "output"

    # Whether to run the dubbing stage as well.
    INCLUDE_DUBBING = True

    # ==================== Optional config overrides ====================
    # These overwrite the matching keys in config.yaml: Chinese source audio,
    # English target subtitles, Edge TTS voices, subtitles burned into video.
    overrides = {
        "whisper.language": "zh",
        "target_language": "English",
        "tts_method": "edge_tts",
        "burn_subtitles": True,
    }
    for key, value in overrides.items():
        update_key(key, value)
    # ===================================================================

    try:
        # Build the processor and run the requested stages.
        processor = VideoLingoProcessor(
            input_path=INPUT_VIDEO_PATH,
            output_dir=OUTPUT_DIR,
        )
        if processor.process_all(include_dubbing=INCLUDE_DUBBING):
            print("\n🎉 处理成功完成!")
        else:
            print("\n❌ 处理过程中出现错误")

    except FileNotFoundError as exc:
        print(f"❌ 文件错误: {exc}")
        print("请检查输入视频路径是否正确")
    except Exception as exc:
        print(f"❌ 处理失败: {exc}")


if __name__ == "__main__":
    main()
|
||||||
56
test_dubbing_only.py
Normal file
56
test_dubbing_only.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
只测试配音处理流程
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# 设置路径
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
os.environ['PATH'] += os.pathsep + current_dir
|
||||||
|
sys.path.append(current_dir)
|
||||||
|
|
||||||
|
# 导入核心模块
|
||||||
|
from core.utils.config_utils import load_key, update_key
|
||||||
|
from core import (
|
||||||
|
_8_1_audio_task, _8_2_dub_chunks, _9_refer_audio, _10_gen_audio,
|
||||||
|
_11_merge_audio, _12_dub_to_vid
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_dubbing_process():
    """Run only the dubbing half of the pipeline (stages 8 through 12).

    Returns:
        bool: True on success, False if any stage raised.
    """
    print("🎙️ 开始配音处理流程...")

    try:
        # Each stage is a (status banner, callable) pair, executed in order.
        # Lambdas defer name resolution to call time, inside this try block.
        stages = (
            ("📋 生成音频任务...", lambda: _8_1_audio_task.gen_audio_task_main()),
            ("🎬 生成配音切块...", lambda: _8_2_dub_chunks.gen_dub_chunks()),
            ("🎵 提取参考音频...", lambda: _9_refer_audio.extract_refer_audio_main()),
            ("🔊 生成所有音频...", lambda: _10_gen_audio.gen_audio()),
            ("🎶 合并完整音频...", lambda: _11_merge_audio.merge_full_audio()),
            ("🎬 将配音合并到视频...", lambda: _12_dub_to_vid.merge_video_audio()),
        )
        for banner, run_stage in stages:
            print(banner)
            run_stage()

        print("✅ 配音处理完成! 🎇")
        return True

    except Exception as e:
        print(f"❌ 配音处理失败: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    test_dubbing_process()
|
||||||
79
test_fuzzy_matching.py
Normal file
79
test_fuzzy_matching.py
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
测试模糊匹配算法的效果
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# 设置路径
|
||||||
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
sys.path.append(current_dir)
|
||||||
|
|
||||||
|
from core._6_gen_sub import get_sentence_timestamps_fuzzy, remove_punctuation
|
||||||
|
from core.utils.models import _2_CLEANED_CHUNKS, _5_SPLIT_SUB
|
||||||
|
from rich.console import Console
|
||||||
|
|
||||||
|
console = Console()
|
||||||
|
|
||||||
|
def test_fuzzy_matching():
    """Smoke-test the fuzzy sentence/timestamp matching algorithm.

    Loads the cleaned ASR chunks and the split subtitle sentences from disk,
    runs ``get_sentence_timestamps_fuzzy``, and prints per-sentence timings
    plus summary statistics.

    Returns:
        bool: True on success, False on any exception.
    """
    console.print("[bold green]🚀 开始测试模糊匹配算法...[/bold green]")

    try:
        # Load the raw ASR chunk table and the target sentence table.
        console.print("📊 读取数据...")
        df_text = pd.read_excel(_2_CLEANED_CHUNKS)
        df_text['text'] = df_text['text'].str.strip('"').str.strip()

        df_translate = pd.read_excel(_5_SPLIT_SUB)

        console.print(f"📝 原始ASR数据: {len(df_text)} 行")
        console.print(f"📝 待匹配句子: {len(df_translate)} 句")

        # Show a small sample of the ASR rows for manual inspection.
        console.print("\n📋 ASR原始数据示例:")
        for i in range(min(5, len(df_text))):
            row = df_text.iloc[i]
            console.print(f" [{i}] {row['text']} ({row['start']:.2f}s - {row['end']:.2f}s)")

        console.print("\n📋 待匹配句子:")
        for i, row in df_translate.iterrows():
            console.print(f" [{i}] {row['Source']}")

        # Run the matcher under test.
        console.print("\n🔍 开始模糊匹配...")
        time_stamp_list = get_sentence_timestamps_fuzzy(df_text, df_translate)

        # Report per-sentence spans and accumulate the total matched duration.
        console.print("\n🎯 匹配结果:")
        total_duration = 0
        for i, (start, end) in enumerate(time_stamp_list):
            duration = end - start
            total_duration += duration
            console.print(f" 句子 {i}: {start:.2f}s - {end:.2f}s (时长: {duration:.2f}s)")

        console.print(f"\n📊 统计信息:")
        console.print(f" 总句数: {len(time_stamp_list)}")
        console.print(f" 总时长: {total_duration:.2f}s")
        # Guard the average: the original divided unconditionally and crashed
        # with ZeroDivisionError when the matcher returned no sentences,
        # which the broad except then reported as a test failure.
        if time_stamp_list:
            console.print(f" 平均每句时长: {total_duration/len(time_stamp_list):.2f}s")

        # Compare against the source video duration (last ASR end time).
        if len(df_text) > 0:
            video_duration = df_text['end'].max()
            console.print(f" 原视频时长: {video_duration:.2f}s")
            # Same guard for a zero-length video.
            if video_duration > 0:
                console.print(f" 时长利用率: {(total_duration/video_duration*100):.1f}%")

        console.print("[bold green]✅ 测试完成![/bold green]")
        return True

    except Exception as e:
        console.print(f"[bold red]❌ 测试失败: {e}[/bold red]")
        import traceback
        console.print(traceback.format_exc())
        return False


if __name__ == "__main__":
    test_fuzzy_matching()
|
||||||
Loading…
x
Reference in New Issue
Block a user