video_translation/test_fuzzy_matching.py

80 lines
2.8 KiB
Python
Raw Permalink Normal View History

2025-08-21 18:09:57 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Exercise the fuzzy matching algorithm and report how well it performs.
"""
import os
import sys
import pandas as pd
# Path setup: append this file's directory to sys.path. This has to happen
# before the `core` imports below (presumably so that they resolve when the
# script is launched from another working directory -- confirm).
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
from core._6_gen_sub import get_sentence_timestamps_fuzzy, remove_punctuation
from core.utils.models import _2_CLEANED_CHUNKS, _5_SPLIT_SUB
from rich.console import Console
# Module-level rich console; the code below writes all of its output through it.
console = Console()
def test_fuzzy_matching():
    """Run the fuzzy timestamp matcher on the project's spreadsheets and print a report.

    Reads the spreadsheet at ``_2_CLEANED_CHUNKS`` (uses its ``text``,
    ``start`` and ``end`` columns) and the one at ``_5_SPLIT_SUB`` (uses its
    ``Source`` column), passes both frames to
    ``get_sentence_timestamps_fuzzy`` and prints the returned
    ``(start, end)`` pairs together with summary statistics.

    Returns:
        bool: ``True`` when the run completed, ``False`` when any exception
        was raised. The exception is not propagated; its message and
        traceback are printed to the console instead.
    """
    # Local import: rich is already a dependency of this file (Console).
    # Text taken from the data or from exceptions is escaped before it is
    # embedded in a markup string, otherwise bracketed content such as
    # "[music]" is swallowed as a style tag and "[/x]" raises MarkupError.
    from rich.markup import escape

    console.print("[bold green]🚀 开始测试模糊匹配算法...[/bold green]")
    try:
        # Load the input data.
        console.print("📊 读取数据...")
        df_text = pd.read_excel(_2_CLEANED_CHUNKS)
        df_text['text'] = df_text['text'].str.strip('"').str.strip()
        df_translate = pd.read_excel(_5_SPLIT_SUB)
        console.print(f"📝 原始ASR数据: {len(df_text)}")
        console.print(f"📝 待匹配句子: {len(df_translate)}")

        # Show the first few rows of the ASR data.
        console.print("\n📋 ASR原始数据示例:")
        for i, (_, row) in enumerate(df_text.head(5).iterrows()):
            text = escape(str(row['text']))
            console.print(f" [{i}] {text} ({row['start']:.2f}s - {row['end']:.2f}s)")

        # Show every sentence that has to be matched.
        console.print("\n📋 待匹配句子:")
        for i, row in df_translate.iterrows():
            source = escape(str(row['Source']))
            console.print(f" [{i}] {source}")

        # Run the matcher.
        console.print("\n🔍 开始模糊匹配...")
        time_stamp_list = get_sentence_timestamps_fuzzy(df_text, df_translate)

        # Per-sentence results.
        console.print("\n🎯 匹配结果:")
        total_duration = 0
        for i, (start, end) in enumerate(time_stamp_list):
            duration = end - start
            total_duration += duration
            console.print(f" 句子 {i}: {start:.2f}s - {end:.2f}s (时长: {duration:.2f}s)")

        # Summary statistics. An empty result must not be reported as a
        # ZeroDivisionError failure, so the average falls back to 0.
        sentence_count = len(time_stamp_list)
        average_duration = total_duration / sentence_count if sentence_count else 0.0
        console.print("\n📊 统计信息:")
        console.print(f" 总句数: {sentence_count}")
        console.print(f" 总时长: {total_duration:.2f}s")
        console.print(f" 平均每句时长: {average_duration:.2f}s")

        # Compare against the video length, taken as the largest ASR end time.
        if len(df_text) > 0:
            video_duration = df_text['end'].max()
            console.print(f" 原视频时长: {video_duration:.2f}s")
            # Skip the ratio when the duration is 0 or NaN (NaN > 0 is False).
            if video_duration > 0:
                console.print(f" 时长利用率: {(total_duration/video_duration*100):.1f}%")

        console.print("[bold green]✅ 测试完成![/bold green]")
        return True
    except Exception as e:
        # Top-level boundary of the script: report the failure, do not re-raise.
        console.print(f"[bold red]❌ 测试失败: {escape(str(e))}[/bold red]")
        import traceback
        console.print(escape(traceback.format_exc()))
        return False
if __name__ == "__main__":
    # Propagate the outcome to the caller: exit status 0 when the run
    # succeeded, 1 when test_fuzzy_matching() reported a failure.
    sys.exit(0 if test_fuzzy_matching() else 1)