80 lines
2.8 KiB
Python
80 lines
2.8 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
测试模糊匹配算法的效果
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import pandas as pd
|
|
|
|
# Path setup: put the directory that contains this script on sys.path
# (presumably so the project-local ``core`` package imported below resolves
# when the script is launched directly -- confirm against the repo layout).
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)

# NOTE: these imports intentionally come after the sys.path tweak above.
from core._6_gen_sub import get_sentence_timestamps_fuzzy, remove_punctuation
from core.utils.models import _2_CLEANED_CHUNKS, _5_SPLIT_SUB
from rich.console import Console

# Single rich console shared by everything this script prints.
console = Console()
|
|
|
|
def test_fuzzy_matching():
    """Run the fuzzy timestamp matcher on the project data and print a report.

    Reads the ASR chunk table from ``_2_CLEANED_CHUNKS`` and the sentence
    table from ``_5_SPLIT_SUB`` (both with ``pandas.read_excel``), passes them
    to ``get_sentence_timestamps_fuzzy`` and prints sample inputs, the
    per-sentence ``(start, end)`` pairs and summary statistics to the console.

    The ASR table is read through its ``text``, ``start`` and ``end`` columns;
    the sentence table through its ``Source`` column.

    Returns:
        bool: ``True`` when the whole run completed, ``False`` when any
        exception was raised. Exceptions are not propagated; the message and
        traceback are printed to the console instead.
    """
    console.print("[bold green]🚀 开始测试模糊匹配算法...[/bold green]")

    try:
        # Load the input tables.
        console.print("📊 读取数据...")
        df_text = pd.read_excel(_2_CLEANED_CHUNKS)
        # Drop surrounding double quotes first, then surrounding whitespace.
        df_text['text'] = df_text['text'].str.strip('"').str.strip()

        df_translate = pd.read_excel(_5_SPLIT_SUB)

        console.print(f"📝 原始ASR数据: {len(df_text)} 行")
        console.print(f"📝 待匹配句子: {len(df_translate)} 句")

        # Show the first few ASR rows (at most five), numbered by position.
        console.print("\n📋 ASR原始数据示例:")
        for i, (_, row) in enumerate(df_text.head(5).iterrows()):
            console.print(f" [{i}] {row['text']} ({row['start']:.2f}s - {row['end']:.2f}s)")

        console.print("\n📋 待匹配句子:")
        for i, row in df_translate.iterrows():
            console.print(f" [{i}] {row['Source']}")

        # Run the fuzzy matcher.
        console.print("\n🔍 开始模糊匹配...")
        time_stamp_list = get_sentence_timestamps_fuzzy(df_text, df_translate)

        # Print each matched span and accumulate the total matched duration.
        console.print("\n🎯 匹配结果:")
        total_duration = 0
        for i, (start, end) in enumerate(time_stamp_list):
            duration = end - start
            total_duration += duration
            console.print(f" 句子 {i}: {start:.2f}s - {end:.2f}s (时长: {duration:.2f}s)")

        sentence_count = len(time_stamp_list)
        # An empty match list must not abort the report with ZeroDivisionError.
        average_duration = total_duration / sentence_count if sentence_count else 0.0

        console.print("\n📊 统计信息:")
        console.print(f" 总句数: {sentence_count}")
        console.print(f" 总时长: {total_duration:.2f}s")
        console.print(f" 平均每句时长: {average_duration:.2f}s")

        # Compare against the largest ASR end time, used here as the video length.
        if len(df_text) > 0:
            video_duration = df_text['end'].max()
            console.print(f" 原视频时长: {video_duration:.2f}s")
            # Skip the ratio for a zero or NaN duration (NaN > 0 is False);
            # dividing would otherwise print inf% / nan%.
            if video_duration > 0:
                console.print(f" 时长利用率: {(total_duration/video_duration*100):.1f}%")

        console.print("[bold green]✅ 测试完成![/bold green]")
        return True

    except Exception as e:
        # Top-level boundary of the script: report the failure with its
        # traceback and signal it through the return value.
        console.print(f"[bold red]❌ 测试失败: {e}[/bold red]")
        import traceback
        console.print(traceback.format_exc())
        return False
|
|
|
|
if __name__ == "__main__":
    # Propagate the outcome as the process exit status (0 = success,
    # 1 = failure) so a shell or CI job can detect a failed run instead of
    # always seeing exit code 0.
    raise SystemExit(0 if test_fuzzy_matching() else 1)
|