#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Manual check of the fuzzy sentence-timestamp matching algorithm.

Loads the cleaned ASR chunks and the split subtitle sentences from the
Excel files named by ``_2_CLEANED_CHUNKS`` and ``_5_SPLIT_SUB``, runs
``get_sentence_timestamps_fuzzy`` on them and prints a timing report.
"""

import os
import sys
import traceback

import pandas as pd

# Put this script's directory on the import path. This must happen before
# the `core` imports below (presumably `core` lives next to this file --
# confirm against the repository layout).
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)

from core._6_gen_sub import get_sentence_timestamps_fuzzy, remove_punctuation
from core.utils.models import _2_CLEANED_CHUNKS, _5_SPLIT_SUB
from rich.console import Console

console = Console()

# Number of ASR rows shown as a preview before matching starts.
_PREVIEW_ROWS = 5


def test_fuzzy_matching() -> bool:
    """Run the fuzzy matcher on the stored data and print a timing report.

    Reads both Excel inputs, strips surrounding double quotes and
    whitespace from the ASR ``text`` column, prints a preview of the
    inputs, calls ``get_sentence_timestamps_fuzzy`` and prints the
    per-sentence time ranges followed by summary statistics.

    Returns:
        True when the run completes. False when any exception is raised;
        the error and its traceback are printed instead of propagated.
    """
    console.print("[bold green]🚀 开始测试模糊匹配算法...[/bold green]")

    try:
        # Load the data.
        console.print("📊 读取数据...")
        df_text = pd.read_excel(_2_CLEANED_CHUNKS)
        df_text['text'] = df_text['text'].str.strip('"').str.strip()
        df_translate = pd.read_excel(_5_SPLIT_SUB)

        console.print(f"📝 原始ASR数据: {len(df_text)} 行")
        console.print(f"📝 待匹配句子: {len(df_translate)} 句")

        # Show a few rows of the raw ASR data.
        console.print("\n📋 ASR原始数据示例:")
        for i, (_, row) in enumerate(df_text.head(_PREVIEW_ROWS).iterrows()):
            console.print(f" [{i}] {row['text']} ({row['start']:.2f}s - {row['end']:.2f}s)")

        console.print("\n📋 待匹配句子:")
        for i, row in df_translate.iterrows():
            console.print(f" [{i}] {row['Source']}")

        # Run the fuzzy matching.
        console.print("\n🔍 开始模糊匹配...")
        time_stamp_list = get_sentence_timestamps_fuzzy(df_text, df_translate)

        # Show the results.
        console.print("\n🎯 匹配结果:")
        total_duration = 0
        for i, (start, end) in enumerate(time_stamp_list):
            duration = end - start
            total_duration += duration
            console.print(f" 句子 {i}: {start:.2f}s - {end:.2f}s (时长: {duration:.2f}s)")

        # An empty result must not turn the report into a ZeroDivisionError.
        sentence_count = len(time_stamp_list)
        average_duration = total_duration / sentence_count if sentence_count else 0.0

        console.print("\n📊 统计信息:")
        console.print(f" 总句数: {sentence_count}")
        console.print(f" 总时长: {total_duration:.2f}s")
        console.print(f" 平均每句时长: {average_duration:.2f}s")

        # Compare against the length of the source video, taken as the
        # largest ASR end time.
        if len(df_text) > 0:
            video_duration = df_text['end'].max()
            console.print(f" 原视频时长: {video_duration:.2f}s")
            # Skip the ratio when the duration is zero or NaN; it would
            # otherwise print inf/nan.
            if video_duration > 0:
                console.print(f" 时长利用率: {(total_duration/video_duration*100):.1f}%")

        console.print("[bold green]✅ 测试完成![/bold green]")
        return True

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and signal
        # failure through the return value.
        console.print(f"[bold red]❌ 测试失败: {e}[/bold red]")
        console.print(traceback.format_exc())
        return False


if __name__ == "__main__":
    # Propagate the outcome so shells and CI can detect a failed run.
    sys.exit(0 if test_fuzzy_matching() else 1)