import os import glob from pathlib import Path from pre_data_1 import read_json_file, format_ocr_json, merge_and_filter_subtitles def find_ocr_json_files(base_dir): """ 在指定目录中查找所有OCR JSON文件 Args: base_dir: 基础目录路径 Returns: list: 找到的OCR JSON文件路径列表 """ ocr_files = [] base_path = Path(base_dir) # 查找所有可能的OCR目录 for ocr_dir in base_path.rglob("ocr"): if ocr_dir.is_dir(): # 在ocr目录中查找JSON文件 json_files = list(ocr_dir.glob("*.json")) ocr_files.extend(json_files) # 也查找直接包含"subtitles.json"的文件 subtitle_files = list(base_path.rglob("*subtitles.json")) ocr_files.extend(subtitle_files) # 去重 ocr_files = list(set(ocr_files)) return ocr_files def process_ocr_file(ocr_json_path, iou_threshold=0.7, text_similarity_threshold=0.7): """ 处理单个OCR JSON文件 Args: ocr_json_path: OCR JSON文件路径 iou_threshold: IoU阈值 text_similarity_threshold: 文本相似度阈值 Returns: bool: 处理是否成功 """ try: print(f"\n正在处理文件: {ocr_json_path}") # 读取OCR数据 ocr_data = read_json_file(ocr_json_path) if ocr_data is None: print(f"跳过文件 {ocr_json_path} - 读取失败") return False # 格式化OCR数据 pre_data, subtitle_array = format_ocr_json(ocr_data) if not subtitle_array: print(f"跳过文件 {ocr_json_path} - 没有有效的字幕数据") return False # 合并并过滤字幕 processed_text, processed_array = merge_and_filter_subtitles( subtitle_array, iou_threshold, text_similarity_threshold ) # 保存处理结果 output_dir = Path(ocr_json_path).parent output_filename = Path(ocr_json_path).stem + "_processed.txt" output_path = output_dir / output_filename with open(output_path, 'w', encoding='utf-8') as f: f.write(processed_text) print(f"处理完成: {output_path}") print(f"原始字幕数量: {len(subtitle_array)}") print(f"处理后字幕数量: {len(processed_array)}") return True except Exception as e: print(f"处理文件 {ocr_json_path} 时出错: {str(e)}") return False def main(): """主函数""" base_dir = "/root/autodl-tmp/video_processed2" print(f"开始在目录 {base_dir} 中查找OCR JSON文件...") # 查找所有OCR JSON文件 ocr_files = find_ocr_json_files(base_dir) if not ocr_files: print("未找到任何OCR JSON文件") return print(f"找到 {len(ocr_files)} 个OCR JSON文件:") for i, file_path in enumerate(ocr_files, 1): print(f" {i}. {file_path}") # 处理参数 iou_threshold = 0.7 text_similarity_threshold = 0.7 print(f"\n开始批量处理...") print(f"IoU阈值: {iou_threshold}") print(f"文本相似度阈值: {text_similarity_threshold}") # 批量处理 success_count = 0 failed_count = 0 for i, ocr_file in enumerate(ocr_files, 1): print(f"\n进度: {i}/{len(ocr_files)}") if process_ocr_file(ocr_file, iou_threshold, text_similarity_threshold): success_count += 1 else: failed_count += 1 # 输出统计结果 print(f"\n批量处理完成!") print(f"总文件数: {len(ocr_files)}") print(f"成功处理: {success_count}") print(f"处理失败: {failed_count}") print(f"成功率: {success_count/len(ocr_files)*100:.1f}%") if __name__ == "__main__": main()