From b5f2e16eef3938366ae3ce8ab88e34e97af2ac1e Mon Sep 17 00:00:00 2001 From: root <374191531@qq.com> Date: Tue, 15 Jul 2025 14:33:05 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=A2=E5=A4=B1=E7=9A=84batch=5Fpredata?= =?UTF-8?q?=E5=B7=B2=E9=87=8D=E5=86=99=EF=BC=9B=E4=BF=AE=E5=A4=8Dapi=5Fvid?= =?UTF-8?q?eo=E4=B8=AD=EF=BC=8C=E4=B8=8A=E4=BC=A0api=E8=A7=86=E9=A2=91?= =?UTF-8?q?=E9=99=90=E5=88=B6=E4=B8=BA8mb=EF=BC=9B=E5=B7=B2=E6=AD=A3?= =?UTF-8?q?=E7=A1=AE=E4=BF=AE=E6=94=B9=E4=BF=9D=E5=AD=98=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E7=9A=84=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- code/api_video.py | 166 +++++++++++++++++++++++++++++----------- code/batch_api_video.py | 26 +++++-- code/batch_predata.py | 134 ++++++++++++++++++++++++++++++++ code/save_usage_info.py | 24 +++--- 4 files changed, 288 insertions(+), 62 deletions(-) create mode 100644 code/batch_predata.py diff --git a/code/api_video.py b/code/api_video.py index b4d8959..18dc872 100644 --- a/code/api_video.py +++ b/code/api_video.py @@ -2,13 +2,121 @@ from openai import OpenAI import os import base64 import time +import subprocess from datetime import datetime +from pathlib import Path from save_usage_info import save_usage_info_to_txt, save_simple_usage_info +def check_video_size(video_path, max_size_mb=7): + """ + 检查视频文件大小 + + Args: + video_path: 视频文件路径 + max_size_mb: 最大允许大小(MB) + + Returns: + bool: 是否在限制范围内 + """ + if not os.path.exists(video_path): + print(f"视频文件不存在: {video_path}") + return False + + file_size = os.path.getsize(video_path) + size_mb = file_size / 1024 / 1024 + + print(f"视频文件大小: {size_mb:.2f}MB (限制: {max_size_mb}MB)") + + return size_mb <= max_size_mb + +def compress_video_auto(video_path, target_size_mb=7): + """ + 自动压缩视频文件 + + Args: + video_path: 视频文件路径 + target_size_mb: 目标文件大小(MB) + + Returns: + str: 压缩后的视频路径,如果失败返回原路径 + """ + try: + # 检查是否需要压缩 + if check_video_size(video_path, target_size_mb): + print("视频文件大小符合要求,无需压缩") + return video_path + + print(f"视频文件过大,开始自动压缩...") + + # 生成压缩后的文件路径 + video_path_obj = Path(video_path) + compressed_path = video_path_obj.parent / f"{video_path_obj.stem}_compressed{video_path_obj.suffix}" + + # 尝试不同的质量设置 + quality_levels = [23, 25, 28, 30, 32] + + for quality in quality_levels: + print(f"尝试压缩质量: CRF={quality}") + + # FFmpeg压缩命令 + cmd = [ + 'ffmpeg', '-i', str(video_path), + '-c:v', 'libx264', + '-crf', str(quality), + '-preset', 'medium', + '-c:a', 'aac', + '-b:a', '128k', + '-movflags', '+faststart', + '-y', + str(compressed_path) + ] + + # 执行压缩 + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode == 0 and os.path.exists(compressed_path): + # 检查压缩后的文件大小 + compressed_size = os.path.getsize(compressed_path) / 1024 / 1024 + print(f"压缩完成! 文件大小: {compressed_size:.2f}MB") + + if compressed_size <= target_size_mb: + print("✅ 压缩成功,文件大小符合要求") + return str(compressed_path) + else: + print(f"⚠️ 文件仍然过大,尝试更高压缩率") + os.remove(compressed_path) # 删除失败的文件 + else: + print(f"压缩失败: {result.stderr}") + if os.path.exists(compressed_path): + os.remove(compressed_path) + + print("❌ 所有压缩级别都无法达到目标大小") + return video_path + + except Exception as e: + print(f"压缩视频时出错: {e}") + return video_path + # Base64 编码格式 def encode_video(video_path): - with open(video_path, "rb") as video_file: - return base64.b64encode(video_file.read()).decode("utf-8") + """编码视频文件,如果太大会自动压缩""" + try: + # 检查并自动压缩视频 + processed_video_path = compress_video_auto(video_path) + + # 编码视频 + with open(processed_video_path, "rb") as video_file: + encoded_data = base64.b64encode(video_file.read()).decode("utf-8") + + # 如果使用了压缩文件,显示信息 + if processed_video_path != video_path: + print(f"使用压缩后的视频文件: {processed_video_path}") + + return encoded_data + + except Exception as e: + print(f"编码视频时出错: {e}") + return None def encode_audio(audio_path): with open(audio_path, "rb") as audio_file: @@ -197,54 +305,24 @@ def format_clip_json(clip_data): return formatted_text -def save_result_to_txt(response_text, video_path, video_dir, save_dir="/root/autodl-tmp/video_llm"): + + +def save_result_to_txt(response_text, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=None): """将分析结果保存为TXT文件""" - # 创建保存目录 - os.makedirs(save_dir, exist_ok=True) - - # 生成文件名(基于视频文件名和时间戳) - video_name = os.path.splitext(os.path.basename(video_path))[0] - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - txt_filename = f"{video_dir}_{timestamp}.txt" - txt_dir = os.path.join(save_dir, "Template", video_name) + # 创建保存目录 - 每次运行创建新的时间戳文件夹 + video_name = os.path.splitext(os.path.basename(base_dir))[0] + if run_timestamp is None: + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + txt_dir = os.path.join(save_dir, "Template",video_name ,run_timestamp) os.makedirs(txt_dir, exist_ok=True) + + # 生成文件名(只使用片段名,不包含时间戳) + txt_filename = f"{video_dir}.txt" txt_path = os.path.join(txt_dir, txt_filename) # 准备保存内容(添加头部信息) content = f"""视频分析结果 ===================================== -视频文件: {video_path} -分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} -===================================== - -{response_text} -""" - - # 保存到文件 - try: - with open(txt_path, 'w', encoding='utf-8') as f: - f.write(content) - print(f"\n✅ 分析结果已保存到: {txt_path}") - return txt_path - except Exception as e: - print(f"\n❌ 保存TXT文件失败: {e}") - return None - -def save_result_to_txt(response_text, video_path, video_dir, save_dir="/root/autodl-tmp/video_llm"): - """将分析结果保存为TXT文件""" - # 创建保存目录 - os.makedirs(save_dir, exist_ok=True) - - # 生成文件名(基于视频文件名和时间戳) - video_name = os.path.splitext(os.path.basename(video_path))[0] - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - txt_filename = f"{video_dir}_{timestamp}.txt" - txt_dir = os.path.join(save_dir, "Template", video_name) - os.makedirs(txt_dir, exist_ok=True) - txt_path = os.path.join(txt_dir, txt_filename) - # 准备保存内容(添加头部信息) - content = f"""视频分析结果 -===================================== -视频文件: {video_path} +视频文件: {video_dir} 分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===================================== diff --git a/code/batch_api_video.py b/code/batch_api_video.py index 366ed70..c9f59e2 100644 --- a/code/batch_api_video.py +++ b/code/batch_api_video.py @@ -21,6 +21,8 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %( logger = logging.getLogger(__name__) STREAM_MODE = True +# 全局运行时间戳,确保所有片段保存在同一个文件夹中 +RUN_TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S") def find_video_dirs(video_processed_dir): """查找所有包含audio_split的目录""" @@ -33,20 +35,29 @@ def find_video_dirs(video_processed_dir): return video_dirs -a = "/root/autodl-tmp/video_processed/成都/video_split/" +a = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施,千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐/video_split/" video_dirs = find_video_dirs(a) print(video_dirs[0]) -for i ,video_dir in enumerate(video_dirs): - print(i, video_dir) +print(f"开始批量处理,运行时间戳: {RUN_TIMESTAMP}") +print(f"找到 {len(video_dirs)} 个视频片段") - base_dir = "/root/autodl-tmp/video_processed/成都" +for i ,video_dir in enumerate(video_dirs): + print(f"\n处理第 {i+1}/{len(video_dirs)} 个片段: {video_dir}") + + base_dir = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施,千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐" video_path = base_dir + "/video_split/" + video_dir + ".mp4" ocr_txt_path = base_dir + "/ocr/" + video_dir + "_subtitles_processed.txt" whisper_json_path = base_dir +"/whisper/" + video_dir + "_transcript.json" base64_video = encode_video(video_path) + + # 检查视频编码是否成功 + if base64_video is None: + print(f"错误: 无法编码视频文件 {video_path}") + print("请检查视频文件是否存在") + continue whisper_data = read_json_file(whisper_json_path) whisper_content = format_whisper_json(whisper_data) @@ -177,7 +188,8 @@ for i ,video_dir in enumerate(video_dirs): ], stream=STREAM_MODE, stream_options={"include_usage": True} if STREAM_MODE else None, - temperature=0.5 + temperature=0.4, + top_p = 0.3 ) if STREAM_MODE: @@ -223,11 +235,11 @@ for i ,video_dir in enumerate(video_dirs): print(full_response) # 保存结果为TXT文件 - txt_file_path = save_result_to_txt(full_response, base_dir, video_dir) + txt_file_path = save_result_to_txt(full_response, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP) # 保存结果为JSON文件 json_file_path = save_result_to_json(full_response, base_dir, video_dir) # 保存使用情况信息 - usage_info_txt = save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir) + usage_info_txt = save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP) # 输出使用情况信息 if usage_info: diff --git a/code/batch_predata.py b/code/batch_predata.py new file mode 100644 index 0000000..7010f8b --- /dev/null +++ b/code/batch_predata.py @@ -0,0 +1,134 @@ +import os +import glob +from pathlib import Path +from pre_data_1 import read_json_file, format_ocr_json, merge_and_filter_subtitles + +def find_ocr_json_files(base_dir): + """ + 在指定目录中查找所有OCR JSON文件 + + Args: + base_dir: 基础目录路径 + + Returns: + list: 找到的OCR JSON文件路径列表 + """ + ocr_files = [] + base_path = Path(base_dir) + + # 查找所有可能的OCR目录 + for ocr_dir in base_path.rglob("ocr"): + if ocr_dir.is_dir(): + # 在ocr目录中查找JSON文件 + json_files = list(ocr_dir.glob("*.json")) + ocr_files.extend(json_files) + + # 也查找直接包含"subtitles.json"的文件 + subtitle_files = list(base_path.rglob("*subtitles.json")) + ocr_files.extend(subtitle_files) + + # 去重 + ocr_files = list(set(ocr_files)) + + return ocr_files + +def process_ocr_file(ocr_json_path, iou_threshold=0.7, text_similarity_threshold=0.7): + """ + 处理单个OCR JSON文件 + + Args: + ocr_json_path: OCR JSON文件路径 + iou_threshold: IoU阈值 + text_similarity_threshold: 文本相似度阈值 + + Returns: + bool: 处理是否成功 + """ + try: + print(f"\n正在处理文件: {ocr_json_path}") + + # 读取OCR数据 + ocr_data = read_json_file(ocr_json_path) + if ocr_data is None: + print(f"跳过文件 {ocr_json_path} - 读取失败") + return False + + # 格式化OCR数据 + pre_data, subtitle_array = format_ocr_json(ocr_data) + + if not subtitle_array: + print(f"跳过文件 {ocr_json_path} - 没有有效的字幕数据") + return False + + # 合并并过滤字幕 + processed_text, processed_array = merge_and_filter_subtitles( + subtitle_array, + iou_threshold, + text_similarity_threshold + ) + + # 保存处理结果 + output_dir = Path(ocr_json_path).parent + output_filename = Path(ocr_json_path).stem + "_processed.txt" + output_path = output_dir / output_filename + + with open(output_path, 'w', encoding='utf-8') as f: + f.write(processed_text) + + print(f"处理完成: {output_path}") + print(f"原始字幕数量: {len(subtitle_array)}") + print(f"处理后字幕数量: {len(processed_array)}") + + return True + + except Exception as e: + print(f"处理文件 {ocr_json_path} 时出错: {str(e)}") + return False + +def main(): + """主函数""" + base_dir = "/root/autodl-tmp/video_processed2" + + print(f"开始在目录 {base_dir} 中查找OCR JSON文件...") + + # 查找所有OCR JSON文件 + ocr_files = find_ocr_json_files(base_dir) + + if not ocr_files: + print("未找到任何OCR JSON文件") + return + + print(f"找到 {len(ocr_files)} 个OCR JSON文件:") + for i, file_path in enumerate(ocr_files, 1): + print(f" {i}. {file_path}") + + # 处理参数 + iou_threshold = 0.7 + text_similarity_threshold = 0.7 + + print(f"\n开始批量处理...") + print(f"IoU阈值: {iou_threshold}") + print(f"文本相似度阈值: {text_similarity_threshold}") + + # 批量处理 + success_count = 0 + failed_count = 0 + + for i, ocr_file in enumerate(ocr_files, 1): + print(f"\n进度: {i}/{len(ocr_files)}") + + if process_ocr_file(ocr_file, iou_threshold, text_similarity_threshold): + success_count += 1 + else: + failed_count += 1 + + # 输出统计结果 + print(f"\n批量处理完成!") + print(f"总文件数: {len(ocr_files)}") + print(f"成功处理: {success_count}") + print(f"处理失败: {failed_count}") + print(f"成功率: {success_count/len(ocr_files)*100:.1f}%") + +if __name__ == "__main__": + main() + diff --git a/code/save_usage_info.py b/code/save_usage_info.py index cd91463..59ca950 100644 --- a/code/save_usage_info.py +++ b/code/save_usage_info.py @@ -1,31 +1,33 @@ import os from datetime import datetime -def save_usage_info_to_txt(usage_info, total_duration, money, video_path, video_dir, save_dir="/root/autodl-tmp/video_llm"): +def save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=None): """ 保存API使用情况信息到TXT文件 :param usage_info: API使用情况对象 :param total_duration: API总响应时间 :param money: 费用信息字典 :param video_path: 原视频文件路径 + :param video_dir: 视频目录名 :param save_dir: 保存目录 + :param run_timestamp: 运行时间戳,用于创建统一的文件夹 :return: 保存的文件路径 """ - # 创建保存目录 - os.makedirs(save_dir, exist_ok=True) - - # 生成文件名(基于视频文件名和时间戳) - video_name = os.path.splitext(os.path.basename(video_path))[0] - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - txt_filename = f"tokens_{video_dir}_{timestamp}.txt" - txt_dir = os.path.join(save_dir, "cost", video_name) + # 创建保存目录 - 每次运行创建新的时间戳文件夹 + video_name = os.path.splitext(os.path.basename(base_dir))[0] + if run_timestamp is None: + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + txt_dir = os.path.join(save_dir, "cost", video_name, run_timestamp) os.makedirs(txt_dir, exist_ok=True) + + # 生成文件名(只使用片段名,不包含时间戳) + txt_filename = f"tokens_{video_dir}.txt" txt_path = os.path.join(txt_dir, txt_filename) # 格式化使用情况信息 usage_content = f"""API使用情况统计 ===================================== -视频文件: {video_path} +视频文件: {video_dir} 统计时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===================================== @@ -118,7 +120,7 @@ if __name__ == "__main__": test_video_path = "/root/autodl-tmp/video/test.mp4" # 测试详细版本 - save_usage_info_to_txt(test_usage_info, test_duration, test_money, test_video_path) + save_usage_info_to_txt(test_usage_info, test_duration, test_money, test_video_path, "test_segment") # 测试简化版本 save_simple_usage_info(test_usage_info, test_duration, test_money, test_video_path) \ No newline at end of file