From b5f2e16eef3938366ae3ce8ab88e34e97af2ac1e Mon Sep 17 00:00:00 2001
From: root <374191531@qq.com>
Date: Tue, 15 Jul 2025 14:33:05 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=A2=E5=A4=B1=E7=9A=84batch=5Fpredata?=
 =?UTF-8?q?=E5=B7=B2=E9=87=8D=E5=86=99=EF=BC=9B=E4=BF=AE=E5=A4=8Dapi=5Fvid?=
 =?UTF-8?q?eo=E4=B8=AD=EF=BC=8C=E4=B8=8A=E4=BC=A0api=E8=A7=86=E9=A2=91?=
 =?UTF-8?q?=E9=99=90=E5=88=B6=E4=B8=BA8mb=EF=BC=9B=E5=B7=B2=E6=AD=A3?=
 =?UTF-8?q?=E7=A1=AE=E4=BF=AE=E6=94=B9=E4=BF=9D=E5=AD=98=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E7=9A=84=E8=B7=AF=E5=BE=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 code/api_video.py       | 166 +++++++++++++++++++++++++++++-----------
 code/batch_api_video.py |  26 +++++--
 code/batch_predata.py   | 134 ++++++++++++++++++++++++++++++++
 code/save_usage_info.py |  24 +++---
 4 files changed, 288 insertions(+), 62 deletions(-)
 create mode 100644 code/batch_predata.py

diff --git a/code/api_video.py b/code/api_video.py
index b4d8959..18dc872 100644
--- a/code/api_video.py
+++ b/code/api_video.py
@@ -2,13 +2,121 @@ from openai import OpenAI
 import os
 import base64
 import time
+import subprocess
 from datetime import datetime
+from pathlib import Path
 from save_usage_info import save_usage_info_to_txt, save_simple_usage_info
 
+def check_video_size(video_path, max_size_mb=7):
+    """
+    Check whether a video file is within the size limit.
+    
+    Args:
+        video_path: Path to the video file.
+        max_size_mb: Maximum allowed size in MB.
+    
+    Returns:
+        bool: True if the file exists and is no larger than max_size_mb, otherwise False.
+    """
+    if not os.path.exists(video_path):
+        print(f"视频文件不存在: {video_path}")
+        return False
+    
+    file_size = os.path.getsize(video_path)
+    size_mb = file_size / 1024 / 1024
+    
+    print(f"视频文件大小: {size_mb:.2f}MB (限制: {max_size_mb}MB)")
+    
+    return size_mb <= max_size_mb
+
+def compress_video_auto(video_path, target_size_mb=7):
+    """
+    Compress a video with ffmpeg when it exceeds the target size.
+    
+    Args:
+        video_path: Path to the video file.
+        target_size_mb: Target file size in MB.
+    
+    Returns:
+        str: Path of the compressed video, or the original video_path when no compression is needed or every attempt fails.
+    """
+    try:
+        # Skip compression when the file already fits; check_video_size also returns False for a missing file, which then falls through to ffmpeg.
+        if check_video_size(video_path, target_size_mb):
+            print("视频文件大小符合要求，无需压缩")
+            return video_path
+        
+        print(f"视频文件过大，开始自动压缩...")
+        
+        # Output path: "<stem>_compressed<suffix>" next to the source file.
+        video_path_obj = Path(video_path)
+        compressed_path = video_path_obj.parent / f"{video_path_obj.stem}_compressed{video_path_obj.suffix}"
+        
+        # CRF values to try in order; a higher CRF means stronger compression.
+        quality_levels = [23, 25, 28, 30, 32]
+        
+        for quality in quality_levels:
+            print(f"尝试压缩质量: CRF={quality}")
+            
+            # ffmpeg command: re-encode the original file with libx264 video and AAC audio; -y overwrites the output.
+            cmd = [
+                'ffmpeg', '-i', str(video_path),
+                '-c:v', 'libx264',
+                '-crf', str(quality),
+                '-preset', 'medium',
+                '-c:a', 'aac',
+                '-b:a', '128k',
+                '-movflags', '+faststart',
+                '-y',
+                str(compressed_path)
+            ]
+            
+            # Run ffmpeg, capturing its output as text so stderr can be reported on failure.
+            result = subprocess.run(cmd, capture_output=True, text=True)
+            
+            if result.returncode == 0 and os.path.exists(compressed_path):
+                # Measure the compressed file in MB.
+                compressed_size = os.path.getsize(compressed_path) / 1024 / 1024
+                print(f"压缩完成! 文件大小: {compressed_size:.2f}MB")
+                
+                if compressed_size <= target_size_mb:
+                    print("✅ 压缩成功，文件大小符合要求")
+                    return str(compressed_path)
+                else:
+                    print(f"⚠️ 文件仍然过大，尝试更高压缩率")
+                    os.remove(compressed_path)  # discard the oversized attempt
+            else:
+                print(f"压缩失败: {result.stderr}")
+                if os.path.exists(compressed_path):
+                    os.remove(compressed_path)
+        
+        print("❌ 所有压缩级别都无法达到目标大小")
+        return video_path
+        
+    except Exception as e:
+        print(f"压缩视频时出错: {e}")
+        return video_path
+
 #  Base64 编码格式
 def encode_video(video_path):
-    with open(video_path, "rb") as video_file:
-        return base64.b64encode(video_file.read()).decode("utf-8")
+    """Base64-encode a video file, compressing it first if it is too large; returns None on error."""
+    try:
+        # Check the size and compress automatically when needed.
+        processed_video_path = compress_video_auto(video_path)
+        
+        # Base64-encode the file. NOTE(review): base64 output is ~4/3 of the file size — confirm the 7MB default leaves headroom for the API upload limit.
+        with open(processed_video_path, "rb") as video_file:
+            encoded_data = base64.b64encode(video_file.read()).decode("utf-8")
+        
+        # Report when a compressed copy was used instead of the original.
+        if processed_video_path != video_path:
+            print(f"使用压缩后的视频文件: {processed_video_path}")
+        
+        return encoded_data
+        
+    except Exception as e:
+        print(f"编码视频时出错: {e}")
+        return None
     
 def encode_audio(audio_path):
     with open(audio_path, "rb") as audio_file:
@@ -197,54 +305,24 @@ def format_clip_json(clip_data):
     
     return formatted_text
 
-def save_result_to_txt(response_text, video_path, video_dir, save_dir="/root/autodl-tmp/video_llm"):
+
+
+def save_result_to_txt(response_text, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=None):
     """将分析结果保存为TXT文件"""
-    # 创建保存目录
-    os.makedirs(save_dir, exist_ok=True)
-    
-    # 生成文件名（基于视频文件名和时间戳）
-    video_name = os.path.splitext(os.path.basename(video_path))[0]
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    txt_filename = f"{video_dir}_{timestamp}.txt"
-    txt_dir = os.path.join(save_dir, "Template", video_name)
+    # 创建保存目录 - 每次运行创建新的时间戳文件夹
+    video_name = os.path.splitext(os.path.basename(base_dir))[0]
+    if run_timestamp is None:
+        run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    txt_dir = os.path.join(save_dir, "Template",video_name ,run_timestamp)
     os.makedirs(txt_dir, exist_ok=True)
+    
+    # 生成文件名（只使用片段名，不包含时间戳）
+    txt_filename = f"{video_dir}.txt"
     txt_path = os.path.join(txt_dir, txt_filename)
     # 准备保存内容（添加头部信息）
     content = f"""视频分析结果
 =====================================
-视频文件: {video_path}
-分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
-=====================================
-
-{response_text}
-"""
-    
-    # 保存到文件
-    try:
-        with open(txt_path, 'w', encoding='utf-8') as f:
-            f.write(content)
-        print(f"\n✅ 分析结果已保存到: {txt_path}")
-        return txt_path
-    except Exception as e:
-        print(f"\n❌ 保存TXT文件失败: {e}")
-        return None
-
-def save_result_to_txt(response_text, video_path, video_dir, save_dir="/root/autodl-tmp/video_llm"):
-    """将分析结果保存为TXT文件"""
-    # 创建保存目录
-    os.makedirs(save_dir, exist_ok=True)
-    
-    # 生成文件名（基于视频文件名和时间戳）
-    video_name = os.path.splitext(os.path.basename(video_path))[0]
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    txt_filename = f"{video_dir}_{timestamp}.txt"
-    txt_dir = os.path.join(save_dir, "Template", video_name)
-    os.makedirs(txt_dir, exist_ok=True)
-    txt_path = os.path.join(txt_dir, txt_filename)
-    # 准备保存内容（添加头部信息）
-    content = f"""视频分析结果
-=====================================
-视频文件: {video_path}
+视频文件: {video_dir}
 分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
 =====================================
 
diff --git a/code/batch_api_video.py b/code/batch_api_video.py
index 366ed70..c9f59e2 100644
--- a/code/batch_api_video.py
+++ b/code/batch_api_video.py
@@ -21,6 +21,8 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
 logger = logging.getLogger(__name__)
 
 STREAM_MODE = True
+# Run-wide timestamp, computed once at module load so every segment of this run is saved under the same folder
+RUN_TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
 
 def find_video_dirs(video_processed_dir):
     """查找所有包含audio_split的目录"""
@@ -33,20 +35,29 @@ def find_video_dirs(video_processed_dir):
     return video_dirs
 
 
-a = "/root/autodl-tmp/video_processed/成都/video_split/"
+a = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施，千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐/video_split/"
 video_dirs = find_video_dirs(a)
 
 print(video_dirs[0])
-for i ,video_dir in enumerate(video_dirs):
-    print(i, video_dir)
+print(f"开始批量处理，运行时间戳: {RUN_TIMESTAMP}")
+print(f"找到 {len(video_dirs)} 个视频片段")
 
-    base_dir = "/root/autodl-tmp/video_processed/成都"
+for i ,video_dir in enumerate(video_dirs):
+    print(f"\n处理第 {i+1}/{len(video_dirs)} 个片段: {video_dir}")
+
+    base_dir = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施，千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐"
 
     video_path = base_dir + "/video_split/" + video_dir + ".mp4"
     ocr_txt_path = base_dir + "/ocr/" + video_dir + "_subtitles_processed.txt"
     whisper_json_path = base_dir +"/whisper/" + video_dir + "_transcript.json"
 
     base64_video = encode_video(video_path)
+    
+    # 检查视频编码是否成功
+    if base64_video is None:
+        print(f"错误: 无法编码视频文件 {video_path}")
+        print("请检查视频文件是否存在")
+        continue
 
     whisper_data = read_json_file(whisper_json_path)
     whisper_content = format_whisper_json(whisper_data)
@@ -177,7 +188,8 @@ for i ,video_dir in enumerate(video_dirs):
         ],
         stream=STREAM_MODE,
         stream_options={"include_usage": True} if STREAM_MODE else None,
-        temperature=0.5
+        temperature=0.4,
+        top_p = 0.3
     )
 
     if STREAM_MODE:
@@ -223,11 +235,11 @@ for i ,video_dir in enumerate(video_dirs):
         print(full_response)
 
         # 保存结果为TXT文件
-        txt_file_path = save_result_to_txt(full_response, base_dir, video_dir)
+        txt_file_path = save_result_to_txt(full_response, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP)
         # 保存结果为JSON文件
         json_file_path = save_result_to_json(full_response, base_dir, video_dir)
         # 保存使用情况信息
-        usage_info_txt = save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir)
+        usage_info_txt = save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP)
 
         # 输出使用情况信息
         if usage_info:
diff --git a/code/batch_predata.py b/code/batch_predata.py
new file mode 100644
index 0000000..7010f8b
--- /dev/null
+++ b/code/batch_predata.py
@@ -0,0 +1,134 @@
+import os
+import glob
+from pathlib import Path
+from pre_data_1 import read_json_file, format_ocr_json, merge_and_filter_subtitles
+
+def find_ocr_json_files(base_dir):
+    """
+    Find all OCR JSON files under the given directory.
+    
+    Args:
+        base_dir: Base directory to search recursively.
+        
+    Returns:
+        list: De-duplicated OCR JSON file paths (Path objects) in sorted order.
+    """
+    ocr_files = []
+    base_path = Path(base_dir)
+    
+    # Collect the JSON files directly inside every directory named "ocr".
+    for ocr_dir in base_path.rglob("ocr"):
+        if ocr_dir.is_dir():
+            # Only *.json files at the top level of the ocr directory.
+            json_files = list(ocr_dir.glob("*.json"))
+            ocr_files.extend(json_files)
+    
+    # Also pick up any file whose name ends with "subtitles.json", wherever it lives.
+    subtitle_files = list(base_path.rglob("*subtitles.json"))
+    ocr_files.extend(subtitle_files)
+    
+    # De-duplicate, then sort: a bare set would yield a different order on every run.
+    ocr_files = sorted(set(ocr_files))
+    
+    return ocr_files
+
+def process_ocr_file(ocr_json_path, iou_threshold=0.7, text_similarity_threshold=0.7):
+    """
+    Process a single OCR JSON file and write the merged subtitles next to it.
+    
+    Args:
+        ocr_json_path: Path to the OCR JSON file.
+        iou_threshold: IoU threshold passed to merge_and_filter_subtitles.
+        text_similarity_threshold: Text-similarity threshold passed to merge_and_filter_subtitles.
+        
+    Returns:
+        bool: True if the processed file was written; False if the file was skipped or an exception occurred.
+    """
+    try:
+        print(f"\n正在处理文件: {ocr_json_path}")
+        
+        # Load the OCR data; a None result is treated as a read failure.
+        ocr_data = read_json_file(ocr_json_path)
+        if ocr_data is None:
+            print(f"跳过文件 {ocr_json_path} - 读取失败")
+            return False
+        
+        # Format the OCR data (pre_data is not used further in this function).
+        pre_data, subtitle_array = format_ocr_json(ocr_data)
+        
+        if not subtitle_array:
+            print(f"跳过文件 {ocr_json_path} - 没有有效的字幕数据")
+            return False
+        
+        # Merge and filter the subtitles.
+        processed_text, processed_array = merge_and_filter_subtitles(
+            subtitle_array, 
+            iou_threshold, 
+            text_similarity_threshold
+        )
+        
+        # Save the result as "<json stem>_processed.txt" in the same directory as the input.
+        output_dir = Path(ocr_json_path).parent
+        output_filename = Path(ocr_json_path).stem + "_processed.txt"
+        output_path = output_dir / output_filename
+        
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write(processed_text)
+        
+        print(f"处理完成: {output_path}")
+        print(f"原始字幕数量: {len(subtitle_array)}")
+        print(f"处理后字幕数量: {len(processed_array)}")
+        
+        return True
+        
+    except Exception as e:
+        print(f"处理文件 {ocr_json_path} 时出错: {str(e)}")
+        return False
+
+def main():
+    """Entry point: find the OCR JSON files under the hard-coded base directory, process each, and print a summary."""
+    base_dir = "/root/autodl-tmp/video_processed2"
+    
+    print(f"开始在目录 {base_dir} 中查找OCR JSON文件...")
+    
+    # Find all OCR JSON files.
+    ocr_files = find_ocr_json_files(base_dir)
+    
+    if not ocr_files:
+        print("未找到任何OCR JSON文件")
+        return
+    
+    print(f"找到 {len(ocr_files)} 个OCR JSON文件:")
+    for i, file_path in enumerate(ocr_files, 1):
+        print(f"  {i}. {file_path}")
+    
+    # Processing parameters.
+    iou_threshold = 0.7
+    text_similarity_threshold = 0.7
+    
+    print(f"\n开始批量处理...")
+    print(f"IoU阈值: {iou_threshold}")
+    print(f"文本相似度阈值: {text_similarity_threshold}")
+    
+    # Process every file, counting successes and failures.
+    success_count = 0
+    failed_count = 0
+    
+    for i, ocr_file in enumerate(ocr_files, 1):
+        print(f"\n进度: {i}/{len(ocr_files)}")
+        
+        if process_ocr_file(ocr_file, iou_threshold, text_similarity_threshold):
+            success_count += 1
+        else:
+            failed_count += 1
+    
+    # Print the summary statistics (ocr_files is non-empty here because of the early return above).
+    print(f"\n批量处理完成!")
+    print(f"总文件数: {len(ocr_files)}")
+    print(f"成功处理: {success_count}")
+    print(f"处理失败: {failed_count}")
+    print(f"成功率: {success_count/len(ocr_files)*100:.1f}%")
+
+if __name__ == "__main__":
+    main()
+
diff --git a/code/save_usage_info.py b/code/save_usage_info.py
index cd91463..59ca950 100644
--- a/code/save_usage_info.py
+++ b/code/save_usage_info.py
@@ -1,31 +1,33 @@
 import os
 from datetime import datetime
 
-def save_usage_info_to_txt(usage_info, total_duration, money, video_path, video_dir, save_dir="/root/autodl-tmp/video_llm"):
+def save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=None):
     """
     保存API使用情况信息到TXT文件
     :param usage_info: API使用情况对象
     :param total_duration: API总响应时间
     :param money: 费用信息字典
     :param video_path: 原视频文件路径
+    :param video_dir: 视频目录名
     :param save_dir: 保存目录
+    :param run_timestamp: 运行时间戳，用于创建统一的文件夹
     :return: 保存的文件路径
     """
-    # 创建保存目录
-    os.makedirs(save_dir, exist_ok=True)
-    
-    # 生成文件名（基于视频文件名和时间戳）
-    video_name = os.path.splitext(os.path.basename(video_path))[0]
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    txt_filename = f"tokens_{video_dir}_{timestamp}.txt"
-    txt_dir = os.path.join(save_dir, "cost", video_name)
+    # 创建保存目录 - 每次运行创建新的时间戳文件夹
+    video_name = os.path.splitext(os.path.basename(base_dir))[0]
+    if run_timestamp is None:
+        run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    txt_dir = os.path.join(save_dir, "cost", video_name, run_timestamp)
     os.makedirs(txt_dir, exist_ok=True)
+    
+    # 生成文件名（只使用片段名，不包含时间戳）
+    txt_filename = f"tokens_{video_dir}.txt"
     txt_path = os.path.join(txt_dir, txt_filename)
     
     # 格式化使用情况信息
     usage_content = f"""API使用情况统计
 =====================================
-视频文件: {video_path}
+视频文件: {video_dir}
 统计时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
 =====================================
 
@@ -118,7 +120,7 @@ if __name__ == "__main__":
     test_video_path = "/root/autodl-tmp/video/test.mp4"
     
     # 测试详细版本
-    save_usage_info_to_txt(test_usage_info, test_duration, test_money, test_video_path)
+    save_usage_info_to_txt(test_usage_info, test_duration, test_money, test_video_path, "test_segment")
     
     # 测试简化版本
     save_simple_usage_info(test_usage_info, test_duration, test_money, test_video_path) 
\ No newline at end of file