丢失的batch_predata已重写;修复api_video中,上传api视频限制为8mb;已正确修改保存文件的路径

This commit is contained in:
root 2025-07-15 14:33:05 +08:00
parent 7636eca330
commit b5f2e16eef
4 changed files with 288 additions and 62 deletions

View File

@ -2,13 +2,121 @@ from openai import OpenAI
import os import os
import base64 import base64
import time import time
import subprocess
from datetime import datetime from datetime import datetime
from pathlib import Path
from save_usage_info import save_usage_info_to_txt, save_simple_usage_info from save_usage_info import save_usage_info_to_txt, save_simple_usage_info
def check_video_size(video_path, max_size_mb=7):
    """Return True when the video file exists and fits the size limit.

    Args:
        video_path: Path to the video file.
        max_size_mb: Maximum allowed size in megabytes (default 7).

    Returns:
        bool: True if the file exists and its size is <= ``max_size_mb``.
    """
    # Guard clause: a missing file can never satisfy the limit.
    if not os.path.exists(video_path):
        print(f"视频文件不存在: {video_path}")
        return False
    size_mb = os.path.getsize(video_path) / (1024 * 1024)
    print(f"视频文件大小: {size_mb:.2f}MB (限制: {max_size_mb}MB)")
    return size_mb <= max_size_mb
def compress_video_auto(video_path, target_size_mb=7):
    """Compress a video with ffmpeg until it fits under the size target.

    Tries progressively stronger H.264 CRF settings and stops at the first
    attempt that reaches the target size. Returns the original path when the
    file already fits or when every attempt fails.

    Args:
        video_path: Path to the source video file.
        target_size_mb: Target maximum file size in MB (default 7).

    Returns:
        str: Path of the compressed file, or the original ``video_path`` when
        no compression was needed or all attempts failed.
    """
    try:
        # Skip compression entirely when the file already fits.
        if check_video_size(video_path, target_size_mb):
            print("视频文件大小符合要求,无需压缩")
            return video_path
        print(f"视频文件过大,开始自动压缩...")
        # Write the compressed copy next to the source with a "_compressed" suffix.
        video_path_obj = Path(video_path)
        compressed_path = video_path_obj.parent / f"{video_path_obj.stem}_compressed{video_path_obj.suffix}"
        # Higher CRF = stronger compression / lower quality.
        quality_levels = [23, 25, 28, 30, 32]
        for quality in quality_levels:
            print(f"尝试压缩质量: CRF={quality}")
            # ffmpeg command: H.264 video at the given CRF, AAC audio,
            # faststart for streaming; -y overwrites a previous attempt.
            cmd = [
                'ffmpeg', '-i', str(video_path),
                '-c:v', 'libx264',
                '-crf', str(quality),
                '-preset', 'medium',
                '-c:a', 'aac',
                '-b:a', '128k',
                '-movflags', '+faststart',
                '-y',
                str(compressed_path)
            ]
            # Run ffmpeg; stderr is captured for the failure message below.
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0 and os.path.exists(compressed_path):
                # Measure the attempt against the target size.
                compressed_size = os.path.getsize(compressed_path) / 1024 / 1024
                print(f"压缩完成! 文件大小: {compressed_size:.2f}MB")
                if compressed_size <= target_size_mb:
                    print("✅ 压缩成功,文件大小符合要求")
                    return str(compressed_path)
                else:
                    print(f"⚠️ 文件仍然过大,尝试更高压缩率")
                    os.remove(compressed_path)  # drop the oversized attempt
            else:
                print(f"压缩失败: {result.stderr}")
                if os.path.exists(compressed_path):
                    os.remove(compressed_path)
        # NOTE(review): on total failure the oversized original path is
        # returned; callers must cope with an over-limit upload.
        print("❌ 所有压缩级别都无法达到目标大小")
        return video_path
    except Exception as e:
        print(f"压缩视频时出错: {e}")
        return video_path
# Base64-encode the (possibly compressed) video for the API request.
def encode_video(video_path):
    """Encode a video file as base64, auto-compressing it when oversized.

    Args:
        video_path: Path to the video file.

    Returns:
        str | None: Base64-encoded video data, or None when compression
        or reading failed.
    """
    try:
        # Compress first so the upload stays under the API size limit.
        processed_video_path = compress_video_auto(video_path)
        with open(processed_video_path, "rb") as video_file:
            encoded_data = base64.b64encode(video_file.read()).decode("utf-8")
        # Tell the operator when the compressed copy was used instead.
        if processed_video_path != video_path:
            print(f"使用压缩后的视频文件: {processed_video_path}")
        return encoded_data
    except Exception as e:
        print(f"编码视频时出错: {e}")
        return None
def encode_audio(audio_path): def encode_audio(audio_path):
with open(audio_path, "rb") as audio_file: with open(audio_path, "rb") as audio_file:
@ -197,54 +305,24 @@ def format_clip_json(clip_data):
return formatted_text return formatted_text
def save_result_to_txt(response_text, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=None):
    """Save an analysis result as a TXT file.

    Results of one batch run are grouped under a single timestamped folder:
    ``<save_dir>/Template/<video_name>/<run_timestamp>/<video_dir>.txt``.

    Args:
        response_text: Model response text to persist.
        base_dir: Directory of the source video; its basename names the folder.
        video_dir: Clip (segment) name, used as the file name.
        save_dir: Root directory for saved results.
        run_timestamp: Shared timestamp for the whole run; generated when None.

    Returns:
        str | None: Path of the written file, or None when writing failed.
    """
    # One timestamped folder per run keeps all clips of a run together.
    video_name = os.path.splitext(os.path.basename(base_dir))[0]
    if run_timestamp is None:
        run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    txt_dir = os.path.join(save_dir, "Template", video_name, run_timestamp)
    os.makedirs(txt_dir, exist_ok=True)
    # File name is just the clip name; the timestamp lives in the folder.
    txt_filename = f"{video_dir}.txt"
    txt_path = os.path.join(txt_dir, txt_filename)
    # Header block followed by the raw response text.
    content = f"""视频分析结果
=====================================
视频文件: {video_dir}
分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
=====================================

{response_text}
"""
    try:
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"\n✅ 分析结果已保存到: {txt_path}")
        return txt_path
    except Exception as e:
        print(f"\n❌ 保存TXT文件失败: {e}")
        return None

View File

@ -21,6 +21,8 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
STREAM_MODE = True STREAM_MODE = True
# 全局运行时间戳,确保所有片段保存在同一个文件夹中
RUN_TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
def find_video_dirs(video_processed_dir): def find_video_dirs(video_processed_dir):
"""查找所有包含audio_split的目录""" """查找所有包含audio_split的目录"""
@ -33,20 +35,29 @@ def find_video_dirs(video_processed_dir):
return video_dirs return video_dirs
a = "/root/autodl-tmp/video_processed/成都/video_split/" a = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施,千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐/video_split/"
video_dirs = find_video_dirs(a) video_dirs = find_video_dirs(a)
print(video_dirs[0]) print(video_dirs[0])
for i ,video_dir in enumerate(video_dirs): print(f"开始批量处理,运行时间戳: {RUN_TIMESTAMP}")
print(i, video_dir) print(f"找到 {len(video_dirs)} 个视频片段")
base_dir = "/root/autodl-tmp/video_processed/成都" for i ,video_dir in enumerate(video_dirs):
print(f"\n处理第 {i+1}/{len(video_dirs)} 个片段: {video_dir}")
base_dir = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施,千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐"
video_path = base_dir + "/video_split/" + video_dir + ".mp4" video_path = base_dir + "/video_split/" + video_dir + ".mp4"
ocr_txt_path = base_dir + "/ocr/" + video_dir + "_subtitles_processed.txt" ocr_txt_path = base_dir + "/ocr/" + video_dir + "_subtitles_processed.txt"
whisper_json_path = base_dir +"/whisper/" + video_dir + "_transcript.json" whisper_json_path = base_dir +"/whisper/" + video_dir + "_transcript.json"
base64_video = encode_video(video_path) base64_video = encode_video(video_path)
# 检查视频编码是否成功
if base64_video is None:
print(f"错误: 无法编码视频文件 {video_path}")
print("请检查视频文件是否存在")
continue
whisper_data = read_json_file(whisper_json_path) whisper_data = read_json_file(whisper_json_path)
whisper_content = format_whisper_json(whisper_data) whisper_content = format_whisper_json(whisper_data)
@ -177,7 +188,8 @@ for i ,video_dir in enumerate(video_dirs):
], ],
stream=STREAM_MODE, stream=STREAM_MODE,
stream_options={"include_usage": True} if STREAM_MODE else None, stream_options={"include_usage": True} if STREAM_MODE else None,
temperature=0.5 temperature=0.4,
top_p = 0.3
) )
if STREAM_MODE: if STREAM_MODE:
@ -223,11 +235,11 @@ for i ,video_dir in enumerate(video_dirs):
print(full_response) print(full_response)
# 保存结果为TXT文件 # 保存结果为TXT文件
txt_file_path = save_result_to_txt(full_response, base_dir, video_dir) txt_file_path = save_result_to_txt(full_response, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP)
# 保存结果为JSON文件 # 保存结果为JSON文件
json_file_path = save_result_to_json(full_response, base_dir, video_dir) json_file_path = save_result_to_json(full_response, base_dir, video_dir)
# 保存使用情况信息 # 保存使用情况信息
usage_info_txt = save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir) usage_info_txt = save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP)
# 输出使用情况信息 # 输出使用情况信息
if usage_info: if usage_info:

134
code/batch_predata.py Normal file
View File

@ -0,0 +1,134 @@
import os
import glob
from pathlib import Path
from pre_data_1 import read_json_file, format_ocr_json, merge_and_filter_subtitles
def find_ocr_json_files(base_dir):
    """Find all OCR JSON files under a directory tree.

    Collects ``*.json`` files inside any directory named ``ocr`` plus any
    ``*subtitles.json`` file anywhere in the tree.

    Args:
        base_dir: Root directory to search.

    Returns:
        list[Path]: Unique file paths in stable discovery order.
    """
    base_path = Path(base_dir)
    candidates = []
    # JSON files inside every "ocr" directory.
    for ocr_dir in base_path.rglob("ocr"):
        if ocr_dir.is_dir():
            candidates.extend(ocr_dir.glob("*.json"))
    # Subtitle JSON files anywhere in the tree (may overlap with the above).
    candidates.extend(base_path.rglob("*subtitles.json"))
    # De-duplicate while preserving discovery order; the previous
    # list(set(...)) reordered results nondeterministically between runs.
    return list(dict.fromkeys(candidates))
def process_ocr_file(ocr_json_path, iou_threshold=0.7, text_similarity_threshold=0.7):
    """Process one OCR JSON file and write ``<stem>_processed.txt`` next to it.

    Args:
        ocr_json_path: Path of the OCR JSON file.
        iou_threshold: IoU threshold for merging subtitle boxes.
        text_similarity_threshold: Text-similarity threshold for merging.

    Returns:
        bool: True when the file was processed and written successfully.
    """
    try:
        print(f"\n正在处理文件: {ocr_json_path}")
        # Load and validate the OCR data, bailing out early on bad input.
        raw = read_json_file(ocr_json_path)
        if raw is None:
            print(f"跳过文件 {ocr_json_path} - 读取失败")
            return False
        _, subtitle_array = format_ocr_json(raw)
        if not subtitle_array:
            print(f"跳过文件 {ocr_json_path} - 没有有效的字幕数据")
            return False
        # Merge overlapping/near-duplicate subtitles using both thresholds.
        merged_text, merged_array = merge_and_filter_subtitles(
            subtitle_array, iou_threshold, text_similarity_threshold
        )
        # Output file sits next to the input, with a "_processed.txt" suffix.
        source = Path(ocr_json_path)
        output_path = source.parent / (source.stem + "_processed.txt")
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(merged_text)
        print(f"处理完成: {output_path}")
        print(f"原始字幕数量: {len(subtitle_array)}")
        print(f"处理后字幕数量: {len(merged_array)}")
        return True
    except Exception as e:
        print(f"处理文件 {ocr_json_path} 时出错: {str(e)}")
        return False
def main():
    """Entry point: find OCR JSON files under the base directory and batch-process them."""
    base_dir = "/root/autodl-tmp/video_processed2"
    print(f"开始在目录 {base_dir} 中查找OCR JSON文件...")

    ocr_files = find_ocr_json_files(base_dir)
    if not ocr_files:
        print("未找到任何OCR JSON文件")
        return

    print(f"找到 {len(ocr_files)} 个OCR JSON文件:")
    for index, path in enumerate(ocr_files, 1):
        print(f"  {index}. {path}")

    # Merge thresholds forwarded to process_ocr_file for every file.
    iou_threshold = 0.7
    text_similarity_threshold = 0.7
    print(f"\n开始批量处理...")
    print(f"IoU阈值: {iou_threshold}")
    print(f"文本相似度阈值: {text_similarity_threshold}")

    # Tally successes; failures are the remainder.
    success_count = 0
    for index, path in enumerate(ocr_files, 1):
        print(f"\n进度: {index}/{len(ocr_files)}")
        if process_ocr_file(path, iou_threshold, text_similarity_threshold):
            success_count += 1
    failed_count = len(ocr_files) - success_count

    print(f"\n批量处理完成!")
    print(f"总文件数: {len(ocr_files)}")
    print(f"成功处理: {success_count}")
    print(f"处理失败: {failed_count}")
    print(f"成功率: {success_count/len(ocr_files)*100:.1f}%")


if __name__ == "__main__":
    main()

View File

@ -1,31 +1,33 @@
import os import os
from datetime import datetime from datetime import datetime
def save_usage_info_to_txt(usage_info, total_duration, money, video_path, video_dir, save_dir="/root/autodl-tmp/video_llm"): def save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=None):
""" """
保存API使用情况信息到TXT文件 保存API使用情况信息到TXT文件
:param usage_info: API使用情况对象 :param usage_info: API使用情况对象
:param total_duration: API总响应时间 :param total_duration: API总响应时间
:param money: 费用信息字典 :param money: 费用信息字典
:param video_path: 原视频文件路径 :param video_path: 原视频文件路径
:param video_dir: 视频目录名
:param save_dir: 保存目录 :param save_dir: 保存目录
:param run_timestamp: 运行时间戳用于创建统一的文件夹
:return: 保存的文件路径 :return: 保存的文件路径
""" """
# 创建保存目录 # 创建保存目录 - 每次运行创建新的时间戳文件夹
os.makedirs(save_dir, exist_ok=True) video_name = os.path.splitext(os.path.basename(base_dir))[0]
if run_timestamp is None:
# 生成文件名(基于视频文件名和时间戳) run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
video_name = os.path.splitext(os.path.basename(video_path))[0] txt_dir = os.path.join(save_dir, "cost", video_name, run_timestamp)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
txt_filename = f"tokens_{video_dir}_{timestamp}.txt"
txt_dir = os.path.join(save_dir, "cost", video_name)
os.makedirs(txt_dir, exist_ok=True) os.makedirs(txt_dir, exist_ok=True)
# 生成文件名(只使用片段名,不包含时间戳)
txt_filename = f"tokens_{video_dir}.txt"
txt_path = os.path.join(txt_dir, txt_filename) txt_path = os.path.join(txt_dir, txt_filename)
# 格式化使用情况信息 # 格式化使用情况信息
usage_content = f"""API使用情况统计 usage_content = f"""API使用情况统计
===================================== =====================================
视频文件: {video_path} 视频文件: {video_dir}
统计时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} 统计时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
===================================== =====================================
@ -118,7 +120,7 @@ if __name__ == "__main__":
test_video_path = "/root/autodl-tmp/video/test.mp4" test_video_path = "/root/autodl-tmp/video/test.mp4"
# 测试详细版本 # 测试详细版本
save_usage_info_to_txt(test_usage_info, test_duration, test_money, test_video_path) save_usage_info_to_txt(test_usage_info, test_duration, test_money, test_video_path, "test_segment")
# 测试简化版本 # 测试简化版本
save_simple_usage_info(test_usage_info, test_duration, test_money, test_video_path) save_simple_usage_info(test_usage_info, test_duration, test_money, test_video_path)