丢失的batch_predata已重写;修复api_video中,上传api视频限制为8mb;已正确修改保存文件的路径

This commit is contained in:
root 2025-07-15 14:33:05 +08:00
parent 7636eca330
commit b5f2e16eef
4 changed files with 288 additions and 62 deletions

View File

@ -2,13 +2,121 @@ from openai import OpenAI
import os
import base64
import time
import subprocess
from datetime import datetime
from pathlib import Path
from save_usage_info import save_usage_info_to_txt, save_simple_usage_info
def check_video_size(video_path, max_size_mb=7):
    """
    Check whether a video file exists and is within the size limit.

    Args:
        video_path: path to the video file
        max_size_mb: maximum allowed size in MB

    Returns:
        bool: True when the file exists and does not exceed the limit
    """
    # A missing file can never satisfy the size requirement.
    if not os.path.exists(video_path):
        print(f"视频文件不存在: {video_path}")
        return False

    size_in_bytes = os.path.getsize(video_path)
    size_mb = size_in_bytes / 1024 / 1024
    print(f"视频文件大小: {size_mb:.2f}MB (限制: {max_size_mb}MB)")
    return size_mb <= max_size_mb
def compress_video_auto(video_path, target_size_mb=7):
    """
    Automatically compress a video file until it fits the target size.

    Progressively stronger x264 CRF values are tried; the first output
    small enough is returned. On any failure the original path is
    returned unchanged.

    Args:
        video_path: path to the source video
        target_size_mb: desired maximum size in MB

    Returns:
        str: path of the compressed video, or the original path on failure
    """
    try:
        # Nothing to do when the file is already small enough.
        if check_video_size(video_path, target_size_mb):
            print("视频文件大小符合要求,无需压缩")
            return video_path

        print("视频文件过大,开始自动压缩...")

        src = Path(video_path)
        out_path = src.parent / f"{src.stem}_compressed{src.suffix}"

        # CRF values ordered best-to-worst quality (higher == smaller file).
        for crf in (23, 25, 28, 30, 32):
            print(f"尝试压缩质量: CRF={crf}")
            result = subprocess.run(
                [
                    'ffmpeg', '-i', str(video_path),
                    '-c:v', 'libx264',
                    '-crf', str(crf),
                    '-preset', 'medium',
                    '-c:a', 'aac',
                    '-b:a', '128k',
                    '-movflags', '+faststart',
                    '-y',
                    str(out_path),
                ],
                capture_output=True,
                text=True,
            )

            if result.returncode != 0 or not os.path.exists(out_path):
                print(f"压缩失败: {result.stderr}")
                if os.path.exists(out_path):
                    os.remove(out_path)
                continue

            compressed_size = os.path.getsize(out_path) / 1024 / 1024
            print(f"压缩完成! 文件大小: {compressed_size:.2f}MB")
            if compressed_size <= target_size_mb:
                print("✅ 压缩成功,文件大小符合要求")
                return str(out_path)

            print("⚠️ 文件仍然过大,尝试更高压缩率")
            os.remove(out_path)  # discard the too-large attempt

        print("❌ 所有压缩级别都无法达到目标大小")
        return video_path
    except Exception as e:
        print(f"压缩视频时出错: {e}")
        return video_path
# Base64 encoding helpers.
def encode_video(video_path):
    """
    Encode a video file as a base64 string.

    The file is first passed through compress_video_auto(); when it
    exceeds the size limit, the compressed copy is encoded instead.

    Args:
        video_path: path to the video file

    Returns:
        str | None: base64-encoded video data, or None on error
    """
    # BUG FIX: diff residue had left the old one-line implementation
    # (open + return) in front of this body, so the compression path
    # below was unreachable dead code; only the new body is kept.
    try:
        # Shrink the video first if it is over the API upload limit.
        processed_video_path = compress_video_auto(video_path)
        with open(processed_video_path, "rb") as video_file:
            encoded_data = base64.b64encode(video_file.read()).decode("utf-8")
        # Tell the user when the compressed copy was used instead.
        if processed_video_path != video_path:
            print(f"使用压缩后的视频文件: {processed_video_path}")
        return encoded_data
    except Exception as e:
        print(f"编码视频时出错: {e}")
        return None
def encode_audio(audio_path):
with open(audio_path, "rb") as audio_file:
@ -197,54 +305,24 @@ def format_clip_json(clip_data):
return formatted_text
def save_result_to_txt(response_text, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=None):
    """
    Save an analysis result as a TXT file.

    All clips of one run are grouped under a shared timestamp folder:
    <save_dir>/Template/<video_name>/<run_timestamp>/<video_dir>.txt

    Args:
        response_text: model output text to store
        base_dir: directory of the source video; its basename names the folder
        video_dir: clip name, used as the file name
        save_dir: root directory for saved results
        run_timestamp: shared per-run timestamp; generated when None

    Returns:
        str | None: path of the written file, or None on failure
    """
    # BUG FIX: a botched merge left two definitions of this function
    # (old video_path-based signature and new base_dir-based one); the
    # stale second copy shadowed this one and broke callers that pass
    # run_timestamp. Only the new implementation is kept.
    os.makedirs(save_dir, exist_ok=True)

    # One timestamped folder per run so all clips of a run stay together.
    video_name = os.path.splitext(os.path.basename(base_dir))[0]
    if run_timestamp is None:
        run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    txt_dir = os.path.join(save_dir, "Template", video_name, run_timestamp)
    os.makedirs(txt_dir, exist_ok=True)

    # File name is just the clip name; the timestamp lives in the folder.
    txt_filename = f"{video_dir}.txt"
    txt_path = os.path.join(txt_dir, txt_filename)

    # Prepend a small header before the model response.
    content = f"""视频分析结果
=====================================
视频文件: {video_dir}
分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
=====================================
{response_text}
"""
    try:
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"\n✅ 分析结果已保存到: {txt_path}")
        return txt_path
    except Exception as e:
        print(f"\n❌ 保存TXT文件失败: {e}")
        return None

View File

@ -21,6 +21,8 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
logger = logging.getLogger(__name__)
STREAM_MODE = True
# 全局运行时间戳,确保所有片段保存在同一个文件夹中
RUN_TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
def find_video_dirs(video_processed_dir):
"""查找所有包含audio_split的目录"""
@ -33,20 +35,29 @@ def find_video_dirs(video_processed_dir):
return video_dirs
a = "/root/autodl-tmp/video_processed/成都/video_split/"
a = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施,千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐/video_split/"
video_dirs = find_video_dirs(a)
print(video_dirs[0])
for i ,video_dir in enumerate(video_dirs):
print(i, video_dir)
print(f"开始批量处理,运行时间戳: {RUN_TIMESTAMP}")
print(f"找到 {len(video_dirs)} 个视频片段")
base_dir = "/root/autodl-tmp/video_processed/成都"
for i ,video_dir in enumerate(video_dirs):
print(f"\n处理第 {i+1}/{len(video_dirs)} 个片段: {video_dir}")
base_dir = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施,千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐"
video_path = base_dir + "/video_split/" + video_dir + ".mp4"
ocr_txt_path = base_dir + "/ocr/" + video_dir + "_subtitles_processed.txt"
whisper_json_path = base_dir +"/whisper/" + video_dir + "_transcript.json"
base64_video = encode_video(video_path)
# 检查视频编码是否成功
if base64_video is None:
print(f"错误: 无法编码视频文件 {video_path}")
print("请检查视频文件是否存在")
continue
whisper_data = read_json_file(whisper_json_path)
whisper_content = format_whisper_json(whisper_data)
@ -177,7 +188,8 @@ for i ,video_dir in enumerate(video_dirs):
],
stream=STREAM_MODE,
stream_options={"include_usage": True} if STREAM_MODE else None,
temperature=0.5
temperature=0.4,
top_p = 0.3
)
if STREAM_MODE:
@ -223,11 +235,11 @@ for i ,video_dir in enumerate(video_dirs):
print(full_response)
# 保存结果为TXT文件
txt_file_path = save_result_to_txt(full_response, base_dir, video_dir)
txt_file_path = save_result_to_txt(full_response, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP)
# 保存结果为JSON文件
json_file_path = save_result_to_json(full_response, base_dir, video_dir)
# 保存使用情况信息
usage_info_txt = save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir)
usage_info_txt = save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP)
# 输出使用情况信息
if usage_info:

134
code/batch_predata.py Normal file
View File

@ -0,0 +1,134 @@
import os
import glob
from pathlib import Path
from pre_data_1 import read_json_file, format_ocr_json, merge_and_filter_subtitles
def find_ocr_json_files(base_dir):
    """
    Find every OCR JSON file under a base directory.

    Collects *.json files inside any directory named "ocr", plus any
    file whose name ends in "subtitles.json", then de-duplicates.

    Args:
        base_dir: root directory to search

    Returns:
        list: unique Path objects of the OCR JSON files found
    """
    root = Path(base_dir)
    found = []

    # JSON files that live inside an "ocr" directory.
    for candidate in root.rglob("ocr"):
        if candidate.is_dir():
            found.extend(candidate.glob("*.json"))

    # Files explicitly named *subtitles.json anywhere under the root.
    found.extend(root.rglob("*subtitles.json"))

    # Drop duplicates (a file can match both searches).
    return list(set(found))
def process_ocr_file(ocr_json_path, iou_threshold=0.7, text_similarity_threshold=0.7):
    """
    Process one OCR JSON file and write a *_processed.txt next to it.

    Args:
        ocr_json_path: path of the OCR JSON file
        iou_threshold: IoU threshold for merging subtitle boxes
        text_similarity_threshold: text-similarity threshold for merging

    Returns:
        bool: True when processing succeeded
    """
    try:
        print(f"\n正在处理文件: {ocr_json_path}")

        ocr_data = read_json_file(ocr_json_path)
        if ocr_data is None:
            print(f"跳过文件 {ocr_json_path} - 读取失败")
            return False

        pre_data, subtitle_array = format_ocr_json(ocr_data)
        if not subtitle_array:
            print(f"跳过文件 {ocr_json_path} - 没有有效的字幕数据")
            return False

        processed_text, processed_array = merge_and_filter_subtitles(
            subtitle_array, iou_threshold, text_similarity_threshold
        )

        # Write the cleaned subtitles next to the source JSON.
        source = Path(ocr_json_path)
        output_path = source.parent / (source.stem + "_processed.txt")
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(processed_text)

        print(f"处理完成: {output_path}")
        print(f"原始字幕数量: {len(subtitle_array)}")
        print(f"处理后字幕数量: {len(processed_array)}")
        return True
    except Exception as e:
        print(f"处理文件 {ocr_json_path} 时出错: {str(e)}")
        return False
def main(base_dir="/root/autodl-tmp/video_processed2"):
    """
    Batch-process every OCR JSON file found under *base_dir*.

    Finds the files, runs process_ocr_file() on each, and prints a
    success/failure summary.

    Args:
        base_dir: root directory to scan. Defaults to the previously
            hard-coded location, so existing callers are unaffected;
            other directories can now be processed without editing code.
    """
    print(f"开始在目录 {base_dir} 中查找OCR JSON文件...")

    # 查找所有OCR JSON文件
    ocr_files = find_ocr_json_files(base_dir)
    if not ocr_files:
        print("未找到任何OCR JSON文件")
        return

    print(f"找到 {len(ocr_files)} 个OCR JSON文件:")
    for i, file_path in enumerate(ocr_files, 1):
        print(f" {i}. {file_path}")

    # Merge parameters shared by every file.
    iou_threshold = 0.7
    text_similarity_threshold = 0.7
    print(f"\n开始批量处理...")
    print(f"IoU阈值: {iou_threshold}")
    print(f"文本相似度阈值: {text_similarity_threshold}")

    # 批量处理
    success_count = 0
    failed_count = 0
    for i, ocr_file in enumerate(ocr_files, 1):
        print(f"\n进度: {i}/{len(ocr_files)}")
        if process_ocr_file(ocr_file, iou_threshold, text_similarity_threshold):
            success_count += 1
        else:
            failed_count += 1

    # ocr_files is non-empty here (early return above), so the
    # percentage below cannot divide by zero.
    print(f"\n批量处理完成!")
    print(f"总文件数: {len(ocr_files)}")
    print(f"成功处理: {success_count}")
    print(f"处理失败: {failed_count}")
    print(f"成功率: {success_count/len(ocr_files)*100:.1f}%")
if __name__ == "__main__":
main()

View File

@ -1,31 +1,33 @@
import os
from datetime import datetime
def save_usage_info_to_txt(usage_info, total_duration, money, video_path, video_dir, save_dir="/root/autodl-tmp/video_llm"):
def save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=None):
"""
保存API使用情况信息到TXT文件
:param usage_info: API使用情况对象
:param total_duration: API总响应时间
:param money: 费用信息字典
:param video_path: 原视频文件路径
:param video_dir: 视频目录名
:param save_dir: 保存目录
:param run_timestamp: 运行时间戳用于创建统一的文件夹
:return: 保存的文件路径
"""
# 创建保存目录
os.makedirs(save_dir, exist_ok=True)
# 生成文件名(基于视频文件名和时间戳)
video_name = os.path.splitext(os.path.basename(video_path))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
txt_filename = f"tokens_{video_dir}_{timestamp}.txt"
txt_dir = os.path.join(save_dir, "cost", video_name)
# 创建保存目录 - 每次运行创建新的时间戳文件夹
video_name = os.path.splitext(os.path.basename(base_dir))[0]
if run_timestamp is None:
run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
txt_dir = os.path.join(save_dir, "cost", video_name, run_timestamp)
os.makedirs(txt_dir, exist_ok=True)
# 生成文件名(只使用片段名,不包含时间戳)
txt_filename = f"tokens_{video_dir}.txt"
txt_path = os.path.join(txt_dir, txt_filename)
# 格式化使用情况信息
usage_content = f"""API使用情况统计
=====================================
视频文件: {video_path}
视频文件: {video_dir}
统计时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
=====================================
@ -118,7 +120,7 @@ if __name__ == "__main__":
test_video_path = "/root/autodl-tmp/video/test.mp4"
# 测试详细版本
save_usage_info_to_txt(test_usage_info, test_duration, test_money, test_video_path)
save_usage_info_to_txt(test_usage_info, test_duration, test_money, test_video_path, "test_segment")
# 测试简化版本
save_simple_usage_info(test_usage_info, test_duration, test_money, test_video_path)