丢失的batch_predata已重写;修复api_video中上传API视频的大小限制(8MB);已正确修改保存文件的路径
This commit is contained in:
parent
7636eca330
commit
b5f2e16eef
@ -2,13 +2,121 @@ from openai import OpenAI
|
||||
import os
|
||||
import base64
|
||||
import time
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from save_usage_info import save_usage_info_to_txt, save_simple_usage_info
|
||||
|
||||
def check_video_size(video_path, max_size_mb=7):
    """Report whether a video file fits within a size limit.

    Prints either a "file not found" message or the measured size as a
    side effect.

    Args:
        video_path: Path to the video file.
        max_size_mb: Maximum allowed size, in units of 1024 * 1024 bytes.

    Returns:
        bool: True when ``video_path`` is an existing regular file whose size
        does not exceed ``max_size_mb``; False otherwise (missing path,
        directory, or file too large).
    """
    # isfile rather than exists: a directory would otherwise report a tiny
    # size via getsize and be accepted as a valid video.
    if not os.path.isfile(video_path):
        print(f"视频文件不存在: {video_path}")
        return False

    size_mb = os.path.getsize(video_path) / 1024 / 1024

    print(f"视频文件大小: {size_mb:.2f}MB (限制: {max_size_mb}MB)")

    return size_mb <= max_size_mb
|
||||
|
||||
def compress_video_auto(video_path, target_size_mb=7):
    """Re-encode a video with ffmpeg until it fits under a target size.

    If the file already fits, it is returned untouched. Otherwise the video
    is re-encoded with libx264 at progressively higher CRF values (lower
    quality) and the first result that fits is kept next to the original as
    ``<stem>_compressed<suffix>``.

    Args:
        video_path: Path to the source video file.
        target_size_mb: Target size, in units of 1024 * 1024 bytes.

    Returns:
        str: Path of the compressed file on success. On any failure (missing
        input, ffmpeg error, no CRF level small enough) the original
        ``video_path`` is returned unchanged.
    """
    # A missing input can never be compressed; bail out instead of launching
    # one doomed ffmpeg process per quality level.
    if not os.path.isfile(video_path):
        print(f"视频文件不存在: {video_path}")
        return video_path

    compressed_path = None
    try:
        # Nothing to do when the file already satisfies the limit.
        if check_video_size(video_path, target_size_mb):
            print("视频文件大小符合要求,无需压缩")
            return video_path

        print("视频文件过大,开始自动压缩...")

        # Output goes next to the input, with a "_compressed" stem suffix.
        video_path_obj = Path(video_path)
        compressed_path = video_path_obj.parent / f"{video_path_obj.stem}_compressed{video_path_obj.suffix}"

        # Higher CRF means stronger compression; try the mildest first.
        quality_levels = [23, 25, 28, 30, 32]

        for quality in quality_levels:
            print(f"尝试压缩质量: CRF={quality}")

            cmd = [
                'ffmpeg', '-i', str(video_path),
                '-c:v', 'libx264',
                '-crf', str(quality),
                '-preset', 'medium',
                '-c:a', 'aac',
                '-b:a', '128k',
                '-movflags', '+faststart',
                '-y',
                str(compressed_path),
            ]

            # stdin is detached so ffmpeg cannot consume or block on the
            # parent's stdin; errors="replace" keeps non-UTF-8 bytes in the
            # ffmpeg log from raising UnicodeDecodeError and aborting.
            result = subprocess.run(
                cmd,
                stdin=subprocess.DEVNULL,
                capture_output=True,
                text=True,
                errors="replace",
            )

            if result.returncode == 0 and os.path.exists(compressed_path):
                compressed_size = os.path.getsize(compressed_path) / 1024 / 1024
                print(f"压缩完成! 文件大小: {compressed_size:.2f}MB")

                if compressed_size <= target_size_mb:
                    print("✅ 压缩成功,文件大小符合要求")
                    return str(compressed_path)

                print("⚠️ 文件仍然过大,尝试更高压缩率")
                os.remove(compressed_path)  # discard the oversized attempt
            else:
                print(f"压缩失败: {result.stderr}")
                if os.path.exists(compressed_path):
                    os.remove(compressed_path)

        print("❌ 所有压缩级别都无法达到目标大小")
        return video_path

    except Exception as e:
        # Best-effort boundary: callers rely on always getting a path back.
        print(f"压缩视频时出错: {e}")
        # Do not leave a partially written output file behind.
        if compressed_path is not None and os.path.exists(compressed_path):
            try:
                os.remove(compressed_path)
            except OSError:
                pass  # cleanup is best-effort; the original path is still returned
        return video_path
|
||||
|
||||
# Base64 编码格式
|
||||
def encode_video(video_path):
    """Return a video file as a Base64 string, compressing it first if needed.

    The path is passed through ``compress_video_auto``; whichever path it
    returns (the original or a compressed copy) is read and encoded.

    Args:
        video_path: Path to the video file.

    Returns:
        str | None: Base64 text (UTF-8 decoded) of the file contents, or None
        if compression or reading raised an exception.
    """
    # NOTE(review): Base64 inflates data by about 4/3, so a file at the
    # default compression target is larger once encoded — confirm which size
    # the upload limit actually applies to.
    try:
        # Shrink the video if it exceeds the default size limit.
        processed_video_path = compress_video_auto(video_path)

        with open(processed_video_path, "rb") as video_file:
            encoded_data = base64.b64encode(video_file.read()).decode("utf-8")

        # Tell the user when a compressed copy was used instead of the original.
        if processed_video_path != video_path:
            print(f"使用压缩后的视频文件: {processed_video_path}")

        return encoded_data

    except Exception as e:
        # Callers check for None and skip the clip, so report and continue.
        print(f"编码视频时出错: {e}")
        return None
|
||||
|
||||
def encode_audio(audio_path):
|
||||
with open(audio_path, "rb") as audio_file:
|
||||
@ -197,54 +305,24 @@ def format_clip_json(clip_data):
|
||||
|
||||
return formatted_text
|
||||
|
||||
def save_result_to_txt(response_text, video_path, video_dir, save_dir="/root/autodl-tmp/video_llm"):
|
||||
|
||||
|
||||
def save_result_to_txt(response_text, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=None):
|
||||
"""将分析结果保存为TXT文件"""
|
||||
# 创建保存目录
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
|
||||
# 生成文件名(基于视频文件名和时间戳)
|
||||
video_name = os.path.splitext(os.path.basename(video_path))[0]
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
txt_filename = f"{video_dir}_{timestamp}.txt"
|
||||
txt_dir = os.path.join(save_dir, "Template", video_name)
|
||||
# 创建保存目录 - 每次运行创建新的时间戳文件夹
|
||||
video_name = os.path.splitext(os.path.basename(base_dir))[0]
|
||||
if run_timestamp is None:
|
||||
run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
txt_dir = os.path.join(save_dir, "Template",video_name ,run_timestamp)
|
||||
os.makedirs(txt_dir, exist_ok=True)
|
||||
|
||||
# 生成文件名(只使用片段名,不包含时间戳)
|
||||
txt_filename = f"{video_dir}.txt"
|
||||
txt_path = os.path.join(txt_dir, txt_filename)
|
||||
# 准备保存内容(添加头部信息)
|
||||
content = f"""视频分析结果
|
||||
=====================================
|
||||
视频文件: {video_path}
|
||||
分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
|
||||
=====================================
|
||||
|
||||
{response_text}
|
||||
"""
|
||||
|
||||
# 保存到文件
|
||||
try:
|
||||
with open(txt_path, 'w', encoding='utf-8') as f:
|
||||
f.write(content)
|
||||
print(f"\n✅ 分析结果已保存到: {txt_path}")
|
||||
return txt_path
|
||||
except Exception as e:
|
||||
print(f"\n❌ 保存TXT文件失败: {e}")
|
||||
return None
|
||||
|
||||
def save_result_to_txt(response_text, video_path, video_dir, save_dir="/root/autodl-tmp/video_llm"):
|
||||
"""将分析结果保存为TXT文件"""
|
||||
# 创建保存目录
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
|
||||
# 生成文件名(基于视频文件名和时间戳)
|
||||
video_name = os.path.splitext(os.path.basename(video_path))[0]
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
txt_filename = f"{video_dir}_{timestamp}.txt"
|
||||
txt_dir = os.path.join(save_dir, "Template", video_name)
|
||||
os.makedirs(txt_dir, exist_ok=True)
|
||||
txt_path = os.path.join(txt_dir, txt_filename)
|
||||
# 准备保存内容(添加头部信息)
|
||||
content = f"""视频分析结果
|
||||
=====================================
|
||||
视频文件: {video_path}
|
||||
视频文件: {video_dir}
|
||||
分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
|
||||
=====================================
|
||||
|
||||
|
||||
@ -21,6 +21,8 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
STREAM_MODE = True
|
||||
# 全局运行时间戳,确保所有片段保存在同一个文件夹中
|
||||
RUN_TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
def find_video_dirs(video_processed_dir):
|
||||
"""查找所有包含audio_split的目录"""
|
||||
@ -33,20 +35,29 @@ def find_video_dirs(video_processed_dir):
|
||||
return video_dirs
|
||||
|
||||
|
||||
a = "/root/autodl-tmp/video_processed/成都/video_split/"
|
||||
a = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施,千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐/video_split/"
|
||||
video_dirs = find_video_dirs(a)
|
||||
|
||||
print(video_dirs[0])
|
||||
for i ,video_dir in enumerate(video_dirs):
|
||||
print(i, video_dir)
|
||||
print(f"开始批量处理,运行时间戳: {RUN_TIMESTAMP}")
|
||||
print(f"找到 {len(video_dirs)} 个视频片段")
|
||||
|
||||
base_dir = "/root/autodl-tmp/video_processed/成都"
|
||||
for i ,video_dir in enumerate(video_dirs):
|
||||
print(f"\n处理第 {i+1}/{len(video_dirs)} 个片段: {video_dir}")
|
||||
|
||||
base_dir = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施,千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐"
|
||||
|
||||
video_path = base_dir + "/video_split/" + video_dir + ".mp4"
|
||||
ocr_txt_path = base_dir + "/ocr/" + video_dir + "_subtitles_processed.txt"
|
||||
whisper_json_path = base_dir +"/whisper/" + video_dir + "_transcript.json"
|
||||
|
||||
base64_video = encode_video(video_path)
|
||||
|
||||
# 检查视频编码是否成功
|
||||
if base64_video is None:
|
||||
print(f"错误: 无法编码视频文件 {video_path}")
|
||||
print("请检查视频文件是否存在")
|
||||
continue
|
||||
|
||||
whisper_data = read_json_file(whisper_json_path)
|
||||
whisper_content = format_whisper_json(whisper_data)
|
||||
@ -177,7 +188,8 @@ for i ,video_dir in enumerate(video_dirs):
|
||||
],
|
||||
stream=STREAM_MODE,
|
||||
stream_options={"include_usage": True} if STREAM_MODE else None,
|
||||
temperature=0.5
|
||||
temperature=0.4,
|
||||
top_p = 0.3
|
||||
)
|
||||
|
||||
if STREAM_MODE:
|
||||
@ -223,11 +235,11 @@ for i ,video_dir in enumerate(video_dirs):
|
||||
print(full_response)
|
||||
|
||||
# 保存结果为TXT文件
|
||||
txt_file_path = save_result_to_txt(full_response, base_dir, video_dir)
|
||||
txt_file_path = save_result_to_txt(full_response, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP)
|
||||
# 保存结果为JSON文件
|
||||
json_file_path = save_result_to_json(full_response, base_dir, video_dir)
|
||||
# 保存使用情况信息
|
||||
usage_info_txt = save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir)
|
||||
usage_info_txt = save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP)
|
||||
|
||||
# 输出使用情况信息
|
||||
if usage_info:
|
||||
|
||||
134
code/batch_predata.py
Normal file
134
code/batch_predata.py
Normal file
@ -0,0 +1,134 @@
|
||||
import os
|
||||
import glob
|
||||
from pathlib import Path
|
||||
from pre_data_1 import read_json_file, format_ocr_json, merge_and_filter_subtitles
|
||||
|
||||
def find_ocr_json_files(base_dir):
    """Collect OCR JSON files found anywhere under a base directory.

    Two sources are combined and de-duplicated:
      * every ``*.json`` file directly inside any directory named ``ocr``;
      * every file whose name ends in ``subtitles.json``, at any depth.

    Args:
        base_dir: Root directory to search.

    Returns:
        list: ``pathlib.Path`` objects for the matching files, sorted so that
        processing order and progress numbering are reproducible across runs.
    """
    base_path = Path(base_dir)
    found = set()

    # JSON files that sit directly in a directory named "ocr".
    for ocr_dir in base_path.rglob("ocr"):
        if ocr_dir.is_dir():
            found.update(ocr_dir.glob("*.json"))

    # Subtitle JSON files regardless of which directory they live in.
    found.update(base_path.rglob("*subtitles.json"))

    # Keep regular files only (a directory may match the glob pattern), and
    # sort because set iteration order is not stable between runs.
    return sorted(path for path in found if path.is_file())
|
||||
|
||||
def process_ocr_file(ocr_json_path, iou_threshold=0.7, text_similarity_threshold=0.7):
    """Process one OCR JSON file and write the merged subtitles next to it.

    The file is loaded with ``read_json_file``, converted with
    ``format_ocr_json``, then passed to ``merge_and_filter_subtitles``. The
    resulting text is written to ``<stem>_processed.txt`` in the same
    directory as the input.

    Args:
        ocr_json_path: Path to the OCR JSON file.
        iou_threshold: IoU threshold forwarded to the merge step.
        text_similarity_threshold: Text-similarity threshold forwarded to the
            merge step.

    Returns:
        bool: True when the output file was written; False when the file
        could not be read, contained no subtitles, or any exception occurred.
    """
    try:
        print(f"\n正在处理文件: {ocr_json_path}")

        # Load the raw OCR payload; the reader signals failure with None.
        raw_ocr = read_json_file(ocr_json_path)
        if raw_ocr is None:
            print(f"跳过文件 {ocr_json_path} - 读取失败")
            return False

        # Only the subtitle list (second element) is used from here on.
        _, subtitles = format_ocr_json(raw_ocr)
        if not subtitles:
            print(f"跳过文件 {ocr_json_path} - 没有有效的字幕数据")
            return False

        merged_text, merged_subtitles = merge_and_filter_subtitles(
            subtitles,
            iou_threshold,
            text_similarity_threshold,
        )

        # Output lives beside the input: "<stem>_processed.txt".
        source = Path(ocr_json_path)
        destination = source.parent / (source.stem + "_processed.txt")

        with open(destination, 'w', encoding='utf-8') as out_file:
            out_file.write(merged_text)

        print(f"处理完成: {destination}")
        print(f"原始字幕数量: {len(subtitles)}")
        print(f"处理后字幕数量: {len(merged_subtitles)}")
        return True

    except Exception as err:
        # One bad file must not stop the batch; report it and signal failure.
        print(f"处理文件 {ocr_json_path} 时出错: {err!s}")
        return False
|
||||
|
||||
def main():
    """Find every OCR JSON file under the fixed base directory and process it.

    Prints the list of discovered files, the thresholds in use, per-file
    progress, and a final success/failure summary. Returns early (without a
    summary) when no files are found.
    """
    base_dir = "/root/autodl-tmp/video_processed2"

    print(f"开始在目录 {base_dir} 中查找OCR JSON文件...")

    ocr_files = find_ocr_json_files(base_dir)
    if not ocr_files:
        print("未找到任何OCR JSON文件")
        return

    total = len(ocr_files)

    print(f"找到 {total} 个OCR JSON文件:")
    for index, file_path in enumerate(ocr_files, start=1):
        print(f" {index}. {file_path}")

    # Thresholds applied to every file in this run.
    iou_threshold = 0.7
    text_similarity_threshold = 0.7

    print("\n开始批量处理...")
    print(f"IoU阈值: {iou_threshold}")
    print(f"文本相似度阈值: {text_similarity_threshold}")

    # Process files one by one, counting the successes.
    succeeded = 0
    for position, ocr_file in enumerate(ocr_files, start=1):
        print(f"\n进度: {position}/{total}")
        if process_ocr_file(ocr_file, iou_threshold, text_similarity_threshold):
            succeeded += 1
    failed = total - succeeded

    # Summary; total is non-zero here because of the early return above.
    print("\n批量处理完成!")
    print(f"总文件数: {total}")
    print(f"成功处理: {succeeded}")
    print(f"处理失败: {failed}")
    print(f"成功率: {succeeded / total * 100:.1f}%")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -1,31 +1,33 @@
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
def save_usage_info_to_txt(usage_info, total_duration, money, video_path, video_dir, save_dir="/root/autodl-tmp/video_llm"):
|
||||
def save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=None):
|
||||
"""
|
||||
保存API使用情况信息到TXT文件
|
||||
:param usage_info: API使用情况对象
|
||||
:param total_duration: API总响应时间
|
||||
:param money: 费用信息字典
|
||||
:param video_path: 原视频文件路径
|
||||
:param video_dir: 视频目录名
|
||||
:param save_dir: 保存目录
|
||||
:param run_timestamp: 运行时间戳,用于创建统一的文件夹
|
||||
:return: 保存的文件路径
|
||||
"""
|
||||
# 创建保存目录
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
|
||||
# 生成文件名(基于视频文件名和时间戳)
|
||||
video_name = os.path.splitext(os.path.basename(video_path))[0]
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
txt_filename = f"tokens_{video_dir}_{timestamp}.txt"
|
||||
txt_dir = os.path.join(save_dir, "cost", video_name)
|
||||
# 创建保存目录 - 每次运行创建新的时间戳文件夹
|
||||
video_name = os.path.splitext(os.path.basename(base_dir))[0]
|
||||
if run_timestamp is None:
|
||||
run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
txt_dir = os.path.join(save_dir, "cost", video_name, run_timestamp)
|
||||
os.makedirs(txt_dir, exist_ok=True)
|
||||
|
||||
# 生成文件名(只使用片段名,不包含时间戳)
|
||||
txt_filename = f"tokens_{video_dir}.txt"
|
||||
txt_path = os.path.join(txt_dir, txt_filename)
|
||||
|
||||
# 格式化使用情况信息
|
||||
usage_content = f"""API使用情况统计
|
||||
=====================================
|
||||
视频文件: {video_path}
|
||||
视频文件: {video_dir}
|
||||
统计时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
|
||||
=====================================
|
||||
|
||||
@ -118,7 +120,7 @@ if __name__ == "__main__":
|
||||
test_video_path = "/root/autodl-tmp/video/test.mp4"
|
||||
|
||||
# 测试详细版本
|
||||
save_usage_info_to_txt(test_usage_info, test_duration, test_money, test_video_path)
|
||||
save_usage_info_to_txt(test_usage_info, test_duration, test_money, test_video_path, "test_segment")
|
||||
|
||||
# 测试简化版本
|
||||
save_simple_usage_info(test_usage_info, test_duration, test_money, test_video_path)
|
||||
Loading…
x
Reference in New Issue
Block a user