#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Simple batch Whisper transcription script.

Uses the original whisper_audio_transcribe.py directly.
"""

import os
import sys
import subprocess
import argparse
import logging
import time
from datetime import datetime
from pathlib import Path

from openai import OpenAI

from save_usage_info import save_usage_info_to_txt, save_simple_usage_info
from api_video import *
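
# ---------------------------------------------------------------------------
# encode_video, read_json_file and format_whisper_json (and the save_result_*
# helpers used further below) are expected to come from api_video. Their real
# implementations are not part of this file; the guarded definitions below are
# only a minimal sketch of the assumed behaviour, inferred from how this script
# uses them, and are skipped whenever api_video already provides the functions.
# ---------------------------------------------------------------------------
if "encode_video" not in globals():
    import base64

    def encode_video(video_path):
        """Assumed behaviour: return the file as a base64 string, or None on failure."""
        try:
            with open(video_path, "rb") as f:
                return base64.b64encode(f.read()).decode("utf-8")
        except OSError:
            return None

if "read_json_file" not in globals():
    import json

    def read_json_file(json_path):
        """Assumed behaviour: load and return the JSON document at json_path."""
        with open(json_path, "r", encoding="utf-8") as f:
            return json.load(f)

if "format_whisper_json" not in globals():
    def format_whisper_json(whisper_data):
        """Assumed behaviour: flatten Whisper segments into 'id [start - end] text' lines."""
        segments = whisper_data.get("segments", []) if isinstance(whisper_data, dict) else whisper_data
        return "\n".join(
            f"{seg.get('id')} [{seg.get('start')} - {seg.get('end')}] {seg.get('text', '')}"
            for seg in segments
        )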

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Stream the chat completion instead of waiting for the full response
STREAM_MODE = True

# Global run timestamp so that all segments are saved into the same folder
RUN_TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")


def find_video_dirs(video_processed_dir):
    """Return the stem names of every entry under the given directory."""
    video_processed_path = Path(video_processed_dir)
    video_dirs = []

    for video_dir in video_processed_path.iterdir():
        print(video_dir.stem)
        video_dirs.append(video_dir.stem)

    return video_dirs
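

# Expected layout under each processed-video directory (inferred from the path
# construction in the loop below; an assumption, not a verified spec):
#   <base_dir>/video_split/<segment>.mp4              -- split video segments
#   <base_dir>/ocr/<segment>_subtitles_processed.txt  -- processed OCR subtitles
#   <base_dir>/whisper/<segment>_transcript.json      -- Whisper transcript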

video_split_dir = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施,千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐/video_split/"

video_dirs = find_video_dirs(video_split_dir)
print(video_dirs[0])

print(f"Starting batch processing, run timestamp: {RUN_TIMESTAMP}")
print(f"Found {len(video_dirs)} video segments")


for i, video_dir in enumerate(video_dirs):

    print(f"\nProcessing segment {i+1}/{len(video_dirs)}: {video_dir}")

    base_dir = "/root/autodl-tmp/video_processed2/_众晖旅游旅行社_39688660792_Q5fQ6OvK1Xw_ 如果你要来恩施,千万别来错时间了。 _旅行推荐官 _带你去旅行 _恩施旅游攻略 _湖北旅游景点推荐"

    video_path = base_dir + "/video_split/" + video_dir + ".mp4"
    ocr_txt_path = base_dir + "/ocr/" + video_dir + "_subtitles_processed.txt"
    whisper_json_path = base_dir + "/whisper/" + video_dir + "_transcript.json"

    base64_video = encode_video(video_path)

    # Check whether the video was encoded successfully
    if base64_video is None:
        print(f"Error: could not encode video file {video_path}")
        print("Please check that the video file exists")
        continue

    whisper_data = read_json_file(whisper_json_path)
    whisper_content = format_whisper_json(whisper_data)

    with open(ocr_txt_path, 'r', encoding='utf-8') as file:
        ocr_content = file.read()

    # Merge the OCR and Whisper content
    txt_content = ""
    if ocr_content:
        txt_content += ocr_content + "\n\n"
    if whisper_content:
        txt_content += whisper_content + "\n\n"

    print(txt_content)

    # Prompt sent to the model
    prompt_text = """🎥 **Douyin short-video content analysis expert**

## Task background
You are an experienced video director and editor. Based on the OCR and Whisper timeline data above and on the video content itself, write a complete, fluent script for the video.
Please analyse this Douyin short video in detail, focusing on the following two aspects:

## 🎤 1. Voice-over extraction
Listen carefully to the speech in the video and transcribe it in full:
- **Complete voice-over transcript**: transcribe every spoken sentence word for word
- **Speech duration**: estimate the total speaking time

## 📝 2. On-screen text recognition
Identify all text that appears in the video frames:
- **Screen subtitles**: subtitle text shown in the video (both auto-generated and manually added subtitles)
- **Title text**: large titles appearing at the beginning, middle, or end of the video

## 📊 Output format requirements

## Video content analysis
Output the video description in the following JSON format:

{
  "total_Oral broadcasting": "Generate the complete voice-over content here.",
  "summary": "Summarise the core content of the video in one sentence, highlighting its main selling points and value proposition",
  "content": [
    {
      "id": follows the id in the Whisper transcript,
      "start": follows the start time in the Whisper transcript,
      "end": follows the end time in the Whisper transcript,
      "talk": "Fill in the voice-over or subtitle text for this time range",
      "subtitles": "Text taken from the OCR subtitle recognition",
      "description": "Jump to the corresponding time in the video and describe that shot: the framing, the people and their actions, the scene characteristics, and so on. Do not repeat descriptions."
    },
  ]
}

## Output requirements
1. summary: summarise the core content of the video in one sentence, highlighting the main selling points
2. The content timeline must stay consistent with the Whisper timeline
3. content: describe shots and transitions alternately, in chronological order
Field descriptions:
* id: shot number, incrementing from 1
* start: start time in seconds, to one decimal place
* end: end time in seconds, to one decimal place
* talk: the dialogue or spoken text in this shot
* subtitles: the subtitle text in this shot
* description: a detailed description of the shot, including:
  - framing and scene
  - people's actions and expressions
  - important props and elements
  - special effects and transitions

## Notes
1. Keep descriptions concise but sufficiently detailed
2. Highlight the video's strengths and distinctive features
3. Make sure the timestamps are accurate
4. Dialogue content must match the video frames
5. Keep the overall style consistent and coherent
6. Each shot description must contain the key information

Following the requirements above, analyse the video and output the description in JSON format.

Please begin the detailed analysis of this Douyin short video:"""

    client = OpenAI(
        # Read the API key from the DASHSCOPE_API_KEY environment variable; if it is not
        # configured, replace the line below with your Bailian API key: api_key="sk-xxx"
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )

    # Build the content list
    content_list = [
        {
            # When passing a video file directly, set type to "video_url"
            "type": "video_url",
            "video_url": {"url": f"data:video/mp4;base64,{base64_video}"},
        }
    ]
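
    # The segment is sent inline as a base64 data URL rather than as a hosted file;
    # this assumes each split segment is small enough to stay within the API's
    # request-size limits (an assumption, not something this script verifies).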

    # Add the main prompt text, with the merged OCR/Whisper reference material included
    prompt_text_with_references = f"""🎥 **Douyin short-video content analysis expert**

## 📋 Reference material

[OCR and Whisper transcript content]
{txt_content}

{prompt_text}"""

    content_list.append({
        "type": "text",
        "text": prompt_text_with_references
    })

    print("\nSending API request...")
    print(f"Request time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Stream mode: {STREAM_MODE}")
    print(f"Number of content items: {len(content_list)}")

    # Record the API request start time
    api_start_time = time.time()

    completion = client.chat.completions.create(
        model="qwen-omni-turbo",
        # model="/root/autodl-tmp/llm/Qwen-omni",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant."
                # "content": [{"type": "text", "text": "You are a helpful assistant."}]
            },
            {
                "role": "user",
                "content": content_list
            }
        ],
        stream=STREAM_MODE,
        # With streaming enabled, ask the API to include token usage in the final chunk
        stream_options={"include_usage": True} if STREAM_MODE else None,
        temperature=0.4,
        top_p=0.3,
    )

    if STREAM_MODE:
        # Streaming output -- accumulate the complete reply
        full_response = ""
        usage_info = None
        money = {}
        # Time at which the first token arrives
        first_token_time = None

        print("Generating reply...")
        for chunk in completion:
            if chunk.choices:
                delta = chunk.choices[0].delta
                if delta.content:
                    # Record the time of the first token
                    if first_token_time is None:
                        first_token_time = time.time()
                        first_token_delay = first_token_time - api_start_time
                        print(f"First-token latency: {first_token_delay:.2f} s")

                    # Append the streamed content
                    full_response += delta.content

            else:
                # The final streamed chunk has no choices and carries the token usage
                usage_info = chunk.usage
                # Cost estimate derived from per-1K-token prices
                money["output_momey"] = chunk.usage.completion_tokens * 0.0045 / 1000
                money["prompt_momey"] = chunk.usage.prompt_tokens_details.text_tokens * 0.0004 / 1000
                money["video_momey"] = chunk.usage.prompt_tokens_details.video_tokens * 0.0015 / 1000
                money["audio_momey"] = chunk.usage.prompt_tokens_details.audio_tokens * 0.025 / 1000
                money["sum_momey"] = money["output_momey"] + money["prompt_momey"] + money["video_momey"] + money["audio_momey"]
                print(usage_info)
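
    else:
        # Non-streaming path: a minimal fallback (an assumption -- it mirrors the
        # bookkeeping of the streaming branch, without the per-modality cost
        # breakdown) so that full_response, usage_info and money are defined below.
        first_token_time = None
        full_response = completion.choices[0].message.content
        usage_info = completion.usage
        money = {}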

    # Record the API request end time
    api_end_time = time.time()
    total_duration = api_end_time - api_start_time

    # Print the complete response
    print("\n" + "="*50)
    print("Full reply:")
    print("="*50)
    print(full_response)

    # Save the result as a TXT file
    txt_file_path = save_result_to_txt(full_response, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP)

    # Save the result as a JSON file
    json_file_path = save_result_to_json(full_response, base_dir, video_dir)

    # Save the usage information
    usage_info_txt = save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir, save_dir="/root/autodl-tmp/video_llm", run_timestamp=RUN_TIMESTAMP)
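
    # NOTE: save_result_to_txt and save_result_to_json are assumed to be provided by
    # api_video (imported with *); the exact output-file layout is decided there.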

    # Print the usage information
    if usage_info:
        print("\n" + "="*50)
        print("📈 Usage:")
        print("="*50)
        print(usage_info)
        # print(base64_video)