video_template_gen/code/batch_api_video.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
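Batch video-analysis script.

For every entry under the ``video_split`` directory, send the video together
with its OCR subtitle text and Whisper transcript to the ``qwen-omni-turbo``
model (DashScope OpenAI-compatible endpoint), then save the generated
description and the token-usage / cost information.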
"""

import os
from re import A
import sys
import subprocess
from pathlib import Path
import argparse
import logging
from openai import OpenAI
from save_usage_info import save_usage_info_to_txt, save_simple_usage_info
from api_video import *

# Logging setup: timestamp, level and message for every record, INFO and above.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# When True the chat completion is requested with stream=True (and usage
# reporting enabled), and the reply is assembled chunk by chunk.
STREAM_MODE = True

def find_video_dirs(video_processed_dir):
    """Return the stem of every entry directly inside ``video_processed_dir``.

    Despite the name, no filtering is applied: every direct child (file or
    directory) contributes its name without the final suffix, e.g.
    ``clip_01.mp4`` -> ``"clip_01"``. Each stem is also printed as progress
    output.

    Entries are visited in sorted order so that batch runs (and the
    "first entry" debug print done by the caller) are reproducible;
    ``Path.iterdir`` on its own yields entries in arbitrary order.

    Args:
        video_processed_dir: Directory to scan (``str`` or path-like).

    Returns:
        list[str]: Stems of the directory's entries, ordered by entry path.

    Raises:
        OSError: Propagated from ``Path.iterdir`` (e.g. the directory does
            not exist or is not a directory).
    """
    video_dirs = []
    for video_dir in sorted(Path(video_processed_dir).iterdir()):
        print(video_dir.stem)
        video_dirs.append(video_dir.stem)
    return video_dirs


# ---------------------------------------------------------------------------
# Batch driver.
#
# For each entry found under <base_dir>/video_split/ this script:
#   1. loads the video (via encode_video), the OCR subtitle text and the
#      Whisper transcript that share the entry's stem,
#   2. sends them with a fixed analysis prompt to the chat-completions API,
#   3. assembles the streamed reply, estimates the cost from the usage chunk,
#   4. saves the reply and the usage information (save_result_to_txt /
#      save_usage_info_to_txt).
#
# encode_video, read_json_file, format_whisper_json, save_result_to_txt,
# datetime and time are expected to come from `from api_video import *`.
# ---------------------------------------------------------------------------
base_dir = "/root/autodl-tmp/video_processed/成都"
a = base_dir + "/video_split/"
video_dirs = find_video_dirs(a)

# Debug print of the first entry; guarded because indexing an empty listing
# would raise IndexError before anything useful happens.
if video_dirs:
    print(video_dirs[0])
else:
    print(f"No entries found in {a}")

# Analysis prompt. It does not depend on the video, so it is built once
# instead of on every loop iteration. The text is sent to the model verbatim.
prompt_text = """🎥 **抖音短视频内容分析专家**
        ## 任务背景
    您是一位经验丰富的视频导演和编辑，需要基于以上OCR和Whisper的两个时间轴数据，和视频内容。为视频写一个完整、流畅的脚本。
    请对这个抖音短视频进行详细的内容分析，重点关注以下两个方面：
    ## 🎤 一、口播内容提取
    请仔细听取视频中的语音内容，完整转录：
    - **完整口播转录**：逐字逐句转录所有口语表达
    - **语音时长**：估算总的讲话时长
    ## 📝 二、字幕文字识别
    请识别视频画面中出现的所有文字内容：
    - **屏幕字幕**：视频中显示的字幕文字（包括自动字幕和手动添加的字幕）
    - **标题文字**：视频开头、中间、结尾出现的大标题


    ## 📊 输出格式要求

    ## 视频内容分析
    请按照以下JSON格式输出视频描述：

    {
        "total_Oral broadcasting":"请你生成一个完整的口播内容。",
        "summary": "请用一句话总结视频的核心内容，突出视频的主要卖点和价值主张",
        "content": [
            {
                "id": 跟随Whisper口播转文字内容中的id,
                "start": 跟随Whisper口播转文字内容中的start,
                "end": 跟随Whisper口播转文字内容中的end,
                "talk": "请将对应时间的口播或字幕信息，填入此",
                "subtitles": "跟随OCR字幕识别内容的文本",
                "description": "跳转到视频对应时间，将视频对应时间的图片，描述这个镜头的画面内容、人物动作、场景特点等。不要重复描述。"
            },
        ]
    }

    ## 输出要求
    1. summary：用一句话概括视频核心内容，突出主要卖点
    2. content的时间轴要与whisper的保持一致
    2. content：按时间顺序交替描述镜头和转场
        描述：
        * id：镜头序号，从1开始递增
        * start：开始时间（秒），精确到小数点后一位
        * end：结束时间（秒），精确到小数点后一位
        * talk：该镜头中的对话或文字内容
        * subtitles：该镜头中的字幕内容
        * description：详细描述镜头内容，包括：
        - 画面构图和场景
        - 人物动作和表情
        - 重要道具和元素
        - 特殊效果和转场


    ## 注意事项
    1. 保持描述简洁明了，但要有足够的细节
    2. 突出视频的亮点和特色
    3. 确保时间戳的准确性
    4. 对话内容要符合视频画面
    5. 整体风格要统一连贯
    6. 每个镜头的描述要包含关键信息

    请根据以上要求，分析视频并输出JSON格式的描述。

    请开始详细分析这个抖音短视频："""

# The API key is read from the environment rather than committed to the
# source file. Failing early with a clear message also prevents the OpenAI
# client from silently falling back to an unrelated OPENAI_API_KEY.
_api_key = os.getenv("DASHSCOPE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "DASHSCOPE_API_KEY is not set; export it before running this script."
    )

# One client is shared by all requests instead of being rebuilt per video.
client = OpenAI(
    api_key=_api_key,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)


def _token_count(details, field):
    """Return ``details.<field>``, treating a missing or ``None`` value as 0."""
    return getattr(details, field, None) or 0


for i, video_dir in enumerate(video_dirs):
    print(i, video_dir)

    # Inputs for this video, all keyed by the same stem.
    video_path = f"{base_dir}/video_split/{video_dir}.mp4"
    ocr_txt_path = f"{base_dir}/ocr/{video_dir}_subtitles_processed.txt"
    whisper_json_path = f"{base_dir}/whisper/{video_dir}_transcript.json"

    # The encoded video is embedded in a base64 data: URL below.
    base64_video = encode_video(video_path)

    whisper_data = read_json_file(whisper_json_path)
    whisper_content = format_whisper_json(whisper_data)

    # Explicit encoding: the OCR text is Chinese, and the platform default
    # encoding is locale-dependent.
    with open(ocr_txt_path, 'r', encoding='utf-8') as file:
        ocr_content = file.read()

    # Merge the reference material: OCR text first, then the Whisper
    # transcript, each followed by a blank line; empty parts are skipped.
    txt_content = "".join(
        part + "\n\n" for part in (ocr_content, whisper_content) if part
    )

    print(txt_content)

    # Message content: the video itself followed by a single text part.
    content_list = [
        {
            # When passing a video file directly, "type" must be "video_url".
            "type": "video_url",
            "video_url": {"url": f"data:video/mp4;base64,{base64_video}"},
        }
    ]

    # Text part: the reference material followed by the analysis prompt.
    prompt_text_with_references = f"""🎥 **抖音短视频内容分析专家**
    ## 📋 参考资料内容
    【OCR转文字内容】
    {txt_content}+{prompt_text}"""

    content_list.append({
        "type": "text",
        "text": prompt_text_with_references
    })

    print("\n开始请求API...")
    print(f"请求时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Stream模式: {STREAM_MODE}")
    print(f"Content项目数量: {len(content_list)}")

    # Start of the API request, used for first-token latency and total time.
    api_start_time = time.time()
    completion = client.chat.completions.create(
        model="qwen-omni-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": content_list
            }
        ],
        stream=STREAM_MODE,
        # Ask for a usage chunk in the stream so the cost can be estimated.
        stream_options={"include_usage": True} if STREAM_MODE else None,
        temperature=0.5
    )

    # NOTE: only the streaming response is consumed; with STREAM_MODE set to
    # False the completion is requested but nothing is printed or saved.
    if STREAM_MODE:
        full_response = ""
        usage_info = None
        money = {}
        first_token_time = None

        print("正在生成回复...")
        for chunk in completion:
            if chunk.choices:
                delta = chunk.choices[0].delta
                if delta.content:
                    # Report the latency of the first content token once.
                    if first_token_time is None:
                        first_token_time = time.time()
                        first_token_delay = first_token_time - api_start_time
                        print(f"首个token延迟: {first_token_delay:.2f} 秒")

                    full_response += delta.content

            elif chunk.usage is not None:
                # A chunk without choices carries the usage statistics.
                # Missing usage or token fields are tolerated so that a
                # reply that has already been generated is still saved.
                usage_info = chunk.usage
                details = usage_info.prompt_tokens_details
                # Cost estimate: tokens * price per 1000 tokens / 1000.
                # The misspelled "*_momey" keys are kept unchanged because
                # the dict is handed to save_usage_info_to_txt, which
                # presumably reads them by name.
                money["output_momey"] = (usage_info.completion_tokens or 0) * 0.0045 / 1000
                money["prompt_momey"] = _token_count(details, "text_tokens") * 0.0004 / 1000
                money["video_momey"] = _token_count(details, "video_tokens") * 0.0015 / 1000
                money["audio_momey"] = _token_count(details, "audio_tokens") * 0.025 / 1000
                money["sum_momey"] = (
                    money["output_momey"]
                    + money["prompt_momey"]
                    + money["video_momey"]
                    + money["audio_momey"]
                )
                print(usage_info)

        # Total wall-clock time of the request, including streaming.
        api_end_time = time.time()
        total_duration = api_end_time - api_start_time

        # Print the assembled reply.
        print("\n" + "="*50)
        print("完整回复:")
        print("="*50)
        print(full_response)

        # Persist the reply and the usage / cost information.
        save_result_to_txt(full_response, base_dir, video_dir)
        save_usage_info_to_txt(usage_info, total_duration, money, base_dir, video_dir)

        # Print the usage information when the stream reported it.
        if usage_info:
            print("\n" + "="*50)
            print("📈 使用情况:")
            print("="*50)
            print(usage_info)