# hot_video_analyse/code/token_counter.py

import tiktoken
import os
import cv2
def count_tokens(text, model="gpt-4"):
    """Count the number of tokens in *text* for the given model.

    Tries tiktoken's model-specific encoding first. If that fails for
    any reason (unknown model name, tiktoken unavailable, ...) it falls
    back to a rough heuristic: ~1.5 tokens per CJK character plus
    ~1.3 tokens per ASCII whitespace-separated word.
    """
    try:
        return len(tiktoken.encoding_for_model(model).encode(text))
    except Exception as e:
        print(f"Token统计出错: {e}")
        # Heuristic fallback: count CJK characters and ASCII words separately.
        cjk_count = 0
        for ch in text:
            if '\u4e00' <= ch <= '\u9fff':
                cjk_count += 1
        ascii_words = sum(1 for word in text.split() if word.isascii())
        return int(cjk_count * 1.5 + ascii_words * 1.3)
def get_video_token_estimate(video_path):
    """Estimate the token cost of sending the video at *video_path* to a vision model.

    Opens the video with OpenCV to read fps and frame count, derives the
    duration, and applies a GPT-4V-style estimate:
    tokens = 85 (base) + frames_used * 170, sampling at most 1 frame per second.

    Returns a dict with keys: estimated_tokens, duration (seconds),
    frame_count, fps, file_size_mb, frames_used. On any failure a
    zeroed-out dict with the same keys is returned instead of raising.
    """
    empty_result = {'estimated_tokens': 0, 'duration': 0, 'frame_count': 0,
                    'fps': 0, 'file_size_mb': 0, 'frames_used': 0}
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return dict(empty_result)
        try:
            # Basic stream properties.
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        finally:
            # Always release the capture handle, even if a property read
            # raises (the previous version leaked it on such errors).
            cap.release()
        duration = frame_count / fps if fps > 0 else 0
        file_size = os.path.getsize(video_path)
        # GPT-4V-style estimate: base tokens + sampled frames * per-frame cost.
        base_tokens = 85                       # fixed per-request overhead
        frames_per_second = min(fps, 1)        # sample at most 1 frame/s
        frames_used = min(frame_count, int(duration * frames_per_second))
        tokens_per_frame = 170                 # approximate tokens per frame
        estimated_tokens = base_tokens + frames_used * tokens_per_frame
        return {
            'estimated_tokens': int(estimated_tokens),
            'duration': duration,
            'frame_count': frame_count,
            'fps': fps,
            'file_size_mb': file_size / (1024 * 1024),
            'frames_used': frames_used,
        }
    except Exception as e:
        print(f"视频token估算出错: {e}")
        return dict(empty_result)
def analyze_input_tokens(video_path, text_content="", prompt_text=""):
    """Print a token-usage report for one request and return the totals.

    Combines the video token estimate (get_video_token_estimate) with
    tiktoken counts for the text content and the prompt, prints a
    human-readable breakdown, and returns a summary dict with keys:
    video_tokens, text_tokens, prompt_tokens, total_input_tokens,
    video_info (the full video dict), total_cost.

    Note: total_cost uses hard-coded per-1K-token rates (0.0015 video,
    0.0004 text); currency/units are not specified in this file.
    """
    print("\n" + "="*50)
    print("📊 Token统计信息:")
    print("="*50)
    # Video token estimate (zeroed dict if the video could not be read).
    video_token_info = get_video_token_estimate(video_path)
    print(f"🎬 视频Token统计:")
    print(f" 估算Token数量: {video_token_info['estimated_tokens']:,}")
    print(f" 视频时长: {video_token_info['duration']:.2f}")
    print(f" 总帧数: {video_token_info['frame_count']:,}")
    print(f" 帧率: {video_token_info['fps']:.2f} fps")
    print(f" 文件大小: {video_token_info['file_size_mb']:.2f} MB")
    print(f" 使用帧数: {video_token_info['frames_used']:,}")
    # Text tokens — only counted/printed when non-blank.
    text_tokens = 0
    if text_content.strip():
        text_tokens = count_tokens(text_content)
        print(f"\n📝 文本Token统计:")
        print(f" 文本内容Token: {text_tokens:,}")
        print(f" 文本字符数: {len(text_content):,}")
    # Prompt tokens — only counted/printed when non-blank.
    prompt_tokens = 0
    if prompt_text.strip():
        prompt_tokens = count_tokens(prompt_text)
        print(f" 提示词Token: {prompt_tokens:,}")
    # Per-1K-token rates; prompt text is billed at the text rate.
    video_cost = 0.0015
    text_cost = 0.0004
    total_cost = (video_token_info['estimated_tokens']*video_cost + text_tokens*text_cost + prompt_tokens*text_cost)/1000
    # Total input tokens across all three sources.
    total_input_tokens = (video_token_info['estimated_tokens'] + text_tokens + prompt_tokens)
    print(f"\n📈 总输入Token统计:")
    print(f" 视频Token: {video_token_info['estimated_tokens']:,}")
    print(f" 文本Token: {text_tokens:,}")
    print(f" 提示词Token: {prompt_tokens:,}")
    print(f" 🔥 总输入Token: {total_input_tokens:,}")
    print(f" 💰 总费用: {total_cost:.4f}")
    print("="*50)
    return {
        'video_tokens': video_token_info['estimated_tokens'],
        'text_tokens': text_tokens,
        'prompt_tokens': prompt_tokens,
        'total_input_tokens': total_input_tokens,
        'video_info': video_token_info,
        'total_cost': total_cost
    }
if __name__ == "__main__":
    # Smoke-test the token counting helpers on a sample video.
    sample_video = "/root/autodl-tmp/new/哈尔滨.mp4"
    sample_text = "这是一个测试文本包含中英文内容。This is a test text with Chinese and English content."
    sample_prompt = "请分析这个视频的内容。"
    summary = analyze_input_tokens(sample_video, sample_text, sample_prompt)
    print(f"\n测试结果: {summary}")