# hot_video_analyse/token_counter.py

import tiktoken
import os
import cv2
def count_tokens(text, model="gpt-4"):
    """Count the tokens in *text* using the tokenizer for *model*.

    If tiktoken is unavailable or fails, falls back to a rough heuristic:
    CJK characters count as ~1.5 tokens each and ASCII words as ~1.3.
    """
    try:
        return len(tiktoken.encoding_for_model(model).encode(text))
    except Exception as e:
        print(f"Token统计出错: {e}")
        # Heuristic fallback — good enough for budgeting, not exact.
        cjk_chars = sum('\u4e00' <= ch <= '\u9fff' for ch in text)
        ascii_words = sum(1 for word in text.split() if word.isascii())
        return int(cjk_chars * 1.5 + ascii_words * 1.3)
def get_video_token_estimate(video_path):
    """Estimate the token cost of a video using a GPT-4V-style heuristic.

    The estimate is a fixed base cost plus a per-frame cost, sampling at
    most one frame per second of video.

    Args:
        video_path: Path to the video file on disk.

    Returns:
        dict with keys 'estimated_tokens', 'duration' (seconds),
        'frame_count', 'fps', 'file_size_mb', 'frames_used'.
        All values are zero when the video cannot be opened or read.
    """
    zero_info = {'estimated_tokens': 0, 'duration': 0, 'frame_count': 0,
                 'fps': 0, 'file_size_mb': 0, 'frames_used': 0}
    try:
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                return dict(zero_info)
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        finally:
            # Always release the capture handle; the previous version
            # leaked it on the not-opened path and on any exception.
            cap.release()

        duration = frame_count / fps if fps > 0 else 0
        file_size = os.path.getsize(video_path)

        # GPT-4V-style heuristic: base cost + frames * per-frame cost,
        # with sampling capped at 1 frame per second.
        base_tokens = 85
        frames_per_second = min(fps, 1)
        total_frames = min(frame_count, int(duration * frames_per_second))
        tokens_per_frame = 170
        estimated_tokens = base_tokens + total_frames * tokens_per_frame

        return {
            'estimated_tokens': int(estimated_tokens),
            'duration': duration,
            'frame_count': frame_count,
            'fps': fps,
            'file_size_mb': file_size / (1024 * 1024),
            'frames_used': total_frames,
        }
    except Exception as e:
        print(f"视频token估算出错: {e}")
        return dict(zero_info)
def analyze_input_tokens(video_path, text_content="", prompt_text=""):
    """Print a token-usage breakdown for one request and return the totals.

    Combines the video estimate with text/prompt token counts.

    Returns:
        dict with 'video_tokens', 'text_tokens', 'prompt_tokens',
        'total_input_tokens' and the raw 'video_info' estimate dict.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("📊 Token统计信息:")
    print(divider)

    # Video contribution.
    video_info = get_video_token_estimate(video_path)
    print(f"🎬 视频Token统计:")
    print(f" 估算Token数量: {video_info['estimated_tokens']:,}")
    print(f" 视频时长: {video_info['duration']:.2f}")
    print(f" 总帧数: {video_info['frame_count']:,}")
    print(f" 帧率: {video_info['fps']:.2f} fps")
    print(f" 文件大小: {video_info['file_size_mb']:.2f} MB")
    print(f" 使用帧数: {video_info['frames_used']:,}")

    # Text contribution (reported only when non-blank).
    text_tokens = 0
    if text_content.strip():
        text_tokens = count_tokens(text_content)
        print(f"\n📝 文本Token统计:")
        print(f" 文本内容Token: {text_tokens:,}")
        print(f" 文本字符数: {len(text_content):,}")

    # Prompt contribution (reported only when non-blank).
    prompt_tokens = 0
    if prompt_text.strip():
        prompt_tokens = count_tokens(prompt_text)
        print(f" 提示词Token: {prompt_tokens:,}")

    total_input_tokens = video_info['estimated_tokens'] + text_tokens + prompt_tokens
    print(f"\n📈 总输入Token统计:")
    print(f" 视频Token: {video_info['estimated_tokens']:,}")
    print(f" 文本Token: {text_tokens:,}")
    print(f" 提示词Token: {prompt_tokens:,}")
    print(f" 🔥 总输入Token: {total_input_tokens:,}")
    print(divider)

    return {
        'video_tokens': video_info['estimated_tokens'],
        'text_tokens': text_tokens,
        'prompt_tokens': prompt_tokens,
        'total_input_tokens': total_input_tokens,
        'video_info': video_info,
    }
if __name__ == "__main__":
    # Smoke-test the token counters against a sample video and bilingual text.
    sample_video = "/root/autodl-tmp/new/哈尔滨.mp4"
    sample_text = "这是一个测试文本包含中英文内容。This is a test text with Chinese and English content."
    sample_prompt = "请分析这个视频的内容。"
    summary = analyze_input_tokens(sample_video, sample_text, sample_prompt)
    print(f"\n测试结果: {summary}")