From cdcc55f4b47c0358446fd1bf90a4ccc1bd02dc48 Mon Sep 17 00:00:00 2001
From: Shuang_Dong <374191531@qq.com>
Date: Mon, 30 Jun 2025 15:43:52 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86=E8=AE=A1=E7=AE=97to?=
 =?UTF-8?q?kens=E8=B4=B9=E7=94=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 code/api_video.py              | 136 ++++++++++++++++++++++++++++++---
 code/ocr_subtitle_extractor.py |   6 +-
 token_counter.py               | 112 +++++++++++++++++++++++++++
 3 files changed, 243 insertions(+), 11 deletions(-)
 create mode 100644 token_counter.py

diff --git a/code/api_video.py b/code/api_video.py
index 235cfc1..0a07898 100644
--- a/code/api_video.py
+++ b/code/api_video.py
@@ -3,7 +3,7 @@ import os
 import base64
 import time
 from datetime import datetime
-
+from token_counter import *
 
 #  Base64 编码格式
 def encode_video(video_path):
@@ -231,16 +231,16 @@ def save_result_to_txt(response_text, video_path, save_dir="/root/autodl-tmp/fin
 STREAM_MODE = True
 
 # 文件路径配置
-video_path = "/root/autodl-tmp/new/西藏2.mp4"
+video_path = "/root/autodl-tmp/new/哈尔滨.mp4"
 #audio_path = "/root/autodl-tmp/video2audio/sample_demo_6.wav"
 #txt_path = "/root/autodl-tmp/hot_video_analyse/source/example_reference.txt"  # 使用示例参考文档
 
 # JSON文件路径配置
-speech_json_path = "/root/autodl-tmp/new_sensevoice/西藏2_sensevoice.json"  # 口播转文字JSON文件
-ocr_json_path = "/root/autodl-tmp/new_cnocr/西藏2_subtitles.json"     # OCR字幕转文字JSON文件
+speech_json_path = "/root/autodl-tmp/new_sensevoice/哈尔滨_sensevoice.json"  # 口播转文字JSON文件
+ocr_json_path = "/root/autodl-tmp/new_cnocr/哈尔滨_subtitles.json"     # OCR字幕转文字JSON文件
 #clip_json_path = "/root/autodl-tmp/02_VideoSplitter/VideoSplitter_output/shou_gonglve_3_scenes.json"
-whisper_json_path = "/root/autodl-tmp/new_whisper/西藏2_transcript.json"  # Whisper转文字JSON文件
-ocr_txt_path = "/root/autodl-tmp/new_cnocr/西藏2_subtitles_processed.txt"
+whisper_json_path = "/root/autodl-tmp/new_whisper/哈尔滨_transcript.json"  # Whisper转文字JSON文件
+ocr_txt_path = "/root/autodl-tmp/new_cnocr/哈尔滨_subtitles_processed.txt"
 # 编码文件
 print("开始编码文件...")
 encode_start_time = time.time()
@@ -283,6 +283,122 @@ encode_duration = encode_end_time - encode_start_time
 print(f"文件编码完成，耗时: {encode_duration:.2f} 秒")
 
 
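+# Prompt text passed to the token counter below. NOTE(review): it appears to duplicate the prompt appended to content_list further down — keep the two in sync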
+prompt_text = """🎥 **抖音短视频内容分析专家**
+    ## 任务背景
+您是一位经验丰富的视频导演和编辑，需要基于以上两个时间轴数据，和视频内容。为视频写一个完整、流畅的脚本。
+请对这个抖音短视频进行详细的内容分析，重点关注以下三个方面：
+## 🎤 一、口播内容提取
+请仔细听取视频中的语音内容，完整转录：
+- **完整口播转录**：逐字逐句转录所有口语表达
+- **语音时长**：估算总的讲话时长
+## 📝 二、字幕文字识别
+请识别视频画面中出现的所有文字内容：
+- **屏幕字幕**：视频中显示的字幕文字（包括自动字幕和手动添加的字幕）
+- **标题文字**：视频开头、中间、结尾出现的大标题
+
+## 🎬 三、转场效果分析
+请仔细观察视频中的转场效果，并且结合参考资料中的转场内容，请你整体分析一下视频。比如几个画面出现第一个转场等.
+转场的time_start","time_end","textIdx"请严格按照参考资料中的口播内容的时间戳start,end,id填写，不要自己生成。
+
+
+## 📊 输出格式要求
+
+## 视频内容分析
+请按照以下JSON格式输出视频描述：
+
+{
+    "total_Oral broadcasting":"请你生成一个完整的口播内容。",
+    "summary": "请用一句话总结视频的核心内容，突出视频的主要卖点和价值主张",
+    "content": [
+        {
+            "type": "cut",
+            "scenes": 1,
+            "time_start": 0.0,
+            "time_end": 2.0,
+            "talk": "请将对应时间的口播或字幕信息，填入此",
+            "description": "跳转到视频对应时间，将视频对应时间的图片，描述这个镜头的画面内容、人物动作、场景特点等。不要重复描述。"
+        },
+
+        {
+            "type": "cut",
+            "scenes": 2,
+            "time_start": 2.0,
+            "time_end": 4.5,
+            "talk": "请将对应时间的口播或字幕信息，填入此",
+            "description": "跳转到视频对应时间，将视频对应时间的图片，详细描述这个镜头的画面内容、人物动作、场景特点等。不要描述视频内容，只描述这个镜头的画面内容，不要重复描述。"
+        },
+
+        {
+            "type": "cut",
+            "scenes": 3,
+            "time_start": 4.5,
+            "time_end": 6.0,
+            "talk": "请将对应时间的口播或字幕信息，填入此",
+            "description": "跳转到视频对应时间，将视频对应时间的图片，详细描述这个镜头的画面内容、人物动作、场景特点等。不要描述视频内容，只描述这个镜头的画面内容，不要重复描述。"
+        }
+    ]
+}
+
+## 输出要求
+1. summary：用一句话概括视频核心内容，突出主要卖点
+2. content：按时间顺序交替描述镜头和转场
+   - 镜头(lens)描述：
+     * textIdx：镜头序号，从1开始递增
+     * time_start：开始时间（秒），精确到小数点后一位
+     * time_end：结束时间（秒），精确到小数点后一位
+     * talk：该镜头中的对话或文字内容
+     * description：详细描述镜头内容，包括：
+       - 画面构图和场景
+       - 人物动作和表情
+       - 重要道具和元素
+       - 特殊效果和转场
+
+
+## 注意事项
+1. 保持描述简洁明了，但要有足够的细节
+2. 突出视频的亮点和特色
+3. 确保时间戳的准确性
+4. 对话内容要符合视频画面
+5. 整体风格要统一连贯
+6. 每个镜头的描述要包含关键信息
+
+## 示例内容描述
+1. 镜头1：
+   - 开场特写镜头，展示产品外观
+   - 画面从模糊到清晰，突出产品细节
+   - 背景音乐渐入，营造氛围
+   - 文字提示："全新升级，品质保证"
+
+2. 转场1-2：
+   - 类型：平滑滑动
+   - 目的：自然过渡到使用场景
+   - 效果：画面从产品特写平滑滑向使用场景
+
+3. 镜头2：
+   - 中景展示使用场景
+   - 人物自然流畅的动作展示
+   - 光线明亮，突出产品效果
+   - 文字说明："简单操作，轻松上手"
+
+4. 转场2-3：
+   - 类型：快速缩放
+   - 目的：突出产品核心功能
+   - 效果：画面快速聚焦到产品关键部位
+
+5. 镜头3：
+   - 特写展示产品核心功能
+   - 慢动作展示关键细节
+   - 画面色彩鲜明，对比强烈
+   - 文字强调："专业性能，值得信赖"
+
+请根据以上要求，分析视频并输出JSON格式的描述。 
+
+请开始详细分析这个抖音短视频："""
+
+# Estimate the input tokens for the video, the reference text (txt_content) and the prompt
+token_stats = analyze_input_tokens(video_path, txt_content, prompt_text)
+
 client = OpenAI(
     # 若没有配置环境变量，请用百炼API Key将下行替换为：api_key="sk-xxx"
     api_key="EMPTY",
@@ -348,7 +464,7 @@ content_list.append({
             "time_start": 0.0,
             "time_end": 2.0,
             "talk": "请将对应时间的口播或字幕信息，填入此",
-            "description": "跳转到视频对应时间，将视频对应时间的图片，详细描述这个镜头的画面内容、人物动作、场景特点等。不要描述视频内容，只描述这个镜头的画面内容，不要重复描述。"
+            "description": "跳转到视频对应时间，将视频对应时间的图片，描述这个镜头的画面内容、人物动作、场景特点等。不要重复描述。"
         },
 
         {
@@ -449,7 +565,7 @@ completion = client.chat.completions.create(
     ],
     stream=STREAM_MODE,
     stream_options={"include_usage": True} if STREAM_MODE else None,
-    temperature=0.4
+    temperature=0.5
 )
 
 if STREAM_MODE:
@@ -510,7 +626,9 @@ if STREAM_MODE:
         tokens_per_second = token_count / generation_time
         print(f"🔥 生成速度: {tokens_per_second:.2f} tokens/秒")
     print(f"⏰ 完成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    
+    print(f"输出总费用：{token_count*0.0045/1000:.4f}元")
+    final_cost = token_stats['total_cost']+token_count*0.0045/1000
+    print(f"总费用：{final_cost:.4f}元")
     # 输出使用情况信息
     if usage_info:
         print("\n" + "="*50)
diff --git a/code/ocr_subtitle_extractor.py b/code/ocr_subtitle_extractor.py
index 729aab3..384fa78 100644
--- a/code/ocr_subtitle_extractor.py
+++ b/code/ocr_subtitle_extractor.py
@@ -72,6 +72,7 @@ class VideoSubtitleExtractor:
                 self.paddle_ocr = PaddleOCR(
                     use_textline_orientation=True,
                     lang=paddle_lang,
+                    use_gpu=True,  # 启用GPU加速
                     show_log=False  # 减少日志输出
                 )
                 logger.info("PaddleOCR加载完成")
@@ -101,7 +102,8 @@ class VideoSubtitleExtractor:
                 logger.info(f"EasyOCR模型路径: {model_storage_directory}")
                 self.easy_ocr = easyocr.Reader(
                     lang_list, 
-                    model_storage_directory=model_storage_directory
+                    model_storage_directory=model_storage_directory,
+                    gpu=True  # 启用GPU加速
                 )
                 logger.info("EasyOCR加载完成")
             except ImportError:
@@ -125,7 +127,7 @@ class VideoSubtitleExtractor:
                 logger.info("使用CnOCR默认模型配置")
                 
                 # 使用默认配置，CnOCR会自动选择合适的模型
-                self.cn_ocr = CnOcr()
+                self.cn_ocr = CnOcr(device='gpu')  # 启用GPU加速
                 logger.info("CnOCR加载完成")
             except ImportError:
                 logger.error("请安装CnOCR: pip install cnocr")
diff --git a/token_counter.py b/token_counter.py
new file mode 100644
index 0000000..55e0679
--- /dev/null
+++ b/token_counter.py
@@ -0,0 +1,112 @@
+import tiktoken
+import os
+import cv2
+
+def count_tokens(text, model="gpt-4"):
+    """Count the tokens in text using tiktoken's encoding for model; on any exception, print it and return a rough estimate."""
+    try:
+        encoding = tiktoken.encoding_for_model(model)  # NOTE(review): the "gpt-4" encoding may differ from the tokenizer of the model actually queried — confirm
+        tokens = encoding.encode(text)
+        return len(tokens)
+    except Exception as e:
+        print(f"Token统计出错: {e}")
+        # Rough fallback: 1.5 tokens per character in U+4E00..U+9FFF plus 1.3 per all-ASCII whitespace-separated word
+        chinese_chars = sum(1 for char in text if '\u4e00' <= char <= '\u9fff')
+        english_words = len([word for word in text.split() if word.isascii()])
+        estimated_tokens = int(chinese_chars * 1.5 + english_words * 1.3)
+        return estimated_tokens
+
+def get_video_token_estimate(video_path):
+    """Estimate a video's token count from its duration and frame rate; return a dict with the estimate plus video metadata (all zeros if the video cannot be opened or on any error)."""
+    try:
+        cap = cv2.VideoCapture(video_path)
+        if not cap.isOpened():
+            return {'estimated_tokens': 0, 'duration': 0, 'frame_count': 0, 'fps': 0, 'file_size_mb': 0, 'frames_used': 0}
+        
+        # Read the frame rate and frame count, then derive the duration in seconds (0 when fps is not positive)
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        duration = frame_count / fps if fps > 0 else 0
+        
+        # File size in bytes; it is only reported in the result and does not enter the token estimate
+        file_size = os.path.getsize(video_path)
+        
+        cap.release()  # NOTE(review): not reached if an exception is raised above (there is no try/finally)
+        
+        # Estimation rule described by the original author as based on GPT-4V — NOTE(review): confirm 85/170 fit the model actually used
+        # video tokens = base tokens + number of sampled frames * tokens per frame
+        base_tokens = 85  # fixed base cost
+        frames_per_second = min(fps, 1)  # sample at most 1 frame per second
+        total_frames = min(frame_count, int(duration * frames_per_second))
+        tokens_per_frame = 170  # roughly 170 tokens per sampled frame
+        
+        estimated_tokens = base_tokens + total_frames * tokens_per_frame
+        
+        return {
+            'estimated_tokens': int(estimated_tokens),
+            'duration': duration,
+            'frame_count': frame_count,
+            'fps': fps,
+            'file_size_mb': file_size / (1024 * 1024),
+            'frames_used': total_frames
+        }
+    except Exception as e:
+        print(f"视频token估算出错: {e}")
+        return {'estimated_tokens': 0, 'duration': 0, 'frame_count': 0, 'fps': 0, 'file_size_mb': 0, 'frames_used': 0}
+
+def analyze_input_tokens(video_path, text_content="", prompt_text=""):
+    """Print a token report for the video, reference text and prompt, and return the counts (video_tokens, text_tokens, prompt_tokens, total_input_tokens, video_info) as a dict."""
+    print("\n" + "="*50)
+    print("📊 Token统计信息:")
+    print("="*50)
+
+    # Video: estimated tokens plus the metadata returned by get_video_token_estimate
+    video_token_info = get_video_token_estimate(video_path)
+    print(f"🎬 视频Token统计:")
+    print(f"  估算Token数量: {video_token_info['estimated_tokens']:,}")
+    print(f"  视频时长: {video_token_info['duration']:.2f}秒")
+    print(f"  总帧数: {video_token_info['frame_count']:,}")
+    print(f"  帧率: {video_token_info['fps']:.2f} fps")
+    print(f"  文件大小: {video_token_info['file_size_mb']:.2f} MB")
+    print(f"  使用帧数: {video_token_info['frames_used']:,}")
+
+    # Reference text: counted only when it is not empty or whitespace-only
+    text_tokens = 0
+    if text_content.strip():
+        text_tokens = count_tokens(text_content)
+        print(f"\n📝 文本Token统计:")
+        print(f"  文本内容Token: {text_tokens:,}")
+        print(f"  文本字符数: {len(text_content):,}")
+
+    # Prompt: counted only when it is not empty or whitespace-only
+    prompt_tokens = 0
+    if prompt_text.strip():
+        prompt_tokens = count_tokens(prompt_text)
+        print(f"  提示词Token: {prompt_tokens:,}")
+
+    # Total input tokens = video estimate + text tokens + prompt tokens
+    total_input_tokens = video_token_info['estimated_tokens'] + text_tokens + prompt_tokens
+    print(f"\n📈 总输入Token统计:")
+    print(f"  视频Token: {video_token_info['estimated_tokens']:,}")
+    print(f"  文本Token: {text_tokens:,}")
+    print(f"  提示词Token: {prompt_tokens:,}")
+    print(f"  🔥 总输入Token: {total_input_tokens:,}")
+
+    print("="*50)
+    # NOTE(review): no 'total_cost' key is returned here, yet code/api_video.py in this patch reads token_stats['total_cost'] — confirm which token_counter module it imports
+    return {
+        'video_tokens': video_token_info['estimated_tokens'],
+        'text_tokens': text_tokens,
+        'prompt_tokens': prompt_tokens,
+        'total_input_tokens': total_input_tokens,
+        'video_info': video_token_info
+    }
+
+if __name__ == "__main__":
+    # Manual smoke test: run the token report on a hard-coded sample video, text and prompt
+    test_video = "/root/autodl-tmp/new/哈尔滨.mp4"
+    test_text = "这是一个测试文本，包含中英文内容。This is a test text with Chinese and English content."
+    test_prompt = "请分析这个视频的内容。"
+    
+    result = analyze_input_tokens(test_video, test_text, test_prompt)
+    print(f"\n测试结果: {result}")