# hot_video_analyse/code/pre_data_1.py

import os

def read_json_file(json_path):
    """Load a JSON file and return its decoded contents.

    Args:
        json_path: Path of the JSON file; it is opened as UTF-8 text.

    Returns:
        The decoded JSON value on success. None when the file does not
        exist, when its content is not valid JSON, or when any other error
        occurs while reading. A status message is printed in every case.
    """
    import json

    result = None
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
        print(f"成功读取JSON文件: {json_path}")
        # Only publish the value once the whole success path has completed.
        result = loaded
    except FileNotFoundError:
        print(f"错误: 找不到文件 {json_path}")
    except json.JSONDecodeError as err:
        print(f"JSON解析错误: {err}")
    except Exception as err:
        print(f"读取JSON文件时出错: {err}")
    return result

def calculate_text_similarity(text1, text2):
    """Return the Jaccard similarity of the character sets of two texts.

    Both texts are stripped of leading/trailing whitespace first. The score
    is the number of distinct characters the texts share divided by the
    number of distinct characters appearing in either of them.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        float: Similarity in the range 0-1. 0.0 when either text is empty
        (before or after stripping); 1.0 when the stripped texts are equal.
    """
    # Nothing to compare against when either side is missing.
    if not text1 or not text2:
        return 0.0

    left = text1.strip()
    right = text2.strip()
    if not (left and right):
        return 0.0

    # Identical strings need no set arithmetic.
    if left == right:
        return 1.0

    left_chars, right_chars = set(left), set(right)
    shared = len(left_chars & right_chars)
    total = len(left_chars | right_chars)

    if total > 0:
        return shared / total
    return 0.0

def _box_to_xyxy(box, name):
    """Convert one box in either supported layout to (x1, y1, x2, y2).

    Args:
        box: Either four numbers [x1, y1, x2, y2], or four corner points
            [[x, y], ...] (each point a list or tuple). For corner points
            the axis-aligned rectangle enclosing all four is used.
        name: Argument name ("box1"/"box2") inserted into the error message.

    Raises:
        ValueError: If the box matches neither layout.
    """
    if len(box) == 4:
        first = box[0]
        if isinstance(first, (int, float)):
            x1, y1, x2, y2 = box
            return x1, y1, x2, y2
        if isinstance(first, (list, tuple)):
            xs = [point[0] for point in box]
            ys = [point[1] for point in box]
            return min(xs), min(ys), max(xs), max(ys)
    raise ValueError(f"{name}格式错误，应为[x1,y1,x2,y2]或[[x1,y1],[x2,y2],[x3,y3],[x4,y4]]")


def calculate_iou(box1, box2):
    """Compute the IoU (Intersection over Union) of two bounding boxes.

    Args:
        box1: First box, [x1, y1, x2, y2] or
            [[x1,y1], [x2,y2], [x3,y3], [x4,y4]].
        box2: Second box, in either of the same two layouts.

    Returns:
        float: IoU in the range 0-1. 0.0 when the boxes do not intersect or
        when the union area is not positive.

    Raises:
        ValueError: If either box matches neither supported layout
            (box1 is validated before box2).
    """
    x1_1, y1_1, x2_1, y2_1 = _box_to_xyxy(box1, "box1")
    x1_2, y1_2, x2_2, y2_2 = _box_to_xyxy(box2, "box2")

    # Corners of the intersection rectangle.
    x_left = max(x1_1, x1_2)
    y_top = max(y1_1, y1_2)
    x_right = min(x2_1, x2_2)
    y_bottom = min(y2_1, y2_2)

    # An inverted rectangle means the boxes do not overlap at all.
    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
    box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
    union_area = box1_area + box2_area - intersection_area

    # Degenerate (zero-area) boxes would otherwise divide by zero.
    return intersection_area / union_area if union_area > 0 else 0.0

def format_ocr_json(ocr_data):
    """Format OCR subtitle JSON data into a readable report and a nested array.

    Args:
        ocr_data: Decoded OCR result. When it is a dict, the optional keys
            'ocr_engine', 'video_path', 'duration', 'fps', 'frame_width',
            'frame_height', 'subtitle_position', 'subtitle_region',
            'sample_interval', 'confidence_threshold', 'continuous_text'
            and 'subtitles' are read. Each item of 'subtitles' is a dict
            from which 'timestamp', 'text', 'confidence' and 'bbox' are
            read (with defaults 0, '', 0 and []).

    Returns:
        tuple: (formatted_text, subtitle_array).
        formatted_text is the report; its timeline section previews at most
        the first 10 timestamps. subtitle_array is a list sorted by
        timestamp of {'timestamp': t, 'contents': [...]}, where every
        content is {'text', 'bbox', 'timestamp'}. Returns ("", []) for empty
        input, and the report with an empty array when there are no
        subtitles or ocr_data is not a dict.
    """
    if not ocr_data:
        return "", []

    parts = ["【OCR字幕识别内容】\n"]

    # Only the subtitle-extractor dict layout carries further details.
    if not isinstance(ocr_data, dict):
        return ''.join(parts), []

    # Basic information.
    if 'ocr_engine' in ocr_data:
        parts.append(f"OCR引擎: {ocr_data['ocr_engine']}\n")
    if 'video_path' in ocr_data:
        parts.append(f"视频文件: {ocr_data['video_path']}\n")
    if 'duration' in ocr_data:
        parts.append(f"视频时长: {ocr_data['duration']:.2f}秒\n")
    if 'fps' in ocr_data:
        parts.append(f"视频帧率: {ocr_data['fps']:.2f}FPS\n")
    if 'frame_width' in ocr_data and 'frame_height' in ocr_data:
        parts.append(f"视频分辨率: {ocr_data['frame_width']}x{ocr_data['frame_height']}\n")

    # Subtitle region information.
    if 'subtitle_position' in ocr_data:
        parts.append(f"字幕区域: {ocr_data['subtitle_position']}\n")
    if 'subtitle_region' in ocr_data:
        parts.append(f"字幕区域坐标: {ocr_data['subtitle_region']}\n")

    # Processing parameters.
    if 'sample_interval' in ocr_data:
        parts.append(f"采样间隔: {ocr_data['sample_interval']}帧\n")
    if 'confidence_threshold' in ocr_data:
        parts.append(f"置信度阈值: {ocr_data['confidence_threshold']}\n")

    # Full subtitle text.
    if 'continuous_text' in ocr_data:
        parts.append("\n📄 完整字幕文本:\n")
        parts.append(f"{ocr_data['continuous_text']}\n")

    subtitles = ocr_data.get('subtitles')
    if not subtitles:
        return ''.join(parts), []

    # Group the subtitles by timestamp. Confidences are kept in a parallel
    # mapping (same order as the contents) so that the preview can show each
    # subtitle's own confidence without adding a key to the returned dicts.
    contents_by_timestamp = {}
    confidences_by_timestamp = {}
    for subtitle in subtitles:
        timestamp = subtitle.get('timestamp', 0)
        contents_by_timestamp.setdefault(timestamp, []).append({
            'text': subtitle.get('text', ''),
            'bbox': subtitle.get('bbox', []),
            "timestamp": timestamp,
        })
        confidences_by_timestamp.setdefault(timestamp, []).append(
            subtitle.get('confidence', 0)
        )

    # Layer 1: timestamp entry; layer 2: contents sharing that timestamp;
    # layer 3: text and position of each content.
    subtitle_array = [
        {'timestamp': timestamp, 'contents': contents_by_timestamp[timestamp]}
        for timestamp in sorted(contents_by_timestamp)
    ]

    parts.append("\n⏰ 详细字幕时间轴 (三层嵌套数组结构):\n")

    # Preview only the first few timestamps to keep the report short.
    max_shown = 10
    shown = subtitle_array[:max_shown]
    for i, entry in enumerate(shown, 1):
        timestamp = entry['timestamp']
        parts.append(f"  {i}. {timestamp:.2f}s:\n")

        pairs = zip(entry['contents'], confidences_by_timestamp[timestamp])
        for j, (content, confidence) in enumerate(pairs, 1):
            parts.append(
                f"    {j}. [{timestamp:.2f}s|{confidence:.3f}]: {content['text']}\n"
            )
            if content['bbox']:
                parts.append(f"       位置: {content['bbox']}\n")

        parts.append("\n")

    hidden = len(subtitle_array) - len(shown)
    if hidden > 0:
        parts.append(f"  ... (还有{hidden}个时间戳)\n")

    return ''.join(parts), subtitle_array

def merge_and_filter_subtitles(subtitle_array, iou_threshold=0.7, text_similarity_threshold=0.7):
    """Merge repeated subtitles across timestamps and drop empty ones.

    Every content is compared with all contents of later timestamp entries.
    A later content is treated as a repeat of an earlier one when the IoU of
    their bboxes exceeds iou_threshold AND their text similarity exceeds
    text_similarity_threshold. The repeat is removed and the earlier
    content's 'timestamp' field (reported as its end time) is increased by 1.

    Contents without a bbox cannot be compared by position and are therefore
    never merged (they are kept as they are).

    Args:
        subtitle_array: List of {'timestamp': t, 'contents': [...]} entries,
            each content a dict with 'text', 'bbox' and 'timestamp'. The
            input is deep-copied and never modified.
        iou_threshold: Minimum (exclusive) bbox IoU for two contents to be
            considered the same subtitle.
        text_similarity_threshold: Minimum (exclusive) text similarity for
            two contents to be considered the same subtitle.

    Returns:
        tuple: (formatted_text, processed_array) - the report as a single
        string and the de-duplicated array with empty contents and empty
        timestamp entries removed.
    """
    import copy

    # Work on a deep copy so the caller's data is left untouched.
    entries = copy.deepcopy(subtitle_array)

    for index, entry in enumerate(entries):
        for current in entry["contents"]:
            # Blank text never matches anything, and a missing bbox would
            # make calculate_iou raise ValueError, so skip both up front.
            if not current["text"] or not current["bbox"]:
                continue
            for later_entry in entries[index + 1:]:
                for candidate in later_entry["contents"]:
                    if not candidate["text"] or not candidate["bbox"]:
                        continue

                    iou = calculate_iou(current["bbox"], candidate["bbox"])
                    text_similarity = calculate_text_similarity(
                        current["text"], candidate["text"]
                    )

                    if iou > iou_threshold and text_similarity > text_similarity_threshold:
                        # Blank the repeat so it is filtered out below.
                        candidate["text"] = ''
                        # NOTE(review): the end time advances by exactly 1 per
                        # merged repeat - presumably this assumes one sample
                        # per second; confirm against the extractor settings.
                        current["timestamp"] += 1

    # Drop blanked contents, then timestamp entries left without contents.
    for entry in entries:
        entry["contents"] = [
            content for content in entry["contents"] if content["text"] != ''
        ]
    entries = [entry for entry in entries if len(entry["contents"]) > 0]

    lines = []
    for entry in entries:
        lines.append(f"\n开始时间 {entry['timestamp']:.2f}s:")
        for number, content in enumerate(entry['contents'], 1):
            lines.append(f"    {number}. 文本: '{content['text']}'")
            if content['bbox']:
                lines.append(f"       位置: {content['bbox']}")
            if 'timestamp' in content and content['timestamp']:
                lines.append(f"       结束时间: {content['timestamp']:.2f}s")

    return '\n'.join(lines), entries


# Script section: runs at module level. Loads one OCR subtitle JSON file,
# de-duplicates its subtitles, prints the result and saves the report as text.
ocr_json_path = "/root/autodl-tmp/new_cnocr/哈尔滨_subtitles.json"

# read_json_file returns None on failure; format_ocr_json then yields ("", []).
ocr_data = read_json_file(ocr_json_path)
pre_data , subtitle_array= format_ocr_json(ocr_data)

# Thresholds passed to the merge step (stricter than its 0.7 defaults).
iou_threshold = 0.8
text_similarity_threshold = 0.8
# a: formatted report string, b: processed subtitle array.
a , b  = merge_and_filter_subtitles(subtitle_array, iou_threshold, text_similarity_threshold)
#print("\n完整数组结构:")
print(a)
print(b)

# Save the formatted report to a txt file

# The output is written next to the input JSON as "<input stem>_processed.txt".
output_dir = os.path.dirname(ocr_json_path)
output_filename = os.path.splitext(os.path.basename(ocr_json_path))[0] + "_processed.txt"
output_path = os.path.join(output_dir, output_filename)

# Best-effort save: any failure is reported on stdout instead of raising.
try:
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(a)
    print(f"\n处理结果已保存到: {output_path}")
except Exception as e:
    print(f"保存文件时出错: {e}")

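# Verification for "/root/autodl-tmp/douyin_ocr/兰州_subtitles.json": the two entries that look duplicated there really do have bboxes that overlap too little to be merged (IoU values noted below)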
# a = [[303, 243], [442, 243], [442, 303], [303, 303]]
# b = [[339, 231], [495, 241], [490, 304], [335, 294]]
# c = [[482, 273], [660, 276], [660, 303], [481, 300]]
# d = [[536, 268], [732, 273], [731, 300], [535, 295]]

# iou = calculate_iou(a,b) # 0.47
# d = calculate_iou(c,d)  # 0.40