# video_template_gen/code/merge_segment_template.py
#
# 180 lines
# 5.9 KiB
# Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
视频片段整合脚本
用于整合多个视频分析片段文件到一个完整的分析结果
"""
import json
import os
import glob
from datetime import datetime
def extract_json_from_file(file_path):
    """Extract the JSON object embedded in a fenced ``json`` block of a file.

    The file is read as UTF-8 text. The payload is taken from the first
    ``{`` that follows the opening "```json" marker up to the last "```"
    marker in the file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be read,
        the opening marker or a ``{`` is missing, the closing marker is
        missing, or the payload cannot be decoded.
    """
    start_marker = "```json"
    end_marker = "```"

    # Deliberately broad: callers treat ``None`` as "skip this file", so a
    # single unreadable segment must never abort a whole merge.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        start_pos = content.find(start_marker)
        if start_pos == -1:
            print(f"警告: 在文件 {file_path} 中未找到JSON标记")
            return None

        json_start = content.find('{', start_pos)
        if json_start == -1:
            print(f"警告: 在文件 {file_path} 中未找到JSON对象")
            return None

        # The opening marker itself starts with the closing marker's three
        # backticks, so rfind() always matches something. The closing fence
        # is really missing whenever the last match is not after the payload.
        json_end = content.rfind(end_marker)
        if json_end <= json_start:
            print(f"警告: 在文件 {file_path} 中未找到JSON对象结束")
            return None

        json_str = content[json_start:json_end]
        print(f"json_str:{json_str}")

        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"错误: 解析文件 {file_path} 中的JSON失败: {e}")
            print("尝试手动修复JSON...")

        # Repair attempt: decode only the leading JSON document and ignore
        # whatever trails it (e.g. extra text or another fence picked up
        # because the end position is the *last* fence in the file).
        try:
            json_data, _ = json.JSONDecoder().raw_decode(json_str)
        except json.JSONDecodeError:
            return None
        return json_data
    except Exception as e:
        print(f"错误: 读取文件 {file_path} 失败: {e}")
        return None
def merge_segments(segment_files, segment_duration=30):
    """Merge several per-segment analysis files into a single result.

    Files are processed in natural filename order, so ``seg_2`` comes before
    ``seg_10``. The caller's sequence is left untouched. Files for which
    ``extract_json_from_file`` returns ``None`` are skipped, but they still
    occupy their position, so later segments keep their time offset.

    Args:
        segment_files: Iterable of paths to the segment files.
        segment_duration: Length of one video segment, in the same unit as
            the ``start``/``end`` fields. The segment at 0-based position
            ``i`` has ``i * segment_duration`` added to every ``start`` and
            ``end``. Defaults to 30.

    Returns:
        A dict with the keys ``"total_Oral broadcasting"`` and ``"summary"``
        (the per-segment texts joined with single spaces) and ``"content"``
        (all content items, re-numbered consecutively from 1 via ``"id"``).
    """
    import re  # local import: only needed for the natural-order sort key

    def natural_key(path):
        # re.split with a capture group alternates text / digit-run tokens,
        # so odd positions are always digit runs and compare numerically.
        tokens = re.split(r"([0-9]+)", str(path))
        numeric_aware = [
            int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)
        ]
        # The raw path breaks ties such as "a01" vs "a1" deterministically.
        return numeric_aware, str(path)

    merged_data = {
        "total_Oral broadcasting": "",
        "summary": "",
        "content": [],
    }

    # sorted() instead of list.sort(): do not reorder the caller's list.
    ordered_files = sorted(segment_files, key=natural_key)

    for i, file_path in enumerate(ordered_files):
        print(f"正在处理片段 {i+1}: {os.path.basename(file_path)}")

        json_data = extract_json_from_file(file_path)
        if json_data is None:
            continue

        # Text fields: append with a separating space once something is
        # already accumulated.
        for key in ("total_Oral broadcasting", "summary"):
            if key in json_data:
                if merged_data[key]:
                    merged_data[key] += " " + json_data[key]
                else:
                    merged_data[key] = json_data[key]

        if "content" in json_data:
            offset = i * segment_duration
            for item in json_data["content"]:
                # Timestamps in each file are relative to its own segment;
                # shift them onto the timeline of the whole video.
                if offset:
                    item["start"] = item["start"] + offset
                    item["end"] = item["end"] + offset
                # Re-number so ids are unique across the merged list.
                item["id"] = len(merged_data["content"]) + 1
                merged_data["content"].append(item)

    return merged_data
def save_merged_result(merged_data, output_file):
    """Write the merged result to ``output_file`` as UTF-8 JSON.

    The data is written with ``ensure_ascii=False`` and a 4-space indent.

    Args:
        merged_data: JSON-serializable object to save.
        output_file: Destination path; an existing file is overwritten.

    Returns:
        ``True`` when the file was written, ``False`` when serialization or
        writing failed (the error is printed, not raised).
    """
    # Deliberately broad: callers rely on the boolean result rather than on
    # exceptions.
    try:
        # Serialize completely before opening the target, so data that
        # cannot be serialized never leaves a truncated file behind.
        payload = json.dumps(merged_data, ensure_ascii=False, indent=4)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(payload)
    except Exception as e:
        print(f"错误: 保存合并结果失败: {e}")
        return False
    else:
        print(f"合并结果已保存到: {output_file}")
        return True
def main(current_dir="/root/autodl-tmp/video_llm/Template/牛管家",
         base_name="牛管家"):
    """Merge every ``*.txt`` segment file in a directory into one JSON file.

    The merged result is written next to the inputs as
    ``<base_name>_merged_<YYYYmmdd_HHMMSS>.json``. When the merge yields no
    content items (including when no ``*.txt`` file exists), an error is
    printed and nothing is written.

    Args:
        current_dir: Directory that holds the segment files and receives the
            output file. Defaults to the previously hard-coded directory.
        base_name: Prefix of the output file name. Defaults to the
            previously hard-coded name.
    """
    print(current_dir)

    # Collect all segment files.
    pattern = os.path.join(current_dir, "*.txt")
    segment_files = glob.glob(pattern)

    # Merge the segments.
    print("\n开始合并片段...")
    merged_data = merge_segments(segment_files)
    if not merged_data["content"]:
        print("错误: 合并失败,没有有效内容")
        return

    # Build a timestamped output name so earlier runs are not overwritten.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = os.path.join(
        current_dir, f"{base_name}_merged_{timestamp}.json"
    )

    if not save_merged_result(merged_data, output_file):
        return

    print("\n合并完成!")
    print(f"总片段数: {len(segment_files)}")
    print(f"总内容条目: {len(merged_data['content'])}")
    print(f"输出文件: {os.path.basename(output_file)}")

    # Summary statistics.
    print("\n统计信息:")
    print(f"- 口播内容长度: {len(merged_data['total_Oral broadcasting'])} 字符")
    print(f"- 摘要长度: {len(merged_data['summary'])} 字符")
    print(f"- 内容条目数: {len(merged_data['content'])}")


# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()