541 lines
19 KiB
Python
541 lines
19 KiB
Python
from vllm import LLM, SamplingParams
|
||
from transformers import AutoTokenizer
|
||
from vllm.assets.image import ImageAsset
|
||
from vllm.assets.video import VideoAsset
|
||
from dataclasses import dataclass, field
|
||
from typing import Optional, ClassVar, List, Tuple
|
||
import numpy as np
|
||
from numpy.typing import NDArray
|
||
from PIL import Image
|
||
import librosa
|
||
import librosa.util
|
||
import os
|
||
import cv2
|
||
from skimage.metrics import structural_similarity as ssim
|
||
# import ray  # Ray import disabled

# # Initialise Ray in local mode to avoid distributed-communication problems
# ray.init(local_mode=True, ignore_reinit_error=True)
|
||
|
||
# Switch the Hugging Face libraries to offline mode (disables online checks).
os.environ.update({
    "HF_DATASETS_OFFLINE": "1",
    "TRANSFORMERS_OFFLINE": "1",
    "HF_HUB_OFFLINE": "1",
})
|
||
|
||
# NOTE: this definition rebinds the name ``VideoAsset`` that the top of the
# file imports from ``vllm.assets.video``.
@dataclass(frozen=True)
class VideoAsset:
    """Immutable handle to a named sample video.

    ``name`` is looked up in ``_NAME_TO_FILE`` to get a file name, and that
    file name is passed through ``download_video_asset`` before decoding.
    ``num_frames`` is forwarded unchanged to the frame-extraction helpers.
    """

    name: str
    num_frames: int = -1

    # Known asset names mapped to their video file names.
    _NAME_TO_FILE: ClassVar[dict[str, str]] = {
        "baby_reading": "sample_demo_1.mp4",
    }

    @property
    def filename(self) -> str:
        """File name registered for ``name`` (``KeyError`` if unknown)."""
        return self._NAME_TO_FILE[self.name]

    @property
    def pil_images(self) -> list[Image.Image]:
        """Frames of the video as a list of PIL images."""
        return video_to_pil_images_list(
            download_video_asset(self.filename), self.num_frames
        )

    @property
    def np_ndarrays(self) -> NDArray:
        """Frames of the video as a NumPy array."""
        return video_to_ndarrays(
            download_video_asset(self.filename), self.num_frames
        )

    def get_audio(self, sampling_rate: Optional[float] = None) -> NDArray:
        """Load the audio track with librosa and return the samples only.

        Args:
            sampling_rate: Passed to ``librosa.load`` as ``sr``.

        Returns:
            The first element of the tuple returned by ``librosa.load``.
        """
        resolved_path = download_video_asset(self.filename)
        loaded = librosa.load(resolved_path, sr=sampling_rate)
        return loaded[0]
|
||
|
||
@dataclass(frozen=True)
class LocalVideoAsset:
    """Immutable handle to a video file addressed directly by its path.

    No name lookup and no ``download_video_asset`` call is involved: the
    helpers receive ``local_path`` as-is.
    """

    local_path: str
    name: str = "local_video"
    num_frames: int = -1

    @property
    def filename(self) -> str:
        """Path of the video file (the same value as ``local_path``)."""
        return self.local_path

    @property
    def pil_images(self) -> list[Image.Image]:
        """Frames of the video as a list of PIL images."""
        source = self.filename
        return video_to_pil_images_list(source, self.num_frames)

    @property
    def np_ndarrays(self) -> NDArray:
        """Frames of the video as a NumPy array."""
        source = self.filename
        return video_to_ndarrays(source, self.num_frames)

    def get_audio(self, sampling_rate: Optional[float] = None) -> NDArray:
        """Load the audio track with librosa, never raising on failure.

        Args:
            sampling_rate: Passed to ``librosa.load`` as ``sr``.

        Returns:
            The decoded samples.  When the file is missing or loading raises,
            a message is printed and ``np.zeros(1)`` -- one zero-valued
            sample, not an empty array -- is returned instead.
        """
        try:
            if os.path.exists(self.filename):
                return librosa.load(self.filename, sr=sampling_rate)[0]
            print(f"音频文件不存在: {self.filename}")
        except Exception as e:
            print(f"加载音频时出错: {e}")
        # Shared fallback for both the missing-file and the error case.
        return np.zeros(1)
|
||
|
||
# Helper function implementations
|
||
def download_video_asset(filename: str) -> str:
    """Resolve an asset file name to the path the video is read from.

    Values that already are explicit paths -- absolute, or relative with a
    leading ``./`` or ``../`` -- are returned unchanged.  Anything else is
    treated as a bare asset name and mapped into the placeholder directory
    ``/path/to/downloaded/``; nothing is actually downloaded here.

    Args:
        filename: Explicit path or bare asset file name.

    Returns:
        The path to use for reading the video.
    """
    # The prefix test keeps the original behaviour for "/" and "./" and adds
    # "../"; os.path.isabs also covers platform-specific absolute forms
    # such as Windows drive paths.
    if filename.startswith(("/", "./", "../")) or os.path.isabs(filename):
        return filename
    # Placeholder standing in for the real download logic.
    return f"/path/to/downloaded/{filename}"
|
||
|
||
def video_to_pil_images_list(video_path: str, num_frames: int) -> list[Image.Image]:
    """Decode a video file into a list of RGB PIL images.

    Args:
        video_path: Path of the video file to read.
        num_frames: Number of frames to return.  When it is positive and
            smaller than the frame count reported by OpenCV, that many
            frames are taken at evenly spaced positions across the whole
            clip.  Otherwise every frame is returned.

    Returns:
        The extracted frames in playback order.  The list is empty when the
        file does not exist or cannot be opened.
    """
    if not os.path.exists(video_path):
        print(f"视频文件不存在: {video_path}")
        return []

    cap = cv2.VideoCapture(video_path)
    # try/finally guarantees the capture is released even if decoding or
    # colour conversion raises part-way through.
    try:
        if not cap.isOpened():
            print(f"无法打开视频: {video_path}")
            return []

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0
        print(f"视频信息: 总帧数={total_frames}, FPS={fps:.2f}, 时长={duration:.2f}秒")

        if 0 < num_frames < total_frames:
            frame_interval = total_frames / num_frames
            print(f"将提取 {num_frames} 帧,采样间隔为每 {frame_interval:.2f} 帧")
            # Source indices spread over the full clip.  The interval is > 1
            # here, so the truncated values are all distinct.  Truncating the
            # interval itself (the previous approach) only covered the head
            # of the video whenever the ratio was not an integer.
            wanted = {int(k * frame_interval) for k in range(num_frames)}
            target = num_frames
        elif total_frames > 0:
            wanted = None  # keep every frame
            target = total_frames
            print(f"将提取所有 {total_frames} 帧")
        else:
            # The container did not report a usable frame count: read until
            # the decoder signals the end instead of returning nothing.
            wanted = None
            target = None
            print("视频未报告总帧数,将读取到文件末尾")

        pil_images = []
        frame_count = 0
        last_progress = -1

        while target is None or len(pil_images) < target:
            success, frame = cap.read()
            if not success:
                break

            if wanted is None or frame_count in wanted:
                # OpenCV delivers BGR; PIL expects RGB.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pil_images.append(Image.fromarray(rgb_frame))

                # Progress line roughly every 10 % (only when the total is known).
                if target is not None:
                    progress = len(pil_images) * 10 // target
                    if progress > last_progress:
                        print(f"提取进度: {len(pil_images)}/{target} ({len(pil_images)/target*100:.1f}%)")
                        last_progress = progress

            frame_count += 1
    finally:
        cap.release()

    print(f"从视频中共提取了 {len(pil_images)} 帧")
    return pil_images
|
||
|
||
def video_to_ndarrays(video_path: str, num_frames: int,
                      size: Tuple[int, int] = (224, 224)) -> NDArray:
    """Decode a video into one stacked NumPy array of resized frames.

    Args:
        video_path: Path of the video file to read.
        num_frames: Forwarded to ``video_to_pil_images_list``.
        size: Target ``(width, height)`` every frame is resized to.  The
            default keeps the previous fixed 224x224 output.

    Returns:
        Array of shape ``(frames, height, width, channels)``.  If no frame
        could be extracted, a single all-zero ``uint8`` frame of the target
        size is returned so the result has the same dtype as real frames
        (previously the placeholder was float64).
    """
    pil_images = video_to_pil_images_list(video_path, num_frames)
    width, height = size

    if not pil_images:
        print(f"未能从视频中提取帧: {video_path}")
        return np.zeros((1, height, width, 3), dtype=np.uint8)

    # Resize each frame to the common size, then stack along a new first axis.
    stacked_array = np.stack(
        [np.array(img.resize((width, height))) for img in pil_images],
        axis=0,
    )
    print(f"NumPy数组形状: {stacked_array.shape}")
    return stacked_array
|
||
|
||
@dataclass
class SceneChangeFrame:
    """Result of comparing one video frame with the frame before it."""

    # Index of the frame inside the analysed frame array.
    frame_index: int
    # Time offset in seconds (frame index divided by the frame rate).
    timestamp: float
    # SSIM similarity between this frame and its predecessor.
    ssim_score: float
    # True when ssim_score fell below the similarity threshold.
    is_scene_change: bool
|
||
|
||
def calculate_ssim_between_frames(frame1: np.ndarray, frame2: np.ndarray) -> float:
    """Return the SSIM similarity of two frames.

    Three-dimensional inputs are converted from RGB to grayscale first.  If
    the grayscale shapes differ, the second frame is resized to the first
    one's size.  SSIM is computed with ``data_range=255``.

    Args:
        frame1: First frame, ``(H, W, C)`` or already 2-D.
        frame2: Second frame, ``(H, W, C)`` or already 2-D.

    Returns:
        The SSIM score.  If any step raises, the error is printed and
        ``1.0`` (maximum similarity) is returned instead.
    """
    try:
        # Grayscale keeps the comparison cheap; 2-D inputs pass through.
        planes = []
        for frame in (frame1, frame2):
            if len(frame.shape) == 3:
                planes.append(cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY))
            else:
                planes.append(frame)
        reference, candidate = planes

        # cv2.resize takes (width, height), i.e. (columns, rows).
        if reference.shape != candidate.shape:
            candidate = cv2.resize(
                candidate, (reference.shape[1], reference.shape[0])
            )

        return ssim(reference, candidate, data_range=255)
    except Exception as e:
        print(f"计算SSIM时出错: {e}")
        return 1.0  # report "identical" when the comparison fails
|
||
|
||
def detect_scene_changes_from_arrays(video_arrays: np.ndarray,
                                     similarity_threshold: float = 0.8,
                                     fps: float = 30.0) -> List[SceneChangeFrame]:
    """Compare each frame with its predecessor and flag scene changes.

    Args:
        video_arrays: Frame array of shape
            ``(num_frames, height, width, channels)``.
        similarity_threshold: A frame whose SSIM against the previous frame
            is below this value is flagged as a scene change.
        fps: Frame rate used to turn a frame index into a timestamp.

    Returns:
        One ``SceneChangeFrame`` per compared pair (frames 1..N-1), flagged
        or not.  Empty when fewer than two frames are supplied.
    """
    frame_total = len(video_arrays)
    if frame_total < 2:
        print("视频帧数不足,无法进行场景变化检测")
        return []

    print("开始检测场景变化...")
    print(f"视频帧数: {frame_total}")
    print(f"帧尺寸: {video_arrays.shape[1:3]}")
    print(f"SSIM相似度阈值: {similarity_threshold}")
    print(f"视频帧率: {fps} FPS")

    results = []

    # Walk over (previous, current) pairs; idx is the current frame's index.
    neighbours = zip(video_arrays[:-1], video_arrays[1:])
    for idx, (before, after) in enumerate(neighbours, start=1):
        score = calculate_ssim_between_frames(before, after)
        seconds = idx / fps
        changed = score < similarity_threshold

        results.append(
            SceneChangeFrame(
                frame_index=idx,
                timestamp=seconds,
                ssim_score=score,
                is_scene_change=changed,
            )
        )

        if changed:
            print(f"场景变化检测到! 帧号: {idx}, 时间: {seconds:.2f}s, SSIM: {score:.4f}")

        # Progress line every 100 frames.
        if idx % 100 == 0:
            print(f"检测进度: {idx}/{frame_total} ({idx / frame_total * 100:.1f}%)")

    # Summary.
    flagged = [entry for entry in results if entry.is_scene_change]
    print("\n检测完成!")
    print(f"总比较帧数: {len(results)}")
    print(f"场景变化数量: {len(flagged)}")

    return results
|
||
|
||
def get_scene_change_frames(scene_changes: List[SceneChangeFrame]) -> List[SceneChangeFrame]:
    """Keep only the entries flagged as scene changes.

    Args:
        scene_changes: Detection results, flagged and unflagged.

    Returns:
        A new list with the entries whose ``is_scene_change`` is truthy,
        in their original order.
    """
    flagged = []
    for entry in scene_changes:
        if entry.is_scene_change:
            flagged.append(entry)
    return flagged
|
||
|
||
def save_scene_change_frames_from_arrays(video_arrays: np.ndarray,
                                         scene_change_frames: List[SceneChangeFrame],
                                         output_dir: str = "scene_change_frames"):
    """Write each scene-change frame to ``output_dir`` as a JPEG file.

    File names encode the frame index, timestamp and SSIM score.  Entries
    whose ``frame_index`` lies outside ``video_arrays`` are reported and
    skipped.  Negative indices are rejected as well, so they cannot wrap
    around to a frame at the end of the array; this is the same
    ``0 <= index < len`` rule ``get_scene_change_frames_arrays`` applies.

    Args:
        video_arrays: Frame array the indices refer to.
        scene_change_frames: Entries to save.
        output_dir: Directory to write into; created if missing.
    """
    if not scene_change_frames:
        print("没有检测到场景变化帧")
        return

    os.makedirs(output_dir, exist_ok=True)

    total = len(scene_change_frames)
    frame_total = len(video_arrays)
    print(f"保存 {total} 个场景变化帧到 {output_dir}")

    for i, scene_change in enumerate(scene_change_frames):
        frame_index = scene_change.frame_index

        if not 0 <= frame_index < frame_total:
            print(f"跳过无效帧索引: {frame_index}")
            continue

        # Convert to 8-bit before handing the frame to PIL.
        pil_image = Image.fromarray(video_arrays[frame_index].astype(np.uint8))
        filename = f"scene_change_{frame_index:06d}_t{scene_change.timestamp:.2f}s_ssim{scene_change.ssim_score:.4f}.jpg"
        pil_image.save(os.path.join(output_dir, filename))

        # Progress line after every tenth entry.
        if (i + 1) % 10 == 0:
            print(f"已保存 {i + 1}/{total} 帧")

    print(f"所有场景变化帧已保存到: {output_dir}")
|
||
|
||
def export_scene_change_results(scene_change_frames: List[SceneChangeFrame],
                                similarity_threshold: float,
                                output_file: str = "scene_change_results.txt"):
    """Write a plain-text report of the scene-change frames.

    The report is UTF-8 encoded: a title, the threshold, the number of
    entries, a 50-dash separator, then one line per entry with frame index,
    timestamp and SSIM score.

    Args:
        scene_change_frames: Entries to list in the report.
        similarity_threshold: Threshold value recorded in the header.
        output_file: Path of the report file (overwritten if present).
    """
    with open(output_file, 'w', encoding='utf-8') as report:
        # print() appends the trailing newline each line needs.
        print("场景变化检测结果", file=report)
        print(f"相似度阈值: {similarity_threshold}", file=report)
        print(f"场景变化数量: {len(scene_change_frames)}", file=report)
        print("-" * 50, file=report)

        for entry in scene_change_frames:
            print(
                f"帧号: {entry.frame_index:6d}, "
                f"时间: {entry.timestamp:8.2f}s, "
                f"SSIM: {entry.ssim_score:.6f}",
                file=report,
            )

    print(f"检测结果已导出到: {output_file}")
|
||
|
||
def get_scene_change_frames_arrays(video_arrays: np.ndarray,
                                   scene_change_frames: List[SceneChangeFrame]) -> np.ndarray:
    """Collect the pixel data of the scene-change frames into one array.

    Args:
        video_arrays: Complete frame array (or array-like) of shape
            ``(num_frames, height, width, channels)``.
        scene_change_frames: Entries whose ``frame_index`` selects frames.

    Returns:
        Array of shape ``(num_selected, height, width, channels)``.  When
        nothing is selected, the result is a zero-length slice of the input,
        so it keeps the frame dimensions and dtype instead of collapsing to
        a 1-D float array.
    """
    frames = np.asarray(video_arrays)
    # A zero-length slice keeps (0, H, W, C); fall back to a plain empty
    # array only if the input has no dimensions to slice.
    empty = frames[:0] if frames.ndim > 0 else np.array([])

    if not scene_change_frames:
        print("没有场景变化帧")
        return empty

    # Keep only the indices that address an existing frame.
    valid_indices = [
        sc.frame_index
        for sc in scene_change_frames
        if 0 <= sc.frame_index < len(frames)
    ]

    if not valid_indices:
        print("没有有效的场景变化帧索引")
        return empty

    # Integer-list indexing copies the selected frames into a new array.
    scene_change_arrays = frames[valid_indices]

    print(f"提取了 {len(valid_indices)} 个场景变化帧的NumPy数组")
    print(f"场景变化帧数组形状: {scene_change_arrays.shape}")

    return scene_change_arrays
|
||
|
||
# Usage example
|
||
if __name__ == "__main__":
    # Demo: load a local video, detect scene changes with SSIM, then save
    # the flagged frames, a text report and the frame array.
    local_video = LocalVideoAsset(
        local_path="/root/autodl-tmp/hot_video_analyse/source/sample_demo_1.mp4",
        num_frames=-1,  # -1 keeps every frame; use a positive value to subsample
    )
    print("本地资源:", local_video.filename)

    # PIL image list (disabled):
    # pil_images = local_video.pil_images
    # print("PIL图像数量:", len(pil_images))
    # print(pil_images[0])

    # Load the frames as a NumPy array.
    print("\n=== 加载视频帧 ===")
    np_arrays = local_video.np_ndarrays
    print(f"视频数组形状: {np_arrays.shape}")

    # Detection parameters.
    similarity_threshold = 0.7  # frames with SSIM below this are scene changes

    # Take the frame rate from the file so timestamps match the video; fall
    # back to the previous assumption of 30 fps if OpenCV reports none.
    capture = cv2.VideoCapture(local_video.filename)
    try:
        reported_fps = capture.get(cv2.CAP_PROP_FPS)
    finally:
        capture.release()
    fps = reported_fps if reported_fps > 0 else 30.0

    print("\n=== 开始场景变化检测 ===")
    print(f"相似度阈值: {similarity_threshold}")

    scene_changes = detect_scene_changes_from_arrays(
        video_arrays=np_arrays,
        similarity_threshold=similarity_threshold,
        fps=fps,
    )

    # Entries below the threshold, and their pixel data.
    scene_change_frames = get_scene_change_frames(scene_changes)
    scene_change_arrays = get_scene_change_frames_arrays(np_arrays, scene_change_frames)

    print("\n=== 场景变化帧NumPy数组信息 ===")
    if len(scene_change_arrays) > 0:
        print(f"场景变化帧数组形状: {scene_change_arrays.shape}")
        print(f"数据类型: {scene_change_arrays.dtype}")
        print(f"数组大小: {scene_change_arrays.nbytes / (1024*1024):.2f} MB")

    # Summary of the detection.
    print("\n=== 检测结果摘要 ===")
    print(f"低于阈值 {similarity_threshold} 的帧数: {len(scene_change_frames)}")

    if scene_change_frames:
        print("\n所有场景变化帧:")
        for i, sc in enumerate(scene_change_frames):
            print(f"{i+1:3d}. 帧号: {sc.frame_index:6d}, "
                  f"时间: {sc.timestamp:7.2f}s, "
                  f"SSIM: {sc.ssim_score:.6f}")

    # Save the flagged frames as images.
    print("\n=== 保存场景变化帧 ===")
    save_scene_change_frames_from_arrays(
        video_arrays=np_arrays,
        scene_change_frames=scene_change_frames,
        output_dir="scene_change_frames",
    )

    # Export the text report.
    export_scene_change_results(
        scene_change_frames=scene_change_frames,
        similarity_threshold=similarity_threshold,
        output_file="scene_change_results.txt",
    )

    # Hints on how the frame array can be used further.
    print("\n=== 场景变化帧NumPy数组使用示例 ===")
    print("场景变化帧数组变量名: scene_change_arrays")
    print("可以直接使用这个数组进行进一步处理,例如:")
    print("- 输入到深度学习模型")
    print("- 进行图像处理操作")
    print("- 保存为其他格式")

    # Optionally persist the frame array.
    print("\n=== 保存场景变化帧数组 ===")
    if len(scene_change_arrays) > 0:
        np.save("scene_change_arrays.npy", scene_change_arrays)
        print("场景变化帧NumPy数组已保存到: scene_change_arrays.npy")
        print("可以使用 np.load('scene_change_arrays.npy') 重新加载")
    else:
        print("没有检测到场景变化帧")
        print("建议:")
        # A frame is flagged when SSIM < threshold, so finding more changes
        # requires RAISING the threshold (the old hint said to lower it).
        print("1. 提高相似度阈值(如0.8或0.9)")
        print("2. 检查视频是否包含场景变化")
        print("3. 增加视频帧数进行更全面的检测")

    # Audio track.
    audio = local_video.get_audio(sampling_rate=16000)
    print("音频数据形状:", audio.shape)
    print(type(audio))

    # Disabled experiment, kept as an inert string literal: captioning the
    # video with a local vLLM model.  NOTE: it reads `pil_images`, which is
    # only assigned in the commented-out lines near the top of this script.
    """
    try:
        print("尝试加载模型...")

        # 模型和分词器路径
        model_path = "/root/autodl-tmp/llm/Qwen2.5-VL"

        # 使用离线模式加载分词器
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            local_files_only=True,
            trust_remote_code=True
        )

        # 采样参数
        sampling_params = SamplingParams(
            temperature=0.6,
            top_p=0.95,
            top_k=20,
            max_tokens=1024
        )

        # 离线模式初始化模型
        llm = LLM(
            model=model_path,
            tokenizer=model_path, # 使用相同的路径作为分词器路径
            max_model_len=4096,
            tensor_parallel_size=1, # 减少为1,避免多GPU通信问题
            gpu_memory_utilization=0.8, # 稍微降低内存使用率
            trust_remote_code=True,
            enable_lora=False, # 禁用LoRA
        )

        # 使用提取的PIL图像
        print("生成视频描述...")
        prompt = "这个视频展示了什么内容?详细描述一下。"

        # 使用generate而不是generate_videos (如果不存在generate_videos方法)
        try:
            # 尝试使用generate_videos
            outputs = llm.generate_videos(prompt, videos=[pil_images], sampling_params=sampling_params)
            print(outputs[0].outputs[0].text) # 打印模型输出
        except AttributeError:
            print("generate_videos方法不可用,尝试使用普通generate方法...")
            # 如果不支持generate_videos,使用普通的generate
            outputs = llm.generate([prompt], sampling_params=sampling_params)
            print(outputs[0].outputs[0].text) # 打印模型输出

    except Exception as e:
        print(f"模型加载或推理过程中出错: {e}")
        import traceback
        traceback.print_exc()
    """
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|