bangbang-aigc-server/core/media_manager.py
2025-07-31 15:35:23 +08:00

377 lines
15 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
图像存储管理器
统一管理小红书笔记的图片和视频下载和存储
"""
import os
import json
import hashlib
import time
import requests
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
from urllib.parse import urlparse
@dataclass
class MediaInfo:
    """Metadata record describing one downloaded media file."""

    media_id: str         # identifier of this media file within the index
    media_type: str       # 'image' or 'video'
    original_url: str     # URL the file was downloaded from
    local_path: str       # path of the stored file on disk
    file_size: int        # size in bytes
    download_time: datetime
    note_id: str          # note the media belongs to
    note_title: str
    user_id: str          # author of the note
    user_name: str

    def to_dict(self) -> Dict:
        """Return a JSON-serializable dict of all fields.

        Every field is copied as-is except ``download_time``, which is
        rendered as an ISO-8601 string. Keys appear in field order.
        """
        leading_keys = (
            'media_id', 'media_type', 'original_url', 'local_path', 'file_size',
        )
        trailing_keys = ('note_id', 'note_title', 'user_id', 'user_name')

        payload = {key: getattr(self, key) for key in leading_keys}
        # datetime is not JSON-serializable, so store its ISO representation.
        payload['download_time'] = self.download_time.isoformat()
        for key in trailing_keys:
            payload[key] = getattr(self, key)
        return payload
class ImageStorageManager:
    """Download note media (images/videos) and track it in a JSON index.

    Files are stored under ``<base>/images``, ``<base>/videos`` and
    ``<base>/thumbnails``. ``<base>/media_index.json`` maps each media ID to
    its ``MediaInfo`` record; it is loaded on construction and rewritten
    after every successful download or removal.
    """

    # Maps characters that are unsafe in file names (path separators,
    # reserved punctuation, ASCII control characters) to underscores.
    _FILENAME_TRANSLATION = str.maketrans(
        dict.fromkeys('<>:"/\\|?*' + ''.join(map(chr, range(32))), '_')
    )
    _MAX_FILENAME_LENGTH = 100

    def __init__(self, base_storage_path: str = "data/media"):
        """Create the storage directories and load the existing index.

        Args:
            base_storage_path: Root directory for media files and the index.
        """
        self.base_storage_path = Path(base_storage_path)
        self.media_index_file = self.base_storage_path / "media_index.json"
        self.media_index: Dict[str, "MediaInfo"] = {}
        self.chunk_size = 1024 * 1024  # stream downloads in 1 MiB chunks

        self.image_path = self.base_storage_path / "images"
        self.video_path = self.base_storage_path / "videos"
        self.thumbnail_path = self.base_storage_path / "thumbnails"
        for path in (self.image_path, self.video_path, self.thumbnail_path):
            path.mkdir(parents=True, exist_ok=True)

        self.load_media_index()

    # ------------------------------------------------------------------
    # Index persistence
    # ------------------------------------------------------------------
    def load_media_index(self) -> None:
        """Merge the records stored in the index file into ``media_index``.

        Never raises: an unreadable or malformed file is logged and ignored,
        and a malformed individual record is skipped so that one bad entry
        does not discard the rest of the index.
        """
        if not self.media_index_file.exists():
            return

        try:
            with open(self.media_index_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            logger.error(f"加载媒体索引失败: {str(e)}")
            return

        if not isinstance(data, dict):
            logger.error("加载媒体索引失败: 索引文件格式无效")
            return

        for media_id, media_data in data.items():
            try:
                self.media_index[media_id] = MediaInfo(
                    media_id=media_data['media_id'],
                    media_type=media_data['media_type'],
                    original_url=media_data['original_url'],
                    local_path=media_data['local_path'],
                    file_size=media_data['file_size'],
                    download_time=datetime.fromisoformat(media_data['download_time']),
                    note_id=media_data['note_id'],
                    note_title=media_data['note_title'],
                    user_id=media_data['user_id'],
                    user_name=media_data['user_name'],
                )
            except (KeyError, TypeError, ValueError) as e:
                logger.warning(f"跳过无效的媒体索引记录 {media_id}: {str(e)}")

        logger.info(f"加载了 {len(self.media_index)} 个媒体文件索引")

    def save_media_index(self) -> None:
        """Write ``media_index`` to the index file. Never raises.

        The JSON is written to a sibling temporary file and then moved over
        the real index with ``os.replace``, so an interrupted write cannot
        leave a truncated index behind.
        """
        tmp_file = self.media_index_file.with_name(self.media_index_file.name + ".tmp")
        try:
            data = {
                media_id: media_info.to_dict()
                for media_id, media_info in self.media_index.items()
            }
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            os.replace(tmp_file, self.media_index_file)
            logger.debug("保存媒体索引成功")
        except Exception as e:
            # Best-effort persistence: callers rely on this never raising.
            logger.error(f"保存媒体索引失败: {str(e)}")
            self._remove_quietly(str(tmp_file))

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Delete ``path`` if it exists; never raises."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass  # nothing to clean up
        except OSError as e:
            logger.debug(f"清理文件失败: {path} ({str(e)})")

    def _generate_media_id(self, url: str, note_id: str) -> str:
        """Return the hex MD5 digest of ``"{url}_{note_id}"``.

        MD5 is used only as a stable fingerprint for de-duplication, not for
        security; changing it would invalidate IDs in existing index files.
        """
        content = f"{url}_{note_id}"
        return hashlib.md5(content.encode()).hexdigest()

    def _get_file_extension(self, url: str, media_type: str) -> str:
        """Return the lower-cased extension of the URL path's last component.

        Falls back to ``'.jpg'`` for ``media_type == 'image'`` and ``'.mp4'``
        otherwise when the last path component has no usable extension
        (dots in parent directories are ignored).
        """
        extension = os.path.splitext(urlparse(url).path)[1].lower()
        if len(extension) > 1:  # more than a bare '.'
            return extension
        return '.jpg' if media_type == 'image' else '.mp4'

    def _normalize_filename(self, filename: str) -> str:
        """Make ``filename`` safe to embed in a file name.

        Unsafe characters (``<>:"/\\|?*`` and ASCII control characters) are
        replaced by ``_`` and the result is truncated to 100 characters.
        ``None`` becomes an empty string; other non-strings are stringified.
        """
        if filename is None:
            return ''
        if not isinstance(filename, str):
            filename = str(filename)
        cleaned = filename.translate(self._FILENAME_TRANSLATION)
        return cleaned[:self._MAX_FILENAME_LENGTH]

    def _find_existing(self, media_id: str) -> Optional["MediaInfo"]:
        """Return the indexed record if its file is still on disk, else None."""
        existing = self.media_index.get(media_id)
        if existing is not None and os.path.exists(existing.local_path):
            return existing
        return None

    def _build_local_path(self, directory: Path, url: str, media_type: str,
                          note_info: Dict, media_id: str) -> Path:
        """Build ``<directory>/<nickname>_<note_id>_<media_id><ext>``."""
        extension = self._get_file_extension(url, media_type)
        user_name = self._normalize_filename(note_info.get('nickname') or 'unknown')
        # note_id comes from scraped data; sanitize it so it cannot inject
        # path separators into the file name.
        note_id = self._normalize_filename(str(note_info['note_id']))
        return directory / f"{user_name}_{note_id}_{media_id}{extension}"

    def _thumbnail_path_for(self, local_path: str) -> Path:
        """Return the thumbnail path that belongs to a stored video file."""
        return self.thumbnail_path / f"{Path(local_path).stem}_thumb.jpg"

    def _register_media(self, media_id: str, media_type: str, url: str,
                        local_path: Path, file_size: int,
                        note_info: Dict) -> "MediaInfo":
        """Create the index record for a downloaded file and persist the index."""
        media_info = MediaInfo(
            media_id=media_id,
            media_type=media_type,
            original_url=url,
            local_path=str(local_path),
            file_size=file_size,
            download_time=datetime.now(),
            note_id=note_info['note_id'],
            note_title=note_info.get('title', ''),
            user_id=note_info.get('user_id', ''),
            user_name=note_info.get('nickname', ''),
        )
        self.media_index[media_id] = media_info
        self.save_media_index()
        return media_info

    # ------------------------------------------------------------------
    # Downloading
    # ------------------------------------------------------------------
    def _download_media(self, url: str, local_path: str, max_retries: int = 3) -> Tuple[bool, str, int]:
        """Stream ``url`` to ``local_path``.

        The body is written to ``<local_path>.part`` and moved into place
        only after the download completes, so a failed attempt never leaves
        a truncated file at ``local_path``. Network errors are retried with
        exponential back-off (1s, 2s, ...); any other error aborts at once.

        Returns:
            ``(success, message, bytes_written)``; never raises.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        temp_path = f"{local_path}.part"

        for attempt in range(max_retries):
            try:
                file_size = 0
                # The context manager releases the connection even when the
                # body is not fully consumed.
                with requests.get(url, headers=headers, stream=True, timeout=30) as response:
                    response.raise_for_status()
                    with open(temp_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=self.chunk_size):
                            if chunk:
                                f.write(chunk)
                                file_size += len(chunk)
                os.replace(temp_path, local_path)
                logger.info(f"下载成功: {local_path} ({file_size} bytes)")
                return True, "下载成功", file_size
            except requests.exceptions.RequestException as e:
                self._remove_quietly(temp_path)
                logger.warning(f"下载失败 (尝试 {attempt + 1}/{max_retries}): {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)  # exponential back-off
                else:
                    return False, f"下载失败: {str(e)}", 0
            except Exception as e:
                self._remove_quietly(temp_path)
                logger.error(f"下载出错: {str(e)}")
                return False, f"下载出错: {str(e)}", 0

        return False, "下载失败", 0

    def download_image(self, url: str, note_info: Dict) -> Optional["MediaInfo"]:
        """Download one image of a note and index it.

        Args:
            url: Image URL.
            note_info: Note metadata; ``note_id`` is required, ``nickname``,
                ``title`` and ``user_id`` are optional.

        Returns:
            The existing record if the image is already stored, the new
            record after a successful download, or ``None`` on failure.
        """
        media_id = self._generate_media_id(url, note_info['note_id'])

        existing_media = self._find_existing(media_id)
        if existing_media is not None:
            logger.info(f"图片已存在: {existing_media.local_path}")
            return existing_media

        local_path = self._build_local_path(self.image_path, url, 'image', note_info, media_id)
        success, message, file_size = self._download_media(url, str(local_path))
        if not success:
            logger.error(f"图片下载失败: {message}")
            return None

        return self._register_media(media_id, 'image', url, local_path, file_size, note_info)

    def download_video(self, url: str, note_info: Dict, thumbnail_url: Optional[str] = None) -> Optional["MediaInfo"]:
        """Download the video of a note (and optionally its thumbnail).

        The thumbnail is fetched only when the video itself was freshly
        downloaded; a thumbnail failure is logged but does not fail the call.

        Returns:
            The existing record if the video is already stored, the new
            record after a successful download, or ``None`` on failure.
        """
        media_id = self._generate_media_id(url, note_info['note_id'])

        existing_media = self._find_existing(media_id)
        if existing_media is not None:
            logger.info(f"视频已存在: {existing_media.local_path}")
            return existing_media

        local_path = self._build_local_path(self.video_path, url, 'video', note_info, media_id)
        success, message, file_size = self._download_media(url, str(local_path))
        if not success:
            logger.error(f"视频下载失败: {message}")
            return None

        if thumbnail_url:
            thumbnail_path = self._thumbnail_path_for(str(local_path))
            thumb_ok, thumb_message, _ = self._download_media(thumbnail_url, str(thumbnail_path))
            if not thumb_ok:
                logger.warning(f"视频缩略图下载失败: {thumb_message}")

        return self._register_media(media_id, 'video', url, local_path, file_size, note_info)

    def download_note_media(self, note_info: Dict, download_images: bool = True, download_videos: bool = True) -> List["MediaInfo"]:
        """Download every image in ``image_list`` and the ``video_addr`` video.

        Returns:
            Records of all media that are now available locally (already
            stored or freshly downloaded); failed downloads are omitted.
        """
        downloaded_media = []

        if download_images:
            for image_url in note_info.get('image_list') or []:
                if not image_url:
                    continue  # an empty URL would only burn retries
                media_info = self.download_image(image_url, note_info)
                if media_info is not None:
                    downloaded_media.append(media_info)

        video_url = note_info.get('video_addr')
        if download_videos and video_url:
            media_info = self.download_video(video_url, note_info, note_info.get('video_cover'))
            if media_info is not None:
                downloaded_media.append(media_info)

        return downloaded_media

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------
    def get_media_by_note(self, note_id: str) -> List["MediaInfo"]:
        """Return all indexed media whose ``note_id`` equals ``note_id``."""
        return [media for media in self.media_index.values() if media.note_id == note_id]

    def get_media_by_user(self, user_id: str) -> List["MediaInfo"]:
        """Return all indexed media whose ``user_id`` equals ``user_id``."""
        return [media for media in self.media_index.values() if media.user_id == user_id]

    def get_storage_statistics(self) -> Dict:
        """Return file counts and byte totals computed from the index.

        Sizes come from the recorded ``file_size`` values, not from disk.
        """
        per_type = {'image': [0, 0], 'video': [0, 0]}  # type -> [count, bytes]
        total_size = 0
        for media in self.media_index.values():
            total_size += media.file_size
            bucket = per_type.get(media.media_type)
            if bucket is not None:
                bucket[0] += 1
                bucket[1] += media.file_size

        image_count, image_size = per_type['image']
        video_count, video_size = per_type['video']
        mib = 1024 * 1024
        return {
            'total_files': len(self.media_index),
            'total_size': total_size,
            'total_size_mb': round(total_size / mib, 2),
            'image_count': image_count,
            'video_count': video_count,
            'image_size': image_size,
            'video_size': video_size,
            'image_size_mb': round(image_size / mib, 2),
            'video_size_mb': round(video_size / mib, 2),
            'storage_path': str(self.base_storage_path),
        }

    # ------------------------------------------------------------------
    # Maintenance
    # ------------------------------------------------------------------
    def cleanup_orphaned_files(self) -> int:
        """Delete files in the image/video directories that are not indexed.

        This also removes stale ``.part`` files left by interrupted
        downloads. The thumbnail directory is not scanned.

        NOTE: every file without an index record is deleted, so running this
        with an empty or failed-to-load index wipes both directories.

        Returns:
            Number of files deleted.
        """
        indexed_paths = {Path(media.local_path) for media in self.media_index.values()}
        orphaned_count = 0

        for directory in (self.image_path, self.video_path):
            if not directory.exists():
                continue
            # Snapshot the listing: deleting while iterating a live
            # directory iterator has unspecified results.
            for file_path in list(directory.iterdir()):
                if not file_path.is_file() or file_path in indexed_paths:
                    continue
                try:
                    file_path.unlink()
                except OSError as e:
                    logger.error(f"删除孤立文件失败: {str(e)}")
                else:
                    orphaned_count += 1
                    logger.info(f"删除孤立文件: {file_path}")

        logger.info(f"清理了 {orphaned_count} 个孤立文件")
        return orphaned_count

    def remove_media(self, media_id: str) -> bool:
        """Delete a media file (and a video's thumbnail) and its index record.

        Returns:
            ``False`` if ``media_id`` is unknown or the file could not be
            deleted (the record is kept in that case); ``True`` otherwise.
        """
        media_info = self.media_index.get(media_id)
        if media_info is None:
            return False

        try:
            if os.path.exists(media_info.local_path):
                os.remove(media_info.local_path)
                logger.info(f"删除媒体文件: {media_info.local_path}")
        except OSError as e:
            logger.error(f"删除媒体文件失败: {str(e)}")
            return False

        if media_info.media_type == 'video':
            # Thumbnails are not indexed, so this is the only point at which
            # they can be cleaned up.
            self._remove_quietly(str(self._thumbnail_path_for(media_info.local_path)))

        del self.media_index[media_id]
        self.save_media_index()
        return True