377 lines
15 KiB
Python
377 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
图像存储管理器
|
|
统一管理小红书笔记的图片和视频下载和存储
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import hashlib
|
|
import time
|
|
import requests
|
|
from typing import Dict, List, Optional, Tuple
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
import logging
|
|
logger = logging.getLogger(__name__)
|
|
from urllib.parse import urlparse
|
|
|
|
@dataclass
class MediaInfo:
    """Metadata record describing one downloaded media file."""

    media_id: str
    media_type: str  # 'image' or 'video'
    original_url: str
    local_path: str
    file_size: int
    download_time: datetime
    note_id: str
    note_title: str
    user_id: str
    user_name: str

    def to_dict(self) -> Dict:
        """Return the record as a plain dict suitable for JSON serialization.

        Every field is copied as-is except ``download_time``, which is
        rendered as an ISO-8601 string via ``datetime.isoformat()``.
        """
        field_names = (
            'media_id',
            'media_type',
            'original_url',
            'local_path',
            'file_size',
            'download_time',
            'note_id',
            'note_title',
            'user_id',
            'user_name',
        )
        record = {name: getattr(self, name) for name in field_names}
        # Re-assigning an existing key keeps its position in the dict.
        record['download_time'] = self.download_time.isoformat()
        return record
|
|
|
|
class ImageStorageManager:
    """Download media for Xiaohongshu notes and track it in a JSON index.

    Layout under ``base_storage_path``: ``images/``, ``videos/``,
    ``thumbnails/`` and ``media_index.json``. The index maps a media ID to
    the ``MediaInfo`` record of a downloaded image or video. Thumbnails are
    written to disk but are not recorded in the index.
    """

    # Browser-like User-Agent sent with every download request.
    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )
    # Translation table mapping each file-name-unsafe character to '_'.
    _UNSAFE_CHAR_TABLE = str.maketrans({ch: '_' for ch in '<>:"/\\|?*'})
    _MAX_FILENAME_LENGTH = 100
    _BYTES_PER_MB = 1024 * 1024

    def __init__(self, base_storage_path: str = "data/media"):
        """Create the storage directories and load any existing index.

        Args:
            base_storage_path: Root directory for media files and the index.
        """
        self.base_storage_path = Path(base_storage_path)
        self.media_index_file = self.base_storage_path / "media_index.json"
        self.media_index: Dict[str, "MediaInfo"] = {}
        self.chunk_size = 1024 * 1024  # bytes requested per streamed chunk

        self.image_path = self.base_storage_path / "images"
        self.video_path = self.base_storage_path / "videos"
        self.thumbnail_path = self.base_storage_path / "thumbnails"
        for directory in (self.image_path, self.video_path, self.thumbnail_path):
            directory.mkdir(parents=True, exist_ok=True)

        self.load_media_index()

    def load_media_index(self) -> None:
        """Populate ``self.media_index`` from the JSON index file, if present.

        Never raises: an unreadable or malformed file is logged and leaves
        the index as it was. Entries are validated one by one, so a single
        malformed entry is skipped (and logged) instead of discarding every
        entry that follows it.
        """
        try:
            if not self.media_index_file.exists():
                return
            with open(self.media_index_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.error(f"加载媒体索引失败: {str(e)}")
            return

        if not isinstance(data, dict):
            logger.error(f"加载媒体索引失败: 索引文件顶层应为对象, 实际为 {type(data).__name__}")
            return

        for media_id, media_data in data.items():
            try:
                self.media_index[media_id] = MediaInfo(
                    media_id=media_data['media_id'],
                    media_type=media_data['media_type'],
                    original_url=media_data['original_url'],
                    local_path=media_data['local_path'],
                    file_size=media_data['file_size'],
                    download_time=datetime.fromisoformat(media_data['download_time']),
                    note_id=media_data['note_id'],
                    note_title=media_data['note_title'],
                    user_id=media_data['user_id'],
                    user_name=media_data['user_name'],
                )
            except (KeyError, TypeError, ValueError) as e:
                logger.warning(f"跳过无效的媒体索引条目 {media_id}: {e!r}")

        logger.info(f"加载了 {len(self.media_index)} 个媒体文件索引")

    def save_media_index(self) -> None:
        """Write ``self.media_index`` to the JSON index file.

        The data is written to a sibling ``.tmp`` file and then moved over
        the real index with ``os.replace``, so an interrupted write cannot
        leave a truncated index behind. Never raises; failures are logged.
        """
        tmp_file = self.media_index_file.with_name(self.media_index_file.name + ".tmp")
        try:
            data = {
                media_id: media_info.to_dict()
                for media_id, media_info in self.media_index.items()
            }
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            os.replace(tmp_file, self.media_index_file)
            logger.debug("保存媒体索引成功")
        except Exception as e:
            # Best-effort boundary: callers do not expect saving to raise.
            logger.error(f"保存媒体索引失败: {str(e)}")
            self._discard_file(str(tmp_file))

    @staticmethod
    def _discard_file(path: str) -> None:
        """Delete ``path`` if it exists; cleanup failures are ignored."""
        try:
            os.remove(path)
        except OSError:
            # Includes FileNotFoundError; nothing useful to do for cleanup.
            pass

    def _generate_media_id(self, url: str, note_id: str) -> str:
        """Return the MD5 hex digest of ``"{url}_{note_id}"``.

        The digest is used only as a stable identifier for the URL/note pair.
        """
        content = f"{url}_{note_id}"
        return hashlib.md5(content.encode()).hexdigest()

    def _get_file_extension(self, url: str, media_type: str) -> str:
        """Return the lower-cased extension of the URL path, or a default.

        Args:
            url: Media URL; only its path component is inspected.
            media_type: ``'image'`` selects the ``.jpg`` default; any other
                value selects ``.mp4``.

        Returns:
            The extension including the leading dot. The default is used
            when the last path component has no extension, including when
            the only dot is in a directory component (``/v1.0/abc``) or the
            path ends in a bare dot.
        """
        extension = os.path.splitext(urlparse(url).path)[1].lower()
        if len(extension) > 1:  # more than just "."
            return extension
        return '.jpg' if media_type == 'image' else '.mp4'

    def _normalize_filename(self, filename: str) -> str:
        """Replace unsafe characters with ``_`` and cap the length at 100."""
        return filename.translate(self._UNSAFE_CHAR_TABLE)[:self._MAX_FILENAME_LENGTH]

    def _download_media(self, url: str, local_path: str, max_retries: int = 3) -> Tuple[bool, str, int]:
        """Stream ``url`` to ``local_path``, retrying on request errors.

        The body is written to ``local_path + ".part"`` and moved into place
        only after the whole response has been received, so a failed attempt
        never leaves a truncated file at ``local_path``.

        Args:
            url: Address to download.
            local_path: Destination file path.
            max_retries: Maximum number of attempts. Attempts are separated
                by ``2 ** attempt`` seconds of sleep.

        Returns:
            ``(success, message, file_size)``; ``file_size`` is the number
            of bytes written, or 0 on failure.
        """
        headers = {'User-Agent': self._USER_AGENT}
        partial_path = f"{local_path}.part"

        for attempt in range(max_retries):
            try:
                file_size = 0
                # The context manager releases the connection even when
                # the body is not fully consumed.
                with requests.get(url, headers=headers, stream=True, timeout=30) as response:
                    response.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=self.chunk_size):
                            if chunk:
                                f.write(chunk)
                                file_size += len(chunk)
                os.replace(partial_path, local_path)

                logger.info(f"下载成功: {local_path} ({file_size} bytes)")
                return True, "下载成功", file_size

            except requests.exceptions.RequestException as e:
                self._discard_file(partial_path)
                logger.warning(f"下载失败 (尝试 {attempt + 1}/{max_retries}): {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)  # exponential backoff
                else:
                    return False, f"下载失败: {str(e)}", 0
            except Exception as e:
                # Non-network failures (e.g. disk errors) are not retried.
                self._discard_file(partial_path)
                logger.error(f"下载出错: {str(e)}")
                return False, f"下载出错: {str(e)}", 0

        # Reached only when max_retries <= 0.
        return False, "下载失败", 0

    def _store_media(
        self,
        url: str,
        note_info: Dict,
        media_type: str,
        target_dir: Path,
        thumbnail_url: Optional[str] = None,
    ) -> Optional["MediaInfo"]:
        """Download one media file, index it, and return its record.

        Shared implementation behind ``download_image`` and
        ``download_video``.

        Args:
            url: Media URL.
            note_info: Note data. ``note_id`` is required; ``nickname``,
                ``title`` and ``user_id`` are read when present.
            media_type: ``'image'`` or ``'video'``.
            target_dir: Directory the file is written to.
            thumbnail_url: Optional cover image, saved under the thumbnail
                directory after the main download succeeds.

        Returns:
            The existing record when the media is already indexed and its
            file is on disk, the new record after a successful download,
            or ``None`` when the download fails.

        Raises:
            KeyError: If ``note_info`` has no ``note_id``.
        """
        label = '图片' if media_type == 'image' else '视频'
        note_id = note_info['note_id']
        media_id = self._generate_media_id(url, note_id)

        existing_media = self.media_index.get(media_id)
        if existing_media is not None and os.path.exists(existing_media.local_path):
            logger.info(f"{label}已存在: {existing_media.local_path}")
            return existing_media

        # `or` rather than a .get() default: a nickname that is present but
        # None/empty must still produce a usable file-name prefix.
        user_name = self._normalize_filename(str(note_info.get('nickname') or 'unknown'))
        stem = f"{user_name}_{note_id}_{media_id}"
        local_path = target_dir / f"{stem}{self._get_file_extension(url, media_type)}"

        success, message, file_size = self._download_media(url, str(local_path))
        if not success:
            logger.error(f"{label}下载失败: {message}")
            return None

        if thumbnail_url:
            thumbnail_path = self.thumbnail_path / f"{stem}_thumb.jpg"
            thumb_ok, thumb_message, _ = self._download_media(thumbnail_url, str(thumbnail_path))
            if not thumb_ok:
                # The thumbnail is optional; the media itself is still recorded.
                logger.warning(f"缩略图下载失败: {thumb_message}")

        media_info = MediaInfo(
            media_id=media_id,
            media_type=media_type,
            original_url=url,
            local_path=str(local_path),
            file_size=file_size,
            download_time=datetime.now(),
            note_id=note_id,
            note_title=note_info.get('title', ''),
            user_id=note_info.get('user_id', ''),
            user_name=note_info.get('nickname', ''),
        )
        self.media_index[media_id] = media_info
        self.save_media_index()
        return media_info

    def download_image(self, url: str, note_info: Dict) -> Optional["MediaInfo"]:
        """Download an image belonging to a note.

        Args:
            url: Image URL.
            note_info: Note data; must contain ``note_id``.

        Returns:
            The media record (existing or newly created), or ``None`` when
            the download fails.
        """
        return self._store_media(url, note_info, 'image', self.image_path)

    def download_video(self, url: str, note_info: Dict, thumbnail_url: Optional[str] = None) -> Optional["MediaInfo"]:
        """Download a video belonging to a note, plus its optional cover.

        Args:
            url: Video URL.
            note_info: Note data; must contain ``note_id``.
            thumbnail_url: Optional cover image URL.

        Returns:
            The media record (existing or newly created), or ``None`` when
            the video download fails.
        """
        return self._store_media(url, note_info, 'video', self.video_path, thumbnail_url)

    def download_note_media(self, note_info: Dict, download_images: bool = True, download_videos: bool = True) -> List["MediaInfo"]:
        """Download every image and the video referenced by a note.

        Args:
            note_info: Note data. Image URLs are read from ``image_list``,
                the video URL from ``video_addr`` and its cover from
                ``video_cover``.
            download_images: Whether to download the images.
            download_videos: Whether to download the video.

        Returns:
            Records for the media that were downloaded or already present,
            images first, then the video.
        """
        downloaded_media: List["MediaInfo"] = []

        if download_images:
            for image_url in note_info.get('image_list') or []:
                media_info = self.download_image(image_url, note_info)
                if media_info:
                    downloaded_media.append(media_info)

        if download_videos and note_info.get('video_addr'):
            media_info = self.download_video(
                note_info['video_addr'],
                note_info,
                note_info.get('video_cover'),
            )
            if media_info:
                downloaded_media.append(media_info)

        return downloaded_media

    def get_media_by_note(self, note_id: str) -> List["MediaInfo"]:
        """Return all indexed media whose ``note_id`` equals ``note_id``."""
        return [media for media in self.media_index.values() if media.note_id == note_id]

    def get_media_by_user(self, user_id: str) -> List["MediaInfo"]:
        """Return all indexed media whose ``user_id`` equals ``user_id``."""
        return [media for media in self.media_index.values() if media.user_id == user_id]

    def get_storage_statistics(self) -> Dict:
        """Summarize the index: file counts and byte totals, overall and per type.

        Sizes come from the ``file_size`` stored in each record, not from
        the files on disk. ``*_mb`` values are rounded to two decimals.
        """
        counts = {'image': 0, 'video': 0}
        sizes = {'image': 0, 'video': 0}
        total_size = 0
        for media in self.media_index.values():
            total_size += media.file_size
            if media.media_type in counts:
                counts[media.media_type] += 1
                sizes[media.media_type] += media.file_size

        return {
            'total_files': len(self.media_index),
            'total_size': total_size,
            'total_size_mb': round(total_size / self._BYTES_PER_MB, 2),
            'image_count': counts['image'],
            'video_count': counts['video'],
            'image_size': sizes['image'],
            'video_size': sizes['video'],
            'image_size_mb': round(sizes['image'] / self._BYTES_PER_MB, 2),
            'video_size_mb': round(sizes['video'] / self._BYTES_PER_MB, 2),
            'storage_path': str(self.base_storage_path),
        }

    def cleanup_orphaned_files(self) -> int:
        """Delete files in the image and video directories that are not indexed.

        A file is orphaned when no index record's ``local_path`` equals its
        path. The thumbnail directory is not scanned.

        Returns:
            The number of files deleted.
        """
        # Built once so each file on disk costs a single set lookup.
        indexed_paths = {Path(media_info.local_path) for media_info in self.media_index.values()}
        orphaned_count = 0

        for directory in (self.image_path, self.video_path):
            if not directory.exists():
                continue
            for file_path in directory.iterdir():
                if not file_path.is_file() or file_path in indexed_paths:
                    continue
                try:
                    file_path.unlink()
                except Exception as e:
                    logger.error(f"删除孤立文件失败: {str(e)}")
                else:
                    orphaned_count += 1
                    logger.info(f"删除孤立文件: {file_path}")

        logger.info(f"清理了 {orphaned_count} 个孤立文件")
        return orphaned_count

    def remove_media(self, media_id: str) -> bool:
        """Delete a media file and drop its index entry.

        Args:
            media_id: ID of the media to remove.

        Returns:
            ``True`` when the entry was removed from the index (the file
            was deleted or was already missing); ``False`` when the ID is
            unknown or the file could not be deleted, in which case the
            index is left unchanged.
        """
        media_info = self.media_index.get(media_id)
        if media_info is None:
            return False

        try:
            os.remove(media_info.local_path)
            logger.info(f"删除媒体文件: {media_info.local_path}")
        except FileNotFoundError:
            # Already gone from disk; still drop the stale index entry.
            pass
        except Exception as e:
            logger.error(f"删除媒体文件失败: {str(e)}")
            return False

        del self.media_index[media_id]
        self.save_media_index()
        return True