#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Hotspot manager.

Aggregates hot-topic data from multiple sources behind a single interface.
"""
import asyncio
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

from .models import HotTopic, HotTopicSource, HotTopicCategory
from .crawlers import WeiboCrawler, BaiduCrawler, CalendarCrawler, XiaohongshuCrawler, BingCrawler

logger = logging.getLogger(__name__)


class HotspotManager:
    """
    Hotspot manager.

    Responsibilities:
    - Manage multiple crawlers
    - Cache fetched hot-topic data
    - Expose a unified query interface
    - Support manually added custom topics
    """

    def __init__(self, cache_dir: Optional[str] = None):
        self.logger = logging.getLogger(f"{__name__}.HotspotManager")
        # Cache directory (defaults to a 'cache' folder next to this module)
        self.cache_dir = Path(cache_dir) if cache_dir else Path(__file__).parent / 'cache'
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        # Crawler instances, one per data source
        self._crawlers = {
            HotTopicSource.WEIBO: WeiboCrawler(),
            HotTopicSource.BAIDU: BaiduCrawler(),
            HotTopicSource.CALENDAR: CalendarCrawler(),
            HotTopicSource.XIAOHONGSHU: XiaohongshuCrawler(),
            HotTopicSource.BING: BingCrawler(),
        }
        # In-memory cache of fetched topics and their fetch timestamps
        self._cache: Dict[HotTopicSource, List[HotTopic]] = {}
        self._cache_time: Dict[HotTopicSource, datetime] = {}
        # Manually added topics
        self._custom_topics: List[HotTopic] = []
        # Cache TTL per source (seconds)
        self._cache_ttl = {
            HotTopicSource.WEIBO: 300,         # 5 minutes
            HotTopicSource.BAIDU: 600,         # 10 minutes
            HotTopicSource.CALENDAR: 3600,     # 1 hour
            HotTopicSource.XIAOHONGSHU: 1800,  # 30 minutes
            HotTopicSource.BING: 1800,         # 30 minutes
            HotTopicSource.CUSTOM: 86400,      # 24 hours
        }

    async def fetch_all(self, force: bool = False) -> Dict[str, List[HotTopic]]:
        """
        Fetch hot topics from every source.

        Args:
            force: refresh even if the cache is still fresh

        Returns:
            Hot topics grouped by source value
        """
        results = {}
        # Fetch stale sources concurrently; fresh ones are served from cache
        tasks = []
        sources = []
        for source, crawler in self._crawlers.items():
            if force or self._is_cache_expired(source):
                tasks.append(crawler.fetch_with_retry())
                sources.append(source)
            else:
                results[source.value] = self._cache.get(source, [])
        if tasks:
            fetched = await asyncio.gather(*tasks, return_exceptions=True)
            for source, data in zip(sources, fetched):
                if isinstance(data, Exception):
                    # Fall back to the last cached data for this source
                    self.logger.error(f"Failed to fetch {source.value}: {data}")
                    results[source.value] = self._cache.get(source, [])
                else:
                    self._cache[source] = data
                    self._cache_time[source] = datetime.now()
                    results[source.value] = data
        # Append manually added topics
        results[HotTopicSource.CUSTOM.value] = self._custom_topics
        return results
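    # Illustrative result shape (keys are HotTopicSource values, so the exact
    # strings depend on how the enum is defined in .models):
    #   {"weibo": [HotTopic(...), ...], "baidu": [...], "custom": [...]}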

    async def fetch_source(self, source: HotTopicSource, force: bool = False) -> List[HotTopic]:
        """Fetch hot topics from a single source."""
        if source == HotTopicSource.CUSTOM:
            return self._custom_topics
        if not force and not self._is_cache_expired(source):
            return self._cache.get(source, [])
        crawler = self._crawlers.get(source)
        if not crawler:
            self.logger.warning(f"Unknown data source: {source}")
            return []
        topics = await crawler.fetch_with_retry()
        self._cache[source] = topics
        self._cache_time[source] = datetime.now()
        return topics

    async def get_travel_topics(self, limit: int = 10) -> List[HotTopic]:
        """Get travel-related hot topics across all sources."""
        all_topics = []
        results = await self.fetch_all()
        for topics in results.values():
            for topic in topics:
                if topic.is_travel_related() or topic.category == HotTopicCategory.TRAVEL:
                    all_topics.append(topic)
        # Sort by heat, hottest first
        all_topics.sort(key=lambda x: x.heat or 0, reverse=True)
        return all_topics[:limit]

    async def get_trending(self, limit: int = 20) -> List[HotTopic]:
        """Get trending topics merged from all sources."""
        all_topics = []
        results = await self.fetch_all()
        for topics in results.values():
            all_topics.extend(topics)
        # Deduplicate by title, keeping the first occurrence
        seen = set()
        unique_topics = []
        for topic in all_topics:
            if topic.title not in seen:
                seen.add(topic.title)
                unique_topics.append(topic)
        # Sort by heat, hottest first
        unique_topics.sort(key=lambda x: x.heat or 0, reverse=True)
        return unique_topics[:limit]

    async def get_festivals(self, days: int = 30) -> List[HotTopic]:
        """Get festivals within the coming `days` days (and the past week)."""
        topics = await self.fetch_source(HotTopicSource.CALENDAR)
        filtered = []
        for topic in topics:
            # Topics without a 'days_until' field default to 999 and are skipped
            days_until = topic.extra.get('days_until', 999)
            if -7 <= days_until <= days:
                filtered.append(topic)
        return filtered

    def add_custom_topic(self, topic: HotTopic) -> bool:
        """Add a custom hot topic and persist it to disk."""
        topic.source = HotTopicSource.CUSTOM
        self._custom_topics.append(topic)
        self._save_custom_topics()
        return True

    def remove_custom_topic(self, title: str) -> bool:
        """Remove a custom hot topic by title."""
        for i, topic in enumerate(self._custom_topics):
            if topic.title == title:
                self._custom_topics.pop(i)
                self._save_custom_topics()
                return True
        return False

    def get_custom_topics(self) -> List[HotTopic]:
        """Get all custom hot topics."""
        return self._custom_topics

    def _is_cache_expired(self, source: HotTopicSource) -> bool:
        """Check whether the cached data for a source has expired."""
        if source not in self._cache_time:
            return True
        ttl = self._cache_ttl.get(source, 300)
        elapsed = (datetime.now() - self._cache_time[source]).total_seconds()
        return elapsed > ttl

    def _save_custom_topics(self):
        """Persist custom topics to a JSON file."""
        try:
            cache_file = self.cache_dir / 'custom_topics.json'
            data = [t.to_dict() for t in self._custom_topics]
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except Exception as e:
            self.logger.error(f"Failed to save custom topics: {e}")

    def _load_custom_topics(self):
        """Load custom topics from the JSON file, if present."""
        try:
            cache_file = self.cache_dir / 'custom_topics.json'
            if cache_file.exists():
                with open(cache_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                self._custom_topics = [HotTopic.from_dict(d) for d in data]
        except Exception as e:
            self.logger.error(f"Failed to load custom topics: {e}")

    async def close(self):
        """Close all crawlers and release their resources."""
        for crawler in self._crawlers.values():
            await crawler.close()


# Global singleton
_manager_instance: Optional[HotspotManager] = None


def get_hotspot_manager() -> HotspotManager:
    """Get the global hotspot manager, creating it on first use."""
    global _manager_instance
    if _manager_instance is None:
        _manager_instance = HotspotManager()
        _manager_instance._load_custom_topics()
    return _manager_instance
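

# --- Usage sketch (illustrative, not part of the library) -------------------
# A minimal demo of the manager, assuming the crawler classes in .crawlers are
# implemented and network access is available. Because of the relative imports,
# run it in package context, e.g. `python -m <your_package>.<this_module>`.
if __name__ == "__main__":
    async def _demo():
        manager = get_hotspot_manager()
        try:
            # Fetch the merged, deduplicated top 5 topics across all sources
            trending = await manager.get_trending(limit=5)
            for topic in trending:
                print(topic.title, topic.heat)
        finally:
            # Release crawler sessions/connections
            await manager.close()

    asyncio.run(_demo())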