#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Hotspot manager

Centralized management of hot-topic data from multiple sources.
"""

import logging
import asyncio
from typing import List, Dict, Optional
from datetime import datetime
from pathlib import Path
import json

from .models import HotTopic, HotTopicSource, HotTopicCategory
from .crawlers import WeiboCrawler, BaiduCrawler, CalendarCrawler, XiaohongshuCrawler, BingCrawler

logger = logging.getLogger(__name__)


class HotspotManager:
    """
    Hotspot manager.

    Responsibilities:
    - Manage multiple crawlers
    - Cache hot-topic data
    - Provide a unified query interface
    - Support manually added topics
    """

    def __init__(self, cache_dir: Optional[str] = None):
        self.logger = logging.getLogger(f"{__name__}.HotspotManager")

        # Cache directory (defaults to a 'cache' folder next to this module)
        self.cache_dir = Path(cache_dir) if cache_dir else Path(__file__).parent / 'cache'
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Crawler instances, one per data source
        self._crawlers = {
            HotTopicSource.WEIBO: WeiboCrawler(),
            HotTopicSource.BAIDU: BaiduCrawler(),
            HotTopicSource.CALENDAR: CalendarCrawler(),
            HotTopicSource.XIAOHONGSHU: XiaohongshuCrawler(),
            HotTopicSource.BING: BingCrawler(),
        }

        # In-memory cache and per-source fetch timestamps
        self._cache: Dict[HotTopicSource, List[HotTopic]] = {}
        self._cache_time: Dict[HotTopicSource, datetime] = {}

        # Manually added topics
        self._custom_topics: List[HotTopic] = []

        # Cache TTL per source (seconds)
        self._cache_ttl = {
            HotTopicSource.WEIBO: 300,         # 5 minutes
            HotTopicSource.BAIDU: 600,         # 10 minutes
            HotTopicSource.CALENDAR: 3600,     # 1 hour
            HotTopicSource.XIAOHONGSHU: 1800,  # 30 minutes
            HotTopicSource.BING: 1800,         # 30 minutes
            HotTopicSource.CUSTOM: 86400,      # 24 hours
        }

    async def fetch_all(self, force: bool = False) -> Dict[str, List[HotTopic]]:
        """
        Fetch hot topics from all sources.

        Args:
            force: Force a refresh, bypassing the cache.

        Returns:
            Hot topics grouped by source name.
        """
        results = {}

        # Fetch all stale (or force-refreshed) sources concurrently
        tasks = []
        sources = []

        for source, crawler in self._crawlers.items():
            if force or self._is_cache_expired(source):
                tasks.append(crawler.fetch_with_retry())
                sources.append(source)
            else:
                results[source.value] = self._cache.get(source, [])

        if tasks:
            # return_exceptions=True keeps one failing source from aborting the rest
            fetched = await asyncio.gather(*tasks, return_exceptions=True)

            for source, data in zip(sources, fetched):
                if isinstance(data, Exception):
                    self.logger.error(f"Failed to fetch {source.value}: {data}")
                    results[source.value] = self._cache.get(source, [])
                else:
                    self._cache[source] = data
                    self._cache_time[source] = datetime.now()
                    results[source.value] = data

        # Append manually added topics
        results[HotTopicSource.CUSTOM.value] = self._custom_topics

        return results

    async def fetch_source(self, source: HotTopicSource, force: bool = False) -> List[HotTopic]:
        """Fetch hot topics from a single source."""
        if source == HotTopicSource.CUSTOM:
            return self._custom_topics

        if not force and not self._is_cache_expired(source):
            return self._cache.get(source, [])

        crawler = self._crawlers.get(source)
        if not crawler:
            self.logger.warning(f"Unknown data source: {source}")
            return []

        topics = await crawler.fetch_with_retry()
        self._cache[source] = topics
        self._cache_time[source] = datetime.now()

        return topics

    async def get_travel_topics(self, limit: int = 10) -> List[HotTopic]:
        """Fetch travel-related hot topics."""
        all_topics = []

        # Pull from every source
        results = await self.fetch_all()

        for topics in results.values():
            for topic in topics:
                if topic.is_travel_related() or topic.category == HotTopicCategory.TRAVEL:
                    all_topics.append(topic)

        # Sort by heat, hottest first
        all_topics.sort(key=lambda x: x.heat or 0, reverse=True)

        return all_topics[:limit]

    async def get_trending(self, limit: int = 20) -> List[HotTopic]:
        """Fetch trending topics, merged across all sources."""
        all_topics = []

        results = await self.fetch_all()

        for topics in results.values():
            all_topics.extend(topics)

        # Deduplicate by title
        seen = set()
        unique_topics = []
        for topic in all_topics:
            if topic.title not in seen:
                seen.add(topic.title)
                unique_topics.append(topic)

        # Sort by heat, hottest first
        unique_topics.sort(key=lambda x: x.heat or 0, reverse=True)

        return unique_topics[:limit]

    async def get_festivals(self, days: int = 30) -> List[HotTopic]:
        """Fetch upcoming festivals."""
        topics = await self.fetch_source(HotTopicSource.CALENDAR)

        # Keep festivals within the window: up to 7 days past, `days` days ahead
        filtered = []
        for topic in topics:
            days_until = topic.extra.get('days_until', 999)
            if -7 <= days_until <= days:
                filtered.append(topic)

        return filtered

    def add_custom_topic(self, topic: HotTopic) -> bool:
        """Add a custom hot topic and persist it."""
        topic.source = HotTopicSource.CUSTOM
        self._custom_topics.append(topic)
        self._save_custom_topics()
        return True

    def remove_custom_topic(self, title: str) -> bool:
        """Remove a custom hot topic by title."""
        for i, topic in enumerate(self._custom_topics):
            if topic.title == title:
                self._custom_topics.pop(i)
                self._save_custom_topics()
                return True
        return False

    def get_custom_topics(self) -> List[HotTopic]:
        """Return the manually added hot topics."""
        return self._custom_topics

    def _is_cache_expired(self, source: HotTopicSource) -> bool:
        """Check whether the cached data for a source has expired."""
        if source not in self._cache_time:
            return True

        ttl = self._cache_ttl.get(source, 300)
        elapsed = (datetime.now() - self._cache_time[source]).total_seconds()

        return elapsed > ttl

    def _save_custom_topics(self):
        """Persist custom hot topics to a JSON file."""
        try:
            cache_file = self.cache_dir / 'custom_topics.json'
            data = [t.to_dict() for t in self._custom_topics]
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except Exception as e:
            self.logger.error(f"Failed to save custom topics: {e}")

    def _load_custom_topics(self):
        """Load custom hot topics from the JSON file, if present."""
        try:
            cache_file = self.cache_dir / 'custom_topics.json'
            if cache_file.exists():
                with open(cache_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                self._custom_topics = [HotTopic.from_dict(d) for d in data]
        except Exception as e:
            self.logger.error(f"Failed to load custom topics: {e}")

    async def close(self):
        """Close all crawlers."""
        for crawler in self._crawlers.values():
            await crawler.close()


# Global singleton
_manager_instance: Optional[HotspotManager] = None


def get_hotspot_manager() -> HotspotManager:
    """Return the global hotspot manager, creating it on first use."""
    global _manager_instance
    if _manager_instance is None:
        _manager_instance = HotspotManager()
        _manager_instance._load_custom_topics()
    return _manager_instance
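

# Minimal usage sketch (assumes the package is importable, e.g. run via
# `python -m <package>.hotspot_manager`; the module path is an assumption).
# Illustrative only, not part of the module's public API.
if __name__ == '__main__':
    async def _demo():
        manager = get_hotspot_manager()
        try:
            # Merged, deduplicated trending view across all sources
            for topic in await manager.get_trending(limit=5):
                print(topic.title, topic.heat)
        finally:
            await manager.close()

    asyncio.run(_demo())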