#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Hotspot manager.

Unified management of trending-topic data from multiple sources.
"""

import logging
import asyncio
from typing import List, Dict, Optional
from datetime import datetime
from pathlib import Path
import json

from .models import HotTopic, HotTopicSource, HotTopicCategory
from .crawlers import WeiboCrawler, BaiduCrawler, CalendarCrawler, XiaohongshuCrawler, BingCrawler

logger = logging.getLogger(__name__)


class HotspotManager:
    """
    Hotspot manager.

    Responsibilities:
    - Manage multiple crawlers
    - Cache hot-topic data
    - Provide a unified query interface
    - Support manually added topics
    """

    def __init__(self, cache_dir: Optional[str] = None):
        self.logger = logging.getLogger(f"{__name__}.HotspotManager")

        # Cache directory
        self.cache_dir = Path(cache_dir) if cache_dir else Path(__file__).parent / 'cache'
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Crawler instances
        self._crawlers = {
            HotTopicSource.WEIBO: WeiboCrawler(),
            HotTopicSource.BAIDU: BaiduCrawler(),
            HotTopicSource.CALENDAR: CalendarCrawler(),
            HotTopicSource.XIAOHONGSHU: XiaohongshuCrawler(),
            HotTopicSource.BING: BingCrawler(),
        }

        # In-memory cache
        self._cache: Dict[HotTopicSource, List[HotTopic]] = {}
        self._cache_time: Dict[HotTopicSource, datetime] = {}

        # Custom topics
        self._custom_topics: List[HotTopic] = []

        # Cache TTL (seconds)
        self._cache_ttl = {
            HotTopicSource.WEIBO: 300,         # 5 minutes
            HotTopicSource.BAIDU: 600,         # 10 minutes
            HotTopicSource.CALENDAR: 3600,     # 1 hour
            HotTopicSource.XIAOHONGSHU: 1800,  # 30 minutes
            HotTopicSource.BING: 1800,         # 30 minutes
            HotTopicSource.CUSTOM: 86400,      # 24 hours
        }

    async def fetch_all(self, force: bool = False) -> Dict[str, List[HotTopic]]:
        """
        Fetch hot topics from all sources.

        Args:
            force: bypass the cache and refresh every source

        Returns:
            Hot topics grouped by source
        """
        results = {}

        # Fetch all stale sources concurrently
        tasks = []
        sources = []

        for source, crawler in self._crawlers.items():
            if force or self._is_cache_expired(source):
                tasks.append(crawler.fetch_with_retry())
                sources.append(source)
            else:
                results[source.value] = self._cache.get(source, [])

        if tasks:
            fetched = await asyncio.gather(*tasks, return_exceptions=True)

            for source, data in zip(sources, fetched):
                if isinstance(data, Exception):
                    self.logger.error(f"Failed to fetch {source.value}: {data}")
                    # Fall back to whatever is still cached for this source
                    results[source.value] = self._cache.get(source, [])
                else:
                    self._cache[source] = data
                    self._cache_time[source] = datetime.now()
                    results[source.value] = data

        # Append custom topics
        results[HotTopicSource.CUSTOM.value] = self._custom_topics

        return results

    async def fetch_source(self, source: HotTopicSource, force: bool = False) -> List[HotTopic]:
        """Fetch hot topics from a single source."""
        if source == HotTopicSource.CUSTOM:
            return self._custom_topics

        if not force and not self._is_cache_expired(source):
            return self._cache.get(source, [])

        crawler = self._crawlers.get(source)
        if not crawler:
            self.logger.warning(f"Unknown data source: {source}")
            return []

        topics = await crawler.fetch_with_retry()
        self._cache[source] = topics
        self._cache_time[source] = datetime.now()

        return topics

    async def get_travel_topics(self, limit: int = 10) -> List[HotTopic]:
        """Get travel-related hot topics."""
        all_topics = []

        # Fetch from every source
        results = await self.fetch_all()

        for topics in results.values():
            for topic in topics:
                if topic.is_travel_related() or topic.category == HotTopicCategory.TRAVEL:
                    all_topics.append(topic)

        # Sort by heat, hottest first
        all_topics.sort(key=lambda x: x.heat or 0, reverse=True)

        return all_topics[:limit]

    async def get_trending(self, limit: int = 20) -> List[HotTopic]:
        """Get trending topics, merged across all sources."""
        all_topics = []

        results = await self.fetch_all()
        for topics in results.values():
            all_topics.extend(topics)

        # Deduplicate by title
        seen = set()
        unique_topics = []
        for topic in all_topics:
            if topic.title not in seen:
                seen.add(topic.title)
                unique_topics.append(topic)

        # Sort by heat, hottest first
        unique_topics.sort(key=lambda x: x.heat or 0, reverse=True)

        return unique_topics[:limit]
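    # Usage sketch for the query helpers above (illustration only, not part of
    # the original module; assumes an asyncio event loop and that HotTopic
    # exposes the `title` and `heat` fields used in this class):
    #
    #     manager = HotspotManager()
    #     trending = await manager.get_trending(limit=10)
    #     travel = await manager.get_travel_topics(limit=5)
    #     await manager.close()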
    async def get_festivals(self, days: int = 30) -> List[HotTopic]:
        """Get upcoming festivals."""
        topics = await self.fetch_source(HotTopicSource.CALENDAR)

        # Keep festivals up to 7 days past and up to `days` ahead; entries
        # without a 'days_until' field are excluded via the 999 default
        filtered = []

        for topic in topics:
            days_until = topic.extra.get('days_until', 999)
            if -7 <= days_until <= days:
                filtered.append(topic)

        return filtered

    def add_custom_topic(self, topic: HotTopic) -> bool:
        """Add a custom topic."""
        topic.source = HotTopicSource.CUSTOM
        self._custom_topics.append(topic)
        self._save_custom_topics()
        return True

    def remove_custom_topic(self, title: str) -> bool:
        """Remove a custom topic by title."""
        for i, topic in enumerate(self._custom_topics):
            if topic.title == title:
                self._custom_topics.pop(i)
                self._save_custom_topics()
                return True
        return False

    def get_custom_topics(self) -> List[HotTopic]:
        """Get all custom topics."""
        return self._custom_topics

    def _is_cache_expired(self, source: HotTopicSource) -> bool:
        """Check whether the cache for a source has expired."""
        if source not in self._cache_time:
            return True

        ttl = self._cache_ttl.get(source, 300)
        elapsed = (datetime.now() - self._cache_time[source]).total_seconds()

        return elapsed > ttl

    def _save_custom_topics(self):
        """Persist custom topics to disk."""
        try:
            cache_file = self.cache_dir / 'custom_topics.json'
            data = [t.to_dict() for t in self._custom_topics]
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except Exception as e:
            self.logger.error(f"Failed to save custom topics: {e}")

    def _load_custom_topics(self):
        """Load custom topics from disk."""
        try:
            cache_file = self.cache_dir / 'custom_topics.json'
            if cache_file.exists():
                with open(cache_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                self._custom_topics = [HotTopic.from_dict(d) for d in data]
        except Exception as e:
            self.logger.error(f"Failed to load custom topics: {e}")

    async def close(self):
        """Close all crawlers."""
        for crawler in self._crawlers.values():
            await crawler.close()


# Global singleton
_manager_instance: Optional[HotspotManager] = None


def get_hotspot_manager() -> HotspotManager:
    """Get the global hotspot manager."""
    global _manager_instance
    if _manager_instance is None:
        _manager_instance = HotspotManager()
        _manager_instance._load_custom_topics()
    return _manager_instance
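

# End-to-end usage sketch (illustration only, not part of the original module).
# The package name `hotspots` in the import path below is an assumption — the
# actual package this module lives in may differ:
#
#     import asyncio
#     from hotspots.manager import get_hotspot_manager  # hypothetical import path
#
#     async def demo():
#         manager = get_hotspot_manager()
#         trending = await manager.get_trending(limit=5)
#         for topic in trending:
#             print(topic.title, topic.heat)
#         await manager.close()
#
#     asyncio.run(demo())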