#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Weibo hot search crawler.

Data source: https://weibo.com/ajax/side/hotSearch
"""
import logging
from typing import List, Optional
from datetime import datetime, timedelta

import aiohttp

from .base import BaseCrawler
from ..models import HotTopic, HotTopicSource, HotTopicCategory

logger = logging.getLogger(__name__)


class WeiboCrawler(BaseCrawler):
    """Weibo hot search crawler."""

    source = HotTopicSource.WEIBO
    name = "微博热搜"

    # Official Weibo hot search API (may require a Cookie)
    API_URL = "https://weibo.com/ajax/side/hotSearch"
    # Weibo hot search page (fallback, parsed as HTML)
    PAGE_URL = "https://s.weibo.com/top/summary"

    def _get_default_headers(self) -> dict:
        """Default request headers."""
        return {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

    async def fetch(self) -> List[HotTopic]:
        """Fetch Weibo hot search topics."""
        topics = []
        try:
            session = await self._get_session()

            # Try the official API first
            try:
                async with session.get(self.API_URL, timeout=aiohttp.ClientTimeout(total=10)) as response:
                    if response.status == 200:
                        data = await response.json()
                        realtime = data.get('data', {}).get('realtime', [])
                        if realtime:
                            return self._parse_realtime(realtime)
            except Exception as e:
                self.logger.debug(f"Official API failed: {e}")

            # Fall back to parsing the HTML page
            self.logger.info("Falling back to parsing the Weibo hot search page")
            async with session.get(self.PAGE_URL) as response:
                if response.status != 200:
                    self.logger.error(f"Page request failed: {response.status}")
                    return []

                # Handle encoding issues
                content = await response.read()
                try:
                    html = content.decode('utf-8')
                except UnicodeDecodeError:
                    html = content.decode('gbk', errors='ignore')

                topics = self._parse_html(html)
                if topics:
                    self.logger.info(f"Fetched {len(topics)} Weibo hot search topics (HTML)")
                else:
                    self.logger.warning("HTML parsing returned no data")

        except Exception as e:
            self.logger.error(f"Failed to fetch Weibo hot search: {e}")

        return topics

    def _parse_html(self, html: str) -> List[HotTopic]:
        """Parse the hot search HTML page."""
        import re

        topics = []
        try:
            # Match hot search entries: topic link, title, and optional heat value
            pattern = r'<td class="td-02"[^>]*>.*?<a[^>]*href="([^"]*)"[^>]*>([^<]+)</a>.*?(?:<span>(\d+)</span>)?'
            matches = re.findall(pattern, html, re.DOTALL)
            for idx, match in enumerate(matches):
                url, title, heat_str = match
                title = title.strip()
                if not title:
                    continue

                heat = int(heat_str) if heat_str else 0

                topic = HotTopic(
                    title=title,
                    source=self.source,
                    rank=idx + 1,
                    heat=heat,
                    category=self._detect_category(title, []),
                    url=f"https://s.weibo.com{url}" if url.startswith('/') else url,
                    fetched_at=datetime.now(),
                    expires_at=datetime.now() + timedelta(hours=1),
                )
                topics.append(topic)
        except Exception as e:
            self.logger.error(f"HTML parsing failed: {e}")

        return topics

    def _parse_realtime(self, realtime: list) -> List[HotTopic]:
        """Parse data from the official API."""
        topics = []
        for idx, item in enumerate(realtime):
            topic = self._parse_item(item, idx + 1)
            if topic:
                topics.append(topic)
        self.logger.info(f"Fetched {len(topics)} Weibo hot search topics")
        return topics

    def _parse_item(self, item: dict, rank: int) -> Optional[HotTopic]:
        """Parse a single hot search item."""
        try:
            word = item.get('word', '')
            if not word:
                return None

            # Heat value
            raw_hot = item.get('raw_hot', 0) or item.get('num', 0)

            # Tags
            tags = []
            label_name = item.get('label_name', '')
            if label_name:
                tags.append(label_name)

            # Detect category
            category = self._detect_category(word, tags)

            return HotTopic(
                title=word,
                source=self.source,
                rank=rank,
                heat=raw_hot,
                category=category,
                url=f"https://s.weibo.com/weibo?q=%23{word}%23",
                tags=tags,
                fetched_at=datetime.now(),
                expires_at=datetime.now() + timedelta(hours=1),  # expires after 1 hour
                extra={
                    'icon_desc': item.get('icon_desc', ''),
                    'is_hot': item.get('is_hot', 0),
                    'is_new': item.get('is_new', 0),
                }
            )
        except Exception as e:
            self.logger.warning(f"Failed to parse hot search item: {e}")
            return None

    def _detect_category(self, title: str, tags: List[str]) -> HotTopicCategory:
        """Detect the topic category from keywords in the title."""
        # Travel-related keywords
        travel_keywords = ['旅游', '旅行', '景区', '景点', '酒店', '度假', '出游', '自驾']
        if any(kw in title for kw in travel_keywords):
            return HotTopicCategory.TRAVEL

        # Food-related keywords
        food_keywords = ['美食', '餐厅', '小吃', '火锅', '烧烤', '奶茶']
        if any(kw in title for kw in food_keywords):
            return HotTopicCategory.FOOD

        # Festival-related keywords
        festival_keywords = ['节', '春节', '国庆', '中秋', '元旦', '五一', '十一']
        if any(kw in title for kw in festival_keywords):
            return HotTopicCategory.FESTIVAL

        return HotTopicCategory.TRENDING
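

# --- Usage sketch (illustrative only) ----------------------------------------
# A minimal sketch of driving the crawler, assuming this module is imported as
# part of its package (so the relative imports above resolve) and that
# WeiboCrawler() can be constructed without arguments. The `close()` call is an
# assumption about the BaseCrawler API, not a documented method; adjust to
# however the base class releases its aiohttp session.
#
#     import asyncio
#
#     async def main():
#         crawler = WeiboCrawler()
#         try:
#             topics = await crawler.fetch()
#             for t in topics[:10]:
#                 print(t.rank, t.title, t.heat, t.category)
#         finally:
#             if hasattr(crawler, "close"):
#                 await crawler.close()  # hypothetical cleanup hook
#
#     asyncio.run(main())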