
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
微博热搜爬虫
数据源: https://weibo.com/ajax/side/hotSearch
"""
import logging
import re
from typing import List, Optional
from datetime import datetime, timedelta
import aiohttp
from .base import BaseCrawler
from ..models import HotTopic, HotTopicSource, HotTopicCategory

logger = logging.getLogger(__name__)


class WeiboCrawler(BaseCrawler):
"""微博热搜爬虫"""
source = HotTopicSource.WEIBO
name = "微博热搜"
# 微博热搜 API (官方,需要 Cookie)
API_URL = "https://weibo.com/ajax/side/hotSearch"
# 微博热搜页面 (HTML 解析)
PAGE_URL = "https://s.weibo.com/top/summary"

    def _get_default_headers(self) -> dict:
        """Default request headers."""
        return {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

    async def fetch(self) -> List[HotTopic]:
        """Fetch Weibo hot-search topics: try the official API first, then fall back to HTML."""
        topics = []
        try:
            session = await self._get_session()

            # Try the official API first
            try:
                async with session.get(self.API_URL, timeout=aiohttp.ClientTimeout(total=10)) as response:
                    if response.status == 200:
                        data = await response.json()
                        realtime = data.get('data', {}).get('realtime', [])
                        if realtime:
                            return self._parse_realtime(realtime)
            except Exception as e:
                self.logger.debug(f"Official API failed: {e}")

            # Fall back to parsing the HTML page
            self.logger.info("Falling back to parsing the Weibo hot-search page")
            async with session.get(self.PAGE_URL) as response:
                if response.status != 200:
                    self.logger.error(f"Page request failed: {response.status}")
                    return []
                # Handle encoding issues: try UTF-8 first, then GBK
                content = await response.read()
                try:
                    html = content.decode('utf-8')
                except UnicodeDecodeError:
                    html = content.decode('gbk', errors='ignore')
                topics = self._parse_html(html)
                if topics:
                    self.logger.info(f"Fetched {len(topics)} Weibo hot-search topics (HTML)")
                else:
                    self.logger.warning("HTML parsing returned no data")
        except Exception as e:
            self.logger.error(f"Failed to fetch Weibo hot search: {e}")
        return topics

    def _parse_html(self, html: str) -> List[HotTopic]:
        """Parse the hot-search HTML page."""
        topics = []
        try:
            # Match entries shaped like:
            # <td class="td-02"><a href="...">topic</a><span>heat</span></td>
            # The heat <span> is optional; use \s* (not a lazy wildcard) before the
            # optional group so the heat is actually captured when present.
            pattern = r'<td class="td-02"[^>]*>.*?<a[^>]*href="([^"]*)"[^>]*>([^<]+)</a>\s*(?:<span>(\d+)</span>)?'
            matches = re.findall(pattern, html, re.DOTALL)
            for idx, match in enumerate(matches):
                url, title, heat_str = match
                title = title.strip()
                if not title:
                    continue
                heat = int(heat_str) if heat_str else 0
                topic = HotTopic(
                    title=title,
                    source=self.source,
                    rank=idx + 1,
                    heat=heat,
                    category=self._detect_category(title, []),
                    url=f"https://s.weibo.com{url}" if url.startswith('/') else url,
                    fetched_at=datetime.now(),
                    expires_at=datetime.now() + timedelta(hours=1),
                )
                topics.append(topic)
        except Exception as e:
            self.logger.error(f"HTML parsing failed: {e}")
        return topics
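
    # Expected shape of the official API payload, inferred from the fields read
    # below (field availability may vary and the endpoint typically requires a
    # logged-in Cookie):
    # {"data": {"realtime": [{"word": "...", "raw_hot": 123, "num": 123,
    #                         "label_name": "...", "icon_desc": "...",
    #                         "is_hot": 1, "is_new": 0}, ...]}}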
    def _parse_realtime(self, realtime: list) -> List[HotTopic]:
        """Parse the `realtime` list returned by the official API."""
        topics = []
        for idx, item in enumerate(realtime):
            topic = self._parse_item(item, idx + 1)
            if topic:
                topics.append(topic)
        self.logger.info(f"Fetched {len(topics)} Weibo hot-search topics")
        return topics

    def _parse_item(self, item: dict, rank: int) -> Optional[HotTopic]:
        """Parse a single hot-search entry; returns None if it cannot be parsed."""
        try:
            word = item.get('word', '')
            if not word:
                return None
            # Heat value
            raw_hot = item.get('raw_hot', 0) or item.get('num', 0)
            # Labels
            tags = []
            label_name = item.get('label_name', '')
            if label_name:
                tags.append(label_name)
            # Detect the category
            category = self._detect_category(word, tags)
            return HotTopic(
                title=word,
                source=self.source,
                rank=rank,
                heat=raw_hot,
                category=category,
                url=f"https://s.weibo.com/weibo?q=%23{word}%23",
                tags=tags,
                fetched_at=datetime.now(),
                expires_at=datetime.now() + timedelta(hours=1),  # expires after 1 hour
                extra={
                    'icon_desc': item.get('icon_desc', ''),
                    'is_hot': item.get('is_hot', 0),
                    'is_new': item.get('is_new', 0),
                }
            )
        except Exception as e:
            self.logger.warning(f"Failed to parse hot-search entry: {e}")
            return None

    def _detect_category(self, title: str, tags: List[str]) -> HotTopicCategory:
        """Detect the topic category from keywords in the title."""
        # Travel-related keywords
        travel_keywords = ['旅游', '旅行', '景区', '景点', '酒店', '度假', '出游', '自驾']
        if any(kw in title for kw in travel_keywords):
            return HotTopicCategory.TRAVEL
        # Food-related keywords
        food_keywords = ['美食', '餐厅', '小吃', '火锅', '烧烤', '奶茶']
        if any(kw in title for kw in food_keywords):
            return HotTopicCategory.FOOD
        # Festival-related keywords
        festival_keywords = ['春节', '国庆', '中秋', '元旦', '五一', '十一']
        if any(kw in title for kw in festival_keywords):
            return HotTopicCategory.FESTIVAL
        return HotTopicCategory.TRENDING
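

# Illustrative usage (a minimal sketch; assumes BaseCrawler manages the aiohttp
# session via `_get_session()` and exposes any required cleanup — adjust to the
# actual BaseCrawler interface):
#
#   import asyncio
#
#   async def main():
#       crawler = WeiboCrawler()
#       topics = await crawler.fetch()
#       for t in topics:
#           print(t.rank, t.title, t.heat, t.category)
#
#   asyncio.run(main())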