147 lines
5.0 KiB
Python
147 lines
5.0 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
百度热搜爬虫
|
|
|
|
数据源: https://top.baidu.com/board?tab=realtime
|
|
"""
|
|
|
|
import logging
import re
from datetime import datetime, timedelta
from typing import List, Optional
from urllib.parse import quote

from .base import BaseCrawler
from ..models import HotTopic, HotTopicSource, HotTopicCategory
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class BaiduCrawler(BaseCrawler):
    """Crawler for the Baidu hot-search boards.

    Downloads one or more boards ("tabs") from the Baidu top-board JSON API
    and converts every entry into a ``HotTopic``.
    """

    source = HotTopicSource.BAIDU
    name = "百度热搜"

    # Baidu hot-search API endpoints, keyed by board (tab) name.
    API_URLS = {
        'realtime': 'https://top.baidu.com/api/board?platform=wise&tab=realtime',  # real-time hot topics
        'travel': 'https://top.baidu.com/api/board?platform=wise&tab=travel',  # travel board
        'novel': 'https://top.baidu.com/api/board?platform=wise&tab=novel',  # novel board
        'movie': 'https://top.baidu.com/api/board?platform=wise&tab=movie',  # movie board
        'teleplay': 'https://top.baidu.com/api/board?platform=wise&tab=teleplay',  # TV series board
        'car': 'https://top.baidu.com/api/board?platform=wise&tab=car',  # car board
        'game': 'https://top.baidu.com/api/board?platform=wise&tab=game',  # game board
    }

    # Keywords that mark a topic as travel / food related (see _detect_category).
    _TRAVEL_KEYWORDS = ('旅游', '旅行', '景区', '景点', '酒店', '度假', '出游')
    _FOOD_KEYWORDS = ('美食', '餐厅', '小吃', '火锅')

    # How long a fetched topic stays valid.
    _TOPIC_TTL = timedelta(hours=2)

    def __init__(self, tabs: Optional[List[str]] = None):
        """Create the crawler.

        Args:
            tabs: Board names to fetch (keys of ``API_URLS``). When omitted or
                empty, the real-time and travel boards are used.
        """
        super().__init__()
        self.tabs = tabs or ['realtime', 'travel']

    async def fetch(self) -> List[HotTopic]:
        """Fetch every configured board and return the combined topics.

        Failures are logged rather than raised; whatever was collected before
        the failure is still returned.
        """
        all_topics: List[HotTopic] = []

        try:
            session = await self._get_session()

            for tab in self.tabs:
                url = self.API_URLS.get(tab)
                if not url:
                    # Make a misconfigured tab name visible instead of
                    # skipping it silently.
                    self.logger.warning(f"未知的百度榜单, 已跳过: {tab}")
                    continue

                all_topics.extend(await self._fetch_tab(session, url, tab))

            self.logger.info(f"获取到 {len(all_topics)} 条百度热搜")

        except Exception as e:
            self.logger.error(f"获取百度热搜失败: {e}")

        return all_topics

    async def _fetch_tab(self, session, url: str, tab: str) -> List[HotTopic]:
        """Fetch and parse a single board.

        Args:
            session: HTTP session whose ``get(url)`` is used as an async
                context manager.
            url: API endpoint of the board.
            tab: Board name, recorded on every parsed topic.

        Returns:
            The parsed topics; an empty list on a non-200 response. Any
            exception is logged and the topics parsed so far are returned.
        """
        topics: List[HotTopic] = []

        try:
            async with session.get(url) as response:
                if response.status != 200:
                    self.logger.error(f"请求 {tab} 失败: {response.status}")
                    return []

                data = await response.json()

                # ``or`` guards against keys that are present but null, which
                # a plain ``.get(key, default)`` would not cover.
                payload = (data or {}).get('data') or {}
                cards = payload.get('cards') or []
                if cards:
                    content = cards[0].get('content') or []

                    # Ranks are 1-based positions within the board.
                    for rank, item in enumerate(content, start=1):
                        topic = self._parse_item(item, rank, tab)
                        if topic:
                            topics.append(topic)
        except Exception as e:
            self.logger.error(f"获取 {tab} 榜单失败: {e}")

        return topics

    @staticmethod
    def _to_int(value) -> int:
        """Coerce *value* to ``int``, returning 0 when that is not possible."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return 0

    def _parse_item(self, item: dict, rank: int, tab: str = 'realtime') -> Optional[HotTopic]:
        """Convert one raw board entry into a ``HotTopic``.

        Args:
            item: Raw entry from the API response.
            rank: 1-based position of the entry within its board.
            tab: Name of the board the entry came from.

        Returns:
            The topic, or ``None`` when the entry has no title or cannot be
            parsed (the failure is logged as a warning).
        """
        try:
            word = item.get('word', '') or item.get('query', '')
            if not word:
                return None

            # Heat score: always an integer, whatever type the API delivers.
            hot_score = self._to_int(item.get('hotScore', 0))

            # Description; a null value is normalised to an empty string.
            desc = item.get('desc') or ''

            category = self._detect_category(word, desc)

            # Fall back to a search URL when the entry has no usable link.
            # The keyword is percent-encoded so characters such as '&', '#'
            # or spaces cannot corrupt the query string.
            url = item.get('url') or f"https://www.baidu.com/s?wd={quote(str(word))}"

            # Single timestamp so expires_at is exactly the TTL after fetched_at.
            now = datetime.now()

            return HotTopic(
                title=word,
                source=self.source,
                rank=rank,
                heat=hot_score,
                category=category,
                url=url,
                description=desc,
                fetched_at=now,
                expires_at=now + self._TOPIC_TTL,  # expires after 2 hours
                extra={
                    'img': item.get('img', ''),
                    'show': item.get('show', []),
                    'tab': tab,  # board the topic came from
                },
            )
        except Exception as e:
            self.logger.warning(f"解析热搜失败: {e}")
            return None

    def _detect_category(self, title: str, desc: str) -> HotTopicCategory:
        """Classify a topic by keyword match on its title and description.

        Travel keywords are checked before food keywords; anything that
        matches neither is classified as ``TRENDING``.
        """
        text = f"{title} {desc or ''}"

        if any(kw in text for kw in self._TRAVEL_KEYWORDS):
            return HotTopicCategory.TRAVEL

        if any(kw in text for kw in self._FOOD_KEYWORDS):
            return HotTopicCategory.FOOD

        return HotTopicCategory.TRENDING
|