TravelContentCreator/domain/hotspot/crawlers/baidu.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
百度热搜爬虫

数据源: https://top.baidu.com/board?tab=realtime
"""

import logging
import re
from datetime import datetime, timedelta
from typing import List, Optional
from urllib.parse import quote_plus

from .base import BaseCrawler
from ..models import HotTopic, HotTopicSource, HotTopicCategory

logger = logging.getLogger(__name__)


class BaiduCrawler(BaseCrawler):
    """Crawler for the Baidu hot-search boards.

    Each configured board ("tab") is requested from the Baidu board JSON API
    and every entry of the first card is converted into a ``HotTopic``.

    NOTE(review): this class relies on ``self._get_session()`` and
    ``self.logger`` being provided by ``BaseCrawler`` (not visible here).
    """

    source = HotTopicSource.BAIDU
    name = "百度热搜"

    # Baidu board API endpoints, keyed by tab name.
    API_URLS = {
        'realtime': 'https://top.baidu.com/api/board?platform=wise&tab=realtime',  # real-time hot topics
        'travel': 'https://top.baidu.com/api/board?platform=wise&tab=travel',      # travel board
        'novel': 'https://top.baidu.com/api/board?platform=wise&tab=novel',        # novel board
        'movie': 'https://top.baidu.com/api/board?platform=wise&tab=movie',        # movie board
        'teleplay': 'https://top.baidu.com/api/board?platform=wise&tab=teleplay',  # TV series board
        'car': 'https://top.baidu.com/api/board?platform=wise&tab=car',            # car board
        'game': 'https://top.baidu.com/api/board?platform=wise&tab=game',          # game board
    }

    # Lifetime of a fetched topic before it is considered expired.
    _TOPIC_TTL = timedelta(hours=2)

    # Keywords whose presence in title/description marks a topic as travel.
    _TRAVEL_KEYWORDS = ('旅游', '旅行', '景区', '景点', '酒店', '度假', '出游')

    # Keywords whose presence in title/description marks a topic as food.
    _FOOD_KEYWORDS = ('美食', '餐厅', '小吃', '火锅')

    def __init__(self, tabs: Optional[List[str]] = None):
        """Create the crawler.

        Args:
            tabs: Names of the boards to fetch (keys of ``API_URLS``).
                Defaults to the real-time and travel boards when omitted
                or empty.
        """
        super().__init__()
        self.tabs = tabs or ['realtime', 'travel']

    async def fetch(self) -> List[HotTopic]:
        """Fetch every configured board and return all parsed topics.

        Boards are requested one after another on a single session. Any
        exception is logged and the topics collected so far are returned,
        so this method does not raise.
        """
        all_topics: List[HotTopic] = []

        try:
            session = await self._get_session()

            for tab in self.tabs:
                url = self.API_URLS.get(tab)
                if not url:
                    # Surface misconfigured tab names instead of dropping
                    # them silently.
                    self.logger.warning(f"跳过未知的百度榜单: {tab}")
                    continue

                all_topics.extend(await self._fetch_tab(session, url, tab))

            self.logger.info(f"获取到 {len(all_topics)} 条百度热搜")

        except Exception as e:
            self.logger.error(f"获取百度热搜失败: {e}")

        return all_topics

    async def _fetch_tab(self, session, url: str, tab: str) -> List[HotTopic]:
        """Fetch a single board and parse its entries.

        Args:
            session: HTTP session whose ``get(url)`` is used as an async
                context manager yielding a response with ``status`` and an
                awaitable ``json()``.
            url: Board API endpoint.
            tab: Board name; used in log messages and recorded on each topic.

        Returns:
            Parsed topics ranked from 1 in response order. Empty on a
            non-200 status, an unexpected payload shape, or any error
            (errors are logged, not raised).
        """
        topics: List[HotTopic] = []

        try:
            async with session.get(url) as response:
                if response.status != 200:
                    self.logger.error(f"请求 {tab} 失败: {response.status}")
                    return []

                data = await response.json()

            # Tolerate ``null``/missing levels in the payload rather than
            # failing the whole board with an AttributeError.
            payload = data.get('data') if isinstance(data, dict) else None
            cards = payload.get('cards') if isinstance(payload, dict) else None
            if not cards:
                return topics

            # Only the first card is read; it holds the ranked entries.
            content = cards[0].get('content') or []

            for rank, item in enumerate(content, start=1):
                topic = self._parse_item(item, rank, tab)
                if topic is not None:
                    topics.append(topic)
        except Exception as e:
            self.logger.error(f"获取 {tab} 榜单失败: {e}")

        return topics

    @staticmethod
    def _coerce_heat(value) -> int:
        """Convert a raw heat value to ``int``; unusable values become 0."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return 0

    def _parse_item(self, item: dict, rank: int, tab: str = 'realtime') -> Optional[HotTopic]:
        """Convert one raw board entry into a ``HotTopic``.

        Args:
            item: Raw entry; the keys read are ``word``/``query``,
                ``hotScore``, ``desc``, ``url``, ``img`` and ``show``.
            rank: 1-based position of the entry on its board.
            tab: Name of the board the entry came from.

        Returns:
            The topic, or ``None`` when the entry has no title or cannot be
            parsed (a warning is logged in the latter case).
        """
        try:
            word = item.get('word', '') or item.get('query', '')
            if not word:
                return None

            # Heat may arrive as str, number or null; always store an int.
            hot_score = self._coerce_heat(item.get('hotScore', 0))

            # A null description must not leak into the text as "None".
            desc = item.get('desc') or ''

            category = self._detect_category(word, desc)

            # Fall back to a search link when the entry has no usable URL;
            # the keyword is percent-encoded so spaces, '&', '#', etc.
            # cannot corrupt the query string.
            url = item.get('url') or f"https://www.baidu.com/s?wd={quote_plus(str(word))}"

            # Single timestamp so expires_at is exactly fetched_at + TTL.
            now = datetime.now()

            return HotTopic(
                title=word,
                source=self.source,
                rank=rank,
                heat=hot_score,
                category=category,
                url=url,
                description=desc,
                fetched_at=now,
                expires_at=now + self._TOPIC_TTL,
                extra={
                    'img': item.get('img', ''),
                    'show': item.get('show', []),
                    'tab': tab,  # originating board
                }
            )
        except Exception as e:
            self.logger.warning(f"解析热搜失败: {e}")
            return None

    def _detect_category(self, title: str, desc: str) -> HotTopicCategory:
        """Classify a topic by keyword match on its title and description.

        Travel keywords are checked before food keywords; a topic matching
        neither is classified as ``TRENDING``.
        """
        text = f"{title} {desc or ''}"

        if any(kw in text for kw in self._TRAVEL_KEYWORDS):
            return HotTopicCategory.TRAVEL

        if any(kw in text for kw in self._FOOD_KEYWORDS):
            return HotTopicCategory.FOOD

        return HotTopicCategory.TRENDING
清理了部分无效的模块 2025-12-10 10:07:40 +08:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`

			`"""`
			`百度热搜爬虫`

			`数据源: https://top.baidu.com/board?tab=realtime`
			`"""`

			`import logging`
			`import re`
			`from typing import List`
			`from datetime import datetime, timedelta`

			`from .base import BaseCrawler`
			`from ..models import HotTopic, HotTopicSource, HotTopicCategory`

			`logger = logging.getLogger(__name__)`


			`class BaiduCrawler(BaseCrawler):`
			`"""百度热搜爬虫"""`

			`source = HotTopicSource.BAIDU`
			`name = "百度热搜"`

			`# 百度热搜 API - 支持多个榜单`
			`API_URLS = {`
			`'realtime': 'https://top.baidu.com/api/board?platform=wise&tab=realtime', # 实时热点`
			`'travel': 'https://top.baidu.com/api/board?platform=wise&tab=travel', # 旅游榜 ⭐`
			`'novel': 'https://top.baidu.com/api/board?platform=wise&tab=novel', # 小说榜`
			`'movie': 'https://top.baidu.com/api/board?platform=wise&tab=movie', # 电影榜`
			`'teleplay': 'https://top.baidu.com/api/board?platform=wise&tab=teleplay', # 电视剧榜`
			`'car': 'https://top.baidu.com/api/board?platform=wise&tab=car', # 汽车榜`
			`'game': 'https://top.baidu.com/api/board?platform=wise&tab=game', # 游戏榜`
			`}`

			`def __init__(self, tabs: List[str] = None):`
			`super().__init__()`
			`# 默认获取实时热点 + 旅游榜`
			`self.tabs = tabs or ['realtime', 'travel']`

			`async def fetch(self) -> List[HotTopic]:`
			`"""获取百度热搜 (支持多榜单)"""`
			`all_topics = []`

			`try:`
			`session = await self._get_session()`

			`for tab in self.tabs:`
			`url = self.API_URLS.get(tab)`
			`if not url:`
			`continue`

			`topics = await self._fetch_tab(session, url, tab)`
			`all_topics.extend(topics)`

			`self.logger.info(f"获取到 {len(all_topics)} 条百度热搜")`

			`except Exception as e:`
			`self.logger.error(f"获取百度热搜失败: {e}")`

			`return all_topics`

			`async def _fetch_tab(self, session, url: str, tab: str) -> List[HotTopic]:`
			`"""获取单个榜单"""`
			`topics = []`

			`try:`
			`async with session.get(url) as response:`
			`if response.status != 200:`
			`self.logger.error(f"请求 {tab} 失败: {response.status}")`
			`return []`

			`data = await response.json()`

			`# 解析数据`
			`cards = data.get('data', {}).get('cards', [])`
			`if cards:`
			`content = cards[0].get('content', [])`

			`for idx, item in enumerate(content):`
			`topic = self._parse_item(item, idx + 1, tab)`
			`if topic:`
			`topics.append(topic)`
			`except Exception as e:`
			`self.logger.error(f"获取 {tab} 榜单失败: {e}")`

			`return topics`

			`def _parse_item(self, item: dict, rank: int, tab: str = 'realtime') -> HotTopic:`
			`"""解析单条热搜"""`
			`try:`
			`word = item.get('word', '') or item.get('query', '')`
			`if not word:`
			`return None`

			`# 热度 (确保是整数)`
			`hot_score = item.get('hotScore', 0)`
			`if isinstance(hot_score, str):`
			`try:`
			`hot_score = int(hot_score)`
			`except ValueError:`
			`hot_score = 0`

			`# 描述`
			`desc = item.get('desc', '')`

			`# 判断分类`
			`category = self._detect_category(word, desc)`

			`return HotTopic(`
			`title=word,`
			`source=self.source,`
			`rank=rank,`
			`heat=hot_score,`
			`category=category,`
			`url=item.get('url', f"https://www.baidu.com/s?wd={word}"),`
			`description=desc,`
			`fetched_at=datetime.now(),`
			`expires_at=datetime.now() + timedelta(hours=2), # 2小时后过期`
			`extra={`
			`'img': item.get('img', ''),`
			`'show': item.get('show', []),`
			`'tab': tab, # 来源榜单`
			`}`
			`)`
			`except Exception as e:`
			`self.logger.warning(f"解析热搜失败: {e}")`
			`return None`

			`def _detect_category(self, title: str, desc: str) -> HotTopicCategory:`
			`"""检测分类"""`
			`text = f"{title} {desc}"`

			`# 旅游相关`
			`travel_keywords = ['旅游', '旅行', '景区', '景点', '酒店', '度假', '出游']`
			`if any(kw in text for kw in travel_keywords):`
			`return HotTopicCategory.TRAVEL`

			`# 美食相关`
			`food_keywords = ['美食', '餐厅', '小吃', '火锅']`
			`if any(kw in text for kw in food_keywords):`
			`return HotTopicCategory.FOOD`

			`return HotTopicCategory.TRENDING`