#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Xiaohongshu (Little Red Book) trending-topic crawler.

Fetches live data via the MediaCrawler project; a QR-code login is required first.

Usage:
    crawler = XiaohongshuCrawler()
    await crawler.login()  # scan the QR code to log in
    topics = await crawler.fetch()
"""

import asyncio
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional

from .base import BaseCrawler
from ..models import HotTopic, HotTopicSource, HotTopicCategory

logger = logging.getLogger(__name__)

# Try to import the MediaCrawler bridge; the crawler degrades gracefully if it is missing.
try:
    from .mediacrawler import get_xhs_bridge, XHSCrawlerBridge
    MEDIACRAWLER_AVAILABLE = True
except ImportError:
    MEDIACRAWLER_AVAILABLE = False
    logger.warning("MediaCrawler module not loaded; the Xiaohongshu crawler is unavailable")


class XiaohongshuCrawler(BaseCrawler):
    """
    Xiaohongshu trending-topic crawler.

    Built on the MediaCrawler project; a QR-code login is required first.
    """

    source = HotTopicSource.XIAOHONGSHU
    name = "小红书热门"

    # Search keywords (travel and tourism related); kept in Chinese because
    # they are the literal queries sent to Xiaohongshu.
    SEARCH_KEYWORDS = [
        "旅游攻略",    # travel guides
        "周末去哪玩",  # where to go on the weekend
        "亲子游推荐",  # family-trip recommendations
        "自驾游路线",  # road-trip routes
        "网红打卡地",  # trendy photo spots
        "小众景点",    # off-the-beaten-path sights
        "酒店推荐",    # hotel recommendations
        "民宿推荐",    # guesthouse recommendations
        "冬季旅行",    # winter travel
        "滑雪攻略",    # skiing guides
        "温泉度假",    # hot-spring getaways
    ]

    def __init__(self, keywords: Optional[List[str]] = None):
        super().__init__()
        self._keywords = keywords or self.SEARCH_KEYWORDS
        # Quoted annotation: XHSCrawlerBridge is undefined when the import above
        # fails, and attribute annotations are evaluated at runtime.
        self._xhs_bridge: Optional["XHSCrawlerBridge"] = None

    @property
    def is_available(self) -> bool:
        """Whether the MediaCrawler backend is available."""
        return MEDIACRAWLER_AVAILABLE

    async def login(self) -> bool:
        """
        Log in to Xiaohongshu via QR code.

        Returns:
            True on success.
        """
        if not MEDIACRAWLER_AVAILABLE:
            self.logger.error("MediaCrawler unavailable; check the libs/MediaCrawler directory")
            return False

        if not self._xhs_bridge:
            self._xhs_bridge = get_xhs_bridge()

        return await self._xhs_bridge.login()

    async def fetch(self) -> List[HotTopic]:
        """
        Fetch trending Xiaohongshu topics.

        Searches each configured keyword for related notes and scores the
        keyword by the aggregate engagement of the results.
        """
        if not MEDIACRAWLER_AVAILABLE:
            self.logger.warning("MediaCrawler unavailable")
            return []

        topics: List[HotTopic] = []
        try:
            # Lazily create the bridge
            if not self._xhs_bridge:
                self._xhs_bridge = get_xhs_bridge()

            if not self._xhs_bridge.is_available:
                self.logger.warning("MediaCrawler unavailable")
                return []

            # Search each keyword and aggregate engagement
            for keyword in self._keywords:
                try:
                    notes = await self._xhs_bridge.search_notes(keyword, page_size=10)
                    if notes:
                        # Heat score: total likes across the returned notes.
                        # Assumes liked_count is numeric or a digit string; a
                        # non-numeric value raises and skips this keyword.
                        total_likes = sum(int(n.get('liked_count', 0) or 0) for n in notes)

                        topic = HotTopic(
                            title=keyword,
                            source=self.source,
                            rank=len(topics) + 1,
                            heat=total_likes,
                            category=HotTopicCategory.TRAVEL,
                            tags=[keyword],
                            description=f"相关笔记 {len(notes)} 篇,总点赞 {total_likes}",
                            fetched_at=datetime.now(),
                            expires_at=datetime.now() + timedelta(hours=1),
                            extra={
                                'notes_count': len(notes),
                                'sample_notes': notes[:3],
                            }
                        )
                        topics.append(topic)

                    # Throttle requests to avoid rate limiting
                    await asyncio.sleep(1)

                except Exception as e:
                    self.logger.warning(f"Search for '{keyword}' failed: {e}")

            # Sort by heat, then re-rank
            topics.sort(key=lambda x: x.heat or 0, reverse=True)
            for idx, t in enumerate(topics):
                t.rank = idx + 1

            self.logger.info(f"Fetched {len(topics)} trending Xiaohongshu topics")

        except Exception as e:
            self.logger.error(f"Failed to fetch Xiaohongshu trends: {e}")

        return topics

    async def search_notes(self, keyword: str, page_size: int = 20) -> List[Dict]:
        """
        Search notes.

        Args:
            keyword: search keyword
            page_size: results per page

        Returns:
            List of notes.
        """
        if not MEDIACRAWLER_AVAILABLE:
            return []

        if not self._xhs_bridge:
            self._xhs_bridge = get_xhs_bridge()

        return await self._xhs_bridge.search_notes(keyword, page_size=page_size)

    async def get_note_detail(self, note_id: str) -> Optional[Dict]:
        """
        Fetch the detail of a single note.

        Args:
            note_id: note ID

        Returns:
            Note detail, or None if unavailable.
        """
        if not MEDIACRAWLER_AVAILABLE:
            return None

        if not self._xhs_bridge:
            self._xhs_bridge = get_xhs_bridge()

        return await self._xhs_bridge.get_note_detail(note_id)
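

# --- Minimal manual smoke test (sketch, not part of the crawler API) ---
# Mirrors the usage in the module docstring: log in, then fetch. Because this
# module uses relative imports, run it as a module from the package root
# (e.g. `python -m <your_package>.xiaohongshu`; the exact package path depends
# on your project layout), not as a standalone script.
if __name__ == "__main__":
    async def _demo() -> None:
        # A single keyword keeps the demo fast; any subset of SEARCH_KEYWORDS works.
        crawler = XiaohongshuCrawler(keywords=["旅游攻略"])
        if not crawler.is_available:
            print("MediaCrawler is not available; aborting demo")
            return
        if not await crawler.login():  # opens a QR code to scan
            print("Login failed")
            return
        for topic in await crawler.fetch():
            print(f"#{topic.rank} {topic.title} (heat={topic.heat})")

    asyncio.run(_demo())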