#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
小红书热门话题爬虫
基于 MediaCrawler 项目获取实时数据
需要先扫码登录
使用方式:
crawler = XiaohongshuCrawler()
await crawler.login() # 扫码登录
topics = await crawler.fetch()
"""
import logging
from typing import List, Optional, Dict
from datetime import datetime, timedelta
import asyncio
from .base import BaseCrawler
from ..models import HotTopic, HotTopicSource, HotTopicCategory
logger = logging.getLogger(__name__)
# 尝试导入 MediaCrawler 桥接器
try:
from .mediacrawler import get_xhs_bridge, XHSCrawlerBridge
MEDIACRAWLER_AVAILABLE = True
except ImportError:
MEDIACRAWLER_AVAILABLE = False
logger.warning("MediaCrawler 模块未加载,小红书爬虫不可用")
class XiaohongshuCrawler(BaseCrawler):
    """
    Xiaohongshu (RED) hot-topic crawler.

    Built on the MediaCrawler project; the user must scan a QR code to
    log in (``login``) before ``fetch`` can return data.
    """

    source = HotTopicSource.XIAOHONGSHU
    name = "小红书热门"

    # Default search keywords (travel / tourism related).
    SEARCH_KEYWORDS = [
        "旅游攻略", "周末去哪玩", "亲子游推荐", "自驾游路线",
        "网红打卡地", "小众景点", "酒店推荐", "民宿推荐",
        "冬季旅行", "滑雪攻略", "温泉度假",
    ]

    def __init__(self, keywords: Optional[List[str]] = None):
        """
        Args:
            keywords: custom search keywords; falls back to
                ``SEARCH_KEYWORDS`` when omitted or empty.
        """
        super().__init__()
        self._keywords = keywords or self.SEARCH_KEYWORDS
        self._xhs_bridge: Optional[XHSCrawlerBridge] = None

    def _ensure_bridge(self) -> "XHSCrawlerBridge":
        """Lazily create and cache the MediaCrawler bridge instance."""
        if not self._xhs_bridge:
            self._xhs_bridge = get_xhs_bridge()
        return self._xhs_bridge

    @property
    def is_available(self) -> bool:
        """Whether the MediaCrawler backend was importable."""
        return MEDIACRAWLER_AVAILABLE

    async def login(self) -> bool:
        """
        Log in to Xiaohongshu via QR-code scan.

        Returns:
            True on successful login, False when MediaCrawler is
            unavailable or the login attempt fails.
        """
        if not MEDIACRAWLER_AVAILABLE:
            self.logger.error("MediaCrawler 不可用,请检查 libs/MediaCrawler 目录")
            return False
        return await self._ensure_bridge().login()

    async def fetch(self) -> List[HotTopic]:
        """
        Fetch Xiaohongshu hot topics.

        Searches each configured keyword and aggregates engagement stats
        from the matching notes into one ``HotTopic`` per keyword.

        Returns:
            Topics sorted by heat (descending); empty list when the
            MediaCrawler backend is unavailable or every search fails.
        """
        if not MEDIACRAWLER_AVAILABLE:
            self.logger.warning("MediaCrawler 不可用")
            return []
        topics: List[HotTopic] = []
        try:
            bridge = self._ensure_bridge()
            if not bridge.is_available:
                self.logger.warning("MediaCrawler 不可用")
                return []
            for keyword in self._keywords:
                try:
                    notes = await bridge.search_notes(keyword, page_size=10)
                    if notes:
                        # Heat = sum of like counts over the sampled notes;
                        # ``or 0`` guards against None/empty liked_count.
                        total_likes = sum(
                            int(n.get('liked_count', 0) or 0) for n in notes
                        )
                        now = datetime.now()
                        topics.append(HotTopic(
                            title=keyword,
                            source=self.source,
                            rank=len(topics) + 1,  # provisional; re-ranked below
                            heat=total_likes,
                            category=HotTopicCategory.TRAVEL,
                            tags=[keyword],
                            description=f"相关笔记 {len(notes)} 篇,总点赞 {total_likes}",
                            fetched_at=now,
                            expires_at=now + timedelta(hours=1),
                            extra={
                                'notes_count': len(notes),
                                'sample_notes': notes[:3],
                            },
                        ))
                except Exception as e:
                    self.logger.warning(f"搜索 '{keyword}' 失败: {e}")
                # Throttle between keyword searches unconditionally — even
                # when a search returned nothing or raised — to avoid
                # hammering the API (the original skipped this on failure).
                await asyncio.sleep(1)
            # Order by heat and reassign final 1-based ranks.
            topics.sort(key=lambda t: t.heat or 0, reverse=True)
            for idx, t in enumerate(topics):
                t.rank = idx + 1
            self.logger.info(f"获取到 {len(topics)} 条小红书热门话题")
        except Exception as e:
            self.logger.error(f"获取小红书热门失败: {e}")
        return topics

    async def search_notes(self, keyword: str, page_size: int = 20) -> List[Dict]:
        """
        Search notes by keyword.

        Args:
            keyword: search keyword.
            page_size: number of notes per page.

        Returns:
            List of note dicts; empty list when MediaCrawler is unavailable.
        """
        if not MEDIACRAWLER_AVAILABLE:
            return []
        return await self._ensure_bridge().search_notes(keyword, page_size=page_size)

    async def get_note_detail(self, note_id: str) -> Optional[Dict]:
        """
        Fetch the detail of a single note.

        Args:
            note_id: note identifier.

        Returns:
            Note detail dict, or None when MediaCrawler is unavailable.
        """
        if not MEDIACRAWLER_AVAILABLE:
            return None
        return await self._ensure_bridge().get_note_detail(note_id)