181 lines
5.5 KiB
Python
181 lines
5.5 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
|
|||
|
|
"""
|
|||
|
|
小红书热门话题爬虫
|
|||
|
|
|
|||
|
|
基于 MediaCrawler 项目获取实时数据
|
|||
|
|
需要先扫码登录
|
|||
|
|
|
|||
|
|
使用方式:
|
|||
|
|
crawler = XiaohongshuCrawler()
|
|||
|
|
await crawler.login() # 扫码登录
|
|||
|
|
topics = await crawler.fetch()
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import logging
|
|||
|
|
from typing import List, Optional, Dict
|
|||
|
|
from datetime import datetime, timedelta
|
|||
|
|
import asyncio
|
|||
|
|
|
|||
|
|
from .base import BaseCrawler
|
|||
|
|
from ..models import HotTopic, HotTopicSource, HotTopicCategory
|
|||
|
|
|
|||
|
|
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
# Try to import the MediaCrawler bridge.  The bridge is an optional
# dependency (vendored under libs/MediaCrawler); when it is absent the
# crawler degrades gracefully instead of crashing at import time, and
# every public method below checks MEDIACRAWLER_AVAILABLE first.
try:
    from .mediacrawler import get_xhs_bridge, XHSCrawlerBridge
    MEDIACRAWLER_AVAILABLE = True
except ImportError:
    MEDIACRAWLER_AVAILABLE = False
    logger.warning("MediaCrawler 模块未加载,小红书爬虫不可用")
|
|||
|
|
|
|||
|
|
|
|||
|
|
class XiaohongshuCrawler(BaseCrawler):
    """Xiaohongshu (RED) hot-topic crawler.

    Built on the MediaCrawler project; a QR-code login (``login()``) is
    required before ``fetch()`` can return data.  Heat for each keyword is
    the sum of like counts over a sample of matching notes.
    """

    source = HotTopicSource.XIAOHONGSHU
    name = "小红书热门"

    # Default search keywords (travel / tourism related).
    SEARCH_KEYWORDS = [
        "旅游攻略", "周末去哪玩", "亲子游推荐", "自驾游路线",
        "网红打卡地", "小众景点", "酒店推荐", "民宿推荐",
        "冬季旅行", "滑雪攻略", "温泉度假",
    ]

    def __init__(self, keywords: Optional[List[str]] = None):
        """Create the crawler.

        Args:
            keywords: Optional list of search keywords; defaults to
                ``SEARCH_KEYWORDS`` when omitted or empty.
        """
        super().__init__()
        self._keywords = keywords or self.SEARCH_KEYWORDS
        # Lazily-created MediaCrawler bridge; see _get_bridge().
        self._xhs_bridge: Optional["XHSCrawlerBridge"] = None

    def _get_bridge(self) -> Optional["XHSCrawlerBridge"]:
        """Return the cached MediaCrawler bridge, creating it on first use.

        Returns:
            The bridge instance, or None when MediaCrawler is unavailable.
        """
        if not MEDIACRAWLER_AVAILABLE:
            return None
        if not self._xhs_bridge:
            self._xhs_bridge = get_xhs_bridge()
        return self._xhs_bridge

    @staticmethod
    def _safe_int(value) -> int:
        """Best-effort conversion of an interaction count to ``int``.

        XHS like counts may be missing, None, or non-numeric strings
        (e.g. "1.2万"); any unparseable value counts as 0 so a single
        malformed note cannot abort an entire keyword.
        """
        try:
            return int(value or 0)
        except (TypeError, ValueError):
            return 0

    @property
    def is_available(self) -> bool:
        """Whether the MediaCrawler bridge module was imported successfully."""
        return MEDIACRAWLER_AVAILABLE

    async def login(self) -> bool:
        """Log in to Xiaohongshu via QR code.

        Returns:
            True on success; False when MediaCrawler is unavailable or the
            bridge's login fails.
        """
        bridge = self._get_bridge()
        if bridge is None:
            self.logger.error("MediaCrawler 不可用,请检查 libs/MediaCrawler 目录")
            return False
        return await bridge.login()

    async def fetch(self) -> List[HotTopic]:
        """Fetch hot Xiaohongshu topics.

        Searches each configured keyword, aggregates engagement (likes)
        over the returned notes, and ranks keywords by total heat.

        Returns:
            Topics sorted by heat descending, ranks starting at 1; empty
            list when MediaCrawler is unavailable or everything fails.
        """
        bridge = self._get_bridge()
        if bridge is None or not bridge.is_available:
            self.logger.warning("MediaCrawler 不可用")
            return []

        topics: List[HotTopic] = []
        try:
            for keyword in self._keywords:
                try:
                    notes = await bridge.search_notes(keyword, page_size=10)
                    if notes:
                        # Heat = total like count across the sampled notes.
                        total_likes = sum(
                            self._safe_int(n.get('liked_count', 0)) for n in notes
                        )
                        topics.append(HotTopic(
                            title=keyword,
                            source=self.source,
                            rank=len(topics) + 1,  # provisional; re-ranked below
                            heat=total_likes,
                            category=HotTopicCategory.TRAVEL,
                            tags=[keyword],
                            description=f"相关笔记 {len(notes)} 篇,总点赞 {total_likes}",
                            fetched_at=datetime.now(),
                            expires_at=datetime.now() + timedelta(hours=1),
                            extra={
                                'notes_count': len(notes),
                                'sample_notes': notes[:3],
                            },
                        ))
                    # Throttle between keywords to avoid rate limiting.
                    await asyncio.sleep(1)
                except Exception as e:
                    # One keyword failing must not abort the whole fetch.
                    self.logger.warning(f"搜索 '{keyword}' 失败: {e}")

            # Sort by heat (descending) and assign final 1-based ranks.
            topics.sort(key=lambda x: x.heat or 0, reverse=True)
            for idx, t in enumerate(topics):
                t.rank = idx + 1

            self.logger.info(f"获取到 {len(topics)} 条小红书热门话题")
        except Exception as e:
            self.logger.error(f"获取小红书热门失败: {e}")

        return topics

    async def search_notes(self, keyword: str, page_size: int = 20) -> List[Dict]:
        """Search notes by keyword.

        Args:
            keyword: Search term.
            page_size: Number of notes per page.

        Returns:
            A list of note dicts; empty when MediaCrawler is unavailable.
        """
        bridge = self._get_bridge()
        if bridge is None:
            return []
        return await bridge.search_notes(keyword, page_size=page_size)

    async def get_note_detail(self, note_id: str) -> Optional[Dict]:
        """Fetch full detail for a single note.

        Args:
            note_id: Note identifier.

        Returns:
            The note detail dict, or None when MediaCrawler is unavailable.
        """
        bridge = self._get_bridge()
        if bridge is None:
            return None
        return await bridge.get_note_detail(note_id)
|