# NOTE(review): the three lines below were file-viewer residue ("299 lines",
# "9.4 KiB", "Python") pasted into the module; left here commented out because
# as bare text they are a SyntaxError at import time.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
小红书爬虫 - MediaCrawler 桥接模块
直接调用 libs/MediaCrawler 项目的功能
"""
import asyncio
import logging
import sys
import json
from pathlib import Path
from typing import List, Dict, Optional
logger = logging.getLogger(__name__)
# Make the vendored MediaCrawler project importable.
# Expected layout: <repo>/libs/MediaCrawler, five levels up from this file —
# TODO(review): confirm the relative depth if this module is ever moved.
MEDIACRAWLER_PATH = Path(__file__).parent.parent.parent.parent.parent / 'libs' / 'MediaCrawler'
if MEDIACRAWLER_PATH.exists():
    # Prepend so MediaCrawler's own packages win over same-named installs.
    sys.path.insert(0, str(MEDIACRAWLER_PATH))
    MEDIACRAWLER_AVAILABLE = True
else:
    MEDIACRAWLER_AVAILABLE = False
    logger.warning(f"MediaCrawler 路径不存在: {MEDIACRAWLER_PATH}")
class XHSCrawlerBridge:
    """
    Xiaohongshu (RED) crawler bridge.

    Delegates the actual crawling to the MediaCrawler project's XHS client.

    Usage:
        bridge = XHSCrawlerBridge()
        # search for notes
        notes = await bridge.search_notes("旅游攻略")
        # fetch a single note's detail
        detail = await bridge.get_note_detail("note_id")
    """
    def __init__(self):
        # All of these are populated lazily by init(); None until then.
        self._client = None        # MediaCrawler XiaoHongShuClient
        self._context = None       # playwright BrowserContext
        self._page = None          # playwright Page
        self._initialized = False
        # Cookie cache file: <module>/../../cache/xhs_cookies.json
        self._cookie_cache_path = Path(__file__).parent.parent.parent / 'cache' / 'xhs_cookies.json'
        # Ensure the cache directory exists up front (side effect at construction).
        self._cookie_cache_path.parent.mkdir(parents=True, exist_ok=True)
@property
def is_available(self) -> bool:
"""检查 MediaCrawler 是否可用"""
return MEDIACRAWLER_AVAILABLE
async def init(self, headless: bool = True) -> bool:
"""
初始化爬虫
Args:
headless: 是否无头模式
Returns:
是否成功
"""
if not MEDIACRAWLER_AVAILABLE:
logger.error("MediaCrawler 不可用")
return False
if self._initialized:
return True
try:
# 导入 MediaCrawler 模块
from media_platform.xhs.client import XiaoHongShuClient
from playwright.async_api import async_playwright
# 启动浏览器
self._playwright = await async_playwright().start()
browser = await self._playwright.chromium.launch(headless=headless)
self._context = await browser.new_context(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
)
self._page = await self._context.new_page()
# 访问小红书
await self._page.goto("https://www.xiaohongshu.com", wait_until="domcontentloaded")
await asyncio.sleep(2)
# 加载缓存的 Cookie
cookies = self._load_cookies()
if cookies:
await self._context.add_cookies(cookies)
await self._page.reload()
await asyncio.sleep(1)
# 初始化客户端
cookie_list = await self._context.cookies()
cookie_dict = {c["name"]: c["value"] for c in cookie_list}
self._client = XiaoHongShuClient(
headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
"Cookie": "; ".join([f"{k}={v}" for k, v in cookie_dict.items()]),
},
playwright_page=self._page,
cookie_dict=cookie_dict,
)
self._initialized = True
logger.info("XHS 爬虫初始化成功")
return True
except Exception as e:
logger.error(f"初始化失败: {e}")
return False
async def login(self) -> bool:
"""
扫码登录
Returns:
是否成功
"""
if not await self.init(headless=False):
return False
try:
logger.info("请扫描二维码登录小红书...")
# 等待登录 (最多 120 秒)
for i in range(120):
await asyncio.sleep(1)
cookies = await self._context.cookies()
cookie_dict = {c["name"]: c["value"] for c in cookies}
if "web_session" in cookie_dict:
logger.info("登录成功!")
self._save_cookies(cookies)
# 更新客户端
if self._client:
await self._client.update_cookies(self._context)
return True
if i % 10 == 0:
logger.info(f"等待登录... ({i}/120)")
logger.warning("登录超时")
return False
except Exception as e:
logger.error(f"登录失败: {e}")
return False
async def search_notes(
self,
keyword: str,
page: int = 1,
page_size: int = 20,
sort: str = "general",
) -> List[Dict]:
"""
搜索笔记
Args:
keyword: 关键词
page: 页码
page_size: 每页数量
sort: 排序 (general, time_descending, popularity_descending)
Returns:
笔记列表
"""
if not self._initialized:
if not await self.init():
return []
try:
from media_platform.xhs.field import SearchSortType
sort_map = {
"general": SearchSortType.GENERAL,
"time_descending": SearchSortType.LATEST,
"popularity_descending": SearchSortType.MOST_POPULAR,
}
result = await self._client.get_note_by_keyword(
keyword=keyword,
page=page,
page_size=page_size,
sort=sort_map.get(sort, SearchSortType.GENERAL),
)
items = result.get("items", [])
return self._parse_notes(items)
except Exception as e:
logger.error(f"搜索失败: {e}")
return []
async def get_note_detail(self, note_id: str, xsec_token: str = "") -> Optional[Dict]:
"""
获取笔记详情
Args:
note_id: 笔记 ID
xsec_token: 验证 token
Returns:
笔记详情
"""
if not self._initialized:
if not await self.init():
return None
try:
result = await self._client.get_note_by_id(
note_id=note_id,
xsec_source="pc_search",
xsec_token=xsec_token,
)
return result
except Exception as e:
logger.error(f"获取详情失败: {e}")
return None
def _parse_notes(self, items: List[Dict]) -> List[Dict]:
"""解析笔记列表"""
notes = []
for item in items:
try:
note_card = item.get("note_card", {})
user = note_card.get("user", {})
interact = note_card.get("interact_info", {})
notes.append({
"id": item.get("id"),
"xsec_token": item.get("xsec_token", ""),
"title": note_card.get("display_title", ""),
"desc": note_card.get("desc", ""),
"type": note_card.get("type", ""),
"liked_count": interact.get("liked_count", "0"),
"collected_count": interact.get("collected_count", "0"),
"comment_count": interact.get("comment_count", "0"),
"user_id": user.get("user_id"),
"user_name": user.get("nickname"),
"cover": note_card.get("cover", {}).get("url_default", ""),
"tags": [t.get("name") for t in note_card.get("tag_list", [])],
})
except Exception as e:
logger.warning(f"解析笔记失败: {e}")
return notes
def _load_cookies(self) -> List[Dict]:
"""加载缓存的 Cookie"""
try:
if self._cookie_cache_path.exists():
with open(self._cookie_cache_path, 'r') as f:
return json.load(f)
except Exception as e:
logger.warning(f"加载 Cookie 失败: {e}")
return []
def _save_cookies(self, cookies: List[Dict]):
"""保存 Cookie"""
try:
with open(self._cookie_cache_path, 'w') as f:
json.dump(cookies, f)
logger.info(f"Cookie 已保存到 {self._cookie_cache_path}")
except Exception as e:
logger.error(f"保存 Cookie 失败: {e}")
async def close(self):
"""关闭"""
if self._context:
await self._context.close()
if hasattr(self, '_playwright') and self._playwright:
await self._playwright.stop()
self._initialized = False
# Process-wide singleton so every caller shares one browser session.
_bridge_instance: Optional[XHSCrawlerBridge] = None


def get_xhs_bridge() -> XHSCrawlerBridge:
    """Return the shared XHSCrawlerBridge, constructing it lazily on first use."""
    global _bridge_instance
    # An existing instance is always truthy, so `or` only constructs once.
    _bridge_instance = _bridge_instance or XHSCrawlerBridge()
    return _bridge_instance