#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Xiaohongshu (XHS) crawler — MediaCrawler bridge module.

Thin bridge that directly drives the crawler shipped in the
libs/MediaCrawler project: it boots a Playwright browser, restores
cached cookies, and wraps MediaCrawler's ``XiaoHongShuClient``.
"""
import asyncio
import json
import logging
import sys
from pathlib import Path
from typing import Dict, List, Optional

logger = logging.getLogger(__name__)

# Single source of truth for the browser/client UA (was duplicated inline).
_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"

# Put MediaCrawler on the Python path so its packages import lazily later.
MEDIACRAWLER_PATH = Path(__file__).parent.parent.parent.parent.parent / 'libs' / 'MediaCrawler'
if MEDIACRAWLER_PATH.exists():
    sys.path.insert(0, str(MEDIACRAWLER_PATH))
    MEDIACRAWLER_AVAILABLE = True
else:
    MEDIACRAWLER_AVAILABLE = False
    logger.warning(f"MediaCrawler 路径不存在: {MEDIACRAWLER_PATH}")


class XHSCrawlerBridge:
    """
    Xiaohongshu crawler bridge.

    Delegates to the MediaCrawler project's XHS crawler.

    Usage:
        bridge = XHSCrawlerBridge()

        # Search for notes
        notes = await bridge.search_notes("旅游攻略")

        # Fetch a note's detail
        detail = await bridge.get_note_detail("note_id")
    """

    def __init__(self):
        self._client = None
        self._context = None
        self._page = None
        # Fix: keep the browser handle so close() can release the process
        # (the original bound it to a local and leaked it on close()).
        self._browser = None
        # Fix: initialize here so close() doesn't need a hasattr() check.
        self._playwright = None
        self._initialized = False

        # Cookie cache file; parent dirs are created eagerly.
        self._cookie_cache_path = Path(__file__).parent.parent.parent / 'cache' / 'xhs_cookies.json'
        self._cookie_cache_path.parent.mkdir(parents=True, exist_ok=True)

    @property
    def is_available(self) -> bool:
        """Whether the MediaCrawler checkout was found at import time."""
        return MEDIACRAWLER_AVAILABLE

    async def init(self, headless: bool = True) -> bool:
        """
        Initialize the crawler: launch a browser, open xiaohongshu.com,
        restore cached cookies, and build the MediaCrawler client.

        Args:
            headless: run the browser without a visible window

        Returns:
            True on success; False if MediaCrawler is unavailable or
            startup failed (the error is logged, not raised).
        """
        if not MEDIACRAWLER_AVAILABLE:
            logger.error("MediaCrawler 不可用")
            return False

        if self._initialized:
            return True

        try:
            # Imported lazily: only resolvable after MEDIACRAWLER_PATH
            # has been inserted into sys.path above.
            from media_platform.xhs.client import XiaoHongShuClient
            from playwright.async_api import async_playwright

            # Launch the browser and keep every handle for close().
            self._playwright = await async_playwright().start()
            self._browser = await self._playwright.chromium.launch(headless=headless)
            self._context = await self._browser.new_context(user_agent=_USER_AGENT)
            self._page = await self._context.new_page()

            # Visit XHS so the site can set its baseline cookies.
            await self._page.goto("https://www.xiaohongshu.com", wait_until="domcontentloaded")
            await asyncio.sleep(2)

            # Restore a previous session, if one was cached.
            cookies = self._load_cookies()
            if cookies:
                await self._context.add_cookies(cookies)
                await self._page.reload()
                await asyncio.sleep(1)

            # Build the MediaCrawler client from the live cookie jar.
            cookie_list = await self._context.cookies()
            cookie_dict = {c["name"]: c["value"] for c in cookie_list}
            self._client = XiaoHongShuClient(
                headers={
                    "User-Agent": _USER_AGENT,
                    "Cookie": "; ".join([f"{k}={v}" for k, v in cookie_dict.items()]),
                },
                playwright_page=self._page,
                cookie_dict=cookie_dict,
            )

            self._initialized = True
            logger.info("XHS 爬虫初始化成功")
            return True

        except Exception as e:
            logger.error(f"初始化失败: {e}")
            return False

    async def login(self) -> bool:
        """
        Interactive QR-code login.

        Opens a headful browser and polls the cookie jar until the
        ``web_session`` cookie appears (max 120 seconds).

        NOTE(review): if init() already ran headless, this call returns
        True immediately without opening a visible window, so the QR code
        cannot be scanned — confirm callers invoke login() first.

        Returns:
            True once logged in; False on timeout or error.
        """
        if not await self.init(headless=False):
            return False

        try:
            logger.info("请扫描二维码登录小红书...")

            # Poll once per second for up to 120 seconds.
            for i in range(120):
                await asyncio.sleep(1)

                cookies = await self._context.cookies()
                cookie_dict = {c["name"]: c["value"] for c in cookies}

                # web_session is the session cookie set after a successful login.
                if "web_session" in cookie_dict:
                    logger.info("登录成功!")
                    self._save_cookies(cookies)
                    # Push the fresh cookies into the MediaCrawler client.
                    if self._client:
                        await self._client.update_cookies(self._context)
                    return True

                if i % 10 == 0:
                    logger.info(f"等待登录... ({i}/120)")

            logger.warning("登录超时")
            return False

        except Exception as e:
            logger.error(f"登录失败: {e}")
            return False

    async def search_notes(
        self,
        keyword: str,
        page: int = 1,
        page_size: int = 20,
        sort: str = "general",
    ) -> List[Dict]:
        """
        Search for notes.

        Args:
            keyword: search keyword
            page: page number (1-based)
            page_size: results per page
            sort: one of "general", "time_descending",
                "popularity_descending"; unknown values fall back to general

        Returns:
            Parsed note dicts (see _parse_notes); empty list on failure.
        """
        if not self._initialized:
            if not await self.init():
                return []

        try:
            from media_platform.xhs.field import SearchSortType

            # Map our public sort names onto MediaCrawler's enum.
            sort_map = {
                "general": SearchSortType.GENERAL,
                "time_descending": SearchSortType.LATEST,
                "popularity_descending": SearchSortType.MOST_POPULAR,
            }

            result = await self._client.get_note_by_keyword(
                keyword=keyword,
                page=page,
                page_size=page_size,
                sort=sort_map.get(sort, SearchSortType.GENERAL),
            )

            items = result.get("items", [])
            return self._parse_notes(items)

        except Exception as e:
            logger.error(f"搜索失败: {e}")
            return []

    async def get_note_detail(self, note_id: str, xsec_token: str = "") -> Optional[Dict]:
        """
        Fetch a single note's detail.

        Args:
            note_id: note ID
            xsec_token: verification token (from search results)

        Returns:
            The raw MediaCrawler response dict, or None on failure.
        """
        if not self._initialized:
            if not await self.init():
                return None

        try:
            result = await self._client.get_note_by_id(
                note_id=note_id,
                xsec_source="pc_search",
                xsec_token=xsec_token,
            )
            return result
        except Exception as e:
            logger.error(f"获取详情失败: {e}")
            return None

    def _parse_notes(self, items: List[Dict]) -> List[Dict]:
        """
        Flatten raw search items into plain note dicts.

        Malformed items are logged and skipped rather than aborting
        the whole batch. Count fields are kept as strings, mirroring
        the upstream payload.
        """
        notes = []
        for item in items:
            try:
                note_card = item.get("note_card", {})
                user = note_card.get("user", {})
                interact = note_card.get("interact_info", {})

                notes.append({
                    "id": item.get("id"),
                    "xsec_token": item.get("xsec_token", ""),
                    "title": note_card.get("display_title", ""),
                    "desc": note_card.get("desc", ""),
                    "type": note_card.get("type", ""),
                    "liked_count": interact.get("liked_count", "0"),
                    "collected_count": interact.get("collected_count", "0"),
                    "comment_count": interact.get("comment_count", "0"),
                    "user_id": user.get("user_id"),
                    "user_name": user.get("nickname"),
                    "cover": note_card.get("cover", {}).get("url_default", ""),
                    "tags": [t.get("name") for t in note_card.get("tag_list", [])],
                })
            except Exception as e:
                logger.warning(f"解析笔记失败: {e}")

        return notes

    def _load_cookies(self) -> List[Dict]:
        """Load cached cookies; return [] if missing or unreadable."""
        try:
            if self._cookie_cache_path.exists():
                # Fix: explicit encoding — the cache is UTF-8 JSON.
                with open(self._cookie_cache_path, 'r', encoding='utf-8') as f:
                    return json.load(f)
        except Exception as e:
            logger.warning(f"加载 Cookie 失败: {e}")
        return []

    def _save_cookies(self, cookies: List[Dict]):
        """Persist cookies to the cache file (best effort, errors logged)."""
        try:
            # Fix: explicit encoding — the cache is UTF-8 JSON.
            with open(self._cookie_cache_path, 'w', encoding='utf-8') as f:
                json.dump(cookies, f)
            logger.info(f"Cookie 已保存到 {self._cookie_cache_path}")
        except Exception as e:
            logger.error(f"保存 Cookie 失败: {e}")

    async def close(self):
        """Release the browser context, browser process, and Playwright."""
        if self._context:
            await self._context.close()
        # Fix: the original never closed the browser, leaking the process.
        if self._browser:
            await self._browser.close()
        if self._playwright:
            await self._playwright.stop()
        self._initialized = False


# Module-level singleton
_bridge_instance: Optional[XHSCrawlerBridge] = None


def get_xhs_bridge() -> XHSCrawlerBridge:
    """Return the process-wide bridge instance, creating it on first call."""
    global _bridge_instance
    if _bridge_instance is None:
        _bridge_instance = XHSCrawlerBridge()
    return _bridge_instance