#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
|
|
小红书爬虫 - MediaCrawler 桥接模块
|
|
|
|
直接调用 libs/MediaCrawler 项目的功能
|
|
"""
|
|
|
|
import asyncio
import logging
import sys
import json
from pathlib import Path
from typing import List, Dict, Optional

logger = logging.getLogger(__name__)

# Add MediaCrawler to the Python path
MEDIACRAWLER_PATH = Path(__file__).parent.parent.parent.parent.parent / 'libs' / 'MediaCrawler'
if MEDIACRAWLER_PATH.exists():
    sys.path.insert(0, str(MEDIACRAWLER_PATH))
    MEDIACRAWLER_AVAILABLE = True
else:
    MEDIACRAWLER_AVAILABLE = False
    logger.warning(f"MediaCrawler path does not exist: {MEDIACRAWLER_PATH}")


class XHSCrawlerBridge:
    """
    Xiaohongshu crawler bridge.

    Wraps the Xiaohongshu crawler functionality of the MediaCrawler project.

    Usage:
        bridge = XHSCrawlerBridge()

        # Search notes
        notes = await bridge.search_notes("旅游攻略")

        # Get note detail
        detail = await bridge.get_note_detail("note_id")
    """

    def __init__(self):
        self._client = None
        self._playwright = None
        self._browser = None
        self._context = None
        self._page = None
        self._initialized = False

        # Cookie cache
        self._cookie_cache_path = Path(__file__).parent.parent.parent / 'cache' / 'xhs_cookies.json'
        self._cookie_cache_path.parent.mkdir(parents=True, exist_ok=True)

    @property
    def is_available(self) -> bool:
        """Check whether MediaCrawler is available."""
        return MEDIACRAWLER_AVAILABLE

    async def init(self, headless: bool = True) -> bool:
        """
        Initialize the crawler.

        Args:
            headless: whether to run the browser in headless mode

        Returns:
            True on success
        """
        if not MEDIACRAWLER_AVAILABLE:
            logger.error("MediaCrawler is not available")
            return False

        if self._initialized:
            return True

        try:
            # Import MediaCrawler modules
            from media_platform.xhs.client import XiaoHongShuClient
            from playwright.async_api import async_playwright

            # Launch the browser (kept on self so close() can release it)
            self._playwright = await async_playwright().start()
            self._browser = await self._playwright.chromium.launch(headless=headless)
            self._context = await self._browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
            )
            self._page = await self._context.new_page()

            # Visit Xiaohongshu
            await self._page.goto("https://www.xiaohongshu.com", wait_until="domcontentloaded")
            await asyncio.sleep(2)

            # Load cached cookies
            cookies = self._load_cookies()
            if cookies:
                await self._context.add_cookies(cookies)
                await self._page.reload()
                await asyncio.sleep(1)

            # Initialize the client
            cookie_list = await self._context.cookies()
            cookie_dict = {c["name"]: c["value"] for c in cookie_list}

            self._client = XiaoHongShuClient(
                headers={
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                    "Cookie": "; ".join([f"{k}={v}" for k, v in cookie_dict.items()]),
                },
                playwright_page=self._page,
                cookie_dict=cookie_dict,
            )

            self._initialized = True
            logger.info("XHS crawler initialized")
            return True

        except Exception as e:
            logger.error(f"Initialization failed: {e}")
            return False

    async def login(self) -> bool:
        """
        Log in by scanning a QR code.

        Returns:
            True on success
        """
        if not await self.init(headless=False):
            return False

        try:
            logger.info("Please scan the QR code to log in to Xiaohongshu...")

            # Wait for login (up to 120 seconds)
            for i in range(120):
                await asyncio.sleep(1)
                cookies = await self._context.cookies()
                cookie_dict = {c["name"]: c["value"] for c in cookies}

                if "web_session" in cookie_dict:
                    logger.info("Login succeeded!")
                    self._save_cookies(cookies)

                    # Update the client
                    if self._client:
                        await self._client.update_cookies(self._context)

                    return True

                if i % 10 == 0:
                    logger.info(f"Waiting for login... ({i}/120)")

            logger.warning("Login timed out")
            return False

        except Exception as e:
            logger.error(f"Login failed: {e}")
            return False

    async def search_notes(
        self,
        keyword: str,
        page: int = 1,
        page_size: int = 20,
        sort: str = "general",
    ) -> List[Dict]:
        """
        Search notes.

        Args:
            keyword: search keyword
            page: page number
            page_size: number of results per page
            sort: sort order (general, time_descending, popularity_descending)

        Returns:
            List of parsed notes
        """
        if not self._initialized:
            if not await self.init():
                return []

        try:
            from media_platform.xhs.field import SearchSortType

            sort_map = {
                "general": SearchSortType.GENERAL,
                "time_descending": SearchSortType.LATEST,
                "popularity_descending": SearchSortType.MOST_POPULAR,
            }

            result = await self._client.get_note_by_keyword(
                keyword=keyword,
                page=page,
                page_size=page_size,
                sort=sort_map.get(sort, SearchSortType.GENERAL),
            )

            items = result.get("items", [])
            return self._parse_notes(items)

        except Exception as e:
            logger.error(f"Search failed: {e}")
            return []

async def get_note_detail(self, note_id: str, xsec_token: str = "") -> Optional[Dict]:
|
|
"""
|
|
获取笔记详情
|
|
|
|
Args:
|
|
note_id: 笔记 ID
|
|
xsec_token: 验证 token
|
|
|
|
Returns:
|
|
笔记详情
|
|
"""
|
|
if not self._initialized:
|
|
if not await self.init():
|
|
return None
|
|
|
|
try:
|
|
result = await self._client.get_note_by_id(
|
|
note_id=note_id,
|
|
xsec_source="pc_search",
|
|
xsec_token=xsec_token,
|
|
)
|
|
return result
|
|
|
|
except Exception as e:
|
|
logger.error(f"获取详情失败: {e}")
|
|
return None
|
|
|
|
    def _parse_notes(self, items: List[Dict]) -> List[Dict]:
        """Parse raw search results into flat note dicts."""
        notes = []
        for item in items:
            try:
                note_card = item.get("note_card", {})
                user = note_card.get("user", {})
                interact = note_card.get("interact_info", {})

                notes.append({
                    "id": item.get("id"),
                    "xsec_token": item.get("xsec_token", ""),
                    "title": note_card.get("display_title", ""),
                    "desc": note_card.get("desc", ""),
                    "type": note_card.get("type", ""),
                    "liked_count": interact.get("liked_count", "0"),
                    "collected_count": interact.get("collected_count", "0"),
                    "comment_count": interact.get("comment_count", "0"),
                    "user_id": user.get("user_id"),
                    "user_name": user.get("nickname"),
                    "cover": note_card.get("cover", {}).get("url_default", ""),
                    "tags": [t.get("name") for t in note_card.get("tag_list", [])],
                })
            except Exception as e:
                logger.warning(f"Failed to parse note: {e}")

        return notes

    def _load_cookies(self) -> List[Dict]:
        """Load cached cookies."""
        try:
            if self._cookie_cache_path.exists():
                with open(self._cookie_cache_path, 'r') as f:
                    return json.load(f)
        except Exception as e:
            logger.warning(f"Failed to load cookies: {e}")
        return []

    def _save_cookies(self, cookies: List[Dict]):
        """Save cookies to the cache file."""
        try:
            with open(self._cookie_cache_path, 'w') as f:
                json.dump(cookies, f)
            logger.info(f"Cookies saved to {self._cookie_cache_path}")
        except Exception as e:
            logger.error(f"Failed to save cookies: {e}")

    async def close(self):
        """Close the browser and release Playwright resources."""
        if self._context:
            await self._context.close()
        if self._browser:
            await self._browser.close()
        if self._playwright:
            await self._playwright.stop()
        self._initialized = False


# Global instance
_bridge_instance: Optional[XHSCrawlerBridge] = None


def get_xhs_bridge() -> XHSCrawlerBridge:
    """Get the global bridge instance."""
    global _bridge_instance
    if _bridge_instance is None:
        _bridge_instance = XHSCrawlerBridge()
    return _bridge_instance