#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
|
|
小红书爬虫 - MediaCrawler 桥接模块
|
|
|
|
直接调用 libs/MediaCrawler 项目的功能
|
|
"""
|
|
|
|
import asyncio
import logging
import sys
import json
from pathlib import Path
from typing import List, Dict, Optional

logger = logging.getLogger(__name__)

# Add MediaCrawler to the Python path
MEDIACRAWLER_PATH = Path(__file__).parent.parent.parent.parent.parent / 'libs' / 'MediaCrawler'
if MEDIACRAWLER_PATH.exists():
    sys.path.insert(0, str(MEDIACRAWLER_PATH))
    MEDIACRAWLER_AVAILABLE = True
else:
    MEDIACRAWLER_AVAILABLE = False
    logger.warning(f"MediaCrawler path does not exist: {MEDIACRAWLER_PATH}")


class XHSCrawlerBridge:
    """
    Xiaohongshu crawler bridge.

    Wraps the Xiaohongshu crawler functionality of the MediaCrawler project.

    Usage:
        bridge = XHSCrawlerBridge()

        # Search notes
        notes = await bridge.search_notes("旅游攻略")

        # Get note detail
        detail = await bridge.get_note_detail("note_id")
    """

    def __init__(self):
        self._client = None
        self._playwright = None
        self._browser = None
        self._context = None
        self._page = None
        self._initialized = False

        # Cookie cache
        self._cookie_cache_path = Path(__file__).parent.parent.parent / 'cache' / 'xhs_cookies.json'
        self._cookie_cache_path.parent.mkdir(parents=True, exist_ok=True)

    @property
    def is_available(self) -> bool:
        """Check whether MediaCrawler is available."""
        return MEDIACRAWLER_AVAILABLE

    async def init(self, headless: bool = True) -> bool:
        """
        Initialize the crawler.

        Args:
            headless: whether to run the browser in headless mode

        Returns:
            True on success
        """
        if not MEDIACRAWLER_AVAILABLE:
            logger.error("MediaCrawler is not available")
            return False

        if self._initialized:
            return True

        try:
            # Import MediaCrawler modules
            from media_platform.xhs.client import XiaoHongShuClient
            from playwright.async_api import async_playwright

            # Launch the browser (kept on self so close() can release it)
            self._playwright = await async_playwright().start()
            self._browser = await self._playwright.chromium.launch(headless=headless)
            self._context = await self._browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
            )
            self._page = await self._context.new_page()

            # Visit Xiaohongshu
            await self._page.goto("https://www.xiaohongshu.com", wait_until="domcontentloaded")
            await asyncio.sleep(2)

            # Load cached cookies
            cookies = self._load_cookies()
            if cookies:
                await self._context.add_cookies(cookies)
                await self._page.reload()
                await asyncio.sleep(1)

            # Initialize the client
            cookie_list = await self._context.cookies()
            cookie_dict = {c["name"]: c["value"] for c in cookie_list}

            self._client = XiaoHongShuClient(
                headers={
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                    "Cookie": "; ".join([f"{k}={v}" for k, v in cookie_dict.items()]),
                },
                playwright_page=self._page,
                cookie_dict=cookie_dict,
            )

            self._initialized = True
            logger.info("XHS crawler initialized")
            return True

        except Exception as e:
            logger.error(f"Initialization failed: {e}")
            return False

    async def login(self) -> bool:
        """
        Log in by scanning a QR code.

        Returns:
            True on success
        """
        if not await self.init(headless=False):
            return False

        try:
            logger.info("Please scan the QR code to log in to Xiaohongshu...")

            # Wait for login (up to 120 seconds)
            for i in range(120):
                await asyncio.sleep(1)
                cookies = await self._context.cookies()
                cookie_dict = {c["name"]: c["value"] for c in cookies}

                if "web_session" in cookie_dict:
                    logger.info("Login succeeded!")
                    self._save_cookies(cookies)

                    # Update the client
                    if self._client:
                        await self._client.update_cookies(self._context)

                    return True

                if i % 10 == 0:
                    logger.info(f"Waiting for login... ({i}/120)")

            logger.warning("Login timed out")
            return False

        except Exception as e:
            logger.error(f"Login failed: {e}")
            return False

    async def search_notes(
        self,
        keyword: str,
        page: int = 1,
        page_size: int = 20,
        sort: str = "general",
    ) -> List[Dict]:
        """
        Search notes.

        Args:
            keyword: search keyword
            page: page number
            page_size: number of results per page
            sort: sort order (general, time_descending, popularity_descending)

        Returns:
            List of parsed notes
        """
        if not self._initialized:
            if not await self.init():
                return []

        try:
            from media_platform.xhs.field import SearchSortType

            sort_map = {
                "general": SearchSortType.GENERAL,
                "time_descending": SearchSortType.LATEST,
                "popularity_descending": SearchSortType.MOST_POPULAR,
            }

            result = await self._client.get_note_by_keyword(
                keyword=keyword,
                page=page,
                page_size=page_size,
                sort=sort_map.get(sort, SearchSortType.GENERAL),
            )

            items = result.get("items", [])
            return self._parse_notes(items)

        except Exception as e:
            logger.error(f"Search failed: {e}")
            return []

async def get_note_detail(self, note_id: str, xsec_token: str = "") -> Optional[Dict]:
|
|
"""
|
|
获取笔记详情
|
|
|
|
Args:
|
|
note_id: 笔记 ID
|
|
xsec_token: 验证 token
|
|
|
|
Returns:
|
|
笔记详情
|
|
"""
|
|
if not self._initialized:
|
|
if not await self.init():
|
|
return None
|
|
|
|
try:
|
|
result = await self._client.get_note_by_id(
|
|
note_id=note_id,
|
|
xsec_source="pc_search",
|
|
xsec_token=xsec_token,
|
|
)
|
|
return result
|
|
|
|
except Exception as e:
|
|
logger.error(f"获取详情失败: {e}")
|
|
return None
|
|
|
|
    def _parse_notes(self, items: List[Dict]) -> List[Dict]:
        """Parse raw search results into flat note dicts."""
        notes = []
        for item in items:
            try:
                note_card = item.get("note_card", {})
                user = note_card.get("user", {})
                interact = note_card.get("interact_info", {})

                notes.append({
                    "id": item.get("id"),
                    "xsec_token": item.get("xsec_token", ""),
                    "title": note_card.get("display_title", ""),
                    "desc": note_card.get("desc", ""),
                    "type": note_card.get("type", ""),
                    "liked_count": interact.get("liked_count", "0"),
                    "collected_count": interact.get("collected_count", "0"),
                    "comment_count": interact.get("comment_count", "0"),
                    "user_id": user.get("user_id"),
                    "user_name": user.get("nickname"),
                    "cover": note_card.get("cover", {}).get("url_default", ""),
                    "tags": [t.get("name") for t in note_card.get("tag_list", [])],
                })
            except Exception as e:
                logger.warning(f"Failed to parse note: {e}")

        return notes

    def _load_cookies(self) -> List[Dict]:
        """Load cached cookies."""
        try:
            if self._cookie_cache_path.exists():
                with open(self._cookie_cache_path, 'r') as f:
                    return json.load(f)
        except Exception as e:
            logger.warning(f"Failed to load cookies: {e}")
        return []

    def _save_cookies(self, cookies: List[Dict]):
        """Save cookies to the cache file."""
        try:
            with open(self._cookie_cache_path, 'w') as f:
                json.dump(cookies, f)
            logger.info(f"Cookies saved to {self._cookie_cache_path}")
        except Exception as e:
            logger.error(f"Failed to save cookies: {e}")

    async def close(self):
        """Close the browser and release Playwright resources."""
        if self._context:
            await self._context.close()
        if self._browser:
            await self._browser.close()
        if self._playwright:
            await self._playwright.stop()
        self._initialized = False


# Global instance
_bridge_instance: Optional[XHSCrawlerBridge] = None


def get_xhs_bridge() -> XHSCrawlerBridge:
    """Get the global bridge instance."""
    global _bridge_instance
    if _bridge_instance is None:
        _bridge_instance = XHSCrawlerBridge()
    return _bridge_instance