#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
XHS Spider Adapter.

Adapter for the Xiaohongshu (XHS) spider. It acts as the bridge between the
core module and the xhs_spider module and exposes a unified interface.
"""

import sys
import os
from typing import Dict, List, Optional, Any
from pathlib import Path
import logging

from .models import XHSNote, XHSSearchResult, SearchConfig

logger = logging.getLogger(__name__)

try:
    from .xhs_spider import Data_Spider, XHS_Apis
    XHS_AVAILABLE = True
except ImportError as e:
    # Keep the module importable; XHSAdapter.__init__ raises later instead.
    logger.error(f"XHS Spider模块导入失败: {e}")
    XHS_AVAILABLE = False


# Multipliers for the unit suffixes accepted by _parse_count.
_COUNT_UNITS = {"万": 10_000, "亿": 100_000_000}


def _parse_count(value: Any) -> int:
    """Convert an interaction counter to ``int`` without ever raising.

    Accepts ints, floats, ``None`` and strings. Strings may be plain
    integers (``"123"``) or display-style values with a trailing ``+``
    and/or a unit suffix (``"1.2万"``, ``"10万+"``).

    Args:
        value: Raw counter value taken from a payload.

    Returns:
        The parsed count, or 0 when the value is empty or unparseable.
    """
    if isinstance(value, str):
        text = value.strip().rstrip("+").strip()
        multiplier = 1
        if text and text[-1] in _COUNT_UNITS:
            multiplier = _COUNT_UNITS[text[-1]]
            text = text[:-1]
        try:
            return int(text) * multiplier
        except ValueError:
            pass
        try:
            # round() avoids truncation artefacts such as 1.15 * 10000.
            return int(round(float(text) * multiplier))
        except (ValueError, OverflowError):
            return 0
    try:
        return int(value) if value else 0
    except (TypeError, ValueError, OverflowError):
        return 0


class XHSAdapter:
    """Adapter around ``Data_Spider`` for searching and fetching XHS notes."""

    def __init__(self, cookies_str: Optional[str] = None):
        """
        Initialise the adapter.

        Args:
            cookies_str: Cookie string; ``None`` is stored as an empty string.

        Raises:
            ImportError: If the xhs_spider module could not be imported.
            Exception: Whatever ``Data_Spider()`` raises is logged and
                re-raised unchanged.
        """
        self.cookies_str = cookies_str or ""
        self.spider: Optional[Data_Spider] = None
        self.available = XHS_AVAILABLE

        if not self.available:
            raise ImportError("XHS Spider模块不可用,请检查模块导入")

        try:
            self.spider = Data_Spider()
            logger.info("XHS Spider适配器初始化成功")
        except Exception as e:
            logger.error(f"XHS Spider初始化失败: {e}")
            raise

    @staticmethod
    def _failed_result(keyword: str, error: Any) -> XHSSearchResult:
        """Build an empty, unsuccessful search result carrying ``error``."""
        return XHSSearchResult(
            keyword=keyword,
            notes=[],
            total_count=0,
            success=False,
            error_message=str(error)
        )

    @staticmethod
    def _extract_image_urls(note_card: Dict[str, Any]) -> List[str]:
        """Collect one URL per image: the first entry whose scene is 'WB_DFT'."""
        image_urls = []
        for img in note_card.get('image_list') or []:
            for info in img.get('info_list') or []:
                if info.get('image_scene') == 'WB_DFT':
                    image_urls.append(info.get('url', ''))
                    break
        return image_urls

    @classmethod
    def _note_from_search_item(cls, note_data: Dict[str, Any]) -> XHSNote:
        """Convert one search-result item (model_type 'note') into an XHSNote."""
        # ``or {}`` also covers keys that are present but hold None.
        note_card = note_data.get('note_card') or {}
        user_info = note_card.get('user') or {}
        interact_info = note_card.get('interact_info') or {}
        note_id = note_data.get('id', '')
        xsec_token = note_data.get('xsec_token', '')

        # NOTE(review): this path reads 'shared_count' while get_note_info
        # reads 'share_count' -- confirm both keys against the upstream payloads.
        return XHSNote(
            note_id=note_id,
            title=note_card.get('display_title', ''),
            # Search results may not carry the full note text.
            content=note_card.get('desc', ''),
            author=user_info.get('nickname', ''),
            author_id=user_info.get('user_id', ''),
            tags=note_card.get('tag_list', []),
            images=cls._extract_image_urls(note_card),
            videos=note_card.get('video', []),
            likes=_parse_count(interact_info.get('liked_count', 0)),
            comments=_parse_count(interact_info.get('comment_count', 0)),
            shares=_parse_count(interact_info.get('shared_count', 0)),
            created_time=note_card.get('time', ''),
            note_url=f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={xsec_token}"
        )

    def search_notes(self, config: SearchConfig) -> XHSSearchResult:
        """
        Search XHS notes.

        Args:
            config: Search configuration (keyword, max_notes, sort_type,
                note_type are read).

        Returns:
            XHSSearchResult: The converted notes on success; on an API failure
            or any exception, an empty result with ``success=False`` and the
            error message.

        Raises:
            RuntimeError: If the spider has not been initialised.
        """
        if not self.cookies_str:
            logger.warning("未设置cookies,搜索可能失败")

        if not self.spider:
            raise RuntimeError("Spider未初始化")

        try:
            # Delegate to the search method exposed by the spider's API object.
            success, msg, notes = self.spider.xhs_apis.search_some_note(
                query=config.keyword,
                require_num=config.max_notes,
                cookies_str=self.cookies_str,
                sort_type_choice=config.sort_type,
                note_type=config.note_type
            )

            if not success:
                logger.error(f"搜索失败: {msg}")
                return self._failed_result(config.keyword, msg)

            # Only items flagged as notes are converted; others are skipped.
            xhs_notes = [
                self._note_from_search_item(note_data)
                for note_data in notes or []
                if note_data.get('model_type') == 'note'
            ]

            return XHSSearchResult(
                keyword=config.keyword,
                notes=xhs_notes,
                total_count=len(xhs_notes),
                success=True
            )

        except Exception as e:
            # Boundary handler: report failure to the caller, keep the traceback.
            logger.exception(f"搜索异常: {e}")
            return self._failed_result(config.keyword, e)

    def search_notes_with_content(self, config: SearchConfig, fetch_content: bool = True) -> XHSSearchResult:
        """
        Search XHS notes and optionally fetch the details of each one.

        Args:
            config: Search configuration.
            fetch_content: Whether to fetch detailed content for every note.

        Returns:
            XHSSearchResult: The plain search result when the search failed or
            ``fetch_content`` is false; otherwise a result in which each note
            is replaced by its detailed version when that lookup succeeded.
        """
        # Run the basic search first.
        search_result = self.search_notes(config)

        if not search_result.success or not fetch_content:
            return search_result

        enhanced_notes = []
        for note in search_result.notes:
            try:
                detailed_note = self.get_note_info(note.note_url)
            except Exception as e:
                logger.exception(f"获取笔记详情异常: {e}")
                detailed_note = None
            else:
                if not detailed_note:
                    logger.warning(f"获取笔记详情失败,使用基本信息: {note.note_id}")
            # Fall back to the search-level note when details are unavailable.
            enhanced_notes.append(detailed_note or note)

        return XHSSearchResult(
            keyword=config.keyword,
            notes=enhanced_notes,
            total_count=len(enhanced_notes),
            success=True
        )

    def get_note_info(self, note_url: str) -> Optional[XHSNote]:
        """
        Fetch detailed information for a single note.

        Args:
            note_url: URL of the note.

        Returns:
            XHSNote: The note, or ``None`` when the spider reports failure,
            returns no data, or an exception occurs.

        Raises:
            RuntimeError: If the spider has not been initialised.
        """
        if not self.cookies_str:
            logger.warning("未设置cookies,获取笔记信息可能失败")

        if not self.spider:
            raise RuntimeError("Spider未初始化")

        try:
            success, msg, note_info = self.spider.spider_note(
                note_url=note_url,
                cookies_str=self.cookies_str
            )

            if not success or not note_info:
                logger.error(f"获取笔记信息失败: {msg}")
                return None

            user_info = note_info.get('user') or {}
            interact_info = note_info.get('interact_info') or {}

            # Counters go through _parse_count so that both lookup paths hand
            # XHSNote the same integer type.
            return XHSNote(
                note_id=note_info.get('note_id', ''),
                title=note_info.get('title', ''),
                content=note_info.get('desc', ''),
                author=user_info.get('nickname', ''),
                author_id=user_info.get('user_id', ''),
                tags=note_info.get('tag_list', []),
                images=note_info.get('image_list', []),
                videos=note_info.get('video', []),
                likes=_parse_count(interact_info.get('liked_count', 0)),
                comments=_parse_count(interact_info.get('comment_count', 0)),
                shares=_parse_count(interact_info.get('share_count', 0)),
                created_time=note_info.get('time', ''),
                note_url=note_url
            )

        except Exception as e:
            logger.exception(f"获取笔记信息异常: {e}")
            return None

    def is_available(self) -> bool:
        """Return True when the spider module imported and a spider instance exists."""
        return self.available and self.spider is not None

    def set_cookies(self, cookies_str: str) -> None:
        """Replace the cookie string used for subsequent requests."""
        self.cookies_str = cookies_str
        logger.info("Cookies已更新")