244 lines
8.6 KiB
Python
244 lines
8.6 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
|
||
"""
|
||
XHS Spider Adapter
|
||
小红书爬虫适配器
|
||
|
||
作为core模块与xhs_spider模块的桥梁,提供统一的接口
|
||
"""
|
||
|
||
import sys
|
||
import os
|
||
from typing import Dict, List, Optional, Any
|
||
from pathlib import Path
|
||
import logging
|
||
|
||
from .models import XHSNote, XHSSearchResult, SearchConfig
|
||
|
||
logger = logging.getLogger(__name__)

# The spider package is treated as optional at import time: a failed import is
# logged and recorded in XHS_AVAILABLE instead of aborting the module import.
try:
    from .xhs_spider import Data_Spider, XHS_Apis
except ImportError as e:
    logger.error(f"XHS Spider模块导入失败: {e}")
    # Keep the names bound so later references fail predictably (None)
    # rather than with a NameError.
    Data_Spider = None
    XHS_Apis = None
    XHS_AVAILABLE = False
else:
    XHS_AVAILABLE = True
|
||
|
||
class XHSAdapter:
    """Adapter that exposes the XHS spider through a uniform interface.

    Wraps a ``Data_Spider`` instance and converts the raw dictionaries it
    returns into ``XHSNote`` / ``XHSSearchResult`` objects.
    """

    # Multipliers for abbreviated count suffixes such as "1.2万".
    # NOTE(review): assumes the API may abbreviate large counts this way --
    # confirm against real responses.
    _COUNT_SUFFIXES = {
        "千": 1_000,
        "k": 1_000,
        "万": 10_000,
        "w": 10_000,
        "亿": 100_000_000,
    }

    def __init__(self, cookies_str: Optional[str] = None):
        """Create the adapter and its underlying spider.

        Args:
            cookies_str: Cookie string passed to every spider call; ``None``
                is stored as an empty string.

        Raises:
            ImportError: If the spider module could not be imported.
            Exception: Whatever ``Data_Spider()`` raises is logged and
                re-raised unchanged.
        """
        self.cookies_str = cookies_str or ""
        self.spider: Optional[Data_Spider] = None
        self.available = XHS_AVAILABLE

        if not self.available:
            raise ImportError("XHS Spider模块不可用,请检查模块导入")

        try:
            self.spider = Data_Spider()
        except Exception as e:
            logger.error(f"XHS Spider初始化失败: {e}")
            raise
        else:
            logger.info("XHS Spider适配器初始化成功")

    @classmethod
    def _parse_count(cls, value: Any) -> int:
        """Convert a like/comment/share counter to ``int``, never raising.

        Accepts numbers, numeric strings, strings with a trailing ``+`` and
        strings with an abbreviation suffix (e.g. ``"1.2万"`` -> 12000).
        Anything that cannot be interpreted yields 0.
        """
        if value is None:
            return 0

        if isinstance(value, str):
            text = value.strip().rstrip("+").strip()
            if not text:
                return 0
            try:
                return int(text)
            except ValueError:
                pass
            multiplier = cls._COUNT_SUFFIXES.get(text[-1].lower())
            if multiplier is None:
                # Unknown textual format: keep the historical fallback of 0.
                return 0
            try:
                # round() avoids float artefacts such as 2.3 * 10000.
                return int(round(float(text[:-1]) * multiplier))
            except (ValueError, OverflowError):
                return 0

        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return 0

    @staticmethod
    def _extract_image_urls(note_card: Dict[str, Any]) -> List[str]:
        """Return one URL per image: the first ``WB_DFT`` scene entry."""
        urls = []
        for image in note_card.get('image_list') or []:
            for info in image.get('info_list') or []:
                if info.get('image_scene') == 'WB_DFT':
                    urls.append(info.get('url', ''))
                    break  # only the first matching variant per image
        return urls

    @staticmethod
    def _failed_result(keyword: str, message: str) -> XHSSearchResult:
        """Build an empty, unsuccessful search result carrying ``message``."""
        return XHSSearchResult(
            keyword=keyword,
            notes=[],
            total_count=0,
            success=False,
            error_message=message,
        )

    def _note_from_search_item(self, note_data: Dict[str, Any]) -> XHSNote:
        """Convert one raw search item (``model_type == 'note'``) to ``XHSNote``."""
        # ``or {}`` also covers keys that are present but hold None.
        note_card = note_data.get('note_card') or {}
        user_info = note_card.get('user') or {}
        interact_info = note_card.get('interact_info') or {}
        note_id = note_data.get('id', '')
        xsec_token = note_data.get('xsec_token', '')

        return XHSNote(
            note_id=note_id,
            title=note_card.get('display_title', ''),
            # Search results may not carry the full note text.
            content=note_card.get('desc', ''),
            author=user_info.get('nickname', ''),
            author_id=user_info.get('user_id', ''),
            tags=note_card.get('tag_list', []),
            images=self._extract_image_urls(note_card),
            videos=note_card.get('video', []),
            likes=self._parse_count(interact_info.get('liked_count', 0)),
            comments=self._parse_count(interact_info.get('comment_count', 0)),
            shares=self._parse_count(interact_info.get('shared_count', 0)),
            created_time=note_card.get('time', ''),
            note_url=f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={xsec_token}",
        )

    def search_notes(self, config: SearchConfig) -> XHSSearchResult:
        """Search notes by keyword.

        Args:
            config: Search configuration; ``keyword``, ``max_notes``,
                ``sort_type`` and ``note_type`` are forwarded to the spider.

        Returns:
            XHSSearchResult: The converted notes on success. If the spider
            reports failure or any exception occurs, a result with
            ``success=False`` and ``error_message`` set is returned instead.

        Raises:
            RuntimeError: If the spider has not been initialized.
        """
        if not self.cookies_str:
            logger.warning("未设置cookies,搜索可能失败")

        if not self.spider:
            raise RuntimeError("Spider未初始化")

        try:
            success, msg, notes = self.spider.xhs_apis.search_some_note(
                query=config.keyword,
                require_num=config.max_notes,
                cookies_str=self.cookies_str,
                sort_type_choice=config.sort_type,
                note_type=config.note_type,
            )

            if not success:
                logger.error(f"搜索失败: {msg}")
                return self._failed_result(config.keyword, str(msg))

            # Only items tagged as notes are converted; other item types are skipped.
            xhs_notes = [
                self._note_from_search_item(item)
                for item in notes or []
                if item.get('model_type') == 'note'
            ]

            return XHSSearchResult(
                keyword=config.keyword,
                notes=xhs_notes,
                total_count=len(xhs_notes),
                success=True,
            )

        except Exception as e:
            # Boundary handler: callers get a failed result, not an exception.
            logger.error(f"搜索异常: {e}")
            return self._failed_result(config.keyword, str(e))

    def search_notes_with_content(self, config: SearchConfig, fetch_content: bool = True) -> XHSSearchResult:
        """Search notes and optionally replace each hit with its detailed version.

        Args:
            config: Search configuration.
            fetch_content: When False, the plain search result is returned.

        Returns:
            XHSSearchResult: The plain search result if the search failed or
            ``fetch_content`` is False; otherwise a successful result in which
            every note is the detailed note when it could be fetched, or the
            original search note when it could not.
        """
        search_result = self.search_notes(config)

        if not search_result.success or not fetch_content:
            return search_result

        enhanced_notes = []
        for note in search_result.notes:
            try:
                detailed_note = self.get_note_info(note.note_url)
                if detailed_note:
                    enhanced_notes.append(detailed_note)
                else:
                    # Detail fetch failed: keep the basic search data.
                    enhanced_notes.append(note)
                    logger.warning(f"获取笔记详情失败,使用基本信息: {note.note_id}")
            except Exception as e:
                logger.error(f"获取笔记详情异常: {e}")
                enhanced_notes.append(note)

        return XHSSearchResult(
            keyword=config.keyword,
            notes=enhanced_notes,
            total_count=len(enhanced_notes),
            success=True,
        )

    def get_note_info(self, note_url: str) -> Optional[XHSNote]:
        """Fetch detailed information for a single note.

        Args:
            note_url: URL of the note.

        Returns:
            The converted ``XHSNote``, or ``None`` if the spider reports
            failure, returns no data, or an exception occurs.

        Raises:
            RuntimeError: If the spider has not been initialized.
        """
        if not self.cookies_str:
            logger.warning("未设置cookies,获取笔记信息可能失败")

        if not self.spider:
            raise RuntimeError("Spider未初始化")

        try:
            success, msg, note_info = self.spider.spider_note(
                note_url=note_url,
                cookies_str=self.cookies_str,
            )

            if not success or not note_info:
                logger.error(f"获取笔记信息失败: {msg}")
                return None

            # NOTE(review): the nested 'user' / 'interact_info' layout read
            # here is assumed; confirm against what spider_note returns.
            user_info = note_info.get('user') or {}
            interact_info = note_info.get('interact_info') or {}

            return XHSNote(
                note_id=note_info.get('note_id', ''),
                title=note_info.get('title', ''),
                content=note_info.get('desc', ''),
                author=user_info.get('nickname', ''),
                author_id=user_info.get('user_id', ''),
                tags=note_info.get('tag_list', []),
                images=note_info.get('image_list', []),
                videos=note_info.get('video', []),
                # Counters are normalized the same way as in search results.
                likes=self._parse_count(interact_info.get('liked_count', 0)),
                comments=self._parse_count(interact_info.get('comment_count', 0)),
                # NOTE(review): search results read 'shared_count' while this
                # path reads 'share_count' -- confirm both key names.
                shares=self._parse_count(interact_info.get('share_count', 0)),
                created_time=note_info.get('time', ''),
                note_url=note_url,
            )

        except Exception as e:
            logger.error(f"获取笔记信息异常: {e}")
            return None

    def is_available(self) -> bool:
        """Return True when the spider module was imported and a spider exists."""
        return self.available and self.spider is not None

    def set_cookies(self, cookies_str: str):
        """Replace the cookie string used for subsequent spider calls."""
        self.cookies_str = cookies_str
        logger.info("Cookies已更新")