244 lines
8.6 KiB
Python
244 lines
8.6 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
|
|||
|
|
"""
|
|||
|
|
XHS Spider Adapter
|
|||
|
|
小红书爬虫适配器
|
|||
|
|
|
|||
|
|
作为core模块与xhs_spider模块的桥梁,提供统一的接口
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import sys
|
|||
|
|
import os
|
|||
|
|
from typing import Dict, List, Optional, Any
|
|||
|
|
from pathlib import Path
|
|||
|
|
import logging
|
|||
|
|
|
|||
|
|
from .models import XHSNote, XHSSearchResult, SearchConfig
|
|||
|
|
|
|||
|
|
# Module-level logger used by everything in this adapter module.
logger = logging.getLogger(__name__)

# The crawler backend is imported defensively: if it cannot be imported the
# module still loads, and XHS_AVAILABLE records that the backend is missing.
try:
    from .xhs_spider import Data_Spider, XHS_Apis
except ImportError as e:
    logger.error(f"XHS Spider模块导入失败: {e}")
    # Bind the names anyway so that later references (annotations, guards,
    # re-exports) see None instead of raising NameError.
    Data_Spider = None
    XHS_Apis = None
    XHS_AVAILABLE = False
else:
    XHS_AVAILABLE = True
|
|||
|
|
|
|||
|
|
class XHSAdapter:
    """Adapter between the core module and the ``xhs_spider`` crawler.

    Wraps a ``Data_Spider`` instance and converts the raw dictionaries it
    returns into ``XHSNote`` / ``XHSSearchResult`` objects.
    """

    # Multipliers for abbreviated count strings such as "1.2万" or "3k".
    _COUNT_UNITS = {
        "千": 1_000,
        "k": 1_000,
        "万": 10_000,
        "w": 10_000,
        "亿": 100_000_000,
    }

    def __init__(self, cookies_str: Optional[str] = None):
        """Create the adapter and its underlying spider.

        Args:
            cookies_str: Cookie string passed to every spider call. ``None``
                is stored as an empty string.

        Raises:
            ImportError: If the module-level ``XHS_AVAILABLE`` flag is false.
            Exception: Whatever ``Data_Spider()`` raises is logged and
                re-raised unchanged.
        """
        self.cookies_str = cookies_str or ""
        self.spider: Optional[Data_Spider] = None
        self.available = XHS_AVAILABLE

        if not self.available:
            raise ImportError("XHS Spider模块不可用,请检查模块导入")

        try:
            self.spider = Data_Spider()
            logger.info("XHS Spider适配器初始化成功")
        except Exception as e:
            logger.error(f"XHS Spider初始化失败: {e}")
            raise

    @classmethod
    def _parse_count(cls, value: Any) -> int:
        """Convert a like/comment/share count of unknown type into an int.

        Strings may be plain integers ("123"), decimals, or abbreviated with a
        unit from ``_COUNT_UNITS`` ("1.2万" -> 12000); thousands separators
        and a trailing "+" are ignored. Decimal results are rounded. Anything
        that cannot be interpreted yields 0 rather than raising.
        """
        if isinstance(value, str):
            text = value.strip().replace(",", "").rstrip("+").strip().lower()
            if not text:
                return 0
            multiplier = cls._COUNT_UNITS.get(text[-1])
            if multiplier is None:
                multiplier = 1
            else:
                text = text[:-1]
            try:
                # Exact path for plain integers: no float rounding involved.
                return int(text) * multiplier
            except ValueError:
                pass
            try:
                # round() rather than int(): 2.3 * 10000 may land just below
                # the intended integer in binary floating point.
                return round(float(text) * multiplier)
            except (ValueError, OverflowError):
                return 0
        try:
            return int(value) if value else 0
        except (TypeError, ValueError, OverflowError):
            return 0

    @staticmethod
    def _extract_image_urls(image_list: Any) -> List[str]:
        """Return one URL per image: its first ``WB_DFT`` entry in ``info_list``.

        Images without a ``WB_DFT`` entry contribute nothing; a missing or
        ``None`` list yields an empty result.
        """
        urls: List[str] = []
        for img in image_list or []:
            for info in img.get('info_list') or []:
                if info.get('image_scene') == 'WB_DFT':
                    urls.append(info.get('url', ''))
                    break
        return urls

    @staticmethod
    def _failed_result(keyword: str, error: Any) -> XHSSearchResult:
        """Build the empty, unsuccessful result returned on any search failure."""
        return XHSSearchResult(
            keyword=keyword,
            notes=[],
            total_count=0,
            success=False,
            error_message=str(error),
        )

    def _note_from_search_item(self, note_data: Dict[str, Any]) -> XHSNote:
        """Convert one raw search-result item into an ``XHSNote``."""
        # ``or {}`` (not a .get default) so that explicit None values in the
        # payload do not break the attribute lookups below.
        note_card = note_data.get('note_card') or {}
        user_info = note_card.get('user') or {}
        interact_info = note_card.get('interact_info') or {}
        note_id = note_data.get('id', '')

        return XHSNote(
            note_id=note_id,
            title=note_card.get('display_title', ''),
            # Search results may not carry the full note text.
            content=note_card.get('desc', ''),
            author=user_info.get('nickname', ''),
            author_id=user_info.get('user_id', ''),
            tags=note_card.get('tag_list', []),
            images=self._extract_image_urls(note_card.get('image_list')),
            videos=note_card.get('video', []),
            likes=self._parse_count(interact_info.get('liked_count', 0)),
            comments=self._parse_count(interact_info.get('comment_count', 0)),
            # NOTE(review): search items are read under 'shared_count' while
            # get_note_info reads 'share_count' — confirm both keys against
            # the spider's actual payloads.
            shares=self._parse_count(interact_info.get('shared_count', 0)),
            created_time=note_card.get('time', ''),
            note_url=f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_data.get('xsec_token', '')}",
        )

    def search_notes(self, config: SearchConfig) -> XHSSearchResult:
        """Search notes by keyword and convert the hits to ``XHSNote`` objects.

        Only items whose ``model_type`` is ``'note'`` are converted; other
        items (and items that are not dictionaries) are skipped.

        Args:
            config: Search configuration; ``keyword``, ``max_notes``,
                ``sort_type`` and ``note_type`` are read from it.

        Returns:
            XHSSearchResult: ``success=True`` with the converted notes, or
            ``success=False`` with ``error_message`` set when the spider
            reports failure or any exception occurs during the search.

        Raises:
            RuntimeError: If the spider has not been initialised.
        """
        if not self.cookies_str:
            logger.warning("未设置cookies,搜索可能失败")

        if not self.spider:
            raise RuntimeError("Spider未初始化")

        try:
            # The spider API returns a (success, message, notes) triple.
            success, msg, notes = self.spider.xhs_apis.search_some_note(
                query=config.keyword,
                require_num=config.max_notes,
                cookies_str=self.cookies_str,
                sort_type_choice=config.sort_type,
                note_type=config.note_type,
            )

            if not success:
                logger.error(f"搜索失败: {msg}")
                return self._failed_result(config.keyword, msg)

            xhs_notes = [
                self._note_from_search_item(item)
                for item in notes or []
                if isinstance(item, dict) and item.get('model_type') == 'note'
            ]

            return XHSSearchResult(
                keyword=config.keyword,
                notes=xhs_notes,
                total_count=len(xhs_notes),
                success=True,
            )

        except Exception as e:
            # Best-effort API: report the failure in the result instead of
            # raising, but keep the traceback in the log.
            logger.exception(f"搜索异常: {e}")
            return self._failed_result(config.keyword, e)

    def search_notes_with_content(self, config: SearchConfig, fetch_content: bool = True) -> XHSSearchResult:
        """Search notes and optionally replace each hit with its detailed version.

        Args:
            config: Search configuration, forwarded to ``search_notes``.
            fetch_content: When true, ``get_note_info`` is called for every
                hit; when false the plain search result is returned.

        Returns:
            XHSSearchResult: The plain search result if the search failed or
            ``fetch_content`` is false; otherwise a successful result in which
            each note is the detailed note, or the original search hit when
            fetching its details failed.
        """
        search_result = self.search_notes(config)

        if not search_result.success or not fetch_content:
            return search_result

        enhanced_notes = []
        for note in search_result.notes:
            try:
                detailed_note = self.get_note_info(note.note_url)
                if detailed_note:
                    enhanced_notes.append(detailed_note)
                else:
                    # Detail fetch failed: fall back to the search-level data.
                    enhanced_notes.append(note)
                    logger.warning(f"获取笔记详情失败,使用基本信息: {note.note_id}")
            except Exception as e:
                logger.exception(f"获取笔记详情异常: {e}")
                enhanced_notes.append(note)

        return XHSSearchResult(
            keyword=config.keyword,
            notes=enhanced_notes,
            total_count=len(enhanced_notes),
            success=True,
        )

    def get_note_info(self, note_url: str) -> Optional[XHSNote]:
        """Fetch the detailed information for a single note.

        Args:
            note_url: URL of the note, passed to the spider unchanged.

        Returns:
            Optional[XHSNote]: The converted note, or ``None`` when the spider
            reports failure, returns no data, or an exception occurs.

        Raises:
            RuntimeError: If the spider has not been initialised.
        """
        if not self.cookies_str:
            logger.warning("未设置cookies,获取笔记信息可能失败")

        if not self.spider:
            raise RuntimeError("Spider未初始化")

        try:
            success, msg, note_info = self.spider.spider_note(
                note_url=note_url,
                cookies_str=self.cookies_str,
            )

            if not success or not note_info:
                logger.error(f"获取笔记信息失败: {msg}")
                return None

            user_info = note_info.get('user') or {}
            interact_info = note_info.get('interact_info') or {}

            return XHSNote(
                note_id=note_info.get('note_id', ''),
                title=note_info.get('title', ''),
                content=note_info.get('desc', ''),
                author=user_info.get('nickname', ''),
                author_id=user_info.get('user_id', ''),
                tags=note_info.get('tag_list', []),
                images=note_info.get('image_list', []),
                videos=note_info.get('video', []),
                # Counts are normalised to int exactly as in search_notes, so
                # both code paths produce the same field types.
                likes=self._parse_count(interact_info.get('liked_count', 0)),
                comments=self._parse_count(interact_info.get('comment_count', 0)),
                # NOTE(review): read under 'share_count' here but under
                # 'shared_count' for search items — confirm against the
                # spider's actual payloads.
                shares=self._parse_count(interact_info.get('share_count', 0)),
                created_time=note_info.get('time', ''),
                note_url=note_url,
            )

        except Exception as e:
            logger.exception(f"获取笔记信息异常: {e}")
            return None

    def is_available(self) -> bool:
        """Return True when the backend was importable and a spider exists."""
        return self.available and self.spider is not None

    def set_cookies(self, cookies_str: str):
        """Replace the cookie string used for subsequent spider calls.

        ``None`` is stored as an empty string, matching ``__init__``.
        """
        self.cookies_str = cookies_str or ""
        logger.info("Cookies已更新")
|