bangbang-aigc-server/core/xhs_adapter.py

244 lines
8.6 KiB
Python
Raw Normal View History

2025-07-31 15:35:23 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
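"""
XHS Spider Adapter

Adapter for the Xiaohongshu (XHS) crawler.
Acts as the bridge between the core module and the xhs_spider module and
provides a unified interface.
"""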
import sys
import os
from typing import Dict, List, Optional, Any
from pathlib import Path
import logging
from .models import XHSNote, XHSSearchResult, SearchConfig
logger = logging.getLogger(__name__)
# Optional dependency: record whether the crawler package can be imported so
# that XHSAdapter can refuse to start with a clear ImportError later on,
# instead of this module failing at import time.
try:
    from .xhs_spider import Data_Spider, XHS_Apis
except ImportError as import_error:
    XHS_AVAILABLE = False
    logger.error(f"XHS Spider模块导入失败: {import_error}")
else:
    XHS_AVAILABLE = True
class XHSAdapter:
    """Adapter between the core models and the ``xhs_spider`` crawler.

    Wraps a ``Data_Spider`` instance and converts the dictionaries it returns
    into ``XHSNote`` / ``XHSSearchResult`` objects.

    Project types are annotated as string forward references so that defining
    this class never depends on ``Data_Spider`` being bound (it is not when
    the guarded import of ``xhs_spider`` fails).
    """

    # Suffix multipliers for abbreviated counts.
    # NOTE(review): the API appears to abbreviate large counts (e.g. "1.2万");
    # confirm the exact formats against live payloads.
    _COUNT_UNITS = {"千": 1_000, "万": 10_000, "亿": 100_000_000}

    def __init__(self, cookies_str: Optional[str] = None):
        """Create the adapter and its underlying spider.

        Args:
            cookies_str: Cookie string passed to every spider call. ``None``
                is stored as an empty string.

        Raises:
            ImportError: If the xhs_spider module could not be imported.
            Exception: Anything raised by ``Data_Spider()`` is logged and
                re-raised unchanged.
        """
        self.cookies_str = cookies_str or ""
        self.spider: Optional["Data_Spider"] = None
        self.available = XHS_AVAILABLE

        if not self.available:
            raise ImportError("XHS Spider模块不可用请检查模块导入")

        try:
            self.spider = Data_Spider()
        except Exception as e:
            logger.error(f"XHS Spider初始化失败: {e}")
            raise
        logger.info("XHS Spider适配器初始化成功")

    @classmethod
    def _parse_count(cls, raw) -> int:
        """Convert an interaction count of unknown format into an ``int``.

        Accepts ints, floats and strings. Strings may carry surrounding
        whitespace, thousands separators, a trailing ``+`` and one unit
        suffix from ``_COUNT_UNITS``. Anything that cannot be interpreted
        yields ``0`` instead of raising, so one odd value cannot abort the
        conversion of a whole result set.
        """
        if isinstance(raw, (int, float)):
            try:
                return int(raw)
            except (ValueError, OverflowError):
                # NaN / infinity cannot be represented as an int.
                return 0
        if not isinstance(raw, str):
            return 0

        text = raw.strip().replace(",", "").rstrip("+")
        if not text:
            return 0

        multiplier = cls._COUNT_UNITS.get(text[-1])
        if multiplier is None:
            multiplier = 1
        else:
            text = text[:-1]

        try:
            return int(text) * multiplier
        except ValueError:
            pass
        try:
            # Fractional values such as "1.2" (from "1.2万").
            return int(round(float(text) * multiplier))
        except (ValueError, OverflowError):
            return 0

    @classmethod
    def _interaction_counts(cls, interact_info) -> tuple:
        """Return ``(likes, comments, shares)`` from an interaction dict.

        The share counter is read from ``shared_count`` and, when that key is
        absent, from ``share_count``; both spellings are used in this module.
        A missing or ``None`` dict yields ``(0, 0, 0)``.
        """
        info = interact_info or {}
        shares = info.get('shared_count')
        if shares is None:
            shares = info.get('share_count', 0)
        return (
            cls._parse_count(info.get('liked_count', 0)),
            cls._parse_count(info.get('comment_count', 0)),
            cls._parse_count(shares),
        )

    @staticmethod
    def _extract_image_urls(image_list) -> List[str]:
        """Return the URL of the ``WB_DFT`` variant of each image.

        Images without a ``WB_DFT`` variant, or whose variant has no URL, are
        skipped. ``None`` is treated as an empty list.
        """
        urls = []
        for image in image_list or []:
            for variant in image.get('info_list') or []:
                if variant.get('image_scene') == 'WB_DFT':
                    url = variant.get('url', '')
                    if url:
                        urls.append(url)
                    # Only the first WB_DFT variant of an image is used.
                    break
        return urls

    @classmethod
    def _note_from_search_item(cls, note_data: dict) -> "XHSNote":
        """Build an ``XHSNote`` from one ``model_type == 'note'`` search item."""
        # ``or {}`` also covers keys that are present but set to None.
        note_card = note_data.get('note_card') or {}
        user_info = note_card.get('user') or {}
        likes, comments, shares = cls._interaction_counts(
            note_card.get('interact_info')
        )
        note_id = note_data.get('id', '')
        xsec_token = note_data.get('xsec_token', '')

        return XHSNote(
            note_id=note_id,
            title=note_card.get('display_title', ''),
            # Search results may not carry the full note text.
            content=note_card.get('desc', ''),
            author=user_info.get('nickname', ''),
            author_id=user_info.get('user_id', ''),
            tags=note_card.get('tag_list', []),
            images=cls._extract_image_urls(note_card.get('image_list')),
            videos=note_card.get('video', []),
            likes=likes,
            comments=comments,
            shares=shares,
            created_time=note_card.get('time', ''),
            note_url=(
                f"https://www.xiaohongshu.com/explore/{note_id}"
                f"?xsec_token={xsec_token}"
            ),
        )

    @staticmethod
    def _failed_search(keyword, error) -> "XHSSearchResult":
        """Build the empty, unsuccessful result returned on any search failure."""
        return XHSSearchResult(
            keyword=keyword,
            notes=[],
            total_count=0,
            success=False,
            error_message=str(error),
        )

    def search_notes(self, config: "SearchConfig") -> "XHSSearchResult":
        """Search notes and convert the hits into ``XHSNote`` objects.

        Args:
            config: Search settings; ``keyword``, ``max_notes``, ``sort_type``
                and ``note_type`` are read.

        Returns:
            An ``XHSSearchResult``. API failures and unexpected exceptions are
            reported through ``success=False`` and ``error_message`` rather
            than raised.

        Raises:
            RuntimeError: If the spider has not been initialised.
        """
        if not self.cookies_str:
            logger.warning("未设置cookies搜索可能失败")
        if not self.spider:
            raise RuntimeError("Spider未初始化")

        try:
            success, msg, notes = self.spider.xhs_apis.search_some_note(
                query=config.keyword,
                require_num=config.max_notes,
                cookies_str=self.cookies_str,
                sort_type_choice=config.sort_type,
                note_type=config.note_type,
            )
            if not success:
                logger.error(f"搜索失败: {msg}")
                return self._failed_search(config.keyword, msg)

            # Only items tagged as notes are converted; a missing result list
            # is treated as "no hits".
            xhs_notes = [
                self._note_from_search_item(item)
                for item in notes or []
                if isinstance(item, dict) and item.get('model_type') == 'note'
            ]
            return XHSSearchResult(
                keyword=config.keyword,
                notes=xhs_notes,
                total_count=len(xhs_notes),
                success=True,
            )
        except Exception as e:
            logger.error(f"搜索异常: {e}")
            return self._failed_search(config.keyword, e)

    def search_notes_with_content(
        self, config: "SearchConfig", fetch_content: bool = True
    ) -> "XHSSearchResult":
        """Search notes and optionally replace each hit with its detail view.

        Args:
            config: Search settings, forwarded to ``search_notes``.
            fetch_content: When true, ``get_note_info`` is called for every
                hit; a hit whose detail lookup fails keeps its basic data.

        Returns:
            The plain search result if the search failed or ``fetch_content``
            is false, otherwise a successful result holding the enriched notes.
        """
        search_result = self.search_notes(config)
        if not search_result.success or not fetch_content:
            return search_result

        enhanced_notes = []
        for note in search_result.notes:
            try:
                detailed_note = self.get_note_info(note.note_url)
            except Exception as e:
                logger.error(f"获取笔记详情异常: {e}")
                enhanced_notes.append(note)
                continue

            if detailed_note:
                enhanced_notes.append(detailed_note)
            else:
                # Fall back to the data the search itself returned.
                enhanced_notes.append(note)
                logger.warning(f"获取笔记详情失败,使用基本信息: {note.note_id}")

        return XHSSearchResult(
            keyword=config.keyword,
            notes=enhanced_notes,
            total_count=len(enhanced_notes),
            success=True,
        )

    def get_note_info(self, note_url: str) -> Optional["XHSNote"]:
        """Fetch one note by URL.

        Args:
            note_url: URL of the note; also stored on the returned object.

        Returns:
            The note, or ``None`` when the spider reports failure, returns no
            data, or raises.

        Raises:
            RuntimeError: If the spider has not been initialised.
        """
        if not self.cookies_str:
            logger.warning("未设置cookies获取笔记信息可能失败")
        if not self.spider:
            raise RuntimeError("Spider未初始化")

        try:
            success, msg, note_info = self.spider.spider_note(
                note_url=note_url,
                cookies_str=self.cookies_str,
            )
            if not success or not note_info:
                logger.error(f"获取笔记信息失败: {msg}")
                return None

            # NOTE(review): assumes spider_note returns nested 'user' and
            # 'interact_info' dicts; verify against its actual return shape.
            user_info = note_info.get('user') or {}
            likes, comments, shares = self._interaction_counts(
                note_info.get('interact_info')
            )
            return XHSNote(
                note_id=note_info.get('note_id', ''),
                title=note_info.get('title', ''),
                content=note_info.get('desc', ''),
                author=user_info.get('nickname', ''),
                author_id=user_info.get('user_id', ''),
                tags=note_info.get('tag_list', []),
                images=note_info.get('image_list', []),
                videos=note_info.get('video', []),
                # Counts are normalised to int, matching search_notes.
                likes=likes,
                comments=comments,
                shares=shares,
                created_time=note_info.get('time', ''),
                note_url=note_url,
            )
        except Exception as e:
            logger.error(f"获取笔记信息异常: {e}")
            return None

    def is_available(self) -> bool:
        """Return whether the spider module was imported and a spider exists."""
        return self.available and self.spider is not None

    def set_cookies(self, cookies_str: str) -> None:
        """Replace the cookie string used for subsequent requests."""
        self.cookies_str = cookies_str
        logger.info("Cookies已更新")