TravelContentCreator/core/xhs_adapter.py

244 lines
8.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
XHS Spider Adapter

Adapter for the Xiaohongshu (XHS) spider: the bridge between the core module
and the xhs_spider module, providing a unified interface.
"""
import sys
import os
from typing import Dict, List, Optional, Any
from pathlib import Path
import logging
from .models import XHSNote, XHSSearchResult, SearchConfig
logger = logging.getLogger(__name__)

# The spider package is imported defensively: when the import fails the module
# still loads, the failure is logged, and XHS_AVAILABLE records that the
# spider cannot be used.
try:
    from .xhs_spider import Data_Spider, XHS_Apis
except ImportError as e:
    logger.error(f"XHS Spider模块导入失败: {e}")
    XHS_AVAILABLE = False
else:
    # Only reached when the import above succeeded.
    XHS_AVAILABLE = True
class XHSAdapter:
    """Adapter that exposes the XHS spider through a uniform interface.

    Wraps a ``Data_Spider`` instance and converts the raw dict payloads it
    returns into ``XHSNote`` / ``XHSSearchResult`` objects.
    """

    def __init__(self, cookies_str: Optional[str] = None):
        """Initialise the adapter and create the underlying spider.

        Args:
            cookies_str: Cookie string passed to the spider on every call;
                ``None`` is stored as an empty string.

        Raises:
            ImportError: If the xhs_spider module could not be imported
                (``XHS_AVAILABLE`` is false).
            Exception: Any error raised by ``Data_Spider()`` is logged and
                re-raised unchanged.
        """
        self.cookies_str = cookies_str or ""
        self.spider: Optional[Data_Spider] = None
        self.available = XHS_AVAILABLE
        if not self.available:
            raise ImportError("XHS Spider模块不可用请检查模块导入")
        try:
            self.spider = Data_Spider()
            logger.info("XHS Spider适配器初始化成功")
        except Exception as e:
            logger.error(f"XHS Spider初始化失败: {e}")
            raise

    @staticmethod
    def _parse_count(value: Any) -> int:
        """Coerce an interaction counter to ``int``.

        Counters may arrive as strings or numbers. Falsy values and anything
        ``int()`` cannot convert yield 0 instead of raising, so one malformed
        counter does not abort the conversion of a whole result set.
        """
        # NOTE(review): abbreviated counters (e.g. a string with a unit
        # suffix) are not numeric for int() and therefore map to 0 --
        # confirm whether the API ever emits such values.
        if not value:
            return 0
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return 0

    @staticmethod
    def _default_image_urls(note_card: Dict[str, Any]) -> List[str]:
        """Return, for each image, the URL of its first ``WB_DFT`` scene entry."""
        urls: List[str] = []
        for img in note_card.get('image_list') or []:
            for info in img.get('info_list') or []:
                if info.get('image_scene') == 'WB_DFT':
                    urls.append(info.get('url', ''))
                    # Only the first matching entry per image is used.
                    break
        return urls

    def _note_from_search_item(self, note_data: Dict[str, Any]) -> XHSNote:
        """Build an ``XHSNote`` from one search-result item."""
        # ``or {}`` guards against keys that are present but set to None.
        note_card = note_data.get('note_card') or {}
        user_info = note_card.get('user') or {}
        interact_info = note_card.get('interact_info') or {}
        note_id = note_data.get('id', '')
        xsec_token = note_data.get('xsec_token', '')
        return XHSNote(
            note_id=note_id,
            title=note_card.get('display_title', ''),
            # Search results may not carry the full note text.
            content=note_card.get('desc', ''),
            author=user_info.get('nickname', ''),
            author_id=user_info.get('user_id', ''),
            tags=note_card.get('tag_list', []),
            images=self._default_image_urls(note_card),
            videos=note_card.get('video', []),
            likes=self._parse_count(interact_info.get('liked_count', 0)),
            comments=self._parse_count(interact_info.get('comment_count', 0)),
            shares=self._parse_count(interact_info.get('shared_count', 0)),
            created_time=note_card.get('time', ''),
            note_url=f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={xsec_token}"
        )

    def search_notes(self, config: SearchConfig) -> XHSSearchResult:
        """Search notes according to ``config``.

        Args:
            config: Search configuration; ``keyword``, ``max_notes``,
                ``sort_type`` and ``note_type`` are forwarded to the spider.

        Returns:
            XHSSearchResult: On success, the converted notes whose
            ``model_type`` is ``'note'``. When the spider reports failure or
            any exception occurs, a result with ``success=False``, no notes
            and ``error_message`` set.

        Raises:
            RuntimeError: If the spider has not been initialised.
        """
        if not self.cookies_str:
            logger.warning("未设置cookies搜索可能失败")
        if not self.spider:
            raise RuntimeError("Spider未初始化")
        try:
            # Delegate to the search method of the spider's XHS_Apis object.
            success, msg, notes = self.spider.xhs_apis.search_some_note(
                query=config.keyword,
                require_num=config.max_notes,
                cookies_str=self.cookies_str,
                sort_type_choice=config.sort_type,
                note_type=config.note_type
            )
            if not success:
                logger.error(f"搜索失败: {msg}")
                return XHSSearchResult(
                    keyword=config.keyword,
                    notes=[],
                    total_count=0,
                    success=False,
                    error_message=str(msg)
                )
            # Convert raw items to XHSNote objects, skipping non-note entries.
            xhs_notes = [
                self._note_from_search_item(note_data)
                for note_data in notes
                if note_data.get('model_type') == 'note'
            ]
            return XHSSearchResult(
                keyword=config.keyword,
                notes=xhs_notes,
                total_count=len(xhs_notes),
                success=True
            )
        except Exception as e:
            # Best-effort boundary: report the failure in the result object
            # instead of propagating it to the caller.
            logger.error(f"搜索异常: {e}")
            return XHSSearchResult(
                keyword=config.keyword,
                notes=[],
                total_count=0,
                success=False,
                error_message=str(e)
            )

    def search_notes_with_content(self, config: SearchConfig, fetch_content: bool = True) -> XHSSearchResult:
        """Search notes and optionally replace each with its detailed version.

        Args:
            config: Search configuration passed to ``search_notes``.
            fetch_content: When true, ``get_note_info`` is called for every
                note found by the basic search.

        Returns:
            XHSSearchResult: The basic search result when the search failed
            or ``fetch_content`` is false; otherwise a new successful result
            in which each note is the detailed note, or the basic note when
            fetching its details failed.
        """
        # Run the basic search first.
        search_result = self.search_notes(config)
        if not search_result.success or not fetch_content:
            return search_result
        # Fetch details per note, falling back to the basic data on failure.
        enhanced_notes = []
        for note in search_result.notes:
            try:
                detailed_note = self.get_note_info(note.note_url)
                if detailed_note:
                    enhanced_notes.append(detailed_note)
                else:
                    enhanced_notes.append(note)
                    logger.warning(f"获取笔记详情失败,使用基本信息: {note.note_id}")
            except Exception as e:
                logger.error(f"获取笔记详情异常: {e}")
                enhanced_notes.append(note)
        return XHSSearchResult(
            keyword=config.keyword,
            notes=enhanced_notes,
            total_count=len(enhanced_notes),
            success=True
        )

    def get_note_info(self, note_url: str) -> Optional[XHSNote]:
        """Fetch the detailed information of a single note.

        Args:
            note_url: URL of the note, passed to the spider unchanged.

        Returns:
            Optional[XHSNote]: The converted note, or ``None`` when the spider
            reports failure, returns no data, or an exception occurs.

        Raises:
            RuntimeError: If the spider has not been initialised.
        """
        if not self.cookies_str:
            logger.warning("未设置cookies获取笔记信息可能失败")
        if not self.spider:
            raise RuntimeError("Spider未初始化")
        try:
            success, msg, note_info = self.spider.spider_note(
                note_url=note_url,
                cookies_str=self.cookies_str
            )
            if not success or not note_info:
                logger.error(f"获取笔记信息失败: {msg}")
                return None
            user_info = note_info.get('user') or {}
            interact_info = note_info.get('interact_info') or {}
            # Counters go through the same int conversion as in search_notes
            # so that notes from both paths carry the same types.
            # NOTE(review): this payload is read with 'share_count' while the
            # search payload is read with 'shared_count' -- confirm both key
            # names against the spider output.
            return XHSNote(
                note_id=note_info.get('note_id', ''),
                title=note_info.get('title', ''),
                content=note_info.get('desc', ''),
                author=user_info.get('nickname', ''),
                author_id=user_info.get('user_id', ''),
                tags=note_info.get('tag_list', []),
                images=note_info.get('image_list', []),
                videos=note_info.get('video', []),
                likes=self._parse_count(interact_info.get('liked_count', 0)),
                comments=self._parse_count(interact_info.get('comment_count', 0)),
                shares=self._parse_count(interact_info.get('share_count', 0)),
                created_time=note_info.get('time', ''),
                note_url=note_url
            )
        except Exception as e:
            logger.error(f"获取笔记信息异常: {e}")
            return None

    def is_available(self) -> bool:
        """Return whether the spider module was imported and a spider exists."""
        return self.available and self.spider is not None

    def set_cookies(self, cookies_str: str) -> None:
        """Replace the cookie string used for subsequent requests."""
        self.cookies_str = cookies_str
        logger.info("Cookies已更新")