import logging
import os

from .apis.xhs_pc_apis import XHS_Apis
from .xhs_utils.common_util import init
from .xhs_utils.data_util import handle_note_info, download_note, save_to_xlsx

logger = logging.getLogger(__name__)
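# Note: getLogger only creates the logger; without configuration the root logger
# prints WARNING and above, so the INFO progress messages below stay hidden unless
# the entry point calls e.g. logging.basicConfig(level=logging.INFO).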
class Data_Spider():
    def __init__(self):
        self.xhs_apis = XHS_Apis()

    def spider_note(self, note_url: str, cookies_str: str, proxies=None):
        """
        Crawl the info of a single note
        :param note_url: URL of the note
        :param cookies_str: your cookies
        :param proxies: optional proxies for the requests
        :return: (success, msg, note_info)
        """
        note_info = None
        try:
            success, msg, note_info = self.xhs_apis.get_note_info(note_url, cookies_str, proxies)
            if success:
                note_info = note_info['data']['items'][0]
                note_info['url'] = note_url
                note_info = handle_note_info(note_info)
        except Exception as e:
            success = False
            msg = str(e)
        logger.info(f'Crawled note info {note_url}: {success}, msg: {msg}')
        return success, msg, note_info
    def spider_some_note(self, notes: list, cookies_str: str, base_path: dict, save_choice: str, excel_name: str = '', proxies=None):
        """
        Crawl the info of a list of notes
        :param notes: list of note URLs
        :param cookies_str: your cookies
        :param base_path: dict with the 'media' and 'excel' output directories
        :param save_choice: 'all', 'media' (or 'media-video' / 'media-image'), or 'excel'
        :param excel_name: workbook name, required when save_choice is 'all' or 'excel'
        :param proxies: optional proxies for the requests
        """
        if (save_choice == 'all' or save_choice == 'excel') and excel_name == '':
            raise ValueError('excel_name must not be empty')
        note_list = []
        for note_url in notes:
            success, msg, note_info = self.spider_note(note_url, cookies_str, proxies)
            if note_info is not None and success:
                note_list.append(note_info)
        for note_info in note_list:
            if save_choice == 'all' or 'media' in save_choice:
                download_note(note_info, base_path['media'], save_choice)
        if save_choice == 'all' or save_choice == 'excel':
            file_path = os.path.abspath(os.path.join(base_path['excel'], f'{excel_name}.xlsx'))
            save_to_xlsx(note_list, file_path)
    def spider_user_all_note(self, user_url: str, cookies_str: str, base_path: dict, save_choice: str, excel_name: str = '', proxies=None):
        """
        Crawl all notes of a user
        :param user_url: URL of the user profile
        :param cookies_str: your cookies
        :param base_path: dict with the 'media' and 'excel' output directories
        :param save_choice: 'all', 'media' (or 'media-video' / 'media-image'), or 'excel'
        :param proxies: optional proxies for the requests
        :return: (note_list, success, msg)
        """
        note_list = []
        try:
            success, msg, all_note_info = self.xhs_apis.get_user_all_notes(user_url, cookies_str, proxies)
            if success:
                logger.info(f'User {user_url} note count: {len(all_note_info)}')
                for simple_note_info in all_note_info:
                    note_url = f"https://www.xiaohongshu.com/explore/{simple_note_info['note_id']}?xsec_token={simple_note_info['xsec_token']}"
                    note_list.append(note_url)
            if save_choice == 'all' or save_choice == 'excel':
                # Use the user id from the profile URL as the workbook name
                excel_name = user_url.split('/')[-1].split('?')[0]
            self.spider_some_note(note_list, cookies_str, base_path, save_choice, excel_name, proxies)
        except Exception as e:
            success = False
            msg = str(e)
        logger.info(f'Crawled all notes of user {user_url}: {success}, msg: {msg}')
        return note_list, success, msg
    def spider_some_search_note(self, query: str, require_num: int, cookies_str: str, base_path: dict, save_choice: str, sort_type_choice=0, note_type=0, note_time=0, note_range=0, pos_distance=0, geo: dict = None, excel_name: str = '', proxies=None):
        """
        Search a given number of notes with the chosen sort order and note type
        :param query: search keyword
        :param require_num: number of notes to fetch
        :param cookies_str: your cookies
        :param base_path: dict with the 'media' and 'excel' output directories
        :param save_choice: 'all', 'media' (or 'media-video' / 'media-image'), or 'excel'
        :param sort_type_choice: sort order; 0 overall, 1 newest, 2 most liked, 3 most commented, 4 most collected
        :param note_type: note type; 0 all, 1 video notes, 2 normal (image/text) notes
        :param note_time: note age; 0 any time, 1 within a day, 2 within a week, 3 within half a year
        :param note_range: note scope; 0 all, 1 viewed, 2 not viewed, 3 followed
        :param pos_distance: distance filter; 0 off, 1 same city, 2 nearby (1 or 2 requires geo)
        :param geo: dict with latitude and longitude, required when pos_distance is 1 or 2
        :return: (note_list, success, msg)
        """
        note_list = []
        try:
            success, msg, notes = self.xhs_apis.search_some_note(query, require_num, cookies_str, sort_type_choice, note_type, note_time, note_range, pos_distance, geo, proxies)
            if success:
                # Keep only note entries; the search feed can also contain other model types
                notes = list(filter(lambda x: x['model_type'] == "note", notes))
                logger.info(f'Search keyword {query} note count: {len(notes)}')
                for note in notes:
                    note_url = f"https://www.xiaohongshu.com/explore/{note['id']}?xsec_token={note['xsec_token']}"
                    note_list.append(note_url)
            if save_choice == 'all' or save_choice == 'excel':
                # Use the search keyword as the workbook name
                excel_name = query
            self.spider_some_note(note_list, cookies_str, base_path, save_choice, excel_name, proxies)
        except Exception as e:
            success = False
            msg = str(e)
        logger.info(f'Searched notes for keyword {query}: {success}, msg: {msg}')
        return note_list, success, msg
if __name__ == '__main__':
    """
    This file is the spider's entry point and can be run directly
    (because of the relative imports above, run it as a module rather than as a loose script).
    apis/xhs_pc_apis.py is the spider's API file: it contains all of Xiaohongshu's data endpoints and can be wrapped further.
    apis/xhs_creator_apis.py is the API file for the Xiaohongshu creator center.
    Thanks for starring and following!
    """
    cookies_str, base_path = init()
    data_spider = Data_Spider()
    """
    save_choice: all: save everything; media: save videos and images
    (media-video downloads only videos, media-image only images, media downloads both); excel: save to Excel
    When save_choice is excel or all, excel_name must not be empty
    """
    # 1 Crawl the info of all notes in a list; note links look like the one below. Note that this URL will expire.
    # notes = [
    #     r'https://www.xiaohongshu.com/explore/683fe17f0000000023017c6a?xsec_token=ABBr_cMzallQeLyKSRdPk9fwzA0torkbT_ubuQP1ayvKA=&xsec_source=pc_user',
    # ]
    # data_spider.spider_some_note(notes, cookies_str, base_path, 'all', 'test')
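    # The same list could also be saved as images only; 'media-image' skips the
    # Excel export, so excel_name may stay empty (a sketch):
    # data_spider.spider_some_note(notes, cookies_str, base_path, 'media-image')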
    # 2 Crawl all notes of a user; user links look like the one below. Note that this URL will expire.
    # user_url = 'https://www.xiaohongshu.com/user/profile/5bc7fce62d9b750001ac98e0?xsec_token=ABaf3Kup4axIxi2fTRAYvn5qmpV6Af9glCiVcfsFcICN8=&xsec_source=pc_feed'
    # data_spider.spider_user_all_note(user_url, cookies_str, base_path, 'all')
    # 3 Search notes for a given keyword
    query = "上海馥桂萌宠园攻略"
    query_num = 20
    sort_type_choice = 2  # 0 overall, 1 newest, 2 most liked, 3 most commented, 4 most collected
    note_type = 2  # 0 all, 1 video notes, 2 normal (image/text) notes
    note_time = 0  # 0 any time, 1 within a day, 2 within a week, 3 within half a year
    note_range = 0  # 0 all, 1 viewed, 2 not viewed, 3 followed
    pos_distance = 0  # 0 off, 1 same city, 2 nearby; setting 1 or 2 requires geo
    # geo = {
    #     # latitude and longitude
    #     "latitude": 39.9725,
    #     "longitude": 116.4207
    # }
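    # A sketch of a distance-filtered search: with pos_distance set to 1 (same city)
    # or 2 (nearby), the geo dict above must be passed along (uncomment it first):
    # data_spider.spider_some_search_note(query, query_num, cookies_str, base_path, 'all', sort_type_choice, note_type, note_time, note_range, 1, geo=geo)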
    data_spider.spider_some_search_note(query, query_num, cookies_str, base_path, 'all', sort_type_choice, note_type, note_time, note_range, pos_distance, geo=None)