xhsAutoPublisher/author_search.py
2025-08-27 09:25:17 +08:00

74 lines
3.4 KiB
Python

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import random
def grab_herf(driver):
    """Return the ``href`` of every ``a.cover.mask.ld`` anchor on the page.

    Args:
        driver: A Selenium WebDriver positioned on the page to inspect.

    Returns:
        A list with one ``href`` attribute value per matching element, in the
        order the driver returned the elements.
    """
    cover_anchors = driver.find_elements(By.CSS_SELECTOR, "a.cover.mask.ld")
    return [anchor.get_attribute("href") for anchor in cover_anchors]
def scroll_page(driver):
    """Scroll the window to the bottom of the document, then pause briefly.

    Args:
        driver: A Selenium WebDriver whose current page should be scrolled.

    Returns:
        Always ``True``.
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    driver.execute_script(jump_to_bottom)

    # Random pause in the range [0, 2) seconds before handing control back.
    pause_seconds = random.random() * 2
    time.sleep(pause_seconds)
    return True
def main():
    """Collect note links from a Xiaohongshu page and save them as JSON.

    Restores the session cookies stored in ``cookies.json``, opens the target
    page in Safari, then scrolls to the bottom repeatedly while harvesting the
    ``href`` of every ``a.cover.mask.ld`` anchor. After each scroll the
    de-duplicated links collected so far are printed and written to
    ``herf_result.json`` in the working directory, so an interrupted run
    still leaves a usable partial result.

    Raises:
        OSError: If the cookie file cannot be read or the result file cannot
            be written.
        selenium.common.exceptions.WebDriverException: If the browser cannot
            be driven, or no matching anchor appears within 10 seconds.
    """
    # Function-scope import: only this function needs it.
    from urllib.parse import urlsplit

    # Alternative targets, kept for manual switching:
    # Url = "https://www.xiaohongshu.com/search_result?keyword=%25E6%2597%2585%25E6%25B8%25B8%25E6%2594%25BB%25E7%2595%25A5&source=web_explore_feed"
    # Url = "https://www.xiaohongshu.com/user/profile/5b0c19d2e8ac2b4f75c6cad2?xsec_token=ABTQo4AIGKlpW474JBfIZcd8Ln8DQr1Ugt_Yk-w3h1bzs%3D&xsec_source=pc_search"
    # Url = "https://www.xiaohongshu.com/user/profile/5ac968dbe8ac2b0a5581b909?xsec_token=YBG4U7EYzyAxyWC5H0HQKip9JT9B9KxrA43gWtkA3mi1A%3D&xsec_source=app_share&xhsshare=WeixinSession&appuid=6326ddf20000000023024438&apptime=1741862947&share_id=5da7602111cb431ca55a0dcfd6b20bc3&share_channel=wechat"
    Url = "https://www.xiaohongshu.com/user/profile/663057ef000000001e0060c9?xsec_token=YB1i6fylSYGpHg1gVgG5KZohqjJvDmwR96wwU_AQxdRrI=&xsec_source=app_share&xhsshare=WeixinSession&appuid=6326ddf20000000023024438&apptime=1741863000&share_id=b53d29eb6ce144c4b2d99035e4369d55&share_channel=wechat&wechatWid=cb745eed2e2630a361f9ce99520d9c9a&wechatOrigin=menu"

    # Read the cookies before launching the browser, and close the file
    # deterministically instead of leaking the handle.
    with open("/Users/yarrow/Spider/seleniumXH/cookies.json", "r") as cookie_file:
        cookies = json.load(cookie_file)

    driver = webdriver.Safari()
    try:
        # WebDriver only accepts a cookie while a document from the cookie's
        # domain is loaded, so visit the site origin before adding them.
        target = urlsplit(Url)
        driver.get(target.scheme + "://" + target.netloc)
        for cookie in cookies:
            # Could also be 'Lax' or 'None', depending on what the site needs.
            cookie['sameSite'] = 'Strict'
            driver.add_cookie(cookie)

        # Open the actual target page, now with the session cookies in place.
        driver.get(Url)
        driver.maximize_window()

        # Give the page a moment to load.
        # WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "search_result")))
        time.sleep(1.5)

        # Search-result pages only: switch the sort order to "hottest".
        # The sort control is a <div class="filter"> holding a label <span>
        # and a chevron icon; hovering or clicking it reveals the options.
        # driver.find_element(By.CSS_SELECTOR, ".filter").click()
        # Wait for the option list to appear.
        # WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".filter-item")))
        # Pick the first option.
        # driver.find_element(By.CSS_SELECTOR, ".filter-item").click()

        # Anchors being collected look like:
        # <a class="cover mask ld" target="_self" href="/search_result/<note id>?xsec_token=...&amp;xsec_source=" style="height: 335px;">
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.cover.mask.ld"))
        )

        herf_result = []
        seen = set()
        max_scrolls = 101  # same number of passes as the original counter loop

        # Scroll down and grab links. Every pass re-reads all anchors in the
        # DOM, so links already collected are skipped rather than appended
        # again.
        for _ in range(max_scrolls):
            for link in grab_herf(driver):
                if link and link not in seen:
                    seen.add(link)
                    herf_result.append(link)

            scroll_page(driver)
            print(herf_result)

            # Rewrite the result file on every pass as a checkpoint.
            with open("herf_result.json", "w") as f:
                json.dump(herf_result, f)

            time.sleep(random.random())
    finally:
        # Always close the browser, even when scraping fails midway.
        driver.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()