# xhsAutoPublisher/req_search.py
#
# 72 lines
# 2.8 KiB
# Python
#
#
# 2025-08-27 09:25:17 +08:00
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import random
def grab_herf(driver):
    """Return the ``href`` of every cover anchor currently in the page.

    Args:
        driver: Selenium WebDriver whose current page is searched.

    Returns:
        list: the ``href`` attribute of each element matching the CSS
        selector ``a.cover.mask.ld``, in the order the driver reports them.
    """
    cover_links = driver.find_elements(By.CSS_SELECTOR, "a.cover.mask.ld")
    return [link.get_attribute("href") for link in cover_links]
def scroll_page(driver, max_delay=2.0):
    """Scroll to the bottom of the page, then pause for a random interval.

    Args:
        driver: object exposing ``execute_script`` (a Selenium WebDriver).
        max_delay: upper bound, in seconds, of the random pause that follows
            the scroll. Must be non-negative. Defaults to 2.0, the value
            that used to be hard-coded.

    Returns:
        bool: always ``True``.
    """
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Randomised pause so consecutive scrolls are not evenly spaced and the
    # page has time to append more results.
    time.sleep(random.random() * max_delay)
    return True
def main():
    """Scrape note links from a Xiaohongshu search page into a JSON file.

    Opens Safari, loads the search-result page, installs the saved login
    cookies, reloads the page, then repeatedly collects the cover links and
    scrolls down. After every pass the links gathered so far are printed and
    written to ``herf_result.json`` in the current working directory.

    Links are stored once each, in first-seen order: every pass re-reads all
    anchors already present in the page, so appending blindly would fill the
    output with duplicates.
    """
    driver = webdriver.Safari()
    try:
        # Alternative search (keyword "周末去哪玩"):
        # Url = "https://www.xiaohongshu.com/search_result?keyword=%25E5%2591%25A8%25E6%259C%25AB%25E5%258E%25BB%25E5%2593%25AA%25E7%258E%25A9&source=web_search_result_notes"
        Url = "https://www.xiaohongshu.com/search_result?keyword=%25E6%2597%2585%25E6%25B8%25B8%25E6%2594%25BB%25E7%2595%25A5&source=web_explore_feed"

        with open("/Users/yarrow/Spider/seleniumXH/cookies.json", "r", encoding="utf-8") as cookie_file:
            cookies = json.load(cookie_file)

        # Cookies can only be added while the browser is on the domain they
        # belong to, so open the page once before installing them.
        driver.get(Url)
        for cookie in cookies:
            cookie['sameSite'] = 'Strict'  # or 'Lax' / 'None', as required
            driver.add_cookie(cookie)

        # Load the page again, this time with the cookies in place.
        driver.get(Url)
        driver.maximize_window()

        # Give the page time to load.
        # WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "search_result")))
        time.sleep(1.5)

        # Disabled: switch the sort filter from "综合" (general) to "hottest".
        # The filter control looks like:
        #   <div class="filter"><span>综合</span><svg class="reds-icon filter-icon" ...></svg></div>
        # Open the filter menu:
        # driver.find_element(By.CSS_SELECTOR, ".filter").click()
        # Wait for the menu entries:
        # WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".filter-item")))
        # Click the first entry:
        # driver.find_element(By.CSS_SELECTOR, ".filter-item").click()

        # Each result is an anchor such as:
        #   <a class="cover mask ld" target="_self" href="/search_result/<id>?xsec_token=...&amp;xsec_source=" ...>
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.cover.mask.ld"))
        )

        herf_result = []
        seen = set()
        # 101 passes, matching the original counter (i = 0 .. 100 inclusive).
        for _ in range(101):
            for herf in grab_herf(driver):
                if herf not in seen:
                    seen.add(herf)
                    herf_result.append(herf)
            # Scroll down so the page loads more results for the next pass.
            scroll_page(driver)
            print(herf_result)
            # Rewrite the file on every pass so progress survives a crash.
            with open("herf_result.json", "w", encoding="utf-8") as f:
                json.dump(herf_result, f)
            time.sleep(random.random())
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()