# xhsAutoPublisher/author_search.py

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import random

def grab_herf(driver):
    """Collect the ``href`` of every note-cover anchor currently on the page.

    Args:
        driver: Selenium WebDriver whose current page is queried.

    Returns:
        A list with one entry per element matching ``a.cover.mask.ld``, in
        the order ``find_elements`` returned them, holding the value of
        ``get_attribute("href")`` for each element.
    """
    anchors = driver.find_elements(By.CSS_SELECTOR, "a.cover.mask.ld")
    return [anchor.get_attribute("href") for anchor in anchors]

def scroll_page(driver):
    """Scroll the window to the bottom of the document, then pause briefly.

    Args:
        driver: Selenium WebDriver whose current page is scrolled.

    Returns:
        Always ``True``.
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    driver.execute_script(jump_to_bottom)
    # Randomised pause of somewhere in [0, 2) seconds after each scroll.
    pause_seconds = random.random() * 2
    time.sleep(pause_seconds)
    return True


def main():
    """Scrape note links from a Xiaohongshu profile page into ``herf_result.json``.

    Loads saved cookies from disk, opens Safari, installs the cookies, loads
    the profile page, and then alternates between collecting note-cover links
    (``grab_herf``) and scrolling to the bottom (``scroll_page``) for a fixed
    number of rounds.  The de-duplicated list of links is rewritten to
    ``herf_result.json`` after every round, so partial results are on disk if
    the run is interrupted.  The browser is always closed on exit.
    """
    home_url = "https://www.xiaohongshu.com"
    # Alternative targets kept for reference:
    # Url = "https://www.xiaohongshu.com/search_result?keyword=%25E6%2597%2585%25E6%25B8%25B8%25E6%2594%25BB%25E7%2595%25A5&source=web_explore_feed"
    # Url = "https://www.xiaohongshu.com/user/profile/5b0c19d2e8ac2b4f75c6cad2?xsec_token=ABTQo4AIGKlpW474JBfIZcd8Ln8DQr1Ugt_Yk-w3h1bzs%3D&xsec_source=pc_search"
    # Url = "https://www.xiaohongshu.com/user/profile/5ac968dbe8ac2b0a5581b909?xsec_token=YBG4U7EYzyAxyWC5H0HQKip9JT9B9KxrA43gWtkA3mi1A%3D&xsec_source=app_share&xhsshare=WeixinSession&appuid=6326ddf20000000023024438&apptime=1741862947&share_id=5da7602111cb431ca55a0dcfd6b20bc3&share_channel=wechat"
    Url = "https://www.xiaohongshu.com/user/profile/663057ef000000001e0060c9?xsec_token=YB1i6fylSYGpHg1gVgG5KZohqjJvDmwR96wwU_AQxdRrI=&xsec_source=app_share&xhsshare=WeixinSession&appuid=6326ddf20000000023024438&apptime=1741863000&share_id=b53d29eb6ce144c4b2d99035e4369d55&share_channel=wechat&wechatWid=cb745eed2e2630a361f9ce99520d9c9a&wechatOrigin=menu"
    cookies_path = "/Users/yarrow/Spider/seleniumXH/cookies.json"
    # Same number of rounds as the previous counter-based loop (i = 0..100).
    scroll_rounds = 101

    # Read the cookies before launching the browser: a missing or malformed
    # file then fails without leaving a Safari window behind, and the file
    # handle is closed deterministically.
    with open(cookies_path, "r", encoding="utf-8") as cookie_file:
        cookies = json.load(cookie_file)

    driver = webdriver.Safari()
    try:
        # WebDriver only accepts cookies for the domain of the document that
        # is currently loaded, so open the site once before adding them.
        driver.get(home_url)
        for cookie in cookies:
            # Force a sameSite value on every cookie; change to 'Lax' or
            # 'None' if 'Strict' does not suit the saved cookies.
            cookie['sameSite'] = 'Strict'
            driver.add_cookie(cookie)

        # Load the target page, now with the cookies installed.
        driver.get(Url)
        driver.maximize_window()
        # Give the page a moment to render.
        # WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "search_result")))
        time.sleep(1.5)

        # Disabled: open the ".filter" dropdown and click the first
        # ".filter-item" to switch the ordering to "hottest".
        # driver.find_element(By.CSS_SELECTOR, ".filter").click()
        # WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".filter-item")))
        # driver.find_element(By.CSS_SELECTOR, ".filter-item").click()

        # Note covers look like:
        # <a class="cover mask ld" target="_self" href="/search_result/<id>?xsec_token=...&xsec_source=">
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.cover.mask.ld"))
        )

        herf_result = []
        seen = set()
        for _ in range(scroll_rounds):
            # Anchors already collected are usually still in the DOM on the
            # next round; keep only links not seen before (first-seen order)
            # and skip anchors without an href.
            for href in grab_herf(driver):
                if href and href not in seen:
                    seen.add(href)
                    herf_result.append(href)
            scroll_page(driver)
            print(herf_result)
            # Rewrite the output file every round so progress is persisted.
            with open("herf_result.json", "w", encoding="utf-8") as f:
                json.dump(herf_result, f)
            time.sleep(random.random())
    finally:
        # Close the browser even if a wait times out or scraping fails.
        driver.quit()


    ## Scroll down and collect links


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()