74 lines
3.4 KiB
Python
74 lines
3.4 KiB
Python
import selenium
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
import time
|
|
import json
|
|
import random
|
|
|
|
def grab_herf(driver):
    """Return the ``href`` of every note-cover anchor currently in the page.

    Args:
        driver: WebDriver instance whose current page is searched.

    Returns:
        A list with one entry per element matching the CSS selector
        ``a.cover.mask.ld``, holding whatever ``get_attribute("href")``
        returned for that element, in the order the driver reported them.
    """
    anchors = driver.find_elements(By.CSS_SELECTOR, "a.cover.mask.ld")
    return [anchor.get_attribute("href") for anchor in anchors]
|
|
|
|
def scroll_page(driver):
    """Scroll the window to the bottom of the document, then pause briefly.

    Args:
        driver: WebDriver instance to run the scroll script in.

    Returns:
        Always ``True``.
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    driver.execute_script(jump_to_bottom)

    # Randomised pause of up to two seconds after each scroll.
    pause_seconds = random.random() * 2
    time.sleep(pause_seconds)
    return True
|
|
|
|
|
|
|
|
def main():
    """Scrape note links from a Xiaohongshu profile page into a JSON file.

    Steps:
      1. Load saved cookies from disk.
      2. Start Safari, open the site origin and install the cookies.
      3. Open the target profile URL and wait for the first note cover.
      4. Repeatedly collect the cover links and scroll down, rewriting
         ``herf_result.json`` after every round so partial results survive
         an interruption.

    Links are de-duplicated while preserving first-seen order, because each
    round re-reads every cover anchor that is still present in the page.

    Raises:
        OSError / json.JSONDecodeError: if the cookie file cannot be read or
            parsed (raised before a browser is started).
        Selenium exceptions (for example a timeout while waiting for the
            first cover) propagate to the caller; the browser is closed first.
    """
    # Local import: only needed here to derive the site origin for cookies.
    from urllib.parse import urlsplit

    # Other pages this script has been pointed at:
    # Url = "https://www.xiaohongshu.com/search_result?keyword=%25E6%2597%2585%25E6%25B8%25B8%25E6%2594%25BB%25E7%2595%25A5&source=web_explore_feed"
    # Url = "https://www.xiaohongshu.com/user/profile/5b0c19d2e8ac2b4f75c6cad2?xsec_token=ABTQo4AIGKlpW474JBfIZcd8Ln8DQr1Ugt_Yk-w3h1bzs%3D&xsec_source=pc_search"
    # Url = "https://www.xiaohongshu.com/user/profile/5ac968dbe8ac2b0a5581b909?xsec_token=YBG4U7EYzyAxyWC5H0HQKip9JT9B9KxrA43gWtkA3mi1A%3D&xsec_source=app_share&xhsshare=WeixinSession&appuid=6326ddf20000000023024438&apptime=1741862947&share_id=5da7602111cb431ca55a0dcfd6b20bc3&share_channel=wechat"
    Url = "https://www.xiaohongshu.com/user/profile/663057ef000000001e0060c9?xsec_token=YB1i6fylSYGpHg1gVgG5KZohqjJvDmwR96wwU_AQxdRrI=&xsec_source=app_share&xhsshare=WeixinSession&appuid=6326ddf20000000023024438&apptime=1741863000&share_id=b53d29eb6ce144c4b2d99035e4369d55&share_channel=wechat&wechatWid=cb745eed2e2630a361f9ce99520d9c9a&wechatOrigin=menu"
    cookie_path = "/Users/yarrow/Spider/seleniumXH/cookies.json"
    # The original counter loop (i = 0 ... break when i > 100) ran 101 rounds.
    max_rounds = 101

    # Read the cookies before launching the browser so that a missing or
    # malformed file does not leave a browser session behind.
    with open(cookie_path, "r", encoding="utf-8") as cookie_file:
        cookies = json.load(cookie_file)

    driver = webdriver.Safari()
    try:
        # WebDriver only accepts cookies for the domain that is currently
        # loaded, so open the site origin before calling add_cookie().
        target = urlsplit(Url)
        driver.get(target.scheme + "://" + target.netloc)
        for cookie in cookies:
            cookie['sameSite'] = 'Strict'  # switch to 'Lax' or 'None' if needed
            driver.add_cookie(cookie)

        # Open the target page, now with the session cookies in place.
        driver.get(Url)
        driver.maximize_window()

        # Give the page a moment to load.
        time.sleep(1.5)

        # NOTE: on search-result pages the ".filter" dropdown can be clicked,
        # followed by the first ".filter-item", to sort by "hottest". That
        # step is not used for profile pages.

        # Cover anchors look like:
        # <a class="cover mask ld" target="_self" href="/search_result/<id>?xsec_token=...">
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.cover.mask.ld"))
        )

        seen = set()
        herf_result = []
        for _ in range(max_rounds):
            # Keep only links not collected in an earlier round.
            for href in grab_herf(driver):
                if href and href not in seen:
                    seen.add(href)
                    herf_result.append(href)

            scroll_page(driver)
            print(herf_result)

            # Checkpoint after every round.
            with open("herf_result.json", "w", encoding="utf-8") as f:
                json.dump(herf_result, f)

            time.sleep(random.random())
    finally:
        # Always release the browser session, even if scraping failed.
        driver.quit()
|
|
|
|
|
|
|
|
## 下滑 抓取链接
|
|
|
|
|
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()