72 lines
2.8 KiB
Python
72 lines
2.8 KiB
Python
|
|
import selenium
|
||
|
|
from selenium import webdriver
|
||
|
|
from selenium.webdriver.common.by import By
|
||
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
||
|
|
from selenium.webdriver.support import expected_conditions as EC
|
||
|
|
import time
|
||
|
|
import json
|
||
|
|
import random
|
||
|
|
|
||
|
|
def grab_herf(driver):
    """Collect the ``href`` of every note-cover anchor currently on the page.

    Args:
        driver: Selenium WebDriver whose current page is searched.

    Returns:
        A list with one entry per element matching the CSS selector
        ``a.cover.mask.ld``, in the order the driver returned them. Each
        entry is the value of that element's ``href`` attribute.
    """
    anchors = driver.find_elements(By.CSS_SELECTOR, "a.cover.mask.ld")
    return [anchor.get_attribute("href") for anchor in anchors]
|
||
|
|
|
||
|
|
def scroll_page(driver):
    """Scroll the page to its bottom, then pause for a random interval.

    Args:
        driver: Selenium WebDriver used to run the scroll script.

    Returns:
        Always ``True``.
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    driver.execute_script(jump_to_bottom)

    # Random delay in [0, 2) seconds before the caller continues.
    pause_seconds = random.random() * 2
    time.sleep(pause_seconds)
    return True
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Crawl note links from a Xiaohongshu search page and save them as JSON.

    Loads saved cookies from disk, starts a Safari WebDriver session, opens
    the search-result page, then repeatedly harvests the note links present
    on the page and scrolls down so more are loaded. After every pass the
    links gathered so far are printed and written to ``herf_result.json`` in
    the current working directory, so partial results survive a crash.

    Links are stored once each, in the order they were first seen.

    Raises:
        OSError: If the cookie file cannot be read or the output file cannot
            be written.
        selenium.common.exceptions.TimeoutException: If no note link shows up
            within 10 seconds of loading the page.
    """
    # Alternative search URL (keyword: "where to go on the weekend"):
    # Url = "https://www.xiaohongshu.com/search_result?keyword=%25E5%2591%25A8%25E6%259C%25AB%25E5%258E%25BB%25E5%2593%25AA%25E7%258E%25A9&source=web_search_result_notes"
    Url = "https://www.xiaohongshu.com/search_result?keyword=%25E6%2597%2585%25E6%25B8%25B8%25E6%2594%25BB%25E7%2595%25A5&source=web_explore_feed"
    cookie_path = "/Users/yarrow/Spider/seleniumXH/cookies.json"
    # Same number of harvest/scroll passes as the original `i > 100` cut-off.
    max_rounds = 101

    # Read the cookies before launching the browser, so a missing or broken
    # cookie file does not leave a stray browser session behind.
    with open(cookie_path, "r", encoding="utf-8") as cookie_file:
        cookies = json.load(cookie_file)

    driver = webdriver.Safari()
    try:
        # WebDriver only accepts cookies for the domain of the document that
        # is currently loaded, so the site must be opened before add_cookie.
        driver.get(Url)
        for cookie in cookies:
            # Force a fixed sameSite value; use 'Strict' or 'None' as needed.
            # NOTE(review): presumably the exported cookies carry sameSite
            # values the driver rejects -- confirm.
            cookie['sameSite'] = 'Strict'
            driver.add_cookie(cookie)

        # Load the page again so the request is sent with the cookies.
        driver.get(Url)
        driver.maximize_window()

        # Fixed pause kept from the original; the explicit wait further down
        # does the actual synchronisation.
        time.sleep(1.5)

        # TODO: sort the results by "hottest" (not implemented). The filter
        # control is `<div class="filter">` (default label "综合", i.e.
        # "General"); the original sketch was:
        #   driver.find_element(By.CSS_SELECTOR, ".filter").click()
        #   WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".filter-item")))
        #   driver.find_element(By.CSS_SELECTOR, ".filter-item").click()  # first item

        # Note links look like:
        #   <a class="cover mask ld" target="_self" href="/search_result/<id>?xsec_token=...">
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.cover.mask.ld"))
        )

        herf_result = []
        seen = set()
        for _ in range(max_rounds):
            # Every pass returns all anchors currently on the page, including
            # ones collected earlier, so keep only links not seen before.
            for herf in grab_herf(driver):
                if herf not in seen:
                    seen.add(herf)
                    herf_result.append(herf)

            scroll_page(driver)
            print(herf_result)

            # Rewrite the output on every pass so progress is never lost.
            with open("herf_result.json", "w", encoding="utf-8") as f:
                json.dump(herf_result, f)

            time.sleep(random.random())
    finally:
        # Always end the browser session, even when a step above raised.
        driver.quit()
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
## Scroll down and collect links
|
||
|
|
|
||
|
|
|
||
|
|
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|