使用chrome的driver爬取数据:
from urllib import request

import pandas as pd
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Search Baidu for every keyword in the CSV, scrape the "hot" number from the
# results page, and export the augmented table to Excel.
browser = webdriver.Chrome()
try:
    filepath = 'C:/data.csv'
    data = pd.read_csv(filepath)
    keywords = data['keyword'].values.tolist()
    wait = WebDriverWait(browser, 10)  # one reusable 10-second explicit wait

    for i, query in enumerate(keywords):
        browser.get('https://www.baidu.com')
        # 'kw' is Baidu's search input box. Selenium 4 removed
        # find_element_by_id(); use find_element(By.ID, ...) instead.
        # Also avoid naming the element `input` (shadows the builtin).
        search_box = browser.find_element(By.ID, 'kw')
        search_box.send_keys(query)
        ActionChains(browser).send_keys(Keys.ENTER).perform()

        # Block until the results container is present before parsing,
        # otherwise page_source may be the still-loading page.
        wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

        html_object = etree.HTML(browser.page_source)
        # xpath() returns a list of text nodes; join them instead of storing
        # the list's repr (the original stored e.g. "['1,234']").
        hot_texts = html_object.xpath(
            '//div[@id="tsn_inner"]/div[2]/span[1]/text()')
        # .loc avoids pandas chained assignment (SettingWithCopyWarning /
        # silent no-op on a copy) that data['hot_num'][i] = ... can cause.
        data.loc[i, 'hot_num'] = ''.join(hot_texts)

    # Export once, after all keywords have been scraped.
    data.to_excel('C:/百度20230228.xlsx', sheet_name='Sheet1', index=False)
finally:
    browser.quit()  # always release the ChromeDriver process
备注:本文是之前爬取方式的一个变种,额外增加了 chromedriver 的下载步骤;使用这种方式前必须先成功安装 selenium,并下载与本机 Chrome 版本匹配的 chromedriver。