1、下载chrome浏览器,查看浏览器版本(在chrome浏览器中输入下面命令)
chrome://version/
2、下载与浏览器版本对应的chromedriver驱动
3、将下载后的chrome驱动放入到python路径下的Scripts目录下
①找不到对应版本(大版本一致即可)
chrome版本:84.0.4147.125 (正式版本) (64 位)
chromedriver.exe版本:84.0.4147.30
②chromedriver.exe放入对应位置
F:\Anaconda3\Scripts
4、测试是否成功
https://www.jd.com/ :输入框id为key (可以键盘Enter,也可以点击按钮)
from selenium import webdriver  # drives the browser
from selenium.webdriver.common.keys import Keys  # keyboard key constants (Enter, etc.)
from selenium.webdriver.support.wait import WebDriverWait  # explicit waits for page elements
import time

# Demo: search Baidu by typing into the box and pressing Enter.
driver = webdriver.Chrome()
try:
    # Implicit wait: every element lookup retries for up to 10 seconds
    # before raising NoSuchElementException.
    driver.implicitly_wait(10)
    # 1. Open Baidu.
    driver.get('https://www.baidu.com/')
    # 2. Locate the search input box (its id is "kw").
    input_tag = driver.find_element_by_id("kw")
    # 3. Type the query into the search box.
    input_tag.send_keys('秦时明月')
    # 4. Press Enter to submit the search.
    input_tag.send_keys(Keys.ENTER)
finally:
    # driver.close()  # left commented so the browser stays open for inspection
    pass
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

# Demo: search jd.com twice — once via the Enter key, once via the button.
driver = webdriver.Chrome()
try:
    driver.implicitly_wait(10)
    # 1. Open jd.com.
    driver.get('https://www.jd.com/')
    # Find the search box (id "key") and submit the first query with Enter.
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('围城')
    input_tag.send_keys(Keys.ENTER)
    time.sleep(2)
    # Re-locate the box (the page reloaded), clear it, type a new query,
    # and this time submit by clicking the search button.
    input_tag = driver.find_element_by_id('key')
    input_tag.clear()
    input_tag.send_keys('墨菲定律')
    button = driver.find_element_by_class_name('button')
    button.click()
    time.sleep(10)
finally:
    # Always release the browser window, even if a lookup failed above.
    driver.close()
5、如何获取html内容
①隐式等待
②driver.find_element_by_xpath(path)
path:单斜杠,绝对路径
path:双斜杠,相对路径
find_element_by_xpath:第一个
find_elements_by_xpath:全部,list
//div[@id="images"] :限制条件 [@id="images"]
from selenium import webdriver

# Demo: XPath lookups (absolute path, relative path, attribute filters)
# against the Scrapy documentation's static example page.
driver = webdriver.Chrome()
try:
    # Implicit wait must be configured before the get() request.
    driver.implicitly_wait(10)
    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')
    # find_element_by_xpath returns the first match; find_elements_by_xpath
    # returns a list of all matches.
    # A single leading slash "/" is an absolute path from the document root.
    html = driver.find_element_by_xpath('/html')
    a = driver.find_elements_by_xpath("/html/body/div/a")
    # html = driver.find_element_by_xpath('/head')  # raises: /head is not a root element
    print(a[2].text)
    # A double slash "//" matches anywhere in the document (relative path).
    div = driver.find_element_by_xpath('//a')
    print(div.tag_name)
    # "@" filters on an attribute: the div whose id is "images".
    div = driver.find_element_by_xpath('//div[@id="images"]')
    print(div.tag_name)
    print(div.text)
    # First <a> node anywhere in the document; read one of its attributes.
    a = driver.find_element_by_xpath('//a')
    print(a.get_attribute('href'))
    print(a.text)
    a = driver.find_elements_by_xpath('//img')
    print(a[2].get_attribute('src'))
finally:
    # pass
    driver.close()
6、如何爬取一个页面(不同版本,可能对于的div属性名不一致,需要检查class和target等等)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import requests as req
from tqdm import tqdm
from lxml import etree

# Crawl Baidu News search results: collect (title, article text) pairs per
# keyword and dump them to <keyword>.csv.
options = webdriver.ChromeOptions()
prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': './download'}
options.add_experimental_option('prefs', prefs)
# 2 = do not load images, which speeds up page loads.
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# Important: drop the "enable-automation" switch so sites are less likely to
# detect that Selenium is driving the browser.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# options.add_argument('--headless')

driver = webdriver.Chrome(options=options)
driver.implicitly_wait(10)
driver.get(url="https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&word=%E5%A4%B1%E4%B8%9A%E5%B0%B1%E4%B8%9A")

for key in ["自然灾害"]:
    yy = driver.find_element_by_id("kw")  # search input box
    yy.clear()
    yy.send_keys(key)
    driver.find_element_by_id("su").click()  # click the search button

    allcontent = []
    alltitle = []
    # range(1): number of result pages to scrape; increase for more pages.
    for index in tqdm(range(1)):
        try:
            titles = []
            urls = []
            dom = etree.HTML(driver.page_source)
            # Each news result card on the page (class names may differ
            # between Baidu versions — verify in devtools if nothing matches).
            for j in dom.xpath('//div[@class="result-op c-container xpath-log new-pmd"]'):
                url = j.xpath('.//a[@target="_blank"]')[0].xpath('.//@href')
                title = "".join(i.strip() for i in j.xpath('.//a[@target="_blank"]')[0].xpath('.//text()'))
                urls.append(url)
                titles.append(title)
            assert len(urls) == len(titles)
            # Fetch the article body behind each result URL.
            txts = []
            for i in urls:
                tmp = req.get(i[0]).content
                try:
                    content = str(tmp, "utf-8")
                except UnicodeDecodeError:
                    # Older Chinese sites often serve GBK instead of UTF-8.
                    content = str(tmp, "gbk")
                dom = etree.HTML(content)
                txt = dom.xpath('//p//text()')  # all text inside <p> tags
                txts.append(txt)
            assert len(titles) == len(txts)
            alltitle.extend(titles)
            allcontent.extend(txts)
        except Exception:
            # Best-effort crawl: skip a page that fails to parse and move on.
            pass
        # Next page: elements with class "n" are the pager links; the last
        # one is "next" (the first, when present, is "previous").
        tmp = driver.find_elements_by_class_name('n')
        time.sleep(3)  # be polite: pause between page loads
        tmp[-1].click()

    # Save results for this keyword (utf_8_sig adds a BOM so Excel opens it).
    df = pd.DataFrame()
    df["text"] = allcontent
    df["title"] = alltitle
    if alltitle:  # guard: crawl may have collected nothing
        print(alltitle[0])
        print(type(alltitle[0]))
    df.to_csv(key + ".csv", header=True, index=False, encoding="utf_8_sig")

driver.quit()
7、如何爬取百度图片
from selenium import webdriver
import time
import requests
import os
from lxml import etree
# Interactive parameters: the image search keyword and how many files to save.
name = input('请输入你要搜索的名字:')
number = int(input('请输入你要保存的文件数量:'))
driver = webdriver.Chrome()
driver.implicitly_wait(10)
driver.get('http://image.baidu.com/') # open Baidu Images
driver.find_element_by_id('kw').send_keys(name) # type the keyword into the search box
driver.find_element_by_class_name('s_search').click() # click the search button
def downImg(imgUrl, dirpath, imgName):
    """Download imgUrl into dirpath/imgName; return True on success.

    A 4xx response or any request exception is reported and skipped
    (returns False) instead of aborting the whole crawl.
    """
    # Fix: without this the open() below raises FileNotFoundError when the
    # target directory (e.g. ./img/) has not been created yet.
    os.makedirs(dirpath, exist_ok=True)
    filename = os.path.join(dirpath, imgName)
    try:
        res = requests.get(imgUrl, timeout=15)
        if str(res.status_code)[0] == "4":
            # Client error (404 etc.): log and skip this image.
            print(str(res.status_code), ":", imgUrl)
            return False
    except Exception as e:
        print("抛出异常:", imgUrl)
        print(e)
        return False
    with open(filename, "wb") as f:
        f.write(res.content)
    return True
urls = []
# Bug fix: the original line read `while len(urls)` with the condition cut
# off — keep scrolling until the page exposes at least `number` image URLs.
while len(urls) < number:
    urls.clear()
    dom = etree.HTML(driver.page_source)
    # Every thumbnail carries its full-size URL in the data-imgurl attribute.
    for url in dom.xpath('//img[@class="main_img img-hover"]'):
        urls.append(url.xpath(".//@data-imgurl")[0])
    print(len(urls))
    print("刷新...")
    # Scroll to the bottom so Baidu lazy-loads another batch of images.
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(3)
for i in range(number):
    downImg(urls[i], "./img/", str(i) + ".jpg")
print('finish...')
driver.quit()
8、总结
1、selenium默认解析函数
tmp=find_element_by_xpath() #获取标签
tmp=find_elements_by_xpath()
tmp.get_attribute('href') #获取属性
相对路径和绝对路径
2、将chrome驱动获取的内容转换成html
dom=etree.HTML(driver.page_source)
dom.xpath('//img') #获取标签
dom.xpath('.//@href') #获取当前标签下的属性
原文链接:https://blog.csdn.net/hqh131360239/article/details/108232926