0. selenium准备工作
(1)下载selenium 3.14.1版本。运行以下程序测试:
RIGHT Example:
from selenium.webdriver import Chrome
# Create the browser (driver) object; requires a matching chromedriver on PATH
b = Chrome()
# Open a page — if the browser launches and loads it, the setup works
b.get('http://www.baidu.com')
(2)下载webdriver。
a. 获取谷歌浏览器版本号。打开浏览器菜单:设置 -> 关于 Chrome(页面顶部显示版本号)
b. 下载与浏览器版本号对应的 webdriver。网址:谷歌浏览器 chromedriver 官方下载页(chromedriver.chromium.org)
c. 放入python主环境根目录
d. 运行上面的测试程序,成功打开网页即配置成功
1. selenium完整示范
RIGHT Example:
# 1. Create the browser object
b = Chrome()
# 2. Open the page to scrape
b.get('http://movie.douban.com/top250')
# 2.5 Give the browser time to render the page
sleep(2)
# 3. Grab the rendered page source
result = b.page_source
print(result)
# 4. Parse out every movie title — three equivalent approaches
# (1) Regular expression; [:-1] drops the last alt="..." match,
#     presumably a non-title image on the page — TODO confirm
r1 = findall(r'alt="(.+?)"', result)[:-1]
print(r1)
# (2) CSS selector via BeautifulSoup
soup = BeautifulSoup(result, 'lxml')
movie_name = soup.select('ol.grid_view > li > div > div.info > div.hd > a > span:nth-child(1)')
movie_name = [x.text for x in movie_name]
print(movie_name)
# (3) XPath via lxml.etree
html = etree.HTML(result)
movie_name = html.xpath('//div[@class="hd"]/a/span[1]/text()')
print(movie_name)
2. 应用案例
APPLICATION 获取名字 价格 地址 面积 朝向 户型:
import csv
from selenium.webdriver import Chrome
from re import sub, search
from bs4 import BeautifulSoup
from time import sleep
# 获取网页源代码并创建bs4对象
# Fetch the listing page source and build the bs4 object.
b = Chrome()
b.get('https://cd.zu.ke.com/zufang')
# Wait AFTER navigation so the page has time to render
sleep(2)
soup = BeautifulSoup(b.page_source, 'lxml')
b.close()
# One container <div> per listed house
all_house = soup.select('div.content__list > div')
all_data = []
for house in all_house:
    # Listing name
    name = house.select_one('p.content__list--item--title > a').text.strip()
    # Price
    price = house.select_one('span.content__list--item-price').text
    # Single string holding address / area / orientation / layout
    info = house.select_one('p.content__list--item--des').text
    info = sub(r'\s', '', info)
    infos = info.split('/')
    # Index from the end: listings may prepend a variable number of fields
    address = infos[-5]
    area = infos[-4]
    # BUG FIX: the task asks for 朝向 (orientation) but it was never extracted
    orientation = infos[-3]
    # renamed from `type`, which shadowed the builtin
    house_type = infos[-2]
    all_data.append([name, price, address, area, orientation, house_type])
# Context manager guarantees the CSV is flushed and closed
with open('files/租房信息.csv', 'a', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['名称', '价格', '地址', '面积', '朝向', '户型'])
    writer.writerows(all_data)
3. 点击标签
RIGHT Example :
from time import sleep
from selenium.webdriver import Chrome

browser = Chrome()
browser.get('https://cd.zu.ke.com/zufang')
sleep(2)

# 1. Click the "whole rent" (整租) tab
# (1) Locate the anchor element to click
tab_link = browser.find_element_by_xpath('//ul[@class="beike__nav--tab"]/li[2]/a')
# (2) Trigger the click, then wait for the page to update
tab_link.click()
sleep(2)
4. 输入框
RIGHT Example :
from time import sleep
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

browser = Chrome()
browser.get('https://www.jd.com/')
sleep(2)

# I. Drive the search box
# 1. Locate the input element
search_input = browser.find_element_by_id('key')
# 2. Type the query text
search_input.send_keys('奥特曼')
# 3. Press Enter to submit the search
search_input.send_keys(Keys.ENTER)
sleep(2)

# II. Scroll the page
# JS scrolling: window.scrollBy(x offset, y offset).
# Scroll a fixed distance repeatedly so lazily-loaded items render.
for _ in range(10):
    browser.execute_script('window.scrollBy(0, 600)')
    sleep(1)

# III. Pull data out of the rendered page
page = BeautifulSoup(browser.page_source, 'lxml')
product_items = page.select('ul.gl-warp.clearfix > li')
print(len(product_items))
browser.close()
5. 更改配置
RIGHT Example :
from selenium.webdriver import Chrome, ChromeOptions

# 1. Build a settings object for the browser
chrome_options = ChromeOptions()
# Hide the "controlled by automated test software" banner
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Disable image loading to speed up page fetches
chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# 2. Launch the browser with those options applied
browser = Chrome(options=chrome_options)
browser.get('https://www.jd.com')
6. 课堂练习
APPLICATION 获取商品信息:
import csv
import os
from tqdm import tqdm
from time import sleep
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
def single_page_get():
    """Scrape one JD search-result page into the module-level CSV writer,
    then click through to the next page.

    Relies on module globals: `jd` (the Chrome driver) and `results`
    (a csv.writer). No return value.
    """
    # 5. Scroll down in steps so lazily-loaded goods render
    for _ in range(10):
        jd.execute_script('window.scrollBy(0, 800)')
        sleep(1)
    # 6. Rendered page source
    page_html = jd.page_source
    # 7. Parse it with bs4
    page = BeautifulSoup(page_html, 'lxml')
    # 8. All product <li> nodes on this page
    product_nodes = page.select('ul.gl-warp.clearfix > li')
    # 9. Extract each product's fields and write one CSV row per product
    for node in tqdm(product_nodes):
        good_name = node.select_one('div.p-name.p-name-type-2 > a > em').text
        good_name = good_name.split('\n')[-1]
        good_price = node.select_one('div.p-price > strong > i').text
        good_link = 'https:' + node.select_one('div.p-name.p-name-type-2 > a').attrs['href']
        good_comments = node.select_one('div.p-commit > strong').text
        store_name = node.select_one('div.p-shop > span > a').text
        store_link = 'https:' + node.select_one('div.p-shop > span > a').attrs['href']
        results.writerow([good_name, good_price, good_link, good_comments, store_name, store_link])
    # 10. Advance to the next result page
    next_button = jd.find_element_by_class_name('pn-next')
    next_button.click()
    sleep(1)
if __name__ == '__main__':
    # 1. Build the browser settings object
    options = ChromeOptions()
    # Hide the automation banner
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Skip image loading for speed
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    # 2. Launch the browser
    jd = Chrome(options=options)
    jd.get('https://www.jd.com')
    sleep(1)
    # 3. Open the CSV; write the header only when the file did not already exist.
    # BUG FIX: the existence check used 'files/js_goods.csv' (typo) while the
    # file written is 'files/jd_goods.csv', so the header row was appended on
    # every run; both paths now agree.
    is_exists = os.path.exists('files/jd_goods.csv')
    f = open('files/jd_goods.csv', 'a', encoding='utf-8', newline='')
    results = csv.writer(f)
    if not is_exists:
        results.writerow(['商品名', '商品价格', '商品链接', '评论数', '商店名', '商店链接'])
    try:
        # 4. Type the query and press Enter
        search_box = jd.find_element_by_id('key')
        search_box.send_keys('口红')
        search_box.send_keys(Keys.ENTER)
        sleep(1)
        # 5. Scrape ten result pages
        for _ in tqdm(range(10)):
            single_page_get()
    finally:
        # Always release the file and the browser, even on a mid-run error
        f.close()
        jd.close()
7. 选项卡切换
RIGHT Example :
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
# 1. Create the browser object
b = Chrome()
# 2. Open the CNKI home page
b.get('https://www.cnki.net/')
# 3. Search for papers
search_box = b.find_element_by_id('txt_SearchText')
search_box.send_keys('数据分析')
search_box.send_keys(Keys.ENTER)
sleep(2)
# 4. Collect the link element of every search result
a_list = b.find_elements_by_css_selector('table.result-table-list tr > td.name > a')
a_list[0].click()  # click the first result — opens the detail page in a new tab
sleep(2)
# 5. Switch the driver to the second tab to read the detail page.
# browser.window_handles: list of handles for all currently open tabs
b.switch_to.window(b.window_handles[1])
# Grab the abstract text
soup = BeautifulSoup(b.page_source, 'lxml')
print(soup.select_one('#ChiDivSummary').text)
# close() closes only the CURRENT tab (the detail tab), not the browser
b.close()
# 6. Switch back to the first tab and open the next result
b.switch_to.window(b.window_handles[0])
a_list[1].click()
sleep(2)
# Grab the abstract of the second result from the newest tab
b.switch_to.window(b.window_handles[-1])
soup = BeautifulSoup(b.page_source, 'lxml')
print(soup.select_one('#ChiDivSummary').text)