概念
selenium是一个自动化测试框架,我们主要用它的webdriver
webdriver
支持各大主流浏览器
chromedriver安装:
安装chromedriver,下载压缩包,解压以后,把可执行文件所在的目录放到PATH环境变量里,比如 chromedriver在/home/user/
下载地址:http://npm.taobao.org/mirrors/chromedriver/,chromedriver的版本需与本机谷歌浏览器的版本一致
centos7、ubuntu:
export PATH="$PATH:/home/user"
windows:
把解压后的chromedriver,放到python的scripts目录下
如果没有设置环境变量,则需要在启动时指定chromedriver的路径
启动浏览器
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Launch Chrome; this form relies on chromedriver being reachable via PATH.
driver = webdriver.Chrome()

# Launch Chrome with an explicit chromedriver path (needed when chromedriver
# has not been added to PATH).
driver = webdriver.Chrome(executable_path=r'D:\Anacond\chromedriver.exe')

# Launch Chrome in headless mode (no visible browser window).
option = Options()
option.add_argument('--headless')
driver = webdriver.Chrome(options=option)

# Launch PhantomJS.
driver = webdriver.PhantomJS()
chrome_headless是无界面版的chrome,它替代了已停止维护的phantomjs
控制浏览器
# Open a URL.
driver.get('http://www.badi.com')
# Reload the current page.
driver.refresh()
# Go forward one step in the browser history.
driver.forward()
# Go back one step in the browser history.
driver.back()
# URL of the current page. current_url is a property, not a method, so it
# must not be called.
driver.current_url
# Save a screenshot of the current page to a file.
driver.save_screenshot('/tmp/test.png')
find函数:
#根据元素的class属性的值查找
driver.find_element_by_class_name
# Find by CSS selector.
driver.find_element_by_css_selector
# Find by the element's id attribute.
driver.find_element_by_id
# Find a link by its exact visible text.
driver.find_element_by_link_text
# Find by the element's name attribute.
driver.find_element_by_name
# Find a link whose visible text contains the given text.
driver.find_element_by_partial_link_text
# Find by tag name.
driver.find_element_by_tag_name
# Find by XPath expression.
driver.find_element_by_xpath
另外,以上函数分别还有多个查找元素的对应函数,把element变成 elements,比如:
find_elements,返回一个列表,包含若干个WebElement对象,如果查找不到,返回空列表;find_element,返回一个WebElement对象,如果查找不到,直接抛出异常
爬取去哪儿网:https://flight.qunar.com/
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
# Search qunar.com for flights to Beijing and print the decoded price of each
# flight row found on the result page.
driver = webdriver.Chrome()
try:
    driver.get('https://www.qunar.com/')

    # Wait (up to 10 s) until the destination input is clickable, then fill it.
    to_city = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//input[@name="toCity"]')))
    to_city.send_keys('北京')
    time.sleep(1)
    to_city.send_keys(Keys.RETURN)
    driver.find_element_by_css_selector('button.button-search').click()

    # Implicit-wait alternative: driver.implicitly_wait(10)
    # Explicit wait: block until the flight rows are present. The "all"
    # variant is required because the result is iterated below.
    flights = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, '//div[@class="m-airfly-lst"]/div[@class="b-airfly"]')))

    for f in flights:
        fdata = {}
        # NOTE(review): this selector repeats the class of the row itself
        # ("b-airfly"); confirm it is the intended airline container.
        airlines = f.find_elements_by_xpath('.//div[@class="b-airfly"]')
        fdata['airlines'] = [airline.text.replace('\n', '_') for airline in airlines]
        fdata['start'] = f.find_element_by_xpath('.//div[@class="sep-lf"]').text
        fdata['duration'] = f.find_element_by_xpath('.//div[@class="sep-ct"]').text
        fdata['end'] = f.find_element_by_xpath('.//div[@class="sep-rt"]').text

        # The first <b> supplies the base characters of the price; every
        # following <b> overwrites one of them. The slot it overwrites is its
        # CSS "left" offset divided by its own width.
        fake_price = list(f.find_element_by_xpath('.//span[@class="prc_wp"]/em/b[1]').text)
        covers = f.find_elements_by_xpath('.//span[@class="prc_wp"]/em/b[position()>1]')
        for c in covers:
            # [:-2] strips the two-character unit suffix (presumably "px").
            index = int(c.value_of_css_property('left')[:-2]) // c.size['width']
            print(fdata['airlines'], index, c.text)
            fake_price[index] = c.text
        fdata['price'] = ''.join(fake_price)
        # print(fdata)
finally:
    # Always shut the browser down, even when a lookup or wait fails.
    driver.quit()
爬取前程无忧职位详情:
import time
import csv
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.action_chains import ActionChains
# option=Options()
# option.add_argument('--headless')
# driver = webdriver.Chrome(chrome_options=option)
# Search 51job.com for a keyword, open every job in the result list, and
# append each job's description text to "<keyword>_positionDesc.csv".
option = ChromeOptions()
# Remove the "enable-automation" switch from Chrome's launch switches.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = Chrome(options=option)
try:
    driver.implicitly_wait(2)
    driver.get('https://www.51job.com/')

    keyword = 'python'
    search_box = driver.find_element_by_xpath('//input[@id="kwdselectid"]')
    search_box.send_keys(keyword)
    time.sleep(1)
    driver.find_element_by_xpath('//div[contains(@class,"ush")]/button').click()

    # Handle of the result-list window; used to return after each detail page.
    handle = driver.current_window_handle

    # Open the output file once (append mode) instead of once per row, and let
    # the context manager close it. newline='' is what the csv module expects.
    with open('%s_positionDesc.csv' % keyword, 'a', encoding='utf-8', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['keyword', 'position_desc'])

        has_next = True
        while has_next:
            time.sleep(5)
            curr = driver.find_element_by_xpath('//ul/li[@class="on"]/div').text
            print("-------current page is %s---------" % curr)

            jobs = driver.find_elements_by_xpath('//div[@class="j_joblist"]/div')
            for job in jobs:
                print(driver.title)
                job.find_element_by_xpath('./a').click()
                # Switch to the most recently opened window.
                driver.switch_to.window(driver.window_handles[-1])
                print(driver.title)

                if '滑动验证页面' in driver.title:
                    # Slider-verification page: drag the slider button to the
                    # right by the width of the slider track text element.
                    time.sleep(5)
                    source = driver.find_element_by_css_selector('.btn_slide')
                    box = driver.find_element_by_css_selector('#nc_1__scale_text > span')
                    ActionChains(driver).drag_and_drop_by_offset(
                        source, box.size['width'], 0).perform()

                # Best effort: a page without the description block (or one
                # still showing the verification page) yields an empty string
                # rather than aborting the whole crawl.
                try:
                    time.sleep(2)
                    position = driver.find_element_by_xpath(
                        '//div[@class="tBorderTop_box"]/div[contains(@class,"bmsg")]').text
                except Exception:
                    position = ''

                time.sleep(3)
                driver.close()
                driver.switch_to.window(handle)

                writer.writerow({'keyword': keyword, 'position_desc': position})
                print(position)
                time.sleep(2)

            # driver.execute_script('window.scrollTo(0,6775)')
            next_link = driver.find_element_by_xpath('//ul/li[@class="next"]/a')
            # get_attribute returns None when the attribute is absent.
            if 'bk next' in (next_link.get_attribute('class') or ''):
                has_next = False
            else:
                next_link.click()
                time.sleep(5)
finally:
    # Always shut the browser down, even when the crawl fails part-way.
    driver.quit()