Selenium自动化测试的学习

最新推荐文章于 2024-08-06 16:21:24 发布

lu_seng

最新推荐文章于 2024-08-06 16:21:24 发布

阅读量136

点赞数

分类专栏： python爬虫文章标签： python

本文链接：https://blog.csdn.net/lu_seng/article/details/118991583

版权

python爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

概念
selenium是一个自动化测试框架，我们主要用它的webdriver

webdriver
支持各大主流浏览器

chromedriver安装：
安装chromedriver,下载压缩包，解压以后，把可执行文件所在的目录放到PATH环境变量里，比如 chromedriver在/home/user/
下载地址：http://npm.taobao.org/mirrors/chromedriver/ （chromedriver的版本需要与本机谷歌浏览器的版本一致）

centos7、ubuntu：
export PATH="$PATH:/home/user"

windows:
把解压后的chromedriver，放到python的scripts目录下

如果没有设置环境变量，则需要在启动时指定chromedriver的路径

启动浏览器

	from selenium import webdriver
	# Start Chrome (chromedriver must be discoverable on PATH)
	driver = webdriver.Chrome()
	# Start Chrome with an explicit path to the chromedriver executable
	driver = webdriver.Chrome(executable_path=r'D:\Anacond\chromedriver.exe')
	# Visit pages without a visible browser window (headless mode)
	from selenium.webdriver.chrome.options import Options
	option = Options()
	option.add_argument('--headless')
	driver = webdriver.Chrome(options=option)

	# Start PhantomJS
	driver = webdriver.PhantomJS()

chrome_headless是无界面版的chrome，它替代了已停止维护的phantomjs

控制浏览器

	# Open a URL
	driver.get('http://www.badi.com')
	# Reload the current page
	driver.refresh()
	# Go forward in the browser history
	driver.forward()
	# Go back in the browser history
	driver.back()
	# URL of the current page (a property, not a method - do not call it)
	driver.current_url
	# Save a screenshot of the current page
	driver.save_screenshot('/tmp/test.png')

find函数：
	# Find by the value of the element's class attribute
	driver.find_element_by_class_name
	# Find with a CSS selector
	driver.find_element_by_css_selector
	# Find by the element's id attribute
	driver.find_element_by_id
	# Find a link by its exact visible text
	driver.find_element_by_link_text
	# Find by the element's name attribute
	driver.find_element_by_name
	# Find a link whose visible text contains the given text
	driver.find_element_by_partial_link_text
	# Find by tag name
	driver.find_element_by_tag_name
	# Find with an XPath expression
	driver.find_element_by_xpath

另外，以上函数分别还有查找多个元素的对应函数，把element变成elements即可，比如find_elements_by_xpath：
find_element：返回一个WebElement对象，如果查找不到，直接抛出异常（NoSuchElementException）；find_elements：返回一个列表，包含若干个WebElement对象，如果查找不到，返回空列表

爬取去哪儿网：https://flight.qunar.com/

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

# Scrape flight rows from qunar.com: search for flights to Beijing, then
# decode each row's obfuscated price.
driver = webdriver.Chrome()
try:
    driver.get('https://www.qunar.com/')
    # Explicit wait: block (up to 10 s) until the destination input is usable.
    toCity = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//input[@name="toCity"]')))
    toCity.send_keys('北京')
    # Pause before confirming the typed city with RETURN.
    time.sleep(1)
    toCity.send_keys(Keys.RETURN)
    driver.find_element_by_css_selector('button.button-search').click()

    # An implicit wait would be the alternative to the explicit waits used
    # here: driver.implicitly_wait(10)

    # Explicit wait for ALL flight rows. The singular
    # presence_of_element_located returns one WebElement, which cannot be
    # iterated.
    flights = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, '//div[@class="m-airfly-lst"]/div[@class="b-airfly"]')))
    for f in flights:
        fdata = {}
        # NOTE(review): this selector repeats the class of the flight row
        # itself; confirm against the live page that it matches the airline
        # nodes.
        airlines = f.find_elements_by_xpath('.//div[@class="b-airfly"]')
        fdata['airlines'] = [airline.text.replace('\n', '_') for airline in airlines]
        fdata['start'] = f.find_element_by_xpath('.//div[@class="sep-lf"]').text
        fdata['duration'] = f.find_element_by_xpath('.//div[@class="sep-ct"]').text
        fdata['end'] = f.find_element_by_xpath('.//div[@class="sep-rt"]').text
        # The first <b> holds placeholder digits; the following <b> elements
        # are overlaid on top of it and carry the digits that replace them.
        fake_price = list(f.find_element_by_xpath('.//span[@class="prc_wp"]/em/b[1]').text)
        covers = f.find_elements_by_xpath('.//span[@class="prc_wp"]/em/b[position()>1]')
        for c in covers:
            # CSS "left" (e.g. "32px") divided by the digit width gives the
            # position of the digit this overlay replaces.
            index = int(c.value_of_css_property('left')[:-2]) // c.size['width']
            print(fdata['airlines'], index, c.text)
            fake_price[index] = c.text
        fdata['price'] = ''.join(fake_price)
        print(fdata)
finally:
    # Always shut the browser down, even if a lookup above fails.
    driver.quit()

爬取前程无忧职位详情：

import time
import csv
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException

# Scrape job descriptions from 51job.com for one keyword and append them to
# "<keyword>_positionDesc.csv" as (keyword, description) rows.
#
# Headless alternative (needs `from selenium import webdriver` and
# `from selenium.webdriver.chrome.options import Options`):
#     option = Options()
#     option.add_argument('--headless')
#     driver = webdriver.Chrome(options=option)
option = ChromeOptions()
# Drop Chrome's "enable-automation" switch (commonly done so that the site is
# less likely to detect the automated browser).
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = Chrome(options=option)
try:
    # The implicit wait is a driver-wide setting; configure it once.
    driver.implicitly_wait(2)
    driver.get('https://www.51job.com/')
    keyword = 'python'
    search_box = driver.find_element_by_xpath('//input[@id="kwdselectid"]')
    search_box.send_keys(keyword)
    time.sleep(1)
    driver.find_element_by_xpath('//div[contains(@class,"ush")]/button').click()
    # Remember the result-list window so we can return to it after each
    # detail page.
    main_window = driver.current_window_handle

    # Open the CSV once; newline='' is what the csv module expects so that no
    # blank lines are inserted on Windows.
    with open('%s_positionDesc.csv' % keyword, 'a', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        has_next = True
        while has_next:
            # Give the result list time to load.
            time.sleep(5)
            curr = driver.find_element_by_xpath('//ul/li[@class="on"]/div').text
            print("-------current page is %s---------" % curr)
            job_items = driver.find_elements_by_xpath('//div[@class="j_joblist"]/div')
            for item in job_items:
                print(driver.title)
                # Clicking a job link is expected to open the detail page in
                # a new window.
                item.find_element_by_xpath('./a').click()
                driver.switch_to.window(driver.window_handles[-1])
                try:
                    print(driver.title)
                    if '滑动验证页面' in driver.title:
                        # Slider captcha page: drag the handle across the
                        # full width of the track.
                        time.sleep(5)
                        source = driver.find_element_by_css_selector('.btn_slide')
                        box = driver.find_element_by_css_selector('#nc_1__scale_text > span')
                        ActionChains(driver).drag_and_drop_by_offset(
                            source, box.size['width'], 0).perform()
                    else:
                        time.sleep(2)
                        try:
                            position = driver.find_element_by_xpath(
                                '//div[@class="tBorderTop_box"]/div[contains(@class,"bmsg")]').text
                        except NoSuchElementException:
                            # No description block on this page; record an
                            # empty value.
                            position = ''
                        time.sleep(3)
                        writer.writerow([keyword, position])
                        print(position)
                finally:
                    # Close the detail window (never the result list itself)
                    # and return to the result list, whichever branch ran.
                    if driver.current_window_handle != main_window:
                        driver.close()
                    driver.switch_to.window(main_window)
            time.sleep(2)
            next_link = driver.find_element_by_xpath('//ul/li[@class="next"]/a')
            # The script treats a "bk next" class on the link as the marker
            # for the last page; get_attribute may return None, hence the
            # `or ''`.
            if 'bk next' in (next_link.get_attribute('class') or ''):
                has_next = False
            else:
                next_link.click()
                time.sleep(5)
finally:
    # Always shut the browser down, even if a lookup above fails.
    driver.quit()