什么是selenium
- selenium 是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中,就像真正的用户在操作一样。支持的浏览器包括IE(7, 8, 9, 10, 11),Mozilla Firefox,Safari,Google Chrome,Opera等。selenium 是一套完整的web应用程序测试系统,包含了测试的录制(selenium IDE),编写及运行(Selenium Remote Control)和测试的并行处理(Selenium Grid)。
- selenium 是一个自动化测试工具,可以模拟真实浏览器的操作,并支持多种浏览器;在爬虫中主要用来解决 JavaScript 渲染问题。
操作Google Chrome
from selenium import webdriver
import time
# Drive a visible Chrome window: open Baidu, type a keyword into the search
# box, submit the search, then click one of the image results.

# Raw string so the backslashes in the Windows path can never be read as
# escape sequences.
path = r'D:\chromedriver_win32\chromedriver.exe'
browser = webdriver.Chrome(executable_path=path)
try:
    url = 'http://www.baidu.com/'
    browser.get(url)
    time.sleep(3)  # crude fixed wait before touching the page

    # find_elements_by_id returns a list, so pick the first match before
    # typing; calling send_keys on the list itself raises AttributeError.
    my_input = browser.find_elements_by_id('kw')[0]
    my_input.send_keys('美女')
    time.sleep(3)

    # Click the first element carrying the search button's class.
    button = browser.find_elements_by_class_name('s_btn')[0]
    button.click()
    time.sleep(3)

    # Click the seventh element with this class on the results page.
    image = browser.find_elements_by_class_name('op-img-address-link-imgs')[6]
    image.click()
    time.sleep(3)
finally:
    # Always shut the browser and driver down, even if a lookup above fails.
    browser.quit()
控制headless_chrome
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
# Run Chrome without a visible window, load Baidu and save a screenshot.
import os  # needed only to create the screenshot directory below

options = Options()
options.add_argument('--headless')
# The flag is "--disable-gpu"; the previous "--disable-hpu" was a typo.
options.add_argument('--disable-gpu')

path = r'D:\chromedriver_win32\chromedriver.exe'
browser = webdriver.Chrome(executable_path=path, chrome_options=options)
try:
    url = 'http://www.baidu.com/'
    browser.get(url)
    time.sleep(3)  # crude fixed wait before taking the screenshot

    # Make sure the target directory exists so the screenshot can be written.
    os.makedirs('headless_chrome', exist_ok=True)
    browser.save_screenshot('headless_chrome/baidu.png')
finally:
    # Always shut the browser and driver down, even if a step above fails.
    browser.quit()
操作phantomJS
from selenium import webdriver
import time
# Drive PhantomJS: load a page, dump its HTML, scroll down via JavaScript,
# then dump the HTML again so the two snapshots can be compared.
phantomjs_exe_path = r'D:\phantomjs-2.1.1-windows\bin\phantomjs.exe'
browser = webdriver.PhantomJS(phantomjs_exe_path)

# --- Disabled example 1: Baidu search with screenshots -----------------------
# Kept as real comments rather than a triple-quoted string, because the
# Windows paths contain backslashes that are invalid escapes in a normal string.
#
# # Open Baidu
# url = 'http://www.baidu.com/'
# browser.get(url)
# time.sleep(4)
# # Take a screenshot
# browser.save_screenshot(r'E:\pycharm project\爬虫2\phantom_picture\baidu.png')
# my_input = browser.find_elements_by_id('kw')
# my_input[0].send_keys('美女')
# time.sleep(3)
# browser.save_screenshot(r'E:\pycharm project\爬虫2\phantom_picture\su.png')
# button = browser.find_elements_by_class_name('s_btn')[0]
# button.click()
# time.sleep(3)
# browser.save_screenshot(r'E:\pycharm project\爬虫2\phantom_picture\meinv.png')

# --- Disabled example 2: Douban movie ranking --------------------------------
#
# url = r'https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action='
# browser.get(url)
# time.sleep(3)
# browser.save_screenshot(r'E:\pycharm project\爬虫2\phantom_picture\douban.png')
# # Have the browser run a small piece of JS that scrolls to the bottom
# js = 'document.body.scrollTop=10000'
# browser.execute_script(js)
# time.sleep(5)
# browser.save_screenshot(r'E:\pycharm project\爬虫2\phantom_picture\douban1.png')
# # Grab the page source and save it to a file
# html = browser.page_source
# with open(r'phantom_douban.html','w',encoding='utf8') as fp:
#     fp.write(html)

# --- Active example: chinaz picture listing ----------------------------------
try:
    url = r'http://sc.chinaz.com/tag_tupian/OuMeiMeiNv.html'
    browser.get(url)
    time.sleep(3)  # crude fixed wait before reading the page source

    # Snapshot of the HTML before scrolling.
    with open(r'tupian1.html','w',encoding='utf8') as fp1:
        fp1.write(browser.page_source)

    # Scroll down by setting the body's scrollTop from JavaScript.
    js = 'document.body.scrollTop=10000'
    browser.execute_script(js)
    time.sleep(3)

    # Snapshot of the HTML after scrolling.
    with open(r'tupian2.html','w',encoding='utf8') as fp:
        fp.write(browser.page_source)
finally:
    # Always shut PhantomJS down, even if a step above fails.
    browser.quit()