selenium
selenium基本用法
from selenium.webdriver import Chrome, Edge

# Create a browser object
web = Edge()
# Open a URL
web.get('https://www.gamersky.com')
# Print the page source; a bare `web.page_source` expression has no effect
print(web.page_source)
# Close the current window; use web.quit() to shut down the whole browser process
web.close()
常规交互
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from time import sleep

# The original note used `web` without ever creating it — create it here
web = Edge()
web.get('https://www.51job.com/')

# find_element_by_id was removed in Selenium 4.3; use find_element(By.ID, ...)
search_box = web.find_element(By.ID, 'kwdselectid')
print(search_box)
search_box.send_keys('数据分析')
search_box.send_keys(Keys.ENTER)

# Page through results; `next_button` avoids shadowing the builtin `next`
next_button = web.find_element(By.CSS_SELECTOR, '.next')
next_button.click()
sleep(2)
print(web.page_source)
selenium常用配置
添加配置
from random import randint

from selenium.webdriver import Chrome, ChromeOptions

options = ChromeOptions()
# Hide the "Chrome is being controlled by automated software" banner
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# 2 == block image loading — pages fetch much faster without images
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# Rotate through a pool of proxy addresses. The original line had a stray
# character ("代") baked into the URL, which would break every request.
ips = ['ip1:port', 'ip2:port', 'ip3:port', 'ip4:port']  # TODO: fill in real proxies
options.add_argument(f'--proxy-server=http://{ips[randint(0, 3)]}')

# Create the browser with the configuration above
web = Chrome(options=options)
web.get('https://movie.douban.com/top250')
print(web.page_source)
爬淘宝数据
import ast

from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

options = ChromeOptions()
# Block image loading to speed up scraping
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
web = Chrome(options=options)

# Must visit the domain once before cookies for it can be added
web.get('https://www.taobao.com/')

# ast.literal_eval instead of eval(): the cookie file is data, not code —
# eval() would execute anything written into it. `with` closes the file.
with open('cookies.txt', encoding='utf-8') as cookie_file:
    cookies = ast.literal_eval(cookie_file.read())
for cookie in cookies:
    # .get() tolerates cookies with no 'secure' key (eval-dumped dicts vary)
    if cookie.get('secure'):
        web.add_cookie(cookie)

# Reload so the page picks up the logged-in session
web.get('https://www.taobao.com/')
# find_element_by_id was removed in Selenium 4.3; use find_element(By.ID, ...)
search_input = web.find_element(By.ID, 'q')
search_input.send_keys('鞋子')
search_input.send_keys(Keys.ENTER)
爬取51job职位数据
import csv
import re
import time

from bs4 import BeautifulSoup
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
def config():
    """
    Build the browser configuration: block images, hide the automation banner.

    :return: a configured ChromeOptions instance
    """
    options = ChromeOptions()
    # 2 == block image loading — pages fetch much faster without images
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    # Hide the "controlled by automated software" infobar
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    return options
def web_config():
    """
    Open 51job and submit the search keyword.

    NOTE: relies on the module-level global `web` created in __main__.
    """
    web.get('https://www.51job.com/')
    # find_element_by_id was removed in Selenium 4.3; use find_element(By.ID, ...)
    search_box = web.find_element(By.ID, 'kwdselectid')
    search_box.send_keys('数据分析')
    search_box.send_keys(Keys.ENTER)
if __name__ == '__main__':
    web = Chrome(options=config())
    web_config()

    # `with` guarantees the CSV file is flushed and closed (the original
    # opened it and never closed it)
    with open('files/51job1.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # Total page count parsed from the pager text (kept for reference)
        page = re.findall(r'\d{2,}', web.find_element(By.CLASS_NAME, 'tright').text)
        for x in range(2):
            print(x)
            soup = BeautifulSoup(web.page_source, 'lxml')
            for job in soup.select('.j_joblist>.e'):
                row = [
                    job.select_one('.jname').get_text(),
                    job.select_one('.info > .sal').get_text(),
                    job.select_one('.er > a').get_text(),
                    job.select_one('a').attrs['href'],
                ]
                writer.writerow(row)
            # Re-locate the "next" button on every page: the element found on
            # the previous page goes stale once the DOM is replaced, so the
            # original (locate once before the loop, click twice) would raise
            # StaleElementReferenceException on the second iteration.
            web.find_element(By.CLASS_NAME, 'next').click()
            time.sleep(3)