什么是selenium
Selenium 是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中,就像真正的用户在操作一样。
用 selenium 模拟浏览器进行操作,就像真实用户一样,能有效地绕过网站的反爬虫机制。
这里用 selenium + Chrome 爬取百度百聘和 BOSS 直聘。
爬取BOSS直聘
完整代码:
import json
import time #时间模块,主要是用.sleep防止访问过快导致ip被封
import xlrd #操作excel模块
import xlwt
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import ui,expected_conditions
# NOTE(review): the executable_path keyword was removed in Selenium 4; this line
# requires Selenium 3.x (Selenium 4 uses a Service object instead).
chrome=Chrome(executable_path='chromedriver.exe')# Chrome driver instance shared by all functions below
urls = []  # job-detail links collected by get_job() and consumed by write_job()
def get_job():
    """Collect job-detail links from the BOSS zhipin result list.

    Scrolls the page, waits for the job list container to render, appends
    the link of every listed position to the module-level ``urls`` list,
    then clicks the "next page" button and recurses until at least 10
    links have been gathered, at which point ``write_job`` is invoked.

    Returns:
        0 when no "next page" button exists; otherwise None.
    """
    # Nudge the viewport down so lazily-loaded content starts rendering.
    chrome.execute_script('var q=document.documentElement.scrollTop=500')
    # Block (up to 60s) until the box holding the job list is visible.
    ui.WebDriverWait(chrome, 60).until(
        expected_conditions.visibility_of_all_elements_located((
            By.CLASS_NAME, 'job-box'
        ))
    )
    items = chrome.find_elements(By.CSS_SELECTOR, '.job-list>ul>li')
    for item in items:
        time.sleep(1)  # throttle to look human and avoid an IP ban
        link = item.find_element(By.CSS_SELECTOR, '.job-name a').get_attribute('href')
        urls.append(link)
    # Scroll far down so the pagination controls are in view.
    chrome.execute_script('var q=document.documentElement.scrollTop=3500')
    time.sleep(1)
    # BUG FIX: find_element raises NoSuchElementException when the element is
    # missing, so the original `if chrome.find_element...` could never take its
    # else branch. find_elements returns a (possibly empty) list instead.
    next_buttons = chrome.find_elements(By.CSS_SELECTOR, '.next')
    if next_buttons:
        next_buttons[0].click()
        if len(urls) < 10:  # keep paginating until at least 10 links collected
            get_job()
            time.sleep(1)
        else:
            write_job(urls)
    else:
        return 0
def write_job(urls):
    """Visit each job link and write name/salary/requirements to an .xls file.

    Args:
        urls: iterable of job-detail page URLs collected by get_job().

    Side effects:
        Drives the module-level ``chrome`` browser and re-saves
        'boos_python.xls' after every row, so a mid-run IP ban still
        leaves the rows scraped so far on disk.
    """
    f = xlwt.Workbook()
    sheet1 = f.add_sheet('python', cell_overwrite_ok=True)
    # Header row (row 0), each label merged across several columns.
    sheet1.write_merge(0, 0, 0, 2, '职位名称')
    sheet1.write_merge(0, 0, 3, 5, '职位薪资')
    sheet1.write_merge(0, 0, 6, 10, '职位要求')
    time.sleep(3)
    # Data rows start at row 1; the original assigned i = 1 twice and
    # incremented it by hand — enumerate replaces both.
    for i, url in enumerate(urls, start=1):
        chrome.get(url)
        time.sleep(5)  # give the detail page time to render
        name = chrome.find_element(By.CSS_SELECTOR, '.name h1').text
        salary = chrome.find_element(By.CSS_SELECTOR, '.name span').text
        detail = chrome.find_element(By.CSS_SELECTOR, '.text').text
        sheet1.write_merge(i, i, 0, 2, name)
        sheet1.write_merge(i, i, 3, 4, salary)
        sheet1.write_merge(i, i, 5, 12, detail)
        time.sleep(2)
        f.save('boos_python.xls')  # incremental save (filename kept as in the original)
        # BOSS zhipin's anti-scraping is aggressive; without proxy IPs,
        # a pause of at least 10 seconds per page is safer.
        time.sleep(10)
if __name__=='__main__':
    # Landing page for Shanghai (ka=city-sites-101020100 is the city code).
    url = 'https://www.zhipin.com/shanghai/?ka=city-sites-101020100'
    chrome.get(url)
    # Selenium-4 style locator, consistent with the By.* calls used above
    # (find_element_by_css_selector was removed in Selenium 4).
    query = chrome.find_element(By.CSS_SELECTOR, 'input[name="query"]')
    query.send_keys('python')  # search keyword; swap for any other term
    time.sleep(5)
    # Scroll right so the search button is in view before clicking.
    chrome.execute_script('var q=document.documentElement.scrollLeft=500')
    time.sleep(0.5)
    chrome.find_element(By.CSS_SELECTOR, '.btn-search').click()
    time.sleep(0.5)
    get_job()  # begin harvesting job links
代码解析
开始
if __name__=='__main__':
    url = f'https://www.zhipin.com/shanghai/?ka=city-sites-101020100'# city code; Shanghai is used here as the example
    chrome.get(url)# open the URL in Chrome
    query = chrome.find_element_by_css_selector('input[name="query"]')# locate the search box
    query.send_keys('python')# type "python" into the search box
    time.sleep(5)
    chrome.execute_script('var q=document.documentElement.scrollLeft=500')# scroll 500px to the right
    time.sleep(0.5)
    chrome.find_element_by_css_selector('.btn-search').click()# click the search button
    time.sleep(0.5)
    get_job()# start collecting job links
这块的代码主要模拟人操作,自动化的输入,点击搜索。可以把python换为其他的关键字。
爬取职位链接
def get_job():
    chrome.execute_script('var q=document.documentElement.scrollTop=500')# scroll the page down
    # wait for class_name=listitem to appear
    ui.WebDriverWait(chrome, 60).until(
        expected_conditions.visibility_of_all_elements_located((
            By.CLASS_NAME, 'job-box'
        ))# the box holding the job info has class 'job-box'; wait until it shows
    )
    items = chrome.find_elements(By.CSS_SELECTOR, '.job-list>ul>li')# grab every job entry
    for item in items:
        time.sleep(1)
        a=item.find_element(By.CSS_SELECTOR,'.job-name a').get_attribute('href')
        urls.append(a)# save the job link
    chrome.execute_script('var q=document.documentElement.scrollTop=3500')
    time.sleep(1)
    if chrome.find_element_by_css_selector('.next'):# look for the next-page button
        chrome.find_element_by_css_selector('.next').click()# found it: continue crawling the next page
        if len(urls)<10:# I stop after collecting ten links, then start writing to Excel
            get_job()
            time.sleep(1)
        else:
            write_job(urls)
    else:
        return 0
爬取职位详细信息
def write_job(urls):
    f = xlwt.Workbook()# create a new Excel workbook
    sheet1 = f.add_sheet('python', cell_overwrite_ok=True)# create a sheet
    i = 1
    sheet1.write_merge(0, 0, 0, 2, '职位名称')
    sheet1.write_merge(0, 0, 3, 5, '职位薪资')
    sheet1.write_merge(0, 0, 6, 10, '职位要求')# set up the header row
    i=1
    time.sleep(3)
    for url in urls:
        chrome.get(url)
        time.sleep(5)
        name=chrome.find_element(By.CSS_SELECTOR,'.name h1').text# scrape the job title
        salary = chrome.find_element(By.CSS_SELECTOR, '.name span').text# scrape the salary
        detail=chrome.find_element(By.CSS_SELECTOR, '.text').text# scrape the job requirements
        sheet1.write_merge(i, i, 0, 2, name)
        sheet1.write_merge(i, i, 3, 4, salary)
        sheet1.write_merge(i, i, 5, 12, detail)# write the row
        time.sleep(2)
        f.save('boos_python.xls')# save
        i+=1
        time.sleep(10)# better to make this larger than 10: BOSS's anti-scraping is strong, and without proxy IPs it is safer to go slow
在爬取职位链接时,过程很稳定,没有遇到较强的反爬措施。
获取链接之后,循环从职位链接中提取职位信息时,如果过快,ip就很容易被封。
在爬的时候,我被封了好几个ip。
爬取百聘
import json
import time
import xlrd
import xlwt
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import ui,expected_conditions
chrome=Chrome(executable_path='chromedriver.exe')  # shared Chrome driver; executable_path is Selenium 3 API (removed in Selenium 4)
from urllib.parse import quote
def start(cityname):
    """Scrape python jobs from Baidu Baipin (zhaopin.baidu.com) for one city.

    Searches for "python" in the given city, skips advertisement entries in
    the result list, collects each job's detail URL from the entry's
    data-click JSON, then visits every URL and writes name/salary/description
    rows to 'python_job.xls'.

    Args:
        cityname: Chinese city name, e.g. "郑州"; URL-encoded into the query.
    """
    url = f'http://zhaopin.baidu.com/?city={quote(cityname)}'
    chrome.get(url)
    time.sleep(5)
    # Selenium-4 style locators throughout, consistent with the By.* calls
    # below (find_element_by_css_selector was removed in Selenium 4).
    query = chrome.find_element(By.CSS_SELECTOR, 'input[name="query"]')
    query.send_keys('python')
    time.sleep(0.5)
    chrome.execute_script('var q=document.documentElement.scrollLeft=500')
    chrome.find_element(By.CSS_SELECTOR, '.search-btn').click()
    time.sleep(0.5)
    chrome.execute_script('var q=document.documentElement.scrollTop=500')
    # Wait (up to 60s) until the result list container is visible.
    ui.WebDriverWait(chrome, 60).until(
        expected_conditions.visibility_of_all_elements_located((
            By.CLASS_NAME, 'listitems'
        ))
    )
    chrome.execute_script('var q=document.documentElement.scrollTop=2500')
    # Every result entry; the first one is an advertisement.
    items = chrome.find_elements(By.CSS_SELECTOR, '.listitem>a')
    urls = []
    for item in items:
        # Ads carry an .adbar-item child. find_elements returns an empty list
        # instead of raising, which replaces the original's bare except used
        # as control flow (and drops the unused adv/title/salary lookups).
        if item.find_elements(By.CLASS_NAME, 'adbar-item'):
            continue
        # The real detail URL lives in the data-click JSON payload; the
        # original also read the anchor's href but immediately discarded it.
        data = item.find_element(By.TAG_NAME, 'div').get_attribute('data-click')
        urls.append(json.loads(data)['url'])
        time.sleep(0.5)
    f = xlwt.Workbook()
    sheet1 = f.add_sheet('python', cell_overwrite_ok=True)
    for i, url in enumerate(urls):
        job_info = []
        chrome.get(url)
        job_info.append(chrome.find_element(By.CLASS_NAME, 'job-name').text)
        job_info.append(chrome.find_element(By.CLASS_NAME, 'salary').text)
        job_info.append(chrome.find_element(By.CSS_SELECTOR, '.job-detail p ').text)
        time.sleep(1)
        sheet1.write_merge(i, i, 0, 2, job_info[0])
        sheet1.write_merge(i, i, 3, 4, job_info[1])
        sheet1.write_merge(i, i, 5, 12, job_info[2])
        # Save after every row so a mid-run IP ban keeps what is done so far.
        f.save('python_job.xls')
        print(job_info)
# def write_excel():
# ws = xlwt.open_workbook(r'formatting.xls')
# wb = xlwt.Workbook()
# ws = wb.add_sheet('Python', cell_overwrite_ok=True)
# aliggmeng = xlwt.Alignment()
# aliggmeng.horz = xlwt.Alignment.HORZ_CENTER
# aliggmeng.vert = xlwt.Alignment.VERT_CENTER
# style = xlwt.XFStyle()
# style.alignment = aliggmeng
# ws.write_merge(0, 0, 0, 5, 'Python开发', style)
# for i in range(2, 5):
# for j in range(0, 3):
# ws.write(i, j, job1[j])
# wb.save('formatting.xls')
if __name__=='__main__':
    # Run the Baipin scraper for Zhengzhou; any Chinese city name works.
    start("郑州")