Scraping Baipin and BOSS Zhipin with Selenium

What is Selenium?

Selenium is a tool for testing web applications. Selenium tests run directly in the browser, just as a real user would operate it.
Driving the browser with Selenium to simulate a real user is an effective way around anti-scraping measures.
Here we use Selenium with Chrome to scrape Baipin and BOSS Zhipin.
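
As a minimal illustration, this is all it takes to drive a real browser with Selenium (a sketch, assuming chromedriver is installed and on your PATH):

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

driver = Chrome()  # launches a real Chrome window through chromedriver
driver.get('https://www.zhipin.com')
print(driver.title)  # the page title, exactly as a user's browser renders it
body = driver.find_element(By.TAG_NAME, 'body')  # locate elements the way a test would
driver.quit()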

Scraping BOSS Zhipin

Complete code:

import time  # time module; .sleep() throttles requests so the IP is less likely to be banned

import xlwt  # module for writing Excel files
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import ui, expected_conditions

chrome = Chrome(executable_path='chromedriver.exe')  # Selenium 3-style init; under Selenium 4 pass a Service instead
urls = []

def get_job():
    chrome.execute_script('var q=document.documentElement.scrollTop=500')
    # wait until the box holding the job listings (class 'job-box') is visible
    ui.WebDriverWait(chrome, 60).until(
        expected_conditions.visibility_of_all_elements_located((
            By.CLASS_NAME, 'job-box'
        ))
    )
    items = chrome.find_elements(By.CSS_SELECTOR, '.job-list>ul>li')

    for item in items:
        time.sleep(1)
        a = item.find_element(By.CSS_SELECTOR, '.job-name a').get_attribute('href')
        urls.append(a)
    chrome.execute_script('var q=document.documentElement.scrollTop=3500')
    time.sleep(1)
    next_buttons = chrome.find_elements(By.CSS_SELECTOR, '.next')
    if next_buttons:
        next_buttons[0].click()
        if len(urls) < 10:
            get_job()
            time.sleep(1)
        else:
            write_job(urls)
    else:
        return 0

def write_job(urls):
    f = xlwt.Workbook()
    sheet1 = f.add_sheet('python', cell_overwrite_ok=True)
    sheet1.write_merge(0, 0, 0, 2, 'Job title')
    sheet1.write_merge(0, 0, 3, 5, 'Salary')
    sheet1.write_merge(0, 0, 6, 10, 'Requirements')
    i = 1
    time.sleep(3)
    for url in urls:
        chrome.get(url)
        time.sleep(5)
        name = chrome.find_element(By.CSS_SELECTOR, '.name h1').text
        salary = chrome.find_element(By.CSS_SELECTOR, '.name span').text
        detail = chrome.find_element(By.CSS_SELECTOR, '.text').text
        sheet1.write_merge(i, i, 0, 2, name)
        sheet1.write_merge(i, i, 3, 4, salary)
        sheet1.write_merge(i, i, 5, 12, detail)
        time.sleep(2)
        f.save('boss_python.xls')
        i += 1
        time.sleep(10)

if __name__ == '__main__':
    url = 'https://www.zhipin.com/shanghai/?ka=city-sites-101020100'
    chrome.get(url)
    query = chrome.find_element(By.CSS_SELECTOR, 'input[name="query"]')
    query.send_keys('python')
    time.sleep(5)
    chrome.execute_script('var q=document.documentElement.scrollLeft=500')
    time.sleep(0.5)
    chrome.find_element(By.CSS_SELECTOR, '.btn-search').click()
    time.sleep(0.5)
    get_job()
Code walkthrough
The entry point
if __name__ == '__main__':
    url = 'https://www.zhipin.com/shanghai/?ka=city-sites-101020100'  # city-specific landing page; I picked Shanghai as the example
    chrome.get(url)  # open the link in Chrome
    query = chrome.find_element(By.CSS_SELECTOR, 'input[name="query"]')  # locate the search box
    query.send_keys('python')  # type "python" into the search box
    time.sleep(5)
    chrome.execute_script('var q=document.documentElement.scrollLeft=500')  # scroll 500px to the right
    time.sleep(0.5)
    chrome.find_element(By.CSS_SELECTOR, '.btn-search').click()  # click the search button
    time.sleep(0.5)
    get_job()  # start collecting job links

This block simulates a human user: it types the query into the search box and clicks the search button automatically. Swap python for any other keyword to search for something else.
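
The fixed time.sleep(5) either wastes time or comes up short on a slow connection. A sketch of the same step with an explicit wait instead (element_to_be_clickable is a standard Selenium condition; the selector is the one used above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import ui, expected_conditions

# wait up to 10 seconds for the search button rather than sleeping a fixed 5
button = ui.WebDriverWait(chrome, 10).until(
    expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search'))
)
button.click()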

Collecting the job links
def get_job():
    chrome.execute_script('var q=document.documentElement.scrollTop=500')  # scroll the page down
    # the box containing the job listings has class 'job-box'; wait until it is visible
    ui.WebDriverWait(chrome, 60).until(
        expected_conditions.visibility_of_all_elements_located((
            By.CLASS_NAME, 'job-box'
        ))
    )
    items = chrome.find_elements(By.CSS_SELECTOR, '.job-list>ul>li')  # grab every job entry on the page

    for item in items:
        time.sleep(1)
        a = item.find_element(By.CSS_SELECTOR, '.job-name a').get_attribute('href')
        urls.append(a)  # save the job link
    chrome.execute_script('var q=document.documentElement.scrollTop=3500')
    time.sleep(1)
    next_buttons = chrome.find_elements(By.CSS_SELECTOR, '.next')  # look for the next-page button
    if next_buttons:
        next_buttons[0].click()  # button found: move on to the next page
        if len(urls) < 10:  # I stop once ten links are collected, then write them to Excel
            get_job()
            time.sleep(1)
        else:
            write_job(urls)
    else:
        return 0
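
One pitfall worth spelling out: find_element raises NoSuchElementException when nothing matches, so it can never serve as a truthiness test for the next-page button (an if find_element(...) check would crash instead of falling through to else). Two safe patterns, as a sketch:

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# Pattern 1: find_elements returns an empty (falsy) list when nothing matches
buttons = chrome.find_elements(By.CSS_SELECTOR, '.next')
if buttons:
    buttons[0].click()

# Pattern 2: catch the exception explicitly
try:
    chrome.find_element(By.CSS_SELECTOR, '.next').click()
except NoSuchElementException:
    pass  # last page reached
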
Scraping the job details

def write_job(urls):
    f = xlwt.Workbook()  # create a new Excel workbook
    sheet1 = f.add_sheet('python', cell_overwrite_ok=True)  # add a sheet
    sheet1.write_merge(0, 0, 0, 2, 'Job title')
    sheet1.write_merge(0, 0, 3, 5, 'Salary')
    sheet1.write_merge(0, 0, 6, 10, 'Requirements')  # set up the header row
    i = 1
    time.sleep(3)
    for url in urls:
        chrome.get(url)
        time.sleep(5)
        name = chrome.find_element(By.CSS_SELECTOR, '.name h1').text  # job title
        salary = chrome.find_element(By.CSS_SELECTOR, '.name span').text  # salary
        detail = chrome.find_element(By.CSS_SELECTOR, '.text').text  # job requirements
        sheet1.write_merge(i, i, 0, 2, name)
        sheet1.write_merge(i, i, 3, 4, salary)
        sheet1.write_merge(i, i, 5, 12, detail)  # write the row
        time.sleep(2)
        f.save('boss_python.xls')  # save after every row
        i += 1
        time.sleep(10)  # consider going above 10s: BOSS Zhipin's anti-scraping is aggressive, and without proxy IPs slower is safer
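
For reference, xlwt's write_merge(r1, r2, c1, c2, label) merges the block from row r1 to row r2 and column c1 to column c2, then writes label into the merged cell; that is how the header and data cells above span several columns. A tiny standalone sketch:

import xlwt

wb = xlwt.Workbook()
ws = wb.add_sheet('demo', cell_overwrite_ok=True)
# merge row 0, columns 0-2 into one wide cell and label it
ws.write_merge(0, 0, 0, 2, 'Job title')
# merge rows 1-2 in column 0 into one tall cell
ws.write_merge(1, 2, 0, 0, 'spans rows')
wb.save('demo.xls')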

Collecting the job links went smoothly, with no serious anti-scraping pushback.
Once the links are gathered, though, looping over them to pull the job details gets your IP banned very quickly if you go too fast.
I burned through several IPs while scraping.
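
One cheap mitigation is to randomize the delays so the request timing looks less mechanical. A sketch (the 10-20 second interval is my assumption; tune it to taste):

import random
import time

def polite_sleep(low=10, high=20):
    # sleep a random interval between low and high seconds
    time.sleep(random.uniform(low, high))

# inside the detail loop, replace the fixed sleeps:
# for url in urls:
#     chrome.get(url)
#     ...
#     polite_sleep()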

Scraping Baipin

import json
import time

import xlwt
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import ui, expected_conditions
from urllib.parse import quote

chrome = Chrome(executable_path='chromedriver.exe')

def start(cityname):
    url = f'http://zhaopin.baidu.com/?city={quote(cityname)}'  # the city name must be URL-encoded
    chrome.get(url)
    time.sleep(5)
    query = chrome.find_element(By.CSS_SELECTOR, 'input[name="query"]')
    query.send_keys('python')
    time.sleep(0.5)
    chrome.execute_script('var q=document.documentElement.scrollLeft=500')
    chrome.find_element(By.CSS_SELECTOR, '.search-btn').click()

    time.sleep(0.5)
    chrome.execute_script('var q=document.documentElement.scrollTop=500')
    # wait until the result list (class 'listitems') appears
    ui.WebDriverWait(chrome, 60).until(
        expected_conditions.visibility_of_all_elements_located((
            By.CLASS_NAME, 'listitems'
        ))
    )
    chrome.execute_script('var q=document.documentElement.scrollTop=2500')
    # collect every job entry
    items = chrome.find_elements(By.CSS_SELECTOR, '.listitem>a')
    urls = []
    for item in items:
        if item.find_elements(By.CLASS_NAME, 'adbar-item'):  # the first entry is an ad; skip it
            continue
        # the real detail URL is stored as JSON in the div's data-click attribute
        data = item.find_element(By.TAG_NAME, 'div').get_attribute('data-click')
        info_url = json.loads(data)['url']
        urls.append(info_url)
        time.sleep(0.5)
    f = xlwt.Workbook()
    sheet1 = f.add_sheet('python', cell_overwrite_ok=True)
    i = 0
    for url in urls:
        job_info = []
        chrome.get(url)
        job_info.append(chrome.find_element(By.CLASS_NAME, 'job-name').text)
        job_info.append(chrome.find_element(By.CLASS_NAME, 'salary').text)
        job_info.append(chrome.find_element(By.CSS_SELECTOR, '.job-detail p').text)
        time.sleep(1)
        sheet1.write_merge(i, i, 0, 2, job_info[0])
        sheet1.write_merge(i, i, 3, 4, job_info[1])
        sheet1.write_merge(i, i, 5, 12, job_info[2])
        print(job_info)
        i += 1
    f.save('python_job.xls')
if __name__ == '__main__':
    start("郑州")
