How to Quickly Scrape Lagou Job Listings with Python

1、Introduction

Before writing the scraper, I first fetched the Lagou site with the requests package and found that the page source does not contain the job listings I wanted, which makes things a bit harder. There are several ways around this: you can capture the network traffic and grab the listings as JSON, then parse that JSON; or you can render the page with a headless browser such as PhantomJS. I won't cover installing PhantomJS here; instructions are easy to find online. Below is how I scraped Lagou.
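For reference, here is a minimal sketch of the JSON route mentioned above. It assumes the listings come from an Ajax endpoint such as https://www.lagou.com/jobs/positionAjax.json with form fields kd (keyword) and pn (page number), and that the call needs the cookies and Referer of the listing page. All of these details are assumptions you should confirm in your browser's network tab, since Lagou changes its anti-scraping measures from time to time.

    import requests

    # Sketch of the "grab the JSON" approach; the endpoint, parameters and the
    # response structure are assumptions -- verify them in the network tab.
    list_url = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90'
    ajax_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

    session = requests.Session()
    headers = {'User-Agent': 'Mozilla/5.0', 'Referer': list_url}
    session.get(list_url, headers=headers)   # visit the list page first to pick up cookies

    resp = session.post(ajax_url, headers=headers,
                        data={'first': 'true', 'pn': 1, 'kd': '数据分析'})
    data = resp.json()
    # Assumed response layout; print a few fields per posting
    for pos in data['content']['positionResult']['result']:
        print(pos['positionName'], pos['salary'], pos['companyFullName'])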

2、The job listings page

First, open the Lagou site. I typed in the position "数据分析" (data analysis), which brings up a page like this:
(screenshot: the search results page)
Most of the listing information is already on this page, but it is not complete; the information we really want sits on the detail page that opens when you click a listing:
(screenshot: a job detail page)

So the first thing we need is that detail link. It is easy to find: inspect the element and it shows up right away.
The href attribute holds the link we are after.
Locating it with bs4 is straightforward:
Note that a queue is used here (the setup this snippet relies on is sketched right after it); the full code is at the end.

    # Grab the visible listing <ul> and queue up every job's detail-page URL
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    req = bs.find('ul', class_='item_con_list', style='display: block;')
    urllinks = req.find_all('a', class_='position_link')
    for i in urllinks:
        print(i.get('href'))
        que.put(i.get('href'))
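The snippet above assumes that driver (a Selenium browser already showing the search results) and que (the queue of detail links) exist. A minimal setup matching what the full code at the end does is sketched below; the chromedriver path is only an example, point it at your own copy.

    import queue
    from bs4 import BeautifulSoup
    from selenium import webdriver

    # The chromedriver path is machine-specific; replace it with your own
    driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    driver.implicitly_wait(100)
    driver.get('https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90')

    que = queue.Queue()   # detail-page URLs to visit in step 3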

3、Scraping the detail pages

Once we have these links, the rest is simple: load each one and parse it. The code is below.
Note: I store the results in MySQL; you could just as well write them to a local file.
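The INSERT statement below expects a table named lagou with the columns listed there. The original table definition is not shown in this post, so the schema below is only a guess at something compatible; adjust the types and lengths as you see fit.

    import pymysql

    # Hypothetical schema matching the INSERT used below; not taken from the original post
    ddl = """
    CREATE TABLE IF NOT EXISTS lagou (
        job_name VARCHAR(100), com_name VARCHAR(100), com_addr VARCHAR(200),
        com_cat  VARCHAR(100), com_qua  VARCHAR(100), com_peo  VARCHAR(100),
        exp1     VARCHAR(50),  edu      VARCHAR(50),  salary   VARCHAR(50),
        com_resp TEXT
    ) DEFAULT CHARSET = utf8mb4
    """
    db = pymysql.connect('localhost', 'root', '*********', 'test', charset='utf8mb4')
    with db.cursor() as cursor:
        cursor.execute(ddl)
    db.commit()
    db.close()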

driver2 = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
while not que.empty():          # stop once every queued detail link has been visited
    try:
        newurl = que.get()
        driver2.get(newurl)
        driver2.implicitly_wait(100)
        bs2 = BeautifulSoup(driver2.page_source, 'html.parser')

        job_info = bs2.find('div', class_='job-name')
        company = job_info.find('div', class_='company')
        reg1 = re.compile("<[^>]*>")
        ### company
        company = reg1.sub('', company.prettify())
        ### job title
        job = job_info.find('span', class_='name')
        reg2 = re.compile("<[^>]*>")
        job = reg2.sub('', job.prettify()).strip('\n')
        ### salary, location, experience, education
        job_req = bs2.find('dd', class_='job_request')
        all_info = []
        for i in job_req.find_all('span'):
            reg3 = re.compile("<[^>]*>")
            new_in = reg3.sub('', i.prettify())
            all_info.append(new_in)

        salary = all_info[0]
        mod = re.compile('/')
        salary = mod.sub('', salary).strip('\n')

        address = all_info[1]
        address = mod.sub('', address).strip('\n')
        exp = all_info[2]
        exp = mod.sub('', exp).strip('\n')
        edu = all_info[3]
        edu = mod.sub('', edu).strip('\n')
        ### job detail block
        job_det = bs2.find('dl', class_='job_detail', id='job_detail')
        ### job perks ("职位诱惑")
        job_lu = job_det.find('dd', class_='job-advantage').find('p')
        reg4 = re.compile("<[^>]*>")
        job_lu = reg4.sub('', job_lu.prettify())
        ### responsibilities and requirements
        job_zong = job_det.find('dd', class_='job_bt')
        job_res = job_zong.find('div')
        reg5 = re.compile("<[^>]*>")
        job_res = str(reg5.sub('', job_res.prettify()).strip('\n').strip())
        ### work address
        job_ad = bs2.find('dd', class_='job-address clearfix').find('div', class_='work_addr')
        reg6 = re.compile("<[^>]*>")
        job_ad = reg6.sub('', job_ad.prettify()).strip('\n')
        job_con = bs2.find('dl', class_='job_company', id='job_company')
        ### company name
        com_name = job_con.find('dt').find('a').find('img').get('alt')
        ### company profile (field, stage, size)
        com_cat = job_con.find('ul', class_='c_feature').find_all('li')
        all_info2 = []
        for i in com_cat:
            reg7 = re.compile("<[^>]*>")
            new_in = reg7.sub('', i.prettify())
            all_info2.append(new_in)
        com_cat = all_info2[0].strip('\n')
        # Labels to strip from the company-feature text:
        # '领域' = field, '发展阶段' = development stage, '规模' = company size
        lingyu = '领域'
        dev = '发展阶段'
        gui = '规模'

        a1 = re.compile(lingyu)
        a2 = re.compile(dev)
        a3 = re.compile(gui)
        com_cat = a1.sub('',com_cat).strip()
        com_qua = all_info2[1].strip('\n')
        com_qua = a2.sub('',com_qua).strip()
        com_peo = all_info2[2].strip('\n')
        com_peo = a3.sub('',com_peo).strip()
        print(company, job, salary, address, exp, edu, job_lu, job_res, job_ad, com_name, com_cat, com_qua, com_peo)
        db = pymysql.connect('localhost', 'root', '*********', 'test', charset='utf8')
        cursor = db.cursor()

        # Parameterized query, so quotes in the job description cannot break the SQL
        sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,com_peo,"
               "exp1,edu,salary,com_resp) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        cursor.execute(sql, (job, com_name, address, com_cat, com_qua, com_peo, exp, edu, salary, job_res))

        db.commit()
        cursor.close()
        db.close()
    except Exception:
        print('该页面无法获取')   # this page could not be fetched or parsed

4、Results

The results look like this:
(screenshot: the scraped rows in the MySQL table)

5、Full code

from bs4 import BeautifulSoup
import requests
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
from time import sleep
import re
what1 = '数据分析'   # search keyword ("data analysis")
what2 = '全职'       # job type ("full-time")
what3 = '广州'       # city ("Guangzhou")
# URL-encode the Chinese parameters before building the search URL
what1 = urllib.parse.quote(what1)
what2 = urllib.parse.quote(what2)
what3 = urllib.parse.quote(what3)
# driver=webdriver.PhantomJS()
driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')   # adjust to your chromedriver path
url = 'https://www.lagou.com/jobs/list_%s?px=default&gx=%s&city=%s#order' % (what1, what2, what3)
url2 = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=sug&fromSearch=true&suginput=shuju'   # alternative entry URL, not used below
driver.implicitly_wait(100)
driver.get(url)
bs = BeautifulSoup(driver.page_source, 'html.parser')
req = bs.find('ul',class_ = 'item_con_list',style ='display: block;')
urllinks = req.find_all('a',class_='position_link')
import queue
que = queue.Queue()
# Queue the detail links on the first results page
for i in urllinks:
    print(i.get('href'))
    que.put(i.get('href'))
# Click through to the next results page (note the trailing space in the class name)
link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
link_next.click()
times = 0
while True:
    times += 1
    driver.implicitly_wait(10)
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    req = bs.find('ul', class_='item_con_list', style='display: block;')
    urllinks = req.find_all('a', class_='position_link')
    for i in urllinks:
        print(i.get('href'))
        que.put(i.get('href'))
    print(times)
    if times == 9:      # collect ten pages of results in total, then stop
        break
    link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
    link_next.click()
    sleep(3)            # give the next page time to load


# driver2 = webdriver.PhantomJS()
driver2 = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
while not que.empty():          # stop once every queued detail link has been visited
    try:
        newurl = que.get()
        driver2.get(newurl)
        driver2.implicitly_wait(100)
        bs2 = BeautifulSoup(driver2.page_source, 'html.parser')

        job_info = bs2.find('div', class_='job-name')
        company = job_info.find('div', class_='company')
        reg1 = re.compile("<[^>]*>")
        ### company
        company = reg1.sub('', company.prettify())
        ### job title
        job = job_info.find('span', class_='name')
        reg2 = re.compile("<[^>]*>")
        job = reg2.sub('', job.prettify()).strip('\n')
        ### salary, location, experience, education
        job_req = bs2.find('dd', class_='job_request')
        all_info = []
        for i in job_req.find_all('span'):
            reg3 = re.compile("<[^>]*>")
            new_in = reg3.sub('', i.prettify())
            all_info.append(new_in)

        salary = all_info[0]
        mod = re.compile('/')
        salary = mod.sub('', salary).strip('\n')

        address = all_info[1]
        address = mod.sub('', address).strip('\n')
        exp = all_info[2]
        exp = mod.sub('', exp).strip('\n')
        edu = all_info[3]
        edu = mod.sub('', edu).strip('\n')
        ### job detail block
        job_det = bs2.find('dl', class_='job_detail', id='job_detail')
        ### job perks ("职位诱惑")
        job_lu = job_det.find('dd', class_='job-advantage').find('p')
        reg4 = re.compile("<[^>]*>")
        job_lu = reg4.sub('', job_lu.prettify())
        ### responsibilities and requirements
        job_zong = job_det.find('dd', class_='job_bt')
        job_res = job_zong.find('div')
        reg5 = re.compile("<[^>]*>")
        job_res = str(reg5.sub('', job_res.prettify()).strip('\n').strip())
        ### work address
        job_ad = bs2.find('dd', class_='job-address clearfix').find('div', class_='work_addr')
        reg6 = re.compile("<[^>]*>")
        job_ad = reg6.sub('', job_ad.prettify()).strip('\n')
        job_con = bs2.find('dl', class_='job_company', id='job_company')
        ### company name
        com_name = job_con.find('dt').find('a').find('img').get('alt')
        ### company profile (field, stage, size)
        com_cat = job_con.find('ul', class_='c_feature').find_all('li')
        all_info2 = []
        for i in com_cat:
            reg7 = re.compile("<[^>]*>")
            new_in = reg7.sub('', i.prettify())
            all_info2.append(new_in)
        com_cat = all_info2[0].strip('\n')
        # Labels to strip from the company-feature text:
        # '领域' = field, '发展阶段' = development stage, '规模' = company size
        lingyu = '领域'
        dev = '发展阶段'
        gui = '规模'

        a1 = re.compile(lingyu)
        a2 = re.compile(dev)
        a3 = re.compile(gui)
        com_cat = a1.sub('',com_cat).strip()
        com_qua = all_info2[1].strip('\n')
        com_qua = a2.sub('',com_qua).strip()
        com_peo = all_info2[2].strip('\n')
        com_peo = a3.sub('',com_peo).strip()
        print(company, job, salary, address, exp, edu, job_lu, job_res, job_ad, com_name, com_cat, com_qua, com_peo)
        db = pymysql.connect('localhost', 'root', '*********', 'test', charset='utf8')
        cursor = db.cursor()

        # Parameterized query, so quotes in the job description cannot break the SQL
        sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,com_peo,"
               "exp1,edu,salary,com_resp) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        cursor.execute(sql, (job, com_name, address, com_cat, com_qua, com_peo, exp, edu, salary, job_res))

        db.commit()
        cursor.close()
        db.close()
    except Exception:
        print('该页面无法获取')   # this page could not be fetched or parsed
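Once the run finishes, a quick way to check what was scraped is to read a few rows back from the table. This is just a sketch using the same connection settings and columns as the INSERT above:

    import pymysql

    # Sanity check: print a handful of the scraped rows
    db = pymysql.connect('localhost', 'root', '*********', 'test', charset='utf8')
    with db.cursor() as cursor:
        cursor.execute("SELECT job_name, com_name, salary, edu FROM lagou LIMIT 5")
        for row in cursor.fetchall():
            print(row)
    db.close()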

