Scraping Dynamic Job Listings from a Recruitment Site with Python

1. Getting familiar with chromedriver

A reference on installing and using chromedriver: https://blog.csdn.net/weixin_41098099/article/details/107128566
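To confirm that chromedriver is wired up, a quick smoke test like the one below should print the title of the 51job home page without opening a visible browser window (a minimal sketch; it assumes the chromedriver binary is on your PATH):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Run Chrome headless so no window pops up
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

driver = webdriver.Chrome(options=chrome_options)
driver.get('https://www.51job.com/')
print(driver.title)   # a non-empty title means chromedriver is working
driver.quit()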

2. Examining the search pages and their URL pattern

Go to https://www.51job.com/ and search for any job keyword, for example '大数据' (big data):
Compare the URLs of the first few result pages:
Page 1:

https://search.51job.com/list/000000,000000,0000,00,9,99,大数据,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=

Page 2:

https://search.51job.com/list/000000,000000,0000,00,9,99,大数据,2,2.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=

Page 3:

https://search.51job.com/list/000000,000000,0000,00,9,99,大数据,2,3.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=

The pattern is clear: the search keyword and the page number are embedded directly in the URL path (...,99,&lt;keyword&gt;,2,&lt;page&gt;.html). The URL template in code can therefore be written as:

'https://search.51job.com/list/000000,000000,0000,00,9,99,' + key + ',2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format(i)
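As a quick sanity check, this template can be expanded into concrete page URLs (a minimal sketch; the page range 1-3 is arbitrary, and the Chinese keyword is percent-encoded automatically when the URL is requested):

key = '大数据'
url_template = ('https://search.51job.com/list/000000,000000,0000,00,9,99,' + key +
                ',2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99'
                '&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1'
                '&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line='
                '&specialarea=00&from=&welfare=')

# The {} placeholder takes the page number
for page in range(1, 4):
    print(url_template.format(page))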

3. Analyzing the job pages

On the results page, the link to each individual job posting can be located with the following XPath:

//*[@class="j_joblist"]/div/a/@href
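For example, all job links on the first results page can be collected like this (a minimal sketch reusing the page-1 URL from above):

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_options)
driver.get('https://search.51job.com/list/000000,000000,0000,00,9,99,大数据,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=')

selector = etree.HTML(driver.page_source)
driver.quit()

# Each <a> under the j_joblist container links to one posting
job_urls = selector.xpath('//*[@class="j_joblist"]/div/a/@href')
print(len(job_urls), 'job links found')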

After opening an individual job posting:
In the same way we can lock onto the job title or the salary. For example, the job title (here '大数据架构师(JS)', Big Data Architect):

//*[@class="tHeader tHjob"]/div/div/h1/@title

and the salary (here '1.5-3万/月', i.e. 15k-30k RMB/month):

//*[@class="tHeader tHjob"]/div/div/strong/text()
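Combining these two XPaths, the headline fields of a posting can be read off its rendered page. A minimal sketch (the detail URL is a hypothetical placeholder; xpath() returns a list, so the first hit is taken if present):

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_options)
driver.get('https://jobs.51job.com/shanghai/123456789.html')   # hypothetical detail-page URL

selector = etree.HTML(driver.page_source)
driver.quit()

titles = selector.xpath('//*[@class="tHeader tHjob"]/div/div/h1/@title')
salaries = selector.xpath('//*[@class="tHeader tHjob"]/div/div/strong/text()')
print(titles[0] if titles else '(no title)', salaries[0] if salaries else '(no salary)')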

4. Full code

from lxml import etree
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

"""创建csv文件,输入头标题"""
fp = open('51job.csv', 'wt', newline='', encoding='GBK', errors='ignore')
writer = csv.writer(fp)
writer.writerow(('Job title', 'Salary', 'Company', 'Company info', 'Company address', 'Location', 'Experience', 'Education', 'Openings', 'Posted', 'Job description'))
j = 1   # running count of scraped postings

def parseInfo(url):
    """获取具体岗位的基本信息"""
    global j

    # Run Chrome headless so no window pops up; note that a fresh
    # browser is launched for every posting (simple, but slow)
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)

    # Load the page
    driver.get(url)

    # Parse the rendered HTML so it can be queried with XPath
    selector = etree.HTML(driver.page_source)

    # Extract the fields; xpath() returns a list, so str(...)[2:-2]
    # strips the [' '] wrapper, then \xa0 entities are removed
    title = str(selector.xpath('//*[@class="tHeader tHjob"]/div/div/h1/@title'))[2:-2].replace('\\xa0', '')
    salary = str(selector.xpath('//*[@class="tHeader tHjob"]/div/div/strong/text()'))[2:-2].replace('\\xa0', '')
    company = str(selector.xpath('//*[@class="tCompany_sidebar"]/div/div/a/p/text()'))[2:-2].replace('\\xa0', '')
    companyinfo = selector.xpath('//*[@class="tCompany_sidebar"]/div/div/p/@title')
    companyinfo = '|'.join(companyinfo)     # join the list items into a single string
    companyplace = str(selector.xpath('//*[@class="tCompany_main"]/div[2]/div/p/text()'))[2:-2].replace('\\xa0', '')

    place = str(selector.xpath('//*[@class="tHeader tHjob"]/div/div/p[2]/text()[1]'))[2:-2].replace('\\xa0', '')
    exp = str(selector.xpath('//*[@class="tHeader tHjob"]/div/div/p[2]/text()[2]'))[2:-2].replace('\\xa0', '')
    edu = str(selector.xpath('//*[@class="tHeader tHjob"]/div/div/p[2]/text()[3]'))[2:-2].replace('\\xa0', '')
    num = str(selector.xpath('//*[@class="tHeader tHjob"]/div/div/p[2]/text()[4]'))[2:-2].replace('\\xa0', '')
    time = str(selector.xpath('//*[@class="tHeader tHjob"]/div/div/p[2]/text()[5]'))[2:-2].replace('\\xa0', '')
    info = selector.xpath('//*[@class="tCompany_main"]/div[1]/div/p/text()')
    info = ''.join(info)

    print("*"*100)
    print("-" * 50)
    print("第" + str(j) + "条")
    j += 1
    print("-" * 50)
    print(title, salary, company, companyinfo, companyplace, place, exp, edu, num, time, info)
    print("-"*100)
    writer.writerow((title, salary, company, companyinfo, companyplace, place, exp, edu, num, time, info))

    # Quit the browser
    driver.quit()

def getUrl(url):
    """从搜索页面中获取具体岗位的url"""

    # Run Chrome headless so no window pops up
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)

    # Load the search results page
    driver.get(url)

    selector = etree.HTML(driver.page_source)
    urls = selector.xpath('//*[@class="j_joblist"]/div/a/@href')
    driver.quit()   # shut down the driver

    for url in urls:
        print(url)
        parseInfo(url)

if __name__ == '__main__':
    key = '大数据'
    urls = [
        'https://search.51job.com/list/000000,000000,0000,00,9,99,' + key + ',2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format(
            i) for i in range(1, 6)]   # pages 1-5 to scrape
    for i, url in enumerate(urls, start=1):
        print("-" * 50)
        print("Page " + str(i))
        print("-" * 50)
        getUrl(url)

5. Running the code

Console output: (screenshot omitted)

Generated CSV file: (screenshot omitted)
(Note: not every job page uses exactly the same layout, so some fields may come back incomplete and the XPaths may need adjusting; see the sketch below.)
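One way to make the scraper more tolerant of these layout differences is to wrap every XPath lookup in a small fallback helper instead of slicing str(...) results. A minimal sketch (the helper name first_or_default is hypothetical, not from the original code):

def first_or_default(selector, xpath, default=''):
    """Return the first XPath match with non-breaking spaces stripped, or a default."""
    hits = selector.xpath(xpath)
    return hits[0].replace('\xa0', '').strip() if hits else default

# Drop-in usage inside parseInfo:
# title = first_or_default(selector, '//*[@class="tHeader tHjob"]/div/div/h1/@title')
# salary = first_or_default(selector, '//*[@class="tHeader tHjob"]/div/div/strong/text()')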
