实习生实习(校招)岗位信息爬取(2022.11.28可用)

实习生实习岗位信息爬取

一、准备工作

本文参考博客 https://blog.csdn.net/weixin_45929716/article/details/119920428 ,一些准备工作请先移步至该博文,尊重原创。该博文代码可直接使用,本文只是在该文的基础上加入了对岗位下一级 url 的内容提取,例如岗位的一些要求。

二、完整代码

import requests
import xlwt
import urllib.parse
from lxml import etree
import re
from fontTools.ttLib import TTFont

# Dump the site's anti-scraping web font (file.ttf must be downloaded from
# the site beforehand) to XML so its cmap table can be parsed by get_dict().
font = TTFont("file.ttf")
font.saveXML("font.xml")


def get_dict():
    """Build a map from font entity codes to the real characters they render.

    The site obfuscates text with a custom web font; ``font.xml`` is the
    fontTools XML dump of that font, whose ``<map>`` entries pair an entity
    code (e.g. ``0xefed``) with a glyph name carrying the true code point
    (e.g. ``uni4E00``).

    Returns:
        dict[str, str]: mapping like ``{'0xefed': '一', ...}``.
    """
    with open('font.xml', encoding='utf-8') as f:
        xml = f.read()

    # Extract the entity code and the hex code point from each <map> entry.
    keys = re.findall('<map code="(0x.*?)" name="uni.*?"/>', xml)
    values = re.findall('<map code="0x.*?" name="uni(.*?)"/>', xml)

    word_dict = {}
    for key, value in zip(keys, values):
        # Decode the hex code point directly.  (Bug fix: the previous
        # "'\\u00' + value" unicode_escape trick only worked for 2- and
        # 4-digit names — \u requires exactly four hex digits, so 3-digit
        # names decoded the wrong character and 1-digit names raised.)
        word_dict[key] = chr(int(value, 16))
    print(word_dict)
    return word_dict


# NOTE(review): this shadows the builtin `dict`; spider_sxs() below reads
# this module-level name, so renaming it would require updating that function.
dict = get_dict()

# Read the job keyword to search for and URL-encode it for the query string.
job = input('请输入你要在实习僧爬取的实习岗位名称:')
job_urlencode = urllib.parse.quote(job)


def spider_sxs():
    """Scrape internship postings from shixiseng.com into an .xls workbook.

    Uses the module-level ``job``/``job_urlencode`` (search keyword) and
    ``dict`` (font de-obfuscation map built by get_dict).  Prompts for the
    number of result pages, fetches each listing card plus its detail page,
    and writes one Excel row per posting.  Saves ``实习僧{job}岗位.xls``.
    """
    # Create the workbook and write the header row.
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet1 = workbook.add_sheet('{}'.format(job))
    headers = ['职位名称', '工资', '城市', '出勤要求', '实习周期', '职位福利',
               '公司名称', '所属行业', '公司规模', '投递链接', '学历要求', '职位要求']
    for col, title in enumerate(headers):
        sheet1.write(0, col, title)

    # Column widths: xlwt measures width in 1/256 of a character.
    for col, width in enumerate([30, 20, 10, 15, 15, 60, 20, 20, 15, 30, 30]):
        sheet1.col(col).width = 256 * width

    sheet1_row = 0
    for i in range(1, int(input('请输入要爬取{}岗位的页数:'.format(job))) + 1):
        url = 'https://www.shixiseng.com/interns?page={}&type=intern&keyword={}'.format(
            i, job_urlencode)
        print('第{}页的链接是:{}'.format(i, url))

        response = requests.get(url, timeout=10)
        # Turn HTML entities &#xefed; into the 0xefed keys of the font map,
        # then substitute the real characters they stand for.
        response_text = response.text.replace('&#', '0')
        for key in dict:
            response_text = response_text.replace(key, dict[key])

        html_sxs = etree.HTML(response_text)
        all_div = html_sxs.xpath(
            '//*[@id="__layout"]/div/div[2]/div[2]/div[1]/div[1]/div[1]//div[@class="intern-wrap intern-item"]')

        for item in all_div:
            try:
                # Follow the card's link to the posting's detail page.
                next_url = item.xpath('.//a/@href')[0]
                item1 = etree.HTML(requests.get(next_url, timeout=10).text)

                try:
                    job_name = item1.xpath('.//div[@class="new_job_name"]/span/text()')[0]
                    wages = item1.xpath('.//span[@class="job_money cutom_font"]/text()')[0]
                    city = item1.xpath('.//span[@class="job_position"]/text()')[0]
                    academic = item1.xpath('.//span[@class="job_academic"]/text()')[0]
                    week_time = item1.xpath('.//span[@class="job_week cutom_font"]/text()')[0]
                    work_time = item1.xpath('.//span[@class="job_time cutom_font"]/text()')[0]
                    job_detail = item1.xpath('.//div[@class="job_detail"]//text()')
                    job_welfare = item1.xpath('.//div[@class="job_good_list"]//text()')
                    # De-duplicate the stripped detail lines while keeping
                    # their original order; drop empty fragments.
                    lists = [x.strip() for x in job_detail]
                    job_detail = list(set(lists))
                    job_detail.sort(key=lists.index)
                    if '' in job_detail:
                        job_detail.remove('')
                except IndexError:
                    # Detail page is missing an expected field: skip this
                    # posting.  (Bug fix: the old bare except left variables
                    # undefined, producing blank rows via a later NameError.)
                    continue

                company_name = item.xpath('.//a[@class="title ellipsis"]/text()')[0]
                company_type = item.xpath('.//span[@class="ellipsis"]/text()')[0]
                company_size = item.xpath('.//span[@class="font"]/text()')[2]
                job_href = item.xpath('.//a[@class="title ellipsis font"]/@href')[0]

                # Write the row.  (Bug fix: xlwt cannot write Python lists,
                # so 职位福利/职位要求 are joined into strings — previously the
                # write raised and the whole row was silently dropped.)
                sheet1_row += 1
                row = [job_name, wages, city, week_time, work_time,
                       '\n'.join(x.strip() for x in job_welfare if x.strip()),
                       company_name, company_type, company_size, job_href,
                       academic, '\n'.join(job_detail)]
                for col, value in enumerate(row):
                    sheet1.write(sheet1_row, col, value)

            except Exception:
                # Card is missing expected fields or the request failed;
                # best-effort: skip this card and keep crawling.
                pass

    workbook.save('实习僧{}岗位.xls'.format(job))
    print('爬取成功')
    print('------------------------------------------------------')


spider_sxs()  # runs the crawl immediately when the script is executed


三、操作流程

直接运行后,在控制台输入你想爬取的岗位名称,回车后再输入你想爬取的页数即可。爬取完成后,在项目文件夹内就能看见最终的 excel 文件。

实习僧校招岗位爬取

由于实习僧主要是为实习服务,所以校招信息较少,可能会有需要的朋友,在此也将代码完整给出,操作流程同实习的爬取。

完整代码

import requests
import xlwt
import urllib.parse
from lxml import etree
import re
from fontTools.ttLib import TTFont

# Dump the site's anti-scraping web font (file.ttf must be downloaded from
# the site beforehand) to XML so its cmap table can be parsed by get_dict().
font = TTFont("file.ttf")
font.saveXML("font.xml")


def get_dict():
    """Build a map from font entity codes to the real characters they render.

    The site obfuscates text with a custom web font; ``font.xml`` is the
    fontTools XML dump of that font, whose ``<map>`` entries pair an entity
    code (e.g. ``0xefed``) with a glyph name carrying the true code point
    (e.g. ``uni4E00``).

    Returns:
        dict[str, str]: mapping like ``{'0xefed': '一', ...}``.
    """
    with open('font.xml', encoding='utf-8') as f:
        xml = f.read()

    # Extract the entity code and the hex code point from each <map> entry.
    keys = re.findall('<map code="(0x.*?)" name="uni.*?"/>', xml)
    values = re.findall('<map code="0x.*?" name="uni(.*?)"/>', xml)

    word_dict = {}
    for key, value in zip(keys, values):
        # Decode the hex code point directly.  (Bug fix: the previous
        # "'\\u00' + value" unicode_escape trick only worked for 2- and
        # 4-digit names — \u requires exactly four hex digits, so 3-digit
        # names decoded the wrong character and 1-digit names raised.)
        word_dict[key] = chr(int(value, 16))
    print(word_dict)
    return word_dict


# NOTE(review): this shadows the builtin `dict`; spider_sxs() below reads
# this module-level name, so renaming it would require updating that function.
dict = get_dict()

# Read the job keyword to search for and URL-encode it for the query string.
job = input('请输入你要在实习僧爬取的实习岗位名称:')
job_urlencode = urllib.parse.quote(job)


def spider_sxs():
    """Scrape campus-recruiting postings from shixiseng.com into an .xls file.

    Uses the module-level ``job``/``job_urlencode`` (search keyword) and
    ``dict`` (font de-obfuscation map built by get_dict).  Prompts for the
    number of result pages, fetches each listing card plus its detail page,
    and writes one Excel row per posting.  Saves ``实习僧{job}校招岗位.xls``.
    """
    # Create the workbook and write the header row.
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet1 = workbook.add_sheet('{}'.format(job))
    headers = ['职位名称', '工资', '城市', '学历要求', '职位描述', '职位福利',
               '公司名称', '所属行业', '公司规模', '投递链接', '职位要求']
    for col, title in enumerate(headers):
        sheet1.write(0, col, title)

    # Column widths: xlwt measures width in 1/256 of a character.
    for col, width in enumerate([30, 20, 10, 15, 15, 60, 20, 20, 15, 30, 30]):
        sheet1.col(col).width = 256 * width

    sheet1_row = 0
    for i in range(1, int(input('请输入要爬取{}岗位的页数:'.format(job))) + 1):
        # type=school selects campus-recruiting (not internship) results.
        url = 'https://www.shixiseng.com/interns?page={}&type=school&keyword={}'.format(
            i, job_urlencode)
        print('第{}页的链接是:{}'.format(i, url))

        response = requests.get(url, timeout=10)
        # Turn HTML entities &#xefed; into the 0xefed keys of the font map,
        # then substitute the real characters they stand for.
        response_text = response.text.replace('&#', '0')
        for key in dict:
            response_text = response_text.replace(key, dict[key])

        html_sxs = etree.HTML(response_text)
        all_div = html_sxs.xpath(
            '//*[@id="__layout"]/div/div[2]/div[2]/div[1]/div[2]/div[1]//div[@class="intern-wrap intern-item"]')

        for item in all_div:
            try:
                # Follow the card's link to the posting's detail page.
                next_url = item.xpath('.//a/@href')[0]
                item1 = etree.HTML(requests.get(next_url, timeout=10).text)

                try:
                    job_name = item1.xpath('.//div[@class="new_job_name"]/span/text()')[0]
                    wages = item1.xpath('.//span[@class="job_money cutom_font"]/text()')[0]
                    city = item1.xpath('.//span[@class="job_position"]/text()')[0]
                    academic = item1.xpath('.//span[@class="job_academic cutom_font"]/text()')[0]
                    job_description = item1.xpath('.//div[@class="job_description"]//text()')
                    job_detail = item1.xpath('.//div[@class="job_detail"]//text()')
                    job_welfare = item1.xpath('.//div[@class="job_good_list"]//text()')
                    # De-duplicate the stripped detail lines while keeping
                    # their original order; drop empty fragments.
                    lists = [x.strip() for x in job_detail]
                    job_detail = list(set(lists))
                    job_detail.sort(key=lists.index)
                    if '' in job_detail:
                        job_detail.remove('')
                except IndexError:
                    # Detail page is missing an expected field: skip this
                    # posting.  (Bug fix: the old bare except left variables
                    # undefined, producing blank rows via a later NameError.)
                    continue

                company_name = item.xpath('.//a[@class="title ellipsis"]/text()')[0]
                company_type = item.xpath('.//span[@class="ellipsis"]/text()')[0]
                company_size = item.xpath('.//span[@class="font"]/text()')[2]
                job_href = item.xpath('.//a[@class="title ellipsis font"]/@href')[0]

                # Write the row.  (Bug fix: xlwt cannot write Python lists, so
                # 职位描述/职位福利/职位要求 are joined into strings — previously
                # the write raised and the whole row was silently dropped.)
                sheet1_row += 1
                row = [job_name, wages, city, academic,
                       '\n'.join(x.strip() for x in job_description if x.strip()),
                       '\n'.join(x.strip() for x in job_welfare if x.strip()),
                       company_name, company_type, company_size, job_href,
                       '\n'.join(job_detail)]
                for col, value in enumerate(row):
                    sheet1.write(sheet1_row, col, value)

            except Exception:
                # Card is missing expected fields or the request failed;
                # best-effort: skip this card and keep crawling.
                pass

    workbook.save('实习僧{}校招岗位.xls'.format(job))
    print('爬取成功')
    print('------------------------------------------------------')


spider_sxs()  # runs the crawl immediately when the script is executed


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值