实习生实习(校招)岗位信息爬取(2022.11.28可用)

实习生实习岗位信息爬取

一、准备工作

本文参考博客 https://blog.csdn.net/weixin_45929716/article/details/119920428 ,一些准备工作请先移步至该博文,尊重原创。该博文代码可直接使用,本文只是在该文的基础上加入了对岗位下一级 url 的内容提取,例如岗位的一些要求。

二、完整代码

import requests
import xlwt
import urllib.parse
from lxml import etree
import re
from fontTools.ttLib import TTFont

# Dump the site's anti-scraping web font (file.ttf must be downloaded from
# the site beforehand) to XML so its cmap table can be parsed by get_dict().
font = TTFont("file.ttf")
font.saveXML("font.xml")


def get_dict():
    """Build a map from font entity codes to the real characters they render.

    The site obfuscates text with a custom web font; ``font.xml`` is the
    fontTools XML dump of that font, whose ``<map>`` entries pair an entity
    code (e.g. ``0xefed``) with a glyph name carrying the true code point
    (e.g. ``uni4E00``).

    Returns:
        dict[str, str]: mapping like ``{'0xefed': '一', ...}``.
    """
    with open('font.xml', encoding='utf-8') as f:
        xml = f.read()

    # Extract the entity code and the hex code point from each <map> entry.
    keys = re.findall('<map code="(0x.*?)" name="uni.*?"/>', xml)
    values = re.findall('<map code="0x.*?" name="uni(.*?)"/>', xml)

    word_dict = {}
    for key, value in zip(keys, values):
        # Decode the hex code point directly.  (Bug fix: the previous
        # "'\\u00' + value" unicode_escape trick only worked for 2- and
        # 4-digit names — \u requires exactly four hex digits, so 3-digit
        # names decoded the wrong character and 1-digit names raised.)
        word_dict[key] = chr(int(value, 16))
    print(word_dict)
    return word_dict


# NOTE(review): this shadows the builtin `dict`; spider_sxs() below reads
# this module-level name, so renaming it would require updating that function.
dict = get_dict()

# Read the job keyword to search for and URL-encode it for the query string.
job = input('请输入你要在实习僧爬取的实习岗位名称:')
job_urlencode = urllib.parse.quote(job)


def spider_sxs():
    """Scrape internship postings from shixiseng.com into an .xls workbook.

    Uses the module-level ``job``/``job_urlencode`` (search keyword) and
    ``dict`` (font de-obfuscation map built by get_dict).  Prompts for the
    number of result pages, fetches each listing card plus its detail page,
    and writes one Excel row per posting.  Saves ``实习僧{job}岗位.xls``.
    """
    # Create the workbook and write the header row.
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet1 = workbook.add_sheet('{}'.format(job))
    headers = ['职位名称', '工资', '城市', '出勤要求', '实习周期', '职位福利',
               '公司名称', '所属行业', '公司规模', '投递链接', '学历要求', '职位要求']
    for col, title in enumerate(headers):
        sheet1.write(0, col, title)

    # Column widths: xlwt measures width in 1/256 of a character.
    for col, width in enumerate([30, 20, 10, 15, 15, 60, 20, 20, 15, 30, 30]):
        sheet1.col(col).width = 256 * width

    sheet1_row = 0
    for i in range(1, int(input('请输入要爬取{}岗位的页数:'.format(job))) + 1):
        url = 'https://www.shixiseng.com/interns?page={}&type=intern&keyword={}'.format(
            i, job_urlencode)
        print('第{}页的链接是:{}'.format(i, url))

        response = requests.get(url, timeout=10)
        # Turn HTML entities &#xefed; into the 0xefed keys of the font map,
        # then substitute the real characters they stand for.
        response_text = response.text.replace('&#', '0')
        for key in dict:
            response_text = response_text.replace(key, dict[key])

        html_sxs = etree.HTML(response_text)
        all_div = html_sxs.xpath(
            '//*[@id="__layout"]/div/div[2]/div[2]/div[1]/div[1]/div[1]//div[@class="intern-wrap intern-item"]')

        for item in all_div:
            try:
                # Follow the card's link to the posting's detail page.
                next_url = item.xpath('.//a/@href')[0]
                item1 = etree.HTML(requests.get(next_url, timeout=10).text)

                try:
                    job_name = item1.xpath('.//div[@class="new_job_name"]/span/text()')[0]
                    wages = item1.xpath('.//span[@class="job_money cutom_font"]/text()')[0]
                    city = item1.xpath('.//span[@class="job_position"]/text()')[0]
                    academic = item1.xpath('.//span[@class="job_academic"]/text()')[0]
                    week_time = item1.xpath('.//span[@class="job_week cutom_font"]/text()')[0]
                    work_time = item1.xpath('.//span[@class="job_time cutom_font"]/text()')[0]
                    job_detail = item1.xpath('.//div[@class="job_detail"]//text()')
                    job_welfare = item1.xpath('.//div[@class="job_good_list"]//text()')
                    # De-duplicate the stripped detail lines while keeping
                    # their original order; drop empty fragments.
                    lists = [x.strip() for x in job_detail]
                    job_detail = list(set(lists))
                    job_detail.sort(key=lists.index)
                    if '' in job_detail:
                        job_detail.remove('')
                except IndexError:
                    # Detail page is missing an expected field: skip this
                    # posting.  (Bug fix: the old bare except left variables
                    # undefined, producing blank rows via a later NameError.)
                    continue

                company_name = item.xpath('.//a[@class="title ellipsis"]/text()')[0]
                company_type = item.xpath('.//span[@class="ellipsis"]/text()')[0]
                company_size = item.xpath('.//span[@class="font"]/text()')[2]
                job_href = item.xpath('.//a[@class="title ellipsis font"]/@href')[0]

                # Write the row.  (Bug fix: xlwt cannot write Python lists,
                # so 职位福利/职位要求 are joined into strings — previously the
                # write raised and the whole row was silently dropped.)
                sheet1_row += 1
                row = [job_name, wages, city, week_time, work_time,
                       '\n'.join(x.strip() for x in job_welfare if x.strip()),
                       company_name, company_type, company_size, job_href,
                       academic, '\n'.join(job_detail)]
                for col, value in enumerate(row):
                    sheet1.write(sheet1_row, col, value)

            except Exception:
                # Card is missing expected fields or the request failed;
                # best-effort: skip this card and keep crawling.
                pass

    workbook.save('实习僧{}岗位.xls'.format(job))
    print('爬取成功')
    print('------------------------------------------------------')


spider_sxs()  # runs the crawl immediately when the script is executed


三、操作流程

直接运行后,在控制台输入你想爬取的岗位名称,回车后再输入你想爬取的页数即可。爬取完成后,在项目文件夹内就能看见最终的 excel 文件。

实习僧校招岗位爬取

由于实习僧主要是为实习服务,所以校招信息较少,可能会有需要的朋友,在此也将代码完整给出,操作流程同实习的爬取。

完整代码

import requests
import xlwt
import urllib.parse
from lxml import etree
import re
from fontTools.ttLib import TTFont

# Dump the site's anti-scraping web font (file.ttf must be downloaded from
# the site beforehand) to XML so its cmap table can be parsed by get_dict().
font = TTFont("file.ttf")
font.saveXML("font.xml")


def get_dict():
    """Build a map from font entity codes to the real characters they render.

    The site obfuscates text with a custom web font; ``font.xml`` is the
    fontTools XML dump of that font, whose ``<map>`` entries pair an entity
    code (e.g. ``0xefed``) with a glyph name carrying the true code point
    (e.g. ``uni4E00``).

    Returns:
        dict[str, str]: mapping like ``{'0xefed': '一', ...}``.
    """
    with open('font.xml', encoding='utf-8') as f:
        xml = f.read()

    # Extract the entity code and the hex code point from each <map> entry.
    keys = re.findall('<map code="(0x.*?)" name="uni.*?"/>', xml)
    values = re.findall('<map code="0x.*?" name="uni(.*?)"/>', xml)

    word_dict = {}
    for key, value in zip(keys, values):
        # Decode the hex code point directly.  (Bug fix: the previous
        # "'\\u00' + value" unicode_escape trick only worked for 2- and
        # 4-digit names — \u requires exactly four hex digits, so 3-digit
        # names decoded the wrong character and 1-digit names raised.)
        word_dict[key] = chr(int(value, 16))
    print(word_dict)
    return word_dict


# NOTE(review): this shadows the builtin `dict`; spider_sxs() below reads
# this module-level name, so renaming it would require updating that function.
dict = get_dict()

# Read the job keyword to search for and URL-encode it for the query string.
job = input('请输入你要在实习僧爬取的实习岗位名称:')
job_urlencode = urllib.parse.quote(job)


def spider_sxs():
    """Scrape campus-recruiting postings from shixiseng.com into an .xls file.

    Uses the module-level ``job``/``job_urlencode`` (search keyword) and
    ``dict`` (font de-obfuscation map built by get_dict).  Prompts for the
    number of result pages, fetches each listing card plus its detail page,
    and writes one Excel row per posting.  Saves ``实习僧{job}校招岗位.xls``.
    """
    # Create the workbook and write the header row.
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet1 = workbook.add_sheet('{}'.format(job))
    headers = ['职位名称', '工资', '城市', '学历要求', '职位描述', '职位福利',
               '公司名称', '所属行业', '公司规模', '投递链接', '职位要求']
    for col, title in enumerate(headers):
        sheet1.write(0, col, title)

    # Column widths: xlwt measures width in 1/256 of a character.
    for col, width in enumerate([30, 20, 10, 15, 15, 60, 20, 20, 15, 30, 30]):
        sheet1.col(col).width = 256 * width

    sheet1_row = 0
    for i in range(1, int(input('请输入要爬取{}岗位的页数:'.format(job))) + 1):
        # type=school selects campus-recruiting (not internship) results.
        url = 'https://www.shixiseng.com/interns?page={}&type=school&keyword={}'.format(
            i, job_urlencode)
        print('第{}页的链接是:{}'.format(i, url))

        response = requests.get(url, timeout=10)
        # Turn HTML entities &#xefed; into the 0xefed keys of the font map,
        # then substitute the real characters they stand for.
        response_text = response.text.replace('&#', '0')
        for key in dict:
            response_text = response_text.replace(key, dict[key])

        html_sxs = etree.HTML(response_text)
        all_div = html_sxs.xpath(
            '//*[@id="__layout"]/div/div[2]/div[2]/div[1]/div[2]/div[1]//div[@class="intern-wrap intern-item"]')

        for item in all_div:
            try:
                # Follow the card's link to the posting's detail page.
                next_url = item.xpath('.//a/@href')[0]
                item1 = etree.HTML(requests.get(next_url, timeout=10).text)

                try:
                    job_name = item1.xpath('.//div[@class="new_job_name"]/span/text()')[0]
                    wages = item1.xpath('.//span[@class="job_money cutom_font"]/text()')[0]
                    city = item1.xpath('.//span[@class="job_position"]/text()')[0]
                    academic = item1.xpath('.//span[@class="job_academic cutom_font"]/text()')[0]
                    job_description = item1.xpath('.//div[@class="job_description"]//text()')
                    job_detail = item1.xpath('.//div[@class="job_detail"]//text()')
                    job_welfare = item1.xpath('.//div[@class="job_good_list"]//text()')
                    # De-duplicate the stripped detail lines while keeping
                    # their original order; drop empty fragments.
                    lists = [x.strip() for x in job_detail]
                    job_detail = list(set(lists))
                    job_detail.sort(key=lists.index)
                    if '' in job_detail:
                        job_detail.remove('')
                except IndexError:
                    # Detail page is missing an expected field: skip this
                    # posting.  (Bug fix: the old bare except left variables
                    # undefined, producing blank rows via a later NameError.)
                    continue

                company_name = item.xpath('.//a[@class="title ellipsis"]/text()')[0]
                company_type = item.xpath('.//span[@class="ellipsis"]/text()')[0]
                company_size = item.xpath('.//span[@class="font"]/text()')[2]
                job_href = item.xpath('.//a[@class="title ellipsis font"]/@href')[0]

                # Write the row.  (Bug fix: xlwt cannot write Python lists, so
                # 职位描述/职位福利/职位要求 are joined into strings — previously
                # the write raised and the whole row was silently dropped.)
                sheet1_row += 1
                row = [job_name, wages, city, academic,
                       '\n'.join(x.strip() for x in job_description if x.strip()),
                       '\n'.join(x.strip() for x in job_welfare if x.strip()),
                       company_name, company_type, company_size, job_href,
                       '\n'.join(job_detail)]
                for col, value in enumerate(row):
                    sheet1.write(sheet1_row, col, value)

            except Exception:
                # Card is missing expected fields or the request failed;
                # best-effort: skip this card and keep crawling.
                pass

    workbook.save('实习僧{}校招岗位.xls'.format(job))
    print('爬取成功')
    print('------------------------------------------------------')


spider_sxs()  # runs the crawl immediately when the script is executed


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值