关于51job网站的爬虫
一、原由
学校里有要求每日收集相应就业信息内容
二、查看51job网站内容
三、部分代码展示
def create_excel(excel_data):
    """Dump *excel_data* into a new workbook named "<month><day><college>.xlsx".

    Relies on the module-level ``college`` string for the file name and on
    xlwings (``xw``) driving a local Excel instance.
    """
    localtime = time.localtime(time.time())
    name = str(localtime.tm_mon) + str(localtime.tm_mday) + college + ".xlsx"
    app = xw.App(visible=True, add_book=False)
    try:
        # Create a fresh workbook.
        wb = app.books.add()
        # NOTE(review): assumes the default sheet is named 'sheet1' — confirm;
        # Excel normally creates 'Sheet1'.
        sht = wb.sheets['sheet1']
        # 'table' expansion writes the whole 2-D list starting at A1.
        sht.range('A1').options(expand='table').value = excel_data
        print(sht.range('A1').value)
        wb.save(name)
        # Close the workbook.
        wb.close()
    finally:
        # Always quit Excel, even when writing or saving fails, so no orphan
        # EXCEL.EXE process is left behind (the original leaked it on error).
        app.quit()
    return
def write_excel(excels):
    """Print every value of a two-dimensional table, one per line.

    (Despite the name, this only prints — it does not write a file.)
    """
    for cell in (value for row in excels for value in row):
        print(cell)
    return
# 上海 软件工程
def get_html(job_pos, search):
    """Fetch one 51job search-result page and collect job-detail URLs.

    ``job_pos`` is a key into the module-level ``pos`` table; ``search`` is the
    raw keyword (quoted here before being handed to ``get_url``).  Appends each
    cleaned URL to the module-level ``job_list`` and returns it.
    """
    target_url = get_url(job_pos, 3, urllib.parse.quote(search))
    print(target_url)
    response = request.urlopen(target_url)
    # 51job serves its pages in GBK, not UTF-8.
    html_doc = response.read().decode('gbk')
    soup = BeautifulSoup(html_doc, 'html.parser')
    scripts = soup.find_all("script")
    # The result set is embedded as JSON in an inline <script>:
    #   window.__SEARCH_RESULT__ = {...}
    # Dots are escaped now — the original pattern's bare '.' matched any char.
    ans = re.findall(r"window\.__SEARCH_RESULT__\s*=\s*({.*})", scripts.__str__())
    # Keep only links ending in t=0 and strip the JSON backslash escapes.
    ans = re.findall(r"job_href\":\"(https:[^\"]*t=0)", ans[0])
    for an in ans:
        job_list.append(str(an).replace("\\", ""))
    return job_list
# Scrape per-job details from each posting page and build the spreadsheet rows.
def get_information(aim_websites):
    """For up to 10 job-detail URLs, scrape company/job fields and append one
    row per job to the module-level ``job_information_list``.

    The first appended row is ``type_table`` (the header).  Relies on module
    globals ``job_information_list``, ``type_table``, ``aim_create`` and
    ``college``.  Returns ``job_information_list``.
    """
    i = 1
    # Header row first.
    job_information_list.append(type_table)
    for website in aim_websites:
        job_information = []
        # Only the first 10 postings are collected.
        if i == 11:
            break
        # Example: "https://jobs.51job.com/shanghai-ypq/125300004.html?s=01&t=0"
        response = request.urlopen(website)
        # 51job pages are GBK-encoded.
        html_doc = response.read().decode('gbk')
        soup = BeautifulSoup(html_doc, 'html.parser')
        # Company name
        company_name = soup.find(class_="com_msg").p['title']
        # Company type
        company_type = soup.find(class_="com_tag").p['title']
        # Job posting block
        cn = soup.find(class_="cn")
        job_name = cn.h1['title']
        # '|'-separated summary line (location | ... | degree | headcount | ...).
        stuff_info = soup.find(class_="cn").find(class_="msg ltype")['title']
        ans = stuff_info.split("|")
        # Build the cleaned field list.
        detail = []
        # NOTE(review): appending the header to ``aim_create`` on EVERY loop
        # iteration looks unintentional — confirm against the rest of the file.
        aim_create.append(type_table)
        for an in ans:
            # Strip non-breaking spaces from each field.
            detail.append(str(an).replace(u'\xa0', u''))
        print('--------------------' + str(i) + '----------------------')
        # Sequence number
        job_information.append(i)
        i += 1
        # Category (module-level ``college``)
        job_information.append(college)
        # Creation date, formatted YYYY/M/D
        localtime = time.localtime(time.time())
        name = str(localtime.tm_year) + '/' + str(localtime.tm_mon) + '/' + str(localtime.tm_mday)
        job_information.append(name)
        # Expiry date (left blank)
        job_information.append('')
        # Work location — presumably the first '|'-separated field; the author
        # notes field positions are not always correct.
        job_information.append(detail[0])
        # Company name
        job_information.append(company_name)
        # Link address
        job_information.append(website)
        # Job title
        job_information.append(job_name)
        # Recommended flag (left blank)
        job_information.append('')
        # Company nature
        job_information.append(company_type)
        # Job nature (hard-coded to "internship")
        job_information.append('实习')
        # Education background — presumably the third '|'-separated field.
        job_information.append(detail[2])
        # Good credit
        job_information.append('1')
        # Bad credit (left blank)
        job_information.append('')
        # Fortune-500 flag (left blank)
        job_information.append('')
        # Listed-company flag (left blank)
        job_information.append('')
        # More than 200 employees
        job_information.append('1')
        # Fewer than 200 employees (left blank)
        job_information.append('')
        # Number of positions
        job_information.append('1')
        # Headcount needed — presumably the fourth '|'-separated field.
        job_information.append(detail[3])
        job_information_list.append(job_information.copy())
    return job_information_list
def get_pos():
    """Placeholder — not implemented yet; always returns None."""
    return None
# Build the 51job search URL for a position/keyword pair.
def get_url(pos_name, num, search):
    """Return the 51job list URL for ``pos_name`` (a key of the module-level
    ``pos`` table) and the keyword ``search``.

    ``num`` indexes the module-level ``company_size`` table.  The original
    ignored the argument and hard-coded ``company_size[3]``; it is honored
    now — the one visible caller passes 3, so behavior there is unchanged.

    NOTE(review): the caller (``get_html``) already URL-quotes ``search`` and
    it is quoted again here — confirm whether double-encoding is intended.
    """
    postion = pos[pos_name]
    size = company_size[num]
    # The original literal contained '°reefrom' — the HTML entity '&deg;' had
    # been rendered into the source text.  Restored to '&degreefrom='.
    target = (url + "/list/" + postion + ",000000,0000,00,9,99,"
              + urllib.parse.quote(search)
              + ",2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99"
              + "&degreefrom=99&jobterm=99&companysize=" + size
              + "&ord_field=0&dibiaoid=0&line=&welfare=")
    return target
def turn_page(source):
    """Generate the page-1..9 variants of a 51job list URL.

    51job encodes the page number as the last comma-separated field before
    ``.html`` (e.g. ``...,2,1.html``); substitute 1 through 9 into that slot.
    Returns the nine URLs as a list.
    """
    ans_list = []
    for page in range(1, 10):
        # '.' is escaped now — the original pattern's bare '.' could match
        # any character before 'html'.
        ans_list.append(re.sub(r',(\d*)\.html', "," + str(page) + ".html", source))
    return ans_list
def position(chars):
    """Concatenate the characters of *chars* into a single string.

    Bug fix: the original loop body was ``string = char + ""``, which
    overwrote the accumulator every pass, so the function returned only the
    LAST character.  The loop shape clearly intended accumulation.
    """
    return "".join(chars)
def to_unicode(string):
    """Return *string* as a run of ``\\uXXXX`` escapes (uppercase hex).

    Bug fix: code points below U+1000 are now zero-padded to four hex digits —
    the original emitted e.g. ``\\u41`` for 'A', which is not a valid escape.
    Output for CJK characters (>= U+1000) is unchanged.
    """
    return ''.join('\\u{:04X}'.format(ord(ch)) for ch in string)
四、效果展示
部分内容有误:需求人数、教育背景、发布时间等字段可能不准确,原因是这些字段在原始数据中的位置不固定(有待改进)。