3.4 【实战】爬取智联招聘信息

欧阳枫落

于 2024-09-11 15:08:10 发布

阅读量385

点赞数 3

分类专栏：Python 教学 | 文章标签：python 爬虫

本文链接：https://blog.csdn.net/weixin_44815507/article/details/142140162

版权

Python 教学专栏收录该内容

15 篇文章 6 订阅

订阅专栏

课程目标

爬取智联招聘信息

课程内容

编码实现

请求转换工具（可将浏览器中复制的 cURL 命令转换为 requests 代码）：https://www.spidertools.cn/#/curl2Request
目标链接: https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1

1. 基于playwright 获取cookie

如果第 2 步代码中写死的 cookie 已失效，可运行下面的脚本重新获取，并用打印出的字典替换第 2 步中的 cookies 变量。

from playwright.sync_api import sync_playwright
import time
# Open the search page in a visible (non-headless) Chromium window and dump the
# cookies the site sets, as a plain {name: value} dict that can be passed to
# `requests` via its `cookies=` argument.
with sync_playwright() as p:
    browser = p.chromium.launch(
        headless=False)
    try:
        context = browser.new_context()

        page = context.new_page()
        page.goto("https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1")
        # Give the page five seconds before reading the cookies. Playwright's
        # own timer is used instead of time.sleep(): the sync API relies on
        # internal asynchronous operations that a blocking sleep would stall.
        page.wait_for_timeout(5000)
        cookies = context.cookies()
        # context.cookies() returns one dict per cookie; keep only name/value.
        cookie_dict = {cookie["name"]: cookie["value"] for cookie in cookies}
    finally:
        # Always shut the browser down, even if navigation failed.
        browser.close()
    print(cookie_dict)

2. 爬取到excel

from bs4 import BeautifulSoup
import requests
import time
from tqdm import tqdm
import pandas as pd
# HTTP request headers sent with every search-page request. The values mimic a
# desktop Microsoft Edge 128 browser on Windows performing a same-origin page
# navigation. The "referer" entry is only an initial value: the scraping loop
# below overwrites it before each request.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "cache-control": "max-age=0",
    "priority": "u=0, i",
    "referer": "https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Microsoft Edge\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    
}
# Hard-coded cookies sent with every request, as a {name: value} dict.
# NOTE(review): these look like values captured from one browser session and
# will presumably expire; the article's section 1 (Playwright script) prints a
# fresh dict in the same shape that can be pasted here.
cookies = {'_uab_collina': '172603833692980471099565', 'acw_tc': '1a0c638f17260383368666189e00347e6ac270373627cb2b28b40bf7747fb6', 'acw_sc__v2': '66e1413f368267fe14db235d7102befafd592806', 'x-zp-client-id': '62556d7c-493b-4b3f-b799-4852d97e40a5', 'FSSBBIl1UgzbN7NS': '5KhbTbuE1pM96Ujhm28DlCLRGMQdDm8ev2w9yTmAl1TxcLpfH.32gn9jHWAG63XY5m.zwJzIpwjpsWQypqhJRqq', 'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%22191dfe6e4a11254-01e41e0f4f92ff8-26001151-921600-191dfe6e4a220c%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTkxZGZlNmU0YTExMjU0LTAxZTQxZTBmNGY5MmZmOC0yNjAwMTE1MS05MjE2MDAtMTkxZGZlNmU0YTIyMGMifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22191dfe6e4a11254-01e41e0f4f92ff8-26001151-921600-191dfe6e4a220c%22%7D', 'sajssdk_2015_cross_new_user': '1', 'HMACCOUNT_BFESS': '5CE1833FAA2F8083', 'Hm_lvt_21a348fada873bdc2f7f75015beeefeb': '1726038337', 'Hm_lpvt_21a348fada873bdc2f7f75015beeefeb': '1726038337', 'HMACCOUNT': '5CE1833FAA2F8083', 'locationInfo_search': '{%22code%22:%22801%22%2C%22name%22:%22%E6%88%90%E9%83%BD%22%2C%22message%22:%22%E5%8C%B9%E9%85%8D%E5%88%B0%E5%B8%82%E7%BA%A7%E7%BC%96%E7%A0%81%22}', 'FSSBBIl1UgzbN7NT': '5RXmrHCMfAsZqqqDp65MjXayj2UoH1vC_Yw1wF8F4tofi7LSoYbUF_om_EhdyKyFar.pUPc.OI1g19MJPU8CPbKI8.Vq4R8.hdQZHCwBvXRaYKUYo1Dfqp6ROvOHQT4H2sELfG01VM5i6tXFohtYVF4ZSM8pg2nMp_5_JaC43_YOU2BTFOv.E8c6bC9i1J93sERm3jiefBNCllZtt4G0f1q_6dRW6PC1QPW6e1X5oNctO5dsUGf1cOs0hW5mUWzlkWUsW5vR29qXbPFhZ4X.S6p', '1420ba6bb40c9512e9642a1f8c243891': 'bb4a9dc0-b1d0-4824-98c2-ef484a86f8a5'}
def _node_text(node):
    """Return the stripped text of a parsed tag, or "" when the tag is missing."""
    return node.text.strip() if node is not None else ""


def _item_texts(parent, class_name):
    """Return the stripped text of every ``div`` with *class_name* under *parent*.

    A missing parent (``None``) yields an empty list, so one incomplete job
    card cannot abort the whole run.
    """
    if parent is None:
        return []
    return [item.text.strip() for item in parent.find_all('div', class_=class_name)]


def _nth(values, index):
    """Return ``values[index]``, or "" when *values* is too short."""
    return values[index] if index < len(values) else ""


def _parse_job(div):
    """Extract one job card (a ``div.joblist-box__item`` tag) into a row dict.

    Every field falls back to "" (or an empty list for the skills) when the
    corresponding node is absent from the card.
    """
    job_name = _node_text(div.find("a", class_="jobinfo__name"))
    salary = _node_text(div.find("p", class_="jobinfo__salary"))

    # Skill tags are kept as a list and, as before, are not stripped.
    skill_father = div.find('div', class_='jobinfo__tag')
    skills = []
    if skill_father is not None:
        skills = [tag.text for tag in skill_father.find_all('div', class_="joblist-box__item-tag")]

    # Staff text is "name·position" or just "name". partition() splits on the
    # first separator only, so extra "·" characters stay in the position
    # instead of breaking a two-value unpack.
    staff = _node_text(div.find('div', class_="companyinfo__staff-name"))
    recruiter_name, _, recruiter_position = staff.partition('·')

    # Company tags are read positionally: financing, scale, industry.
    company_tags = _item_texts(div.find('div', class_="companyinfo__tag"), 'joblist-box__item-tag')
    financing = _nth(company_tags, 0)
    scale = _nth(company_tags, 1)
    # Take the third tag whenever it exists, not only when there are exactly three.
    industry = _nth(company_tags, 2)

    # Other-info items are read positionally: location, experience, education.
    other_items = _item_texts(div.find('div', class_='jobinfo__other-info'), 'jobinfo__other-info-item')
    # The location text is split on "·" into up to three levels.
    areas = _nth(other_items, 0).split('·')
    area_grandfather = _nth(areas, 0)
    area_pather = _nth(areas, 1)
    area_son = _nth(areas, 2)
    experience_requirement = _nth(other_items, 1)
    education_background_requirement = _nth(other_items, 2)

    return {
        "岗位名称": job_name,
        "工资": salary,
        "技能要求": skills,
        "招聘人": recruiter_name,
        "招聘人职位": recruiter_position,
        "公司融资信息": financing,
        "公司规模": scale,
        "公司行业": industry,
        "工作地点": area_son,
        "工作区域": area_pather,
        "工作大区": area_grandfather,
        # Previously extracted but never written to the output.
        "经验要求": experience_requirement,
        "学历要求": education_background_requirement,
    }


# Scrape result pages 1-5 of the search listing, one row dict per job card.
infos = []
for i in range(1, 6):
    print(f"开始爬取{i}页")
    time.sleep(1)  # pause one second between page requests
    url = f"https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p{i}"
    # The referer is the previous result page; page 1 refers to itself.
    referer = max(i - 1, 1)
    headers["referer"] = f"https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p{referer}"
    # Without a timeout requests may wait on an unresponsive server indefinitely.
    response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    divs = soup.find_all('div', class_='joblist-box__item')  # all job-card nodes
    if not divs:
        # Warn when a page yields no job cards instead of silently skipping it.
        print(f"第{i}页未解析到职位信息，cookie 可能已失效")
    for div in tqdm(divs):
        infos.append(_parse_job(div))

# Write all collected rows to a single Excel sheet without the index column.
df = pd.DataFrame(infos)
df.to_excel("智联招聘招聘信息.xlsx", index=False)