Scraping Liepin Job Postings with a Python Crawler

This crawler was adapted from the code in the following blog post:

Python爬虫实战:从猎聘网获取职位信息并存入数据库 (Python crawler in practice: fetching job postings from liepin.com and storing them in a database): https://blog.csdn.net/lhyandlwl/article/details/136968232

In short, the original post fetches job listings from Liepin's search API (the responses are JSON), parses out fields such as job title, company name, work location, and salary, and writes them to a MySQL database. It uses a read_js_code() function to execute JavaScript (via execjs) that generates the ckId parameter required by the HTTP requests, and a sync_data2db() function to persist the parsed records.

Database Table Creation

CREATE TABLE `job_detail` (
  `ID` int NOT NULL AUTO_INCREMENT,
  `job_title` varchar(200) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
  `location` varchar(200) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
  `salary_amount` varchar(200) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
  `work_experience` varchar(200) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
  `tags` varchar(800) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
  `company_name` varchar(800) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
  `industry` varchar(800) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
  `company_size` varchar(200) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
  `job_link` varchar(200) DEFAULT NULL,
  `refresh_dt` varchar(45) DEFAULT NULL,
  `Edu` varchar(45) DEFAULT NULL,
  PRIMARY KEY (`ID`),
  KEY `idx_job_detail_job_title` (`job_title`),
  KEY `idx_job_link` (`job_link`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
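
If you prefer to create the table from Python rather than a MySQL client, here is a minimal sketch. It assumes you have saved the CREATE TABLE statement above to a file named job_detail.sql next to the script, and it uses the same MySQL credentials as the crawler's db_config below:

import pymysql

# Read the DDL saved from the statement above and execute it once.
with open('job_detail.sql', encoding='utf-8') as f:
    ddl = f.read()

connection = pymysql.connect(host='127.0.0.1', user='root', password='12345678',
                             database='work_data', charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        cursor.execute(ddl)
    connection.commit()
finally:
    connection.close()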

Runnable Python Crawler Code

# -*- coding: utf-8 -*-
import time
import requests 
import execjs 
import random
import pymysql 

db_config = {
    'host': '127.0.0.1',
    'user': 'root',
    'password': '12345678',
    'database': 'work_data',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor
}

# The original post generated ckId by executing a JavaScript helper through
# execjs; the function is kept below (disabled) for reference. The request
# payload further down uses a hardcoded ckId instead.
'''
def read_js_code():
    f = open('/Users/shareit/workspace/chart_show/demo.js', encoding='utf-8')
    txt = f.read()
    js_code = execjs.compile(txt)
    ckId = js_code.call('r', 32)
    return ckId
'''
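
# Hypothetical pure-Python stand-in for the JS r(32) helper. The hardcoded
# ckId used below is a 32-character lowercase alphanumeric token, so this
# assumes (unverified) that a random token of the same shape is accepted.
def make_ck_id(length=32):
    import string
    return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length))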

def post_data():
    # read_js_code()  # enable to regenerate ckId dynamically via the JS helper above
    url = "https://api-c.liepin.com/api/com.liepin.searchfront4c.pc-search-job"
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Sec-Ch-Ua-Platform': 'macOS',
        # Content-Length is omitted: requests computes it from the JSON body,
        # and the payload size varies per request.
        'Content-Type': 'application/json;charset=UTF-8;',
        'Host': 'api-c.liepin.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Origin': 'https://www.liepin.com',
        'Referer': 'https://www.liepin.com/',
        'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'X-Client-Type': 'web',
        'X-Fscp-Bi-Stat': '{"location": "https://www.liepin.com/zhaopin"}',
        'X-Fscp-Fe-Version': '',
        'X-Fscp-Std-Info': '{"client_id": "40108"}',
        'X-Fscp-Trace-Id': '52262313-e6ca-4cfd-bb67-41b4a32b8bb5',
        'X-Fscp-Version': '1.1',
        'X-Requested-With': 'XMLHttpRequest',
    }

    list = ["H01$H0001", "H01$H0002","H01$H0003", "H01$H0004", "H01$H0005","H01$H0006", "H01$H0007", "H01$H0008","H01$H0009", "H01$H00010", "H02$H0018", "H02$H0019", "H03$H0022",
            "H03$H0023", "H03$H0024", "H03$H0025", "H04$H0030", "H04$H0031",
            "H04$H0032", "H05$H05", "H06$H06", "H07$H07", "H08$H08"]
    list = ["H01","H02","H03","H04","H05","H06","H07","H08","H09","H10","H01$H0001", "H01$H0002","H01$H0003", "H01$H0004", "H01$H0005","H01$H0006", "H01$H0007", "H01$H0008","H01$H0009", "H01$H00010"]
    for name in list:
        print("-------{}---------".format(name))
        for i in range(1):
            print("------------第{}页-----------".format(i))
            data = {"data": {"mainSearchPcConditionForm":
                                 {"city": "010", "dq": "010", "pubTime": "", "currentPage": i, "pageSize": 40,
                                  "key": "SQL",
                                  "suggestTag": "", "workYearCode": "0", "compId": "", "compName": "", "compTag": "",
                                  "industry": name, "salary": "", "jobKind": "", "compScale": "", "compKind": "",
                                  "compStage": "",
                                  "eduLevel": ""},
                             "passThroughForm":
                                 {"scene": "page", "skId": "ltzb0i55gx4xc8wdgsn0n73lrbnni548",
                                  "fkId": "5j6hk9ypqv0fh15t7mzxpgik5r7w39db",
                                  # "ckId": read_js_code(),  # dynamic alternative via the JS helper above
                                  "ckId": "62pq67k25gqzn6gngwi4kkxl29vr5fk8",
                                  'sfrom': 'search_job_pc'}}}
            response = requests.post(url=url, json=data, headers=headers)
            # Random delay between requests to reduce the chance of being rate-limited
            time.sleep(random.uniform(5, 20))
            parse_data(response)


def process_salary(salary):
    # "薪资面议" / "面议" mean "salary negotiable"; treat them as 0.
    # (The original condition `if '薪资面议' or '面议' == salary:` was always
    # true, because a non-empty string literal is truthy.)
    if salary in ('薪资面议', '面议'):
        return 0
    salary = salary.split("k")[0]
    if '-' in salary:
        low, high = salary.split('-')
        low = float(low) * 1000  # convert the 'k' unit to an absolute amount
        return low
    else:
        return float(salary) * 1000
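
# Example usage (a sketch; salary strings are assumed to look like
# "15-20k·13薪" or "面议", which is what the parsing above expects):
#   process_salary("15-20k·13薪")  # -> 15000.0 (lower bound of the range)
#   process_salary("面议")          # -> 0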

def parse_data(response):
    try:
        jobCardList = response.json()['data']['data']['jobCardList']
        sync_data2db(jobCardList)
    except Exception as e:
        # Malformed or blocked responses land here; log instead of failing silently
        print("parse_data skipped a response:", e)
        return
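
# For reference, the response body is assumed (inferred from the field access
# above and in sync_data2db below) to look roughly like:
# {"data": {"data": {"jobCardList": [
#     {"job":  {"title": ..., "dq": ..., "salary": ..., "requireWorkYears": ...,
#               "labels": [...], "link": ..., "refreshTime": ..., "requireEduLevel": ...},
#      "comp": {"compName": ..., "compIndustry": ..., "compScale": ...}},
#     ...]}}}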


def sync_data2db(jobCardList):
    connection = pymysql.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            # Parameterized INSERT: the driver handles quoting and escaping.
            # (The original concatenated values into the SQL string, which
            # breaks on values containing quotes and invites SQL injection.)
            insert_query = (
                "INSERT INTO job_detail(job_title, location, salary_amount, "
                "work_experience, tags, company_name, industry, company_size, "
                "job_link, refresh_dt, Edu) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
            for job in jobCardList:
                # To store a numeric lower bound instead of the raw salary
                # string, swap in process_salary(job['job']['salary']).
                values = (job['job']['title'], job['job']['dq'], job['job']['salary'],
                          job['job']['requireWorkYears'],
                          " ".join(job['job']['labels']),
                          job['comp']['compName'], job['comp']['compIndustry'],
                          job['comp']['compScale'], job['job']['link'],
                          job['job']['refreshTime'], job['job']['requireEduLevel'])
                print(values)
                cursor.execute(insert_query, values)
        connection.commit()
    except Exception as e:
        print(e)
    finally:
        connection.close()

if __name__ == '__main__':
    post_data()
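
To sanity-check a run, here is a minimal sketch of a row count plus duplicate check. It reuses the crawler's MySQL credentials; the duplicate check matters because the schema only indexes job_link without a UNIQUE constraint, so repeated runs can insert the same posting twice:

import pymysql

connection = pymysql.connect(host='127.0.0.1', user='root', password='12345678',
                             database='work_data', charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM job_detail")
        print("total rows:", cursor.fetchone()[0])
        # job_link values appearing more than once indicate duplicate inserts
        cursor.execute("SELECT job_link, COUNT(*) AS n FROM job_detail "
                       "GROUP BY job_link HAVING n > 1")
        for link, n in cursor.fetchall():
            print("duplicate:", link, n)
finally:
    connection.close()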

Finally, happy scraping!
