This article scrapes Python web-crawler job postings (position details and requirements) from Lagou (lagou.com), saves the data to a MySQL database, and finally visualizes it with the ECharts module, giving an intuitive view of the salary, education, work experience, and per-city demand for this role.
author = 小朋友
WeChat = qq735833020
Follow the author's WeChat official account for more details or to get the code.
import requests
from lxml import etree
import time
from pymysql import connect

# Request headers: Lagou's Ajax endpoint typically rejects requests that lack a
# valid Cookie and Referer, so both are included here. The Cookie is session-bound
# and expires quickly, so refresh it from the browser before each run.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Cookie": "TG-TRACK-CODE=search_code; JSESSIONID=ABAAABAABEEAAJA288CDC5B3C39F6B68B738BE0F3680D5B; SEARCH_ID=bec263de83e24d58938d14a4e56c7012",
    "Referer": "https://www.lagou.com/jobs/list_Python%E7%88%AC%E8%99%AB?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=",
    "Connection": "keep-alive"
}

# MySQL connection and cursor, shared by all functions below.
conn = connect(host="127.0.0.1", port=3306, user="root", password="123456", database="lagou", charset="utf8")
cur = conn.cursor()
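
# The save_mysql() function below assumes a `job` table already exists in the
# `lagou` database. The original article does not show its schema; the DDL below
# is only a sketch that matches the columns of the INSERT statement, with guessed
# column types. Running it here keeps the script self-contained.
cur.execute("""
    CREATE TABLE IF NOT EXISTS job (
        id INT AUTO_INCREMENT PRIMARY KEY,
        positionName VARCHAR(100),
        companyFullName VARCHAR(200),
        city VARCHAR(50),
        salary VARCHAR(50),
        workYear VARCHAR(50),
        education VARCHAR(50),
        positionLables VARCHAR(255),
        industryField VARCHAR(255),
        detail_info TEXT,
        detail_url VARCHAR(255)
    ) DEFAULT CHARSET = utf8
""")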
def get_index_info(pn):
    """Fetch page `pn` of the search results from Lagou's Ajax endpoint and
    return a list of tuples with the fields we care about."""
    index_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
    data = {
        "first": "true",
        "pn": pn,           # page number
        "kd": "Python爬虫"  # search keyword
    }
    info_list = list()
    print("Crawling page {}...".format(pn))
    response = requests.post(index_url, headers=headers, data=data).json()
    for item in response["content"]["positionResult"]["result"]:
        positionId = item["positionId"]
        positionName = item["positionName"]
        companyFullName = item["companyFullName"]
        city = item["city"]
        salary = item["salary"]
        workYear = item["workYear"]
        education = item["education"]
        # "positionLables" is the field name (spelled this way) in Lagou's JSON.
        positionLables = ",".join(item["positionLables"]) if len(item["positionLables"]) > 0 else "None"
        industryField = item["industryField"]
        info_list.append((positionId, positionName, companyFullName, city, salary, workYear, education,
                          positionLables, industryField))
    # print(info_list)
    return info_list
def get_detail_info(index_info):
    """Visit each position's detail page and extract the full job description."""
    for item in index_info:
        detail_url = "https://www.lagou.com/jobs/{}.html".format(item[0])  # item[0] is positionId
        print(detail_url)
        response = requests.get(detail_url, headers=headers).text
        html = etree.HTML(response)
        # Join all text nodes inside the job-detail div into a single string.
        job_desc = "".join([node.strip() for node in html.xpath('//div[@class="job-detail"]//text()')])
        # print(job_desc)
        save_mysql(item, job_desc, detail_url)
def save_mysql(item, job_desc, detail_url):
    """Insert one position into the `job` table; roll back on failure."""
    try:
        sql = ("insert into job (id, positionName, companyFullName, city, salary, workYear, education, "
               "positionLables, industryField, detail_info, detail_url) "
               "values (default, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        cur.execute(sql, (item[1], item[2], item[3], item[4], item[5], item[6], item[7], item[8], job_desc, detail_url))
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()
def main():
    # Crawl pages 1-23 of the search results.
    for pn in range(1, 24):
        index_info = get_index_info(pn)
        get_detail_info(index_info)
        time.sleep(60)  # pause between pages to avoid triggering Lagou's anti-crawling limits
    cur.close()
    conn.close()
if __name__ == '__main__':
main()
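
The intro mentions visualizing the results with ECharts, but that step is not included in the script above. Below is a minimal sketch of what it could look like, assuming the pyecharts package (the Python wrapper for ECharts) is installed and the `job` table has been populated by the scraper; it groups postings by city and renders a bar chart to a standalone HTML file.

from pymysql import connect
from pyecharts import options as opts
from pyecharts.charts import Bar

# Query the number of postings per city from the table populated by the scraper.
conn = connect(host="127.0.0.1", port=3306, user="root", password="123456", database="lagou", charset="utf8")
cur = conn.cursor()
cur.execute("select city, count(*) from job group by city order by count(*) desc limit 10")
rows = cur.fetchall()
cur.close()
conn.close()

cities = [row[0] for row in rows]
counts = [row[1] for row in rows]

# Render a simple bar chart of demand by city to an HTML file.
bar = (
    Bar()
    .add_xaxis(cities)
    .add_yaxis("Number of postings", counts)
    .set_global_opts(title_opts=opts.TitleOpts(title="Python crawler jobs by city"))
)
bar.render("city_demand.html")

Similar GROUP BY queries on salary, workYear, and education would feed the other charts mentioned in the intro.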