Preface
Xiao Ming has recently started looking for a Python-related job, but as a fresh graduate he does not know the market well. So he decides to scrape the postings on Tencent Careers to find out which skills Python-related positions actually require.
Approach
1. Scrape the data
2. Process the data
3. Store the data
   1 - Collect the records into a pandas DataFrame and write them to an Excel file
   2 - Write the records to an Excel file directly with openpyxl (a sketch follows the two scraping variants below)
4. Build the word cloud
   1 - Pull the Responsibility column and join it into one large string
   2 - Segment the text with jieba
   3 - Remove stop words
   4 - Generate the word cloud
1. Scraping the data
0 - Third-party libraries used
import time
import random
import json

import requests
import openpyxl
import numpy as np
import pandas as pd
import jieba                     # word segmentation, pip install jieba
from matplotlib import colors    # ListedColormap for the word-cloud colours
# pip install wordcloud
from wordcloud import WordCloud
from PIL import Image

from pyecharts.charts import *
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB   # render pyecharts charts inside Jupyter Lab
1 - Variant 1: parse the JSON with res.json()
# Variant 1: request each page, parse the JSON with res.json(), and collect the rows in lst
lst = []
timestamp = int(time.time() * 1000)
keyword = input('Enter a keyword: ')
pageIndex = int(input('Enter the number of pages: '))   # do not ask for more pages than actually exist

headers = {
    # fill in your own request headers here, e.g. a browser 'User-Agent'
}

for page in range(1, pageIndex + 1):
    # list endpoint: returns 10 postings per page for the given keyword
    url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp={timestamp}&countryId=&cityId=3&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={keyword}&pageIndex={page}&pageSize=10&language=zh-cn&area=cn'
    res = requests.get(url, headers=headers)
    data = res.json()
    print(f'Fetching page {page}')
    for i in data['Data']['Posts']:
        dic = {}
        dic['LastUpdateTime'] = i['LastUpdateTime']
        dic['RecruitPostName'] = i['RecruitPostName']
        dic['Responsibility'] = i['Responsibility']
        PostId = i['PostId']
        # detail endpoint: fetch the full requirements for this posting
        details_url = f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp={timestamp}&postId={PostId}&language=zh-cn'
        res1 = requests.get(details_url, headers=headers)
        data1 = res1.json()
        dic['Requirement'] = data1['Data']['Requirement']
        lst.append(dic)

df = pd.DataFrame(lst)
df.to_excel('./腾讯招聘.xlsx', index=False)
2 - Variant 2: parse the JSON with json.loads()
# Variant 2: same crawl, but parse the raw response text with json.loads()
lst = []
timestamp = int(time.time() * 1000)
keyword = input('Enter a keyword: ')
pageIndex = int(input('Enter the number of pages: '))

for page in range(1, pageIndex + 1):
    url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp={timestamp}&countryId=&cityId=3&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={keyword}&pageIndex={page}&pageSize=10&language=zh-cn&area=cn'
    response = requests.get(url=url, headers=headers)   # reuses the headers defined in variant 1
    result = response.text
    # optional: sleep a few seconds between pages to avoid hammering the server
    # time.sleep(random.randint(1, 5))
    content_dict = json.loads(result)
    post_list = content_dict['Data']['Posts']
    for value in post_list:
        dic = {}
        dic['LastUpdateTime'] = value['LastUpdateTime']
        dic['RecruitPostName'] = value['RecruitPostName']
        dic['Responsibility'] = value['Responsibility']
        PostId = value['PostId']
        detail_url = f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp={timestamp}&postId={PostId}&language=zh-cn'
        res = requests.get(url=detail_url, headers=headers)
        result2 = res.text
        text = json.loads(result2)
        dic['Requirement'] = text['Data']['Requirement']
        lst.append(dic)

print('Scraping finished')
df = pd.DataFrame(lst)
df.to_excel('./腾讯.xlsx', index=False)
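The outline lists openpyxl as a second storage option, but both variants above write the Excel file through pandas (whose .xlsx writer uses openpyxl as its default engine). As a rough sketch, the same records could also be written directly with openpyxl; this assumes lst has already been filled by one of the scraping loops above, and the output filename is made up:

import openpyxl

# Sketch only: write the collected list of dicts with openpyxl, one row per posting.
columns = ['LastUpdateTime', 'RecruitPostName', 'Responsibility', 'Requirement']
wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'Tencent jobs'
ws.append(columns)                                    # header row
for item in lst:                                      # assumes lst was filled above
    ws.append([item.get(col, '') for col in columns])
wb.save('./腾讯招聘_openpyxl.xlsx')                    # hypothetical filename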
2. Building the word cloud
1 - Word segmentation
# Load the scraped data back from the Excel file written in section 1
# (adjust the filename if you used variant 2's output instead).
data = pd.read_excel('./腾讯招聘.xlsx')

def chinese_jieba(field):
    # Join every cell of the chosen column into one large string, then segment it with jieba.
    res_list = []
    for i in data[field]:
        res_list.append(str(i))
    st = ''.join(res_list)
    wordlist_jieba = jieba.lcut(st)        # list of segmented words
    txt_jieba = " ".join(wordlist_jieba)   # space-separated string, as WordCloud expects
    return txt_jieba

# chinese_jieba('Responsibility')
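If you have not used jieba before, the tiny demo below shows what lcut returns; the sample sentence is made up, and the exact token boundaries can vary with jieba's dictionary version:

import jieba

# jieba.lcut splits a Chinese sentence into a list of words.
sample = '负责Python后台服务的开发与性能优化'
print(jieba.lcut(sample))
# possible output (may vary): ['负责', 'Python', '后台', '服务', '的', '开发', '与', '性能', '优化']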
2 - Stop-word cleaning
def stopwords_read():
    stopwords_ = ['负责', '优化']
    # stopword.txt is a plain text file you create yourself, one stop word per line
    with open(r'./stopword.txt', encoding='utf-8') as f:
        for line in f:
            if len(line.strip()) > 0:
                stopwords_.append(line.strip())
    return stopwords_

# stopwords_read()
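If you do not have a stopword.txt yet, the snippet below creates a small example file (one word per line); the words here are only placeholders, so extend the list with whatever filler terms you want kept out of the word cloud:

# Write an example stopword.txt; every non-empty line becomes a stop word.
sample_stopwords = ['我们', '以及', '相关', '进行', '工作', '使用']
with open('./stopword.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sample_stopwords))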
3 - Generating the word cloud
def wordcloud_generate():
    stopwords_ = stopwords_read()
    txt = chinese_jieba('Responsibility')
    # mask image: words are drawn inside the non-white area of this picture
    background_image = np.array(Image.open(r'./腾讯词云图.jpg'))
    colormaps = colors.ListedColormap(['blue', 'green', 'yellow', 'red', 'pink'])
    wordcloud = WordCloud(
        font_path='simsun.ttc',        # a Chinese font is required, otherwise characters render as boxes
        max_words=100,
        max_font_size=400,
        stopwords=stopwords_,
        mask=background_image,
        colormap=colormaps,
        contour_color='steelblue',
        contour_width=2,
        collocations=False             # avoid repeated two-word phrases
    ).generate(txt)
    image = wordcloud.to_image()
    filename = 'pic.jpg'
    wordcloud.to_file(filename)
    image.show()

if __name__ == "__main__":
    wordcloud_generate()
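The imports at the top also bring in pyecharts and configure it for Jupyter Lab, even though the code above only uses the wordcloud library. For completeness, here is a rough sketch of drawing the same word cloud with pyecharts instead; it reuses the chinese_jieba and stopwords_read helpers defined above, and the chart title and output filename are made up:

from collections import Counter
from pyecharts import options as opts
from pyecharts.charts import WordCloud as PyechartsWordCloud

# Count word frequencies after stop-word removal; pyecharts expects (word, count) pairs.
stop_set = set(stopwords_read())
tokens = chinese_jieba('Responsibility').split()
pairs = Counter(t for t in tokens if t not in stop_set and len(t) > 1).most_common(100)

chart = (
    PyechartsWordCloud()
    .add(series_name='Responsibility', data_pair=pairs, word_size_range=[12, 60])
    .set_global_opts(title_opts=opts.TitleOpts(title='Tencent Careers word cloud'))
)
chart.render('pyecharts_wordcloud.html')   # or chart.render_notebook() inside Jupyter Lab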