Python Data Analysis: Scraping and Analyzing Recruitment Data

Preface

Xiao Ming has just graduated from university and is getting ready to look for a Python-related job, but he does not know the market well. So he decided to scrape the postings on Tencent Careers to learn which skills Python-related jobs actually require.

Approach

1.Scrape the data
2.Clean the data
3.Store the data
    1-collect the records with pandas, then export to CSV (see the sketch after this list)
    2-write the file with openpyxl (the engine behind pandas' to_excel)
4.Build the word cloud
    1-take the Responsibility column and join it into one large string
    2-tokenize the text
    3-filter out stop words
    4-render the word cloud
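
The walkthrough below actually exports the table to Excel with pandas' to_excel, which uses openpyxl as its engine. For the CSV variant mentioned in step 3, a minimal sketch, assuming the scraped records already sit in a DataFrame named df:

# CSV export sketch; utf-8-sig keeps Chinese text readable when opened in Excel
df.to_csv('./腾讯招聘.csv', index=False, encoding='utf-8-sig')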

1.Scraping the data

        0-Third-party libraries used

import time
import requests
import openpyxl
import random
import json
import jieba  # Chinese word segmentation, used for the word cloud
import numpy as np
import pandas as pd
from pyecharts.charts import *
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
# install first: pip install wordcloud
from wordcloud import WordCloud
from PIL import Image
from matplotlib import colors  # ListedColormap for the word-cloud palette

        1-version 1: parse the JSON with res.json()

# scrape the listing pages and store the results with pandas
lst=[]
timestamp=int(time.time()*1000)
keyword=input('Enter a keyword: ')
pageIndex=int(input('Enter the number of pages: '))
# be careful not to request more pages than actually exist
for page in range(1,pageIndex+1):
    url=f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp={timestamp}&countryId=&cityId=3&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={keyword}&pageIndex={page}&pageSize=10&language=zh-cn&area=cn'
    
    headers={
            # request headers were omitted here; a real browser
            # User-Agent is typically required for the API to respond
            }
    res=requests.get(url,headers=headers)
    data=res.json()
    # print(data)
    print(f'Fetching page {page}')
    for i in data['Data']['Posts']:
        dic={}
        dic['LastUpdateTime']=i['LastUpdateTime']
        dic['RecruitPostName']=i['RecruitPostName']
        dic['Responsibility']=i['Responsibility']
        PostId=i['PostId']
        # print(dic)
        details_url=f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp={timestamp}&postId={PostId}&language=zh-cn'
        # print(details_url)
        res1=requests.get(details_url,headers=headers)
        # print(res1)
        data1=res1.json()
        # print(data1)
        dic['Requirement']=data1['Data']['Requirement']
        lst.append(dic)
df=pd.DataFrame(lst)
df.to_excel('./腾讯招聘.xlsx',index=False)
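
The loop above trusts the user not to ask for more pages than exist. If the Query endpoint also returns a total result count (a 'Count' field under 'Data' is an assumption here, not something shown in the responses above), the page limit could be derived instead of guessed:

# Sketch: probe page 1 and compute how many pages actually exist,
# assuming the response carries Data['Count'] (hypothetical field)
probe_url=f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp={timestamp}&keyword={keyword}&pageIndex=1&pageSize=10&language=zh-cn&area=cn'
total=requests.get(probe_url,headers=headers).json()['Data']['Count']
max_pages=(total+9)//10  # ceiling division, since pageSize is 10
print(f'{total} posts across {max_pages} pages')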

        2-version 2: parse the JSON with json.loads()

lst=[]
timestamp=int(time.time()*1000)
keyword=input('Enter a keyword: ')
pageIndex=int(input('Enter the number of pages: '))
for page in range(1,pageIndex+1):
    url=f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp={timestamp}&countryId=&cityId=3&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={keyword}&pageIndex={page}&pageSize=10&language=zh-cn&area=cn'

    # print(f'page {page} url: {url}')
    response=requests.get(url=url,headers=headers)  # reuses the headers defined above
    result=response.text
    
    # optionally sleep 1-5 seconds between requests to be polite
    # time.sleep(random.randint(1,5))
    content_dict=json.loads(result)
    # print(content_dict)
    post_list=content_dict['Data']['Posts']
    # print(post_list)
    for value in post_list:
        dic={}
        dic['LastUpdateTime']=value['LastUpdateTime']
        dic['RecruitPostName']=value['RecruitPostName']
        dic['Responsibility']=value['Responsibility']
        # print(dic)
        PostId=value['PostId']
        # print([PostId])
        detail_url=f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp={timestamp}&postId={PostId}&language=zh-cn'
        # print(detail_url)
        res=requests.get(url=detail_url,headers=headers)
        result2=res.text
        # print(result2)
        text=json.loads(result2)
        # print(text)
        dic['Requirement']=text['Data']['Requirement']
        # print(dic['Requirement'])
        lst.append(dic)
        print('post collected')  # runs once per post, inside the inner loop
df=pd.DataFrame(lst)
df.to_excel('./腾讯.xlsx',index=False)
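
Both versions fire one detail request per post with no delay or error handling, which is fragile and unfriendly to the server. A minimal sketch of a politer helper (the function name and retry policy are illustrative, not part of the original code):

def fetch_json(url,headers,retries=3):
    # retry a few times, sleeping between attempts
    for attempt in range(retries):
        try:
            res=requests.get(url,headers=headers,timeout=10)
            res.raise_for_status()  # raise on HTTP 4xx/5xx
            return res.json()
        except requests.RequestException as e:
            print(f'request failed ({e}), retry {attempt+1}/{retries}')
            time.sleep(random.randint(1,5))
    return None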

2.Building the word cloud

        1-tokenize the text

def chinese_jieba(field):
    # join every cell of the df column into one large string
    res_list=[str(i) for i in df[field]]
    st=''.join(res_list)
    # segment the Chinese text into individual words
    wordlist_jieba=jieba.lcut(st)
    # re-join with spaces, the whitespace-separated format WordCloud expects
    txt_jieba=" ".join(wordlist_jieba)
    return txt_jieba
# chinese_jieba('Responsibility')
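
For reference, jieba.lcut segments a Chinese string into a list of words, which the function above re-joins with single spaces. A tiny illustration (the exact segmentation depends on jieba's dictionary version):

print(jieba.lcut('负责Python爬虫开发和数据分析'))
# e.g. ['负责', 'Python', '爬虫', '开发', '和', '数据分析']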

        2-stop-word filtering

def stopwords_read():
    stopwords_=['负责','优化']  # seed words to drop ("responsible for", "optimize")
    # put any further stop words, one per line, in a txt file of your own
    with open(r'./stopword.txt',encoding='utf-8') as f:
        for line in f:
            word=line.strip()
            if len(word)>0:
                stopwords_.append(word)
    return stopwords_
# stopwords_read()
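
The stop words are handed to WordCloud below via its stopwords parameter, but they can also be stripped from the token string up front. A minimal sketch (remove_stopwords is an illustrative helper, not part of the original code):

def remove_stopwords(txt_jieba,stopwords_):
    # keep multi-character tokens that are not stop words
    kept=[w for w in txt_jieba.split() if w not in stopwords_ and len(w)>1]
    return ' '.join(kept)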

        3-rendering the word cloud

def wordcloud_generate():
    stopwords_=stopwords_read()
    txt=chinese_jieba('Responsibility')
    # the mask image defines the overall shape of the cloud
    background_image=np.array(Image.open(r'./腾讯词云图.jpg'))
    colormaps = colors.ListedColormap(['blue', 'green', 'yellow', 'red', 'pink'])
    wordcloud=WordCloud(
        font_path='simsun.ttc',   # a font with Chinese glyphs is required
        max_words = 100,
        max_font_size=400,
        stopwords=stopwords_,
        mask=background_image,
        colormap=colormaps,
        contour_color='steelblue',
        contour_width=2,
        collocations=False        # suppress duplicated two-word phrases
    ).generate(txt)
    image=wordcloud.to_image()
    filename='pic.jpg'
    wordcloud.to_file(filename)   # save the rendered cloud as pic.jpg
    image.show()
if __name__=="__main__":
    wordcloud_generate()
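
As an aside, the pyecharts imports at the top are never used in this walkthrough; they would support an interactive HTML word cloud as an alternative to the static image. A minimal sketch, reusing the helpers above and counting word frequencies with collections.Counter (the class is aliased to avoid clashing with wordcloud.WordCloud):

from collections import Counter
from pyecharts.charts import WordCloud as PyeWordCloud

tokens=[w for w in chinese_jieba('Responsibility').split()
        if w not in stopwords_read() and len(w)>1]
top_words=Counter(tokens).most_common(100)  # [(word, count), ...]
chart=PyeWordCloud()
chart.add('',top_words,word_size_range=[12,60])
chart.render('tencent_wordcloud.html')  # writes an interactive HTML file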
    

3.Final result

(The generated word cloud image, pic.jpg, appeared here.)
