Preface
Xiao Ming has recently started looking for a Python-related job, but as a fresh graduate he does not know the market well. So he decides to scrape the postings on Tencent Careers to find out which skills Python-related positions actually require.
Approach
1. Scrape the data
2. Process the data
3. Store the data
   1 - Collect the records into a pandas DataFrame and write them to an Excel file
   2 - Write the records to an Excel file directly with openpyxl (a sketch follows the two scraping variants below)
4. Build the word cloud
   1 - Pull the Responsibility column and join it into one large string
   2 - Segment the text with jieba
   3 - Remove stop words
   4 - Generate the word cloud
1. Scraping the data
0 - Third-party libraries used
import time
import random
import json

import requests
import openpyxl
import numpy as np
import pandas as pd
import jieba                     # word segmentation, pip install jieba
from matplotlib import colors    # ListedColormap for the word-cloud colours
# pip install wordcloud
from wordcloud import WordCloud
from PIL import Image

from pyecharts.charts import *
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB   # render pyecharts charts inside Jupyter Lab
1 - Variant 1: parse the JSON with res.json()
# Variant 1: request each page, parse the JSON with res.json(), and collect the rows in lst
lst = []
timestamp = int(time.time() * 1000)
keyword = input('Enter a keyword: ')
pageIndex = int(input('Enter the number of pages: '))   # do not ask for more pages than actually exist

headers = {
    # fill in your own request headers here, e.g. a browser 'User-Agent'
}

for page in range(1, pageIndex + 1):
    # list endpoint: returns 10 postings per page for the given keyword
    url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp={timestamp}&countryId=&cityId=3&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={keyword}&pageIndex={page}&pageSize=10&language=zh-cn&area=cn'
    res = requests.get(url, headers=headers)
    data = res.json()
    print(f'Fetching page {page}')
    for i in data['Data']['Posts']:
        dic = {}
        dic['LastUpdateTime'] = i['LastUpdateTime']
        dic['RecruitPostName'] = i['RecruitPostName']
        dic['Responsibility'] = i['Responsibility']
        PostId = i['PostId']
        # detail endpoint: fetch the full requirements for this posting
        details_url = f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp={timestamp}&postId={PostId}&language=zh-cn'
        res1 = requests.get(details_url, headers=headers)
        data1 = res1.json()
        dic['Requirement'] = data1['Data']['Requirement']
        lst.append(dic)

df = pd.DataFrame(lst)
df.to_excel('./腾讯招聘.xlsx', index=False)
2 - Variant 2: parse the JSON with json.loads()
# Variant 2: same crawl, but parse the raw response text with json.loads()
lst = []
timestamp = int(time.time() * 1000)
keyword = input('Enter a keyword: ')
pageIndex = int(input('Enter the number of pages: '))

for page in range(1, pageIndex + 1):
    url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp={timestamp}&countryId=&cityId=3&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={keyword}&pageIndex={page}&pageSize=10&language=zh-cn&area=cn'
    response = requests.get(url=url, headers=headers)   # reuses the headers defined in variant 1
    result = response.text
    # optional: sleep a few seconds between pages to avoid hammering the server
    # time.sleep(random.randint(1, 5))
    content_dict = json.loads(result)
    post_list = content_dict['Data']['Posts']
    for value in post_list:
        dic = {}
        dic['LastUpdateTime'] = value['LastUpdateTime']
        dic['RecruitPostName'] = value['RecruitPostName']
        dic['Responsibility'] = value['Responsibility']
        PostId = value['PostId']
        detail_url = f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp={timestamp}&postId={PostId}&language=zh-cn'
        res = requests.get(url=detail_url, headers=headers)
        result2 = res.text
        text = json.loads(result2)
        dic['Requirement'] = text['Data']['Requirement']
        lst.append(dic)

print('Scraping finished')
df = pd.DataFrame(lst)
df.to_excel('./腾讯.xlsx', index=False)
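The outline lists openpyxl as a second storage option, but both variants above write the Excel file through pandas (whose .xlsx writer uses openpyxl as its default engine). As a rough sketch, the same records could also be written directly with openpyxl; this assumes lst has already been filled by one of the scraping loops above, and the output filename is made up:

import openpyxl

# Sketch only: write the collected list of dicts with openpyxl, one row per posting.
columns = ['LastUpdateTime', 'RecruitPostName', 'Responsibility', 'Requirement']
wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'Tencent jobs'
ws.append(columns)                                    # header row
for item in lst:                                      # assumes lst was filled above
    ws.append([item.get(col, '') for col in columns])
wb.save('./腾讯招聘_openpyxl.xlsx')                    # hypothetical filename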
2. Building the word cloud
1 - Word segmentation
# Load the scraped data back from the Excel file written in section 1
# (adjust the filename if you used variant 2's output instead).
data = pd.read_excel('./腾讯招聘.xlsx')

def chinese_jieba(field):
    # Join every cell of the chosen column into one large string, then segment it with jieba.
    res_list = []
    for i in data[field]:
        res_list.append(str(i))
    st = ''.join(res_list)
    wordlist_jieba = jieba.lcut(st)        # list of segmented words
    txt_jieba = " ".join(wordlist_jieba)   # space-separated string, as WordCloud expects
    return txt_jieba

# chinese_jieba('Responsibility')
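If you have not used jieba before, the tiny demo below shows what lcut returns; the sample sentence is made up, and the exact token boundaries can vary with jieba's dictionary version:

import jieba

# jieba.lcut splits a Chinese sentence into a list of words.
sample = '负责Python后台服务的开发与性能优化'
print(jieba.lcut(sample))
# possible output (may vary): ['负责', 'Python', '后台', '服务', '的', '开发', '与', '性能', '优化']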
2 - Stop-word cleaning
def stopwords_read():
    stopwords_ = ['负责', '优化']
    # stopword.txt is a plain text file you create yourself, one stop word per line
    with open(r'./stopword.txt', encoding='utf-8') as f:
        for line in f:
            if len(line.strip()) > 0:
                stopwords_.append(line.strip())
    return stopwords_

# stopwords_read()
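If you do not have a stopword.txt yet, the snippet below creates a small example file (one word per line); the words here are only placeholders, so extend the list with whatever filler terms you want kept out of the word cloud:

# Write an example stopword.txt; every non-empty line becomes a stop word.
sample_stopwords = ['我们', '以及', '相关', '进行', '工作', '使用']
with open('./stopword.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sample_stopwords))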
3 - Generating the word cloud
def wordcloud_generate():
    stopwords_ = stopwords_read()
    txt = chinese_jieba('Responsibility')
    # mask image: words are drawn inside the non-white area of this picture
    background_image = np.array(Image.open(r'./腾讯词云图.jpg'))
    colormaps = colors.ListedColormap(['blue', 'green', 'yellow', 'red', 'pink'])
    wordcloud = WordCloud(
        font_path='simsun.ttc',        # a Chinese font is required, otherwise characters render as boxes
        max_words=100,
        max_font_size=400,
        stopwords=stopwords_,
        mask=background_image,
        colormap=colormaps,
        contour_color='steelblue',
        contour_width=2,
        collocations=False             # avoid repeated two-word phrases
    ).generate(txt)
    image = wordcloud.to_image()
    filename = 'pic.jpg'
    wordcloud.to_file(filename)
    image.show()

if __name__ == "__main__":
    wordcloud_generate()
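The imports at the top also bring in pyecharts and configure it for Jupyter Lab, even though the code above only uses the wordcloud library. For completeness, here is a rough sketch of drawing the same word cloud with pyecharts instead; it reuses the chinese_jieba and stopwords_read helpers defined above, and the chart title and output filename are made up:

from collections import Counter
from pyecharts import options as opts
from pyecharts.charts import WordCloud as PyechartsWordCloud

# Count word frequencies after stop-word removal; pyecharts expects (word, count) pairs.
stop_set = set(stopwords_read())
tokens = chinese_jieba('Responsibility').split()
pairs = Counter(t for t in tokens if t not in stop_set and len(t) > 1).most_common(100)

chart = (
    PyechartsWordCloud()
    .add(series_name='Responsibility', data_pair=pairs, word_size_range=[12, 60])
    .set_global_opts(title_opts=opts.TitleOpts(title='Tencent Careers word cloud'))
)
chart.render('pyecharts_wordcloud.html')   # or chart.render_notebook() inside Jupyter Lab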