一、爬取代码
参考:前两篇是数据可视化的参考,最后一篇是爬取拉勾网的参考
https://blog.csdn.net/m0_48405781/article/details/108848131?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522162443965116780264054191%2522%252C%2522scm%2522%253A%252220140713.130102334.pc%255Fblog.%2522%257D&request_id=162443965116780264054191&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2blogfirst_rank_v2~rank_v29-1-108848131.nonecase&utm_term=%E7%88%AC%E5%8F%96%E6%8B%89%E9%92%A9%E7%BD%91%E6%95%B0%E6%8D%AE%E5%8F%AF%E8%A7%86%E5%8C%96&spm=1018.2226.3001.4450
https://blog.csdn.net/weixin_43862077/article/details/88931970?ops_request_misc=&request_id=&biz_id=102&utm_term=%E6%8B%89%E9%92%A9%E7%BD%91%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&utm_medium=distribute.pc_search_result.none-task-blog-2blogsobaiduweb~default-4-.nonecase&spm=1018.2226.3001.4450
https://blog.csdn.net/zhufureb/article/details/114081414
温馨提示:如果爬取不成功,请登录之后重新获取 cookie 值;每次请求之间记得睡一会,否则很快会被封。如果返回结果中 content 为空,说明已经被封了,刷新页面处理一下,再回来继续爬。
#导入需要的库
import requests #请求网页
import time
# Request headers that mimic the browser's XHR call to Lagou's search endpoint.
# NOTE: the Cookie is a captured login session; if the crawl stops working,
# log in again in the browser and paste the fresh cookie value here.
headers = {
    # Adjacent string literals are concatenated into a single cookie value
    # (a single-quoted literal cannot span two physical lines).
    'Cookie': (
        'user_trace_token=20210622202246-bb027267-e0f5-4ab4-8efe-f5aa05ba6edc; _ga=GA1.2.860520427.1624364567; LGUID=20210622202247-941ced7c-279c-4b7c-a0f9-51788c919fd3; JSESSIONID=ABAAABAABAGABFA1468D7F9218CB69AC4DEAEDB48FE83E9; WEBTJ-ID=20210622%E4%B8%8B%E5%8D%888:22:51202251-17a33ac59852e8-02a52c9593e426-c3f3568-2073600-17a33ac5986c95; RECOMMEND_TIP=true; privacyPolicyPopup=false; sensorsdata2015session=%7B%7D; _gid=GA1.2.1892442560.1624364571; index_location_city=%E5%85%A8%E5%9B%BD; __lg_stoken__=b9e58adddfa700f03fb274a50f48d6b1b400aaa2f9e4678e607a797cb80a0a48fe5eb0a4004fb2fb03cdef96d4733a07b5edbbb5b490a980f77213cf48c84f7ae119859469df; X_MIDDLE_TOKEN=5a81c7e7f82898948524890d2cf52d08; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1624364567,1624364571,1624372055,1624413700; LGSID=20210623100138-b6c62594-7f82-40ce-95f3-66a5ddb3519c; gate_login_token=05fca47d28b5c23f1b74c492ba94639423e23dbd482d802e311a41d97a9dd500; LG_HAS_LOGIN=1; _putrc=F0657479B99537F7123F89F2B170EADC; login=true; unick=%E7%94%A8%E6%88%B72087; hasDeliver=0; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; __SAFETY_CLOSE_TIME__22001503=1; TG-TRACK-CODE=search_code; X_HTTP_TOKEN=29a0ccc39cfc071988751442611eae66e98378310a; '
        'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2222001503%22%2C%22first_id%22%3A%2217a33ac5a80662-0723a057691a06-c3f3568-2073600-17a33ac5a81cf5%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_utm_source%22%3A%22baidujava%22%2C%22%24latest_utm_medium%22%3A%22sspc%22%2C%22%24latest_utm_term%22%3A%22YHZX_java22993%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2289.0.4389.114%22%2C%22lagou_company_id%22%3A%22%22%7D%2C%22%24device_id%22%3A%2217a33ac5a80662-0723a057691a06-c3f3568-2073600-17a33ac5a81cf5%22%7D; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1624415789; LGRID=20210623103628-ef59a978-6c56-4ee0-8dcc-9acfd2043d42; SEARCH_ID=865cc25ade2145a9b79f133516959ec4'
    ),
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_python%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput=',
    'Origin': 'https://www.lagou.com',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
}
import csv

# Search endpoint that is POSTed to once per result page.
SEARCH_URL = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false'


def position_to_row(position):
    """Flatten one job-posting dict from the search response into a CSV row.

    The row order is city, companyFullName, companySize, education,
    positionName, salary, workYear, companyLabelList. The label list is
    concatenated into one string; a missing, null or empty list becomes ''.
    """
    labels = position.get('companyLabelList') or []
    return [
        position['city'],
        position['companyFullName'],
        position['companySize'],
        position['education'],
        position['positionName'],
        position['salary'],
        position['workYear'],
        ''.join(labels),
    ]


def crawl_positions(keyword='python开发工程师', pages=30,
                    out_path='python开发工程师2.csv', delay=3):
    """Crawl search result pages and append every posting to a CSV file.

    Args:
        keyword: search keyword sent as the ``kd`` form field.
        pages: number of result pages to request, starting at page 1.
        out_path: CSV file that rows are appended to (no header row).
        delay: seconds to sleep after each page, to avoid being blocked.

    Returns:
        The number of rows written.
    """
    saved = 0
    # newline='' lets the csv module control line endings itself.
    with open(out_path, 'a', encoding='utf-8', newline='') as f:
        # csv.writer quotes fields that contain commas, so such values
        # no longer shift the columns of a row.
        writer = csv.writer(f)
        for page in range(1, pages + 1):
            form = {
                'first': 'true',
                'pn': page,
                'kd': keyword,
            }
            response = requests.post(SEARCH_URL, headers=headers, data=form, timeout=10)
            content = response.json().get('content')
            if not content:
                # No 'content' in the reply: the site has blocked this session.
                print(f'第{page}页没有返回content,可能已被封,请刷新页面并更新cookie后重试')
                break
            for position in content['positionResult']['result']:
                writer.writerow(position_to_row(position))
                saved += 1
                print(f'第{saved}条数据成功')
            time.sleep(delay)
    return saved


if __name__ == '__main__':
    crawl_positions()
二、可视化部分
首先,给 csv 文件加上列名
import pandas as pd
# Column names, in the order the crawler writes the fields of each row.
COLUMN_NAMES = ['city', 'companyFullName', 'companySize', 'education',
                'positionName', 'salary', 'workYear', 'companyLabelList']


def add_column_names(src_path='python开发工程师2.csv',
                     dst_path='python开发工程师3.csv'):
    """Read the header-less crawler output and save it with column names.

    Args:
        src_path: CSV without a header row, eight fields per line.
        dst_path: destination CSV; written with COLUMN_NAMES as its header.

    Returns:
        The DataFrame that was written.
    """
    # NOTE(review): the source file is assumed to be the crawler's output
    # ('python开发工程师2.csv') -- confirm against your own file names.
    df = pd.read_csv(src_path, encoding='utf-8', names=COLUMN_NAMES)
    # Write the named frame itself so every column is kept.
    df.to_csv(dst_path, index=False, encoding='utf-8')
    return df


if __name__ == '__main__':
    add_column_names()
(一)、饼图
然后是代码
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud
def explode_offsets(slice_count, keep=5, start=1.0, step=0.1):
    """Return the ``explode`` offset for each pie slice, in slice order.

    The first *keep* slices get offset 0. Every later slice gets a growing
    offset: start + step, start + 2 * step, and so on.
    """
    return [
        0 if pos < keep else round(start + step * (pos - keep + 1), 10)
        for pos in range(slice_count)
    ]


def plot_city_pie(csv_path='python开发工程师3.csv',
                  out_path='python地理位置分布图.jpg'):
    """Draw a pie chart of how many postings each city has and save it.

    Args:
        csv_path: CSV with a header row that includes a 'city' column.
        out_path: image file the chart is saved to before it is shown.
    """
    # A font with Chinese glyphs is required; without it labels show as boxes.
    matplotlib.rcParams['font.family'] = 'SimHei'
    df = pd.read_csv(csv_path, encoding='utf-8')
    # value_counts() sorts descending, so the largest cities come first.
    city_counts = df['city'].value_counts()
    print('列表长度', len(city_counts))
    plt.pie(
        list(city_counts),
        labels=city_counts.index,
        labeldistance=1.2,
        autopct='%2.1f%%',
        pctdistance=0.6,
        shadow=True,
        explode=explode_offsets(len(city_counts)),
    )
    plt.axis('equal')  # keep the pie circular
    plt.legend(loc='upper left', bbox_to_anchor=(-0.1, 1))
    # Save before show(): show() may leave an empty figure behind.
    plt.savefig(out_path)
    plt.show()


if __name__ == '__main__':
    plot_city_pie()
(二)、词云
import jieba
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
def join_labels(labels):
    """Concatenate company labels into one string.

    Missing values (NaN/None, which is how pandas loads empty CSV cells)
    and empty strings are skipped so they do not end up in the text.
    """
    present = pd.Series(labels, dtype=object).dropna()
    return ''.join(str(value) for value in present if value != '')


def make_label_wordcloud(csv_path='python开发工程师3.csv',
                         font_path='./data/msyhbd.ttc',
                         out_path='pic.png'):
    """Build a word cloud image from the 'companyLabelList' column.

    Args:
        csv_path: CSV with a header row that includes 'companyLabelList'.
        font_path: font file with Chinese glyphs; use one that exists on
            your machine, otherwise WordCloud may fail or draw boxes.
        out_path: image file the word cloud is written to.
    """
    # Imported here so the third-party package is only needed when drawing.
    from wordcloud import WordCloud

    matplotlib.rcParams['font.family'] = 'SimHei'
    data = pd.read_csv(csv_path)
    print(data['companyLabelList'])
    text = join_labels(data['companyLabelList'])
    # Segment the text with jieba and join the words with commas.
    cloud_text = ",".join(jieba.cut(text))
    if not cloud_text:
        print('no label text found, word cloud not generated')
        return
    wc = WordCloud(
        font_path=font_path,
    )
    wc.generate(cloud_text)
    wc.to_file(out_path)


if __name__ == '__main__':
    make_label_wordcloud()
(三)、柱形图
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
# Bar chart: number of postings per required education level.
# NOTE(review): this section reads 'draft.csv' while the other sections read
# 'python开发工程师3.csv' -- confirm which file is intended.
df = pd.read_csv(r'draft.csv', encoding='utf-8')
count_by_city = df['education'].value_counts()  # postings per education level
X = count_by_city.index  # the education levels
Y = list(count_by_city)  # the count for each level
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render Chinese
# Draw the bars first so the categorical x axis exists for the labels.
plt.bar(X, Y)
for x, y in zip(X, Y):
    # Write each bar's count just above its top.
    plt.text(x, y + 0.05, '%d' % y, ha='center', va='bottom')
plt.savefig('教育成都.jpg')  # save the figure
plt.show()  # display the figure
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
from pylab import mpl
import re

# Runs of digits inside a salary text such as '10k-20k'.
_NUMBER = re.compile(r'\d+')


def estimate_salary(salary_text):
    """Turn a salary text into a single number.

    A range such as '10k-20k' becomes the lower bound plus 25% of the
    range width (12.5 here). A text with only one number returns that
    number. A value without any digits, including a missing value,
    returns None.
    """
    numbers = [int(n) for n in _NUMBER.findall(str(salary_text))]
    if not numbers:
        return None
    if len(numbers) == 1:
        return numbers[0]
    low, high = numbers[0], numbers[1]
    return low + (high - low) / 4


def plot_salary_histogram(csv_path='python开发工程师3.csv',
                          out_path='python薪资分布.jpg'):
    """Draw a histogram of estimated salaries and save it.

    Args:
        csv_path: CSV with a header row that includes a 'salary' column.
        out_path: image file the histogram is saved to before it is shown.
    """
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],  # font that can render Chinese
        'axes.unicode_minus': False,    # avoid '-' showing as a box
        'font.family': 'SimHei',
        'axes.labelsize': 16,
        'xtick.labelsize': 14,
        'ytick.labelsize': 14,
        'legend.fontsize': 12,
        'figure.figsize': [15, 9],
    })
    df = pd.read_csv(csv_path, encoding='utf-8')
    # Rows whose salary cannot be parsed are left out of the histogram.
    estimates = (estimate_salary(text) for text in df['salary'])
    avg_salary = [value for value in estimates if value is not None]
    plt.hist(avg_salary, bins=8, facecolor='#ff6700', edgecolor='blue')
    plt.xlabel('薪资(单位/千元)')
    plt.ylabel('频数/频率')
    plt.title('python薪资直方图')
    plt.savefig(out_path)
    plt.show()


if __name__ == '__main__':
    plot_salary_histogram()
ps:忙于写作业,不能完善,还参考了好几篇博客,如有遗忘,请私聊我