爬取一下lagou的职位数据,并用饼图展示出来
整体的思路如下:
1.爬取拉勾网求职信息
(1)requests 请求,获取单页面
(2)分析页面加载,找到数据
(3)添加headers 信息,模仿浏览器请求
(4)解析页面,实现翻页爬取
(5)爬取数据存入json文件
2.数据分析与可视化
(1)分析数据
(2)清洗数据,matplotlib.pyplot绘制饼图
爬取拉勾网求职信息
(1)requests 请求,获取单页面
# 1. The Lagou position-search Ajax endpoint.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
# 2. Fire a bare POST request — deliberately without any headers.
response = requests.post(url)
# 3. Dump the raw response body to see what the server answers.
print(response.text)
由上面的流程,打印输出结果如下:
{"status":false,"msg":"您操作太频繁,请稍后再访问","clientIp":"223.75.51.17","state":2408}
原因是我们直接post访问url,服务器会把我们误认为‘机器人’,这也是一种反爬,解决方法是加一个请求头即可完全模拟浏览器请求
(2)分析页面加载
1.请求分析
在拉勾网首页,按F12进入开发者模式,然后在查询框中输入python,点击搜索
(3)添加headers 信息,模仿浏览器请求
(4)解析页面,实现翻页爬取
import requests
# NOTE: the Cookie header is mandatory — without it Lagou answers with the
# "您操作太频繁" anti-bot message instead of data.
headers = {
    'Host': 'www.lagou.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput=',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': None,
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie':'xxxxxxx'
}
# Accumulates the position dicts from every crawled page.
positions = []
# Crawl result pages 1..15 of the keyword search.
for x in range(1, 16):
    data = {
        'first': 'true',
        'pn': x,        # page number
        'kd': 'python'  # search keyword
    }
    res = requests.post(
        "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false",
        headers=headers, data=data)
(5)爬取数据存入json文件
这里可以直接存入csv文件中
import time
import json
# Parse the Ajax response of the current page (continues the loop above).
json_result = res.json()
print(json_result)
# The actual job postings live under content.positionResult.result.
page_positions = json_result['content']['positionResult']['result']
positions.extend(page_positions)
for position in positions:
    # Debug print of everything collected so far.
    print("-" * 40)
    print(position)
# Serialize the full list to a JSON string; keep non-ASCII (Chinese) readable.
line = json.dumps(positions, ensure_ascii=False)
# Save — note this rewrites the whole file with the cumulative list each pass.
with open('lagou.json', 'wb+') as fp:
    fp.write(line.encode('utf-8'))
# Throttle between page requests to avoid triggering the anti-bot block.
time.sleep(3)
简单展示一下爬取到的数据:
数据分析与可视化
(1)分析数据,将json数据根据需要转换为csv
import csv
import json


def load_json(jsonpath):
    """Read a JSON file and return the parsed object.

    Lines whose stripped content starts with ``//`` are skipped, so the
    file may contain comment lines that plain ``json.loads`` would reject.
    """
    lines = []
    with open(jsonpath, 'r', encoding='utf8') as f:
        for row in f.readlines():
            # Filter out //-style comment lines before parsing.
            if row.strip().startswith("//"):
                continue
            lines.append(row)
    # The with-block closes the file; no explicit close needed.
    return json.loads("\n".join(lines))


def trans(jsonpath, csvpath):
    """Convert a JSON array of flat objects into a CSV file.

    The first object's keys become the header row; every object
    (including the first) contributes one data row.
    """
    # Bug fix: the original called JsonToCvs.load_json, but no such class
    # exists — load_json is a plain module-level function.
    records = load_json(jsonpath)
    # newline='' per the csv module docs; utf8 keeps Chinese text intact.
    with open(csvpath, 'w', newline='', encoding='utf8') as csv_file:
        f_csv = csv.writer(csv_file)
        header_written = False
        for v in records:
            if not header_written:
                # Take the column names from the first record.
                f_csv.writerow(list(v.keys()))
                header_written = True
            # Bug fix: the original wrote only the header for the first
            # record and silently dropped its values; write every row.
            f_csv.writerow(list(v.values()))
(2)清洗数据,matplotlib.pyplot绘制"职位学历"要求饼图
import pandas as pd
import matplotlib.pyplot as plt

# Path of the crawled job data exported to CSV.
csvpath = "E:/workspace/TestPython/test_data/jobs.csv"
# Fix: reuse csvpath instead of a second hard-coded copy of the same path.
df = pd.read_csv(csvpath, encoding='utf-8')
print(df)

# Drop every column irrelevant to the education analysis.
# errors='ignore' keeps this from raising if a column is absent in the file.
df.drop(['companyShortName','companyLogo','financeStage','companyLabelList','thirdType','skillLables','industryLables','createTime','formatCreateTime','district','salaryMonth','jobNature','imState','lastLogin','publisherId','approve','subwayline','stationname','linestaion', 'latitude','longitude','distance','hitags','resumeProcessRate','resumeProcessDay','score', 'newScore', 'matchScore', 'matchScoreExplain','query','explain','isSchoolJob','adWord','plus','pcShow','appShow','deliver','gradeDescription','promotionScoreExplain','isHotHire','count','aggregatePositionIds','promotionType','is51Job','famousCompany','detailRecall','hunterJob'], axis=1, inplace=True, errors='ignore')
print(df)

# Inspect size and check for duplicates: a posting is a duplicate when its
# positionId repeats.
df.shape
print(df['positionId'].nunique())
df.drop_duplicates(subset=['positionId'], inplace=True)

# Count postings per education requirement, keeping first-seen order.
edu = df.groupby(['education'], sort=False)['positionId'].count()
print(edu)

# Allow Chinese glyphs to render in matplotlib labels.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']

# Pie chart. Fix: take labels from edu's own index so they are guaranteed to
# line up with the counts — df.education.unique() only coincidentally matched.
size = edu.values
label = edu.index
plt.pie(size, labels=label, autopct='%1.1f%%')
plt.axis('equal')  # equal aspect ratio so the pie renders as a circle
plt.title("数据分析")
plt.show()