python爬取分页数据并画图
爬取贵州农经网并分析画图
案例详情
爬取鲜鸡蛋在最近一年的销售价格、销售地点等信息,并存放于 excel 表格中;然后根据表格将数据按照不同的标准分组,取出所需要的数据进行画图,从而更加直观地了解数据。
解题思路
1、因为数据较多,所以采用分页爬取:使用循环得到每一页的 url,然后根据 url 获取每一页的数据信息。
2、将爬取到的各列数据转换为 DataFrame,并保存为 excel 文件(需要安装 openpyxl)。
# Browser-like User-Agent header sent with every page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
}

# One accumulator per output column; every fetched row appends to all four.
list1 = []  # row['v0027'] -> 'object' column
list2 = []  # row['v0005'] -> 'price' column
list3 = []  # row['v0031'] -> 'place' column
list4 = []  # row['v0004'] -> 'date' column

# Request pages 1..20 (pageSize=20, so at most 400 rows in total).
for i in range(20):
    pageNum = i + 1
    url = r"https://www.gznw.com/eportal/ui?moduleId=ab59857100d84dcca372ff4473198d88&struts.portlet.mode=view&struts.portlet.action=/portlet/priceFront!queryFrontList.action&pageSize=20&" + "pageNum=%d" % pageNum
    url = url + "&recruitType=1&productName=%E9%B2%9C%E9%B8%A1%E8%9B%8B&areaCode=22572&startTime=20220416&endTime=20230416"
    # Without a timeout, requests can wait forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Surface HTTP errors directly instead of failing later while parsing JSON.
    response.raise_for_status()
    response.encoding = "utf-8"
    payload = json.loads(response.text)
    for row in payload['rows']:
        list1.append(row['v0027'])
        list2.append(row['v0005'])
        list3.append(row['v0031'])
        list4.append(row['v0004'])

# Convert each list to a Series
ser1 = pandas.Series(list1)
ser2 = pandas.Series(list2)
ser3 = pandas.Series(list3)
ser4 = pandas.Series(list4)
# Combine the Series into a single DataFrame
dat = pandas.DataFrame({'object':ser1,'price':ser2,'place':ser3,'date':ser4})
# Save as an Excel file (requires openpyxl to be installed)
dat.to_excel('new.xlsx')
3.这里爬取到的日期字符串很长,而我们只需要其中一部分,所以只取日期的年月日(即前 10 个字符)。
# Load the scraped rows back from the spreadsheet
df = pd.read_excel("new.xlsx")
# print(df)
# Keep only the year-month-day part: the first 10 characters of each date string
date_text = df['date']
df['date'] = date_text.str.slice(0, 10)
# print(df['date'])
4.分析数据并画图:在这里我们根据市场(也就是地点)将数据分组(groupby),然后为每个分组画图,横坐标为日期,纵坐标为价格。
# Group by place; each group gets its own subplot with date on the x-axis
# and price on the y-axis.
plt.figure(figsize=(4,5),dpi=80,facecolor="green")
# Grouping by a plain column name (not a one-element list) keeps each group
# key a bare place value instead of a 1-tuple on recent pandas versions.
group1 = df.groupby('place')
# Subplots fill the 2-column grid row by row, so only half as many rows as
# groups (rounded up) are needed; using len(group1) rows would leave the
# lower half of the figure empty.
imgRow = (len(group1) + 1) // 2
imgIndex = 1
for name,value in group1:
    print(name)
    print(value['date'],value['price'])
    ax = plt.subplot(imgRow, 2, imgIndex,label=name)
    ax.plot(value['date'],value['price'])
    imgIndex = imgIndex+1
    print(imgIndex)
plt.show()
完整代码
import pandas as pd
from pylab import *
import json
import pandas
import requests
# Browser-like User-Agent header sent with every page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
}

# One accumulator per output column; every fetched row appends to all four.
list1 = []  # row['v0027'] -> 'object' column
list2 = []  # row['v0005'] -> 'price' column
list3 = []  # row['v0031'] -> 'place' column
list4 = []  # row['v0004'] -> 'date' column

# Request pages 1..20 (pageSize=20, so at most 400 rows in total).
for i in range(20):
    pageNum = i + 1
    url = r"https://www.gznw.com/eportal/ui?moduleId=ab59857100d84dcca372ff4473198d88&struts.portlet.mode=view&struts.portlet.action=/portlet/priceFront!queryFrontList.action&pageSize=20&" + "pageNum=%d" % pageNum
    url = url + "&recruitType=1&productName=%E9%B2%9C%E9%B8%A1%E8%9B%8B&areaCode=22572&startTime=20220416&endTime=20230416"
    # Without a timeout, requests can wait forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Surface HTTP errors directly instead of failing later while parsing JSON.
    response.raise_for_status()
    response.encoding = "utf-8"
    payload = json.loads(response.text)
    for row in payload['rows']:
        list1.append(row['v0027'])
        list2.append(row['v0005'])
        list3.append(row['v0031'])
        list4.append(row['v0004'])

# Convert each list to a Series
ser1 = pandas.Series(list1)
ser2 = pandas.Series(list2)
ser3 = pandas.Series(list3)
ser4 = pandas.Series(list4)
# Combine the Series into a single DataFrame
dat = pandas.DataFrame({'object':ser1,'price':ser2,'place':ser3,'date':ser4})
# Save as an Excel file (requires openpyxl to be installed)
dat.to_excel('new.xlsx')
#
# # 读取日期
# data = pandas.read_excel("new.xlsx")
#
# # 仅保留年月日
# data['date'] = data.date.str.slice(0, 11)
# print(data.values[:,4])
# Load the spreadsheet that the scraping step saved with
# dat.to_excel('new.xlsx'); the file name must match that call.
df = pd.read_excel("new.xlsx")
# print(df)
# Keep only the year-month-day part: the first 10 characters of each date string
df['date'] = df['date'].str[0:10]
# print(df['date'])
# Group by place; each group gets its own subplot with date on the x-axis
# and price on the y-axis.
plt.figure(figsize=(4,5),dpi=80,facecolor="green")
# Grouping by a plain column name (not a one-element list) keeps each group
# key a bare place value instead of a 1-tuple on recent pandas versions.
group1 = df.groupby('place')
# Subplots fill the 2-column grid row by row, so only half as many rows as
# groups (rounded up) are needed; using len(group1) rows would leave the
# lower half of the figure empty.
imgRow = (len(group1) + 1) // 2
imgIndex = 1
for name,value in group1:
    print(name)
    print(value['date'],value['price'])
    ax = plt.subplot(imgRow, 2, imgIndex,label=name)
    ax.plot(value['date'],value['price'])
    imgIndex = imgIndex+1
    print(imgIndex)
plt.show()