导入包
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
导入数据各个海滨城市数据
In [2]:
milano1 = pd.read_csv('milano_250715.csv')
milano2 = pd.read_csv('milano_150715.csv')
milano3 = pd.read_csv('milano_270615.csv')
#数据导入成功之后,需要合并
In [3]:
display(milano1.head(),milano2.head(),milano3.head())
#合并之前,看看数据的列是否完全一样
| Unnamed: 0 | temp | humidity | pressure | description | dt | wind_speed | wind_deg | city | day | dist |
0 | 0 | 29.50 | 48 | 1011 | few clouds | 1437730849 | 3.6 | 90 | Milano | 2015-07-24 11:40:49 | 250 |
1 | 1 | 30.81 | 45 | 1011 | few clouds | 1437734492 | 2.6 | 70 | Milano | 2015-07-24 12:41:32 | 250 |
2 | 2 | 31.91 | 45 | 1010 | proximity shower rain | 1437738045 | 3.1 | 80 | Milano | 2015-07-24 13:40:45 | 250 |
3 | 3 | 32.72 | 40 | 1009 | proximity shower rain | 1437741578 | 3.6 | 130 | Milano | 2015-07-24 14:39:38 | 250 |
4 | 4 | 33.44 | 38 | 1009 | Sky is Clear | 1437745188 | 3.6 | 130 | Milano | 2015-07-24 15:39:48 | 250 |
| Unnamed: 0 | temp | humidity | pressure | description | dt | wind_speed | wind_deg | city | day | dist |
0 | 0 | 28.57 | 54 | 1016 | Sky is Clear | 1436863175 | 2.1 | 100 | Milano | 2015-07-14 10:39:35 | 250 |
1 | 1 | 29.74 | 48 | 1016 | Sky is Clear | 1436866758 | 2.6 | 0 | Milano | 2015-07-14 11:39:18 | 250 |
2 | 2 | 31.12 | 48 | 1016 | Sky is Clear | 1436870509 | 2.6 | 140 | Milano | 2015-07-14 12:41:49 | 250 |
3 | 3 | 32.16 | 45 | 1015 | Sky is Clear | 1436874098 | 2.1 | 0 | Milano | 2015-07-14 13:41:38 | 250 |
4 | 4 | 33.59 | 43 | 1015 | Sky is Clear | 1436877644 | 3.1 | 80 | Milano | 2015-07-14 14:40:44 | 250 |
| Unnamed: 0 | temp | humidity | pressure | description | dt | wind_speed | wind_deg | city | day | dist |
0 | 0 | 24.69 | 60 | 1017 | Sky is Clear | 1435390925 | 2.6 | 140 | Milano | 2015-06-27 09:42:05 | 250 |
1 | 1 | 25.34 | 57 | 1017 | Sky is Clear | 1435394243 | 2.1 | 160 | Milano | 2015-06-27 10:37:23 | 250 |
2 | 2 | 27.70 | 51 | 1017 | Sky is Clear | 1435399015 | 1.5 | 210 | Milano | 2015-06-27 11:56:55 | 250 |
3 | 3 | 28.36 | 42 | 1017 | Sky is Clear | 1435402416 | 2.1 | 220 | Milano | 2015-06-27 12:53:36 | 250 |
4 | 4 | 29.45 | 42 | 1016 | few clouds | 1435406054 | 2.6 | 210 | Milano | 2015-06-27 13:54:14 | 250 |
In [4]:
milano = pd.concat([milano1,milano2,milano3],ignore_index=True)
#ignore_index=True,重新排列
In [5]:
#查看有多少数据
display(milano.shape,milano.tail())
(66, 11)
| Unnamed: 0 | temp | humidity | pressure | description | dt | wind_speed | wind_deg | city | day | dist |
61 | 13 | 20.27 | 68 | 1017 | Sky is Clear | 1435453226 | 2.6 | 90 | Milano | 2015-06-28 03:00:26 | 250 |
62 | 14 | 19.62 | 72 | 1017 | Sky is Clear | 1435456482 | 2.6 | 50 | Milano | 2015-06-28 03:54:42 | 250 |
63 | 15 | 18.68 | 72 | 1017 | Sky is Clear | 1435460036 | 2.6 | 40 | Milano | 2015-06-28 04:53:56 | 250 |
64 | 16 | 18.28 | 72 | 1017 | Sky is Clear | 1435463874 | 2.6 | 40 | Milano | 2015-06-28 05:57:54 | 250 |
65 | 17 | 18.86 | 77 | 1017 | Sky is Clear | 1435467177 | 0.5 | 0 | Milano | 2015-06-28 06:52:57 | 250 |
In [6]:
asti1 = pd.read_csv('asti_250715.csv')
asti2 = pd.read_csv('asti_150715.csv')
asti3 = pd.read_csv('asti_270615.csv')
asti = pd.concat([asti1,asti2,asti3],ignore_index=True)
bologna1 = pd.read_csv('bologna_250715.csv')
bologna2 = pd.read_csv('bologna_150715.csv')
bologna3 = pd.read_csv('bologna_270615.csv')
bologna = pd.concat([bologna1,bologna2,bologna3],ignore_index=True)
cesena1 = pd.read_csv('cesena_250715.csv')
cesena2 = pd.read_csv('cesena_150715.csv')
cesena3 = pd.read_csv('cesena_270615.csv')
cesena = pd.concat([cesena1,cesena2,cesena3],ignore_index=True)
faenza1 = pd.read_csv('faenza_250715.csv')
faenza2 = pd.read_csv('faenza_150715.csv')
faenza3 = pd.read_csv('faenza_270615.csv')
faenza = pd.concat([faenza1,faenza2,faenza3],ignore_index=True)
ferrara1 = pd.read_csv('ferrara_250715.csv')
ferrara2 = pd.read_csv('ferrara_150715.csv')
ferrara3 = pd.read_csv('ferrara_270615.csv')
ferrara = pd.concat([ferrara1,ferrara2,ferrara3],ignore_index=True)
mantova1 = pd.read_csv('mantova_250715.csv')
mantova2 = pd.read_csv('mantova_150715.csv')
mantova3 = pd.read_csv('mantova_270615.csv')
mantova = pd.concat([mantova1,mantova2,mantova3],ignore_index=True)
piacenza1 = pd.read_csv('piacenza_250715.csv')
piacenza2 = pd.read_csv('piacenza_150715.csv')
piacenza3 = pd.read_csv('piacenza_270615.csv')
piacenza = pd.concat([piacenza1,piacenza2,piacenza3],ignore_index=True)
ravenna1 = pd.read_csv('ravenna_250715.csv')
ravenna2 = pd.read_csv('ravenna_150715.csv')
ravenna3 = pd.read_csv('ravenna_270615.csv')
ravenna = pd.concat([ravenna1,ravenna2,ravenna3],ignore_index=True)
torino1 = pd.read_csv('torino_250715.csv')
torino2 = pd.read_csv('torino_150715.csv')
torino3 = pd.read_csv('torino_270615.csv')
torino = pd.concat([torino1,torino2,torino3],ignore_index=True)
查看行数
In [7]:
print('asti',asti.shape)
print('bologna',bologna.shape)
print('cesena',cesena.shape)
print('faenza',faenza.shape)
print('ferrara',ferrara.shape)
print('mantova',mantova.shape)
print('milano',milano.shape)
print('piacenza',piacenza.shape)
print('ravenna',ravenna.shape)
print('torino',torino.shape)
asti (68, 11)
bologna (68, 11)
cesena (68, 11)
faenza (67, 11)
ferrara (68, 11)
mantova (68, 11)
milano (66, 11)
piacenza (68, 11)
ravenna (66, 11)
torino (68, 11)
去除没用的列
In [8]:
milano.columns
Out[8]:
Index(['Unnamed: 0', 'temp', 'humidity', 'pressure', 'description', 'dt',
'wind_speed', 'wind_deg', 'city', 'day', 'dist'],
dtype='object')
In [9]:
#如何批量删除“没用的列”呢?
#先把所有的城市组合起来
cities = [asti,bologna,cesena,faenza,ferrara,mantova,milano,piacenza,ravenna,torino]
#然后通过for循环,删除,注意‘Unnamed: 0’
for c in cities:
c.drop('Unnamed: 0',axis = 1,inplace = True)
In [10]:
#查看删除结果
milano
# temp 该城市的 温度
# humidity 湿度
# pressure 压强
# description 描述
# dt 时间
# wind_speed 风速
# wind_deg 风向
# city 城市名称
# day 采集日期
# dist 距离 (海里)
Out[10]:
| temp | humidity | pressure | description | dt | wind_speed | wind_deg | city | day | dist |
0 | 29.50 | 48 | 1011 | few clouds | 1437730849 | 3.6 | 90 | Milano | 2015-07-24 11:40:49 | 250 |
1 | 30.81 | 45 | 1011 | few clouds | 1437734492 | 2.6 | 70 | Milano | 2015-07-24 12:41:32 | 250 |
2 | 31.91 | 45 | 1010 | proximity shower rain | 1437738045 | 3.1 | 80 | Milano | 2015-07-24 13:40:45 | 250 |
3 | 32.72 | 40 | 1009 | proximity shower rain | 1437741578 | 3.6 | 130 | Milano | 2015-07-24 14:39:38 | 250 |
4 | 33.44 | 38 | 1009 | Sky is Clear | 1437745188 | 3.6 | 130 | Milano | 2015-07-24 15:39:48 | 250 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
61 | 20.27 | 68 | 1017 | Sky is Clear | 1435453226 | 2.6 | 90 | Milano | 2015-06-28 03:00:26 | 250 |
62 | 19.62 | 72 | 1017 | Sky is Clear | 1435456482 | 2.6 | 50 | Milano | 2015-06-28 03:54:42 | 250 |
63 | 18.68 | 72 | 1017 | Sky is Clear | 1435460036 | 2.6 | 40 | Milano | 2015-06-28 04:53:56 | 250 |
64 | 18.28 | 72 | 1017 | Sky is Clear | 1435463874 | 2.6 | 40 | Milano | 2015-06-28 05:57:54 | 250 |
65 | 18.86 | 77 | 1017 | Sky is Clear | 1435467177 | 0.5 | 0 | Milano | 2015-06-28 06:52:57 | 250 |
66 rows × 10 columns
各城市与海洋距离,最高温度,最低温度,最高湿度,最低湿度
In [11]:
#定义,各城市与海洋距离,最高温度,最低温度,最高湿度,最低湿度
dists = []
temp_max = []
temp_min = []
hum_max = []
hum_min = []
In [12]:
#通过for循环获需要的数据
for city in cities:
dists.append(city['dist'][0])
temp_max.append(city['temp'].max())
temp_min.append(city['temp'].min())
hum_max.append(city['humidity'].max())
hum_min.append(city['humidity'].min())
In [13]:
#随意看看一个数据
temp_max
Out[13]:
[34.31,
33.85000000000002,
32.81,
32.74000000000001,
33.43000000000001,
34.18000000000001,
34.81,
33.920000000000016,
32.79000000000002,
34.69]
In [14]:
display(len(temp_max),len(cities))
10
10
显示最高温度与离海远近的关系
In [15]:
#x轴-离海远近 dists,y轴-最高温度 temp_max
plt.plot(dists,temp_max)
Out[15]:
[<matplotlib.lines.Line2D at 0x7fa40a9e76d8>]
In [16]:
#使用点来显示,用到scatter
plt.scatter(dists,temp_max)
Out[16]:
<matplotlib.collections.PathCollection at 0x7fa4028c3ac8>
观察发现,离海近的可以形成一条直线,离海远的也能形成一条直线。
首先使用numpy:把列表转换为numpy数组,用于后续计算。
分别以100公里和50公里为分界点,划分为离海近和离海远的两组数据
In [17]:
#100公里之外的数据
#判断之前,需要将(dists,temp_max)列表型的数据,转换为numpy数组
x = np.array(dists)
y = np.array(temp_max)
#x1表示距离小于100公里的海滨城市
x1 = x[x<100]
#y1表示距离小于100公里城市的温度
y1 = y[x<100]
print('距离小于100公里,对应的温度')
print(x1,y1)
#x2表示距离大于50公里的海滨城市
x2 = x[x>50]
#y1表示距离大于50公里城市的温度
y2 = y[x>50]
print('距离大于50公里,对应的温度')
print(x2,y2)
距离小于100公里,对应的温度
[71 14 37 47 8] [33.85 32.81 32.74 33.43 32.79]
距离大于50公里,对应的温度
[315 71 121 250 200 357] [34.31 33.85 34.18 34.81 33.92 34.69]
In [18]:
#将数据转换为二维
x1 = x1.reshape(5,1)
y1 = y1.reshape(5,1)
x2 = x2.reshape(6,1)
y2 = y2.reshape(6,1)
display(x1,y1,x2,y2)
array([[71],
[14],
[37],
[47],
[ 8]])
array([[33.85],
[32.81],
[32.74],
[33.43],
[32.79]])
array([[315],
[ 71],
[121],
[250],
[200],
[357]])
array([[34.31],
[33.85],
[34.18],
[34.81],
[33.92],
[34.69]])
使用支持向量机计算回归参数
In [19]:
#机器学习的模型
from sklearn.svm import SVR
In [20]:
#创建算法
svr1 = SVR(kernel='linear')
svr2 = SVR(kernel='linear')
In [21]:
#将数据交给算法fit == feed
svr1.fit(x1,y1)
svr2.fit(x2,y2)
/home/ccoy/.local/lib/python3.7/site-packages/sklearn/utils/validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
/home/ccoy/.local/lib/python3.7/site-packages/sklearn/utils/validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
Out[21]:
SVR(kernel='linear')
In [22]:
#预测数据
x_test1 = np.linspace(0,100,20).reshape(20,1)
x_test2 = np.linspace(50,350,20).reshape(20,1)
display(x_test1,x_test2)
array([[ 0. ],
[ 5.26315789],
[ 10.52631579],
[ 15.78947368],
[ 21.05263158],
[ 26.31578947],
[ 31.57894737],
[ 36.84210526],
[ 42.10526316],
[ 47.36842105],
[ 52.63157895],
[ 57.89473684],
[ 63.15789474],
[ 68.42105263],
[ 73.68421053],
[ 78.94736842],
[ 84.21052632],
[ 89.47368421],
[ 94.73684211],
[100. ]])
array([[ 50. ],
[ 65.78947368],
[ 81.57894737],
[ 97.36842105],
[113.15789474],
[128.94736842],
[144.73684211],
[160.52631579],
[176.31578947],
[192.10526316],
[207.89473684],
[223.68421053],
[239.47368421],
[255.26315789],
[271.05263158],
[286.84210526],
[302.63157895],
[318.42105263],
[334.21052632],
[350. ]])
In [23]:
y1_ = svr1.predict(x_test1)
y2_ = svr2.predict(x_test2)
display(y1_,y2_)
array([32.55539683, 32.64395155, 32.73250627, 32.82106099, 32.90961571,
32.99817043, 33.08672515, 33.17527987, 33.26383459, 33.35238931,
33.44094403, 33.52949875, 33.61805347, 33.70660819, 33.79516291,
33.88371763, 33.97227235, 34.06082707, 34.14938179, 34.23793651])
array([33.90300699, 33.93834008, 33.97367317, 34.00900626, 34.04433934,
34.07967243, 34.11500552, 34.15033861, 34.1856717 , 34.22100478,
34.25633787, 34.29167096, 34.32700405, 34.36233714, 34.39767022,
34.43300331, 34.4683364 , 34.50366949, 34.53900258, 34.57433566])
In [24]:
#真实数据,点
plt.scatter(dists,temp_max)
#机器学习模拟,使用线
plt.plot(x_test1,y1_,'r')
plt.plot(x_test2,y2_,'b')
Out[24]:
[<matplotlib.lines.Line2D at 0x7fa3eb8e2160>]
查看最低温度与海洋距离的关系
In [25]:
plt.scatter(dists,temp_min)
Out[25]:
<matplotlib.collections.PathCollection at 0x7fa3eb844390>
最低湿度与海洋距离的关系
In [26]:
plt.scatter(dists,hum_min)
Out[26]:
<matplotlib.collections.PathCollection at 0x7fa3eb880f60>
最高湿度与海洋距离的关系
In [27]:
plt.scatter(dists,hum_max)
Out[27]:
<matplotlib.collections.PathCollection at 0x7fa3eb783978>
平均湿度与海洋距离的关系
In [28]:
temp_mean = []
for city in cities:
temp_mean.append(city['temp'].mean())
temp_mean
Out[28]:
[26.216176470588252,
27.242352941176495,
26.82029411764708,
27.038805970149276,
27.390735294117665,
27.643676470588254,
26.705303030303053,
27.018529411764725,
26.948636363636385,
26.50764705882355]
In [29]:
plt.scatter(dists,temp_mean)
Out[29]:
<matplotlib.collections.PathCollection at 0x7fa3eb6f2ba8>
思考:模仿最高温度,得到平均湿度与海洋距离的回归曲线
风向与风速的关系
In [30]:
#milano城市,'ro',r代表红色,o代表圆点
plt.plot(milano['wind_deg'],milano['wind_speed'],'ro')
Out[30]:
[<matplotlib.lines.Line2D at 0x7fa3eb66d080>]
在子图中,同时比较风向与湿度和风力的关系
In [31]:
axes1 = plt.subplot(121)
axes1.scatter(milano['wind_deg'],milano['humidity'])
axes2 = plt.subplot(122)
axes2.scatter(milano['wind_deg'],milano['wind_speed'])
Out[31]:
<matplotlib.collections.PathCollection at 0x7fa3eb6036d8>
可以看到散点图显示效果不好
由于风向是360度,我们可以考虑使用玫瑰图(极坐标条形图)
首先自定义一个画图函数
In [32]:
def show_rose(values,title):
#玫瑰花瓣的个数8,(角度)45度
n = 8
angle = np.arange(0,2*np.pi,2*np.pi/n)
#绘制的数据values
radius = np.array(values)
#axis:轴(x,y轴)
#axes:整个画面
plt.axes([0,0,2,2],polar = True)
colors = np.random.random(size = 24).reshape(8,3)
plt.bar(angle,radius,color = colors)
plt.title(title,loc = 'left')
用numpy创建一个直方图,将360度划分为8个面元,将数据分类到这8个面元中
In [33]:
#milano城市
degree = milano['wind_deg']
d,b = np.histogram(degree,8,[0,360])
In [34]:
display(d,b)
array([21, 9, 9, 6, 14, 3, 2, 2])
array([ 0., 45., 90., 135., 180., 225., 270., 315., 360.])
In [35]:
show_rose(d,'milano')
计算米兰各个方向上的风速
In [36]:
print(milano[milano['wind_deg']<45]['wind_speed'].mean())
print(milano[(milano['wind_deg']>44)&(milano['wind_deg']<90)]['wind_speed'].mean())
print(milano[(milano['wind_deg']>89)&(milano['wind_deg']<135)]['wind_speed'].mean())
print(milano[(milano['wind_deg']>134)&(milano['wind_deg']<180)]['wind_speed'].mean())
print(milano[(milano['wind_deg']>179)&(milano['wind_deg']<225)]['wind_speed'].mean())
print(milano[(milano['wind_deg']>224)&(milano['wind_deg']<270)]['wind_speed'].mean())
print(milano[(milano['wind_deg']>269)&(milano['wind_deg']<315)]['wind_speed'].mean())
print(milano[milano['wind_deg']>314]['wind_speed'].mean())
1.8142857142857143
2.2222222222222223
2.855555555555556
2.583333333333333
2.3285714285714287
2.266666666666667
2.05
2.1
将各个方向的风速保存在列表中
In [37]:
degs = np.arange(45,361,45)
tmp = []
for deg in degs:
tmp.append(milano[(milano['wind_deg']>(deg-46))&(milano['wind_deg']<deg)]['wind_speed'].mean())
speeds = np.array(tmp)
print('各个方向的风速:',speeds)
各个方向的风速: [1.81428571 2.22222222 2.85555556 2.58333333 2.32857143 2.26666667
2.05 2.1 ]
画出各个方向的风速
In [38]:
show_rose(speeds,'milano')
将上面步骤写成函数
In [83]:
#定义某个城市,各个方向的风速函数,RoseWind_Speed
def RoseWind_Speed(city):
degs = np.arange(45,361,45)
tmp = []
for deg in degs:
tmp.append(city[(city['wind_deg']>(deg-46))&(city['wind_deg']<deg)]['wind_speed'].mean())
return np.array(tmp)
#定义定义一个画图函数,showRoseWind_Speed
def showRoseWind_Speed(speeds,city_name):
N = 8
theta = np.arange(0.,2 * np.pi, 2 * np.pi / N)
radii = np.array(speeds)
plt.axes([0,0,2,2], polar=True)
colors = np.random.random(size = 24).reshape(8,3)
bars = plt.bar(theta, radii, width=(2*np.pi/N), bottom=0.0, color=colors)
plt.title(city_name,x=0.2, fontsize=20)
In [74]:
RoseWind_Speed(ravenna)
#ravenna返回值中含有空值nan,调用showRoseWind_Speed函数会报错
Out[74]:
array([3.2 , 4.0925 , 2.62583333, 1.52888889, 1.78909091,
2.71142857, nan, 2.11 ])
In [75]:
a = RoseWind_Speed(ravenna)
a.shape
Out[75]:
(8,)
In [84]:
showRoseWind_Speed(RoseWind_Speed(milano),'Milano')
In [85]:
showRoseWind_Speed(RoseWind_Speed(bologna),'Bologna')
仅供参考学习,严禁转载!