import pandas as pd
import numpy as np
train=pd.read_csv('train.csv')
train.head()
| Date | stations | observation | 0 | 1 | 2 | 3 | 4 | 5 | 6 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
---|
0 | 2014/1/1 | station | AMB_TEMP | 14 | 14 | 14 | 13 | 12 | 12 | 12 | ... | 22 | 22 | 21 | 19 | 17 | 16 | 15 | 15 | 15 | 15 |
---|
1 | 2014/1/1 | station | CH4 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | ... | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 |
---|
2 | 2014/1/1 | station | CO | 0.51 | 0.41 | 0.39 | 0.37 | 0.35 | 0.3 | 0.37 | ... | 0.37 | 0.37 | 0.47 | 0.69 | 0.56 | 0.45 | 0.38 | 0.35 | 0.36 | 0.32 |
---|
3 | 2014/1/1 | station | NMHC | 0.2 | 0.15 | 0.13 | 0.12 | 0.11 | 0.06 | 0.1 | ... | 0.1 | 0.13 | 0.14 | 0.23 | 0.18 | 0.12 | 0.1 | 0.09 | 0.1 | 0.08 |
---|
4 | 2014/1/1 | station | NO | 0.9 | 0.6 | 0.5 | 1.7 | 1.8 | 1.5 | 1.9 | ... | 2.5 | 2.2 | 2.5 | 2.3 | 2.1 | 1.9 | 1.5 | 1.6 | 1.8 | 1.5 |
---|
5 rows × 27 columns
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4320 entries, 0 to 4319
Data columns (total 27 columns):
Date 4320 non-null object
stations 4320 non-null object
observation 4320 non-null object
0 4320 non-null object
1 4320 non-null object
2 4320 non-null object
3 4320 non-null object
4 4320 non-null object
5 4320 non-null object
6 4320 non-null object
7 4320 non-null object
8 4320 non-null object
9 4320 non-null object
10 4320 non-null object
11 4320 non-null object
12 4320 non-null object
13 4320 non-null object
14 4320 non-null object
15 4320 non-null object
16 4320 non-null object
17 4320 non-null object
18 4320 non-null object
19 4320 non-null object
20 4320 non-null object
21 4320 non-null object
22 4320 non-null object
23 4320 non-null object
dtypes: object(27)
memory usage: 911.3+ KB
train.observation.unique()
array(['AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx', 'O3', 'PM10',
'PM2.5', 'RAINFALL', 'RH', 'SO2', 'THC', 'WD_HR', 'WIND_DIREC',
'WIND_SPEED', 'WS_HR'], dtype=object)
# Keep only the rows whose observation type is PM2.5.
is_pm25 = train['observation'] == 'PM2.5'
train_PM = train[is_pm25]
train_PM.head()
| Date | stations | observation | 0 | 1 | 2 | 3 | 4 | 5 | 6 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
---|
9 | 2014/1/1 | station | PM2.5 | 26 | 39 | 36 | 35 | 31 | 28 | 25 | ... | 36 | 45 | 42 | 49 | 45 | 44 | 41 | 30 | 24 | 13 |
---|
27 | 2014/1/2 | station | PM2.5 | 21 | 23 | 30 | 30 | 22 | 18 | 13 | ... | 53 | 43 | 43 | 45 | 46 | 32 | 16 | 19 | 22 | 26 |
---|
45 | 2014/1/3 | station | PM2.5 | 19 | 25 | 27 | 20 | 16 | 14 | 15 | ... | 32 | 36 | 34 | 45 | 40 | 41 | 23 | 29 | 23 | 37 |
---|
63 | 2014/1/4 | station | PM2.5 | 27 | 27 | 14 | 20 | 22 | 24 | 26 | ... | 62 | 55 | 56 | 67 | 78 | 83 | 90 | 75 | 85 | 82 |
---|
81 | 2014/1/5 | station | PM2.5 | 80 | 80 | 76 | 81 | 75 | 66 | 70 | ... | 64 | 73 | 57 | 57 | 53 | 70 | 70 | 60 | 68 | 66 |
---|
5 rows × 27 columns
# Drop the first three columns by position (Date, stations, observation in the
# table shown above); what remains are the hourly columns labelled 0..23.
PM_data=train_PM.iloc[:,3:]
PM_data.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
---|
9 | 26 | 39 | 36 | 35 | 31 | 28 | 25 | 20 | 19 | 30 | ... | 36 | 45 | 42 | 49 | 45 | 44 | 41 | 30 | 24 | 13 |
---|
27 | 21 | 23 | 30 | 30 | 22 | 18 | 13 | 13 | 11 | 22 | ... | 53 | 43 | 43 | 45 | 46 | 32 | 16 | 19 | 22 | 26 |
---|
45 | 19 | 25 | 27 | 20 | 16 | 14 | 15 | 8 | 4 | 9 | ... | 32 | 36 | 34 | 45 | 40 | 41 | 23 | 29 | 23 | 37 |
---|
63 | 27 | 27 | 14 | 20 | 22 | 24 | 26 | 33 | 48 | 50 | ... | 62 | 55 | 56 | 67 | 78 | 83 | 90 | 75 | 85 | 82 |
---|
81 | 80 | 80 | 76 | 81 | 75 | 66 | 70 | 65 | 66 | 57 | ... | 64 | 73 | 57 | 57 | 53 | 70 | 70 | 60 | 68 | 66 |
---|
5 rows × 24 columns
# The hourly columns were loaded with object dtype; cast the whole frame to
# float so it can be used for arithmetic.
PM_data = PM_data.astype(float)
PM_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 240 entries, 9 to 4311
Data columns (total 24 columns):
0 240 non-null float64
1 240 non-null float64
2 240 non-null float64
3 240 non-null float64
4 240 non-null float64
5 240 non-null float64
6 240 non-null float64
7 240 non-null float64
8 240 non-null float64
9 240 non-null float64
10 240 non-null float64
11 240 non-null float64
12 240 non-null float64
13 240 non-null float64
14 240 non-null float64
15 240 non-null float64
16 240 non-null float64
17 240 non-null float64
18 240 non-null float64
19 240 non-null float64
20 240 non-null float64
21 240 non-null float64
22 240 non-null float64
23 240 non-null float64
dtypes: float64(24)
memory usage: 46.9 KB
# Build supervised samples with a sliding window over the hourly columns of
# PM_data: WINDOW consecutive hours are the features and the hour right after
# them is the target. Windows never cross a row boundary, so each row yields
# (number of columns - WINDOW) samples.
WINDOW = 9
n_windows = PM_data.shape[1] - WINDOW
train_x = []
train_y = []
for start in range(n_windows):
    # Copy the slice before relabelling so the rename never touches PM_data.
    features = PM_data.iloc[:, start:start + WINDOW].copy()
    # Give every window the same column labels so pd.concat stacks them
    # vertically instead of aligning on the original hour labels.
    features.columns = np.arange(WINDOW)
    # The target is a single column (a Series); it has no column labels to
    # reset, so it is appended as-is.
    target = PM_data.iloc[:, start + WINDOW]
    train_x.append(features)
    train_y.append(target)
train_x = pd.concat(train_x, axis=0)
train_y = pd.concat(train_y, axis=0)
train_x.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3600 entries, 9 to 4311
Data columns (total 9 columns):
0 3600 non-null float64
1 3600 non-null float64
2 3600 non-null float64
3 3600 non-null float64
4 3600 non-null float64
5 3600 non-null float64
6 3600 non-null float64
7 3600 non-null float64
8 3600 non-null float64
dtypes: float64(9)
memory usage: 281.2 KB
train_x.shape
(3600, 9)
train_y.shape
(3600,)
# Switch from pandas objects to plain NumPy arrays for the hand-written
# optimiser.
train_x = np.array(train_x)
train_y = np.array(train_y)
# Prepend a constant column of ones so the first weight acts as the bias term.
bias_column = np.ones((train_x.shape[0], 1))
train_x = np.hstack((bias_column, train_x))
train_x.shape
(3600, 10)
# Fit a linear model by full-batch gradient descent with Adagrad scaling:
# every weight's step is divided by the root of its accumulated squared
# gradients.
n_weights = train_x.shape[1]
w = np.zeros(n_weights)            # one weight per column of train_x
lr = 10                            # base learning rate
Iteration = 10000                  # number of full-batch updates
sum_gra = np.zeros(n_weights)      # running sum of squared gradients
for _ in range(Iteration):
    residual = train_x @ w - train_y
    # Gradient of 0.5 * sum(residual ** 2) with respect to w.
    grad = train_x.T @ residual
    sum_gra += grad ** 2
    step_scale = np.sqrt(sum_gra)
    w = w - lr * grad / step_scale
w
array([ 2.15246702, 0.00728964, -0.04603067, 0.19941492, -0.20757788,
-0.04384333, 0.46235285, -0.54329351, 0.01552538, 1.07716609])
# Load the test set.
# NOTE(review): read_csv treats the first line of the file as the header. The
# resulting column names (id_0, AMB_TEMP, 15, 14, 14.1, ...) look like a data
# row and 4319 rows are loaded, so the file probably has no header line and its
# first record is being consumed as column names. Consider header=None with
# explicit column names - confirm against the file before changing, because
# later code selects rows through the `AMB_TEMP` column name.
df_test=pd.read_csv('test(1).csv')
df_test.shape
(4319, 11)
# Keep only the PM2.5 rows of the test set. The column holding the observation
# type is named `AMB_TEMP` here because of the column names read_csv assigned
# when loading the file, not because it contains temperatures.
test_PM=df_test[df_test.AMB_TEMP=='PM2.5']
test_PM.head()
| id_0 | AMB_TEMP | 15 | 14 | 14.1 | 13 | 13.1 | 13.2 | 13.3 | 13.4 | 12 |
---|
8 | id_0 | PM2.5 | 27 | 13 | 24 | 29 | 41 | 30 | 29 | 27 | 28 |
---|
26 | id_1 | PM2.5 | 46 | 47 | 57 | 78 | 84 | 76 | 59 | 61 | 61 |
---|
44 | id_2 | PM2.5 | 10 | 10 | 25 | 34 | 40 | 39 | 36 | 25 | 22 |
---|
62 | id_3 | PM2.5 | 71 | 58 | 51 | 41 | 41 | 46 | 43 | 34 | 29 |
---|
80 | id_4 | PM2.5 | 13 | 23 | 18 | 10 | 5 | 5 | 13 | 9 | 12 |
---|
# Everything from the third column onward holds the hourly readings; pull them
# out as a float matrix.
test_x = test_PM.iloc[:, 2:].values.astype(float)
test_x.shape
(240, 9)
# Prepend a column of ones so the test matrix has the same layout as the
# training matrix (bias column first).
bias_column = np.ones((test_x.shape[0], 1))
test_x = np.hstack((bias_column, test_x))
test_x.shape
(240, 10)
# Predictions of the hand-fitted linear model for every test sample.
y_pre = test_x @ w
# Load the submission template that will receive the predictions.
y_submit = pd.read_csv('sampleSubmission.csv')
y_submit.head()
| id | value |
---|
0 | id_0 | 27.414421 |
---|
1 | id_1 | 61.555764 |
---|
2 | id_2 | 20.498032 |
---|
3 | id_3 | 29.534434 |
---|
4 | id_4 | 10.797670 |
---|
# Replace the template's `value` column with our predictions.
y_submit.value=y_pre
# NOTE(review): this writes to the same path the template was read from, so
# the original sample submission file is overwritten by the predictions.
y_submit.to_csv('sampleSubmission.csv',index=False)
# Ground-truth values used below to score the predictions.
y_real=pd.read_csv('answer.csv')
y_real.head()
| id | value |
---|
0 | id_0 | 33 |
---|
1 | id_1 | 60 |
---|
2 | id_2 | 16 |
---|
3 | id_3 | 33 |
---|
4 | id_4 | 5 |
---|
MAE
# Mean absolute error between the ground truth and the predictions.
absolute_errors = (y_real.value - y_pre).abs()
error_mae = absolute_errors.sum() / len(y_real)
error_mae
4.97442948413227
MSE
# Mean squared error between the ground truth and the predictions.
squared_errors = (y_real.value - y_pre) ** 2
error_mse = sum(squared_errors) / len(y_real)
error_mse
44.38203674794888
$R^2$
def r2_score(y_ture, y_predict):
    """Return the coefficient of determination R^2 of a set of predictions.

    R^2 = 1 - SS_res / SS_tot, where SS_res is the sum of squared residuals
    and SS_tot is the sum of squared deviations of the true values from
    their own mean.

    Args:
        y_ture: Ground-truth target values (array-like of numbers).
        y_predict: Predicted values, same length as ``y_ture``.

    Returns:
        The R^2 score as a float. When the true values have zero variance the
        ratio is undefined; in that case 1.0 is returned for a perfect
        prediction and 0.0 otherwise.
    """
    actual = np.asarray(y_ture, dtype=float)
    predicted = np.asarray(y_predict, dtype=float)
    residual_ss = np.sum((actual - predicted) ** 2)
    # The total sum of squares is centred on the mean of the TRUE values;
    # centring on the mean of the predictions gives a wrong score.
    total_ss = np.sum((actual - actual.mean()) ** 2)
    if total_ss == 0:
        # Constant targets: avoid dividing by zero.
        return 1.0 if residual_ss == 0 else 0.0
    return 1 - residual_ss / total_ss
error_r2=r2_score(y_real.value,y_pre)
error_r2
0.9094491525564087
使用sklearn
# Cross-check the hand-written optimiser against scikit-learn's ordinary
# least-squares estimator.
from sklearn import linear_model
reg=linear_model.LinearRegression()
# NOTE(review): train_x already starts with a column of ones, while the
# estimator is created with default settings (its repr shows
# fit_intercept=True). The ones column is therefore redundant here, which is
# consistent with the first entry of reg.coef_ being 0.
reg.fit(train_x,train_y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)
reg.coef_
array([ 0. , 0.00729085, -0.04603342, 0.1994169 , -0.20757678,
-0.0438468 , 0.46235542, -0.54329308, 0.01552321, 1.07716736])
# Predictions of the scikit-learn model on the same test matrix.
y_prediction=reg.predict(test_x)
# NOTE: this import rebinds the name `r2_score`; from here on it refers to
# scikit-learn's implementation, not the hand-written function defined earlier
# in this file.
from sklearn.metrics import r2_score
r2_score(y_real.value,y_prediction)
0.90944810736946002
r2_score(y_real.value,y_pre)
0.90944807944600869