手写梯度下降

import pandas as pd
import numpy as np
# Load the raw air-quality training data (one row per date/station/measured quantity,
# with 24 hourly reading columns).
train=pd.read_csv('train.csv')
train.head()

Datestationsobservation0123456...14151617181920212223
02014/1/1stationAMB_TEMP14141413121212...22222119171615151515
12014/1/1stationCH41.81.81.81.81.81.81.8...1.81.81.81.81.81.81.81.81.81.8
22014/1/1stationCO0.510.410.390.370.350.30.37...0.370.370.470.690.560.450.380.350.360.32
32014/1/1stationNMHC0.20.150.130.120.110.060.1...0.10.130.140.230.180.120.10.090.10.08
42014/1/1stationNO0.90.60.51.71.81.51.9...2.52.22.52.32.11.91.51.61.81.5

5 rows × 27 columns

train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4320 entries, 0 to 4319
Data columns (total 27 columns):
Date           4320 non-null object
stations       4320 non-null object
observation    4320 non-null object
0              4320 non-null object
1              4320 non-null object
2              4320 non-null object
3              4320 non-null object
4              4320 non-null object
5              4320 non-null object
6              4320 non-null object
7              4320 non-null object
8              4320 non-null object
9              4320 non-null object
10             4320 non-null object
11             4320 non-null object
12             4320 non-null object
13             4320 non-null object
14             4320 non-null object
15             4320 non-null object
16             4320 non-null object
17             4320 non-null object
18             4320 non-null object
19             4320 non-null object
20             4320 non-null object
21             4320 non-null object
22             4320 non-null object
23             4320 non-null object
dtypes: object(27)
memory usage: 911.3+ KB
train.observation.unique()
array(['AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx', 'O3', 'PM10',
       'PM2.5', 'RAINFALL', 'RH', 'SO2', 'THC', 'WD_HR', 'WIND_DIREC',
       'WIND_SPEED', 'WS_HR'], dtype=object)
# Keep only the rows whose measured quantity is PM2.5.
train_PM = train.loc[train['observation'] == 'PM2.5']
train_PM.head()
Datestationsobservation0123456...14151617181920212223
92014/1/1stationPM2.526393635312825...36454249454441302413
272014/1/2stationPM2.521233030221813...53434345463216192226
452014/1/3stationPM2.519252720161415...32363445404123292337
632014/1/4stationPM2.527271420222426...62555667788390758582
812014/1/5stationPM2.580807681756670...64735757537070606866

5 rows × 27 columns

# Drop the Date/stations/observation label columns, keeping only the 24 hourly readings.
PM_data=train_PM.iloc[:,3:]
PM_data.head()
0123456789...14151617181920212223
926393635312825201930...36454249454441302413
2721233030221813131122...53434345463216192226
4519252720161415849...32363445404123292337
6327271420222426334850...62555667788390758582
8180807681756670656657...64735757537070606866

5 rows × 24 columns

# The CSV was read as strings (dtype object); convert every hourly reading to float64.
# DataFrame.astype converts all columns in one call -- no need for a per-column
# apply with a lambda.
PM_data = PM_data.astype('float')
PM_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 240 entries, 9 to 4311
Data columns (total 24 columns):
0     240 non-null float64
1     240 non-null float64
2     240 non-null float64
3     240 non-null float64
4     240 non-null float64
5     240 non-null float64
6     240 non-null float64
7     240 non-null float64
8     240 non-null float64
9     240 non-null float64
10    240 non-null float64
11    240 non-null float64
12    240 non-null float64
13    240 non-null float64
14    240 non-null float64
15    240 non-null float64
16    240 non-null float64
17    240 non-null float64
18    240 non-null float64
19    240 non-null float64
20    240 non-null float64
21    240 non-null float64
22    240 non-null float64
23    240 non-null float64
dtypes: float64(24)
memory usage: 46.9 KB
# Slide a 9-hour window across each day's 24 hourly PM2.5 readings:
# hours [i, i+9) are the features and hour i+9 is the target,
# giving 15 (X, y) windows per day.
feature_frames = []
target_parts = []
for start in range(15):
    window = PM_data.iloc[:, start:start + 9]
    window.columns = np.arange(9)          # normalise column labels to 0..8
    target = PM_data.iloc[:, start + 9]
    target.columns = np.arange(1)
    feature_frames.append(window)
    target_parts.append(target)
train_x = pd.concat(feature_frames, axis=0)
train_y = pd.concat(target_parts, axis=0)
train_x.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3600 entries, 9 to 4311
Data columns (total 9 columns):
0    3600 non-null float64
1    3600 non-null float64
2    3600 non-null float64
3    3600 non-null float64
4    3600 non-null float64
5    3600 non-null float64
6    3600 non-null float64
7    3600 non-null float64
8    3600 non-null float64
dtypes: float64(9)
memory usage: 281.2 KB
train_x.shape
(3600, 9)
train_y.shape
(3600,)
# Convert to numpy and prepend a column of ones so that w[0] acts as the
# intercept (bias) term of the linear model.
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
train_x = np.hstack([np.ones((len(train_x), 1)), train_x])
train_x.shape
(3600, 10)
# Linear regression fitted by full-batch gradient descent with Adagrad.
# Adagrad rescales the step per parameter by the root of the accumulated
# squared gradients, which is what lets the large base rate (lr=10) stay stable.
w = np.zeros(train_x.shape[1])        # weights, incl. bias at index 0
lr = 10
Iteration = 10000
sum_gra = np.zeros(train_x.shape[1])  # running sum of squared gradients
eps = 1e-8                            # guard: avoids 0/0 -> NaN if a feature's
                                      # accumulated gradient is exactly zero
for i in range(Iteration):
    y_new = np.dot(train_x, w)        # current predictions
    loss = y_new - train_y            # residuals
    # Gradient of the squared-error loss in matrix form: X^T (Xw - y)
    gra = np.dot(train_x.transpose(), loss)
    sum_gra += gra ** 2
    ada = np.sqrt(sum_gra) + eps      # Adagrad denominator
    w = w - lr * gra / ada
w
array([ 2.15246702,  0.00728964, -0.04603067,  0.19941492, -0.20757788,
       -0.04384333,  0.46235285, -0.54329351,  0.01552538,  1.07716609])
# Load the test set. NOTE(review): the shape (4319, 11) and the 'AMB_TEMP'
# column label below suggest the file has no header row, so pandas promoted
# the first data row to column names -- confirm against the raw CSV.
df_test=pd.read_csv('test(1).csv')
df_test.shape
(4319, 11)
# Keep only PM2.5 rows; the measurement-name column is labelled 'AMB_TEMP'
# (presumably because the file's first data row became the header -- verify).
test_PM=df_test[df_test.AMB_TEMP=='PM2.5']
test_PM.head()
id_0AMB_TEMP151414.11313.113.213.313.412
8id_0PM2.5271324294130292728
26id_1PM2.5464757788476596161
44id_2PM2.5101025344039362522
62id_3PM2.5715851414146433429
80id_4PM2.5132318105513912
# Columns 2 onward hold the 9 hourly PM2.5 readings; cast the strings to float.
test_x=np.array(test_PM.iloc[:,2:],float)
 
test_x.shape
(240, 9)
# Prepend the bias column of ones, mirroring the training design matrix.
test_x = np.hstack([np.ones((len(test_x), 1)), test_x])
test_x.shape
(240, 10)
# Predict PM2.5 for each test window using the learned weights.
y_pre=np.dot(test_x,w)
# Use the sample submission file as a template for the output format.
y_submit=pd.read_csv('sampleSubmission.csv')
y_submit.head()
idvalue
0id_027.414421
1id_161.555764
2id_220.498032
3id_329.534434
4id_410.797670
# Replace the template's value column with our predictions and save.
# NOTE(review): this overwrites sampleSubmission.csv in place -- consider
# writing to a separate output file instead.
y_submit.value=y_pre
y_submit.to_csv('sampleSubmission.csv',index=False)
# Ground-truth answers for offline evaluation of the predictions.
y_real=pd.read_csv('answer.csv')
y_real.head()
idvalue
0id_033
1id_160
2id_216
3id_333
4id_45

MAE

# Mean absolute error between predictions and ground truth.
error_mae = (y_real.value - y_pre).abs().mean()
error_mae
4.97442948413227

MSE

# Mean squared error between predictions and ground truth.
error_mse = ((y_real.value - y_pre) ** 2).mean()
error_mse
44.38203674794888

R² (coefficient of determination)

def r2_score(y_ture,y_predict):
    """Coefficient of determination: R^2 = 1 - SS_res / SS_tot.

    Parameter names are kept as in the original (including the 'y_ture'
    typo) so any keyword callers keep working.
    """
    ss_res=sum((y_ture-y_predict)**2)
    # BUG FIX: the total sum of squares must measure deviations of the TRUE
    # values from the mean of the TRUE values -- the original centred on
    # np.mean(y_predict), which is why its R^2 disagreed with sklearn's.
    ss_tot=sum((y_ture-np.mean(y_ture))**2)
    return 1-ss_res/ss_tot
error_r2=r2_score(y_real.value,y_pre)
error_r2
0.9094491525564087

使用sklearn

# Sanity check: fit sklearn's closed-form linear regression on the same data
# to compare against the hand-written gradient-descent solution.
from sklearn import linear_model
reg=linear_model.LinearRegression()
reg.fit(train_x,train_y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
reg.coef_
array([ 0.        ,  0.00729085, -0.04603342,  0.1994169 , -0.20757678,
       -0.0438468 ,  0.46235542, -0.54329308,  0.01552321,  1.07716736])
y_prediction=reg.predict(test_x)
# NOTE(review): this import shadows the hand-written r2_score defined above;
# from here on, r2_score refers to sklearn's implementation.
from sklearn.metrics import r2_score
r2_score(y_real.value,y_prediction)
0.90944810736946002
r2_score(y_real.value,y_pre)
0.90944807944600869
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
梯度下降法用于求解二分类逻辑回归模型的最优解。通过梯度下降法,我们可以逐步迭代地调整模型的参数,以使损失函数达到最小值。梯度下降法的基本思路是计算损失函数关于参数的梯度,并按照负梯度方向更新参数。这样反复迭代,直到达到收敛条件或迭代次数达到设定值。 具体实现时,首先需要定义损失函数,通常使用对数损失函数来表示二分类逻辑回归模型的损失。然后,计算损失函数对每个参数的偏导数,即梯度。接着,按照负梯度方向更新参数,通过不断迭代来逐渐降低损失函数的值。 代码实现时,可以使用西瓜数据集作为训练数据,通过手写梯度下降法来求解二分类逻辑回归模型的参数。此外,还可以使用sklearn库中的函数来求解二分类逻辑回归模型的最优解。 总之,梯度下降法是一种常用的求解二分类逻辑回归模型的方法,通过迭代调整模型参数,使损失函数达到最小值,从而得到最优解。<span class="em">1</span><span class="em">2</span><span class="em">3</span> #### 引用[.reference_title] - *1* *2* *3* [机器学习之逻辑回归以及梯度下降法求解](https://blog.csdn.net/qq_52785473/article/details/126953977)[target="_blank" data-report-click={"spm":"1018.2226.3001.9630","extra":{"utm_source":"vip_chatgpt_common_search_pc_result","utm_medium":"distribute.pc_search_result.none-task-cask-2~all~insert_cask~default-1-null.142^v93^chatsearchT3_1"}}] [.reference_item style="max-width: 100%"] [ .reference_list ]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值