机器学习之交叉验证

import numpy as np
import pandas as pd

# Load the admissions data set. Every backslash in the Windows path is escaped
# so the literal contains no invalid escape sequence ("\m").
admission = pd.read_csv("D:\\test\\machineLearning\\admissions.csv")
print(admission.head(2))
# Keep the ground truth under its own column name so that predicted labels can
# later be stored next to it.
admission["actul_label"] = admission["admit"]
admission = admission.drop("admit",axis=1)

# Shuffle the rows so that each fold is a random sample of the data.
shuffled_index=np.random.permutation(admission.index)
shuffled_admission =admission.loc[shuffled_index]
# reset_index() renumbers the rows from 0 and keeps the old row labels in an
# extra "index" column.
admission = shuffled_admission.reset_index()
# Cut the data into 5 folds; "fold" is an extra column added here.
# .loc replaces the removed .ix indexer: on this integer index both are
# label-based and inclusive at both ends, so the fold boundaries are unchanged.
# NOTE: the boundaries are hard-coded for a data set of 644 rows.
admission.loc[0:128,"fold"]=1
admission.loc[129:257,"fold"]=2
admission.loc[258:386,"fold"]=3
admission.loc[387:514,"fold"]=4
admission.loc[515:644,"fold"]=5
# The partial assignments above leave the column as float; cast it to int.
admission["fold"]=admission["fold"].astype('int')
print (admission.head(2))
print (admission.tail(2))
   admit       gpa         gre
0      0  3.177277  594.102992
1      0  3.412655  631.528607
   index       gpa         gre  actul_label  fold
0    285  3.110348  689.990853            0     1
1    188  3.229678  563.682408            0     1
     index       gpa         gre  actul_label  fold
642    531  3.367606  658.277726            1     5
643    439  3.371170  643.340952            1     5
from sklearn.linear_model import LogisticRegression
# Evaluate on a single split: fold 1 is held out for testing and the model is
# trained on the remaining folds.
model1=LogisticRegression()
train_interation = admission[admission["fold"] != 1]
# Take an explicit copy: a column is added to this frame below, and assigning
# into a filtered slice triggers pandas' SettingWithCopyWarning.
test_interation = admission[admission["fold"] == 1].copy()
# Train on the "gpa" feature only.
model1.fit(train_interation[["gpa"]],train_interation["actul_label"])
# Predict labels for the held-out fold.
labels =model1.predict(test_interation[["gpa"]])
test_interation["predict_label"]=labels
# Keep the rows whose prediction matches the true label.
match = test_interation["predict_label"]==test_interation["actul_label"]
correct_predict=test_interation[match]

# float() keeps this a true division under Python 2 as well.
accuracy=len(correct_predict)/float(len(test_interation))
print(accuracy)
0.666666666667


C:\Users\qiujiahao\Anaconda2\lib\site-packages\ipykernel\__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Scoring only the first fold may give an unrepresentative result, so every
# fold is used as the test set in turn; this is called cross-validation.
fold_id=[1,2,3,4,5]
def get_accuracies(data=None, model=None, folds=None):
    """Return the hold-out accuracy of every fold, in fold order.

    Each fold id is used once as the test set while the model is refit on all
    remaining rows, with the "gpa" column as the only feature and
    "actul_label" as the target.

    Args:
        data: DataFrame with "gpa", "actul_label" and "fold" columns.
            Defaults to the module-level ``admission`` frame.
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
            Defaults to the module-level ``model1``.
        folds: iterable of fold ids to evaluate. Defaults to the
            module-level ``fold_id`` list.

    Returns:
        list of float: fraction of correctly predicted rows for each fold.
    """
    # Fall back to the script's globals so that get_accuracies() with no
    # arguments behaves exactly as before.
    if data is None:
        data = admission
    if model is None:
        model = model1
    if folds is None:
        folds = fold_id

    fold_accuracies=[]
    for fold in folds:
        is_test = data["fold"] == fold
        train_rows = data[~is_test]
        test_rows = data[is_test]
        # Train
        model.fit(train_rows[["gpa"]], train_rows["actul_label"])
        # Predict
        predicted = np.asarray(model.predict(test_rows[["gpa"]]))
        # Compare the arrays directly instead of adding a "predict_label"
        # column to the filtered slice, which raised SettingWithCopyWarning.
        hits = int((predicted == test_rows["actul_label"].values).sum())
        # float() keeps this a true division under Python 2 as well.
        fold_accuracies.append(hits / float(len(test_rows)))
    return fold_accuracies
# Run the hand-written cross-validation and report the per-fold accuracies
# followed by their mean. The parenthesised single-argument print works on
# both Python 2 and Python 3.
fold_accuracies = get_accuracies()
print(fold_accuracies)
print(np.mean(fold_accuracies))
C:\Users\qiujiahao\Anaconda2\lib\site-packages\ipykernel\__main__.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[0.6666666666666666, 0.6589147286821705, 0.6124031007751938, 0.65625, 0.6356589147286822]
0.645978682171
# Everything above is a hand-written implementation; below the same thing is
# done with library functions.
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20 in favour of sklearn.model_selection; fall back only on old installs.
try:
    from sklearn.model_selection import KFold
    from sklearn.model_selection import cross_val_score
    _LEGACY_KFOLD = False
except ImportError:
    from sklearn.cross_validation import KFold
    from sklearn.cross_validation import cross_val_score
    _LEGACY_KFOLD = True

# Reload the raw data. Every backslash in the Windows path is escaped so the
# literal contains no invalid escape sequence ("\m").
admission = pd.read_csv("D:\\test\\machineLearning\\admissions.csv")
admission["actul_label"] = admission["admit"]
admission = admission.drop("admit",axis=1)
# K-fold splitter: 5 folds, rows shuffled before splitting, fixed random seed.
if _LEGACY_KFOLD:
    # The legacy class takes the number of samples as its first argument.
    kf=KFold(len(admission),5,shuffle=True,random_state=8)
else:
    kf=KFold(n_splits=5,shuffle=True,random_state=8)
# Logistic regression model
lr = LogisticRegression()
# Run the cross-validation; `scoring` selects the metric that is returned for
# each fold, here the accuracy.
accuracy = cross_val_score(lr,admission[["gpa"]],admission["actul_label"],scoring="accuracy",cv=kf)
average_accuracy=sum(accuracy)/len(accuracy)
print(average_accuracy)
print(accuracy)
0.644391957364
[ 0.6124031   0.65891473  0.64341085  0.6744186   0.6328125 ]
# With scoring="roc_auc", cross_val_score returns the area under the ROC curve
# for each fold.
roc_auc = cross_val_score(lr,admission[["gpa"]],admission["actul_label"],scoring="roc_auc",cv=kf)
# Average the AUC scores themselves. Averaging `accuracy` here would only
# print the mean accuracy a second time.
average_roc_auc=sum(roc_auc)/len(roc_auc)
print(average_roc_auc)
print(roc_auc)
0.644391957364
[ 0.70790123  0.69550265  0.65987934  0.73363017  0.57864583]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值