《 大数据竞赛平台——Kaggle 入门篇》的补充

《 大数据竞赛平台——Kaggle 入门篇》的补充

博客原文链接:
[ http://blog.csdn.net/u012162613/article/details/41929171 ]


knn_benchmark.csv下载

github的链接
https://github.com/clytwynec/digit_recognition/blob/master/data/knn_benchmark.csv

将得到result.csv转换为提交文件格式:

Python代码

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 21 21:35:48 2017

@author: xyz
"""
from numpy import *
import csv

def toInt(array):
    """Convert a 2-D array of numeric strings to a float matrix of truncated ints.

    Strings like "3.0" cannot be passed to int() directly, so each entry is
    converted string -> float -> int.

    Bug fix: the original indexed the undefined global ``label`` instead of
    the local matrix ``array``, which raised NameError when called.
    """
    array = mat(array)
    m, n = shape(array)
    newArray = zeros((m, n))
    for i in range(m):
        for j in range(n):
            # string -> float -> int (truncates toward zero), stored as float64
            newArray[i, j] = int(float(array[i, j]))  # fix: was label[i, j]
    return newArray

def loadPredictResult():
    """Read sklearn_knn_Result.csv and return its rows as a numeric matrix.

    The CSV holds one prediction per row; rows are collected as strings and
    converted to numbers via toInt().
    """
    rows = []
    with open('sklearn_knn_Result.csv') as result_file:
        for row in csv.reader(result_file):
            rows.append(row)  # 28001*2 rows including the header
    # label 28000*1 after numeric conversion
    return toInt(array(rows))

def saveResult(result, csvName):
    """Write predictions to csvName in Kaggle submission format (ImageId,Label)."""
    with open(csvName, 'wb') as out_file:  # Python 2: csv wants binary mode
        writer = csv.writer(out_file)
        writer.writerow(['ImageId', 'Label'])
        # ImageId is 1-based; each element of result is a row array whose
        # first entry is the predicted label
        for image_id, row in enumerate(result, 1):
            writer.writerow([image_id, int(row[0])])

# Convert the raw result file into the Kaggle submission format.
resultlabel=loadPredictResult()
saveResult(resultlabel,'processed-result.csv')

利用Kears构建ANN模型:

参考博客《【Python与机器学习】:利用Keras进行多类分类》

[ http://www.cnblogs.com/arkenstone/p/5943489.html?utm_source=itdadao&utm_medium=referral ]

输入特征为784,利用one-hot encoder对0-9这10个分类标签编码,综上,输入神经元个数为784,输出神经元个数为10,隐藏节点数为输入节点数和输出节点数乘积的平方根,
hidnodes = √(784×10) ≈ 88

神经网络结构如图:
这里写图片描述

Python代码

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 16 21:59:00 2014

@author: wepon

@blog:http://blog.csdn.net/u012162613
"""

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
#from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Load the training set: column 0 is the digit label, the remaining 784
# columns are the flattened 28x28 pixel values.
dataframe = pd.read_csv("train.csv")
dataset = dataframe.values
X_train= dataset[:, 1:].astype(int)  # pixel features
Y = dataset[:, 0].astype(int)  # digit labels 0-9

# Load the Kaggle test set (pixels only, no labels).
testdataframe = pd.read_csv("test.csv")
testdataset = testdataframe.values
X_test= testdataset.astype(int)

# Load the knn benchmark predictions, used below as a reference to compare
# this model's predictions against (columns: ImageId, Label).
validdataframe = pd.read_csv("knn_benchmark.csv")
validdataset = validdataframe.values

# Shared LabelEncoder: fit in labelencode(), reused later by
# inverse_transform() to map class indices back to digit labels.
encoder = LabelEncoder()
def labelencode(Y):
    """One-hot encode integer class labels using the module-level encoder."""
    int_labels = encoder.fit_transform(Y)
    # expand each integer label into a 10-dim indicator (dummy) vector
    return np_utils.to_categorical(int_labels)

# Model structure: hidden size 88 = sqrt(input nodes * output nodes).
def baseline_model():
    """Build the 784-88-10 fully connected classifier."""
    net = Sequential()
    net.add(Dense(output_dim=88, input_dim=784, activation='relu'))
    net.add(Dropout(0.2))  # regularize the hidden layer
    net.add(Dense(output_dim=10, input_dim=88, activation='softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=['accuracy'])
    return net
estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=40, batch_size=256)
#指定每个batch的大小batch_size,指定训练轮数nb_epoch
# splitting data into training set and test set. If random_state is set to an integer, the split datasets are fixed.
#X_train, X_test, Y_train, Y_test = train_test_split(X, dummy_y, test_size=0.3, random_state=0)
Y_train=labelencode(Y)
estimator.fit(X_train, Y_train)

# make predictions
pred = estimator.predict(X_test)

# inverse numeric variables to initial categorical labels
init_lables = encoder.inverse_transform(pred)
#print init_lables[0]
init_lables=pd.DataFrame({'label':init_lables})#array convert to DataFrame
validdataframe['prelabel']=init_lables['label']
validdataframe['accuracy']=validdataframe['prelabel']==validdataframe['Label']
totalcount=validdataframe['accuracy'].count()
print 'ANN accuarcy:',validdataframe['accuracy'].sum()/1.0/totalcount
#write predict results to submision csv
validdataframe.to_csv('submision.csv',columns=['ImageId','Label'],index=False)        

[在Kaggle上的排名]
在Kaggle上的排名

比较不同模型的分类效果,看看CNN的效果如何:

如何用卷积神经网络CNN识别手写数字集?

http://blog.csdn.net/zdy0_2004/article/details/51945932

先给出CNN网络结构模型

这里写图片描述

代码
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 16 21:59:00 2014

@author: wepon

@blog:http://blog.csdn.net/u012162613
"""

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout,Convolution2D,Reshape,AveragePooling2D, Flatten
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD
from keras.utils import np_utils
#from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Load the training set: column 0 is the digit label, the remaining 784
# columns are the flattened 28x28 pixel values.
dataframe = pd.read_csv("train.csv")
dataset = dataframe.values
X_train= dataset[:, 1:].astype(int)  # pixel features
Y = dataset[:, 0].astype(int)  # digit labels 0-9

# Load the Kaggle test set (pixels only, no labels).
testdataframe = pd.read_csv("test.csv")
testdataset = testdataframe.values
X_test= testdataset.astype(int)

# Load the knn benchmark predictions, used below as a reference to compare
# this model's predictions against (columns: ImageId, Label).
validdataframe = pd.read_csv("knn_benchmark.csv")
validdataset = validdataframe.values

# Shared LabelEncoder: fit in labelencode(), reused later by
# inverse_transform() to map class indices back to digit labels.
encoder = LabelEncoder()
def labelencode(Y):
    """One-hot encode integer class labels using the module-level encoder."""
    int_labels = encoder.fit_transform(Y)
    # expand each integer label into a 10-dim indicator (dummy) vector
    return np_utils.to_categorical(int_labels)

# One-hot encode the training labels for the 10-way softmax output.
Y_train=labelencode(Y)
# Model structure: 784 inputs -> 85 hidden -> 10 outputs.
def baseline_model():
    """Build the fully connected 784-85-10 classifier."""
    net = Sequential()
    net.add(Dense(output_dim=85, input_dim=784, activation='relu'))
    net.add(Dropout(0.2))  # regularize the hidden layer
    net.add(Dense(output_dim=10, input_dim=85, activation='softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=['accuracy'])
    return net
# define cnn model
def cnn_model():
    """Build the CNN: reshape -> conv(32) -> pool -> conv(64) -> pool ->
    flatten -> two 1000-unit sigmoid layers -> 10 linear outputs.

    Trained with SGD on mean squared error (per the referenced blog post).
    """
    model = Sequential()
    # Reshape the flat 784-vector into a 1-channel 28x28 image
    # (Theano 'th' dimension ordering: channels first)
    model.add(Reshape(target_shape=(1,28,28),input_shape=(784,)))
    model.add(Convolution2D(nb_filter=32,nb_row=3,nb_col=3,dim_ordering='th',border_mode='same',bias=False,init='uniform'))
    model.add(AveragePooling2D(pool_size=(2,2),dim_ordering='th'))
    model.add(Convolution2D(nb_filter=64,nb_row=3,nb_col=3,dim_ordering='th',border_mode='same',bias=False,init='uniform'))
    model.add(AveragePooling2D(pool_size=(2,2),dim_ordering='th'))
    # the model so far outputs 3D feature maps (height, width, features)
    model.add(Flatten())  # converts 3D feature maps to 1D feature vectors
    # fully connected classifier head
    model.add(Dense(output_dim=1000,activation='sigmoid'))
    model.add(Dense(output_dim=1000,activation='sigmoid'))
    model.add(Dense(output_dim=10,activation='linear'))
    # fix: lr must be a float, not the string '0.01'
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['accuracy'])
    return model
model_name='cnn'
estimator = KerasClassifier(build_fn=cnn_model, nb_epoch=40, batch_size=256)
# splitting data into training set and test set. If random_state is set to an integer, the split datasets are fixed.
#X_train, X_test, Y_train, Y_test = train_test_split(X, dummy_y, test_size=0.3, random_state=0)

estimator.fit(X_train, Y_train)

# make predictions
pred = estimator.predict(X_test)

# inverse numeric variables to initial categorical labels
init_lables = encoder.inverse_transform(pred)
#print init_lables[0]
#init_lables=pd.DataFrame({'label':init_lables[1:]})
init_lables=pd.DataFrame({'label':init_lables})
validdataframe['prelabel']=init_lables['label']
validdataframe['accuracy']=validdataframe['prelabel']==validdataframe['Label']
totalcount=validdataframe['accuracy'].count()
print '%s accuarcy:'%model_name,validdataframe['accuracy'].sum()/1.0/totalcount
#write to submision csv
validdataframe.to_csv('submision_%s.csv'%model_name,columns=['ImageId','Label'],index=False)
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值