关闭

Kaggle的Digit Recognizer题目实现

标签: 机器学习, Python, sklearn
379人阅读 评论(0) 收藏 举报
分类:

        机器学习看了有一阵子了,一些常用的算法已经有些了解。应该拿个项目/比赛练习一下,看看机器学习到底是如何应用的。Kaggle是个非常不错的机器学习和数据挖掘的比赛网站,网站提供数据,可以拿来练习算法。下面是101里面的第一道题,Digit Recognizer。用Python实现的。应用了KNN,SVM和RF算法。今天先贴上代码,过几天把相应的算法原理也写上,方便深入理解。这段代码是参照别的大神写的代码,进行了一定的修改和优化。数据源可以从如下网址获取:https://www.kaggle.com/c/digit-recognizer/data

import numpy as np 
import operator
import csv
import scipy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier


def read_data(file, header = True, test = False, rows = 0):
    """Load a digit-recognizer style CSV file into feature rows and labels.

    Args:
        file: Path of the CSV file to read.
        header: When true, the first line of the file is skipped.
        test: When true, every column is treated as a feature and no labels
            are collected. Otherwise the first column is parsed as the
            integer label and the remaining columns are the features.
        rows: Maximum number of lines to consume, counting the header line.
            A value of 0 (or less) means "read the whole file".

    Returns:
        A ``(data, labels)`` tuple. ``data`` is a list with one int64 numpy
        array per parsed row; ``labels`` is a list of ints (empty when
        ``test`` is true).

    Raises:
        ValueError: If a cell cannot be parsed as an integer.
    """
    data = []
    labels = []
    # The context manager guarantees the handle is closed even on a parse error.
    with open(file, 'r') as handle:
        for index, line in enumerate(csv.reader(handle, delimiter=','), 1):
            # Boolean `and` is required here: bitwise `&` binds tighter than
            # the comparisons, which made the row limit a no-op and caused
            # every odd-numbered line to be skipped as if it were a header.
            if rows > 0 and index > rows:
                break
            if header and index == 1:
                continue
            if not line:
                # csv.reader yields an empty list for a blank line.
                continue
            if not test:
                labels.append(int(line[0]))
                line = line[1:]
            data.append(np.array([int(value) for value in line], dtype=np.int64))
    return data, labels


def predictKNN(train, labels, test):
    """Fit a default KNeighborsClassifier and predict the test samples.

    Args:
        train: Training feature rows passed to ``fit``.
        labels: Training labels passed to ``fit``.
        test: Feature rows to classify.

    Returns:
        A ``(predict, max_pre_pro)`` tuple: the predicted label for each test
        row, and the highest class probability reported for that row.
    """
    # Parenthesised single-argument print runs on both Python 2 and Python 3.
    print('KNN starts...')
    KNNobj = KNeighborsClassifier()
    KNNobj.fit(train, labels)
    predict = KNNobj.predict(test)
    pre_pro = KNNobj.predict_proba(test)
    # Row-wise maximum: the probability of the most likely class per sample.
    max_pre_pro = pre_pro.max(axis=1)
    print('KNN ends...')
    return predict, max_pre_pro


def predicSVC(train, labels, test):
    """Fit an SVC with probability estimates and predict the test samples.

    Args:
        train: Training feature rows passed to ``fit``.
        labels: Training labels passed to ``fit``.
        test: Feature rows to classify.

    Returns:
        A ``(predict, max_pre_pro)`` tuple: the predicted label for each test
        row, and the highest class probability reported for that row.
    """
    # Parenthesised single-argument print runs on both Python 2 and Python 3.
    print('SVC starts...')
    # probability=True is needed so that predict_proba is available.
    SVCobj = SVC(probability=True)
    SVCobj.fit(train, labels)
    predict = SVCobj.predict(test)
    pre_pro = SVCobj.predict_proba(test)
    # Row-wise maximum: the probability of the most likely class per sample.
    # NOTE(review): scikit-learn documents that SVC probability estimates can
    # disagree with predict(); confirm this score is what the caller wants.
    max_pre_pro = pre_pro.max(axis=1)
    print('SVC ends...')
    return predict, max_pre_pro


def predicRF(train, labels, test, label=None):
    """Fit a 200-tree random forest and predict the test samples.

    Args:
        train: Training feature rows passed to ``fit``.
        labels: Training labels passed to ``fit``.
        test: Feature rows to classify.
        label: Unused; kept (now optional) so existing four-argument calls
            keep working.

    Returns:
        A ``(predict, max_pre_pro)`` tuple: the predicted label for each test
        row, and the highest class probability reported for that row.
    """
    # Parenthesised single-argument print runs on both Python 2 and Python 3.
    print('RF starts...')
    RFobj = RandomForestClassifier(n_estimators=200, n_jobs=2)
    RFobj.fit(train, labels)
    predict = RFobj.predict(test)
    pre_pro = RFobj.predict_proba(test)
    # Row-wise maximum: the probability of the most likely class per sample.
    max_pre_pro = pre_pro.max(axis=1)
    print('RF ends...')
    return predict, max_pre_pro

class PredicScore:
    """Pair a predicted label with the confidence score attached to it."""

    # Class-level fallbacks; every instance overrides both in __init__.
    predict = -1
    score = 0

    def __init__(self, predict, score):
        """Store the predicted label and its score on the instance."""
        self.predict, self.score = predict, score

# Script entry point: train the three classifiers on the first lines of
# train.csv, classify test.csv, and for every test row keep the prediction of
# whichever model reports the highest class probability.
if __name__ == '__main__':
    print('test begins...')
    train, labels = read_data('train.csv', rows=100)
    # read_data returns an empty label list for test data; it is only
    # forwarded to predicRF, which does not use it.
    test, label = read_data('test.csv', test=True, rows=100)
    predict_RF, max_pre_pro_RF = predicRF(train, labels, test, label)
    # The KNN helper is defined as `predictKNN` (with a "t").
    predict_KNN, max_pre_pro_KNN = predictKNN(train, labels, test)
    predict_SVC, max_pre_pro_SVC = predicSVC(train, labels, test)

    result = []
    per_sample = zip(predict_KNN, max_pre_pro_KNN,
                     predict_SVC, max_pre_pro_SVC,
                     predict_RF, max_pre_pro_RF)
    for knn_label, knn_score, svc_label, svc_score, rf_label, rf_score in per_sample:
        # Order matters: max() returns the first maximal element, so ties are
        # resolved in favour of KNN, then SVC, then RF.
        scoreArray = [
            PredicScore(knn_label, knn_score),
            PredicScore(svc_label, svc_score),
            PredicScore(rf_label, rf_score),
        ]
        max_score_obj = max(scoreArray, key=operator.attrgetter('score'))
        result.append(max_score_obj.predict)

    # One integer prediction per line.
    np.savetxt('submission.csv', result, fmt='%i', delimiter=',')
    print('done')





0
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:12360次
    • 积分:206
    • 等级:
    • 排名:千里之外
    • 原创:8篇
    • 转载:1篇
    • 译文:0篇
    • 评论:0条
    文章分类
    文章存档