使用scikit-learn处理分类的基础流程

#coding=utf-8
'''
处理流程:
1.加载数据,有些算法支持增量训练,有些不支持,对于支持增量训练的算法,一般都会有一个partial_fit方法
2.数据预处理(对空值进行填补等)
3.数据是否需要压缩高维稀疏矩阵
4.shuffle数据和切出训练集和测试集
5.数据是否需要归一化或者标准化
6.是否需要PCA降低维度
7.训练模型
8.根据正确率,混淆矩阵,进行模型选择
9.保存模型,以便下次可以将模型直接使用,有些算法不能生成模型,也保存不了
'''
from sklearn import datasets
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from sklearn import metrics
from sklearn.decomposition import PCA
# 1. Load the data
# The file is comma-separated. Every column except the last is used as a
# feature; the last column is used as the class label.
ALLData1 = np.loadtxt('D:\\xxx\\xxx.txt', delimiter=',')
data = ALLData1[:, :-1]
# Parenthesised single-argument print works on both Python 2 and Python 3;
# the bare ``print x`` statement is a SyntaxError on Python 3.
print(data.shape)
target = ALLData1[:, -1]
# print(data)
# print(target)

# 2.是否需要数据预处理

# 3.数据是否需要压缩高维稀疏矩阵,有很多种高维稀疏矩阵压缩算法
# from scipy.sparse import coo_matrix
# X = np.array([[ 3, 100,0,0,0,0,0,0,0],[4,50000,0,0,0,0,0,0,0]])
# print coo_matrix(X)

# 4. Shuffle the samples and cut them into a training set and a test set.
# 40% of the samples are held out for testing; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.4,
    random_state=0
)
# 直接按样本比例取训练集
# train_num = data.shape[0]*0.6
# X_train = data[0:train_num]
# print X_train.shape
# y_train = target[0:train_num]
# X_test = data[train_num:]
# y_test = target[train_num:]
# 从文件中另外读取测试集
# X_train = ALLData1[:,:-1]
# # print data.shape
# y_train = ALLData1[:,-1]
# ALLData2 = np.loadtxt('D:\\machinetest\\37resulttest.txt',delimiter=',',dtype=np.string_)
# X_test = ALLData2[:,:-1]
# print X_test.shape
# y_test = ALLData2[:,-1]


# 5.数据是否需要归一化或标准化
# from sklearn import preprocessing
# scaler = preprocessing.StandardScaler().fit(X_train)
# pred_X_train = scaler.transform(X_train)
# pred_X_test = scaler.transform(X_test)


# 6.尝试使用PCA降低维度
# pca=PCA(n_components=0.98)
# pca=PCA(n_components='mle')
# pca.fit(X_train)
# 各个特征占所有特征的方差百分比,分值越高说明保留的信息就越多,越重要
# print pca.explained_variance_ratio_
# 该特征的方差,方差越大,说明主成分越重要
# print pca.explained_variance_
# pcaX_train=pca.transform(X_train)
# pcaX_test=pca.transform(X_test)

# 7. Train the model
# One SVC instance is reused for every kernel: set_params() switches the
# kernel right before each fit().
clf = SVC()
# clf.fit(X_train, y_train)

# 8. Compare the results of the different SVM kernel functions
# Each entry is (label printed before the results, value for SVC's ``kernel``).
KERNELS = (
    ("Linear", "linear"),      # linear kernel
    ("Gaussian", "rbf"),       # Gaussian kernel, also called the RBF kernel
    ("Polynomial", "poly"),    # polynomial kernel
    ("Sigmoid", "sigmoid"),    # sigmoid kernel
)

for kernel_label, kernel_name in KERNELS:
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print(kernel_label)
    clf.set_params(kernel=kernel_name).fit(X_train, y_train)
    # Accuracy on the training set
    print(clf.score(X_train, y_train))
    predicted = clf.predict(X_test)
    # Variant for the PCA-reduced features (requires the commented-out
    # PCA step 6 to be enabled so that pcaX_train / pcaX_test exist):
    # clf.set_params(kernel=kernel_name).fit(pcaX_train, y_train)
    # print(clf.score(pcaX_train, y_train))
    # predicted = clf.predict(pcaX_test)
    # Accuracy on the held-out test set
    print(np.mean(predicted == y_test))
    # Confusion matrix of true test labels against predicted labels
    print(metrics.confusion_matrix(y_test, predicted))

# # 9.保存模型
# output = open('D:\\xxx\\data.pkl', 'wb')
# s = pickle.dump(clf, output)
# output.close()
# 调用模型
# input = open('D:\\xxx\\data.pkl', 'rb')
# clf2 = pickle.load(input)
# input.close()
# clf2.predict(X[0:1])
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值