读取数据
import pandas
import numpy
# Load the CSV file into a DataFrame.
data = pandas.read_csv('data.csv')
# Feature matrix: columns 'x1' and 'x2'; target vector: column 'y'.
# Both are converted to NumPy arrays.
X = numpy.array(data.loc[:, ['x1', 'x2']])
y = numpy.array(data.loc[:, 'y'])
1. 使用不同的分类器
# import statements for the classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
# Logistic regression classifier, fitted on the full feature matrix and labels.
classifier = LogisticRegression().fit(X, y)
# Decision tree classifier; `classifier` is rebound to the newly fitted model.
classifier = DecisionTreeClassifier().fit(X, y)
# Support vector machine classifier (SVC with default settings).
classifier = SVC().fit(X, y)
2. 分离训练集和测试集
from sklearn.model_selection import train_test_split
# Split X and y into training and test parts; test_size=0.1 holds out 10%
# of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
3. 模型评估相关函数
准确率
from sklearn.metrics import accuracy_score
# Accuracy of predictions against the true labels.
# NOTE(review): `y_true` and `y_pred` are not defined in the visible part of
# this file; presumably placeholders for the true labels and a model's
# predictions -- define them before running this line.
accuracy = accuracy_score(y_true, y_pred)
平均绝对误差
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
# Fit a LinearRegression model on the full data set. The variable keeps the
# name `classifier` even though the model is a regressor.
classifier = LinearRegression().fit(X, y)
# Predict on the same samples that were used for fitting.
pred = classifier.predict(X)
# Mean absolute error between the targets and the predictions.
error = mean_absolute_error(y_true=y, y_pred=pred)
均方误差
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
# Fit a LinearRegression model on the full data set (bound to `classifier`).
classifier = LinearRegression().fit(X, y)
# Predictions are made on the fitting data itself.
pred = classifier.predict(X)
# Mean squared error between the targets and the predictions.
error = mean_squared_error(y_true=y, y_pred=pred)
R2 Score
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
# Fit a LinearRegression model on the full data set (bound to `classifier`).
classifier = LinearRegression().fit(X, y)
# Predictions are made on the fitting data itself.
pred = classifier.predict(X)
# R^2 score of the predictions against the targets.
score = r2_score(y_true=y, y_pred=pred)
4. K折交叉验证
如果只是简单地将数据集划分为训练集和测试集,测试集中的数据不会参与训练,会造成数据浪费;K折交叉验证就是为了更充分地利用数据。
K折交叉验证将数据集分成k份,每轮用其中k-1份作为训练集,剩下的1份作为验证集;以这种方式执行k轮,得到k个模型评测结果。对这k次评估结果取平均,作为该模型的整体性能(k一般取5或10)。
from sklearn.model_selection import KFold
# Build a 4-fold splitter; shuffle=True shuffles the samples before they are
# divided into folds.
kf = KFold(n_splits=4, shuffle=True)
# Toy data: 8 samples with binary labels.
# NOTE: this rebinds `y`, replacing the label array built from data.csv.
x = [1, 2, 3, 4, 5, 6, 7, 8]
y = [0, 0, 1, 0, 1, 0, 0, 1]
# Each iteration yields two arrays of positional indices (train, test),
# not the sample values themselves.
for train_indices, test_indices in kf.split(x, y):
    # The loop body must be indented; without it Python raises IndentationError.
    print(train_indices, test_indices)
[1 3 4 5 6 7] [0 2]
[0 1 2 5 6 7] [3 4]
[0 1 2 3 4 7] [5 6]
[0 2 3 4 5 6] [1 7]
注:kf.split() 返回的是样本的索引(下标),而不是样本数据本身。
5. 网格搜索
网格搜索(Grid Search)即在用不同超参数组合训练出的模型中,找出在验证集上表现最好的模型。
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Choose the estimator to tune: SVC (Support Vector Classification, i.e. a
# support vector machine used for classification).
model = SVC()
# Hyperparameter grid: each key is a parameter name and each value is the
# list of candidate values for that parameter.
parameters = {
    'kernel': ['poly', 'rbf'],
    'C': [0.1, 1, 10],
}
# Create the scoring object: wrap f1_score with make_scorer so it can be
# passed as the `scoring` argument of the search.
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
scorer = make_scorer(f1_score)
# Create the cross-validation splitter: 4 folds, shuffled before splitting.
from sklearn.model_selection import KFold
kf = KFold(n_splits=4, shuffle=True)
# Build the GridSearchCV object from the model, the parameter grid and the
# scorer. `cv` must be a configured splitter instance (or an int / iterable
# of splits), so pass `kf` rather than the KFold class itself.
grid_obj = GridSearchCV(model, parameters, scoring=scorer, cv=kf)
# Run the search; fit() returns the fitted GridSearchCV object.
grid_fit = grid_obj.fit(X, y)
# Retrieve the best model found by the search.
best_model = grid_fit.best_estimator_