Python 数据分析第八期–scikit-learn
1. 机器学习
2. scikit-learn
scikit-learn 是面向 Python 的免费机器学习库,包括分类、回归、聚类算法,以及降维、模型选择、预处理等功能。
安装
pip install scikit-learn
conda install scikit-learn
机器学习中,数据集通常划分为训练集、验证集和测试集。
2.1 加载示例数据集
# Load two of the example datasets that ship with scikit-learn.
from sklearn import datasets
iris = datasets.load_iris()      # not referenced again in the visible part of this tutorial
digits = datasets.load_digits()  # its .data / .target are split into train/test sets in section 2.2
2.2 在训练集上训练模型
# Manual train/test split: the last n_test samples are held out for testing
# and everything before them is used for training (no shuffling).
n_test = 100  # number of test samples
train_rows = slice(None, -n_test)
test_rows = slice(-n_test, None)
train_X, train_y = digits.data[train_rows], digits.target[train_rows]
test_X, y_true = digits.data[test_rows], digits.target[test_rows]

# Select the SVM model
from sklearn import svm
svm_model = svm.SVC(C=100.0, gamma=0.001)
# Alternative hyper-parameters to experiment with:
# svm_model = svm.SVC(gamma=100., C=1.)
# Train the model
svm_model.fit(train_X, train_y)
SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
# Select the logistic-regression (LR) model, constructed with the library defaults
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
# Train the model on the same training split that was used for the SVM
lr_model.fit(train_X, train_y)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)
2.3 在测试集上测试模型
# Predict labels for the held-out test samples with each trained model
y_pred_svm = svm_model.predict(test_X)
y_pred_lr = lr_model.predict(test_X)

# Report the results: accuracy of each model against the true labels
from sklearn.metrics import accuracy_score
# To inspect individual predictions, print y_pred_svm / y_pred_lr next to y_true.
for label, predicted in (('SVM结果:', y_pred_svm), ('LR结果:', y_pred_lr)):
    print(label, accuracy_score(y_true, predicted))