Python - Scikit-Learn 的机器学习

最新推荐文章于 2024-03-20 18:00:00 发布

Renaissance5K

最新推荐文章于 2024-03-20 18:00:00 发布

阅读量578

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/wukai0909/article/details/67639441

版权

python 专栏收录该内容

41 篇文章 8 订阅

订阅专栏

本文参考http://python.jobbole.com/81721/，列出了运行结果

逻辑回归

大多数情况下被用来解决分类问题（二元分类），但多类的分类（所谓的一对多方法）也适用。这个算法的优点是对于每一个输出的对象都有一个对应类别的概率。

# -*- coding: utf-8 -*- 
import numpy as np
import urllib

from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Python 2 script (print statement, urllib.urlopen): download the Pima Indians
# diabetes CSV, fit a LogisticRegression on it and report how well the model
# reproduces the labels it was trained on.
# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# download the file
raw_data = urllib.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
# separate the data from the target attributes
# NOTE(review): the slice 0:7 keeps columns 0..6 only, while the target is
# column 8, so column 7 is used neither as a feature nor as the target.
# Presumably 0:8 was intended -- confirm against the dataset description.
X = dataset[:,0:7]
y = dataset[:,8]
# print X,y

# normalize the data attributes
normalized_X = preprocessing.normalize(X)
print normalized_X
# standardize the data attributes
standardized_X = preprocessing.scale(X)
print '___________________________'
print standardized_X

# NOTE: normalized_X / standardized_X are only printed above; the model below
# is fit on the raw X.
model = LogisticRegression()
model.fit(X, y)
print(model)
# make predictions
# (on the same X the model was fit on, so the report below measures
# training-set fit, not performance on unseen data)
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

输出：

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)
precision recall f1-score support

0.0 0.79 0.89 0.84 500
1.0 0.74 0.55 0.63 268

avg / total 0.77 0.77 0.77 768

[[447 53]
[120 148]]

朴素贝叶斯

它也是最有名的机器学习算法之一，它的主要任务是恢复训练样本的数据分布密度。这个方法通常在多类的分类问题上表现得很好。

# -*- coding: utf-8 -*- 
import numpy as np
import urllib

from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Python 2 script (print statement, urllib.urlopen): download the Pima Indians
# diabetes CSV, fit a GaussianNB model on it and report how well the model
# reproduces the labels it was trained on.
# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# download the file
raw_data = urllib.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
# separate the data from the target attributes
# NOTE(review): the slice 0:7 keeps columns 0..6 only, while the target is
# column 8, so column 7 is unused. Presumably 0:8 was intended -- confirm.
X = dataset[:,0:7]
y = dataset[:,8]
# print X,y

# normalize the data attributes
normalized_X = preprocessing.normalize(X)
#print normalized_X
# standardize the data attributes
standardized_X = preprocessing.scale(X)
print '___________________________'
#print standardized_X

# NOTE: normalized_X / standardized_X are computed but never used; the model
# below is fit on the raw X.
model = GaussianNB()
model.fit(X, y)
print(model)
# make predictions
# (on the same X the model was fit on, i.e. training-set fit)
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

输出：

GaussianNB(priors=None)
precision recall f1-score support

0.0 0.80 0.86 0.83 500
1.0 0.69 0.60 0.64 268

avg / total 0.76 0.77 0.76 768

[[429 71]
[108 160]]

k-最近邻

kNN（k-最近邻）方法通常用作一个更复杂分类算法的一部分。例如，我们可以用它的估计值作为一个对象的特征。有时候，一个简单的kNN算法在良好选择的特征上会有很出色的表现。当参数（主要是距离度量 metric）设置得当时，这个算法在回归问题中通常也能取得很好的效果。

# -*- coding: utf-8 -*- 
import numpy as np
import urllib

from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Python 2 script (print statement, urllib.urlopen): download the Pima Indians
# diabetes CSV, fit a KNeighborsClassifier with default settings and report
# how well it reproduces the labels it was trained on.
# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# download the file
raw_data = urllib.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
# separate the data from the target attributes
# NOTE(review): the slice 0:7 keeps columns 0..6 only, while the target is
# column 8, so column 7 is unused. Presumably 0:8 was intended -- confirm.
X = dataset[:,0:7]
y = dataset[:,8]
# print X,y

# normalize the data attributes
normalized_X = preprocessing.normalize(X)
#print normalized_X
# standardize the data attributes
standardized_X = preprocessing.scale(X)
print '___________________________'
#print standardized_X

# fit a k-nearest neighbor model to the data
# NOTE: the model is fit on the raw X; the scaled copies above are not used.
model = KNeighborsClassifier()
model.fit(X, y)
print(model)
# make predictions
# (on the same X the model was fit on, i.e. training-set fit)
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

输出：

___________________________
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform')
precision recall f1-score support

0.0 0.82 0.90 0.86 500
1.0 0.77 0.63 0.69 268

avg / total 0.80 0.80 0.80 768

[[448 52]
[ 98 170]]

决策树

分类和回归树（CART）常用于对象具有类别型特征的问题，既可以用于分类，也可以用于回归。决策树很适用于多类分类。

# -*- coding: utf-8 -*- 
import numpy as np
import urllib

from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Python 2 script (print statement, urllib.urlopen): download the Pima Indians
# diabetes CSV, fit a DecisionTreeClassifier with default settings and report
# how well it reproduces the labels it was trained on.
# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# download the file
raw_data = urllib.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
# separate the data from the target attributes
# NOTE(review): the slice 0:7 keeps columns 0..6 only, while the target is
# column 8, so column 7 is unused. Presumably 0:8 was intended -- confirm.
X = dataset[:,0:7]
y = dataset[:,8]
# print X,y

# normalize the data attributes
normalized_X = preprocessing.normalize(X)
#print normalized_X
# standardize the data attributes
standardized_X = preprocessing.scale(X)
print '___________________________'
#print standardized_X

# fit a CART model to the data
# NOTE: the model is fit on the raw X; the scaled copies above are not used.
model = DecisionTreeClassifier()
model.fit(X, y)
print(model)
# make predictions
# (on the same X the model was fit on, so the scores below describe
# training-set fit only, not generalization)
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

输出：

___________________________
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
precision recall f1-score support

0.0 1.00 1.00 1.00 500
1.0 1.00 1.00 1.00 268

avg / total 1.00 1.00 1.00 768

[[500 0]
[ 0 268]]

支持向量机

SVM（支持向量机）是最流行的机器学习算法之一，它主要用于分类问题。与逻辑回归一样，SVM也可以借助一对多方法实现多类分类。

# -*- coding: utf-8 -*- 
import numpy as np
import urllib

from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC

# Python 2 script (print statement, urllib.urlopen): download the Pima Indians
# diabetes CSV, fit an SVC with default settings and report how well it
# reproduces the labels it was trained on.
# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# download the file
raw_data = urllib.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
# separate the data from the target attributes
# NOTE(review): the slice 0:7 keeps columns 0..6 only, while the target is
# column 8, so column 7 is unused. Presumably 0:8 was intended -- confirm.
X = dataset[:,0:7]
y = dataset[:,8]
# print X,y

# normalize the data attributes
normalized_X = preprocessing.normalize(X)
#print normalized_X
# standardize the data attributes
standardized_X = preprocessing.scale(X)
print '___________________________'
#print standardized_X

# fit a SVM model to the data
# NOTE: the model is fit on the raw X; the scaled copies above are not used.
model = SVC()
model.fit(X, y)
print(model)
# make predictions
# (on the same X the model was fit on, so the scores below describe
# training-set fit only, not generalization)
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

输出：
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
precision recall f1-score support

0.0 1.00 1.00 1.00 500
1.0 1.00 1.00 1.00 268

avg / total 1.00 1.00 1.00 768

[[500 0]
[ 0 268]]

除了分类和回归问题，Scikit-Learn还有海量的更复杂的算法，包括了聚类，以及建立混合算法的实现技术，如Bagging和Boosting。

如何优化算法的参数

在编写高效的算法的过程中最难的步骤之一就是正确参数的选择。一般来说如果有经验的话会容易些，但无论如何，我们都得寻找。幸运的是Scikit-Learn提供了很多函数来帮助解决这个问题。

作为一个例子，我们来看一下正则化参数的选择：对若干个候选取值依次进行搜索。

# -*- coding: utf-8 -*- 
import numpy as np
import urllib

from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV

# Python 2 script (print statement, urllib.urlopen): download the Pima Indians
# diabetes CSV and grid-search the Ridge `alpha` parameter over a fixed list
# of candidate values.
# NOTE(review): GridSearchCV is imported from sklearn.grid_search; newer
# scikit-learn releases are believed to expose it from
# sklearn.model_selection instead -- confirm against the installed version.
# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# download the file
raw_data = urllib.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
# separate the data from the target attributes
# NOTE(review): the slice 0:7 keeps columns 0..6 only, while the target is
# column 8, so column 7 is unused. Presumably 0:8 was intended -- confirm.
X = dataset[:,0:7]
y = dataset[:,8]
# print X,y

# normalize the data attributes
normalized_X = preprocessing.normalize(X)
#print normalized_X
# standardize the data attributes
standardized_X = preprocessing.scale(X)
print '___________________________'
#print standardized_X

# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
# NOTE: the search runs on the raw X; the scaled copies above are not used.
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
# (best score found, then the alpha of the best estimator)
print(grid.best_score_)
print(grid.best_estimator_.alpha)

输出：___________________________
GridSearchCV(cv=None, error_score='raise',
estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, random_state=None, solver='auto', tol=0.001),
fit_params={}, iid=True, n_jobs=1,
param_grid={'alpha': array([ 1.00000e+00, 1.00000e-01, 1.00000e-02, 1.00000e-03,
1.00000e-04, 0.00000e+00])},
pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
0.282118955686
1.0

有时候随机地从既定的范围内选取一个参数更为高效，估计在这个参数下算法的质量，然后选出最好的。

# -*- coding: utf-8 -*- 
import numpy as np
import urllib

from sklearn import preprocessing
from sklearn import metrics
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.grid_search import RandomizedSearchCV

# Python 2 script (print statement, urllib.urlopen): download the Pima Indians
# diabetes CSV and search the Ridge `alpha` parameter by random sampling.
# NOTE(review): RandomizedSearchCV is imported from sklearn.grid_search; newer
# scikit-learn releases are believed to expose it from
# sklearn.model_selection instead -- confirm against the installed version.
# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# download the file
raw_data = urllib.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
# separate the data from the target attributes
# NOTE(review): the slice 0:7 keeps columns 0..6 only, while the target is
# column 8, so column 7 is unused. Presumably 0:8 was intended -- confirm.
X = dataset[:,0:7]
y = dataset[:,8]
# print X,y

# normalize the data attributes
normalized_X = preprocessing.normalize(X)
#print normalized_X
# standardize the data attributes
standardized_X = preprocessing.scale(X)
print '___________________________'
#print standardized_X

# prepare a uniform distribution to sample for the alpha parameter
# (sp_rand is scipy.stats.uniform, called here with no arguments --
# presumably the default range [0, 1]; confirm in the scipy docs)
param_grid = {'alpha': sp_rand()}
# create and fit a ridge regression model, testing random alpha values
# NOTE: the search runs on the raw X; the scaled copies above are not used.
# No random_state is passed, so the sampled alphas may differ between runs.
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
# (best score found, then the alpha of the best estimator)
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

输出：___________________________
RandomizedSearchCV(cv=None, error_score='raise',
estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, random_state=None, solver='auto', tol=0.001),
fit_params={}, iid=True, n_iter=100, n_jobs=1,
param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0E9EE3F0>},
pre_dispatch='2*n_jobs', random_state=None, refit=True,
scoring=None, verbose=0)
0.282118471664
0.982079186879

至此我们已经看了整个使用Scikit-Learn库的过程，除了将结果再输出到一个文件中。

Renaissance5K

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python - Scikit-Learn 的机器学习

本文参考http://python.jobbole.com/81721/，列出了运行结果逻辑回归大多数情况下被用来解决分类问题（二元分类），但多类的分类（所谓的一对多方法）也适用。这个算法的优点是对于每一个输出的对象都有一个对应类别的概率。# -*- coding: utf-8 -*- import numpy as npimport urllibfrom skle
复制链接

扫一扫

专栏目录