# Python中常用包——sklearn主要模块和基本使用方法

scikit-learn的实现使用了NumPy中的arrays，所以，我们要使用NumPy来载入csv文件。

1 import numpy as np
2 import urllib
3 # url with dataset
4 url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
6 raw_data = urllib.urlopen(url)
7 # load the CSV file as a numpy matrix
8 dataset = np.loadtxt(raw_data, delimiter=",")
9 # separate the data from the target attributes
10 X = dataset[:,0:7]
11 y = dataset[:,8]

（2）X = dataset[:, 0:7]的意思是：把dataset中的所有行、第0至第6列（切片0:7不包含第7列，共7列）的数据保存在X中；

1 from sklearn import preprocessing
2 #scale the data attributes
3 scaled_X = preprocessing.scale(X)
4
5 # normalize the data attributes
6 normalized_X = preprocessing.normalize(X)
7
8 # standardize the data attributes
9 standardized_X = preprocessing.scale(X)

1 from sklearn import metrics
2 from sklearn.ensemble import ExtraTreesClassifier
3 model = ExtraTreesClassifier()
4 model.fit(X, y)
5 # display the relative importance of each attribute
6 print(model.feature_importances_)


[ 0.13784722  0.15383598  0.25451389  0.17476852  0.02847222  0.12314815  0.12741402]

scikit-learn实现了机器学习的大部分基础算法，让我们快速了解一下。

 1 from sklearn import metrics
2 from sklearn.linear_model import LogisticRegression
3 model = LogisticRegression()
4 model.fit(X, y)
5 print('MODEL')
6 print(model)
7 # make predictions
8 expected = y
9 predicted = model.predict(X)
10 # summarize the fit of the model
11 print('RESULT')
12 print(metrics.classification_report(expected, predicted))
13 print('CONFUSION MATRIX')
14 print(metrics.confusion_matrix(expected, predicted))

1 MODEL
2 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
3           intercept_scaling=1, max_iter=100, multi_class='ovr',
4           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
5           verbose=0)
6 RESULT
7              precision    recall  f1-score   support
8
9         0.0       1.00      1.00      1.00         4
10         1.0       1.00      1.00      1.00         6
11
12 avg / total       1.00      1.00      1.00        10
13
14 CONFUSION MATRIX
15 [[4 0]
16  [0 6]]

 1 from sklearn import metrics
2 from sklearn.naive_bayes import GaussianNB
3 model = GaussianNB()
4 model.fit(X, y)
5 print('MODEL')
6 print(model)
7 # make predictions
8 expected = y
9 predicted = model.predict(X)
10 # summarize the fit of the model
11 print('RESULT')
12 print(metrics.classification_report(expected, predicted))
13 print('CONFUSION MATRIX')
14 print(metrics.confusion_matrix(expected, predicted))

MODEL
GaussianNB()
RESULT
precision    recall  f1-score   support

0.0       0.80      1.00      0.89         4
1.0       1.00      0.83      0.91         6

avg / total       0.92      0.90      0.90        10

CONFUSION MATRIX
[[4 0]
[1 5]]

k近邻官方文档

k近邻算法常常被用作更复杂的分类算法的一部分，比如可以用它来评估特征，在特征选择上我们也可以用到它。

 1 from sklearn import metrics
2 from sklearn.neighbors import KNeighborsClassifier
3 # fit a k-nearest neighbor model to the data
4 model = KNeighborsClassifier()
5 model.fit(X, y)
6 print(model)
7 # make predictions
8 expected = y
9 predicted = model.predict(X)
10 # summarize the fit of the model
11 print(metrics.classification_report(expected, predicted))
12 print(metrics.confusion_matrix(expected, predicted))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_neighbors=5, p=2, weights='uniform')
precision    recall  f1-score   support

0.0       0.75      0.75      0.75         4
1.0       0.83      0.83      0.83         6

avg / total       0.80      0.80      0.80        10

[[3 1]
[1 5]]

1 from sklearn import metrics
2 from sklearn.tree import DecisionTreeClassifier
3 # fit a CART model to the data
4 model = DecisionTreeClassifier()
5 model.fit(X, y)
6 print(model)
7 # make predictions
8 expected = y
9 predicted = model.predict(X)
10 # summarize the fit of the model
11 print(metrics.classification_report(expected, predicted))
12 print(metrics.confusion_matrix(expected, predicted))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
random_state=None, splitter='best')
precision    recall  f1-score   support

0.0       1.00      1.00      1.00         4
1.0       1.00      1.00      1.00         6

avg / total       1.00      1.00      1.00        10

[[4 0]
[0 6]]

SVM是非常流行的机器学习算法，主要用于分类问题；与逻辑回归一样，它可以使用一对多的方法进行多类别的分类。

1 from sklearn import metrics
2 from sklearn.svm import SVC
3 # fit a SVM model to the data
4 model = SVC()
5 model.fit(X, y)
6 print(model)
7 # make predictions
8 expected = y
9 predicted = model.predict(X)
10 # summarize the fit of the model
11 print(metrics.classification_report(expected, predicted))
12 print(metrics.confusion_matrix(expected, predicted))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
kernel='rbf', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False)
precision    recall  f1-score   support

0.0       1.00      1.00      1.00         4
1.0       1.00      1.00      1.00         6

avg / total       1.00      1.00      1.00        10

[[4 0]
[0 6]]

GridSearchCV官方文档1（模块使用） 官方文档2（原理详解）

1 import numpy as np
2 from sklearn.linear_model import Ridge
3 from sklearn.grid_search import GridSearchCV
4 # prepare a range of alpha values to test
5 alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
6 # create and fit a ridge regression model, testing each alpha
7 model = Ridge()
8 grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
9 grid.fit(X, y)
10 print(grid)
11 # summarize the results of the grid search
12 print(grid.best_score_)
13 print(grid.best_estimator_.alpha)

GridSearchCV(cv=None, error_score='raise',
estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, solver='auto', tol=0.001),
fit_params={}, iid=True, loss_func=None, n_jobs=1,
param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
1.00000e-04,   0.00000e+00])},
pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
verbose=0)
-5.59572064238
0.0

RandomizedSearchCV官方文档（模块使用）官方文档2 （原理详解）

 1 import numpy as np
2 from scipy.stats import uniform as sp_rand
3 from sklearn.linear_model import Ridge
4 from sklearn.grid_search import RandomizedSearchCV
5 # prepare a uniform distribution to sample for the alpha parameter
6 param_grid = {'alpha': sp_rand()}
7 # create and fit a ridge regression model, testing random alpha values
8 model = Ridge()
9 rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
10 rsearch.fit(X, y)
11 print(rsearch)
12 # summarize the results of the random parameter search
13 print(rsearch.best_score_)
14 print(rsearch.best_estimator_.alpha)

• 广告
• 抄袭
• 版权
• 政治
• 色情
• 无意义
• 其他

120