"""Classifier walkthroughs built on scikit-learn.

Five independent examples run from top to bottom:

1. Gaussian Naive Bayes on the breast-cancer dataset.
2. Linear-kernel support vector machine on the first two iris features.
3. Logistic regression on a small hand-made 2-D dataset.
4. Decision tree on a toy height / hair-length dataset.
5. Random forest on the breast-cancer dataset, with feature importances.
"""

import collections

import matplotlib.pyplot as plt
import numpy as np
import pydotplus
import sklearn
from sklearn import datasets, linear_model, svm, tree
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# ---------------------------------------------------------------------------
# Building classifiers
# Machine-learning classifier based on the Gaussian Naive Bayes model
# ---------------------------------------------------------------------------

# Load the data. Keys read from the returned dictionary-like object:
#   target_names  - class label names
#   target        - actual labels
#   feature_names - attribute/feature names
#   data          - attribute/feature values
dataset = load_breast_cancer()
target_names = dataset['target_names']
target = dataset['target']
feature_names = dataset['feature_names']
data = dataset['data']

# Organise the data: hold out 40% of the samples for testing.
train, test, train_labels, test_labels = train_test_split(
    data, target, test_size=0.4, random_state=42)

# Build the model and fit it on the training split.
gnb = GaussianNB()
model = gnb.fit(train, train_labels)

# Predict labels for the held-out samples.
prebs = gnb.predict(test)

# Evaluate. accuracy_score takes (y_true, y_pred); the value is printed so
# that it is not silently discarded when this file runs as a script.
print('Naive Bayes accuracy: {:.3f}'.format(
    accuracy_score(test_labels, prebs)))

# ---------------------------------------------------------------------------
# Support vector machine
# ---------------------------------------------------------------------------

# Load the data; only the first two feature columns are kept so the decision
# regions can be drawn in 2-D.
iris = datasets.load_iris()
X = iris.data[:, :2]
Y = iris.target

# Build a grid that covers the data with a margin of 1 on every side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
# Grid step is 1/100 of the x range. (Dividing x_max by x_min, as before,
# gives a step unrelated to the range and breaks when x_min <= 0.)
h = (x_max - x_min) / 100
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
# c_ stacks the two flattened coordinate arrays column-wise, one grid point
# per row.
X_plot = np.c_[xx.ravel(), yy.ravel()]

# Train the model and predict a class for every grid point.
svc_classifier = svm.SVC(
    kernel='linear', C=1.0, decision_function_shape='ovr').fit(X, Y)
Z = svc_classifier.predict(X_plot)
Z = Z.reshape(xx.shape)

# Plot the decision regions with the training points on top.
plt.contourf(xx, yy, Z, cmap=plt.cm.tab10, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Set1)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('SVC with linear kernel')
# Show this figure now; otherwise the next section would draw onto the same
# axes.
plt.show()

# ---------------------------------------------------------------------------
# Logistic regression
# ---------------------------------------------------------------------------

# Create the data: twelve 2-D points in four classes.
X = np.array([[2, 4.8], [2.9, 4.7], [2.5, 5], [3.2, 5.5], [6, 5], [7.6, 4],
              [3.2, 0.9], [2.9, 1.9], [2.4, 3.5], [0.5, 3.4], [1, 4],
              [0.9, 5.9]])
Y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

# Build a grid that covers the data with a margin of 1 on every side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
step = 0.02
xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                     np.arange(y_min, y_max, step))

# Train the model and predict a class for every grid point.
classifier = linear_model.LogisticRegression(
    solver='liblinear', C=75).fit(X, Y)
Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

# Plot the predicted regions with the training points on top.
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.gray)
plt.scatter(X[:, 0], X[:, 1], c=Y, s=75, edgecolors='black', linewidths=1,
            cmap=plt.cm.Paired)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks((np.arange(int(X[:, 0].min() - 1), int(X[:, 0].max() + 1), 1.0)))
plt.yticks((np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0)))
plt.title('LogisticRegression')
# Show this figure now; otherwise the next plot would draw onto the same
# axes.
plt.show()

# ---------------------------------------------------------------------------
# Decision tree classifier
# ---------------------------------------------------------------------------

# Build the data: [height, length of hair] per sample, with its label.
X = [[165, 19], [175, 32], [136, 35], [174, 65], [141, 28], [176, 15],
     [131, 32], [166, 6], [128, 32], [179, 10], [136, 34], [186, 2],
     [126, 25], [176, 28], [112, 38], [169, 9], [171, 36], [116, 25],
     [196, 25]]
Y = ['Man', 'Woman', 'Woman', 'Man', 'Woman', 'Man', 'Woman', 'Man', 'Woman',
     'Man', 'Woman', 'Man', 'Woman', 'Woman', 'Woman', 'Man', 'Woman',
     'Woman', 'Man']
data_feature_names = ['height', 'length of hair']

# NOTE(review): this split is not used below - the tree is fitted on the
# full data set. Confirm whether fitting on X_train was intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.40, random_state=5)

clf = tree.DecisionTreeClassifier().fit(X, Y)
prediction = clf.predict([[133, 37]])
print(prediction)

# Export the fitted tree to DOT and load it as a graph.
dot_data = tree.export_graphviz(clf, feature_names=data_feature_names,
                                out_file=None, filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)

# Group destination node ids by source node, then give the two lowest-numbered
# children of every node the two fill colours in order.
colors = ('orange', 'yellow')
edges = collections.defaultdict(list)
for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

for children in edges.values():
    children.sort()
    # zip stops at the shorter sequence, so a node with fewer than two
    # children is coloured as far as possible instead of raising IndexError.
    for fill, node_id in zip(colors, children):
        dest = graph.get_node(str(node_id))[0]
        dest.set_fillcolor(fill)
# NOTE(review): the coloured graph is never rendered or saved here; add an
# explicit write step (e.g. graph.write_png(...)) if an image is wanted.

# ---------------------------------------------------------------------------
# Random forest classifier
# ---------------------------------------------------------------------------

# Load and split the data (train_test_split's default split sizes are used).
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

forest = RandomForestClassifier(n_estimators=50, random_state=0)
forest.fit(X_train, y_train)

# Evaluate the model on both splits.
print('Accuracy on the training subset: {:.3f}'.format(
    forest.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(
    forest.score(X_test, y_test)))

# Plot one horizontal bar per feature showing its importance in the forest.
n_features = cancer.data.shape[1]
plt.barh(range(n_features), forest.feature_importances_, align='center')
plt.yticks(np.arange(n_features), cancer.feature_names)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()