笔者是一个痴迷于挖掘数据价值的学习者,希望在平日的工作学习中,挖掘数据的价值,找寻数据的秘密。笔者认为,数据的价值不仅仅体现在企业中,个人也可以体会到数据的魅力,用技术力量探索行为密码,让大数据助跑每一个人。欢迎朋友们关注我的公众号,大家一起讨论数据中的那些有趣的事情。
我的公众号为:livandata
本文主要是对sklearn的一些常用方法做一些简单的介绍,这个包中的内容主要包括一些机器学习的算法,需要结合机器学习的原理进行理解。
sklearn是一个封装程度较高的算法集:
分类、回归、无监督、决策树、数据降维、数据预处理等,包括常见的一些机器学习的方法。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""A quick tour of commonly used scikit-learn features.

Covers the bundled example datasets, a few basic estimators (k-NN,
linear regression, SVM, random forest), data preprocessing, and
cross-validation.  Each example is left commented out so the module
imports without side effects; uncomment a section to run it.
"""
import sklearn
# The private module sklearn.neighbors.classification was removed in
# scikit-learn 0.24; always import from the public package path.
from sklearn.neighbors import KNeighborsClassifier

# --- 1. Bundled example datasets -------------------------------------
# Iris dataset:
# from sklearn.datasets import load_iris
# loaded_data = load_iris()
# data_x = loaded_data.data
# data_y = loaded_data.target
# print(data_x)
# Boston house prices:
# NOTE: load_boston was removed in scikit-learn 1.2; use
# fetch_california_housing() as a drop-in regression dataset instead.
# from sklearn import datasets
# loaded_data = datasets.load_boston()
# data_x = loaded_data.data
# data_y = loaded_data.target
# print(data_x)

# --- 2. Common algorithms --------------------------------------------
# k-nearest neighbours:
# from sklearn.model_selection import train_test_split
# from sklearn import datasets
# from sklearn.neighbors import KNeighborsClassifier
# iris = datasets.load_iris()
# iris_X = iris.data
# iris_Y = iris.target
# X_train, X_test, Y_train, Y_test = train_test_split(iris_X, iris_Y, test_size=0.3)
# knn = KNeighborsClassifier()
# knn.fit(X_train, Y_train)
# print(knn.predict(X_test))
# print(Y_test)
# Y = knn.predict(X_test)

# Linear regression:
# import matplotlib.pyplot as plt
# from sklearn import datasets
# from sklearn.linear_model import LinearRegression
# loaded_data = datasets.load_boston()  # removed in sklearn 1.2, see note above
# data_x = loaded_data.data
# data_y = loaded_data.target
# model = LinearRegression()
# model.fit(data_x, data_y)
# print(data_x)
# First 4 rows, all columns:
# print(data_x[:4, :])
# print(model.predict(data_x[:4, :]))
# For 1-D data, take the first 4 entries:
# print(data_y[:4])

# Normalization / standardization:
# from sklearn import preprocessing
# import numpy as np
# a = np.array([[1,2,3],[2,4,6],[3,6,9]])
# print(preprocessing.scale(a))
# `a` below is a 1-D array; reshape it into a 2-D matrix before doing
# matrix operations with it:
# a = np.random.randn(5)
# print(type(a))

# SVM on a synthetic classification problem:
# from sklearn import preprocessing
# import numpy as np
# from sklearn.model_selection import train_test_split
# NOTE: sklearn.datasets.samples_generator was removed in scikit-learn
# 0.24; import the generators from sklearn.datasets directly.
# from sklearn.datasets import make_classification
# from sklearn.svm import SVC
# import matplotlib.pyplot as plt
# x, y = make_classification(n_samples=300,
#                            n_features=2,
#                            n_redundant=0,
#                            n_informative=2,
#                            random_state=22,
#                            n_clusters_per_class=1,
#                            scale=100)
# plt.scatter(x[:, 0], x[:, 1], c=y)  # was plt.plot.scatter: AttributeError
# plt.show()

# Random forest:
# from sklearn.ensemble import RandomForestClassifier
# X = [[0,0],[1,1]]
# Y = [0,1]
# clf = RandomForestClassifier(n_estimators=10)
# clf = clf.fit(X, Y)
# print(clf)
# ------
# from sklearn.model_selection import cross_val_score
# from sklearn.datasets import make_blobs
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.tree import DecisionTreeClassifier
# X, y = make_blobs(n_samples=10000, n_features=10, centers=100)
# min_impurity_split was deprecated and removed; min_samples_split=2
# (the default) is what this classic example intends:
# clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2)
# scores = cross_val_score(clf, X, y)
# print(scores.mean())