一,自定义数据样本
from sklearn import datasets
import matplotlib.pyplot as plt

# Create a customized synthetic regression data set: 100 samples, one feature,
# one target. `noise` controls how widely the targets scatter (their degree
# of dispersion).
X, y = datasets.make_regression(
    n_samples=100,
    n_features=1,
    n_targets=1,
    noise=10,
)

# Visualize the generated samples as a scatter plot.
plt.scatter(X, y)
plt.show()
二,常用实验数据汇总
# Loaders for the small toy data sets bundled with scikit-learn.
from sklearn.datasets import load_iris
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2; this import only works on older releases -- confirm the target version.
from sklearn.datasets import load_boston
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_digits
from sklearn.datasets import load_linnerud
from sklearn.datasets import load_wine
# Fixed typo: the loader is `load_breast_cancer` (was `load_breast_canner`,
# which raises ImportError).
from sklearn.datasets import load_breast_cancer

# Load iris and pull out the feature matrix, the target vector and the
# shape of the feature matrix.
iris = load_iris()
iris_X = iris.data
iris_Y = iris.target
shape = iris.data.shape
三,模型的常用参数
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn import datasets

# Fit a linear regression on the iris data and inspect the fitted model's
# commonly used attributes and methods.
iris = load_iris()
iris_X = iris.data
iris_Y = iris.target

# Hold out 30% of the samples. No random_state is given, so the split (and
# therefore every number printed below) changes from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(iris_X, iris_Y, test_size=0.3)

lr = LinearRegression()
lr.fit(X_train, Y_train)

# print(...) with a single argument behaves identically on Python 2 and 3;
# the original `print x` statement form is a SyntaxError on Python 3.
# The trailing comments are sample output from one run.
print(lr.coef_)  # [-0.1025279 -0.10673591 0.18254043 0.69219621]
print(lr.intercept_)  # 0.4061787783812755
print(lr.get_params())  # {'copy_X': True, 'normalize': False, 'n_jobs': None, 'fit_intercept': True}
# Score (R^2) computed on the full data set, i.e. including the rows the
# model was trained on, so it is an optimistic estimate.
print(lr.score(iris_X, iris_Y))  # 0.9293519985342178
四,标准化数据
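归一化(Normalization):把数变为[0,1]之间的小数,主要是为了数据处理方便提出来的,把数据映射到0~1范围之内处理,更加便捷快速,应该归到数字信号处理范畴之内.一般方法是最小-最大规范化(min-max)的方法: x' = (x-min(x))/(max(x)-min(x)).注意:下面代码使用的 preprocessing.scale 做的是标准化(Standardization),即 x' = (x-mean(x))/std(x),使每个特征的均值为0、方差为1,而不是最小-最大归一化.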
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC

# Compare SVC accuracy with and without feature scaling on a synthetic
# two-feature classification problem whose features are blown up by scale=100.
X, y = make_classification(
    n_samples=300,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    random_state=22,
    n_clusters_per_class=1,
    scale=100,
)

# preprocessing.scale returns a new, standardized array and leaves its input
# untouched, so the result has to be assigned back. The original code dropped
# the return value, which meant X was never scaled and the scores noted at
# the time (0.44 vs 0.51) only reflected the unseeded random split below.
# Comment this line out to see the accuracy without scaling.
X = preprocessing.scale(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3)
clf = SVC()
clf.fit(X_train, Y_train)
# Mean accuracy on the held-out 30%; print(...) works on Python 2 and 3.
print(clf.score(X_test, Y_test))