In [1]:
# Imports for the Pearson-correlation examples below.
import numpy as np
from numpy.random import randn
# NOTE: `scipy.stats.stats` was a private module alias removed in SciPy 1.8;
# the supported public import path for pearsonr is `scipy.stats`.
from scipy.stats import pearsonr
In [2]:
# Correlation of two independent standard-normal samples of size 8:
# the true correlation is 0, but with n=8 the estimate is noisy.
x, y = randn(8), randn(8)
pearsonr(x, y)[0]
Out[2]:
In [3]:
# A roughly linearly increasing sample against 0..4: correlation near +1.
data1 = [0.8, 2.2, 2.8, 3.5, 4.8]
x, y = np.array(data1), np.arange(5)
pearsonr(x, y)[0]
Out[3]:
In [4]:
# Two identical sequences are perfectly correlated: r is exactly 1.
x = np.arange(15)
y = x.copy()
pearsonr(x, y)[0]
Out[4]:
In [5]:
# The same increasing data against a decreasing sequence: r near -1.
data1 = [0.8, 2.2, 2.8, 3.5, 4.8]
x = np.asarray(data1)
y = -np.arange(5)
pearsonr(x, y)[0]
Out[5]:
In [6]:
# A sequence against its negation: exact anti-correlation, r is exactly -1.
x = np.arange(15)
y = -x
pearsonr(x, y)[0]
Out[6]:
In [4]:
## Imports for greedy forward feature selection on the iris data set.
import pandas
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# Load the iris data set (the raw file has no header row), then name the columns.
iris = pandas.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
iris.columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

# Integer-encode the species labels for use as the classification target.
le = LabelEncoder()
le.fit(iris['Species'])
y = le.transform(iris['Species'])

lm = linear_model.LogisticRegression()  # the classifier being evaluated
features = ['PetalLengthCm', 'PetalWidthCm', 'SepalLengthCm', 'SepalWidthCm']  # candidate features

# Greedy forward selection: each round, add the single remaining feature that
# maximizes 5-fold cross-validated accuracy; stop as soon as no candidate
# strictly improves on the best score found so far.
# (Indentation of this loop was reconstructed; it had been lost in extraction.)
selected_features = []
rest_features = features[:]
best_acc = 0
while len(rest_features) > 0:
    temp_best_i = ''
    temp_best_acc = 0
    for feature_i in rest_features:
        temp_features = selected_features + [feature_i]
        X = iris[temp_features]  # train on the current candidate feature set
        scores = cross_val_score(lm, X, y, cv=5, scoring='accuracy')
        acc = np.mean(scores)
        if acc > temp_best_acc:
            temp_best_acc = acc
            temp_best_i = feature_i
    print("select", temp_best_i, "acc:", temp_best_acc)  # feature chosen this round and its score
    if temp_best_acc > best_acc:  # did this round improve on the best score so far?
        best_acc = temp_best_acc
        selected_features += [temp_best_i]
        rest_features.remove(temp_best_i)
    else:
        break
print("best feature set: ", selected_features, "acc: ", best_acc)
In [50]:
# Select data features with the chi-squared test.
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Load the iris data set and name its columns.
iris = pandas.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
iris.columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

# Split into the input matrix and the target vector.
raw_values = iris.values
X = raw_values[:, 0:4]
encoder = LabelEncoder()
encoder.fit(iris['Species'])
Y = encoder.transform(iris['Species'])  # integer-encode the species labels

# Score every feature by chi-squared statistic and keep the k best.
selector = SelectKBest(score_func=chi2, k=4)
kbest_fit = selector.fit(X, Y)
set_printoptions(precision=3)
print(kbest_fit.scores_)
features = kbest_fit.transform(X)
print(features)
In [51]:
# Select features by recursive feature elimination (RFE).
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Load the iris data set and name its columns.
iris = pandas.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
iris.columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

# Split into the input matrix and the target vector.
arrary = iris.values
X = arrary[:, 0:4]
le = LabelEncoder()
le.fit(iris['Species'])
Y = le.transform(iris['Species'])  # integer-encode the species labels

# Recursively eliminate features until only the 2 best remain.
model = LogisticRegression()
# scikit-learn >= 1.2 rejects n_features_to_select as a positional argument,
# so pass it by keyword (was: RFE(model, 2)).
rfe = RFE(model, n_features_to_select=2)
fit = rfe.fit(X, Y)
print("特征个数:")
print(fit.n_features_)
print("被选定的特征:")
print(fit.support_)
print("特征排名:")
print(fit.ranking_)
In [7]:
# Extract data features with principal component analysis (PCA).
from pandas import read_csv
from sklearn.decomposition import PCA

# Load the iris data set and name its columns.
iris = pandas.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
iris.columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

# Split into the input matrix and the target vector.
raw_values = iris.values
X = raw_values[:, 0:4]
encoder = LabelEncoder()
encoder.fit(iris['Species'])
Y = encoder.transform(iris['Species'])  # integer-encode the species labels

# Project the features onto the two leading principal components.
pca = PCA(n_components=2)
fit = pca.fit(X)
print("解释方差:%s" % fit.explained_variance_ratio_)
print(fit.components_)
In [47]:
# Compute feature importances with an extra-trees ensemble.
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier

# Load the iris data set and name its columns.
iris = pandas.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
iris.columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

# Split into the input matrix and the target vector.
raw_values = iris.values
X = np.array(raw_values[:, 0:4])
encoder = LabelEncoder()
encoder.fit(iris['Species'])
Y = np.array(encoder.transform(iris['Species']))  # integer-encode the species labels

# Fit the ensemble; importances are a by-product of training.
model = ExtraTreesClassifier()
fit = model.fit(X, Y)
print(fit.feature_importances_)
In [39]:
# Gini importance from a random-forest classifier.
import pandas
import numpy as np
from sklearn import ensemble
from sklearn.preprocessing import LabelEncoder

# Load the iris data set and name its columns.
iris = pandas.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
iris.columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

# Encode the species labels as integers.
le = LabelEncoder()
le.fit(iris['Species'])

# X, y, rf and features are reused by the permutation-importance cell below,
# so their names must stay as-is.
features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
y = np.array(le.transform(iris['Species']))
X = np.array(iris[features])

# Gini importance is accumulated during fitting.
rf = ensemble.RandomForestClassifier()
rf.fit(X, y)
print(rf.feature_importances_)
In [38]:
# Permutation importance: for each feature, shuffle its column in the test
# fold and record the relative drop in accuracy, averaged over 10 splits.
# Depends on X, y, rf and features defined in the previous cell.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit

# NOTE(review): no random seed is set, so results vary between runs —
# pass random_state to ShuffleSplit / seed numpy for reproducibility.
rs = ShuffleSplit(n_splits=10, test_size=0.1)
scores = np.zeros((10, 4))
count = 0
# (Indentation of this loop was reconstructed; it had been lost in extraction.)
for train_idx, test_idx in rs.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    rf.fit(X_train, y_train)  # was `r = rf.fit(...)`; the returned self was never used
    acc = accuracy_score(y_test, rf.predict(X_test))  # baseline accuracy on this split
    for i in range(len(features)):
        X_t = X_test.copy()  # copy so only one column is perturbed at a time
        np.random.shuffle(X_t[:, i])
        shuff_acc = accuracy_score(y_test, rf.predict(X_t))
        scores[count, i] = (acc - shuff_acc) / acc  # relative accuracy drop
    count += 1
print(np.mean(scores, axis=0))
In [ ]:
# Source: https://coding.net/u/RuoYun/p/Python-of-machine-learning/git/tree/master