5.1 Linear Regression
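A minimal sketch of fitting an ordinary least squares model with sklearn's LinearRegression, assuming a train/test split (X_train, X_test, y_train, y_test) like the one used in the sections below:

from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the assumed training split
linreg = LinearRegression()
linreg.fit(X_train, y_train)
# Fitted coefficients and intercept
print(linreg.coef_, linreg.intercept_)
# R^2 score on the test set
linreg.score(X_test, y_test)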
5.2 Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_predict
import pandas as pd

# Custom class weights: errors on class 0 are penalized five times as heavily as class 1
penalty = {
    0: 5,
    1: 1
}
lr = LogisticRegression(class_weight=penalty)
kf = KFold(n_splits=3, shuffle=True, random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
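A hedged way to see what the custom class weights do is to compare the out-of-fold predictions with the true labels; this sketch assumes target is a 0/1 array-like, as implied above:

import numpy as np
# Cross-tabulate actual vs. predicted labels (assumes target holds 0/1 labels)
actual = np.asarray(target)
print(pd.crosstab(actual, predictions.values, rownames=['actual'], colnames=['predicted']))
print('accuracy:', (predictions.values == actual).mean())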
5.3 Decision Trees
# Modeling with a decision tree
# Classification
from sklearn.tree import DecisionTreeClassifier
treeclf = DecisionTreeClassifier()
treeclf.fit(X_train, y_train)
# print(treeclf.predict(X_test))
# print(treeclf.predict(X_test) - y_test)
# Regression with a decision tree
from sklearn.tree import DecisionTreeRegressor
treereg = DecisionTreeRegressor()
treereg.fit(X_train, y_train)
# Score the model on the test set
treeclf.score(X_test, y_test)
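As a small, hedged addition on the same split, the fitted trees expose a few useful attributes:

# Feature importances learned by the classification tree
print(treeclf.feature_importances_)
# R^2 of the regression tree on the test set
print(treereg.score(X_test, y_test))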
5.4 Random Forests
# Classification
from sklearn.ensemble import RandomForestClassifier
# Parameters:
#   number of trees: n_estimators=10
#   class weighting: class_weight="balanced"
#   random_state=1
rfclf = RandomForestClassifier(n_estimators=10, class_weight="balanced", random_state=1)
rfclf.fit(X_train, y_train)
# print(rfclf.predict(X_test))
# print(rfclf.predict(X_test) - y_test)
# Regression
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
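A brief, hedged sketch of inspecting the fitted forest on the same split:

# Accuracy of the forest on the test set
print(rfclf.score(X_test, y_test))
# Feature importances averaged over all trees in the ensemble
print(rfclf.feature_importances_)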
5.5 Support Vector Machines
from sklearn import svm
# Linear kernel: kernel='linear'
svmclf = svm.SVC(kernel='linear')
svmclf.fit(X_train, y_train)
# print(svmclf.predict(X_test))
# print(svmclf.predict(X_test) - y_test)
svmclf.score(X_test, y_test)
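Besides the linear kernel, an RBF kernel is often worth trying; a minimal sketch where the C and gamma values are illustrative assumptions, not tuned choices:

# RBF kernel with illustrative (untuned) hyperparameters
svmrbf = svm.SVC(kernel='rbf', C=1.0, gamma='scale')
svmrbf.fit(X_train, y_train)
print(svmrbf.score(X_test, y_test))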
5.6 Clustering (K-means)
from sklearn.cluster import KMeans
# n_clusters=3 asks for three cluster centers
clt = KMeans(n_clusters=3)
clt.fit(X)
# Print the cluster assignments
print(clt.predict(X))
print(clt.labels_)
# Print the cluster centers
print(clt.cluster_centers_)
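The choice n_clusters=3 is not automatic; a common heuristic is the elbow method, watching how inertia_ (the within-cluster sum of squares) falls as k grows. A sketch on the same X:

# Elbow method: fit K-means for several k and compare the inertia
for k in range(2, 7):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    print(k, km.inertia_)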
5.7 Dimensionality Reduction (PCA)
from sklearn.decomposition import PCA
pca = PCA(n_components=6)
pca.fit(X_train)
# Variance explained by each principal component
pca.explained_variance_
# Regression on the reduced data
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)
from sklearn.linear_model import ElasticNetCV
netreg = ElasticNetCV()
netreg.fit(X_train_pca, y_train)
netreg.predict(X_test_pca)
netreg.score(X_test_pca, y_test)
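To decide how many components to keep, the cumulative explained variance ratio is usually more informative than the raw variances; a short sketch on the same fitted pca object:

import numpy as np
# Fraction of total variance captured by each component, and its running total
print(pca.explained_variance_ratio_)
print(np.cumsum(pca.explained_variance_ratio_))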
5.8 Pipeline Training
# Build the Boston housing workflow (preprocessing, dimensionality reduction, regression) as a single pipeline
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import Pipeline
boston = datasets.load_boston()
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=1/3, random_state=0)
# Build the pipeline; make_pipeline is a convenient shortcut for the same thing
pipe = Pipeline([('scaler', preprocessing.MinMaxScaler()), ('pca', PCA()), ('net', ElasticNetCV())])
# Parameters can be set when the pipeline is defined or all at once afterwards;
# note the step__param naming convention
pipe.set_params(scaler__feature_range=(0, 1), pca__n_components=6)
# Fit the pipeline on the training data
pipe.fit(X_train, y_train)
# Predict
print(pipe.predict(X_test))
print(pipe.score(X_test, y_test))
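Because every step's parameters are reachable through the step__param naming scheme, the whole pipeline can be tuned in one search; a hedged sketch with GridSearchCV, where the candidate values are illustrative:

from sklearn.model_selection import GridSearchCV
# Search over the number of PCA components kept inside the pipeline (candidate values are illustrative)
param_grid = {'pca__n_components': [4, 6, 8]}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)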
Using FeatureUnion
A Pipeline chains estimators in series, while a FeatureUnion runs them in parallel; however, FeatureUnion can only combine transformers.
FeatureUnion merges several transformer objects into a new transformer that combines their outputs: each sample's output vectors are concatenated side by side into one longer vector.
FeatureUnion and Pipeline can be combined to build more complex models; a sketch of this follows the example below.
from sklearn import datasets
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
boston=datasets.load_boston()
united = FeatureUnion([('linear_pca', PCA(n_components=3)),
                       ('kernel_pca', KernelPCA(n_components=5))])
united.fit_transform(boston.data).shape
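As mentioned above, a FeatureUnion can itself be one step of a Pipeline; a minimal sketch that feeds the concatenated PCA features into an ElasticNetCV regressor (the combination is illustrative):

from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNetCV
# Use the FeatureUnion as the feature-building step of a Pipeline
combined = Pipeline([('features', united), ('net', ElasticNetCV())])
combined.fit(boston.data, boston.target)
print(combined.score(boston.data, boston.target))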
A Small Case Study
from sklearn.model_selection import KFold
import numpy as np
# Helper that returns out-of-fold predictions for any classifier
def run_cv(X, y, clf_class, **kwargs):
    # Construct a k-folds object
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    # Train on each fold and predict the held-out fold
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Instantiate the classifier with the given keyword arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
# Return probabilities instead of labels: among predicted churners, those with a higher
# probability should be handled first
def run_prob_cv(X, y, clf_class, **kwargs):
    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        # predict_proba returns per-class probabilities
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
# Call the functions above to train and score each model
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
# Scoring function
def accuracy(y_true, y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)
print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X, y, SVC)))
print("Random forest:")
print("%.3f" % accuracy(y, run_cv(X, y, RF)))
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X, y, KNN)))
Final Prediction Results
import warnings
warnings.filterwarnings('ignore')
# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# print(pred_prob[0])
pred_churn = pred_prob[:, 1]
is_churn = y == 1
# Number of times each predicted probability is assigned to an observation
counts = pd.value_counts(pred_churn)
# print(counts)
# Calculate the observed churn rate for each predicted probability
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)
# Combine the counts and observed rates into one table
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts
# pred_prob: predicted churn probability; count: number of users with that prediction;
# true_prob: observed churn rate among them
Inspecting Model Parameters
# get_params returns every parameter of the estimator as a dict
clf.get_params()
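get_params has a counterpart, set_params, which is the mechanism behind the pipeline's step__param syntax; a minimal sketch on the random forest from section 5.4:

# Read the current settings, then change one of them in place
print(rfclf.get_params())
rfclf.set_params(n_estimators=20)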