python机器学习-建模(四)

在这里插入图片描述
在这里插入图片描述

5.1、线性回归

5.2、逻辑回归

from sklearn.linear_model import LogisticRegression
# cross_val_predict and KFold live in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current releases.
from sklearn.model_selection import KFold, cross_val_predict

# Custom class weights: class 0 is weighted 5, class 1 is weighted 1.
penalty = {
    0: 5,
    1: 1,
}

lr = LogisticRegression(class_weight=penalty)
# Three consecutive (unshuffled) folds, which is what the legacy
# KFold(n_samples, random_state=1) call produced: its default was 3 folds
# and random_state had no effect without shuffling.
kf = KFold(n_splits=3)
# Out-of-fold prediction for every row of `features`.
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

5.3、决策树

# Modelling with decision trees.
# Classification: fit a decision-tree classifier on X_train / y_train.
from sklearn.tree import DecisionTreeClassifier
treeclf=DecisionTreeClassifier()
treeclf.fit(X_train,y_train)
# Optional inspection: predictions on X_test, and their difference from y_test.
# print(treeclf.predict(X_test))
# print(treeclf.predict(X_test)-y_test)

# Regression: fit a decision-tree regressor on the same X_train / y_train.
from sklearn.tree import DecisionTreeRegressor
treereg=DecisionTreeRegressor()
treereg.fit(X_train,y_train)

# Score the classifier (treeclf, not the regressor) on the test data.
treeclf.score(X_test,y_test)

5.4、随机森林

# Classification with a random forest.
from sklearn.ensemble import RandomForestClassifier

# Example constructor arguments (none are passed below, so the estimator
# is built with its defaults):
#   n_estimators=10            -> 10 trees
#   class_weight="balanced"    -> class-weight setting
#   random_state=1

rfclf=RandomForestClassifier()
rfclf.fit(X_train,y_train)
# Optional inspection: predictions on X_test, and their difference from y_test.
# print(rfclf.predict(X_test))
# print(rfclf.predict(X_test)-y_test)

# Regression with a random forest, fitted on the same X_train / y_train.
from sklearn.ensemble import RandomForestRegressor
rfr=RandomForestRegressor()
rfr.fit(X_train,y_train)

5.5、支持向量机

from sklearn import svm

# Support-vector classifier using a linear kernel (kernel='linear').
svmclf=svm.SVC(kernel='linear')

svmclf.fit(X_train,y_train)
# Optional inspection: predictions on X_test, and their difference from y_test.
# print(svmclf.predict(X_test))
# print(svmclf.predict(X_test)-y_test)
# Score the fitted classifier on the test data.
svmclf.score(X_test,y_test)

5.6、聚类(K-means)

 from sklearn.cluster import KMeans

#n_clusters=3指定三个中心
clt=KMeans(n_clusters=3)

clt.fit(X)
# 打印结果
print(clt.predict(X))
print(clt.labels_)
# 打印中心点
print(clt.cluster_centers_)

5.7、降维(PCA)

from sklearn.decomposition import PCA
# Keep 6 components; the projection is fitted on X_train only.
pca=PCA(n_components=6)
pca.fit(X_train)

# Variance explained by each retained component (these are variances,
# not the component vectors themselves).
pca.explained_variance_

# Regression on the PCA-reduced features: project both splits with the
# already-fitted PCA, then fit and evaluate an ElasticNetCV model.
X_train_pca,X_test_pca=pca.transform(X_train),pca.transform(X_test)
from sklearn.linear_model import ElasticNetCV

netreg=ElasticNetCV()
netreg.fit(X_train_pca,y_train)
netreg.predict(X_test_pca)
netreg.score(X_test_pca,y_test)

5.8、管道训练

# 将Boston房价问题的数据预处理,数据降维和回归分析过程构建成管道

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import Pipeline

# NOTE(review): load_boston is not shipped by recent scikit-learn releases —
# confirm the installed version still provides it.
boston=datasets.load_boston()
# Hold out one third of the samples for testing; random_state=0 is passed to the split.
X_train,X_test,y_train,y_test=train_test_split(boston.data,boston.target,test_size=1/3,random_state=0)

# Build the pipeline: min-max scaling -> PCA -> ElasticNetCV regression.
# (The make_pipeline function is a convenient alternative for building one.)
pipe=Pipeline([('scaler',preprocessing.MinMaxScaler()),('pca',PCA()),('net',ElasticNetCV())])

# Parameters can be given when each step is constructed, or set together
# afterwards; note the "<step name>__<parameter>" naming used to address them.
pipe.set_params(scaler__feature_range=(0,1),pca__n_components=6)

# Fit the whole pipeline on the training data.
pipe.fit(X_train,y_train)

# Print the predictions for X_test and the pipeline's score on the test data.
print(pipe.predict(X_test))
print(pipe.score(X_test,y_test))

使用特征联合FeatureUnion
Pipeline是estimator的串联,而FeatureUnion则是estimator的并联。但是FeatureUnion并联的只能是transformer(转换器)。
FeatureUnion合并了多个转换器对象形成一个新的转换器,该转换器合并了他们的输出。输出的样本向量被横向连接成更长的向量。
可以结合FeatureUnion和Pipeline来创造出更加复杂的模型。

from sklearn import datasets
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
# NOTE(review): load_boston is not shipped by recent scikit-learn releases —
# confirm the installed version still provides it.
boston=datasets.load_boston()

# Combine two transformers in parallel: a 3-component PCA and a
# 5-component KernelPCA.
united=FeatureUnion([('linear_pca',PCA(n_components=3)),
                    ('kernel_pca',KernelPCA(n_components=5))])

# Shape of the combined output; with the transformer outputs joined side
# by side this should have 3 + 5 = 8 columns.
united.fit_transform(boston.data).shape

小案例

from sklearn.cross_validation import KFold

# 定义一个函数
def run_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold predictions from shuffled 5-fold cross-validation.

    For each fold a fresh ``clf_class(**kwargs)`` is fitted on the other
    four folds and used to predict the held-out fold, so every entry of the
    result comes from a model that did not see that sample during fitting.

    Args:
        X: Feature array; indexed with integer index arrays (``X[idx]``).
        y: Target array with the same length as ``X``; must support
            ``.copy()`` and integer-array indexing.
        clf_class: Estimator class (not an instance) exposing ``fit`` and
            ``predict``.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        A copy of ``y`` in which every element has been overwritten by the
        prediction for that sample.
    """
    # Imported here so the function does not rely on the legacy
    # sklearn.cross_validation module, whose KFold(n, n_folds=...) signature
    # and direct iteration no longer exist.
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=5, shuffle=True)
    # Start from a copy so the result has the same type/shape as y.
    y_pred = y.copy()

    for train_index, test_index in kf.split(X):
        # A new, unfitted model per fold.
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_pred[test_index] = clf.predict(X[test_index])
    return y_pred
 
    
# 转成概率值,同样是流失用户,有些用户流失的概率会更大,需要优先处理
def run_prob_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class probabilities from shuffled 5-fold CV.

    Same procedure as a plain cross-validated prediction, but stores the
    output of ``predict_proba`` instead of hard labels so that samples can
    be ranked by predicted probability.

    Args:
        X: Feature array; indexed with integer index arrays (``X[idx]``).
        y: Target array with the same length as ``X``.
        clf_class: Estimator class (not an instance) exposing ``fit`` and
            ``predict_proba``.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        Array of shape ``(len(y), 2)``; the result buffer is allocated with
        exactly two columns, so ``predict_proba`` must return two columns
        (a two-class problem).
    """
    # Imported here so the function does not rely on the legacy
    # sklearn.cross_validation module, whose KFold(n, n_folds=...) signature
    # and direct iteration no longer exist.
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        # A new, unfitted model per fold.
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        # Store per-class probabilities rather than predicted labels.
        y_prob[test_index] = clf.predict_proba(X[test_index])
    return y_prob
    
# 调用上面函数,进行训练
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN

# 评分函数
def accuracy(y_true,y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    # The comparison yields booleans; taking the mean counts True as 1
    # and False as 0.
    matches = y_true == y_pred
    return np.mean(matches)

# Cross-validate each classifier in turn and print its accuracy to three
# decimal places, preceded by a heading line.
for heading, model_class in (
    ("Support vector machines:", SVC),
    ("Random forest:", RF),
    ("K-nearest-neighbors:", KNN),
):
    print(heading)
    print("%.3f" % accuracy(y, run_cv(X, y, model_class)))

最终结果预测

import warnings
# Silence every warning from here on (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# Second probability column; treated here as the probability of churn
# (label 1, matching the `y == 1` test below).
pred_churn = pred_prob[:, 1]
is_churn = y == 1

# Number of observations assigned each distinct predicted probability.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
counts = pd.Series(pred_churn).value_counts()

# Observed churn rate among the observations that share each predicted
# probability. The Series is built once, after all rates are computed
# (the conversion used to sit inside the loop and re-wrapped the partial
# result on every iteration).
true_prob = pd.Series({
    prob: np.mean(is_churn[pred_churn == prob])
    for prob in counts.index
})

# Align both Series on the predicted probability and flatten to a table.
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts

# pred_prob: predicted churn probability; count: number of observations with
# that prediction; true_prob: fraction of those observations that actually
# churned (an observed rate, not a prediction accuracy).


在这里插入图片描述
查看模型的参数

参数

# Show the estimator's parameters.
# NOTE(review): in the code shown here `clf` is only assigned as a local
# variable inside run_cv / run_prob_cv, so an estimator must be bound to
# `clf` at top level before this line can run — confirm which model is meant.
clf.get_params()

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值