5.1 Linear Regression
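A minimal sketch of fitting an ordinary least squares model with sklearn's LinearRegression, assuming a train/test split (X_train, X_test, y_train, y_test) like the one used in the sections below:

from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the assumed training split
linreg = LinearRegression()
linreg.fit(X_train, y_train)
# Fitted coefficients and intercept
print(linreg.coef_, linreg.intercept_)
# R^2 score on the test set
linreg.score(X_test, y_test)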
5.2 Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_predict
import pandas as pd

# Custom class weights: errors on class 0 are penalized five times as heavily as class 1
penalty = {
    0: 5,
    1: 1
}
lr = LogisticRegression(class_weight=penalty)
kf = KFold(n_splits=3, shuffle=True, random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
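A hedged way to see what the custom class weights do is to compare the out-of-fold predictions with the true labels; this sketch assumes target is a 0/1 array-like, as implied above:

import numpy as np
# Cross-tabulate actual vs. predicted labels (assumes target holds 0/1 labels)
actual = np.asarray(target)
print(pd.crosstab(actual, predictions.values, rownames=['actual'], colnames=['predicted']))
print('accuracy:', (predictions.values == actual).mean())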
5.3 Decision Trees
# Modeling with a decision tree
# Classification
from sklearn.tree import DecisionTreeClassifier
treeclf = DecisionTreeClassifier()
treeclf.fit(X_train, y_train)
# print(treeclf.predict(X_test))
# print(treeclf.predict(X_test) - y_test)
# Regression with a decision tree
from sklearn.tree import DecisionTreeRegressor
treereg = DecisionTreeRegressor()
treereg.fit(X_train, y_train)
# Score the model on the test set
treeclf.score(X_test, y_test)
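As a small, hedged addition on the same split, the fitted trees expose a few useful attributes:

# Feature importances learned by the classification tree
print(treeclf.feature_importances_)
# R^2 of the regression tree on the test set
print(treereg.score(X_test, y_test))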
5.4 Random Forests
# Classification
from sklearn.ensemble import RandomForestClassifier
# Parameters:
#   number of trees: n_estimators=10
#   class weighting: class_weight="balanced"
#   random_state=1
rfclf = RandomForestClassifier(n_estimators=10, class_weight="balanced", random_state=1)
rfclf.fit(X_train, y_train)
# print(rfclf.predict(X_test))
# print(rfclf.predict(X_test) - y_test)
# Regression
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
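A brief, hedged sketch of inspecting the fitted forest on the same split:

# Accuracy of the forest on the test set
print(rfclf.score(X_test, y_test))
# Feature importances averaged over all trees in the ensemble
print(rfclf.feature_importances_)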
5.5 Support Vector Machines
from sklearn import svm
# Linear kernel: kernel='linear'
svmclf = svm.SVC(kernel='linear')
svmclf.fit(X_train, y_train)
# print(svmclf.predict(X_test))
# print(svmclf.predict(X_test) - y_test)
svmclf.score(X_test, y_test)
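Besides the linear kernel, an RBF kernel is often worth trying; a minimal sketch where the C and gamma values are illustrative assumptions, not tuned choices:

# RBF kernel with illustrative (untuned) hyperparameters
svmrbf = svm.SVC(kernel='rbf', C=1.0, gamma='scale')
svmrbf.fit(X_train, y_train)
print(svmrbf.score(X_test, y_test))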
5.6 Clustering (K-means)
from sklearn.cluster import KMeans
# n_clusters=3 asks for three cluster centers
clt = KMeans(n_clusters=3)
clt.fit(X)
# Print the cluster assignments
print(clt.predict(X))
print(clt.labels_)
# Print the cluster centers
print(clt.cluster_centers_)
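The choice n_clusters=3 is not automatic; a common heuristic is the elbow method, watching how inertia_ (the within-cluster sum of squares) falls as k grows. A sketch on the same X:

# Elbow method: fit K-means for several k and compare the inertia
for k in range(2, 7):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    print(k, km.inertia_)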
5.7 Dimensionality Reduction (PCA)
from sklearn.decomposition import PCA
pca = PCA(n_components=6)
pca.fit(X_train)
# Variance explained by each principal component
pca.explained_variance_
# Regression on the reduced data
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)
from sklearn.linear_model import ElasticNetCV
netreg = ElasticNetCV()
netreg.fit(X_train_pca, y_train)
netreg.predict(X_test_pca)
netreg.score(X_test_pca, y_test)
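To decide how many components to keep, the cumulative explained variance ratio is usually more informative than the raw variances; a short sketch on the same fitted pca object:

import numpy as np
# Fraction of total variance captured by each component, and its running total
print(pca.explained_variance_ratio_)
print(np.cumsum(pca.explained_variance_ratio_))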
5.8 Pipeline Training
# Build the Boston housing workflow (preprocessing, dimensionality reduction, regression) as a single pipeline
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import Pipeline
boston = datasets.load_boston()
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=1/3, random_state=0)
# Build the pipeline; make_pipeline is a convenient shortcut for the same thing
pipe = Pipeline([('scaler', preprocessing.MinMaxScaler()), ('pca', PCA()), ('net', ElasticNetCV())])
# Parameters can be set when the pipeline is defined or all at once afterwards;
# note the step__param naming convention
pipe.set_params(scaler__feature_range=(0, 1), pca__n_components=6)
# Fit the pipeline on the training data
pipe.fit(X_train, y_train)
# Predict
print(pipe.predict(X_test))
print(pipe.score(X_test, y_test))
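Because every step's parameters are reachable through the step__param naming scheme, the whole pipeline can be tuned in one search; a hedged sketch with GridSearchCV, where the candidate values are illustrative:

from sklearn.model_selection import GridSearchCV
# Search over the number of PCA components kept inside the pipeline (candidate values are illustrative)
param_grid = {'pca__n_components': [4, 6, 8]}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)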
Using FeatureUnion
A Pipeline chains estimators in series, while a FeatureUnion runs them in parallel; however, FeatureUnion can only combine transformers.
FeatureUnion merges several transformer objects into a new transformer that combines their outputs: each sample's output vectors are concatenated side by side into one longer vector.
FeatureUnion and Pipeline can be combined to build more complex models; a sketch of this follows the example below.
from sklearn import datasets
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
boston=datasets.load_boston()
united = FeatureUnion([('linear_pca', PCA(n_components=3)),
                       ('kernel_pca', KernelPCA(n_components=5))])
united.fit_transform(boston.data).shape
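As mentioned above, a FeatureUnion can itself be one step of a Pipeline; a minimal sketch that feeds the concatenated PCA features into an ElasticNetCV regressor (the combination is illustrative):

from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNetCV
# Use the FeatureUnion as the feature-building step of a Pipeline
combined = Pipeline([('features', united), ('net', ElasticNetCV())])
combined.fit(boston.data, boston.target)
print(combined.score(boston.data, boston.target))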
A Small Case Study
from sklearn.model_selection import KFold
import numpy as np
# Helper that returns out-of-fold predictions for any classifier
def run_cv(X, y, clf_class, **kwargs):
    # Construct a k-folds object
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    # Train on each fold and predict the held-out fold
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Instantiate the classifier with the given keyword arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
# Return probabilities instead of labels: among predicted churners, those with a higher
# probability should be handled first
def run_prob_cv(X, y, clf_class, **kwargs):
    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        # predict_proba returns per-class probabilities
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
# Call the functions above to train and score each model
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
# Scoring function
def accuracy(y_true, y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)
print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X, y, SVC)))
print("Random forest:")
print("%.3f" % accuracy(y, run_cv(X, y, RF)))
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X, y, KNN)))
Final Prediction Results
import warnings
warnings.filterwarnings('ignore')
# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# print(pred_prob[0])
pred_churn = pred_prob[:, 1]
is_churn = y == 1
# Number of times each predicted probability is assigned to an observation
counts = pd.value_counts(pred_churn)
# print(counts)
# Calculate the observed churn rate for each predicted probability
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)
# Combine the counts and observed rates into one table
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts
# pred_prob: predicted churn probability; count: number of users with that prediction;
# true_prob: observed churn rate among them
Inspecting Model Parameters
# get_params returns every parameter of the estimator as a dict
clf.get_params()
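get_params has a counterpart, set_params, which is the mechanism behind the pipeline's step__param syntax; a minimal sketch on the random forest from section 5.4:

# Read the current settings, then change one of them in place
print(rfclf.get_params())
rfclf.set_params(n_estimators=20)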