python中基本分类算法

最新推荐文章于 2024-07-28 15:46:11 发布

三生彼岸

最新推荐文章于 2024-07-28 15:46:11 发布

阅读量826

点赞数

文章标签： python 算法 sklearn

本文链接：https://blog.csdn.net/chenlei4366/article/details/102897944

版权

# -*- coding: utf-8 -*-
"""
Created on Sun Oct  6 12:46:25 2019

@author: Administrator
"""

import os
import numpy as np  
import pandas as pd
from matplotlib import pyplot as plt

# Mean squared error
def mse_error(y,y_predict):
    """Return the mean squared error between targets and predictions.

    Both arguments may be NumPy arrays or plain sequences (lists, tuples);
    they are converted with ``np.asarray`` before subtracting, so list
    inputs no longer fail on ``list - list``.

    Args:
        y: Ground-truth values.
        y_predict: Predicted values, broadcast-compatible with ``y``.

    Returns:
        The sum of squared residuals along the first axis divided by
        ``len(y_predict)``: a scalar for 1-D input, one value per column
        for 2-D input.

    Raises:
        ZeroDivisionError: If ``y_predict`` is empty.
    """
    n_samples = len(y_predict)
    if n_samples == 0:
        # Fail loudly instead of letting NumPy return nan with a warning.
        raise ZeroDivisionError("mse_error() needs at least one sample")
    residuals = np.asarray(y) - np.asarray(y_predict)
    # Reduce over the sample axis only, so 2-D input yields per-column errors.
    return np.sum(residuals * residuals, axis=0) / n_samples
# Classification accuracy
def accuracy_cal(y,y_predict):
    """Return the fraction of positions where ``y`` equals ``y_predict``.

    The two inputs are walked in lockstep; if their lengths differ, only
    the leading elements up to the shorter length are compared.

    Args:
        y: Iterable of true labels.
        y_predict: Iterable of predicted labels.

    Returns:
        Number of matching pairs divided by the number of compared pairs.

    Raises:
        ZeroDivisionError: If no pairs are compared (either input empty).
    """
    hits = [1 if truth == guess else 0 for truth, guess in zip(y, y_predict)]
    return sum(hits) / len(hits)

#%% Prepare the dataset
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Iris has four numeric features per sample (sepal length, sepal width,
# petal length, petal width); the label is the species:
# setosa, versicolour or virginica.
iris = load_iris()
x, y = iris.data, iris.target

# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

#%% Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()   # Gaussian naive Bayes with its default configuration
gnb.fit(x_train,y_train)    # estimate the model parameters from the training split
# The label reads "GNB" because the model is GaussianNB; the previous "MNB"
# label wrongly suggested a multinomial naive Bayes model.
print(u"GNB模型的预测正确率为： %.2f%%" % (100 * accuracy_cal(y_test,gnb.predict(x_test))))


#%% Logistic regression
from sklearn.linear_model import LogisticRegression as LR

# Build the logistic-regression model. Every hyper-parameter is spelled out
# so the configuration does not depend on library defaults: L2 penalty,
# liblinear solver, one-vs-rest handling of the three classes.
lr = LR(
    penalty='l2',
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='liblinear',
    max_iter=100,
    multi_class='ovr',
    verbose=0,
    warm_start=False,
    n_jobs=1,
)
lr.fit(x_train, y_train)

# Score the fitted model on the held-out split.
lr_test_predictions = lr.predict(x_test)
print(u"LR模型的预测正确率为： %.2f%%" % (100 * accuracy_cal(y_test, lr_test_predictions)))

#%% Decision tree
from sklearn.tree import DecisionTreeClassifier as DTC
# Build the decision-tree model. The former ``presort=False`` argument is
# omitted: the parameter was removed from scikit-learn, so passing it raises
# a TypeError on current releases, and False was the effective default on
# the releases that still had it.
dtc = DTC(criterion="gini",splitter="best",max_depth=None,min_samples_split=2,
                 min_samples_leaf=1,min_weight_fraction_leaf=0.,max_features=None,
                 random_state=None,max_leaf_nodes=None,class_weight=None)
dtc.fit(x_train, y_train)
# random_state=None means tie-breaking between equally good splits is not
# seeded, so the reported accuracy may vary slightly between runs.
print(u"DTC模型的预测正确率为： %.2f%%" % (100 * accuracy_cal(y_test,dtc.predict(x_test))))

#%% Random forest
# Iris is a classification task, so a RandomForestClassifier is used. The
# previous RandomForestRegressor returned averaged continuous values (e.g.
# 1.3), which accuracy_cal's exact ``==`` comparison against the integer
# labels counted as wrong whenever the trees disagreed.
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=10,criterion="gini",max_depth=None,
                 min_samples_split=2,min_samples_leaf=1,min_weight_fraction_leaf=0.,
                 # "sqrt" is what "auto" meant for classifiers; "auto" itself
                 # is no longer accepted by current scikit-learn releases.
                 max_features="sqrt",max_leaf_nodes=None,bootstrap=True,oob_score=False,
                 n_jobs=1,random_state=None,verbose=0,warm_start=False)
rf.fit(x_train,y_train)  # train the forest on the training split
print(u"rf模型的预测正确率为： %.2f%%" % (100 * accuracy_cal(y_test,rf.predict(x_test))))

#%% SVM
from sklearn.svm import SVC  # SVC = support vector classifier
# RBF-kernel SVM (``degree`` and ``coef0`` are ignored by the RBF kernel and
# are listed only for completeness).
# decision_function_shape must be 'ovr' or 'ovo'; the former value ``None``
# is rejected by current scikit-learn. 'ovr' is the library default and
# does not change the predicted labels.
svc = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto',
                 coef0=0.0, shrinking=True, probability=False,
                 tol=1e-3, cache_size=200, class_weight=None,
                 verbose=False, max_iter=-1, decision_function_shape='ovr',
                 random_state=None)
svc.fit(x_train,y_train)
print(u"SVM模型的预测正确率为： %.2f%%" % (100 * accuracy_cal(y_test,svc.predict(x_test))))

#%% xgboost
import xgboost as xgb
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
#    objective [default=reg:linear]
#    "reg:linear"      - linear regression
#    "reg:logistic"    - logistic regression
#    "binary:logistic" - binary logistic regression, outputs a probability
#    "binary:logitraw" - binary logistic regression, outputs the raw score wTx
#    "count:poisson"   - Poisson regression for count data, outputs the mean of
#                        the Poisson distribution; max_delta_step defaults to
#                        0.7 for this objective (used to safeguard optimization)
#    "multi:softmax"   - multi-class classification with the softmax objective;
#                        requires num_class (the number of classes)
#    "multi:softprob"  - like softmax, but outputs an ndata*nclass array holding
#                        the probability of each sample belonging to each class
    # 'merror' matches the multi-class objective; the previous 'rmse' is a
    # regression metric. No evaluation set is passed to xgb.train below, so
    # the metric does not influence the trained model either way.
    'eval_metric':'merror',
#    Available metrics include:
#    "rmse":     root mean squared error
#    "mae":      mean absolute error
#    "logloss":  negative log-likelihood
#    "error":    binary classification error rate, (wrong cases)/(all cases);
#                predictions above 0.5 count as the positive class, the rest
#                as the negative class
#    "error@t":  like "error" with the decision threshold set through 't'
#    "merror":   multi-class error rate, (wrong cases)/(all cases)
#    "mlogloss": multi-class log loss
#    "auc":      area under the curve
#    "ndcg":     Normalized Discounted Cumulative Gain
#    "map":      mean average precision
    'num_class': 3,
    'gamma': 0.1,
#    Default 0, alias min_split_loss. A node is split only if the split
#    reduces the loss by at least gamma. Larger values make the algorithm
#    more conservative. The useful range depends on the loss function, so
#    this value needs tuning.
    'max_depth': 6,  # maximum tree depth; deeper trees learn more specific, local patterns. 0 means no limit.
    'lambda': 2,  # L2 regularisation coefficient
    'subsample': 0.7,  # fraction of rows sampled for each tree; lower values are more conservative and reduce over-fitting,
    # but too small a value can cause under-fitting. Typical range: 0.5-1.
    'colsample_bytree': 0.7,  # fraction of columns (features) sampled for each tree. Typical range: 0.5-1.
    'min_child_weight': 3,
    # Minimum sum of instance weights required in a leaf (XGBoost uses the sum
    # of weights, whereas GBM uses the number of samples). Guards against
    # over-fitting: larger values stop the model from learning very local
    # patterns, but too large a value causes under-fitting.
    'verbosity': 0,  # 0 = silent; replaces the removed 'silent': 1 switch
    'eta': 0.1,  # default 0.3, alias learning_rate
    'seed': 1000,
    'nthread': 4,
    'scale_pos_weight':1,  # [default=1] for imbalanced binary data; commonly set to (#negative samples)/(#positive samples)
}

# Kept for backward compatibility as a concrete list of (key, value) pairs.
# The dict itself is passed to xgb.train below, because a dict_items view
# has no .copy() method and makes recent xgboost releases raise
# AttributeError during booster construction.
plst = list(params.items())
dtrain = xgb.DMatrix(x_train, label=y_train)
num_rounds = 500
xgboost = xgb.train(params, dtrain, num_rounds)

# With multi:softmax, predict() returns the class index as a float; it still
# compares equal to the integer labels inside accuracy_cal.
print(u"xgb模型的预测正确率为： %.2f%%" % (100 * accuracy_cal(y_test,xgboost.predict(xgb.DMatrix(x_test)))))

三生彼岸

关注

0
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
python中基本分类算法

# -*- coding: utf-8 -*-"""Created on Sun Oct 6 12:46:25 2019@author: Administrator"""import osimport numpy as np import pandas as pdfrom matplotlib import pyplot as plt#msedef mse_erro...
复制链接

扫一扫