# sklearn的模型训练与预测

sklearn是强大的Python机器学习工具，支持丰富的机器学习算法和数据预处理方法，在学术界和企业中应用广泛。下面是sklearn的代码编写流程和各种算法的使用示例（以分类为例）。

1. 创建模型对象
2. 训练
3. 预测与性能评价

## xgboost算法分类

'''
* xgboost分类
'''

from classifier import LogRegClassifier
import numpy as np
import json
import math
import time
import os
import random
from sklearn.model_selection import train_test_split
from sklearn import metrics

def main():
    """Train an XGBoost classifier on a 70/30 split and print its evaluation.

    Workflow: split the data, standardize the features, create the model,
    fit it with early stopping, then print the classification report, the
    confusion matrix and the elapsed wall-clock time.
    """
    time_begin = time.time()
    # Raw data (loading code omitted in the original article); `d` must be
    # provided by that omitted code and expose `.data` and `.labels`.
    data = d.data
    labels = d.labels
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)

    # Feature standardization. The scaler is fitted on the training split
    # only, so no test-set statistics leak into training.
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # 1. Create the model object
    from xgboost import XGBClassifier
    clf = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,  # number of trees (upper bound; early stopping may use fewer)
        max_depth=6,  # maximum tree depth
        min_child_weight=1,  # minimum sum of instance weight in a leaf
        gamma=0.,  # coefficient on the number of leaves in the penalty term
        subsample=0.8,  # sample 80% of the rows for each tree
        colsample_bytree=0.8,  # sample 80% of the features for each tree
        objective='multi:softmax',  # loss function
        scale_pos_weight=1,  # class-imbalance weighting
        random_state=27,  # random seed
    )

    # 2. Train
    # NOTE(review): the test split doubles as the early-stopping validation
    # set, so the metrics printed below are optimistic.
    # NOTE(review): xgboost >= 2.0 expects `eval_metric` and
    # `early_stopping_rounds` in the XGBClassifier constructor instead of
    # fit() -- confirm against the installed version.
    clf.fit(
        x_train,
        y_train,
        eval_set=[(x_test, y_test)],
        eval_metric="mlogloss",
        early_stopping_rounds=10,
        verbose=True,
    )

    # 3. Predict and evaluate
    np.set_printoptions(threshold=np.inf)  # print arrays in full, never truncated
    predicted = np.array(clf.predict(x_test))
    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    time_end = time.time()
    print("total time is ", time_end-time_begin)

# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()



## 随机森林算法分类

n_estimators是随机森林的一个重要调优参数，表示树的个数。

'''
* 随机森林分类
'''

from classifier import LogRegClassifier
import numpy as np
import json
import math
import time
import os
import random
from sklearn.model_selection import train_test_split
from sklearn import metrics

def main():
    """Train a random-forest classifier on a 70/30 split and print its evaluation.

    Workflow: split the data, standardize the features, create the model,
    fit it, then print the classification report, the confusion matrix and
    the elapsed wall-clock time.
    """
    time_begin = time.time()
    # Raw data (loading code omitted in the original article); `d` must be
    # provided by that omitted code and expose `.data` and `.labels`.
    data = d.data
    labels = d.labels
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)

    # Feature standardization. The scaler is fitted on the training split
    # only, so no test-set statistics leak into training.
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # 1. Create the model object
    # Import the estimator explicitly: a bare `import sklearn` does not
    # guarantee that the `sklearn.ensemble` submodule is loaded.
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=100)  # number of trees in the forest

    # 2. Train
    # RandomForestClassifier.fit takes only the training data (plus an
    # optional sample_weight); the XGBoost-specific eval_set / eval_metric /
    # early_stopping_rounds / verbose keywords would raise TypeError here.
    clf.fit(x_train, y_train)

    # 3. Predict and evaluate
    np.set_printoptions(threshold=np.inf)  # print arrays in full, never truncated
    predicted = np.array(clf.predict(x_test))
    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    time_end = time.time()
    print("total time is ", time_end-time_begin)

# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()



©️2019 CSDN 皮肤主题: 精致技术 设计师: CSDN官方博客