Machine Learning Data Classification

Picked up a rather thankless freelance job: build a machine learning classifier with an SVM, a decision tree, and a random forest, and then do hyperparameter tuning on top.

import pandas as pd

# Load the raw data; in Sup2.csv each column is one sample and the column header carries its label string.
data = pd.read_csv('./Sup2.csv')

# Transpose so that each row is a sample, then move the old index
# (the label string) into a regular 'target' column.
data = data.T
data = data.reset_index()
data = data.rename(columns={'index': 'target'})

# The label string looks like '<value>-<suffix>'; keep the numeric part,
# then collapse it into discrete classes: 0.5 becomes 1, anything >= 3 becomes 3.
data['target'] = data['target'].apply(lambda x: float(x.split('-')[0]))
data['target'] = data['target'].apply(lambda x: 1 if x == 0.5 else 3 if x >= 3 else x)
print(data['target'].value_counts())
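Before moving on, it is worth confirming that the feature matrix is fully numeric and free of missing values, since PCA below will fail on NaNs. A minimal sketch (the to_numeric coercion and the fillna(0) fallback are illustrative choices, not part of the original script):

# Coerce features to numeric and check for missing values before PCA (illustrative only).
feature_cols = [c for c in data.columns if c != 'target']
data[feature_cols] = data[feature_cols].apply(pd.to_numeric, errors='coerce')
print("missing values:", data[feature_cols].isna().sum().sum())
data[feature_cols] = data[feature_cols].fillna(0)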

from sklearn import svm
from sklearn.model_selection import train_test_split

# Split off the label: the first column is 'target', everything else is a feature.
columns = list(data.columns)
X_columns = columns[1:]

X = data[X_columns]
y = data['target']
from sklearn.decomposition import PCA

# Reduce dimensionality, keeping enough components to explain 99% of the variance.
pca = PCA(n_components=0.99)
X = pca.fit_transform(X)
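As a quick sanity check on how much PCA compressed the feature space, the number of retained components and their cumulative explained variance can be printed (a minimal sketch using standard PCA attributes):

# Report how many principal components survived and how much variance they capture.
print("components kept:", pca.n_components_)
print("total explained variance:", pca.explained_variance_ratio_.sum())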


train_data, test_data, train_label, test_label = train_test_split(X, y, random_state=1, train_size=0.85)

# 3. Train an SVM classifier with an RBF kernel.
# kernel options: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'};
# C, gamma and decision_function_shape='ovr' (one-vs-rest) could also be set explicitly.
classifier = svm.SVC(kernel='rbf')
classifier.fit(train_data, train_label)

# 4. Accuracy of the SVC classifier on the training and test splits.
print("SVM training set score:", classifier.score(train_data, train_label))
print("SVM test set score:", classifier.score(test_data, test_label))

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Baseline decision tree before any tuning.
tree_D = DecisionTreeClassifier(random_state=5)
tree_D.fit(train_data, train_label)
tree_D_trainScore = tree_D.score(train_data, train_label)
tree_D_testScore = tree_D.score(test_data, test_label)
print("Decision tree training score before tuning:", tree_D_trainScore)
print("Decision tree test score before tuning:", tree_D_testScore)

# Grid search over the main decision-tree hyperparameters with 5-fold CV on the training split.
param_grid = {'criterion': ['entropy', 'gini'], 'max_depth': range(2, 14),
              'min_samples_leaf': range(1, 10), 'min_samples_split': range(2, 20)}
from sklearn.model_selection import GridSearchCV
GR = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
GR.fit(train_data, train_label)
print("Decision tree best CV score after grid search:", GR.best_score_)

# Grid search over max_features for a random forest with the other hyperparameters fixed.
# Note: the candidate values should not exceed the number of features kept by PCA.
param_test1 = {'max_features': range(3, 100, 5)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=40, min_samples_split=20,
                                                         min_samples_leaf=10, max_depth=5,
                                                         random_state=10),
                        param_grid=param_test1, cv=5)

gsearch1.fit(train_data, train_label)

# Mean and standard deviation of the CV score for each candidate value.
means = gsearch1.cv_results_['mean_test_score']
stds = gsearch1.cv_results_['std_test_score']
params = gsearch1.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print("mean:%f std:%f %r" % (mean, std, param))
print("best_params:", gsearch1.best_params_)

# Cross-validation: sweep n_estimators to see how the forest's mean CV accuracy evolves.
from sklearn.model_selection import cross_val_score
score = []
for n in range(1, 100):
    rfc = RandomForestClassifier(n_estimators=n, n_jobs=-1)
    rfc_score = cross_val_score(rfc, train_data, train_label, cv=5).mean()
    score.append(rfc_score)
# Best mean CV score and the n_estimators that achieved it (index + 1 because the sweep starts at 1).
print(max(score), score.index(max(score)) + 1)
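To see the trend rather than just the single best value, the sweep can be plotted against n_estimators (a minimal sketch assuming matplotlib is installed):

import matplotlib.pyplot as plt

# Mean 5-fold CV accuracy as a function of the number of trees.
plt.plot(range(1, 100), score)
plt.xlabel('n_estimators')
plt.ylabel('mean CV accuracy')
plt.show()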




# import phate
# tree_data, tree_clusters = phate.tree.gen_dla()
# phate_operator = phate.PHATE(k=5, knn=5,decay=10,t=500)
# tree_phate = phate_operator.fit_transform(data)
# lab = [0]*35 + [1]*67 + [2]*46 + [3]*113
# phate.plot.scatter2d(tree_phate, c=lab)
