# Original author's note (translated): quick contract job — build ML classifiers
# with SVM, decision tree and random forest, plus hyperparameter tuning.
# FIX: the line above was bare text without a '#' in the original, which is a
# SyntaxError in a plain Python script.
import pandas as pd

# Load the data matrix; samples appear to be columns, so transpose to get one
# row per sample.
data = pd.read_csv('./Sup2.csv')
data = data.T

# After the transpose, the sample labels live in the index; surface them as a
# regular 'target' column.
data = data.reset_index()
data = data.rename(columns={'index': 'target'})

# The label is the numeric prefix before the first '-' in the sample name.
data['target'] = data['target'].apply(lambda x: float(x.split('-')[0]))
# Collapse classes: 0.5 -> 1, anything >= 3 -> 3, everything else unchanged.
data['target'] = data['target'].apply(lambda x: 1 if x == 0.5 else 3 if x >= 3 else x)
# FIX: in a script (unlike a notebook) a bare expression is discarded — print
# the class distribution so it is actually visible.
print(data['target'].value_counts())
from sklearn import svm
from sklearn.model_selection import train_test_split as sp
from sklearn.decomposition import PCA

# Every column except 'target' is a feature.
feature_columns = list(data.columns)[1:]
X = data[feature_columns]
y = data['target']
# FIX: replaced leftover debug prints with meaningless labels ("sssss", "eer").
print("Features:\n", X)
print("Labels:\n", y)

# Reduce dimensionality, keeping enough components to explain 99% of the
# variance (n_components in (0, 1) is interpreted as an explained-variance
# ratio by scikit-learn).
pca = PCA(n_components=0.99)
X = pca.fit_transform(X)

# 85/15 train/test split; fixed random_state for reproducibility.
train_data, test_data, train_label, test_label = sp(X, y, random_state=1, train_size=0.85)
# Train an SVM classifier with an RBF kernel.
# Other kernel choices: 'linear', 'poly', 'sigmoid', 'precomputed';
# further options (C, gamma, decision_function_shape='ovr', ...) are left at
# their scikit-learn defaults.
svm_model = svm.SVC(kernel='rbf')
svm_model.fit(train_data, train_label)

# Report accuracy on both splits.
print("SVM训练集得分:", svm_model.score(train_data, train_label))
print("SVM测试集得分:", svm_model.score(test_data, test_label))
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Baseline decision tree, no tuning.
tree_D = DecisionTreeClassifier(random_state=5)
tree_D.fit(train_data, train_label)
tree_D_trainScore = tree_D.score(train_data, train_label)
tree_D_testScore = tree_D.score(test_data, test_label)
print("决策树优化前训练得分:", tree_D_trainScore)
print("决策树优化前测试得分:", tree_D_testScore)

# Hyperparameter grid for the decision tree.
param_grid = {'criterion': ['entropy', 'gini'],
              'max_depth': range(2, 14),
              'min_samples_leaf': range(1, 10),
              'min_samples_split': range(2, 20)}
GR = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
# BUG FIX: the original fitted the grid search on the TEST split
# (GR.fit(test_data, test_label)), leaking the test set into model selection
# and searching over only ~15% of the data. Hyperparameter search must run on
# the training split.
GR.fit(train_data, train_label)
print("决策树网格搜索算法优化后测试结果:", GR.best_score_)
# Random-forest tuning: search max_features with the other hyperparameters
# held fixed.
# NOTE(review): max_features candidates above the post-PCA feature count will
# raise on recent scikit-learn versions — confirm the number of retained
# components covers range(3, 100, 5).
param_test1 = {'max_features': range(3, 100, 5)}
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=40, min_samples_split=20,
                                     min_samples_leaf=10, max_depth=5,
                                     random_state=10),
    param_grid=param_test1, cv=5)
gsearch1.fit(train_data, train_label)

# Per-candidate cross-validation statistics.
means = gsearch1.cv_results_['mean_test_score']
print(means)
stds = gsearch1.cv_results_['std_test_score']
params = gsearch1.cv_results_['params']
# FIX: the original loop variable `std` shadowed the std_test_score array it
# was iterating over (worked only because zip() captures the iterable once).
for mean, std, param in zip(means, stds, params):
    print("mean:%f std:%f %r" % (mean, std, param))
print("best_params:", gsearch1.best_params_)
# Cross-validation to choose the number of trees for the random forest.
from sklearn.model_selection import cross_val_score

scores = []
for n in range(1, 100):
    rfc = RandomForestClassifier(n_estimators=n, n_jobs=-1)
    # BUG FIX: the original cross-validated on the TEST split, leaking it into
    # the choice of n_estimators; model selection must use the training split.
    scores.append(cross_val_score(rfc, train_data, train_label, cv=5).mean())

best_idx = scores.index(max(scores))
# BUG FIX: n ranges from 1, so the best n_estimators is index + 1; the
# original printed the raw list index (off by one).
print(max(scores), best_idx + 1)
# import phate
# tree_data, tree_clusters = phate.tree.gen_dla()
# phate_operator = phate.PHATE(k=5, knn=5,decay=10,t=500)
# tree_phate = phate_operator.fit_transform(data)
# lab = [0]*35 + [1]*67 + [2]*46 + [3]*113
# phate.plot.scatter2d(tree_phate, c=lab)