# 1. 数据方面: 主要包含 pandas 操作 (data handling: mainly pandas operations)
# Plotting/pandas setup plus basic DataFrame operations on the training data.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

sns.set(font="simhei")  # let seaborn heatmaps render Chinese characters
plt.rcParams['font.sans-serif'] = ['SimHei']  # Chinese font for matplotlib
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
pd.set_option("display.max_rows", None)

data = pd.read_csv('train.csv')
# Inspect with info / describe / head / value_counts, etc.
print(data.shape)
print(data['label'].unique())

####### drop a column, drop NaN rows, rename columns ##########
x = data.drop('label', axis=1)
data = data.dropna()  # axis=1 would drop columns instead of rows
# rename returns a new frame, so assign it back (the original notes used
# smart quotes -- a syntax error -- and a stray space in the new name).
data = data.rename(columns={'old_name': 'new_name'})

####### row-wise sum #############
data['col3'] = data[['col1', 'col2']].sum(axis=1)
# data = data.reset_index(drop=True)  # rebuild the index after filtering

###### label handling #############
classes = data.loc[:, 'label']  # all labels as a Series
# Map textual booleans to 0/1 (the original referenced an undefined `df`).
data.label = data.label.astype(str).map({'False.': 0, 'True.': 1})
######### correlations: Pearson coefficients + a seaborn heatmap work well here
# 2. 归一化、独热编码 (normalization & one-hot encoding):
# Train/test split, feature scaling, label encoding, and one-hot encoding.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split  # was missing in the notes

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

######## standardization #########
std = StandardScaler()  # swap in MinMaxScaler() for [0, 1] scaling instead
x_train = std.fit_transform(x_train)  # fit statistics on the training split only...
x_test = std.transform(x_test)        # ...then apply the same statistics to test

######## text -> integer codes #####
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()  # the original called the un-imported `preprocessing` module
# NOTE(review): apply() refits the single encoder column by column -- only the
# last column's fitted classes survive in `le`; verify this is intended.
train_x = data.apply(le.fit_transform)

####### categorical encoding ######## several options
## option 1: ordinal integer codes (not true one-hot) ##
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories='auto').fit(data_ca)
result = enc.transform(data_ca)
## option 2: pandas dummy variables ##
data_dummies = pd.get_dummies(data[['col', 'col2']])
## option 3: one-hot label matrix for keras ##
from tensorflow.keras.utils import to_categorical  # `np_utils` is the deprecated path
y_train = to_categorical(y_train, num_classes=10)  # for labels; e.g. ten classes
# e.g. [1 0 0]
# --------------
# [[0. 1.]
#  [1. 0.]
#  [1. 0.]]
# 3. 模型: 只针对分类问题 (models: classification problems only)
# Cross-validate a suite of sklearn classifiers, then report hold-out accuracy.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn import model_selection  # was missing in the notes

# define scoring method
scoring = 'accuracy'
seed = 42  # `seed` was referenced but never defined in the original notes

# Models to train (commented-out entries are slow; re-enable as needed).
names = [
    "Nearest Neighbors",
    # "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    # "AdaBoost",
    "Naive Bayes",
    "SVM Linear",
    "SVM RBF",
    "SVM Sigmoid",
]
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    # GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    # AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid'),
]

# Evaluate each model in turn.  The original loop body was unindented (a
# syntax error), and it reassigned `names = []` mid-script; a separate
# accumulator avoids that shadowing.
results = []
evaluated_names = []
for name, model in zip(names, classifiers):
    kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
    cv_results = model_selection.cross_val_score(
        model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    evaluated_names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    # Refit on the full training split and report hold-out performance.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Test-- ', name, ': ', accuracy_score(y_test, predictions))
    print()
    print(classification_report(y_test, predictions))
# 4. 评估 (evaluation):
# Evaluation: built-in score, cross-validation, F1, confusion matrix, etc.
from sklearn import metrics
from sklearn.model_selection import cross_val_score  # was missing in the notes

########### estimator's built-in score
print(clf.score(X_test, y_test))

########### cross-validation (original comment had a typo: "cross_val_scorel")
scores = cross_val_score(clf, iris.data, iris.target, cv=5)
print('scores:', scores)
# Second spec was `{:.4}` (4 significant digits) -- made consistent with the first.
print("Accuracy: {:.4f} (+/- {:.4f})".format(scores.mean(), scores.std() * 2))

################ F1
score = metrics.f1_score(y_true=y_true, y_pred=preds, average="macro")

################ confusion matrix + heatmap visualization
from sklearn.metrics import confusion_matrix, accuracy_score
conf = confusion_matrix(test_y, preds)  # compare predictions with ground truth
label = ["0", "1"]  # binary classification here
sns.heatmap(conf, annot=True, xticklabels=label, yticklabels=label)
plt.show()

################ other metrics
print('准确率:', metrics.accuracy_score(y_true, y_pred))
print('类别精度:', metrics.precision_score(y_true, y_pred, average=None))  # no averaging
print('宏平均精度:', metrics.precision_score(y_true, y_pred, average='macro'))
print('微平均召回率:', metrics.recall_score(y_true, y_pred, average='micro'))
# 参考链接 (reference links):