For the traditional (non-deep-learning) approach, this time I focused on OOP, but the code probably came out a bit awkward. How to actually improve maintainability through OOP is something I still need to digest slowly.
- I suddenly realized this design isn't great: it makes it hard to use a sklearn Pipeline to search feature-engineering and model hyperparameters together (see the sketch after this list).
- So for a first pass it's fine to write things procedurally and refactor into OOP afterwards; designing the OO structure up front is still more than I can handle.
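A minimal sketch of what that Pipeline-based joint search could look like. The transformer, estimator, and parameter values here are placeholders for illustration, not the ones used in this project:

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold

# Chaining feature engineering and the model lets a single GridSearchCV
# tune both stages at once via step-name prefixes in the param grid.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),                 # feature-engineering step (hypothetical)
    ('clf', LogisticRegression(max_iter=1000)),   # model step (hypothetical)
])

param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],   # feature-engineering parameters
    'clf__C': [0.1, 1.0, 10.0],               # model parameters
}

kfold = KFold(n_splits=5, shuffle=True, random_state=1)
search = GridSearchCV(pipe, param_grid=param_grid, cv=kfold,
                      scoring='f1_macro', n_jobs=-1)
# search.fit(texts, labels)  # texts: raw documents, labels: class ids (placeholders)
```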
As of 10.19 the code is split into 4 parts:
- PreProcess: preprocessing
- MyModel: feature engineering + ML models
- Evaluate: model evaluation
- main
The four parts are as follows:
```python
from time import time

import numpy as np
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from MyModel import Model


class Evaluator(Model):
    def __init__(self, X, y):
        # super() follows the MRO, so Model.__init__ (and its bases) get called
        super(Evaluator, self).__init__(X, y)

    def cross_val(self, clf):
        # shuffle only randomizes the sample order before splitting;
        # to keep per-fold class ratios close to the full data, use StratifiedKFold instead
        kfold = KFold(n_splits=5, shuffle=True, random_state=1)
        scores = cross_val_score(clf, self.X, self.y, cv=kfold, scoring='f1_macro', n_jobs=-1)
        return scores.mean()

    def para_search_report(self, results, n_top=3):
        # Forgetting `self` here makes Python bind `results` to the instance,
        # which raises "object is not subscriptable" unless __getitem__ is defined.
        for i in range(1, n_top + 1):
            # indices of all candidates whose test-score rank equals i
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: %.3f (std: %.3f)" %
                      (results['mean_test_score'][candidate],
                       results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))

    def param_search(self, clf, param, rand_search=False):
        start = time()
        kfold = KFold(n_splits=5, shuffle=True, random_state=1)
        if rand_search:
            search = RandomizedSearchCV(clf, param_distributions=param, cv=kfold,
                                        n_iter=3, scoring='f1_macro', n_jobs=-1)
        else:
            search = GridSearchCV(clf, param_grid=param, cv=kfold, scoring='f1_macro', n_jobs=-1)
        search.fit(self.X, self.y)
        print('Parameter search took %.1fs' % (time() - start))
        return search

    def clf_report(self, clf):
        pred = clf.predict(self.X_train)
        rep_tra = metrics.classification_report(self.y_train, pred)
        pred = clf.predict(self.X_val)
        rep_val = metrics.classification_report(self.y_val, pred)
        return rep_tra, rep_val

    def lgb_clf_report(self, clf):
        # a native lightgbm Booster returns class probabilities,
        # so take the argmax as the predicted label
        pred_prob = clf.predict(self.X_train)
        pred = np.argmax(pred_prob, axis=1)
        rep_tra = metrics.classification_report(self.y_train, pred)
        pred_prob = clf.predict(self.X_val)
        pred = np.argmax(pred_prob, axis=1)
        rep_val = metrics.classification_report(self.y_val, pred)
        return rep_tra, rep_val
```
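For context, a rough sketch of how main might wire the pieces together. This is an assumption, not code from the project: it presumes `X, y` come out of the PreProcess step, that `Model` stores them plus train/validation splits as `self.X_train`, `self.X_val`, `self.y_train`, `self.y_val`, and that the Evaluate module is importable as `Evaluate`, none of which is shown in this section:

```python
from lightgbm import LGBMClassifier
from Evaluate import Evaluator

# X, y would come from the PreProcess step (placeholders here)
evaluator = Evaluator(X, y)

clf = LGBMClassifier()
print('5-fold macro-F1: %.3f' % evaluator.cross_val(clf))

# grid-search the sklearn-API model, then print the top-3 candidates
param = {'num_leaves': [31, 63], 'learning_rate': [0.05, 0.1]}
search = evaluator.param_search(clf, param)
evaluator.para_search_report(search.cv_results_)

# train/validation classification reports for the best estimator
rep_tra, rep_val = evaluator.clf_report(search.best_estimator_)
print(rep_tra)
print(rep_val)
```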