自己实现的 LR、SVM、XGBoost 算法,记录一下。
# -*- coding: utf-8 -*-
"""
project: Tag Embedding
author:
date:
"""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import svm
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, roc_auc_score, auc, plot_roc_curve
from sklearn.model_selection import KFold
import xgboost as xgb
# Project root: the absolute form of this file's path after stripping four
# trailing path components with dirname.
main_path = os.path.abspath(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
)
# Data directory and the two Excel workbooks read by the script entry point
# (the train/test source and the prediction candidates).
res_path = "/".join((main_path, "tag_similarity"))
tag_path = "/".join((res_path, "tag_expand_embedding.xls"))
tag_candidate = "/".join((res_path, "tag_sim_pred_candidate.xls"))
def plot_res(x, y, color=None, linewidth=None):
    """Draw ``y`` against ``x`` on the current matplotlib figure.

    The data is drawn as a scatter plot when ``linewidth`` is None and as a
    line of that width otherwise. The axis ticks are then fixed to steps of
    0.1 over [0, 0.5) on x and [0, 1) on y.

    Args:
        x: x coordinates.
        y: y coordinates.
        color: matplotlib color passed through to the drawing call.
        linewidth: line width; None selects the scatter plot.
    """
    draw_as_line = linewidth is not None
    if draw_as_line:
        plt.plot(x, y, color=color, linewidth=linewidth)
    else:
        plt.scatter(x, y, color=color)
    plt.xticks(np.arange(0, 0.5, 0.1))
    plt.yticks(np.arange(0, 1, 0.1))
class Normalization(object):
    """Column-wise feature scaling for a DataFrame.

    Every method returns a new DataFrame in which each column has been
    rescaled independently. Columns whose spread (value range or standard
    deviation) is zero are mapped to 0 instead of producing NaN from a
    0 / 0 division.
    """

    def __init__(self, df):
        """Store ``df`` with the literal string ``'\\N'`` replaced by 0.

        Args:
            df: DataFrame holding the columns to normalise.
        """
        # '\N' cells (presumably an exported NULL marker - confirm) count as 0.
        # NOTE(review): the second replace maps '-' to itself, so it changes
        # nothing as written; confirm which replacement was intended.
        self.df = df.replace('\\N', 0).replace('-', '-')

    @staticmethod
    def _scale(column, center, spread):
        """Return ``(column - center) / spread``, treating a zero spread as 1."""
        if spread == 0:
            # Constant column: every centred value is already 0, so divide by
            # 1 rather than by 0.
            spread = 1
        return (column - center) / spread

    def min_max_norm(self):
        """Rescale each column to ``(x - min) / (max - min)``."""
        return self.df.apply(
            lambda x: self._scale(x, np.min(x), np.max(x) - np.min(x)))

    def mean_norm(self):
        """Rescale each column to ``(x - mean) / (max - min)``."""
        return self.df.apply(
            lambda x: self._scale(x, np.mean(x), np.max(x) - np.min(x)))

    def z_score_norm(self):
        """Rescale each column to ``(x - mean) / std``."""
        return self.df.apply(
            lambda x: self._scale(x, np.mean(x), np.std(x)))
class ReadData(object):
    """Load an Excel workbook of samples and split it into train/test parts."""

    def __init__(self, read_path):
        """Remember the workbook path; nothing is read until requested."""
        self.read_path = read_path

    def read_origin_data(self):
        """Read the workbook, filter the rows and shuffle them.

        Rows are kept when ``coa >= 0.07`` or ``cos_sim >= 0.65``. The kept
        rows are shuffled (no fixed seed) and re-indexed from 0.

        Returns:
            The filtered, shuffled DataFrame.
        """
        frame = pd.read_excel(self.read_path)
        keep = (frame.coa >= 0.07) | (frame.cos_sim >= 0.65)
        shuffled = frame[keep].sample(frac=1)
        # A disabled earlier variant replaced cos_sim by 1 - cos_sim here.
        return shuffled.reset_index(drop=True)

    def read_train_test_data(self):
        """Split freshly read data into leading 2/3 and remaining rows.

        Returns:
            ``(train_sample, test_sample)``: the first ``(n // 3) * 2`` rows
            and the rest, in the shuffled order.
        """
        rows = self.read_origin_data()
        boundary = (len(rows) // 3) * 2
        return rows.iloc[:boundary], rows.iloc[boundary:]
class SimpleClassification(object):
    """Baseline experiments (plots, linear fit, SVM, XGBoost) on sample frames.

    The train and test frames are each divided into three groups, called
    positive / neutral / negative in this class, according to their
    ``old_flag`` and ``new_flag`` columns (see :meth:`sample_extract`).
    """

    # pandas ``query`` expressions that define the three sample groups.
    _POS_QUERY = "old_flag != 0 & old_flag != 1 & new_flag != 0 & new_flag != 1"
    _NEU_QUERY = "(old_flag == 1 & new_flag != 0) | (old_flag != 0 & new_flag == 1)"
    _NEG_QUERY = "old_flag == 0 | new_flag == 0"

    # Feature columns fed to the SVM and to the XGBoost models.
    _SVM_FEATURES = ['coa', 'euclid_dist', 'cos_sim', 'kl_div']
    _XGB_FEATURES = ['coa', 'euclid_dist', 'cos_sim', 'kl_div', 'clk_coa', 'clk_conf_1', 'clk_conf_2']

    def __init__(self, train_sample, test_sample):
        """Store the train and test DataFrames used by every experiment."""
        self.train_sample = train_sample
        self.test_sample = test_sample

    def sample_extract(self):
        """Split the train and test frames into the three sample groups.

        Returns:
            A 6-tuple ``(pos_train, neu_train, neg_train, pos_test, neu_test,
            neg_test)`` of DataFrames.
        """
        return tuple(
            frame.query(expr)
            for frame in (self.train_sample, self.test_sample)
            for expr in (self._POS_QUERY, self._NEU_QUERY, self._NEG_QUERY)
        )

    def sample_distribution(self):
        """Scatter-plot ``cos_sim`` against ``coa`` for each group and show it.

        Train and test rows of a group are pooled; positive is drawn in
        orange, neutral in blue and negative in green.
        """
        groups = self.sample_extract()
        for offset, color in enumerate(('orange', 'blue', 'green')):
            pooled = pd.concat([groups[offset], groups[offset + 3]])
            points = pooled[['coa', 'cos_sim']].values
            plot_res(points[:, 0], points[:, 1], color=color)
        plt.show()

    def linear_regression(self, test_number):
        """Fit ``cos_sim ~ coa`` on positive samples and plot the held-out fit.

        Args:
            test_number: number of trailing pooled positive rows held out for
                the plot; all rows before them are used for fitting.
        """
        groups = self.sample_extract()
        points = pd.concat([groups[0], groups[3]])[['coa', 'cos_sim']].values
        coa = points[:, 0].reshape(-1, 1)
        cos_sim = points[:, 1].reshape(-1, 1)
        train_end = test_number * (-1)
        regr = linear_model.LinearRegression()
        regr.fit(coa[:train_end], cos_sim[:train_end])
        regr_pred = regr.predict(coa[train_end:])
        # Held-out points in black, fitted line in blue.
        plot_res(coa[train_end:], cos_sim[train_end:], color='black')
        plot_res(coa[train_end:], regr_pred, color='blue', linewidth=3)
        plt.show()

    def svm_classification(self):
        """Train a 3-class SVM, draw per-class ROC curves and export predictions.

        Labels are 1 (positive), 0 (neutral) and -1 (negative). Training rows
        are weighted by their ``clk_conf`` column. The ROC axes are created
        but not shown; the caller is expected to call ``plt.show()``.

        Returns:
            The test DataFrame with added ``test_index`` (true label) and
            ``clf_pred`` (predicted label) columns; the same frame is written
            to ``test_res.xlsx``.
        """
        pos_train, neu_train, neg_train, pos_test, neu_test, neg_test = self.sample_extract()

        train_data = pd.concat([pos_train, neu_train, neg_train])
        train_sample_array = Normalization(train_data[self._SVM_FEATURES]).z_score_norm().values
        train_index = [1] * len(pos_train) + [0] * len(neu_train) + [-1] * len(neg_train)
        train_sample_weight = train_data['clk_conf'].values.tolist()

        test_data = pd.concat([pos_test, neu_test, neg_test])
        # NOTE(review): the test features are z-scored with their own
        # mean/std rather than the training statistics - confirm intended.
        test_sample_array = Normalization(test_data[self._SVM_FEATURES]).z_score_norm().values
        test_index = [1] * len(pos_test) + [0] * len(neu_test) + [-1] * len(neg_test)

        # 'ovr' makes decision_function return one column per class, which is
        # what the per-class ROC below indexes; the 'ovo' layout has one
        # column per class *pair* instead. An earlier, disabled variant used
        # hand-tuned per-class weights (x1.5 for -1, x0.9 for 0) instead of
        # 'balanced'.
        clf = svm.SVC(gamma='scale', decision_function_shape='ovr', class_weight='balanced')
        clf.fit(X=train_sample_array, y=train_index, sample_weight=train_sample_weight)
        clf_pred = clf.predict(test_sample_array)
        clf_dec = clf.decision_function(test_sample_array)

        # Plot the ROC curve of every class on its own subplot.
        _, axes = plt.subplots(2, 2, figsize=(8, 8))
        colors = ["r", "g", "b", "k"]
        markers = ["o", "^", "v", "+"]
        y_test = label_binarize(test_index, classes=clf.classes_)
        for i, class_label in enumerate(clf.classes_):
            # FPR / TPR of this class against the rest.
            fpr, tpr, _ = roc_curve(y_test[:, i], clf_dec[:, i])
            ax = axes[divmod(i, 2)]
            # Draw the curve and report its AUC in the legend.
            ax.plot(fpr, tpr, color=colors[i], marker=markers[i],
                    label="AUC: {:.2f}".format(auc(fpr, tpr)))
            ax.set_xlabel("FPR")
            ax.set_ylabel("TPR")
            ax.set_title("Class_{}".format(class_label))
            ax.legend(loc="lower right")
        print("AUC:", roc_auc_score(y_test, clf_dec, multi_class="ovo", average=None))

        # Export the predictions.
        test_data['test_index'] = test_index
        test_data['clf_pred'] = clf_pred
        test_data.to_excel('test_res.xlsx')
        return test_data

    def _xgb_features(self, frame):
        """Return the min-max normalised XGBoost feature matrix of ``frame``."""
        return Normalization(frame[self._XGB_FEATURES]).min_max_norm().values

    @staticmethod
    def _fit_booster(params, xgb_train):
        """Train for 200 rounds, then save and dump the model to disk.

        NOTE(review): the 3-class and the binary model are both written to
        'tag_sim_bst_model' / 'dump.raw.txt', so the later run overwrites the
        earlier one.
        """
        bst_model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=200,
                              evals=[(xgb_train, 'train')])
        bst_model.save_model('tag_sim_bst_model')
        bst_model.dump_model('dump.raw.txt')
        return bst_model

    @staticmethod
    def _print_metrics(test_index, predicted, score, pos_label):
        """Print accuracy of ``predicted`` and the ROC AUC of ``score``.

        ``score`` must be a continuous score for ``pos_label`` (a
        probability), not the thresholded label, so that the ROC curve has
        more than a single operating point.
        """
        hits = sum(1 for guess, truth in zip(predicted, test_index) if guess == truth)
        print("Accuracy: %.2f %% " % (100 * hits / len(test_index)))
        fpr, tpr, _ = roc_curve(y_true=np.array(test_index), y_score=score, pos_label=pos_label)
        print("AUC: %.2f" % auc(fpr, tpr))

    def xgb_classification_3(self):
        """Train a 3-class XGBoost model, report metrics and export predictions.

        Labels are 2 (positive), 1 (neutral) and 0 (negative). The AUC is
        computed for class 2 against the rest, from its predicted probability.

        Returns:
            The test DataFrame with added ``test_index``, ``xgb_pred_prob``
            and ``xgb_pred`` columns; the same frame is written to
            ``xgb_test_res.xlsx``.
        """
        pos_train, neu_train, neg_train, pos_test, neu_test, neg_test = self.sample_extract()

        train_data = pd.concat([pos_train, neu_train, neg_train])
        train_index = [2] * len(pos_train) + [1] * len(neu_train) + [0] * len(neg_train)
        # A disabled earlier variant also passed 'clk_conf_1' as row weights.
        xgb_train = xgb.DMatrix(self._xgb_features(train_data), label=np.array(train_index),
                                missing=-999.0)

        test_data = pd.concat([pos_test, neu_test, neg_test])
        test_index = [2] * len(pos_test) + [1] * len(neu_test) + [0] * len(neg_test)
        # NOTE(review): test features are min-max scaled with their own
        # min/max rather than the training ones - confirm intended.
        # Same missing-value sentinel as the training matrix.
        xgb_test = xgb.DMatrix(self._xgb_features(test_data), missing=-999.0)

        params = {
            'booster': 'gbtree',
            'objective': 'multi:softprob',
            'num_class': 3,
            'gamma': 0.1,
            'max_depth': 3,
            'lambda': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 2,
            'max_delta_step': 2,
            'alpha': 0.1,
            'eta': 0.1,
            'seed': 1000,
            'nthread': 2,
        }
        # Disabled tuning aid:
        # cv = KFold(n_splits=5, shuffle=True, random_state=100)
        # r = xgb.cv(params=params, dtrain=xgb_train, num_boost_round=300, folds=cv, metrics='mlogloss')
        bst_model = self._fit_booster(params, xgb_train)
        xgb_pred_prob = bst_model.predict(xgb_test)
        xgb_pred = [np.argmax(prob) for prob in xgb_pred_prob]

        # Accuracy over all classes; AUC from the class-2 probability column.
        self._print_metrics(test_index, xgb_pred, xgb_pred_prob[:, 2], pos_label=2)
        # Feature importance can be inspected with xgb.plot_importance(bst_model).

        # Export the predictions.
        test_data['test_index'] = test_index
        test_data['xgb_pred_prob'] = list(xgb_pred_prob)
        test_data['xgb_pred'] = xgb_pred
        test_data.to_excel('xgb_test_res.xlsx')
        return test_data

    def xgb_classification_2(self):
        """Train a binary XGBoost model, report metrics and export predictions.

        Positive rows are labelled 1 and negative rows 0. Neutral rows are
        included twice, once with label 1 and once with label 0, in both the
        train and the test set. Predictions are thresholded at 0.5; the AUC
        is computed from the predicted probability.

        Returns:
            The test DataFrame with added ``test_index`` and ``xgb_pred``
            columns; the same frame is written to ``xgb_test_res.xlsx``.
        """
        pos_train, neu_train, neg_train, pos_test, neu_test, neg_test = self.sample_extract()

        train_data = pd.concat([pos_train, neu_train, neu_train, neg_train])
        train_index = ([1] * len(pos_train) + [1] * len(neu_train)
                       + [0] * len(neu_train) + [0] * len(neg_train))
        # A disabled earlier variant also passed 'clk_conf_1' as row weights.
        xgb_train = xgb.DMatrix(self._xgb_features(train_data), label=np.array(train_index),
                                missing=-999.0)

        test_data = pd.concat([pos_test, neu_test, neu_test, neg_test])
        test_index = ([1] * len(pos_test) + [1] * len(neu_test)
                      + [0] * len(neu_test) + [0] * len(neg_test))
        # NOTE(review): test features are min-max scaled with their own
        # min/max rather than the training ones - confirm intended.
        # Same missing-value sentinel as the training matrix.
        xgb_test = xgb.DMatrix(self._xgb_features(test_data), missing=-999.0)

        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            # 'num_class': 2,
            'gamma': 0.1,
            'max_depth': 4,
            'lambda': 2,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 3,
            'max_delta_step': 2,
            'alpha': 0.005,
            'eta': 0.1,
            'seed': 1000,
            'nthread': 2,
            # 'scale_pos_weight': len(neg_train) / (len(pos_train) + len(neu_train)),
        }
        # Disabled tuning aid:
        # cv = KFold(n_splits=5, shuffle=True, random_state=100)
        # r = xgb.cv(params=params, dtrain=xgb_train, num_boost_round=200, folds=cv, metrics='mlogloss')
        bst_model = self._fit_booster(params, xgb_train)
        xgb_pred_prob = bst_model.predict(xgb_test)
        xgb_pred = np.where(xgb_pred_prob >= 0.5, 1, 0)

        # Accuracy of the thresholded labels; AUC from the raw probability.
        self._print_metrics(test_index, xgb_pred, xgb_pred_prob, pos_label=1)
        # Feature importance can be inspected with xgb.plot_importance(bst_model).

        # Export the predictions.
        test_data['test_index'] = test_index
        test_data['xgb_pred'] = xgb_pred
        test_data.to_excel('xgb_test_res.xlsx')
        return test_data
class PredictTagSimilarity(object):
    """Score candidate rows with a previously saved XGBoost model."""

    # Feature columns read from the candidate frame.
    _FEATURES = ['coa', 'euclid_dist', 'cos_sim', 'kl_div', 'clk_coa', 'clk_conf_1', 'clk_conf_2']

    def __init__(self, model, pred_candidate):
        """Store the model location and the candidate DataFrame.

        Args:
            model: value passed to ``Booster.load_model`` (the script entry
                point passes a saved-model file name).
            pred_candidate: DataFrame containing at least the feature columns.
        """
        self.model = model
        self.pred_cand = pred_candidate

    def _predict_prob(self):
        """Load the saved booster and return its raw output for the candidates."""
        # NOTE(review): features are min-max scaled with the candidates' own
        # min/max, not the statistics of the training data - confirm intended.
        features = Normalization(self.pred_cand[self._FEATURES]).min_max_norm().values
        pred_data_xgb = xgb.DMatrix(features, missing=-999.0)
        bst = xgb.Booster({'nthread': 2})  # empty booster to load into
        bst.load_model(self.model)
        return bst.predict(pred_data_xgb)

    def xgb_pred_3(self):
        """Predict with a 3-class model and export the result.

        Returns:
            A copy of the candidate frame with added ``bst_pred_prob``
            (per-row probability vector), ``bst_pred_pro0`` /
            ``bst_pred_pro1`` / ``bst_pred_pro2`` (per-class probability) and
            ``bst_pred`` (arg-max class) columns; the same frame is written
            to ``xgb_pred_res.xlsx``.
        """
        bst_pred_prob = self._predict_prob()
        # Work on a copy so the caller's frame is left untouched and repeated
        # calls do not inherit columns from an earlier prediction.
        res_df = self.pred_cand.copy()
        res_df['bst_pred_prob'] = list(bst_pred_prob)
        for class_id in range(3):
            res_df['bst_pred_pro%d' % class_id] = [prob[class_id] for prob in bst_pred_prob]
        res_df['bst_pred'] = [np.argmax(prob) for prob in bst_pred_prob]
        # Export the predictions.
        res_df.to_excel('xgb_pred_res.xlsx')
        return res_df

    def xgb_pred_2(self):
        """Predict with a binary model and export the result.

        Returns:
            A copy of the candidate frame with added ``bst_pred_prob``
            (model output) and ``bst_pred`` (1 when the output is >= 0.5,
            else 0) columns; the same frame is written to
            ``xgb_pred_res.xlsx``.
        """
        bst_pred_prob = self._predict_prob()
        bst_pred = np.where(bst_pred_prob >= 0.5, 1, 0)
        # Work on a copy so the caller's frame is left untouched and repeated
        # calls do not inherit columns from an earlier prediction.
        res_df = self.pred_cand.copy()
        res_df['bst_pred_prob'] = list(bst_pred_prob)
        res_df['bst_pred'] = bst_pred
        # Export the predictions.
        res_df.to_excel('xgb_pred_res.xlsx')
        return res_df
if __name__ == "__main__":
    # Read and shuffle the workbook exactly once: every read reshuffles the
    # rows, so taking the train and test parts from two separate reads would
    # let the same row land in both sets.
    train_datas, test_datas = ReadData(tag_path).read_train_test_data()
    # print(test_datas)
    # SimpleClassification(train_datas, test_datas).sample_distribution()
    # SimpleClassification(train_datas, test_datas).linear_regression(test_number=40)
    # SimpleClassification(train_datas, test_datas).svm_classification()
    # plt.show()
    # SimpleClassification(train_datas, test_datas).xgb_classification_3()
    # Score the candidate workbook with the model saved by a previous
    # training run.
    pred_datas = pd.read_excel(tag_candidate)
    PredictTagSimilarity(pred_candidate=pred_datas, model="tag_sim_bst_model").xgb_pred_3()