Implementing several common machine learning algorithms in Python

LR, SVM, and XGBoost models (built with scikit-learn and XGBoost) for a tag-similarity task, written up here for future reference.

# -*- coding: utf-8 -*-

"""
project: Tag Embedding
author:
date:
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn import svm
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.model_selection import KFold
import xgboost as xgb

main_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))))
res_path = main_path + "/tag_similarity"
tag_path = res_path + "/tag_expand_embedding.xls"
tag_candidate = res_path + "/tag_sim_pred_candidate.xls"


def plot_res(x, y, color=None, linewidth=None):
    if linewidth is None:
        plt.scatter(x, y, color=color)
    else:
        plt.plot(x, y, color=color, linewidth=linewidth)

    plt.xticks(np.arange(0, 0.5, 0.1))
    plt.yticks(np.arange(0, 1, 0.1))


class Normalization(object):

    def __init__(self, df):
        # '\N' marks missing values in the exported source files; map them to 0
        # so every column stays numeric for the normalizations below.
        self.df = df.replace('\\N', 0)

    def min_max_norm(self):
        norm_df = self.df.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
        return norm_df

    def mean_norm(self):
        norm_df = self.df.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))
        return norm_df

    def z_score_norm(self):
        norm_df = self.df.apply(lambda x: (x - np.mean(x)) / np.std(x))
        return norm_df
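
    # Sanity check for the normalizations above, on a hypothetical toy frame
    # (not part of the tag-similarity data): min-max maps each column onto
    # [0, 1]; z-score standardizes to zero mean and unit (population) std.
    # >>> _toy = pd.DataFrame({'a': [1.0, 2.0, 3.0]})
    # >>> Normalization(_toy).min_max_norm()['a'].tolist()
    # [0.0, 0.5, 1.0]
    # >>> Normalization(_toy).z_score_norm()['a'].round(4).tolist()
    # [-1.2247, 0.0, 1.2247]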


class ReadData(object):

    def __init__(self, read_path):
        self.read_path = read_path

    def read_origin_data(self):
        origin_data = pd.read_excel(self.read_path)
        sample = origin_data[(origin_data.coa >= 0.07) | (origin_data.cos_sim >= 0.65)].sample(frac=1).reset_index(
            drop=True)
        # sample['cos_sim'] = sample['cos_sim'].apply(lambda x: 1-x)
        return sample

    def read_train_test_data(self):
        sample = self.read_origin_data()
        train_sample = sample[:(len(sample) // 3) * 2]
        test_sample = sample[(len(sample) // 3) * 2:]
        return train_sample, test_sample
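
    # Usage sketch, assuming tag_path points at a valid export: one shuffle,
    # then a fixed 2/3 train / 1/3 test cut.
    # >>> train_df, test_df = ReadData(tag_path).read_train_test_data()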


class SimpleClassification(object):

    def __init__(self, train_sample, test_sample):
        self.train_sample = train_sample
        self.test_sample = test_sample

    def sample_extract(self):
        pos_train = self.train_sample.query("old_flag != 0 & old_flag != 1 & new_flag != 0 & new_flag != 1")
        neu_train = self.train_sample.query("(old_flag == 1 & new_flag != 0) | (old_flag != 0 & new_flag == 1)")
        neg_train = self.train_sample.query("old_flag == 0 | new_flag == 0")
        pos_test = self.test_sample.query("old_flag != 0 & old_flag != 1 & new_flag != 0 & new_flag != 1")
        neu_test = self.test_sample.query("(old_flag == 1 & new_flag != 0) | (old_flag != 0 & new_flag == 1)")
        neg_test = self.test_sample.query("old_flag == 0 | new_flag == 0")
        return pos_train, neu_train, neg_train, pos_test, neu_test, neg_test
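
    # Label scheme implied by the queries above: a pair is positive when both
    # old_flag and new_flag lie outside {0, 1}, neutral when one flag is 1 and
    # the other is non-zero, and negative when either flag is 0.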

    def sample_distribution(self):
        pos_train, neu_train, neg_train, pos_test, neu_test, neg_test = self.sample_extract()

        # Scatter each class in the (coa, cos_sim) plane:
        # positive in orange, neutral in blue, negative in green.
        pos_sample_array = pd.concat([pos_train, pos_test])[['coa', 'cos_sim']].values
        plot_res(pos_sample_array[:, 0], pos_sample_array[:, 1], color='orange')

        neu_sample_array = pd.concat([neu_train, neu_test])[['coa', 'cos_sim']].values
        plot_res(neu_sample_array[:, 0], neu_sample_array[:, 1], color='blue')

        neg_sample_array = pd.concat([neg_train, neg_test])[['coa', 'cos_sim']].values
        plot_res(neg_sample_array[:, 0], neg_sample_array[:, 1], color='green')
        plt.show()

    def linear_regression(self, test_number):
        pos_train, _, _, pos_test, _, _ = self.sample_extract()
        pos_sample_array = pd.concat([pos_train, pos_test])[['coa', 'cos_sim']].values
        # Fit cos_sim as a linear function of coa on all but the last
        # `test_number` rows, then plot the held-out points against the fit.
        regr = linear_model.LinearRegression()
        train_end = -test_number
        regr.fit(pos_sample_array[:train_end, 0].reshape(-1, 1), pos_sample_array[:train_end, 1].reshape(-1, 1))
        regr_pred = regr.predict(pos_sample_array[train_end:, 0].reshape(-1, 1))
        plot_res(pos_sample_array[train_end:, 0].reshape(-1, 1), pos_sample_array[train_end:, 1].reshape(-1, 1),
                 color='black')
        plot_res(pos_sample_array[train_end:, 0].reshape(-1, 1), regr_pred, color='blue', linewidth=3)
        plt.show()

    def svm_classification(self):
        pos_train, neu_train, neg_train, pos_test, neu_test, neg_test = self.sample_extract()

        train_data = pd.concat([pos_train, neu_train, neg_train])
        train_sample = train_data[['coa', 'euclid_dist', 'cos_sim', 'kl_div']]
        train_sample_array = Normalization(train_sample).z_score_norm().values
        train_index = [1] * len(pos_train) + [0] * len(neu_train) + [-1] * len(neg_train)
        train_sample_weight = train_data['clk_conf'].values.tolist()

        test_data = pd.concat([pos_test, neu_test, neg_test])
        test_sample = test_data[['coa', 'euclid_dist', 'cos_sim', 'kl_div']]
        test_sample_array = Normalization(test_sample).z_score_norm().values
        test_index = [1] * len(pos_test) + [0] * len(neu_test) + [-1] * len(neg_test)

        # clf = svm.SVC(gamma='scale', decision_function_shape='ovo',
        #               class_weight={
        #                   -1: len(train_sample) / (len(set(train_index)) * len(self.sample_extract()[2])) * 1.5,
        #                   0: len(train_sample) / (len(set(train_index)) * len(self.sample_extract()[1])) * 0.9,
        #                   1: len(train_sample) / (len(set(train_index)) * len(self.sample_extract()[0]))
        #               }
        # )
        # class_weight='balanced' reweights each class by
        # n_samples / (n_classes * class_count), essentially what the
        # commented-out block above computes by hand (minus the manual scaling).
        # decision_function_shape='ovr' yields one score column per class,
        # which is what the per-class ROC curves below expect; 'ovo' would
        # return pairwise-comparison columns instead.
        clf = svm.SVC(gamma='scale', decision_function_shape='ovr', class_weight='balanced')
        clf.fit(X=train_sample_array, y=train_index, sample_weight=train_sample_weight)
        clf_pred = clf.predict(test_sample_array)
        clf_dec = clf.decision_function(test_sample_array)

        # Plot a one-vs-rest ROC curve for each class.
        fig, axes = plt.subplots(2, 2, figsize=(8, 8))
        colors = ["r", "g", "b", "k"]
        markers = ["o", "^", "v", "+"]

        y_test = label_binarize(test_index, classes=clf.classes_)
        for i in range(len(clf.classes_)):
            # FPR/TPR for class i against the rest.
            fpr, tpr, thr = roc_curve(y_test[:, i], clf_dec[:, i])
            # Draw the ROC curve and report its AUC.
            axes[i // 2, i % 2].plot(fpr, tpr, color=colors[i], marker=markers[i],
                                     label="AUC: {:.2f}".format(auc(fpr, tpr)))
            axes[i // 2, i % 2].set_xlabel("FPR")
            axes[i // 2, i % 2].set_ylabel("TPR")
            axes[i // 2, i % 2].set_title("Class_{}".format(clf.classes_[i]))
            axes[i // 2, i % 2].legend(loc="lower right")

        # With binarized labels this computes one AUC per class (one-vs-rest).
        print("AUC:", roc_auc_score(y_test, clf_dec, average=None))

        # Write the predictions out alongside the test rows.
        test_res_df = test_data.copy()
        test_res_df['test_index'] = test_index
        test_res_df['clf_pred'] = clf_pred
        test_res_df.to_excel('test_res.xlsx')

        return test_res_df

    def xgb_classification_3(self):
        pos_train, neu_train, neg_train, pos_test, neu_test, neg_test = self.sample_extract()

        train_data = pd.concat([pos_train, neu_train, neg_train])
        train_sample = train_data[['coa', 'euclid_dist', 'cos_sim', 'kl_div', 'clk_coa', 'clk_conf_1', 'clk_conf_2']]
        train_sample_array = Normalization(train_sample).min_max_norm().values
        train_index = [2] * len(pos_train) + [1] * len(neu_train) + [0] * len(neg_train)
        # train_sample_weight = train_data['clk_conf_1'].replace('\\N', 0).values.tolist()
        # xgb_train = xgb.DMatrix(train_sample_array, label=np.array(train_index), weight=np.array(train_sample_weight), missing=-1.0)
        xgb_train = xgb.DMatrix(train_sample_array, label=np.array(train_index), missing=-999.0)

        test_data = pd.concat([pos_test, neu_test, neg_test])
        test_sample = test_data[['coa', 'euclid_dist', 'cos_sim', 'kl_div', 'clk_coa', 'clk_conf_1', 'clk_conf_2']]
        test_sample_array = Normalization(test_sample).min_max_norm().values
        test_index = [2] * len(pos_test) + [1] * len(neu_test) + [0] * len(neg_test)
        xgb_test = xgb.DMatrix(test_sample_array)

        params = {
            'booster': 'gbtree',
            'objective': 'multi:softprob',
            'num_class': 3,
            'gamma': 0.1,
            'max_depth': 3,
            'lambda': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 2,
            'max_delta_step': 2,
            'alpha': 0.1,
            'eta': 0.1,
            'seed': 1000,
            'nthread': 2,
        }
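        # In the params above, 'multi:softprob' returns a 3-way probability
        # vector per sample; 'eta' is the learning rate, 'gamma' the minimum
        # loss reduction to make a split, 'lambda'/'alpha' L2/L1 regularization.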

        # cv = KFold(n_splits=5, shuffle=True, random_state=100)
        # r = xgb.cv(params=params, dtrain=xgb_train, num_boost_round=300, folds=cv, metrics='mlogloss')
        # print(r)
        evallist = [(xgb_train, 'train')]
        bst_model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=200, evals=evallist)

        xgb_pred_prob = bst_model.predict(xgb_test)  # shape (n_samples, 3)
        xgb_pred = np.argmax(xgb_pred_prob, axis=1)
        bst_model.save_model('tag_sim_bst_model')
        bst_model.dump_model('dump.raw.txt')
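        # The 'tag_sim_bst_model' file saved above is what PredictTagSimilarity
        # loads again at prediction time (see __main__ below).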

        # Accuracy on the held-out split.
        correct = sum(int(p == t) for p, t in zip(xgb_pred, test_index))
        print("Accuracy: %.2f %%" % (100 * correct / len(test_index)))

        # Plot feature importances
        # xgb.plot_importance(bst_model)
        # plt.show()

        # ROC for class 2 ("positive") one-vs-rest, scored with the predicted
        # probability of class 2 rather than the hard argmax label.
        fpr, tpr, thresholds = roc_curve(y_true=np.array(test_index), y_score=xgb_pred_prob[:, 2], pos_label=2)
        roc_auc = auc(fpr, tpr)
        print("AUC: %.2f" % roc_auc)
        # plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.6f)' % roc_auc)

        # Write the predictions out alongside the test rows.
        test_res_df = test_data.copy()
        test_res_df['test_index'] = test_index
        test_res_df['xgb_pred_prob'] = list(xgb_pred_prob)
        test_res_df['xgb_pred'] = xgb_pred
        test_res_df.to_excel('xgb_test_res.xlsx')
        return test_res_df

    def xgb_classification_2(self):
        pos_train, neu_train, neg_train, pos_test, neu_test, neg_test = self.sample_extract()

        # Neutral samples are included twice, once labeled 1 and once labeled 0,
        # presumably so the model treats them as ambiguous rather than
        # committing them to either class.
        train_data = pd.concat([pos_train, neu_train, neu_train, neg_train])
        train_sample = train_data[['coa', 'euclid_dist', 'cos_sim', 'kl_div', 'clk_coa', 'clk_conf_1', 'clk_conf_2']]
        train_sample_array = Normalization(train_sample).min_max_norm().values
        train_index = [1] * len(pos_train) + [1] * len(neu_train) + [0] * len(neu_train) + [0] * len(neg_train)
        train_sample_weight = train_data['clk_conf_1'].values.tolist()
        # xgb_train = xgb.DMatrix(train_sample_array, label=np.array(train_index), weight=np.array(train_sample_weight))
        xgb_train = xgb.DMatrix(train_sample_array, label=np.array(train_index), missing=-999.0)

        test_data = pd.concat([pos_test, neu_test, neu_test, neg_test])
        test_sample = test_data[['coa', 'euclid_dist', 'cos_sim', 'kl_div', 'clk_coa', 'clk_conf_1', 'clk_conf_2']]
        test_sample_array = Normalization(test_sample).min_max_norm().values
        test_index = [1] * len(pos_test) + [1] * len(neu_test) + [0] * len(neu_test) + [0] * len(neg_test)
        xgb_test = xgb.DMatrix(test_sample_array)

        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            # 'num_class': 2,
            'gamma': 0.1,
            'max_depth': 4,
            'lambda': 2,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 3,
            'max_delta_step': 2,
            'alpha': 0.005,
            'eta': 0.1,
            'seed': 1000,
            'nthread': 2,
            # 'scale_pos_weight': (len(self.sample_extract()[2])) / (len(self.sample_extract()[0]) + len(self.sample_extract()[1])),
        }
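        # With 'binary:logistic', predict() returns P(y=1) directly, hence no
        # 'num_class' and the 0.5 threshold below; the commented-out
        # 'scale_pos_weight' would be the knob for class imbalance.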

        # cv = KFold(n_splits=5, shuffle=True, random_state=100)
        # r = xgb.cv(params=params, dtrain=xgb_train, num_boost_round=200, folds=cv, metrics='mlogloss')
        # print(r)
        evallist = [(xgb_train, 'train')]
        bst_model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=200, evals=evallist)

        xgb_pred_prob = bst_model.predict(xgb_test)
        xgb_pred = np.where(xgb_pred_prob >= 0.5, 1, 0)
        bst_model.save_model('tag_sim_bst_model')
        bst_model.dump_model('dump.raw.txt')

        # Accuracy on the held-out split.
        correct = sum(int(p == t) for p, t in zip(xgb_pred, test_index))
        print("Accuracy: %.2f %%" % (100 * correct / len(test_index)))

        # Plot feature importances
        # xgb.plot_importance(bst_model)
        # plt.show()

        # ROC scored with the predicted probability rather than the hard label,
        # so the curve has more than one operating point.
        fpr, tpr, thresholds = roc_curve(y_true=np.array(test_index), y_score=xgb_pred_prob, pos_label=1)
        roc_auc = auc(fpr, tpr)
        print("AUC: %.2f" % roc_auc)
        # plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.6f)' % roc_auc)

        # Write the predictions out alongside the test rows.
        test_res_df = test_data.copy()
        test_res_df['test_index'] = test_index
        test_res_df['xgb_pred'] = xgb_pred
        test_res_df.to_excel('xgb_test_res.xlsx')
        return test_res_df


class PredictTagSimilarity(object):

    def __init__(self, model, pred_candidate):
        self.model = model
        self.pred_cand = pred_candidate

    def xgb_pred_3(self):
        pred_data = self.pred_cand[['coa', 'euclid_dist', 'cos_sim', 'kl_div', 'clk_coa', 'clk_conf_1', 'clk_conf_2']]
        pred_data_norm = Normalization(pred_data).min_max_norm()
        pred_data_norm_array = pred_data_norm.values
        pred_data_xgb = xgb.DMatrix(pred_data_norm_array, missing=-999.0)

        bst = xgb.Booster({'nthread': 2})  # init booster
        bst.load_model(self.model)  # load the trained model from disk
        bst_pred_prob = bst.predict(pred_data_xgb)  # shape (n_samples, 3)
        bst_pred = np.argmax(bst_pred_prob, axis=1)

        # Write the per-class probabilities and the argmax label out.
        res_df = self.pred_cand.copy()
        res_df['bst_pred_prob'] = list(bst_pred_prob)
        res_df['bst_pred_pro0'] = bst_pred_prob[:, 0]
        res_df['bst_pred_pro1'] = bst_pred_prob[:, 1]
        res_df['bst_pred_pro2'] = bst_pred_prob[:, 2]
        res_df['bst_pred'] = bst_pred
        res_df.to_excel('xgb_pred_res.xlsx')
        return res_df

    def xgb_pred_2(self):
        pred_data = self.pred_cand[['coa', 'euclid_dist', 'cos_sim', 'kl_div', 'clk_coa', 'clk_conf_1', 'clk_conf_2']]
        pred_data_norm = Normalization(pred_data).min_max_norm()
        pred_data_norm_array = pred_data_norm.values
        pred_data_xgb = xgb.DMatrix(pred_data_norm_array, missing=-999.0)

        bst = xgb.Booster({'nthread': 2})  # init booster
        bst.load_model(self.model)  # load the trained model from disk
        bst_pred_prob = bst.predict(pred_data_xgb)
        bst_pred = np.where(bst_pred_prob >= 0.5, 1, 0)

        # Write the probability and the thresholded label out.
        res_df = self.pred_cand.copy()
        res_df['bst_pred_prob'] = list(bst_pred_prob)
        res_df['bst_pred'] = bst_pred
        res_df.to_excel('xgb_pred_res.xlsx')
        return res_df


if __name__ == "__main__":
    # read_origin_data() reshuffles on every call, so read and split exactly
    # once; two separate read_train_test_data() calls would draw train and
    # test from two different shuffles and could overlap.
    train_datas, test_datas = ReadData(tag_path).read_train_test_data()

    # SimpleClassification(train_datas, test_datas).sample_distribution()

    # SimpleClassification(train_datas, test_datas).linear_regression(test_number=40)

    # SimpleClassification(train_datas, test_datas).svm_classification()
    # plt.show()

    # SimpleClassification(train_datas, test_datas).xgb_classification_3()

    pred_datas = pd.read_excel(tag_candidate)
    PredictTagSimilarity(pred_candidate=pred_datas, model="tag_sim_bst_model").xgb_pred_3()
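
    # Note: xgb_pred_3 loads 'tag_sim_bst_model' from disk, so
    # xgb_classification_3() (commented out above) must have been run at least
    # once beforehand to produce that file.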
