ByteDance Security AI Challenge: Preliminary Round Summary

Preliminary round problem
https://security.bytedance.com/fe/ai-challenge#/project?id=1&active=1
Fields:
(screenshot of the dataset fields omitted; the columns are request_id, product_id, product_1st_category, product_2nd_category, product_3rd_category, user_id, post_province, post_city, post_town, post_detail, label)
The features I extracted are mainly occurrence counts of tuples such as (user_id, product_id) and (user_id, product_1st_category), plus the total number of times each product_id and each user_id appears in the table. I did not find a good way to extract features from the address fields. The model is LightGBM trained with class weights; the final accuracy was about 80%, and the best score I reached under the official scoring formula was 0.94. A rough sketch of the count features is shown below.
(score screenshot omitted)
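A minimal sketch of these count features, assuming the CSV loads into a pandas DataFrame with the column names listed above (the derived column names such as user_product_count are illustrative only; the actual code below builds the same counts with plain Python dicts and pickles them between passes):

import pandas as pd

df = pd.read_csv('train_ori.csv', encoding='gbk')
# how often this exact (user_id, product_id) pair occurs in the whole table
df['user_product_count'] = df.groupby(['user_id', 'product_id'])['request_id'].transform('count')
# how often this (user_id, product_1st_category) pair occurs
df['user_cat1_count'] = df.groupby(['user_id', 'product_1st_category'])['request_id'].transform('count')
# global frequency of the product and of the user
df['product_count'] = df.groupby('product_id')['request_id'].transform('count')
df['user_count'] = df.groupby('user_id')['request_id'].transform('count')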

# -*- coding: utf-8 -*-
from lightgbm import LGBMRegressor
import random
import tensorflow as tf
import codecs
from tensorflow.keras import models, layers
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate, \
    StratifiedKFold
from tpot import TPOTClassifier
from xgboost import XGBClassifier

from catboost import CatBoostClassifier

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, log_loss
import pickle
import joblib
import os
import time
from sklearn.decomposition import KernelPCA, LatentDirichletAllocation
from sklearn.utils import compute_sample_weight, compute_class_weight
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import OneHotEncoder
import csv
from tqdm import tqdm

from sklearn.ensemble import IsolationForest


def write_to_pickle(dictdata, filename):
    # Save an object to <filename>.pkl
    pick_file = open(filename + '.pkl', 'wb')
    pickle.dump(dictdata, pick_file)
    pick_file.close()


def random_predict(prob_num):
    # Given a probability in [0, 1], return True with that probability
    random_result = random.random()
    if random_result <= prob_num:
        return True
    else:
        return False


def load_picklefile(filename):
    # Load an object from a pickle file
    pickfile = open(filename, 'rb')
    listfile = pickle.load(pickfile)
    return listfile


def read_csv(filename="A_test_data.csv"):
    # Read a GBK-encoded CSV file into a list of rows
    full_data = []
    with open(filename, 'r', encoding='gbk') as f:
        reader = csv.reader(f)
        for row in reader:
            full_data.append(row)
    return full_data[1:]  # drop the header row


def pad_numlist(numlist=[1, 2], goal_length=30):
    # Pad or truncate a numeric list to goal_length
    current_length = len(numlist)
    if current_length >= goal_length:
        return numlist[:goal_length]
    else:
        return numlist + [0] * (goal_length - current_length)


def strlist2numlist(strlist):
    # Convert a list of numeric strings like ['10', '23'] into a list of ints like [10, 23]
    numlist = []
    for each in strlist:
        numlist.append(int(each))
    return numlist


def update_count_dict(dictcollect={}, dictname='name', new_data=('1', '2')):
    # Increment the count of new_data in the sub-dict named dictname and return the new count
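    # e.g. update_count_dict(dict_collect, 'product_id', ('user1', 'item1')) returns 1 on the
    # first call and 2 on the next identical call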
    dict2update = dictcollect.get(dictname)
    if new_data in dict2update.keys():
        count = dict2update.get(new_data)
        dict2update.update({new_data: count + 1})
        return count + 1
    else:
        dict2update.update({new_data: 1})
        return 1


def init_dict_collect():
    # Initialize the collection of counting dicts, one per statistic name
    count_dict_name = ['product_id', 'product_1st_category', 'product_2nd_category', 'product_3rd_category',
                       'user_appear_time', 'product_appear_time', 'word_gram', 'user_product_kind', 'word_with_label0',
                       'word_with_label1']
    dict_collect = {}
    for name in count_dict_name:
        dict_collect.update({name: {}})
    return dict_collect


def save_dict_collect(dictcollect={}, type='train'):
    # Save every counting dict in dictcollect to its own pickle file, suffixed with the dataset type
    for each_key in dictcollect.keys():
        each_dict = dictcollect.get(each_key)
        write_to_pickle(each_dict, each_key + "_" + type)


def turn_num2word_gram(numlist, dict_collect):
    # Count adjacent-token 2-grams of a field into the 'word_gram' dict
    for count in range(0, len(numlist) - 1):
        update_count_dict(dict_collect, 'word_gram', ((numlist[count], numlist[count + 1])))


def turn_list2dict(listdata):
    # Convert a sorted list of [key, value] pairs back into a dict
    # Input looks like [["url", 2], ["sfstat", 3], ["jobimg", 5]]
    result_dict = {}
    for each_data in listdata:
        result_dict.update({each_data[0]: each_data[1]})
    return result_dict


def sort_wordgram(dict_collect):
    # Drop 2-grams that appear no more than twice and remap the survivors to integer ids
    word_dict = dict_collect.get('word_gram')
    word_dict_sorted = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    final_word_dict = {}
    for each_word in word_dict_sorted:
        count = each_word[1]
        if count > 2:
            final_word_dict.update({each_word[0]: len(final_word_dict) + 1})
    dict_collect.update({'word_gram': final_word_dict})


def update_kind_dict(dict_collect, user_id, kind_id):
    # Track the set of distinct products each user has bought
    user_product_kind_dict = dict_collect.get('user_product_kind')
    user_kind_set = user_product_kind_dict.get(user_id)
    if user_kind_set is None:
        user_product_kind_dict.update({user_id: {kind_id}})
    else:
        user_kind_set.add(kind_id)
        user_product_kind_dict.update({user_id: user_kind_set})


def update_word_with_label_dict(dict_collect, dictname, word_list):
    # Count, per label, how often each address token appears
    word_with_label_dict = dict_collect.get(dictname)
    for each_word in word_list:
        if each_word in word_with_label_dict.keys():
            count = word_with_label_dict.get(each_word)
            word_with_label_dict.update({each_word: count + 1})
        else:
            word_with_label_dict.update({each_word: 1})


def handle_csv_data(csv_data=[], dict_collect=init_dict_collect(), type='train+test'):
    # First pass over the CSV rows: build the global counting dicts and the
    # request_id -> label mapping, then pickle everything for the feature pass
    count = 0
    request_label_dict = {}
    for each_row in csv_data:
        request_id = each_row[0]
        product_id = each_row[1]
        update_count_dict(dict_collect, 'product_appear_time', product_id)
        product_1st_category = each_row[2]
        product_2nd_category = each_row[3]
        product_3rd_category = each_row[4]
        user_id = each_row[5]
        label = str(each_row[10])
        update_count_dict(dict_collect, 'user_appear_time', user_id)
        update_count_dict(dict_collect, 'product_id', (user_id, product_id))
        update_count_dict(dict_collect, 'product_1st_category', (user_id, product_1st_category))
        update_count_dict(dict_collect, 'product_2nd_category', (user_id, product_2nd_category))
        update_count_dict(dict_collect, 'product_3rd_category', (user_id, product_3rd_category))
        update_kind_dict(dict_collect, user_id, product_id)
        word_with_label_dict_name = 'word_with_label' + label
        post_province = each_row[6].split(' ')
        turn_num2word_gram(post_province, dict_collect)

        post_city = each_row[7].split(' ')
        turn_num2word_gram(post_city, dict_collect)

        post_town = each_row[8].split(' ')
        turn_num2word_gram(post_town, dict_collect)

        post_detail = each_row[9].split(' ')
        update_word_with_label_dict(dict_collect, word_with_label_dict_name, post_province+post_city+post_town+post_detail)
        turn_num2word_gram(post_detail, dict_collect)
        count = count + 1
        if count % 1000 == 0:
            print("count:", count)
        request_label_dict.update({int(request_id):int(label)})
    sort_wordgram(dict_collect)
    save_dict_collect(dict_collect, type)
    write_to_pickle(request_label_dict,'request_label_dict')


def turn_wordgram2feature(numlist, word_gram_dict):
    # Map a token list to 2-gram ids; unseen 2-grams are mapped to dict_length
    word_feature = []
    dict_length = len(word_gram_dict)
    for count in range(0, len(numlist) - 1):
        if (numlist[count], numlist[count + 1]) in word_gram_dict.keys():
            word_feature.append(word_gram_dict.get((numlist[count], numlist[count + 1])))
        else:
            word_feature.append(dict_length)
    return word_feature


def generate_word_with_laebl_feature(post_detail, word_with_laebl0_dict, word_with_laebl1_dict):
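    # For each address token, compute the ratio (count under label 1) / (count under label 0);
    # 0.1 stands in for tokens unseen under a label so the ratio stays defined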
    feature_list = []
    for each_word in post_detail:
        if each_word in word_with_laebl0_dict:
            count0 = word_with_laebl0_dict.get(each_word)
        else:
            count0 = 0.1
        if each_word in word_with_laebl1_dict:
            count1 = word_with_laebl1_dict.get(each_word)
        else:
            count1 = 0.1
        feature_list.append(round(count1 / count0, 2))
    return feature_list


def generate_feature(csvdata=[], type='train'):
    # Second pass: turn each CSV row into a fixed-length numeric feature vector
    # using the counting dicts built by handle_csv_data
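    # Each feature row is, in order: request_id, product/user frequencies and the user's
    # distinct-product count; the raw product/category ids; the four (user_id, product/category)
    # pair counts; length/sum/mean of post_detail; 55 padded label-ratio values for the address
    # tokens; padded 2-gram ids for province (6), city (6) and town (10); 40 padded post_detail
    # tokens; and six running "seen so far" counts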
    dict_collect = init_dict_collect()
    label_collect = []
    feature_collect = []
    request_id_collect = []
    user_appear_onetime_count = 0
    # counter for users that appear only once
    one_and_huangniu_count = 0
    # counter for users that appear only once and are scalpers (huangniu)

    file_count = 0
    user_id_feature_dict = {}
    # maps user_id to all of that user's feature rows, so rows of the same user are never split apart
    product_id_dict = load_picklefile('product_id_train+test' + '.pkl')
    product_1st_category_dict = load_picklefile('product_1st_category_train+test' + '.pkl')
    product_2nd_category_dict = load_picklefile('product_2nd_category_train+test' + '.pkl')
    product_3rd_category_dict = load_picklefile('product_3rd_category_train+test' + '.pkl')
    word_gram_dict = load_picklefile('word_gram_train+test.pkl')
    product_appear_dict = load_picklefile('product_appear_time_train+test.pkl')
    user_appear_dict = load_picklefile('user_appear_time_train+test.pkl')
    user_kind_dict = load_picklefile("user_product_kind_train+test.pkl")
    word_with_laebl0_dict = load_picklefile("word_with_label0_train.pkl")
    word_with_laebl1_dict = load_picklefile("word_with_label1_train.pkl")
    for each_row in csvdata:
        file_count = file_count + 1
        request_id = np.array(each_row[0])
        request_id_collect.append(request_id)
        product_id = each_row[1]

        product_appear_time = product_appear_dict.get(product_id)

        product_1st_category = each_row[2]
        product_2nd_category = each_row[3]
        product_3rd_category = each_row[4]

        user_id = each_row[5]
        user_kind_num = len(user_kind_dict.get(user_id))
        user_appear_time = user_appear_dict.get(user_id)

        feature_row = [request_id, product_appear_time, user_appear_time, user_kind_num]

        feature_row = feature_row + [int(product_id), int(product_1st_category), int(product_2nd_category),
                                     int(product_3rd_category)]
        product_id_count = product_id_dict.get((user_id, product_id))
        product_1st_category_count = product_1st_category_dict.get((user_id, product_1st_category))
        product_2nd_category_count = product_2nd_category_dict.get((user_id, product_2nd_category))
        product_3rd_category_count = product_3rd_category_dict.get((user_id, product_3rd_category))
        feature_row = feature_row + [product_id_count, product_1st_category_count, product_2nd_category_count,
                                     product_3rd_category_count]
        sequence_feature = [
            update_count_dict(dict_collect, 'product_appear_time', product_id),
            update_count_dict(dict_collect, 'user_appear_time', user_id),
            update_count_dict(dict_collect, 'product_id', (user_id, product_id)),
            update_count_dict(dict_collect, 'product_1st_category', (user_id, product_1st_category)),
            update_count_dict(dict_collect, 'product_2nd_category', (user_id, product_2nd_category)),
            update_count_dict(dict_collect, 'product_3rd_category', (user_id, product_3rd_category)), ]

        post_province = each_row[6].split(' ')
        post_province_feature = turn_wordgram2feature(post_province, word_gram_dict)

        post_city = each_row[7].split(' ')
        post_city_feature = turn_wordgram2feature(post_city, word_gram_dict)

        post_town = each_row[8].split(' ')
        post_town_feature = turn_wordgram2feature(post_town, word_gram_dict)

        post_detail = each_row[9].split(' ')
        word_with_laebl_feature = generate_word_with_laebl_feature(post_province+post_city+post_town+post_detail, word_with_laebl0_dict,
                                                                   word_with_laebl1_dict)
        word_with_laebl_feature = pad_numlist(word_with_laebl_feature, 55)
        # post_detail_feature = turn_wordgram2feature(post_detail, word_gram_dict)
        post_detail_length = len(post_detail)
        post_detail = strlist2numlist(post_detail)
        post_detail_sum = int(np.sum(post_detail))
        post_detail_ave = post_detail_sum // post_detail_length
        feature_row = feature_row + [post_detail_length, post_detail_sum, post_detail_ave]
        feature_row = feature_row + word_with_laebl_feature
        feature_row = feature_row + pad_numlist(post_province_feature, 6) + pad_numlist(post_city_feature,
                                                                                        6) + pad_numlist(
            post_town_feature, 10) + pad_numlist(post_detail, 40) + sequence_feature
        feature_row = np.array(feature_row,dtype='int32')
        label = [int(each_row[10])]
        label_collect = label_collect + label
        former_feature_list=user_id_feature_dict.get(user_id)
        if former_feature_list is None:
            user_id_feature_dict.update({user_id:[feature_row]})
        else:
            former_feature_list.append(feature_row)
            user_id_feature_dict.update({user_id:former_feature_list})
        if file_count == 1:
            feature_collect = feature_row
        else:
            feature_collect = np.vstack((feature_collect, feature_row))
        if file_count % 100 == 0:
            print(file_count)
    np.save('csvfeature_' + type, feature_collect)
    np.save('labelcollect_' + type, label_collect)
    np.save('request_id_' + type, request_id_collect)
    write_to_pickle(user_id_feature_dict,type+'user_id_feature_dict')
    print("onetime_huangniu", one_and_huangniu_count, user_appear_onetime_count)
def turn_prob_to_label(prob_data, input_data):
    # Convert per-class probabilities into 0/1 labels with two hand-tuned rules:
    # rows where the (user, product) pair or the user itself appears only once are
    # called 0 unless P(label=1) >= 0.8; otherwise label 0 is assigned whenever
    # P(label=0) >= 0.34
    prob_data = list(prob_data)
    input_data = list(input_data)

    result = []
    for pointer in range(0, len(prob_data)):
        prob_data_row = list(prob_data[pointer])
        input_data_row = list(input_data[pointer])
        user_appear_time = int(input_data_row[2])
        porduct_id_count = int(input_data_row[8])
        if (porduct_id_count == 1 or user_appear_time == 1) and prob_data_row[1] < 0.8:
            result.append(0)
            continue
        if prob_data_row[0] >= 0.34:
            result.append(0)
        else:
            result.append(1)
    return result




def find_fault(prob_data, input_data, true_label):
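    # Record rows where the argmax prediction disagrees with the true label, keyed by request_id,
    # and pickle them as error_record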
    prob_data = list(prob_data)
    input_data = list(input_data)
    true_label = list(true_label)
    result = {}
    for pointer in range(0, len(prob_data)):
        prob_data_row = list(prob_data[pointer])
        input_data_row = list(input_data[pointer])
        request_id = input_data_row[0]
        max_loc = list(prob_data_row).index(max(list(prob_data_row)))
        if int(max_loc) != int(true_label[pointer]):
            result.update({request_id: prob_data_row})
    print(result)
    write_to_pickle(result, "error_record")


def find_single_appear():
    # Select rows whose user appears more than once in the table and write them to a separate CSV
    csvdata = read_csv('train_ori.csv')
    user_appear_dict = load_picklefile('user_appear_time_train+test.pkl')
    result = []
    for each_row in csvdata:
        user_id = each_row[5]
        appear_time = user_appear_dict.get(user_id)
        if appear_time > 1:
            result.append(each_row)
    write_single(result)


def caculate_pre_recall(label, predict):
    # Compute precision, recall, and the official F_beta score (beta = 0.3)
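    # score = (1 + beta^2) * Precision * Recall / (beta^2 * Precision + Recall),
    # which with beta = 0.3 weights precision much more heavily than recall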
    predict = list(predict)
    label = list(label)
    pre_count = 0
    label_count = 0.0
    correct_predict_count = 0.0
    error_list = []
    for count in range(0, len(label)):
        predict_num = int(predict[count])
        label_num = int(label[count])
        if predict_num == 1:
            pre_count = pre_count + 1
        if label_num == 1:
            label_count = label_count + 1
        if label_num == 1 and predict_num == 1:
            correct_predict_count = correct_predict_count + 1
        if label_num != predict_num:
            error_list.append(count)
    Precision = correct_predict_count / pre_count
    Recall = correct_predict_count / label_count
    beta = 0.3
    score = (1 + beta ** 2) * Precision * Recall / (beta ** 2 * Precision + Recall)
    print(Precision, Recall)
    print("score", score)
    return score


def write_csv(answer_data=[(0, 0)]):
    # Write the submission CSV
    data_head = [
        ("request_id", "product_id", "product_1st_category", "product_2nd_category", "product_3rd_category", 'user_id',
         "post_province", "post_city", "post_town", "post_detail", "label")
    ]
    data = data_head + answer_data
    f = codecs.open('submit0.34_0.8.csv', 'w', 'gbk')
    writer = csv.writer(f)
    for i in data:
        writer.writerow(i)
    f.close()


def write_single(answer_data=[(0, 0)]):
    # Write the low-frequency-user rows to their own CSV
    data_head = [
        ("request_id", "product_id", "product_1st_category", "product_2nd_category", "product_3rd_category", 'user_id',
         "post_province", "post_city", "post_town", "post_detail", "label")
    ]
    data = data_head + answer_data
    f = codecs.open('single_csv.csv', 'w', 'gbk')
    writer = csv.writer(f)
    for i in data:
        writer.writerow(i)
    f.close()


def write_error(answer_data=[(0, 0)]):
    # Write the misclassified rows, with their predicted probabilities, to a CSV
    data_head = [
        ("request_id", "product_id", "product_1st_category", "product_2nd_category", "product_3rd_category", 'user_id',
         "post_province", "post_city", "post_town", "post_detail", "label", "prob_0", "prob_1")
    ]
    data = data_head + answer_data
    f = codecs.open('error_csv.csv', 'w', 'gbk')
    writer = csv.writer(f)
    for i in data:
        writer.writerow(i)
    f.close()


def do_mymlp(train_x, train_y):
    # A simple Keras MLP baseline
    tf.keras.backend.clear_session()
    print("do mymlp")
    # inputs = layers.Input(shape=max_features)
    inputs = layers.Input(shape=(67,))
    x = layers.Dense(60, activation='sigmoid')(inputs)
    x = layers.Dense(50, activation='sigmoid')(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(30, activation='sigmoid')(x)
    x = layers.Dense(10, activation='sigmoid')(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=inputs, outputs=outputs)
    model.summary()

    import datetime

    # Use pathlib so the TensorBoard log path works across operating systems
    from pathlib import Path
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = str(Path('./data/autograph/' + stamp))

    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
        loss=tf.keras.losses.binary_crossentropy,
        metrics=["accuracy", "Recall", "Precision", "AUC"]

    )

    history = model.fit(train_x, train_y, epochs=30, validation_split=0.1,
                        callbacks=[tensorboard_callback], workers=4, shuffle=True)
    preds = model.predict(train_x)
    # the MLP emits a single P(label=1) column; turn_prob_to_label expects [P0, P1]
    preds = np.hstack([1 - preds, preds])
    test_result = np.array(turn_prob_to_label(preds, train_x))
    print(test_result[0:10])
    print(train_y[0:10])
    test_accuracy = accuracy_score(train_y, test_result)
    print(test_accuracy)
    caculate_pre_recall(train_y, test_result)

    x_train, x_test_valid, y_train, y_test_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=1)
    preds = model.predict(x_test_valid)
    preds = np.hstack([1 - preds, preds])
    test_result = np.array(turn_prob_to_label(preds, x_test_valid))
    print(test_result[0:10])
    print(y_test_valid[0:10])
    test_accuracy = accuracy_score(y_test_valid, test_result)
    f_score = caculate_pre_recall(y_test_valid, test_result)

    return model


def generate_answer_data(predict_result):
    # Pair each test row with its predicted label to build the submission rows
    anser_list = []
    test_csv = read_csv('test.csv')
    id_list = list(np.load('request_id_test.npy'))
    for pointer in range(0, len(id_list)):
        temp_list = test_csv[pointer][:-1] + [predict_result[pointer]]
        anser_list.append(temp_list)
        print(temp_list)
    return anser_list


def xgboost(x, y):
    # 'multi:softprob'
    model = XGBClassifier(max_depth=25, objective='binary:logistic', scale_pos_weight=0.15)
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.4, random_state=1)
    model.fit(x_train, y_train, early_stopping_rounds=30, eval_set=[(x_train, y_train), (x_test_valid, y_test_valid)],
              eval_metric="logloss", verbose=True)

    preds = model.predict_proba(x)
    test_result = np.array(turn_prob_to_label(preds, x))
    print(test_result[0:10])
    print(y[0:10])
    test_accuracy = accuracy_score(y, test_result)
    print(test_accuracy)
    caculate_pre_recall(y, test_result)

    preds = model.predict_proba(x_test_valid)
    test_result = np.array(turn_prob_to_label(preds, x_test_valid))
    print(test_result[0:10])
    print(y_test_valid[0:10])
    test_accuracy = accuracy_score(y_test_valid, test_result)
    f_score = caculate_pre_recall(y_test_valid, test_result)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    pickle.dump(model, open(str(f_score)[:4] + "lgboostmodel2.pickle.dat", "wb"))


def generate_error_data(logname="error_record"):
    error_dict = load_picklefile('error_record.pkl')
    train_csv = read_csv('train.csv')
    infor_collect = []
    for each_row in train_csv:
        request_id = each_row[0]
        if request_id in error_dict.keys():
            error_proba = error_dict.get(request_id)
            print(error_proba)
            print(each_row)
            infor_collect.append(each_row + error_proba)
    write_error(infor_collect)


def shuffle_data(data_x, data_y):
    # Shuffle two arrays in unison by resetting the same random seed before each shuffle
    np.random.seed(207)
    np.random.shuffle(data_x)
    np.random.seed(207)
    np.random.shuffle(data_y)
    return data_x, data_y


def show_dangerous_word():
    # Rank address tokens by how strongly they indicate a scalper (label 1 vs. label 0 count ratio)
    word_with_laebl0_dict = load_picklefile("word_with_label0_train.pkl")
    word_with_laebl1_dict = load_picklefile("word_with_label1_train.pkl")
    final_dict = {}
    for each in word_with_laebl1_dict.keys():
        count1 = word_with_laebl1_dict.get(each)
        count0 = word_with_laebl0_dict.get(each)
        if count0 is None:
            count0 = 1
        final_dict.update({int(each): count1 / count0})
    sorted_items = sorted(final_dict.items(), key=lambda x: x[1], reverse=True)
    return turn_list2dict(sorted_items)


def show_good_word():
    # Rank address tokens by how strongly they indicate a normal buyer (label 0 vs. label 1 count ratio)
    word_with_laebl0_dict = load_picklefile("word_with_label0_train.pkl")
    word_with_laebl1_dict = load_picklefile("word_with_label1_train.pkl")
    final_dict = {}
    for each in word_with_laebl0_dict.keys():
        count1 = word_with_laebl1_dict.get(each)
        count0 = word_with_laebl0_dict.get(each)
        if count1 is None:
            count1 = 1
        final_dict.update({int(each): count0 / count1})
    sorted_items = sorted(final_dict.items(), key=lambda x: x[1], reverse=True)
    return turn_list2dict(sorted_items)


def reverse_label(intnum):
    # Return the flipped 0/1 label
    if int(intnum) == 1:
        return 0
    else:
        return 1


def turn_label2one_hot(label_y):
    # Convert a 0/1 label vector into one-hot rows
    result = []
    for each in label_y:
        if int(each) == 0:
            result.append([1, 0])
        else:
            result.append([0, 1])
    return np.array(result)

def generate_train_feature_from_index(index_list):
    # Build a training set from a list of user indices, keeping all rows of the same user together
    label_dict = load_picklefile('request_label_dict.pkl')
    user_id_feature_dict = load_picklefile('trainuser_id_feature_dict.pkl')
    x = []
    y = []
    key_list = list(user_id_feature_dict.keys())
    for each_num in index_list:
        pointer = key_list[each_num]
        user_id_features = user_id_feature_dict.get(pointer)
        for each_user_id_feature in user_id_features:
            x.append(each_user_id_feature)
            request_id = int(each_user_id_feature[0])
            y.append(int(label_dict.get(request_id)))
    return np.array(x), np.array(y)

def genetrate_num_list(start=0, end=101887):
    # Return the list of integers in [start, end)
    return list(range(start, end))

def lgb_with_kfold(train_X,train_Y):
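    # 5-fold CV: refit the model on each fold's training split, predict the test set each time,
    # and average the per-class predictions across folds before thresholding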
    test_X = np.load('csvfeature_test.npy')
    skf = StratifiedKFold(n_splits=5)
    class_weight = {0: 4, 1: 0.5}
    model = LGBMRegressor(num_leaves=50, max_depth=30, objective='multiclass', num_class=2, class_weight=class_weight,
                          n_estimators=10000, learning_rate=0.05)
    total_res = []
    for train_index, test_index in skf.split(train_X, train_Y):
        X_train, X_test = train_X[train_index], train_X[test_index]
        y_train, y_test = train_Y[train_index], train_Y[test_index]
        model.fit(X_train, y_train, early_stopping_rounds=300,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_metric='multi_error', verbose=True)
        res = model.predict(test_X)
        total_res.append(res)

    avg_proba = []
    for line in range(total_res[0].shape[0]):
        temp_proba = np.zeros((5, 2))
        for i in range(len(total_res)):
            temp = total_res[i][line]
            temp_proba[i] = temp
        x = np.mean(temp_proba, axis=0)
        avg_proba.append(x)
    test_result = np.array(turn_prob_to_label(avg_proba, test_X))
    answer_data = generate_answer_data(test_result)
    write_csv(answer_data)
    # test_accuracy = accuracy_score(test_Y, test_result)
    # f_score = caculate_pre_recall(test_Y, test_result)
    # print(test_accuracy,f_score)
def get_sample_weight(y):
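    # Expand the per-class weights (label 0 -> 5, label 1 -> 1) into a per-sample weight vector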
    balanced_sample = []
    class_weight = {0: 5, 1: 1}
    for each in y:
        balanced_sample.append(class_weight.get(int(each)))
    return np.array(balanced_sample)
def cat_with_kfold(train_X, train_Y):
    test_X = np.load('csvfeature_test.npy')
    skf = StratifiedKFold(n_splits=5)

    model = CatBoostClassifier(loss_function='MultiClass', learning_rate=0.5, max_depth=8)
    total_res = []
    for train_index, test_index in skf.split(train_X, train_Y):
        X_train, X_test = train_X[train_index], train_X[test_index]
        y_train, y_test = train_Y[train_index], train_Y[test_index]
        balanced_weight=get_sample_weight(y_train)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  verbose=True,sample_weight=balanced_weight)
        res = model.predict_proba(test_X)
        total_res.append(res)

    avg_proba = []
    for line in range(total_res[0].shape[0]):
        temp_proba = np.zeros((5, 2))
        for i in range(len(total_res)):
            temp = total_res[i][line]
            temp_proba[i] = temp
        x = np.mean(temp_proba, axis=0)
        avg_proba.append(x)
    test_result = np.array(turn_prob_to_label(avg_proba, test_X))
    answer_data = generate_answer_data(test_result)
    write_csv(answer_data)
def cat(x, y, type=''):
    # Train a CatBoost classifier on a single train/validation split
    model = CatBoostClassifier(loss_function='MultiClass', learning_rate=0.5, max_depth=8)
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.2, random_state=10)
    sample_weight=get_sample_weight(y_train)
    x_test_valid, y_test_valid = shuffle_data(x_test_valid, y_test_valid)
    model.fit(x_train, y_train,verbose=True,eval_set=[(x_test_valid,y_test_valid)],sample_weight=sample_weight)

    # make prediction
    preds = model.predict_proba(x_test_valid)
    test_result = np.array(turn_prob_to_label(preds, x_test_valid))
    print(test_result[0:10])
    print(y_test_valid[0:10])
    test_accuracy = accuracy_score(y_test_valid, test_result)
    f_score = caculate_pre_recall(y_test_valid, test_result)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    if type == 'error':
        pickle.dump(model, open('lgberror.dat', "wb"))
    else:
        pickle.dump(model, open(str(f_score)[:4] + "lgboostmodel2.pickle.dat", "wb"))
    return model
def lightgbm(x, y, type=''):
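    # Train a single LightGBM model on an 80/20 split; a two-class 'multiclass' objective is
    # used so that predict() yields the per-class probabilities that turn_prob_to_label expects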
    class_weight = {0: 5, 1: 1}
    model = LGBMRegressor(num_leaves=50, max_depth=30, objective='multiclass', num_class=2, class_weight=class_weight,
                          n_estimators=10000, learning_rate=0.25)
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.2, random_state=10)
    x_test_valid, y_test_valid = shuffle_data(x_test_valid, y_test_valid)
    model.fit(x_train, y_train, early_stopping_rounds=500, eval_set=[(x_train, y_train), (x_test_valid, y_test_valid)],
              eval_metric='multi_error', verbose=True)

    # make prediction
    preds = model.predict(x_test_valid)
    test_result = np.array(turn_prob_to_label(preds, x_test_valid))
    print(test_result[0:10])
    print(y_test_valid[0:10])
    test_accuracy = accuracy_score(y_test_valid, test_result)
    f_score = caculate_pre_recall(y_test_valid, test_result)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    if type == 'error':
        pickle.dump(model, open('lgberror.dat', "wb"))
    else:
        pickle.dump(model, open(str(f_score)[:4] + "lgboostmodel2.pickle.dat", "wb"))
    return model


def write_alert_csv(csv_data):
    # Write the adjusted submission, with an extra danger_score column, to a CSV
    data_head = [
        ("request_id", "product_id", "product_1st_category", "product_2nd_category", "product_3rd_category", 'user_id',
         "post_province", "post_city", "post_town", "post_detail", "label", "danger_score")
    ]
    data = data_head + csv_data
    f = codecs.open('anser_alerted.csv', 'w', 'gbk')
    writer = csv.writer(f)
    for i in data:
        writer.writerow(i)
    f.close()


def gererate_tfidf_featuer(filename='train_ori.csv'):
    # Build TF-IDF features from the padded post_detail token-id vectors (treated as count vectors)
    csv_data = read_csv(filename)
    type = filename.replace('.csv', '')
    x = []
    for each_row in csv_data:
        post_detail = strlist2numlist(each_row[9].split(' '))
        x.append(pad_numlist(post_detail, 40))
    transformer = TfidfTransformer(smooth_idf=False)
    x_tfidf = transformer.fit_transform(x)
    x = x_tfidf.toarray()
    np.save('tfidf_feature_' + type, x)


def alert_answer_data(badword_dict=show_dangerous_word(), goodword_dict=show_good_word()):
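    # Post-process a submission: compute a hand-crafted danger score for each row from the
    # scalper/normal word ratios and append it as a new column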
    csv_data = read_csv('submit2.csv')
    final_csv_row = []
    for each_row in csv_data:
        post_detail = each_row[9].split(' ')
        row_danger_score = 0
        for each_word in post_detail:
            each_word = int(each_word)
            if each_word in badword_dict.keys():
                dangerous_count = badword_dict.get(each_word)
                if dangerous_count >= 10:
                    row_danger_score = row_danger_score + dangerous_count / 10
            if each_word in goodword_dict.keys():
                good_count = goodword_dict.get(each_word)
                if good_count >= 10:
                    row_danger_score = row_danger_score - good_count / 10
        each_row = each_row + [row_danger_score]
        final_csv_row.append(each_row)
    write_alert_csv(final_csv_row)

def tpot(x, y, type=''):
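    # AutoML baseline: let TPOT search for a pipeline, score it on the hold-out split,
    # and export the best pipeline to tport_code.py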
    class_weight = {
        0: 5,
        1: 1, }
    model = TPOTClassifier(generations=10, population_size=20, verbosity=2,
                      config_dict='TPOT MDR')
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.2, random_state=10)
    x_test_valid, y_test_valid = shuffle_data(x_test_valid, y_test_valid)
    model.fit(x_train, y_train,sample_weight=get_sample_weight(y_train))
    model.score(x_test_valid,y_test_valid)
    model.export('tport_code.py')
    # make prediction
    preds = model.predict_proba(x_test_valid)
    test_result = np.array(turn_prob_to_label(preds, x_test_valid))
    print(test_result[0:10])
    print(y_test_valid[0:10])
    test_accuracy = accuracy_score(y_test_valid, test_result)
    f_score = caculate_pre_recall(y_test_valid, test_result)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    if type == 'error':
        pickle.dump(model, open('lgberror.dat', "wb"))
    else:
        pickle.dump(model, open(str(f_score)[:4] + "lgboostmodel2.pickle.dat", "wb"))
    return model

#x,y=generate_train_feature_from_index([1])
#print(x.shape)
#print(y.shape)
# generate_error_data()
# print(load_picklefile('product_id.pkl'))
# train_file_name = "simple_sample.csv"
# gererate_tfidf_featuer()
# alert_answer_data()
# generate_feature()
# show_dangerous_word()
# csv_data1 = read_csv('train_ori.csv')
# csv_data2 = read_csv('test.csv')
# handle_csv_data(csv_data1, type='train')
# generate_feature(csv_data1, "train")
# generate_feature(csv_data2, "test")
# find_single_appear()
# generate_error_data()
x = np.load('csvfeature_train.npy')
# x2 = np.load('tfidf_feature_train_ori.npy')
# x = np.hstack((x, x2))
# print(x.shape)
y = np.load('labelcollect_train.npy')
lgb_with_kfold(x,y)
#tpot(x,y)
# xgboost(x, y)
#model=lightgbm(x,y)
#lgb_with_kfold()
#cat_with_kfold(x,y)

# do_mymlp(x,y)
# model = pickle.load(open("0.95lgboostmodel2.pickle.dat", "rb"))
# loaded_model = pickle.load(open("pima.pickle.dat", "rb"))
# preds = model.predict(x)
# test_result = np.array(turn_prob_to_label(preds, x))
# print(test_result[0:10])
# print(y[0:10])
# test_accuracy = accuracy_score(y, test_result)
# print(test_accuracy)
# caculate_pre_recall(y, test_result)
# result_data=model.predict(x)
# result_label=turn_prob_to_label(result_data,x)
# answer_data=generate_answer_data(result_label)
# write_csv(answer_data)
# print(y)
