初赛赛题
https://security.bytedance.com/fe/ai-challenge#/project?id=1&active=1
字段:
我主要提取的特征是 (userid, product_id)、(userid, product_1st_category) 这类元组出现的次数,再加上产品 id 出现的总次数和 userid 出现的总次数。对于地址字段,我没有想到好的特征提取方法。模型使用的是 LightGBM,并且已经加了 class weight;最终正确率是 80%,按照官方积分算法尝试得到的最高得分是 0.94。
from lightgbm import LGBMRegressor
import random
import tensorflow as tf
import codecs
from tensorflow.keras import models, layers
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate, \
StratifiedKFold
from tpot import TPOTClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, log_loss
import pickle
import joblib
import os
import time
from sklearn.decomposition import KernelPCA, LatentDirichletAllocation
from sklearn.utils import compute_sample_weight, compute_class_weight
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import OneHotEncoder
import csv
from tqdm import tqdm
from sklearn.ensemble import IsolationForest
# -*-coding:utf-8 -*-
def write_to_pickle(dictdata, filename):
    """Serialize ``dictdata`` with pickle into ``<filename>.pkl``.

    Args:
        dictdata: Any picklable object (callers in this file pass dicts).
        filename: Target path without extension; ``'.pkl'`` is appended.
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which the previous open()/close() pair did not guarantee.
    with open(filename + '.pkl', 'wb') as pick_file:
        pickle.dump(dictdata, pick_file)
def random_predict(prob_num):
    """Return True with probability ``prob_num``, otherwise False.

    Args:
        prob_num: Probability compared against one draw of ``random.random()``.

    Returns:
        bool: True when the draw is less than or equal to ``prob_num``.
    """
    draw = random.random()
    return bool(draw <= prob_num)
def load_picklefile(filename):
    """Load and return the object stored in the pickle file ``filename``.

    Args:
        filename: Full path of the pickle file, including its extension.

    Returns:
        The unpickled object.
    """
    # NOTE: pickle executes arbitrary code while loading; only use this on
    # files produced by this project.
    # The context manager closes the handle, which the original never did.
    with open(filename, 'rb') as pickfile:
        return pickle.load(pickfile)
def read_csv(filename="A_test_data.csv"):
    """Read a GBK-encoded CSV file and return its rows without the header.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        list[list[str]]: Every row after the first (header) row; an empty
        list when the file is empty or holds only a header.
    """
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields that contain line breaks are parsed correctly.
    with open(filename, 'r', encoding='gbk', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # drop the header row if there is one
        return list(reader)
def pad_numlist(numlist=[1, 2], goal_length=30):
    """Force a numeric list to exactly ``goal_length`` items.

    Shorter lists are right-padded with zeros; longer ones are truncated.
    The input is never modified; a new list is returned.
    """
    shortfall = goal_length - len(numlist)
    if shortfall > 0:
        return numlist + [0] * shortfall
    return numlist[:goal_length]
def strlist2numlist(strlist):
    """Convert a list such as ``['10', '23']`` into ``[10, 23]``.

    Each element is passed through ``int``; a non-numeric element raises
    ``ValueError``.
    """
    return [int(token) for token in strlist]
def update_count_dict(dictcollect={}, dictname='name', new_data=('1', '2')):
    """Increment the counter for ``new_data`` in one named counting dict.

    Args:
        dictcollect: Mapping from statistic name to its counting dict.
        dictname: Name of the counting dict to update.
        new_data: Hashable key whose occurrence is being recorded.

    Returns:
        int: The count of ``new_data`` after this update (1 on first sight).
    """
    counter = dictcollect.get(dictname)
    occurrences = counter.get(new_data, 0) + 1
    counter[new_data] = occurrences
    return occurrences
def init_dict_collect():
    """Build a fresh collection of empty statistic dictionaries.

    Returns:
        dict: One independent empty dict per statistic name used by the
        feature-extraction functions in this file.
    """
    statistic_names = (
        'product_id', 'product_1st_category', 'product_2nd_category', 'product_3rd_category',
        'user_appear_time', 'product_appear_time', 'word_gram', 'user_product_kind',
        'word_with_label0', 'word_with_label1',
    )
    return {statistic: {} for statistic in statistic_names}
def save_dict_collect(dictcollect={}, type='train'):
    """Pickle every statistic dict in ``dictcollect`` to its own file.

    Each dict is written through ``write_to_pickle`` under the base name
    ``<statistic name>_<type>``.
    """
    for statistic_name, statistic in dictcollect.items():
        write_to_pickle(statistic, statistic_name + "_" + type)
def turn_num2word_gram(numlist, dict_collect):
    """Count every adjacent pair (2-gram) of ``numlist`` under 'word_gram'.

    Args:
        numlist: List of tokens taken from one address field.
        dict_collect: Statistic collection whose 'word_gram' dict is updated.
    """
    for left, right in zip(numlist, numlist[1:]):
        update_count_dict(dict_collect, 'word_gram', (left, right))
def turn_list2dict(listdata):
    """Rebuild a dict from a list of ``[key, value]`` pairs.

    Example input: ``[["url", 2], ["sfstat", 3], ["jobimg", 5]]``.
    When a key appears more than once, the last pair wins.
    """
    return {pair[0]: pair[1] for pair in listdata}
def sort_wordgram(dict_collect):
    """Replace the 'word_gram' counts with a frequency-ranked index table.

    2-grams are ordered by descending count; those seen more than twice get
    consecutive indices starting at 1, and rarer ones are dropped. The result
    overwrites ``dict_collect['word_gram']`` in place.
    """
    ranked = sorted(dict_collect.get('word_gram').items(), key=lambda pair: pair[1], reverse=True)
    indexed = {}
    for gram, frequency in ranked:
        if frequency <= 2:
            continue
        indexed[gram] = len(indexed) + 1
    dict_collect['word_gram'] = indexed
def update_kind_dict(dict_collect, user_id, kind_id):
    """Record that ``user_id`` bought the product/kind ``kind_id``.

    Args:
        dict_collect: Statistic collection; its 'user_product_kind' dict maps
            each user id to the set of kind ids seen for that user.
        user_id: Identifier of the buyer.
        kind_id: Identifier to add to the user's set.
    """
    user_product_kind_dict = dict_collect.get('user_product_kind')
    # The first occurrence used set(kind_id), which splits a string id such as
    # '123' into the characters {'1', '2', '3'} and corrupts the per-user
    # kind count. The id is now always added as one element.
    user_product_kind_dict.setdefault(user_id, set()).add(kind_id)
def update_word_with_label_dict(dict_collect, dictname, word_list):
    """Count address words for one label.

    Args:
        dict_collect: Statistic collection holding the per-label word dicts.
        dictname: Name of the counting dict to update (the caller builds it
            as 'word_with_label' followed by the row's label).
        word_list: Words from the row's address fields; each occurrence adds
            one to that word's count.
    """
    word_counts = dict_collect.get(dictname)
    for word in word_list:
        word_counts[word] = word_counts.get(word, 0) + 1
def handle_csv_data(csv_data=[], dict_collect=None, type='train+test'):
    """Accumulate the count statistics for a set of CSV rows and persist them.

    For every row this updates the product/user appearance counts, the
    (user, product/category) pair counts, the per-user product set, the
    address 2-gram counts and the per-label address word counts. Afterwards
    the 2-gram table is turned into an index table, every statistic is
    pickled as ``<name>_<type>.pkl`` and the request-id to label mapping is
    pickled as ``request_label_dict.pkl``.

    Args:
        csv_data: Rows as returned by ``read_csv``. Columns used: 0 request
            id, 1 product id, 2-4 product categories, 5 user id, 6-9
            space-separated address fields, 10 label.
        dict_collect: Statistic collection to update. When omitted a fresh
            one is created for this call. The previous default was built
            once at definition time and shared by every call, so counts
            leaked from one run into the next.
        type: Suffix used in the names of the pickled statistic files.
    """
    if dict_collect is None:
        dict_collect = init_dict_collect()
    request_label_dict = {}
    for count, each_row in enumerate(csv_data, start=1):
        request_id = each_row[0]
        product_id = each_row[1]
        product_1st_category = each_row[2]
        product_2nd_category = each_row[3]
        product_3rd_category = each_row[4]
        user_id = each_row[5]
        label = str(each_row[10])

        update_count_dict(dict_collect, 'product_appear_time', product_id)
        update_count_dict(dict_collect, 'user_appear_time', user_id)
        update_count_dict(dict_collect, 'product_id', (user_id, product_id))
        update_count_dict(dict_collect, 'product_1st_category', (user_id, product_1st_category))
        update_count_dict(dict_collect, 'product_2nd_category', (user_id, product_2nd_category))
        update_count_dict(dict_collect, 'product_3rd_category', (user_id, product_3rd_category))
        update_kind_dict(dict_collect, user_id, product_id)

        # Address fields are space-separated token sequences.
        post_province = each_row[6].split(' ')
        post_city = each_row[7].split(' ')
        post_town = each_row[8].split(' ')
        post_detail = each_row[9].split(' ')
        # 2-gram order is kept as province, city, town, detail because the
        # insertion order breaks ties when sort_wordgram ranks the 2-grams.
        turn_num2word_gram(post_province, dict_collect)
        turn_num2word_gram(post_city, dict_collect)
        turn_num2word_gram(post_town, dict_collect)
        update_word_with_label_dict(dict_collect, 'word_with_label' + label,
                                    post_province + post_city + post_town + post_detail)
        turn_num2word_gram(post_detail, dict_collect)

        if count % 1000 == 0:
            print("count:", count)
        request_label_dict[int(request_id)] = int(label)
    sort_wordgram(dict_collect)
    save_dict_collect(dict_collect, type)
    write_to_pickle(request_label_dict, 'request_label_dict')
def turn_wordgram2feature(numlist, word_gram_dict):
    """Translate a token list into a list of 2-gram indices.

    Args:
        numlist: Tokens of one address field.
        word_gram_dict: Mapping from a 2-gram tuple to its index.

    Returns:
        list: One index per adjacent token pair; pairs missing from the
        table are encoded as ``len(word_gram_dict)``.
    """
    unknown_index = len(word_gram_dict)
    return [word_gram_dict.get(pair, unknown_index) for pair in zip(numlist, numlist[1:])]
def generate_word_with_laebl_feature(post_detail, word_with_laebl0_dict, word_with_laebl1_dict):
    """Compute, per word, the ratio of label-1 count to label-0 count.

    Args:
        post_detail: Words of the address fields for one row.
        word_with_laebl0_dict: Word counts observed on label-0 rows.
        word_with_laebl1_dict: Word counts observed on label-1 rows.

    Returns:
        list[float]: ``count1 / count0`` rounded to two decimals for every
        word; a word missing from a table counts as 0.1 in that table.
    """
    unseen = 0.1
    return [
        round(word_with_laebl1_dict.get(word, unseen) / word_with_laebl0_dict.get(word, unseen), 2)
        for word in post_detail
    ]
def generate_feature(csvdata=[], type='train'):
    """Build the per-row feature matrix from CSV rows and save it to disk.

    The function reads the statistic pickles produced by ``handle_csv_data``
    (the ``*_train+test.pkl`` tables and the two ``word_with_label*_train.pkl``
    tables) and writes:

    * ``csvfeature_<type>.npy``   - int32 feature rows,
    * ``labelcollect_<type>.npy`` - labels in row order,
    * ``request_id_<type>.npy``   - request ids in row order,
    * ``<type>user_id_feature_dict.pkl`` - feature rows grouped by user id.

    Column layout of one feature row (other functions index into it, so the
    order must not change):
    ``[request_id, product_appear_time, user_appear_time, user_kind_num,
    product_id, cat1, cat2, cat3, (user, product) count, (user, cat1) count,
    (user, cat2) count, (user, cat3) count, detail_length, detail_sum,
    detail_ave, 55 word-ratio features, 6 province 2-grams, 6 city 2-grams,
    10 town 2-grams, 40 detail tokens, 6 running counts]``.

    Changes from the previous implementation: rows and labels are collected
    in lists and stacked once (the old per-row ``np.vstack`` / list
    concatenation was quadratic), the detail tokens are converted to int only
    once, and a single input row now yields a 2-D ``(1, n)`` matrix like any
    other input size.

    Args:
        csvdata: Rows as returned by ``read_csv``.
        type: Suffix/prefix used in the output file names.
    """
    dict_collect = init_dict_collect()
    label_collect = []
    feature_rows = []
    request_id_collect = []
    # Both counters are only reported at the end; nothing updates them here.
    user_appear_onetime_count = 0
    one_and_huangniu_count = 0
    # Groups feature rows by user id so one user's rows can be kept together.
    user_id_feature_dict = {}

    product_id_dict = load_picklefile('product_id_train+test' + '.pkl')
    product_1st_category_dict = load_picklefile('product_1st_category_train+test' + '.pkl')
    product_2nd_category_dict = load_picklefile('product_2nd_category_train+test' + '.pkl')
    product_3rd_category_dict = load_picklefile('product_3rd_category_train+test' + '.pkl')
    word_gram_dict = load_picklefile('word_gram_train+test.pkl')
    product_appear_dict = load_picklefile('product_appear_time_train+test.pkl')
    user_appear_dict = load_picklefile('user_appear_time_train+test.pkl')
    user_kind_dict = load_picklefile("user_product_kind_train+test.pkl")
    word_with_laebl0_dict = load_picklefile("word_with_label0_train.pkl")
    word_with_laebl1_dict = load_picklefile("word_with_label1_train.pkl")

    for file_count, each_row in enumerate(csvdata, start=1):
        request_id = np.array(each_row[0])
        request_id_collect.append(request_id)
        product_id = each_row[1]
        product_1st_category = each_row[2]
        product_2nd_category = each_row[3]
        product_3rd_category = each_row[4]
        user_id = each_row[5]

        # Global (train+test) statistics looked up for this row.
        product_appear_time = product_appear_dict.get(product_id)
        user_kind_num = len(user_kind_dict.get(user_id))
        user_appear_time = user_appear_dict.get(user_id)
        feature_row = [request_id, product_appear_time, user_appear_time, user_kind_num]
        feature_row = feature_row + [int(product_id), int(product_1st_category), int(product_2nd_category),
                                     int(product_3rd_category)]
        feature_row = feature_row + [
            product_id_dict.get((user_id, product_id)),
            product_1st_category_dict.get((user_id, product_1st_category)),
            product_2nd_category_dict.get((user_id, product_2nd_category)),
            product_3rd_category_dict.get((user_id, product_3rd_category)),
        ]

        # Running counts within this pass: how often each key has been seen
        # up to and including the current row.
        sequence_feature = [
            update_count_dict(dict_collect, 'product_appear_time', product_id),
            update_count_dict(dict_collect, 'user_appear_time', user_id),
            update_count_dict(dict_collect, 'product_id', (user_id, product_id)),
            update_count_dict(dict_collect, 'product_1st_category', (user_id, product_1st_category)),
            update_count_dict(dict_collect, 'product_2nd_category', (user_id, product_2nd_category)),
            update_count_dict(dict_collect, 'product_3rd_category', (user_id, product_3rd_category)),
        ]

        post_province = each_row[6].split(' ')
        post_province_feature = turn_wordgram2feature(post_province, word_gram_dict)
        post_city = each_row[7].split(' ')
        post_city_feature = turn_wordgram2feature(post_city, word_gram_dict)
        post_town = each_row[8].split(' ')
        post_town_feature = turn_wordgram2feature(post_town, word_gram_dict)
        post_detail = each_row[9].split(' ')

        word_with_laebl_feature = generate_word_with_laebl_feature(
            post_province + post_city + post_town + post_detail,
            word_with_laebl0_dict, word_with_laebl1_dict)
        word_with_laebl_feature = pad_numlist(word_with_laebl_feature, 55)

        post_detail_length = len(post_detail)
        post_detail = strlist2numlist(post_detail)
        post_detail_sum = int(np.sum(post_detail))
        post_detail_ave = post_detail_sum // post_detail_length
        feature_row = feature_row + [post_detail_length, post_detail_sum, post_detail_ave]
        feature_row = feature_row + word_with_laebl_feature
        feature_row = (feature_row
                       + pad_numlist(post_province_feature, 6)
                       + pad_numlist(post_city_feature, 6)
                       + pad_numlist(post_town_feature, 10)
                       + pad_numlist(post_detail, 40)
                       + sequence_feature)
        # The int32 cast truncates the fractional word-ratio features.
        feature_row = np.array(feature_row, dtype='int32')

        label_collect.append(int(each_row[10]))
        user_id_feature_dict.setdefault(user_id, []).append(feature_row)
        feature_rows.append(feature_row)
        if file_count % 100 == 0:
            print(file_count)

    # Stack once at the end; an empty input is saved as an empty array.
    feature_collect = np.vstack(feature_rows) if feature_rows else []
    np.save('csvfeature_' + type, feature_collect)
    np.save('labelcollect_' + type, label_collect)
    np.save('request_id_' + type, request_id_collect)
    write_to_pickle(user_id_feature_dict, type + 'user_id_feature_dict')
    print("onetime_huangniu", one_and_huangniu_count, user_appear_onetime_count)
def turn_prob_to_label(prob_data, input_data, normal_threshold=0.34, rare_scalper_threshold=0.8):
    """Turn two-column class probabilities into 0/1 labels with fixed rules.

    Rules, applied per row in this order:

    1. If the row's (user, product) count (feature column 8) or the user's
       appearance count (feature column 2) equals 1 and the class-1
       probability is below ``rare_scalper_threshold``, predict 0.
    2. Otherwise predict 0 when the class-0 probability is at least
       ``normal_threshold``, else predict 1.

    The previous version also unpickled ``lgberror.dat`` on every call
    without using it (and failed when that file was absent), and carried
    unreachable arg-max code after the rules; both were removed.

    Args:
        prob_data: Sequence of ``[p_class0, p_class1]`` rows.
        input_data: Feature rows aligned with ``prob_data``.
        normal_threshold: Minimum class-0 probability for predicting 0.
        rare_scalper_threshold: Minimum class-1 probability needed to
            predict 1 for rows that appear only once.

    Returns:
        list[int]: One label per row.
    """
    result = []
    for prob_row, input_row in zip(prob_data, input_data):
        user_appear_time = int(input_row[2])
        product_id_count = int(input_row[8])
        seen_once = product_id_count == 1 or user_appear_time == 1
        if seen_once and prob_row[1] < rare_scalper_threshold:
            result.append(0)
        elif prob_row[0] >= normal_threshold:
            result.append(0)
        else:
            result.append(1)
    return result
def find_fault(prob_data, input_data, true_label):
    """Collect the rows whose arg-max prediction disagrees with the label.

    Builds a dict from the row's first feature (the request id) to its
    probability row, prints it and pickles it as ``error_record.pkl``.

    Args:
        prob_data: Per-row class probabilities.
        input_data: Feature rows aligned with ``prob_data``.
        true_label: Ground-truth labels aligned with ``prob_data``.
    """
    probabilities = list(prob_data)
    features = list(input_data)
    labels = list(true_label)
    mistakes = {}
    for position, raw_probs in enumerate(probabilities):
        prob_row = list(raw_probs)
        feature_row = list(features[position])
        predicted = prob_row.index(max(prob_row))
        if int(predicted) != int(labels[position]):
            mistakes[feature_row[0]] = prob_row
    print(mistakes)
    write_to_pickle(mistakes, "error_record")
def find_single_appear():
    """Filter ``train_ori.csv`` by user appearance count and write the result.

    Keeps the rows whose user id has an appearance count greater than 1 in
    ``user_appear_time_train+test.pkl`` and hands them to ``write_single``.

    NOTE(review): the function name and the original comment talk about
    users that appear only once, while the condition keeps users that appear
    more than once - confirm which is intended.
    """
    rows = read_csv('train_ori.csv')
    appear_counts = load_picklefile('user_appear_time_train+test.pkl')
    kept_rows = [row for row in rows if appear_counts.get(row[5]) > 1]
    write_single(kept_rows)
def caculate_score(label, predict):
    """Compute precision, recall and the F-beta score (beta = 0.3) for class 1.

    Precision and recall are printed, then the score is printed and returned.
    A ratio whose denominator is zero (no predicted positives, no actual
    positives, or precision and recall both zero) is reported as 0.0 instead
    of raising ``ZeroDivisionError``.

    Args:
        label: Ground-truth labels (values convertible to int).
        predict: Predicted labels aligned with ``label``.

    Returns:
        float: ``(1 + beta^2) * P * R / (beta^2 * P + R)``.
    """
    predict = list(predict)
    label = list(label)
    predicted_positive = 0
    actual_positive = 0
    true_positive = 0
    for count in range(0, len(label)):
        predict_num = int(predict[count])
        label_num = int(label[count])
        if predict_num == 1:
            predicted_positive += 1
        if label_num == 1:
            actual_positive += 1
        if label_num == 1 and predict_num == 1:
            true_positive += 1
    Precision = true_positive / predicted_positive if predicted_positive else 0.0
    Recall = true_positive / actual_positive if actual_positive else 0.0
    beta = 0.3
    denominator = beta ** 2 * Precision + Recall
    score = (1 + beta ** 2) * Precision * Recall / denominator if denominator else 0.0
    print(Precision, Recall)
    print("score", score)
    return score
def write_csv(answer_data=[(0, 0)]):
    """Write the submission file ``submit0.34_0.8.csv`` (GBK encoded).

    Args:
        answer_data: Rows to write below the fixed header row.
    """
    data_head = [
        ("request_id", "product_id", "product_1st_category", "product_2nd_category", "product_3rd_category", 'user_id',
         "post_province", "post_city", "post_town", "post_detail", "label")
    ]
    data = data_head + list(answer_data)
    # The context manager closes the file even if a row fails to encode.
    with codecs.open('submit0.34_0.8.csv', 'w', 'gbk') as f:
        csv.writer(f).writerows(data)
def write_single(answer_data=[(0, 0)]):
    """Write the filtered rows to ``single_csv.csv`` (GBK encoded).

    Args:
        answer_data: Rows to write below the fixed header row.
    """
    data_head = [
        ("request_id", "product_id", "product_1st_category", "product_2nd_category", "product_3rd_category", 'user_id',
         "post_province", "post_city", "post_town", "post_detail", "label")
    ]
    data = data_head + list(answer_data)
    # The context manager closes the file even if a row fails to encode.
    with codecs.open('single_csv.csv', 'w', 'gbk') as f:
        csv.writer(f).writerows(data)
def write_error(answer_data=[(0, 0)]):
    """Write misclassified rows to ``error_csv.csv`` (GBK encoded).

    The header holds the original CSV columns followed by ``prob_0`` and
    ``prob_1``.

    Args:
        answer_data: Rows to write below the fixed header row.
    """
    data_head = [
        ("request_id", "product_id", "product_1st_category", "product_2nd_category", "product_3rd_category", 'user_id',
         "post_province", "post_city", "post_town", "post_detail", "label", "prob_0", "prob_1")
    ]
    data = data_head + list(answer_data)
    # The context manager closes the file even if a row fails to encode.
    with codecs.open('error_csv.csv', 'w', 'gbk') as f:
        csv.writer(f).writerows(data)
def do_mymlp(train_x, train_y):
    """Train a small sigmoid MLP and report accuracy and the F-beta score.

    Fixes compared with the previous version:

    * the score helper called here did not exist (``caculate_pre_recall``);
      ``caculate_score(label, predict)`` is used instead;
    * the network emits one sigmoid column, but ``turn_prob_to_label`` reads
      ``[p_class0, p_class1]`` rows, so the output is expanded to two
      columns before it is converted to labels;
    * the input width follows ``train_x`` instead of a hard-coded 67.

    Args:
        train_x: 2-D feature matrix.
        train_y: Binary labels aligned with ``train_x``.

    Returns:
        The fitted Keras model.
    """
    import datetime
    from pathlib import Path

    def _two_column(positive_prob):
        # Expand an (n, 1) P(label == 1) column into [[P0, P1], ...].
        positive_prob = np.asarray(positive_prob).reshape(-1, 1)
        return np.hstack([1 - positive_prob, positive_prob])

    tf.keras.backend.clear_session()
    print("do mymlp")
    inputs = layers.Input(shape=(train_x.shape[1],))
    x = layers.Dense(60, activation='sigmoid')(inputs)
    x = layers.Dense(50, activation='sigmoid')(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(30, activation='sigmoid')(x)
    x = layers.Dense(10, activation='sigmoid')(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=inputs, outputs=outputs)
    model.summary()

    # pathlib keeps the log directory path valid across operating systems.
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = str(Path('./data/autograph/' + stamp))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
        loss=tf.keras.losses.binary_crossentropy,
        metrics=["accuracy", "Recall", "Precision", "AUC"]
    )
    history = model.fit(train_x, train_y, epochs=30, validation_split=0.1,
                        callbacks=[tensorboard_callback], workers=4, shuffle=True)

    # Report on the full training set.
    preds = _two_column(model.predict(train_x))
    test_result = np.array(turn_prob_to_label(preds, train_x))
    print(test_result[0:10])
    print(train_y[0:10])
    test_accuracy = accuracy_score(train_y, test_result)
    print(test_accuracy)
    caculate_score(train_y, test_result)

    # Report on a 20% split. These rows were also part of the fit above, so
    # this is not an independent validation score.
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=1)
    preds = _two_column(model.predict(x_test_valid))
    test_result = np.array(turn_prob_to_label(preds, x_test_valid))
    print(test_result[0:10])
    print(y_test_valid[0:10])
    test_accuracy = accuracy_score(y_test_valid, test_result)
    f_score = caculate_score(y_test_valid, test_result)
    return model
def generate_answer_data(predict_result):
    """Combine the rows of ``test.csv`` with the predicted labels.

    For each entry of ``request_id_test.npy`` the matching ``test.csv`` row
    is taken, its last column is replaced by the prediction at the same
    position, and the new row is printed.

    Args:
        predict_result: Predicted labels in ``test.csv`` row order.

    Returns:
        list[list]: The answer rows.
    """
    test_rows = read_csv('test.csv')
    request_ids = list(np.load('request_id_test.npy'))
    answers = []
    for position, _ in enumerate(request_ids):
        answer_row = test_rows[position][:-1] + [predict_result[position]]
        answers.append(answer_row)
        print(answer_row)
    return answers
def xgboost(x, y):
    """Train an XGBoost binary classifier, report its scores and pickle it.

    The model is fitted on a 60/40 split with early stopping, evaluated on
    the full data and on the held-out 40%, and saved as
    ``<first 4 chars of score>lgboostmodel2.pickle.dat``.

    Fixes: the score helper called here did not exist
    (``caculate_pre_recall``) and was given (predict, label) in the wrong
    order; ``caculate_score(label, predict)`` is used instead. The model
    file is now closed after writing.

    Args:
        x: 2-D feature matrix.
        y: Binary labels aligned with ``x``.
    """
    model = XGBClassifier(max_depth=25, objective='binary:logistic', scale_pos_weight=0.15)
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.4, random_state=1)
    model.fit(x_train, y_train, early_stopping_rounds=30, eval_set=[(x_train, y_train), (x_test_valid, y_test_valid)],
              eval_metric="logloss", verbose=True)

    # Scores on all data (includes the training rows).
    preds = model.predict_proba(x)
    test_result = np.array(turn_prob_to_label(preds, x))
    print(test_result[0:10])
    print(y[0:10])
    test_accuracy = accuracy_score(y, test_result)
    print(test_accuracy)
    caculate_score(y, test_result)

    # Scores on the held-out split.
    preds = model.predict_proba(x_test_valid)
    test_result = np.array(turn_prob_to_label(preds, x_test_valid))
    print(test_result[0:10])
    print(y_test_valid[0:10])
    test_accuracy = accuracy_score(y_test_valid, test_result)
    f_score = caculate_score(y_test_valid, test_result)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    with open(str(f_score)[:4] + "lgboostmodel2.pickle.dat", "wb") as model_file:
        pickle.dump(model, model_file)
def generate_error_data(logname="error_record"):
    """Join the misclassification log with ``train.csv`` and write the result.

    Fixes: ``logname`` was ignored (the file name was hard-coded; the default
    still resolves to ``error_record.pkl``), and CSV request ids are strings
    while ``find_fault`` stores the ids taken from the numeric feature rows,
    so the lookup could never match. The id is now looked up as a string and
    then as an int.

    Args:
        logname: Base name (without ``.pkl``) of the pickle written by
            ``find_fault``.
    """
    error_dict = load_picklefile(logname + '.pkl')
    train_csv = read_csv('train.csv')
    infor_collect = []
    for each_row in train_csv:
        request_id = each_row[0]
        error_proba = error_dict.get(request_id)
        if error_proba is None:
            try:
                error_proba = error_dict.get(int(request_id))
            except ValueError:
                # Non-numeric id: it cannot match a numeric key.
                error_proba = None
        if error_proba is None:
            continue
        print(error_proba)
        print(each_row)
        infor_collect.append(each_row + list(error_proba))
    write_error(infor_collect)
def shuffle_data(data_x, data_y):
    """Shuffle features and labels in place with the same permutation.

    The global NumPy RNG is re-seeded with 207 before each shuffle, so both
    sequences (which must have equal length) are reordered identically.

    Returns:
        tuple: The same two objects that were passed in, now shuffled.
    """
    for sequence in (data_x, data_y):
        np.random.seed(207)
        np.random.shuffle(sequence)
    return data_x, data_y
def show_dangerous_word():
    """Rank address words by how strongly they are tied to label 1.

    For each word seen on label-1 rows, computes its label-1 count divided
    by its label-0 count (a missing label-0 count is treated as 1).

    Returns:
        list[tuple[int, float]]: ``(word, ratio)`` pairs, highest ratio first.
    """
    label0_counts = load_picklefile("word_with_label0_train.pkl")
    label1_counts = load_picklefile("word_with_label1_train.pkl")
    ratios = {}
    for word, count1 in label1_counts.items():
        count0 = label0_counts.get(word)
        ratios[int(word)] = count1 / (1 if count0 is None else count0)
    return sorted(ratios.items(), key=lambda item: item[1], reverse=True)
def show_good_word():
    """Rank address words by how strongly they are tied to label 0.

    For each word seen on label-0 rows, computes its label-0 count divided
    by its label-1 count (a missing label-1 count is treated as 1).

    Returns:
        list[tuple[int, float]]: ``(word, ratio)`` pairs, highest ratio first.
    """
    label0_counts = load_picklefile("word_with_label0_train.pkl")
    label1_counts = load_picklefile("word_with_label1_train.pkl")
    ratios = {}
    for word, count0 in label0_counts.items():
        count1 = label1_counts.get(word)
        ratios[int(word)] = count0 / (1 if count1 is None else count1)
    return sorted(ratios.items(), key=lambda item: item[1], reverse=True)
def reverse_label(intnum):
    """Return the flipped binary label: 0 for an input equal to 1, else 1."""
    return 0 if int(intnum) == 1 else 1
def turn_label2one_hot(label_y):
    """Convert a label vector into a two-column one-hot array.

    A label equal to 0 becomes ``[1, 0]``; any other value becomes ``[0, 1]``.

    Returns:
        numpy.ndarray: One row per input label.
    """
    return np.array([[1, 0] if int(label) == 0 else [0, 1] for label in label_y])
def generate_train_feature_from_index(index_list):
    """Build a training set from the users at the given positions.

    Each index selects a user by position among the keys of
    ``trainuser_id_feature_dict.pkl``. All feature rows of the selected
    users are returned, with labels looked up in ``request_label_dict.pkl``
    by the request id stored in each row's first column.

    Args:
        index_list: Positions into the list of user ids.

    Returns:
        tuple[numpy.ndarray, numpy.ndarray]: Features and labels.
    """
    label_by_request = load_picklefile('request_label_dict.pkl')
    rows_by_user = load_picklefile('trainuser_id_feature_dict.pkl')
    user_ids = list(rows_by_user.keys())
    features = []
    labels = []
    for position in index_list:
        for feature_row in rows_by_user.get(user_ids[position]):
            features.append(feature_row)
            labels.append(int(label_by_request.get(int(feature_row[0]))))
    return np.array(features), np.array(labels)
def genetrate_num_list(start=0,end=101887):
    """Return the list of consecutive integers from ``start`` up to ``end - 1``."""
    return list(range(start, end))
def lgb_with_kfold(train_X,train_Y):
    """Train LightGBM over 5 stratified folds and write the averaged submission.

    The same model object is refitted on each fold with early stopping. Each
    fold's prediction for ``csvfeature_test.npy`` is kept, the five
    predictions are averaged per row, converted to labels with
    ``turn_prob_to_label`` and written with ``write_csv``. No validation
    score is computed here.

    Args:
        train_X: 2-D training feature matrix.
        train_Y: Labels aligned with ``train_X``.
    """
    test_X = np.load('csvfeature_test.npy')
    skf = StratifiedKFold(n_splits=5)
    class_weight = {0: 4, 1: 0.5}
    model = LGBMRegressor(num_leaves=50, max_depth=30, objective='multiclass', num_class=2, class_weight=class_weight,
                          n_estimators=10000, learning_rate=0.05)
    total_res = []
    for train_index, test_index in skf.split(train_X, train_Y):
        X_train = train_X[train_index]
        X_test = train_X[test_index]
        y_train = train_Y[train_index]
        y_test = train_Y[test_index]
        model.fit(X_train, y_train, early_stopping_rounds=300,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_metric='multi_error', verbose=True)
        total_res.append(model.predict(test_X))

    # Average the folds row by row (5 folds x 2 classes per test row).
    avg_proba = []
    for line in range(total_res[0].shape[0]):
        fold_probs = np.zeros((5, 2))
        for fold_no, fold_pred in enumerate(total_res):
            fold_probs[fold_no] = fold_pred[line]
        avg_proba.append(np.mean(fold_probs, axis=0))

    test_result = np.array(turn_prob_to_label(avg_proba, test_X))
    answer_data = generate_answer_data(test_result)
    write_csv(answer_data)
def get_sample_weight(y):
    """Map each label to its sample weight (label 0 -> 5, label 1 -> 1).

    Args:
        y: Iterable of labels convertible to int.

    Returns:
        numpy.ndarray: One weight per label; a label other than 0 or 1
        yields ``None`` in that position.
    """
    weight_by_class = {0: 5, 1: 1}
    return np.array([weight_by_class.get(int(label)) for label in y])
def cat_with_kfold(train_X, train_Y):
    """Train CatBoost over 5 stratified folds and write the averaged submission.

    The same model object is refitted on each fold with the sample weights
    from ``get_sample_weight``. Each fold's class probabilities for
    ``csvfeature_test.npy`` are kept, the five results are averaged per row,
    converted to labels with ``turn_prob_to_label`` and written with
    ``write_csv``.

    Args:
        train_X: 2-D training feature matrix.
        train_Y: Labels aligned with ``train_X``.
    """
    test_X = np.load('csvfeature_test.npy')
    skf = StratifiedKFold(n_splits=5)
    model = CatBoostClassifier(objective='MultiClass', loss_function='multi_error', learning_rate=0.5,max_depth=8)
    total_res = []
    for train_index, test_index in skf.split(train_X, train_Y):
        X_train = train_X[train_index]
        X_test = train_X[test_index]
        y_train = train_Y[train_index]
        y_test = train_Y[test_index]
        fold_weights = get_sample_weight(y_train)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  verbose=True, sample_weight=fold_weights)
        total_res.append(model.predict_proba(test_X))

    # Average the folds row by row (5 folds x 2 classes per test row).
    avg_proba = []
    for line in range(total_res[0].shape[0]):
        fold_probs = np.zeros((5, 2))
        for fold_no, fold_pred in enumerate(total_res):
            fold_probs[fold_no] = fold_pred[line]
        avg_proba.append(np.mean(fold_probs, axis=0))

    test_result = np.array(turn_prob_to_label(avg_proba, test_X))
    answer_data = generate_answer_data(test_result)
    write_csv(answer_data)
def cat(x, y, type=''):
    """Train a CatBoost classifier on an 80/20 split, score it and pickle it.

    Fixes: the score helper called here did not exist
    (``caculate_pre_recall``); ``caculate_score(label, predict)`` is used
    instead. ``type`` previously resolved to the Python builtin, so the
    ``'error'`` branch could never run; it is now an optional parameter with
    the same meaning as in ``lightgbm`` and ``tpot``. The model file is now
    closed after writing.

    Args:
        x: 2-D feature matrix.
        y: Labels aligned with ``x``.
        type: ``'error'`` saves the model as ``lgberror.dat``; any other
            value saves it under a name prefixed with the score.

    Returns:
        The fitted model.
    """
    model = CatBoostClassifier(objective='MultiClass', loss_function='multi_error', learning_rate=0.5, max_depth=8)
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.2, random_state=10)
    sample_weight = get_sample_weight(y_train)
    x_test_valid, y_test_valid = shuffle_data(x_test_valid, y_test_valid)
    model.fit(x_train, y_train, verbose=True, eval_set=[(x_test_valid, y_test_valid)], sample_weight=sample_weight)

    # Evaluate on the held-out split.
    preds = model.predict_proba(x_test_valid)
    test_result = np.array(turn_prob_to_label(preds, x_test_valid))
    print(test_result[0:10])
    print(y_test_valid[0:10])
    test_accuracy = accuracy_score(y_test_valid, test_result)
    f_score = caculate_score(y_test_valid, test_result)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))

    if type == 'error':
        model_path = 'lgberror.dat'
    else:
        model_path = str(f_score)[:4] + "lgboostmodel2.pickle.dat"
    with open(model_path, "wb") as model_file:
        pickle.dump(model, model_file)
    return model
def lightgbm(x, y, type=''):
    """Train a LightGBM model on an 80/20 split, score it and pickle it.

    Fixes: the score helper called here did not exist
    (``caculate_pre_recall``); ``caculate_score(label, predict)`` is used
    instead. The model file is now closed after writing.

    Args:
        x: 2-D feature matrix.
        y: Labels aligned with ``x``.
        type: ``'error'`` saves the model as ``lgberror.dat``; any other
            value saves it under a name prefixed with the score.

    Returns:
        The fitted model.
    """
    class_weight = {
        0: 5,
        1: 1, }
    model = LGBMRegressor(num_leaves=50, max_depth=30, objective='multiclass', num_class=2, class_weight=class_weight,
                          n_estimators=10000, learning_rate=0.25)
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.2, random_state=10)
    x_test_valid, y_test_valid = shuffle_data(x_test_valid, y_test_valid)
    model.fit(x_train, y_train, early_stopping_rounds=500, eval_set=[(x_train, y_train), (x_test_valid, y_test_valid)],
              eval_metric='multi_error', verbose=True)

    # Evaluate on the held-out split.
    preds = model.predict(x_test_valid)
    test_result = np.array(turn_prob_to_label(preds, x_test_valid))
    print(test_result[0:10])
    print(y_test_valid[0:10])
    test_accuracy = accuracy_score(y_test_valid, test_result)
    f_score = caculate_score(y_test_valid, test_result)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))

    if type == 'error':
        model_path = 'lgberror.dat'
    else:
        model_path = str(f_score)[:4] + "lgboostmodel2.pickle.dat"
    with open(model_path, "wb") as model_file:
        pickle.dump(model, model_file)
    return model
def write_alert_csv(csv_data):
    """Write the rows with their danger score to ``anser_alerted.csv`` (GBK).

    Args:
        csv_data: Rows to write below the fixed header row; the header ends
            with the ``label`` and ``danger_score`` columns.
    """
    data_head = [
        ("request_id", "product_id", "product_1st_category", "product_2nd_category", "product_3rd_category", 'user_id',
         "post_province", "post_city", "post_town", "post_detail", "label", "danger_score")
    ]
    data = data_head + list(csv_data)
    # The context manager closes the file even if a row fails to encode.
    with codecs.open('anser_alerted.csv', 'w', 'gbk') as f:
        csv.writer(f).writerows(data)
def gererate_tfidf_featuer(filename='train_ori.csv'):
    """Build TF-IDF features from the address-detail column and save them.

    Column 9 of every row is split on spaces, converted to ints and
    padded/truncated to 40 values. The matrix is passed through
    ``TfidfTransformer(smooth_idf=False)`` and saved as
    ``tfidf_feature_<filename without .csv>.npy``.

    Args:
        filename: CSV file to read with ``read_csv``.
    """
    rows = read_csv(filename)
    tag = filename.replace('.csv', '')
    padded_details = [pad_numlist(strlist2numlist(row[9].split(' ')), 40) for row in rows]
    tfidf_matrix = TfidfTransformer(smooth_idf=False).fit_transform(padded_details)
    np.save('tfidf_feature_' + tag, tfidf_matrix.toarray())
def alert_answer_data(badword_dict=None, goodword_dict=None):
    """Add a word-based danger score to every row of ``submit2.csv``.

    Words in the address-detail column (column 9) raise the score by
    ``ratio / 10`` when their bad-word ratio is at least 10, and lower it by
    ``ratio / 10`` when their good-word ratio is at least 10. The rows are
    written with ``write_alert_csv``.

    Fixes:

    * The defaults were computed at definition time, which loaded pickle
      files on import; they are now computed on demand.
    * ``show_dangerous_word`` / ``show_good_word`` return sorted lists of
      pairs, which have no ``.keys()``; both arguments are normalised to
      dicts.
    * The good-word penalty subtracted the bad-word ratio (possibly of an
      earlier word, or unbound); it now subtracts the good-word ratio.
    * A debug ``print("hi")`` was removed.

    Args:
        badword_dict: Mapping (or list of pairs) from word to its label-1
            ratio; defaults to ``show_dangerous_word()``.
        goodword_dict: Mapping (or list of pairs) from word to its label-0
            ratio; defaults to ``show_good_word()``.
    """
    if badword_dict is None:
        badword_dict = show_dangerous_word()
    if goodword_dict is None:
        goodword_dict = show_good_word()
    badword_dict = dict(badword_dict)
    goodword_dict = dict(goodword_dict)

    csv_data = read_csv('submit2.csv')
    final_csv_row = []
    for each_row in csv_data:
        row_danger_score = 0
        for each_word in each_row[9].split(' '):
            each_word = int(each_word)
            dangerous_count = badword_dict.get(each_word)
            if dangerous_count is not None and dangerous_count >= 10:
                row_danger_score = row_danger_score + dangerous_count / 10
            good_count = goodword_dict.get(each_word)
            if good_count is not None and good_count >= 10:
                row_danger_score = row_danger_score - good_count / 10
        final_csv_row.append(each_row + [row_danger_score])
    write_alert_csv(final_csv_row)
def tpot(x, y, type=''):
    """Search a pipeline with TPOT on an 80/20 split, score it and pickle it.

    The best pipeline is also exported as ``tport_code.py``.

    Fixes: the score helper called here did not exist
    (``caculate_pre_recall``); ``caculate_score(label, predict)`` is used
    instead. The unused ``class_weight`` local was dropped (weights come
    from ``get_sample_weight``) and the model file is now closed after
    writing.

    Args:
        x: 2-D feature matrix.
        y: Labels aligned with ``x``.
        type: ``'error'`` saves the model as ``lgberror.dat``; any other
            value saves it under a name prefixed with the score.

    Returns:
        The fitted TPOT model.
    """
    model = TPOTClassifier(generations=10, population_size=20, verbosity=2,
                           config_dict='TPOT MDR')
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.2, random_state=10)
    x_test_valid, y_test_valid = shuffle_data(x_test_valid, y_test_valid)
    model.fit(x_train, y_train, sample_weight=get_sample_weight(y_train))
    model.score(x_test_valid, y_test_valid)
    model.export('tport_code.py')

    # Evaluate on the held-out split.
    preds = model.predict_proba(x_test_valid)
    test_result = np.array(turn_prob_to_label(preds, x_test_valid))
    print(test_result[0:10])
    print(y_test_valid[0:10])
    test_accuracy = accuracy_score(y_test_valid, test_result)
    f_score = caculate_score(y_test_valid, test_result)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))

    if type == 'error':
        model_path = 'lgberror.dat'
    else:
        model_path = str(f_score)[:4] + "lgboostmodel2.pickle.dat"
    with open(model_path, "wb") as model_file:
        pickle.dump(model, model_file)
    return model
#x,y=generate_train_feature_from_index([1])
#print(x.shape)
#print(y.shape)
# generate_error_data()
# print(load_picklefile('product_id.pkl'))
# train_file_name = "simple_sample.csv"
# gererate_tfidf_featuer()
# alert_answer_data()
# generate_feature()
# show_dangerous_word()
#csv_data1 =read_csv('train_ori.csv')
# csv_data2=read_csv('test.csv')
#handle_csv_data(csv_data1,type='train')
# np.sum(a)))
# generate_feature(csv_data1, "train")
#csv_data1 =read_csv('train_ori.csv')
#csv_data2=read_csv('test.csv')
#handle_csv_data(csv_data1,type='train')
# np.sum(a)))
#generate_feature(csv_data1, "train")
#generate_feature(csv_data2, "test")
# find_single_appear()
# generate_error_data()
# find_single_appear()
#lightgbm()
#csv_data2 = read_csv('train_ori.csv')
#generate_feature(csv_data2, "train")
#csv_data3 = read_csv('test.csv')
#generate_feature(csv_data2, "test")
# Script entry: load the saved training features and labels, then run the
# 5-fold LightGBM pipeline, which writes the submission CSV.
# The files loaded below are the ones generate_feature(..., "train") saves.
x = np.load('csvfeature_train.npy')
# print(x.shape)
# Optional experiment (disabled): append the TF-IDF features to the matrix.
# x=np.load('csvfeature_train.npy')
# x2=np.load('tfidf_feature_train_ori.npy')
# x=np.hstack((x,x2))
# print(x.shape)
# print(x.shape)
# Scratch check of set.add (disabled).
# a=set([4,5])
# a.add(6)
# print(a)
y = np.load('labelcollect_train.npy')
lgb_with_kfold(x,y)
#tpot(x,y)
# xgboost(x, y)
#model=lightgbm(x,y)
#lgb_with_kfold()
#cat_with_kfold(x,y)
#skf = StratifiedKFold(n_splits=5)
#for x,y in skf.split([1,2,3,4]*10,[1,2,3,4]*10):
# print(x)
# do_mymlp(x,y)
# model = pickle.load(open("0.95lgboostmodel2.pickle.dat", "rb"))
# loaded_model = pickle.load(open("pima.pickle.dat", "rb"))
# preds = model.predict(x)
# test_result = np.array(turn_prob_to_label(preds, x))
# print(test_result[0:10])
# print(y[0:10])
# test_accuracy = accuracy_score(y, test_result)
# print(test_accuracy)
# caculate_pre_recall(y, test_result)
# result_data=model.predict(x)
# result_label=turn_prob_to_label(result_data,x)
# answer_data=generate_answer_data(result_label)
# write_csv(answer_data)
# print(y)