CIS实验室内部的竞赛,在这里做一个记录保存下来
Good man
good man就是一个二分类问题,在数据中找到好人和坏人,好人标记为1,坏人标记为0。这道题的难点在于正样本和负样本严重的不均衡,训练集中 label 1 与 label 0 样本数比例为:184 : 61051(330 倍)。每个样本 202个特征数据,其中有部分数据缺失, ln_165 列的数据包含“-”字符, ln_166 列是字母与数字组合。
一、数据处理
1、 时间列处理
XXXX-XX-XX XX:XX:XX 变为 XXXXXXXXXXXXXX(去掉分隔符,保留 14 位数字)
# Normalize the timestamp column: "XXXX-XX-XX XX:XX:XX" -> 14-digit string
# (drop the separator characters, keep the digit slices).
# The original per-row `del data["time"][i]` followed by item assignment is
# the pandas chained-assignment anti-pattern and does O(n) scalar lookups;
# a single vectorized map performs the same transformation in one pass.
data['time'] = data['time'].map(
    lambda t: t[0:4] + t[5:7] + t[8:10] + t[11:13] + t[14:16] + t[17:19]
)
2、 信号列处理(字母变成 ASCII 对应的数字)
AQ2060E1 变为 65812060691
# Encode the signal column: letters are replaced by their ASCII codes and
# digits are kept, e.g. "AQ2060E1" -> "65" + "81" + "2060" + "69" + "1".
# Same chained-assignment anti-pattern as the time column; use one
# vectorized map instead of del-then-assign per row.
data['a'] = data['a'].map(
    lambda s: str(ord(s[0])) + str(ord(s[1])) + s[2:6] + str(ord(s[6])) + s[7]
)
3、缺失值填补:(由于原始数据中无 -1,填补 -1 不会对原样本造成干扰)
# Fill missing values with the sentinel -1 (the raw data never contains -1,
# so the sentinel cannot collide with a real value).
data = data.fillna(-1)  # fill with a fixed number
二、样本不均衡的处理
这里用了很笨的方法:将 label 为 1 的样例抽取出来,复制 330 倍后拼接到原数据中。
1、Label 为 1 的样本提取:
# Extract the positive (label == 1) samples.
# Row-by-row scalar access (`data['label'][i]` + `data.loc[i]` in a range
# loop) is O(n) pandas lookups; a boolean mask does the same selection in
# one vectorized pass while keeping the original row index.
positive = data[data['label'] == 1]
x = [row for _, row in positive.iterrows()]  # list of row Series, as before
data = pd.DataFrame(x)  # `data` now holds only the positive samples
2、样本扩充:
# Artificially oversample the positives by stacking copies of `x`.
# The original loop started from `new = x` (an alias, not a copy) and then
# concatenated x another 330 times, i.e. 331 copies in total; build that
# count explicitly in a single concatenate call.
new = np.concatenate([x] * 331, axis=0)
dataframe = pd.DataFrame(new)
# Shuffle the EXPANDED frame so the duplicated positives are mixed in.
# BUG FIX: the original called shuffle(data) — the un-expanded positives —
# leaving `dataframe` built but never shuffled/used.
df = shuffle(dataframe)
三、分类器选择
这里我用了 XGBoost 分类器,最开始尝试了 LR、KNN、GBDT、RF 分类器,由于当时没调参数,用默认参数跑的结果准确率都很低。最后使用 XGBoost准确率有提升。
数据读取:
# Load the training and test sets from local CSV files.
train = pd.read_csv("/Users/Cheney/Downloads/train.csv")
test = pd.read_csv("/Users/Cheney/Downloads/test.csv")
模型参数:
(二分类问题,所以 num_class=2,因为样本不均衡,参数调整的重点在:scale_pos_weight、learning_rate、min_child_weight)
# Booster parameters for xgb.train (passed later as list(params.items())).
xgb_model = xgb.XGBClassifier()  # NOTE(review): created but never used below; training goes through xgb.train
params = dict(booster='gbtree',
objective='multi:softmax',  # NOTE(review): softmax yields hard class labels, not probabilities — AUC computed from them is coarse; binary:logistic is the usual choice for 2 classes
num_class=2,
gamma=0,
reg_lambda = 0,
alpha = 0,
max_depth=7,
silent=0,
seed=1000,
scale_pos_weight = 1,  # NOTE(review): left at 1 despite the stated 330:1 imbalance — confirm this is intentional
learning_rate=0.03,
min_child_weight=3000,
subsample=0.7
)
交叉验证训练样本:
用部分训练集的样本对训练的模型进行验证
# Split the training data 70/30 into train / validation sets with a fixed
# random_state for reproducibility (sklearn.cross_validation-era API).
train_xy, val = train_test_split(train, test_size = 0.3,random_state=1)
y = train_xy.label                     # training labels
X = train_xy.drop(['label'],axis=1)    # training features
val_y = val.label                      # validation labels
val_X = val.drop(['label'],axis=1)     # validation features
XGB 矩阵赋值:
# Wrap the splits in XGBoost DMatrix containers.
xgb_val = xgb.DMatrix(val_X,label=val_y)
xgb_train = xgb.DMatrix(X, label=y)
xgb_test = xgb.DMatrix(test)  # no labels: used only for prediction
保存训练模型:(验证集上准确率 100 次迭代没发生变化就停止训练)
# early_stopping_rounds: with a large round budget, stop training once the
# validation metric has not improved for 100 consecutive rounds.
model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=100)
这道题的评分标准是用AUC,计算 AUC 得分:
print ('AUC: %.4f' % metrics.roc_auc_score(val_y, preds))
导出预测结果:
# Predict on the test set using the best iteration found by early stopping.
preds = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)
# Write one integer label per test row (ids 1..len(test)) to res.csv.
np.savetxt('/Users/Cheney/Downloads/res.csv',np.c_[range(1,len(test)+1),preds],delimiter=',',header='Label',comments='',fmt='%d')
完整代码:
"""Good-man binary classifier: XGBoost trained on the raw train.csv.

Pipeline: read data -> 70/30 train/validation split -> xgb.train with
early stopping -> AUC on the validation split -> integer predictions
for the test set written to res.csv.
"""
import numpy as np
import pandas as pd
import xgboost as xgb
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Load data.
train = pd.read_csv("/Users/Cheney/Downloads/train.csv")
test = pd.read_csv("/Users/Cheney/Downloads/test.csv")

# Booster parameters. NOTE(review): multi:softmax with num_class=2 yields
# hard class labels; binary:logistic would give probabilities and a more
# meaningful AUC — kept as-is to preserve the integer submission format.
params = dict(booster='gbtree',
              objective='multi:softmax',
              num_class=2,
              gamma=0,
              reg_lambda=0,
              alpha=0,
              max_depth=7,
              silent=0,
              seed=1000,
              scale_pos_weight=1,
              learning_rate=0.03,
              min_child_weight=3000,
              subsample=0.7)
plst = list(params.items())
num_rounds = 8000  # upper bound; early stopping ends training sooner

# 70/30 train / validation split, fixed seed for reproducibility.
train_xy, val = train_test_split(train, test_size=0.3, random_state=1)
y = train_xy.label
X = train_xy.drop(['label'], axis=1)
val_y = val.label
val_X = val.drop(['label'], axis=1)

# XGBoost data containers.
xgb_val = xgb.DMatrix(val_X, label=val_y)
xgb_train = xgb.DMatrix(X, label=y)
xgb_test = xgb.DMatrix(test)  # unlabeled: prediction only
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]

# Stop once the validation metric has not improved for 100 rounds.
model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=100)

# BUG FIX: AUC must be computed on predictions for the VALIDATION set; the
# original scored test-set predictions against val_y (length mismatch).
val_preds = model.predict(xgb_val, ntree_limit=model.best_ntree_limit)
print('AUC: %.4f' % metrics.roc_auc_score(val_y, val_preds))

# Test-set predictions and submission file (ids 1..len(test)).
preds = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)
np.savetxt('/Users/Cheney/Downloads/res.csv', np.c_[range(1, len(test) + 1), preds],
           delimiter=',', header='Label', comments='', fmt='%d')
Better man
题目分析:
better man 与 good man 都是不均衡样本的分类问题。 better man 要分三类, 样本悬殊更大,难度增加。训练集中 label 2、label 1 与 label 0 样本数比例为:88:94 : 61051(620 倍) 相较于 good man,我对之前的方法有所改进,采用 smote 函数对少的样本进行重采样。
一、数据处理
1、smote 函数
# SMOTE implementation
class Smote:
    """SMOTE oversampler: synthesizes minority-class samples by
    interpolating between each sample and one of its k nearest neighbors.

    N is an oversampling percentage: N // 100 synthetic rows are produced
    per original row.
    """

    def __init__(self, samples, N=10, k=5):
        self.n_samples, self.n_attrs = samples.shape
        self.N = N              # oversampling amount, in percent
        self.k = k              # number of neighbors considered per sample
        self.samples = samples
        self.newindex = 0       # write cursor into self.synthetic

    def over_sampling(self):
        """Return an array of n_samples * (N // 100) synthetic rows."""
        per_sample = int(self.N / 100)
        self.synthetic = np.zeros((self.n_samples * per_sample, self.n_attrs))
        knn = NearestNeighbors(n_neighbors=self.k).fit(self.samples)
        print('neighbors', knn)
        for idx in range(len(self.samples)):
            neighbor_ids = knn.kneighbors(self.samples[idx].reshape(1, -1),
                                          return_distance=False)[0]
            self._populate(per_sample, idx, neighbor_ids)
        return self.synthetic

    def _populate(self, N, i, nnarray):
        # For sample i, pick a random neighbor N times and place a new
        # point at a random position on the segment between them.
        for _ in range(N):
            pick = random.randint(0, self.k - 1)
            dif = self.samples[nnarray[pick]] - self.samples[i]
            gap = random.random()
            self.synthetic[self.newindex] = self.samples[i] + gap * dif
            self.newindex += 1
2、数据读取
# Load the data files; missing values are filled with the -1 sentinel.
# p1/p2 hold the rows later labeled 1 and 2 by the oversampling step.
a = pd.read_csv("/Users/Cheney/Downloads/HW2/p1.csv").fillna(-1)
b = pd.read_csv("/Users/Cheney/Downloads/HW2/p2.csv").fillna(-1)
c = pd.read_csv("/Users/Cheney/Downloads/HW2/train2.csv").fillna(-1)
d = pd.read_csv("/Users/Cheney/Downloads/HW2/test2.csv").fillna(-1)
3、转换成向量
# Strip the leading id/meta columns and keep the raw feature matrices.
d_data = d.values[:,2:]   # test features (first 2 columns dropped)
a_data = a.values[:,3:]   # minority-class features (first 3 columns dropped)
b_data = b.values[:,3:]
c_data = c.values[:,3:]
# NOTE(review): in this fragment `train_label` is used before it is defined;
# the complete script assigns train_label = c['label'] first.
train_label = train_label.values.reshape([-1,1]).astype(int)  # int column vector
4、smote 函数重采样
# SMOTE-oversample the two minority classes. N is a percentage:
# 40000 -> 400 synthetic rows per original row, 45000 -> 450.
s1 = Smote(a_data, N=40000).over_sampling()
s2 = Smote(b_data, N=45000).over_sampling()
# Labels for the synthetic rows: 1 for s1, 2 for s2.
# BUG FIX: the np.int alias was removed in NumPy 1.24 (AttributeError);
# the builtin int is the documented replacement.
train_label1_append = np.ones((len(s1), 1), int)
train_label2_append = np.ones((len(s2), 1), int) * 2
5、数据拼接
# Append the synthetic rows and their labels to the training matrices
# (row order of features and labels stays aligned).
c_data = np.vstack((c_data,s1))
c_data = np.vstack((c_data,s2))
train_label = np.vstack((train_label,train_label1_append))
train_label = np.vstack((train_label,train_label2_append))
二、调参数
(主要是 learning_rate、 min_child_weight、 subsample 三个参数)
# Booster parameters for the 3-class problem (passed later to xgb.train).
xgb_model = xgb.XGBClassifier()  # NOTE(review): created but never used; training uses xgb.train below
params = dict(booster='gbtree',
objective='multi:softmax',
num_class=3,
gamma=1,
reg_lambda=1,
alpha=1,
max_depth=5,
silent=0,
seed=1000,
scale_pos_weight=1,  # NOTE(review): scale_pos_weight only affects binary objectives — no effect with num_class=3
learning_rate=0.03,
min_child_weight=4500,
subsample=0.7
)
完整代码:
from sklearn import metrics
import pickle as pickle
from sklearn.neighbors import NearestNeighbors
import numpy as np
import random
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle
from sklearn import metrics
# SMOTE implementation ===================================================
class Smote:
    """Synthetic Minority Over-sampling TEchnique.

    Generates N//100 synthetic rows per original row by interpolating
    between a sample and a randomly chosen one of its k nearest neighbors.
    """

    def __init__(self, samples, N=10, k=5):
        self.n_samples, self.n_attrs = samples.shape
        # N is a percentage (e.g. 40000 -> 400 new rows per sample).
        self.N = N
        self.k = k
        self.samples = samples
        self.newindex = 0  # next free slot in self.synthetic

    def over_sampling(self):
        """Build and return the synthetic sample matrix."""
        n_per_row = int(self.N / 100)
        self.synthetic = np.zeros((self.n_samples * n_per_row, self.n_attrs))
        nn_model = NearestNeighbors(n_neighbors=self.k).fit(self.samples)
        print('neighbors', nn_model)
        for row in range(len(self.samples)):
            query = self.samples[row].reshape(1, -1)
            nn_ids = nn_model.kneighbors(query, return_distance=False)[0]
            self._populate(n_per_row, row, nn_ids)
        return self.synthetic

    # For each minority-class sample, choose among the k nearest neighbors
    # and generate N synthetic samples along the connecting segments.
    def _populate(self, N, i, nnarray):
        for _ in range(N):
            chosen = random.randint(0, self.k - 1)
            delta = self.samples[nnarray[chosen]] - self.samples[i]
            frac = random.random()
            self.synthetic[self.newindex] = self.samples[i] + frac * delta
            self.newindex += 1
# Data loading ===========================================================
# Missing values are filled with the -1 sentinel at read time.
a = pd.read_csv("/Users/Cheney/Downloads/HW2/p1.csv").fillna(-1)
b = pd.read_csv("/Users/Cheney/Downloads/HW2/p2.csv").fillna(-1)
c = pd.read_csv("/Users/Cheney/Downloads/HW2/train2.csv").fillna(-1)
d = pd.read_csv("/Users/Cheney/Downloads/HW2/test2.csv").fillna(-1)
train_label = c['label']  # ground-truth labels of the training set
# To feature matrices ====================================================
d_data = d.values[:,2:]   # test features: drop the first 2 (id/meta) columns
a_data = a.values[:,3:]   # minority-class rows: drop the first 3 columns
b_data = b.values[:,3:]
c_data = c.values[:,3:]
train_label = train_label.values.reshape([-1,1]).astype(int)  # int column vector
# SMOTE resampling =======================================================
# N is a percentage: 40000 -> 400 synthetic rows per original row.
s1 = Smote(a_data, N=40000).over_sampling()
s2 = Smote(b_data, N=45000).over_sampling()
# Labels for the synthetic rows (1 for s1, 2 for s2).
# BUG FIX: the np.int alias was removed in NumPy 1.24; use the builtin int.
train_label1_append = np.ones((len(s1), 1), int)
train_label2_append = np.ones((len(s2), 1), int) * 2
# Concatenate synthetic data onto the training set ======================
c_data = np.vstack((c_data, s1))
c_data = np.vstack((c_data, s2))
train_label = np.vstack((train_label, train_label1_append))
train_label = np.vstack((train_label, train_label2_append))
# NOTE(review): shuffling c_data and train_label independently (as the
# removed commented-out code did) would break row/label alignment;
# train_test_split below already shuffles both arrays together.
# XGBoost model ==========================================================
# NOTE(review): `silent` is deprecated in newer xgboost (use verbosity);
# kept for compatibility with the version this was written against.
params = dict(booster='gbtree',
              objective='multi:softmax',   # 3-class -> hard label output
              num_class=3,
              gamma=1,
              reg_lambda=1,
              alpha=1,
              max_depth=5,
              silent=0,
              seed=1000,
              scale_pos_weight=1,  # no effect for multi-class objectives
              learning_rate=0.03,
              min_child_weight=4500,
              subsample=0.7)
plst = list(params.items())
num_rounds = 8000  # upper bound; early stopping ends training sooner

# 80/20 train / validation split; splitting features and labels together
# keeps the rows aligned and shuffles them consistently.
train_x, val_X, train_y, val_y = train_test_split(c_data, train_label, test_size=0.2, random_state=1)

# XGBoost data containers.
xgb_val = xgb.DMatrix(val_X, label=val_y)
xgb_train = xgb.DMatrix(train_x, label=train_y)
xgb_test = xgb.DMatrix(d_data)  # unlabeled: prediction only
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]

# Training: stop once the validation metric has not improved for 200 rounds.
model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=200)
print("best best_ntree_limit", model.best_ntree_limit)
print("跑到这里了model.predict")
# Predict hard class labels for the test set at the best iteration.
preds = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)
# Write one integer label per test row (ids 1..len(d_data)) to res.csv.
np.savetxt('/Users/Cheney/Downloads/res.csv', np.c_[range(1, len(d_data) + 1), preds], delimiter=',', header='Label',
           comments='', fmt='%d')