1、排序
# Rows of train_new ordered by the imeimd5 column. sort_values returns a
# sorted copy; the result is not assigned, so train_new itself is unchanged.
train_new.sort_values(by=['imeimd5'])
# Largest imeimd5 value, read from the sorted copy.
train_new.sort_values(by=['imeimd5'])['imeimd5'].max()
# Distinct imeimd5 values that occur in the training frame.
train_ime = train_new.loc[:, 'imeimd5'].unique()
2、迭代器进度条:tqdm
tqdm
# Count how many imeimd5 values of the test frame also occur in the training
# frame, showing a notebook progress bar while iterating.
# `x in ndarray` scans the whole array on every test, so the distinct training
# values are put into a set once to get constant-time membership checks.
# NOTE(review): assumes the imeimd5 values are hashable (e.g. strings) — confirm.
train_ime_set = set(train_ime)
cnt = 0
for i in tqdm.tqdm_notebook(test_new['imeimd5'].values):
    if i in train_ime_set:
        cnt += 1
3、numpy保存和读取
import numpy as np
# Reading the saved dict back: np.save stores the dict as a pickled object
# array, so loading needs allow_pickle=True and .item() to unwrap it.
# a = np.load('ip_dict.npy', allow_pickle=True)
# data = a.item()

# Per-IP label statistics: mean, number of labelled rows, and label sum.
# Named aggregation is used because passing a {'new_name': func} dict to
# SeriesGroupBy.agg was removed in pandas 1.0 (it raises SpecificationError).
temp = (
    train[['ip', 'label']]
    .groupby('ip')['label']
    .agg(mean_label='mean', count_label='count', sum_label='sum')
    .reset_index()
)

# Keep only the IPs whose mean label is exactly 0 or exactly 1.
# groupby yields each ip once, so no duplicate-key check is needed.
ip_dict = {}
for k, v in tqdm.tqdm_notebook(temp[['ip', 'mean_label']].values):
    if v == 1 or v == 0:
        ip_dict[k] = v

# Persist the mapping ip -> mean label next to the notebook.
np.save('ip_dict.npy', ip_dict)
4、统计+unique+ratio
!pip install tqdm
import tqdm
def count_ratio_fea_1(data):
    """Add a frequency column per feature and blank out rare values.

    For every column name in the module-level ``arr1``, a new column
    ``<name>_count`` holds how often each row's value occurs in that column.
    Values whose frequency is below 5 are then replaced with ``None`` in the
    source column (the count column keeps the original frequency).

    ``data`` is modified in place and also returned.
    """
    for col in tqdm.tqdm_notebook(arr1):
        count_col = col + '_count'
        frequencies = data[col].value_counts()
        data[count_col] = data[col].map(frequencies)
        is_rare = data[count_col] < 5
        data.loc[is_rare, col] = None
    return data
def count_ratio_fea_2(data):
    """Add pair-frequency and pair/single ratio columns for feature pairs.

    For every ``i`` in the module-level ``arr1`` and ``j`` in ``arr2``:

    * ``<i>_<j>_count`` - how often the (i, j) value pair of the row occurs,
      with the pair built by joining both values as strings with ``_``;
    * ``<i>_<j>_ratio`` - pair count divided by ``<i>_count``;
    * ``<j>_<i>_ratio`` - pair count divided by ``<j>_count``.

    The ``<i>_count`` and ``<j>_count`` columns must already exist in
    ``data``; presumably they come from ``count_ratio_fea_1`` - verify that
    the ``arr2`` columns have count columns as well.

    ``data`` is modified in place and also returned.
    """
    for i in arr1:
        for j in tqdm.tqdm_notebook(arr2):
            pair_count_col = i + '_' + j + '_count'
            # Combined key, e.g. "<value of i>_<value of j>", built once per pair.
            pair_key = data[i].astype(str) + '_' + data[j].astype(str)
            data[pair_count_col] = pair_key.map(pair_key.value_counts())
            pair_count = data[pair_count_col]
            data[i + '_' + j + '_ratio'] = pair_count / data[i + '_count']
            data[j + '_' + i + '_ratio'] = pair_count / data[j + '_count']
    return data
def unique_fea(data):
    """Attach distinct-value counts between every feature pair.

    For every ``i`` in the module-level ``arr1`` and ``j`` in ``arr2`` two
    columns are left-merged onto ``data``:

    * ``<i>_<j>_nunqiue`` - number of distinct ``j`` values per ``i`` value;
    * ``<j>_<i>_nunqiue`` - number of distinct ``i`` values per ``j`` value.

    The ``_nunqiue`` suffix is spelled exactly as in the original column
    names so that existing consumers of those columns keep working.

    Returns the merged frame; the input frame object is not modified.
    """
    for i in arr1:
        for j in tqdm.tqdm_notebook(arr2):
            j_per_i = data.groupby(i)[j].nunique().reset_index()
            j_per_i = j_per_i.rename(columns={j: i + '_' + j + '_nunqiue'})
            data = data.merge(j_per_i, on=i, how='left')

            i_per_j = data.groupby(j)[i].nunique().reset_index()
            i_per_j = i_per_j.rename(columns={i: j + '_' + i + '_nunqiue'})
            data = data.merge(i_per_j, on=j, how='left')
    return data
5、历史点击率
(1)前1天
# Previous-day click rate per categorical feature value.
# For each feature in label_feature and each day in 26..32, the labelled rows
# of the day before are aggregated per feature value into a count column
# `<feature>count_day_<day>` and a rate column `<feature>_rate_day`, which are
# then left-merged back onto `data` on (feature, day).
label_feature = ['age', 'city', 'gender', 'netType']
cnt = 1
for feat_sum in tqdm.tqdm_notebook(label_feature):
    # Blank out values that occur 3 times or fewer; groupby skips missing keys,
    # so rare values get no history rows.
    value_freq = data[feat_sum].map(data[feat_sum].value_counts())
    data.loc[value_freq <= 3, feat_sum] = None
    print('feature %d' % cnt + ':%s' % feat_sum)
    cnt += 1
    gc.collect()

    temp = data[[feat_sum, 'day', 'label']]
    daily_frames = []
    for day in range(26, 33):  # the first day (26) has no usable history
        print('%s ctr: ' % feat_sum + 'the day is:', day)

        # Count and sum must cover the SAME window. The original used
        # `day < day - 1` for the count and `day - 1 < day < day` (empty for
        # integer days) for the sum, so the two never overlapped.
        in_window = (temp['day'] >= day - 1) & (temp['day'] < day)
        # Labels outside the window become NaN, so count/sum ignore them while
        # every feature value still gets a row (count 0 when it has no history).
        stats = (
            temp['label'].where(in_window)
            .groupby(temp[feat_sum])
            .agg(['count', 'sum'])
        )

        # Column name (without an underscore before "count") kept as in the
        # original so downstream column references stay valid.
        count_col = feat_sum + 'count_day_%d' % day
        count = stats['count'].reset_index(name=count_col)
        count['day'] = day
        if day == 26:
            # No previous day to look at: the rate is undefined here and is
            # zero-filled below like every other undefined rate.
            count[feat_sum + '_rate_day'] = float('nan')
        else:
            # NOTE(review): the constant 2 is added to the ratio itself (not to
            # the denominator); kept from the original - confirm the intent.
            count[feat_sum + '_rate_day'] = (
                stats['sum'].values / count[count_col] + 2
            ).round(5)
        # Undefined rates (first day, or 0 / 0) become 0.
        count.fillna(value=0, inplace=True)
        daily_frames.append(count)
        print('day:%d done!' % day)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    res = pd.concat(daily_frames, ignore_index=True)
    print(feat_sum + '_rate_day' + ' done! ' + feat_sum + ' over!\n')
    data = pd.merge(data, res, how='left', on=[feat_sum, 'day'])
(2)前n天
# Cumulative historical click rate per categorical feature value, days 26-32.
# For each feature in label_feature and each day, all labelled rows from
# strictly earlier days are aggregated per feature value into a count column
# `<feature>_count_total_day_<day>` and a rate column
# `<feature>_rate_day_total`, which are left-merged back onto `data` on
# (feature, day).
label_feature = ['age', 'city', 'gender', 'netType']
cnt = 1
for feat_sum in tqdm.tqdm_notebook(label_feature):
    # Blank out values that occur 3 times or fewer; groupby skips missing keys,
    # so rare values get no history rows.
    value_freq = data[feat_sum].map(data[feat_sum].value_counts())
    data.loc[value_freq <= 3, feat_sum] = None
    cnt += 1
    gc.collect()

    temp = data[[feat_sum, 'day', 'label']]
    daily_frames = []
    for day in range(26, 33):  # only the first day has an undefined rate
        # Labels of rows on or after `day` become NaN, so count/sum only see
        # earlier days while every feature value still gets a row.
        stats = (
            temp['label'].where(temp['day'] < day)
            .groupby(temp[feat_sum])
            .agg(['count', 'sum'])
        )

        count_col = feat_sum + '_count_total_day_%d' % day
        count = stats['count'].reset_index(name=count_col)
        count['day'] = day
        if day == 26:
            # First day: the rate is left undefined and is zero-filled below,
            # which is what the original None + fillna(0) produced.
            count[feat_sum + '_rate_day_total'] = float('nan')
        else:
            # NOTE(review): the constant 2 is added to the ratio itself (not to
            # the denominator); kept from the original - confirm the intent.
            count[feat_sum + '_rate_day_total'] = (
                stats['sum'].values / count[count_col] + 2
            ).round(5)
        # Undefined rates (first day, or 0 / 0) become 0.
        count.fillna(value=0, inplace=True)
        daily_frames.append(count)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    res = pd.concat(daily_frames, ignore_index=True)
    print(feat_sum + '_rate_day_total' + ' done! ' + feat_sum + ' over!\n')
    data = pd.merge(data, res, how='left', on=[feat_sum, 'day'])