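# The feature-engineering part follows approaches shared by top competitors. This is not the actual competition code, and the feature engineering can still be improved; the best result in this competition, after model ensembling, was rank 96 (of about 5000 teams).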
#coding=utf-8
import pandas as pd
import numpy as np
import scipy as sp
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
import xgboost as xgb
from sklearn import svm
from sklearn.pipeline import Pipeline
import warnings
# 0.0873405
warnings.filterwarnings("ignore")
import time
def timestamp_datetime(value):
    """Format a Unix timestamp as a ``'YYYY-MM-DD HH:MM:SS'`` string.

    ``value`` is converted with ``time.localtime``, so the result is
    expressed in the local time zone of the machine running the script.
    """
    local_struct = time.localtime(value)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_struct)
# Evaluation: compute the log loss
def logloss(act, pred):
    """Return the mean binary log loss of ``pred`` against ``act``.

    Args:
        act: array-like of true labels (0 or 1).
        pred: array-like of predicted probabilities for label 1, same
            length as ``act``.

    Returns:
        The average of ``-(act*log(pred) + (1-act)*log(1-pred))``.

    Raises:
        ValueError: if ``act`` is empty (the mean is undefined).

    The NumPy functions are called through ``np`` rather than through the
    top-level ``scipy`` namespace: those aliases (``sp.log``, ``sp.maximum``,
    ...) were deprecated by SciPy and are absent from recent releases.
    """
    epsilon = 1e-15
    act = np.asarray(act, dtype=float)
    if act.size == 0:
        raise ValueError("logloss() requires at least one sample")
    # Clip so that log() never receives exactly 0 or 1.
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    total = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    return total * -1.0 / len(act)
def base_process(data):
    """Encode the raw item, user, context and shop columns of ``data``.

    The frame is modified in place and also returned.  In order, this:

    * adds token counts for the ';'-separated list columns and label-encodes
      a fixed number of their leading tokens into new columns;
    * replaces the value -1 in several level/id columns with a fixed
      substitute (presumably -1 marks a missing value -- confirm against the
      data description);
    * label-encodes the id/level columns;
    * derives ``day`` and ``hour`` from ``context_timestamp`` (local time);
    * deletes the raw list columns and the timestamp column.
    """
    encoder = preprocessing.LabelEncoder()

    def token_count(raw):
        # Number of ';'-separated tokens in the stringified value.
        return len(str(raw).split(';'))

    def token_at(raw, position):
        # The ``position``-th ';'-separated token, or '' when there is none.
        parts = str(raw).split(';')
        if len(parts) > position:
            return str(parts[position])
        return ''

    def split_and_encode(source_col, positions):
        # One label-encoded column per requested token position.
        for position in positions:
            tokens = data[source_col].map(lambda raw: token_at(raw, position))
            data[source_col + str(position)] = encoder.fit_transform(tokens)

    def swap_sentinel(column, substitute):
        # Replace every -1 in ``column`` with ``substitute``.
        data[column] = data[column].apply(lambda v: substitute if v == -1 else v)

    def encode_columns(columns):
        for column in columns:
            data[column] = encoder.fit_transform(data[column])

    print(
        '--------------------------------------------------------------item--------------------------------------------------------------')
    data['len_item_category'] = data['item_category_list'].map(token_count)
    data['len_item_property'] = data['item_property_list'].map(token_count)
    swap_sentinel('item_sales_level', 12)
    # Token 0 of item_category_list is skipped: per the original author's
    # note it is identical for every row.
    split_and_encode('item_category_list', range(1, 3))
    split_and_encode('item_property_list', range(10))
    encode_columns(['item_id', 'item_brand_id', 'item_city_id'])
    del data['item_category_list']
    del data['item_property_list']

    print(
        '--------------------------------------------------------------user--------------------------------------------------------------')
    swap_sentinel('user_gender_id', 0)
    swap_sentinel('user_age_level', 1003)
    swap_sentinel('user_occupation_id', 2005)
    swap_sentinel('user_star_level', 3006)
    encode_columns(['user_id', 'user_gender_id', 'user_age_level',
                    'user_occupation_id', 'user_star_level'])

    print(
        '--------------------------------------------------------------context--------------------------------------------------------------')
    # 'realtime' is a temporary column; it is removed again below.
    data['realtime'] = data['context_timestamp'].apply(timestamp_datetime)
    data['realtime'] = pd.to_datetime(data['realtime'])
    data['day'] = data['realtime'].dt.day
    data['hour'] = data['realtime'].dt.hour
    data['len_predict_category_property'] = data['predict_category_property'].map(token_count)
    split_and_encode('predict_category_property', range(5))
    encode_columns(['context_page_id', 'context_id'])
    del data['predict_category_property']
    del data['context_timestamp']
    del data['realtime']

    print(
        '--------------------------------------------------------------shop--------------------------------------------------------------')
    # NOTE: replacing -1 with 1 in the shop score / positive-rate columns was
    # tried here by the original author and left disabled.
    encode_columns(['shop_id', 'shop_star_level'])
    return data
def map_hour(x):
    """Map an hour value to a coarse period code.

    Returns 1 for 7..12, 2 for 13..20 and 3 for anything else.
    """
    if 7 <= x <= 12:
        return 1
    if 13 <= x <= 20:
        return 2
    return 3
def shijian(data):
    """Add an ``hour_map`` column holding ``map_hour`` of each ``hour`` value.

    The frame is modified in place and returned.
    """
    hour_codes = data['hour'].map(map_hour)
    data['hour_map'] = hour_codes
    return data
def deliver(x):
    """Bucket a (scaled) delivery score into an integer code.

    Buckets are 0.1 wide starting at 4.1; the k-th bucket (k = 1..19) maps
    to code k + 1, and the first matching bucket wins on shared boundaries.
    The value -5 maps to code 1.  Any other value yields ``None``.
    """
    start = 4.1
    width = 0.1
    for code in range(2, 21):
        lower = start + width * (code - 2)
        upper = start + width * (code - 1)
        if lower <= x <= upper:
            return code
    return 1 if x == -5 else None
def deliver1(x):
    """Collapse a ``deliver`` code into three groups.

    Returns 1 for 2..5, 2 for 6..7 and 3 for anything else.
    """
    if 2 <= x <= 5:
        return 1
    if 6 <= x <= 7:
        return 2
    return 3
def review(x):
    """Bucket a positive-review rate into an integer code.

    Buckets are 0.02 wide starting at 0.714; the k-th bucket (k = 1..29)
    maps to code k + 1, and the first matching bucket wins on shared
    boundaries.  The value -1 maps to code 1.  Any other value yields
    ``None``.
    """
    start = 0.714
    width = 0.02
    for code in range(2, 31):
        lower = start + width * (code - 2)
        upper = start + width * (code - 1)
        if lower <= x <= upper:
            return code
    return 1 if x == -1 else None
def review1(x):
    """Collapse a ``review`` code into three groups.

    Returns 1 for 2..12, 2 for 13..15 and 3 for anything else.
    """
    if 2 <= x <= 12:
        return 1
    if 13 <= x <= 15:
        return 2
    return 3
def service(x):
    """Bucket a (scaled) service score into an integer code.

    Buckets are 0.1 wide starting at 3.93; the k-th bucket (k = 1..19) maps
    to code k + 1, and the first matching bucket wins on shared boundaries.
    The value -1 maps to code 1.  Any other value yields ``None``.
    """
    start = 3.93
    width = 0.1
    for code in range(2, 21):
        lower = start + width * (code - 2)
        upper = start + width * (code - 1)
        if lower <= x <= upper:
            return code
    return 1 if x == -1 else None
def service1(x):
    """Collapse a ``service`` code into three groups.

    Returns 1 for 2..7, 2 for 8..9 and 3 for anything else.
    """
    if 2 <= x <= 7:
        return 1
    if 8 <= x <= 9:
        return 2
    return 3
def describe(x):
    """Bucket a (scaled) description score into an integer code.

    Buckets are 0.1 wide starting at 3.93; the k-th bucket (k = 1..29) maps
    to code k + 1, and the first matching bucket wins on shared boundaries.
    The value -1 maps to code 1.  Any other value yields ``None``.
    """
    start = 3.93
    width = 0.1
    for code in range(2, 31):
        lower = start + width * (code - 2)
        upper = start + width * (code - 1)
        if lower <= x <= upper:
            return code
    return 1 if x == -1 else None
def describe1(x):
    """Collapse a ``describe`` code into three groups.

    Returns 1 for 2..8, 2 for 9..10 and 3 for anything else.
    """
    if 2 <= x <= 8:
        return 1
    if 9 <= x <= 10:
        return 2
    return 3
def shop_fenduan(data):
    """Replace the four raw shop score columns with bucketed features.

    Rows whose delivery, service or description score equals -5 after
    scaling by 5, or whose positive-review rate equals -1, are dropped.
    For the remaining rows the raw columns are replaced by ``deliver_map``,
    ``service_map``, ``de_map`` and ``review_map`` (in that order), followed
    by ``normal_shop``, which is 1 when all four coarse groups equal 3 and
    0 otherwise.

    Args:
        data: frame containing ``shop_score_delivery``,
            ``shop_score_service``, ``shop_score_description`` and
            ``shop_review_positive_rate``.

    Returns:
        A new, filtered frame.  The input frame is left untouched (the
        previous implementation scaled ``shop_score_delivery`` in the
        caller's frame as a side effect).
    """
    delivery = data['shop_score_delivery'] * 5
    service_score = data['shop_score_service'] * 5
    description = data['shop_score_description'] * 5
    keep = ((delivery != -5)
            & (service_score != -5)
            & (description != -5)
            & (data['shop_review_positive_rate'] != -1))
    # One filter and one explicit copy, so the column assignments below act
    # on an independent frame instead of on a slice of the input.
    data = data[keep].copy()

    # (raw column, scale, fine bucketing, fine column, coarse grouping,
    #  coarse column) -- processed in this order to keep the column layout.
    specs = [
        ('shop_score_delivery', 5, deliver, 'deliver_map', deliver1, 'deliver_map1'),
        ('shop_score_service', 5, service, 'service_map', service1, 'service_map1'),
        ('shop_score_description', 5, describe, 'de_map', describe1, 'de_map1'),
        ('shop_review_positive_rate', None, review, 'review_map', review1, 'review_map1'),
    ]
    for raw_col, scale, fine_fn, fine_col, coarse_fn, coarse_col in specs:
        values = data[raw_col] if scale is None else data[raw_col] * scale
        data[fine_col] = values.apply(fine_fn)
        data[coarse_col] = data[fine_col].apply(coarse_fn)
        del data[raw_col]

    # Vectorized replacement for the former row-wise apply(axis=1).
    all_in_group_three = ((data['deliver_map1'] == 3)
                          & (data['service_map1'] == 3)
                          & (data['de_map1'] == 3)
                          & (data['review_map1'] == 3))
    data['normal_shop'] = all_in_group_three.astype('int64')

    # The coarse groups were only needed to derive normal_shop.
    for coarse_col in ['de_map1', 'service_map1', 'deliver_map1', 'review_map1']:
        del data[coarse_col]
    return data
def slide_cnt(data):
    """Add per-user/item/shop row counts taken from earlier days.

    For every day d in 19..25 two families of columns are filled for the
    rows of that day:

    * ``user_cnt1`` / ``item_cnt1`` / ``shop_cnt1``: counts over the rows of
      day d - 1 only;
    * ``user_cntx`` / ``item_cntx`` / ``shop_cntx``: counts over all rows
      with ``day < d``.

    Counts are joined back on ``instance_id``; rows that receive no value
    (for example rows of other days) get 0.  Returns the merged frame.
    """
    def attach_counts(frame, is_history, suffix, user_counted_col):
        new_cols = ['user_cnt' + suffix, 'item_cnt' + suffix, 'shop_cnt' + suffix]
        sources = [('user_id', user_counted_col),
                   ('item_id', 'instance_id'),
                   ('shop_id', 'instance_id')]
        per_day = []
        for d in range(19, 26):
            past = frame[is_history(frame['day'], d)]
            today = frame[frame['day'] == d]
            lookups = [past.groupby(by=key)[counted].count().to_dict()
                       for key, counted in sources]
            for new_col, (key, _), seen in zip(new_cols, sources, lookups):
                today[new_col] = today[key].apply(lambda k, seen=seen: seen.get(k, 0))
            per_day.append(today[new_cols + ['instance_id']])
        # Latest day first, matching the order the pieces were stacked in.
        stacked = pd.concat(per_day[::-1])
        return pd.merge(frame, stacked, on=['instance_id'], how='left')

    print('当前日期前一天的cnt')
    # The user count of the previous day is taken over 'context_id'; the
    # item and shop counts are taken over 'instance_id'.
    data = attach_counts(data, lambda days, d: days == d - 1, '1', 'context_id')

    print('当前日期之前的cnt')
    data = attach_counts(data, lambda days, d: days < d, 'x', 'instance_id')

    for col in ['user_cnt1', 'item_cnt1', 'shop_cnt1', 'user_cntx', 'item_cntx', 'shop_cntx']:
        data[col] = data[col].fillna(0)
    return data
def zuhe(data):
    """Add pairwise combination features built by digit concatenation.

    Each listed level column is converted to ``str``, pairs of columns are
    concatenated as strings into new columns, and finally both the source
    and the combined columns are converted to ``int``.  The frame is
    modified in place and returned.
    """
    source_cols = ['item_sales_level', 'item_price_level', 'item_collected_level',
                   'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level',
                   'shop_review_num_level', 'shop_star_level']
    for name in source_cols:
        data[name] = data[name].astype(str)

    # (progress message, [(new column, left source, right source), ...])
    groups = [
        ('item两两组合', [
            ('sale_price', 'item_sales_level', 'item_price_level'),
            ('sale_collect', 'item_sales_level', 'item_collected_level'),
            ('price_collect', 'item_price_level', 'item_collected_level'),
        ]),
        ('user两两组合', [
            ('gender_age', 'user_gender_id', 'user_age_level'),
            ('gender_occ', 'user_gender_id', 'user_occupation_id'),
            ('gender_star', 'user_gender_id', 'user_star_level'),
        ]),
        ('shop两两组合', [
            ('review_star', 'shop_review_num_level', 'shop_star_level'),
        ]),
    ]
    for message, pairs in groups:
        print(message)
        for combined, left, right in pairs:
            data[combined] = data[left] + data[right]

    back_to_int = ['item_sales_level', 'item_price_level', 'item_collected_level',
                   'sale_price', 'sale_collect', 'price_collect',
                   'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level',
                   'gender_age', 'gender_occ', 'gender_star',
                   'shop_review_num_level', 'shop_star_level', 'review_star']
    for name in back_to_int:
        data[name] = data[name].astype(int)
    return data
def item(data):
    """Add count and share features between item attributes.

    For each "parent" attribute (item id, brand, city, price level, sales
    level, collected level) and each listed "child" attribute, two columns
    are added:

    * ``<child>_<tag>_cnt``: number of rows sharing the same
      (child, parent) value pair;
    * ``<child>_<tag>_prob``: that count divided by the number of rows
      sharing the parent value.

    Rows are counted through ``instance_id``.  Returns the merged frame; the
    temporary per-parent total columns are removed again.

    Two defects of the previous version are fixed here:

    * the price-level section grouped by ``item_city_id`` instead of
      ``item_price_level``, so its ``*_price_cnt`` / ``*_price_prob``
      columns did not describe price levels at all;
    * counts were named through ``agg({name: 'count'})`` on a selected
      column, which recent pandas versions reject; the count is now renamed
      explicitly, which works on old and new versions alike.
    """
    def count_rows(frame, keys, out_col):
        # Rows per key combination, returned as a frame: keys + out_col.
        counts = frame.groupby(keys, as_index=False)['instance_id'].count()
        return counts.rename(columns={'instance_id': out_col})

    # (message, parent key, total column, tag used in names, child columns)
    sections = [
        ('一个item有多少brand,price salse collected level……',
         'item_id', 'item_cnt', 'item',
         ['item_brand_id', 'item_city_id', 'item_price_level',
          'item_sales_level', 'item_collected_level', 'item_pv_level']),
        ('一个brand有多少price salse collected level……',
         'item_brand_id', 'item_brand_cnt', 'brand',
         ['item_city_id', 'item_price_level', 'item_sales_level',
          'item_collected_level', 'item_pv_level']),
        ('一个city有多少item_price_level,item_sales_level,item_collected_level,item_pv_level',
         'item_city_id', 'item_city_cnt', 'city',
         ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level']),
        ('一个price有多少item_sales_level,item_collected_level,item_pv_level',
         'item_price_level', 'item_price_cnt', 'price',
         ['item_sales_level', 'item_collected_level', 'item_pv_level']),
        ('一个item_sales_level有多少item_collected_level,item_pv_level',
         'item_sales_level', 'item_salse_cnt', 'salse',
         ['item_collected_level', 'item_pv_level']),
        ('一个item_collected_level有多少item_pv_level',
         'item_collected_level', 'item_coll_cnt', 'coll',
         ['item_pv_level']),
    ]
    for message, parent, total_col, tag, children in sections:
        print(message)
        data = pd.merge(data, count_rows(data, [parent], total_col), on=[parent], how='left')
        for col in children:
            cnt_col = str(col) + '_' + tag + '_cnt'
            data = pd.merge(data, count_rows(data, [col, parent], cnt_col),
                            on=[col, parent], how='left')
            data[str(col) + '_' + tag + '_prob'] = data[cnt_col] / data[total_col]
        del data[total_col]
    return data
def user(data):
    """Add per-user count and share features for the user attributes.

    For each of gender, age level, occupation and star level this adds
    ``<attr>_user_cnt`` (rows sharing the same (attr, user_id) pair) and
    ``<attr>_user_prob`` (that count divided by the user's total row
    count).  Rows are counted through ``instance_id``.  Returns the merged
    frame; the temporary ``user_cnt`` column is removed again.

    Counts used to be named through ``agg({name: 'count'})`` on a selected
    column, which recent pandas versions reject; the count is now renamed
    explicitly, which works on old and new versions alike.

    NOTE: analogous features keyed by gender, age level and occupation
    (instead of user_id) were tried by the original author and left
    disabled; they are not computed here.
    """
    def count_rows(frame, keys, out_col):
        # Rows per key combination, returned as a frame: keys + out_col.
        counts = frame.groupby(keys, as_index=False)['instance_id'].count()
        return counts.rename(columns={'instance_id': out_col})

    print('用户有多少性别')
    data = pd.merge(data, count_rows(data, ['user_id'], 'user_cnt'), on=['user_id'], how='left')
    for col in ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']:
        cnt_col = str(col) + '_user_cnt'
        data = pd.merge(data, count_rows(data, [col, 'user_id'], cnt_col),
                        on=[col, 'user_id'], how='left')
        data[str(col) + '_user_prob'] = data[cnt_col] / data['user_cnt']
    del data['user_cnt']
    return data
def user_item(data):
    """Add count and share features of item attributes per user and occupation.

    For each item attribute (item id, brand, city, price, sales, collected
    and pv level) this adds:

    * ``<attr>_user_cnt`` / ``<attr>_user_prob``: rows sharing the same
      (attr, user_id) pair, and that count divided by ``user_cnt``;
    * ``<attr>_user_occ_cnt`` / ``<attr>_user_occ_prob``: the same keyed by
      ``user_occupation_id`` and divided by ``user_occ_cnt``.

    Rows are counted through ``instance_id``.  The total columns
    ``user_cnt`` and ``user_occ_cnt`` are deliberately left in the returned
    frame: ``user_shop`` in this file divides by them and deletes them.

    Counts used to be named through ``agg({name: 'count'})`` on a selected
    column, which recent pandas versions reject; the count is now renamed
    explicitly, which works on old and new versions alike.

    NOTE: analogous features keyed by gender and by age level were tried by
    the original author and left disabled; they are not computed here.
    """
    item_cols = ['item_id',
                 'item_brand_id', 'item_city_id', 'item_price_level',
                 'item_sales_level', 'item_collected_level', 'item_pv_level']

    def count_rows(frame, keys, out_col):
        # Rows per key combination, returned as a frame: keys + out_col.
        counts = frame.groupby(keys, as_index=False)['instance_id'].count()
        return counts.rename(columns={'instance_id': out_col})

    def add_shares(frame, parent, total_col, tag):
        # Per item attribute: pair count and its share of the parent total.
        for col in item_cols:
            cnt_col = str(col) + '_' + tag + '_cnt'
            frame = pd.merge(frame, count_rows(frame, [col, parent], cnt_col),
                             on=[col, parent], how='left')
            frame[str(col) + '_' + tag + '_prob'] = frame[cnt_col] / frame[total_col]
        return frame

    data = pd.merge(data, count_rows(data, ['user_id'], 'user_cnt'), on=['user_id'], how='left')
    print('一个user有多少item_id,item_brand_id……')
    data = add_shares(data, 'user_id', 'user_cnt', 'user')

    print('一个user_occupation_id有多少item_id,item_brand_id…')
    data = pd.merge(data, count_rows(data, ['user_occupation_id'], 'user_occ_cnt'),
                    on=['user_occupation_id'], how='left')
    data = add_shares(data, 'user_occupation_id', 'user_occ_cnt', 'user_occ')
    return data
def user_shop(data):
    """Add count and share features of shop attributes per user and occupation.

    For each of ``shop_id``, ``shop_review_num_level`` and
    ``shop_star_level`` this adds:

    * ``<attr>_user_cnt`` / ``<attr>_user_prob``: rows sharing the same
      (attr, user_id) pair, and that count divided by ``user_cnt``;
    * ``<attr>_user_occ_cnt`` / ``<attr>_user_occ_prob``: the same keyed by
      ``user_occupation_id`` and divided by ``user_occ_cnt``.

    Rows are counted through ``instance_id``.  The totals ``user_cnt`` and
    ``user_occ_cnt`` are reused when the frame already carries them (as it
    does after ``user_item``); otherwise they are computed here, so the
    function no longer fails with ``KeyError`` when called on its own.
    Both totals are removed from the returned frame.

    Counts used to be named through ``agg({name: 'count'})`` on a selected
    column, which recent pandas versions reject; the count is now renamed
    explicitly, which works on old and new versions alike.

    NOTE: analogous features keyed by gender and by age level were tried by
    the original author and left disabled; they are not computed here.
    """
    shop_cols = ['shop_id', 'shop_review_num_level', 'shop_star_level']

    def count_rows(frame, keys, out_col):
        # Rows per key combination, returned as a frame: keys + out_col.
        counts = frame.groupby(keys, as_index=False)['instance_id'].count()
        return counts.rename(columns={'instance_id': out_col})

    def ensure_total(frame, parent, total_col):
        # Only compute the total when it is missing, to avoid a duplicate
        # column (and _x/_y suffixes) on merge.
        if total_col in frame.columns:
            return frame
        return pd.merge(frame, count_rows(frame, [parent], total_col), on=[parent], how='left')

    def add_shares(frame, parent, total_col, tag):
        # Per shop attribute: pair count and its share of the parent total.
        for col in shop_cols:
            cnt_col = str(col) + '_' + tag + '_cnt'
            frame = pd.merge(frame, count_rows(frame, [col, parent], cnt_col),
                             on=[col, parent], how='left')
            frame[str(col) + '_' + tag + '_prob'] = frame[cnt_col] / frame[total_col]
        del frame[total_col]
        return frame

    print('一个user有多少shop_id,shop_review_num_level……')
    data = ensure_total(data, 'user_id', 'user_cnt')
    data = add_shares(data, 'user_id', 'user_cnt', 'user')

    print('一个user_occupation_id有多少shop_id,shop_review_num_level……')
    data = ensure_total(data, 'user_occupation_id', 'user_occ_cnt')
    data = add_shares(data, 'user_occupation_id', 'user_occ_cnt', 'user_occ')
    return data
def shop_item(data):
    """Add count and share features of item attributes per shop.

    For each item attribute (item id, brand, city, price, sales, collected
    and pv level) this adds:

    * ``<attr>_shop_cnt`` / ``<attr>_shop_prob``: rows sharing the same
      (attr, shop_id) pair, and that count divided by the shop's row count;
    * ``<attr>_shop_rev_cnt`` / ``<attr>_shop_rev_prob``: the same keyed by
      ``shop_review_num_level``.

    Rows are counted through ``instance_id``.  Returns the merged frame; the
    temporary totals ``shop_cnt`` and ``shop_rev_cnt`` are removed again.

    Counts used to be named through ``agg({name: 'count'})`` on a selected
    column, which recent pandas versions reject; the count is now renamed
    explicitly, which works on old and new versions alike.
    """
    item_cols = ['item_id',
                 'item_brand_id', 'item_city_id', 'item_price_level',
                 'item_sales_level', 'item_collected_level', 'item_pv_level']

    def count_rows(frame, keys, out_col):
        # Rows per key combination, returned as a frame: keys + out_col.
        counts = frame.groupby(keys, as_index=False)['instance_id'].count()
        return counts.rename(columns={'instance_id': out_col})

    # (message, parent key, total column, tag used in names)
    sections = [
        ('一个shop有多少item_id,item_brand_id,item_city_id,item_price_level……',
         'shop_id', 'shop_cnt', 'shop'),
        ('一个shop_review_num_level有多少item_id,item_brand_id,item_city_id,item_price_level……',
         'shop_review_num_level', 'shop_rev_cnt', 'shop_rev'),
    ]
    for message, parent, total_col, tag in sections:
        print(message)
        data = pd.merge(data, count_rows(data, [parent], total_col), on=[parent], how='left')
        for col in item_cols:
            cnt_col = str(col) + '_' + tag + '_cnt'
            data = pd.merge(data, count_rows(data, [col, parent], cnt_col),
                            on=[col, parent], how='left')
            data[str(col) + '_' + tag + '_prob'] = data[cnt_col] / data[total_col]
        del data[total_col]
    return data
def stacking(train_x, train_y, test):
    """Train a two-level stacked ensemble and return test probabilities.

    Level one consists of four classifiers (random forest, XGBoost, extra
    trees, LightGBM).  Each is fitted on 5 stratified folds; its held-out
    predictions form one column of the level-two training matrix, and the
    mean of its per-fold test predictions forms the matching test column.
    Level two is a gradient boosting classifier fitted on those columns.

    Args:
        train_x: training feature matrix, indexed positionally by row
            (the caller passes a NumPy array).
        train_y: training labels, indexed the same way.
        test: test feature matrix.

    Returns:
        The level-two predicted probability of class 1 for each test row.
    """
    # Four level-one classifiers.
    base_models = [
        RandomForestClassifier(n_estimators=200, random_state=0, min_weight_fraction_leaf=0.1, max_depth=8,
                               min_samples_split=5, min_samples_leaf=2, max_features=None, n_jobs=-1),
        xgb.XGBClassifier(n_estimators=200, objective="binary:logistic", gamma=1, max_depth=8, subsample=0.85,
                          eval_metric='logloss', tree_method='hist',
                          colsample_bytree=0.9, nthread=-1, seed=1),
        ExtraTreesClassifier(n_estimators=200, criterion="gini", max_features=0.9, max_depth=8,
                             min_samples_split=2, min_samples_leaf=1, bootstrap=True, n_jobs=-1,
                             random_state=1),
        lgb.LGBMClassifier(objective='binary', max_depth=8, learning_rate=0.02, colsample_bytree=0.8,
                           subsample=0.9, n_estimators=1000),
    ]
    # NOTE: alternative LightGBM settings noted by the original author:
    # subsample=0.8, colsample_bytree=0.8, max_bin=10, subsample_freq=10,
    # min_child_samples=500, max_depth=9, random_state=666,
    # boosting_type='gbdt', n_estimators=1200, learning_rate=0.01, verbose=1

    n_models = len(base_models)
    n_folds = 5
    # Level-two inputs: one column per level-one model.
    meta_train = np.zeros((train_x.shape[0], n_models))
    meta_test = np.zeros((test.shape[0], n_models))
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1)

    for model_idx, model in enumerate(base_models):
        print('training model:', model_idx + 1)
        # Test predictions of this model, one column per fold.
        per_fold_test = np.zeros((test.shape[0], n_folds))
        fold_losses = []
        for fold_idx, (fit_rows, holdout_rows) in enumerate(folds.split(train_x, train_y)):
            fit_x, fit_y = train_x[fit_rows], train_y[fit_rows]
            holdout_x, holdout_y = train_x[holdout_rows], train_y[holdout_rows]
            print(fold_idx + 1, 'fold training')
            if model_idx >= 3:
                # Only the last model (LightGBM) is fitted with an eval set;
                # the patience of 1000 rounds equals its n_estimators.
                model.fit(fit_x, fit_y, eval_set=[(holdout_x, holdout_y)], early_stopping_rounds=1000)
            else:
                model.fit(fit_x, fit_y)
            meta_train[holdout_rows, model_idx] = model.predict_proba(holdout_x)[:, 1]
            per_fold_test[:, fold_idx] = model.predict_proba(test)[:, 1]
            # Score the values as stored in the level-two matrix.
            fold_loss = logloss(holdout_y, meta_train[holdout_rows, model_idx])
            print(fold_idx + 1, 'fold logloss:', fold_loss)
            fold_losses.append(fold_loss)
        meta_test[:, model_idx] = per_fold_test.mean(1)
        print('model', model_idx + 1, 'logloss:', np.array(fold_losses).mean())

    # Level-two classifier.  (An SVC with probability=True was also noted
    # by the original author as an alternative and left disabled.)
    meta_model = GradientBoostingClassifier(learning_rate=0.02, subsample=0.9, max_depth=6, n_estimators=100)
    meta_model.fit(meta_train, train_y)
    return meta_model.predict_proba(meta_test)[:, 1]
if __name__ == "__main__":
    # End-to-end script: load the data, build features, train the stacked
    # ensemble on days 18..24 and write predictions for the unlabeled rows.
    #
    # Changes from the previous version: every print goes through the
    # print(...) call form already used elsewhere in this file (the bare
    # ``print x`` statements were a syntax error on Python 3); DataFrame.info()
    # is called directly because it prints itself and returns None; columns
    # are dropped with an explicit ``axis=1`` into a new frame instead of
    # positionally and in place on a filtered slice; the regex separator is a
    # raw string.
    train = pd.read_csv("F:/data/alictr/round1_ijcai_18_train_20180301.txt", sep=r"\s+")
    test = pd.read_csv("F:/data/alictr/round1_ijcai_18_test_a_20180301.txt", sep=r"\s+")
    train = train.drop_duplicates(subset='instance_id')  # de-duplicate instance ids
    data = pd.concat([train, test])
    data.info()

    print('make feature')
    data = base_process(data)
    data = shijian(data)
    print(data['day'].value_counts())
    data = shop_fenduan(data)
    data = slide_cnt(data)
    data = zuhe(data)
    # Global statistics features.  item() and shop_item() exist in this file
    # but were left disabled by the original author.
    data = user(data)
    data = user_item(data)
    data = user_shop(data)

    dropped_cols = ['is_trade', 'instance_id', 'day']

    # Training rows are selected by day; the label is split off first.
    train = data[(data['day'] >= 18) & (data['day'] <= 24)]
    y_train = train['is_trade'].values
    train = train.drop(dropped_cols, axis=1)

    # NOTE: an offline validation variant (hold out day 24, run stacking and
    # print its logloss) was left disabled by the original author.

    # Rows without a label are the ones to predict.
    test_sub = data[data.is_trade.isnull()]
    test_sub = test_sub.drop_duplicates(subset='instance_id')
    print('行数:', test_sub['instance_id'].count())
    test1 = pd.DataFrame()
    test1['instance_id'] = test_sub['instance_id']
    test_sub = test_sub.drop(dropped_cols, axis=1)

    train = np.array(train)
    y_train = np.array(y_train)
    test_sub = np.array(test_sub)
    proba_test = stacking(train, y_train, test_sub)
    # Flatten to 1-D so the values can be assigned as a single column.
    test1['predicted_score'] = np.array(proba_test).reshape(-1)
    test_data = test1.loc[:, ['instance_id', 'predicted_score']]

    # Re-read the test file so every test instance appears in the output;
    # instances without a prediction (for example rows removed during
    # feature building) get a score of 0.
    sub = pd.read_csv("F:/data/alictr/round1_ijcai_18_test_a_20180301.txt", sep=r"\s+")
    test_data1 = pd.merge(sub, test_data, on=['instance_id'], how='left')
    test_data1 = test_data1.fillna(0)
    test_data1[['instance_id', 'predicted_score']].to_csv(
        'F:/data/alictr/round1_ijcai_18_result_demo_20180301.txt', index=False, sep=" ")
    print(test_data1[['instance_id', 'predicted_score']].head(5))