感谢大神!!!
# Extract merchant-level features for dataset 3.
# Keep only the columns the merchant statistics below are computed from.
merchant3 = feature3[['merchant_id', 'coupon_id', 'distance', 'date_received', 'date']]
# One row per distinct merchant_id; this is the left-hand frame that the
# per-merchant statistics are merged onto further down. drop_duplicates()
# returns a new frame, so nothing is modified in place on a selection of
# merchant3 (the in-place form can trigger pandas' SettingWithCopyWarning).
t = merchant3[['merchant_id']].drop_duplicates()
# Per-merchant row counts. Each count is built by tagging every selected row
# with 1 and summing per merchant_id. The rows are selected with a single
# .loc call and the tag column is added with .assign(), which returns a new
# frame, so no column is ever assigned onto a chained selection of merchant3
# (the pattern behind pandas' SettingWithCopyWarning).

# total_sales: rows whose date is not 'null'
# (presumably 'null' marks "no purchase" -- confirm against the data loader).
t1 = merchant3.loc[merchant3.date != 'null', ['merchant_id']].assign(total_sales=1)
t1 = t1.groupby('merchant_id').agg('sum').reset_index()

# sales_use_coupon: rows where neither date nor coupon_id is 'null'.
t2 = merchant3.loc[
    (merchant3.date != 'null') & (merchant3.coupon_id != 'null'),
    ['merchant_id']
].assign(sales_use_coupon=1)
t2 = t2.groupby('merchant_id').agg('sum').reset_index()

# total_coupon: rows whose coupon_id is not 'null'.
t3 = merchant3.loc[merchant3.coupon_id != 'null', ['merchant_id']].assign(total_coupon=1)
t3 = t3.groupby('merchant_id').agg('sum').reset_index()
# Distances for rows where neither date nor coupon_id is 'null'; this frame
# feeds the per-merchant distance statistics computed below.
# .copy() makes t4 an independent frame so the column assignment that follows
# does not write into a selection of merchant3.
t4 = merchant3.loc[
    (merchant3.date != 'null') & (merchant3.coupon_id != 'null'),
    ['merchant_id', 'distance']
].copy()
# Turn the 'null' placeholder into NaN and parse the rest as numbers. Only the
# distance column is touched: the previous frame-wide replace('null', -1) /
# replace(-1, np.nan) round trip also rewrote matching values in merchant_id.
# NOTE(review): a literal distance of -1 is no longer treated as missing;
# presumably distances are never negative -- confirm against the data.
# pd.to_numeric still raises on any other non-numeric value.
t4['distance'] = pd.to_numeric(t4['distance'].mask(t4['distance'] == 'null'))
# Per-merchant summary statistics of the distance column of t4. Each result
# has one row per merchant_id, with the statistic stored under its own
# feature name.

# Smallest distance per merchant.
t5 = (
    t4.groupby('merchant_id').min().reset_index()
    .rename(columns={'distance': 'merchant_min_distance'})
)
# Largest distance per merchant.
t6 = (
    t4.groupby('merchant_id').max().reset_index()
    .rename(columns={'distance': 'merchant_max_distance'})
)
# Mean distance per merchant.
t7 = (
    t4.groupby('merchant_id').mean().reset_index()
    .rename(columns={'distance': 'merchant_mean_distance'})
)
# Median distance per merchant.
t8 = (
    t4.groupby('merchant_id').median().reset_index()
    .rename(columns={'distance': 'merchant_median_distance'})
)
# Assemble the merchant feature table: start from the distinct merchant ids in
# t and left-join each per-merchant statistic on merchant_id. Left joins keep
# every merchant in t; a merchant missing from a statistic frame gets NaN in
# that column.
merchant3_feature = (
    t.merge(t1, on='merchant_id', how='left')    # total_sales
     .merge(t2, on='merchant_id', how='left')    # sales_use_coupon
     .merge(t3, on='merchant_id', how='left')    # total_coupon
     .merge(t5, on='merchant_id', how='left')    # merchant_min_distance
     .merge(t6, on='merchant_id', how='left')    # merchant_max_distance
     .merge(t7, on='merchant_id', how='left')    # merchant_mean_distance
     .merge(t8, on='merchant_id', how='left')    # merchant_median_distance
)
# Merchants absent from the coupon-sales counts come out of the left merge as
# NaN; count them as 0 coupon sales.
merchant3_feature['sales_use_coupon'] = merchant3_feature['sales_use_coupon'].fillna(0)
# Coupon conversion rate: coupon sales divided by the merchant's coupon rows.
# Computed before total_coupon is zero-filled, keeping the original statement
# order; a merchant with no coupon rows ends up with NaN here.
merchant3_feature['merchant_coupon_transfer_rate'] = (
    merchant3_feature['sales_use_coupon'].astype('float') / merchant3_feature['total_coupon']
)
# Share of the merchant's sales that used a coupon. total_sales is left
# unfilled, so a merchant with no sales rows ends up with NaN here.
merchant3_feature['coupon_rate'] = (
    merchant3_feature['sales_use_coupon'].astype('float') / merchant3_feature['total_sales']
)
# Only now replace a missing total_coupon with 0.
merchant3_feature['total_coupon'] = merchant3_feature['total_coupon'].fillna(0)
# index=False is the documented way to omit the row index from the CSV
# (index=None only worked because None is falsy).
merchant3_feature.to_csv('data/merchant3_feature.csv', index=False)