【BI Learning Assignment 08: User Purchase Prediction and Popularity-Based Recommendation】


Dataset description

Data download: https://aistudio.baidu.com/aistudio/competition/detail/51
Goal: predict whether each user will make a purchase next month.

Test set: test1.csv = submission.csv
(the customer_id order is identical; the result column holds the prediction)

train.csv field descriptions

order_detail_id: order detail id
order_id: order id
order_total_num: total number of items in the order
order_amount: total order amount
order_total_payment: amount actually paid for the order
order_total_discount: order discount amount
order_pay_time: payment time
order_status: order status: 1 awaiting buyer payment, 2 seller partially shipped, 3 seller shipped, 4 awaiting buyer's confirmation of receipt, 5 buyer signed for delivery, 6 transaction completed
order_count: number of sub-orders contained in the order
is_customer_rate: whether the user left a review: 0 no, 1 yes
order_detail_status: order detail status
order_detail_goods_num: number of goods in the order detail
order_detail_amount: amount payable for the order detail
order_detail_payment: amount actually paid for the order detail
order_detail_discount: discount amount for the order detail
member_id: member id
customer_id: user id
customer_gender: gender: 0 unknown, 1 male, 2 female
customer_province: user's province
customer_city: user's city
member_status: member status: 1 normal, 2 frozen, 3 deleted
is_member_active: whether the membership is active
goods_id: goods id
goods_class_id: goods category id
goods_price: original goods price
goods_status: goods stock status: 1 on sale, 2 in stock
goods_has_discount: whether the goods support a member discount: 0 no, 1 yes
goods_list_time: most recent listing time of the goods
goods_delist_time: most recent delisting time of the goods
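
For exploration it can help to map the coded fields above to readable labels. The small sketch below only restates the code tables from the field list; the dictionary names are mine, not part of the dataset:

# Code-to-label mappings taken directly from the field descriptions above
ORDER_STATUS = {1: 'awaiting payment', 2: 'partially shipped', 3: 'shipped',
                4: 'awaiting receipt confirmation', 5: 'signed for', 6: 'transaction completed'}
GENDER = {0: 'unknown', 1: 'male', 2: 'female'}
MEMBER_STATUS = {1: 'normal', 2: 'frozen', 3: 'deleted'}
GOODS_STATUS = {1: 'on sale', 2: 'in stock'}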

submission.csv field descriptions

customer_id: user id
result: whether the user will buy next month: 0 no, 1 yes
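
Because test1.csv and submission.csv share the same customer_id order, a trivial all-zeros baseline can be written straight from the template. A minimal sketch, assuming the template file contains exactly the two columns described above:

import pandas as pd

# Build a placeholder submission from the provided template (customer_id order is preserved)
sub = pd.read_csv('submission.csv')
sub['result'] = 0  # baseline: predict that nobody buys next month
sub.to_csv('submission_baseline.csv', index=False)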

Model construction

import datetime
import numpy as np
import pandas as pd

# Load the raw training data (the complete code at the end additionally calls reduce_mem_usage)
raw = pd.read_csv('train.csv')

# Label-encode goods_id
raw['goods_id'] = pd.factorize(raw['goods_id'])[0]
raw['goods_id'].value_counts()

# Preprocess the data: build per-customer features
def preprocess(raw, train='train'):
    # Fill missing gender with 0
    data = pd.DataFrame(raw.groupby('customer_id')['customer_gender'].last().fillna(0))
    # New columns: goods-related features (last action)
    data[['goods_id_last', 'goods_status_last', 'goods_price_last', 'goods_has_discount_last',
          'goods_list_time_last', 'goods_delist_time_last']] = \
        raw.groupby('customer_id')[['goods_id', 'goods_status', 'goods_price', 'goods_has_discount',
                                    'goods_list_time', 'goods_delist_time']].last()
    # New columns: order-related features (last action)
    data[['order_total_num_last', 'order_amount_last', 'order_total_payment_last', 'order_total_discount_last',
          'order_pay_time_last', 'order_status_last', 'order_count_last', 'is_customer_rate_last',
          'order_detail_status_last', 'order_detail_goods_num_last', 'order_detail_amount_last',
          'order_detail_payment_last', 'order_detail_discount']] = \
        raw.groupby('customer_id')[['order_total_num', 'order_amount', 'order_total_payment', 'order_total_discount',
                                    'order_pay_time', 'order_status', 'order_count', 'is_customer_rate',
                                    'order_detail_status', 'order_detail_goods_num', 'order_detail_amount',
                                    'order_detail_payment', 'order_detail_discount']].last()
    # Statistics of the original goods price: mean, min, max, std
    data[['goods_price_mean', 'goods_price_min', 'goods_price_max', 'goods_price_std']] = \
        raw.groupby('customer_id')['goods_price'].agg([
            ('goods_price_mean', 'mean'),
            ('goods_price_min', 'min'),
            ('goods_price_max', 'max'),
            ('goods_price_std', 'std')])
    # Statistics of the amount actually paid per order detail: mean, min, max, std
    data[['order_detail_payment_mean', 'order_detail_payment_min', 'order_detail_payment_max', 'order_detail_payment_std']] = \
        raw.groupby('customer_id')['order_detail_payment'].agg([
            ('order_detail_payment_mean', 'mean'),
            ('order_detail_payment_min', 'min'),
            ('order_detail_payment_max', 'max'),
            ('order_detail_payment_std', 'std')])
    # Number of orders placed by the user
    data['count'] = raw.groupby('customer_id')['order_id'].nunique()
    # Number of distinct goods bought by the user
    #data['goods_count'] = raw.groupby('customer_id')['goods_id'].nunique()
    # Total number of items ordered (order_total_num)
    data['order_total_sum'] = raw.groupby('customer_id')['order_total_num'].sum()
    # User's province
    data['customer_province'] = raw.groupby('customer_id')['customer_province'].last()
    # User's city
    data['customer_city'] = raw.groupby('customer_id')['customer_city'].last()
    # Review behaviour: ratio and total count
    data[['is_customer_rate_ratio', 'is_customer_rate_sum']] = raw.groupby('customer_id')['is_customer_rate'].agg([
        ('is_customer_rate_ratio', 'mean'),
        ('is_customer_rate_sum', 'sum')
    ])
    # Total number of goods bought by the user
    data['order_detail_total_num'] = raw.groupby('customer_id')['order_detail_goods_num'].sum()
    # Discount-eligibility statistics (sum, mean)
    data[['goods_has_discount_sum', 'goods_has_discount_mean']] = raw.groupby('customer_id')['goods_has_discount'].agg([
        ('goods_has_discount_sum', 'sum'),
        ('goods_has_discount_mean', 'mean')
    ])
    # Order payment statistics (sum, mean)
    data[['order_total_payment_sum', 'order_total_payment_mean']] = raw.groupby('customer_id')['order_total_payment'].agg([
        ('order_total_payment_sum', 'sum'),
        ('order_total_payment_mean', 'mean')
    ])
    # Split a timestamp string into month, day, weekday and hour
    def time2multi(x):
        t = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        # For 2013-01-01: day = 1, weekday() = 1 (Tuesday)
        return pd.Series([t.month, t.day, t.weekday(), t.hour])

    # Reference start time: 2013-01-01
    t_str = '2013-01-01 00:00:00'
    t = datetime.datetime.strptime(t_str, '%Y-%m-%d %H:%M:%S')

    # Years between the latest listing time and the reference start time (2013-01-01 00:00:00)
    data['goods_list_time_diff'] = data['goods_list_time_last'].map(
        lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days / 365)
    # Years between the latest delisting time and the reference start time (2013-01-01 00:00:00)
    data['goods_delist_time_diff'] = data['goods_delist_time_last'].map(
        lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days / 365)
    # How long the goods were on display
    data['goods_diff'] = data['goods_delist_time_diff'] - data['goods_list_time_diff']

    # Components of the last payment time
    data[['order_pay_time_last_m', 'order_pay_time_last_d', 'order_pay_time_last_week', 'order_pay_time_last_h']] = \
        data['order_pay_time_last'].apply(time2multi)
    data['order_pay_time_last_diff'] = data['order_pay_time_last'].map(
        lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days / 365)
    return data



# Data before August is the training window; August determines the label
raw['order_pay_time'].max()  # check the latest payment time in the data
train_raw = raw[raw['order_pay_time'] <= '2013-07-31 23:59:59']
# Extract the training-set features
train_raw = preprocess(train_raw)

# Customers who paid for an order after 2013-07-31, i.e. bought in August
label_raw = set(raw[raw['order_pay_time'] > '2013-07-31 23:59:59']['customer_id'].dropna())
# label = 1 if the user bought in August, otherwise 0
# (only users with at least one order appear in the data)
train_raw['label'] = train_raw.index.map(lambda x: int(x in label_raw))

# Build the prediction features from the full history (all data before September)
test = preprocess(raw)

Saving and loading the feature data

import pickle
# Cache the feature tables so preprocess() does not have to be rerun
with open('test.pkl', 'wb') as file:
    pickle.dump(test, file)
with open('train_raw.pkl', 'wb') as file:
    pickle.dump(train_raw, file)
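
The heading above also mentions loading; for completeness, the matching load step looks like this (same file names as the dump calls above):

import pickle

# Reload the cached feature tables so later cells can run without repeating preprocess()
with open('test.pkl', 'rb') as file:
    test = pickle.load(file)
with open('train_raw.pkl', 'rb') as file:
    train_raw = pickle.load(file)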

Feature processing

# The raw timestamp columns have already been expanded into separate components, so drop them
train_data = train_raw.drop(['goods_list_time_last', 'goods_delist_time_last', 'order_pay_time_last'], axis=1)

# Treat missing province/city as the string '0'
train_data['customer_province'].fillna('0', inplace=True)
train_data['customer_city'].fillna('0', inplace=True)

Encoding categorical fields with sklearn's LabelEncoder

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_data['customer_province'] = le.fit_transform(train_data['customer_province'])
train_data['customer_city'] = le.fit_transform(train_data['customer_city'])
# The encoded province/city columns are not used in this baseline, so drop them again
train_data = train_data.drop(['customer_province', 'customer_city'], axis=1)

# Categorical features passed to LightGBM
cate1 = ['goods_id_last', 'goods_status_last', 'order_status_last', 'customer_gender',
         'order_detail_status_last', 'order_pay_time_last_h', 'order_pay_time_last_week']
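
Before handing cate1 to LightGBM it can be useful to confirm that these columns really are low-cardinality discrete codes; several of them are stored as float64 (see the info() output below). A small check, nothing more than a sketch:

# Sanity check: dtypes and cardinality of the columns flagged as categorical
print(train_data[cate1].dtypes)
print(train_data[cate1].nunique())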

Inspect train_data

train_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1435404 entries, 1000000 to 2826574
Data columns (total 43 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   customer_gender              1435404 non-null  float64
 1   goods_id_last                1435404 non-null  int64  
 2   goods_status_last            1435404 non-null  float64
 3   goods_price_last             1435280 non-null  float64
 4   goods_has_discount_last      1435404 non-null  float64
 5   order_total_num_last         1435404 non-null  float64
 6   order_amount_last            1435404 non-null  float64
 7   order_total_payment_last     1435404 non-null  float64
 8   order_total_discount_last    1435404 non-null  float64
 9   order_status_last            1435404 non-null  int64  
 10  order_count_last             1435404 non-null  float64
 11  is_customer_rate_last        1435404 non-null  float64
 12  order_detail_status_last     1435404 non-null  float64
 13  order_detail_goods_num_last  1435404 non-null  float64
 14  order_detail_amount_last     1435404 non-null  float64
 15  order_detail_payment_last    1435404 non-null  float64
 16  order_detail_discount        1435404 non-null  float64
 17  goods_price_mean             1435280 non-null  float64
 18  goods_price_min              1435280 non-null  float64
 19  goods_price_max              1435280 non-null  float64
 20  goods_price_std              396081 non-null   float64
 21  order_detail_payment_mean    1435404 non-null  float64
 22  order_detail_payment_min     1435404 non-null  float64
 23  order_detail_payment_max     1435404 non-null  float64
 24  order_detail_payment_std     396177 non-null   float64
 25  count                        1435404 non-null  int64  
 26  order_total_sum              1435404 non-null  float64
 27  is_customer_rate_ratio       1435404 non-null  float64
 28  is_customer_rate_sum         1435404 non-null  float64
 29  order_detail_total_num       1435404 non-null  float64
 30  goods_has_discount_sum       1435404 non-null  float64
 31  goods_has_discount_mean      1435404 non-null  float64
 32  order_total_payment_sum      1435404 non-null  float64
 33  order_total_payment_mean     1435404 non-null  float64
 34  goods_list_time_diff         1435404 non-null  float64
 35  goods_delist_time_diff       1435404 non-null  float64
 36  goods_diff                   1435404 non-null  float64
 37  order_pay_time_last_m        1435404 non-null  int64  
 38  order_pay_time_last_d        1435404 non-null  int64  
 39  order_pay_time_last_week     1435404 non-null  int64  
 40  order_pay_time_last_h        1435404 non-null  int64  
 41  order_pay_time_last_diff     1435404 non-null  float64
 42  label                        1435404 non-null  int64  
dtypes: float64(35), int64(8)
memory usage: 481.9 MB
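
At roughly 482 MB the feature table is already large; the complete code at the end of this post shrinks the raw data with a reduce_mem_usage helper, and the same idea can be applied to train_data. A self-contained downcasting sketch (the dtype-based column selection is my addition, not part of the original pipeline):

import pandas as pd

# Downcast numeric columns to the smallest dtype that still holds their values
def downcast(df):
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df

train_data = downcast(train_data)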

Dataset split

# Split the training set
from sklearn.model_selection import KFold
import lightgbm as lgb

# 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True)
y_pred = 0
for train_index, test_index in kf.split(train_data):
    # Train/validation split for this fold
    X_train, X_valid = train_data.drop('label', axis=1).iloc[train_index], train_data.drop('label', axis=1).iloc[test_index]
    y_train, y_valid = np.array(train_data['label'])[train_index], np.array(train_data['label'])[test_index]
    param = {
        'num_leaves': 121,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.05,
        'metric': 'binary_logloss'
    }
    # Train with LightGBM
    trn_data = lgb.Dataset(X_train, y_train)
    val_data = lgb.Dataset(X_valid, y_valid)
    lgbm = lgb.train(param, trn_data, valid_sets=[trn_data, val_data], num_boost_round=10000,
                     early_stopping_rounds=100, verbose_eval=50, categorical_feature=cate1)
    # Average the five fold models' predictions on the test features
    test = test[X_train.columns]
    y_pred = y_pred + lgbm.predict(test) * 0.2
Training until validation scores don't improve for 100 rounds.
[50]	training's binary_logloss: 0.0659339	valid_1's binary_logloss: 0.0697238
[100]	training's binary_logloss: 0.0629397	valid_1's binary_logloss: 0.0697213
[150]	training's binary_logloss: 0.0609746	valid_1's binary_logloss: 0.0698468
Early stopping, best iteration is:
[73]	training's binary_logloss: 0.0643987	valid_1's binary_logloss: 0.0696613
Training until validation scores don't improve for 100 rounds.
[50]	training's binary_logloss: 0.0660194	valid_1's binary_logloss: 0.0693105
[100]	training's binary_logloss: 0.0630359	valid_1's binary_logloss: 0.069384
[150]	training's binary_logloss: 0.0611705	valid_1's binary_logloss: 0.0694661
Early stopping, best iteration is:
[62]	training's binary_logloss: 0.0652081	valid_1's binary_logloss: 0.0692976
Training until validation scores don't improve for 100 rounds.
[50]	training's binary_logloss: 0.0658128	valid_1's binary_logloss: 0.0702261
[100]	training's binary_logloss: 0.06284	valid_1's binary_logloss: 0.0702096
[150]	training's binary_logloss: 0.0608663	valid_1's binary_logloss: 0.0703097
Early stopping, best iteration is:
[77]	training's binary_logloss: 0.0640384	valid_1's binary_logloss: 0.0701494
Training until validation scores don't improve for 100 rounds.
[50]	training's binary_logloss: 0.0658765	valid_1's binary_logloss: 0.069946
[100]	training's binary_logloss: 0.0628522	valid_1's binary_logloss: 0.0699529
[150]	training's binary_logloss: 0.060943	valid_1's binary_logloss: 0.0700854
Early stopping, best iteration is:
[73]	training's binary_logloss: 0.0643301	valid_1's binary_logloss: 0.0699093
Training until validation scores don't improve for 100 rounds.
[50]	training's binary_logloss: 0.0654505	valid_1's binary_logloss: 0.0715896
[100]	training's binary_logloss: 0.0625174	valid_1's binary_logloss: 0.0716002
[150]	training's binary_logloss: 0.0606312	valid_1's binary_logloss: 0.071723
Early stopping, best iteration is:
[67]	training's binary_logloss: 0.0642761	valid_1's binary_logloss: 0.071539
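
The loop above averages the five fold models on the test features but never reports a single overall validation number. Below is a sketch of the same loop that also collects out-of-fold predictions; it reuses train_data, test and cate1 from above, and note that recent LightGBM versions configure early stopping via callbacks rather than the early_stopping_rounds argument used here:

import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, roc_auc_score

# Same 5-fold scheme as above, plus out-of-fold predictions for one overall score
X_all = train_data.drop('label', axis=1)
y_all = train_data['label'].values
param = {'num_leaves': 121, 'boosting_type': 'gbdt', 'objective': 'binary',
         'learning_rate': 0.05, 'metric': 'binary_logloss'}

kf = KFold(n_splits=5, shuffle=True)
oof = np.zeros(len(X_all))
y_pred = 0
for train_index, valid_index in kf.split(X_all):
    trn_data = lgb.Dataset(X_all.iloc[train_index], y_all[train_index])
    val_data = lgb.Dataset(X_all.iloc[valid_index], y_all[valid_index])
    lgbm = lgb.train(param, trn_data, valid_sets=[trn_data, val_data],
                     num_boost_round=10000, early_stopping_rounds=100,
                     verbose_eval=50, categorical_feature=cate1)
    oof[valid_index] = lgbm.predict(X_all.iloc[valid_index])
    y_pred = y_pred + lgbm.predict(test[X_all.columns]) * 0.2

print('OOF log loss:', log_loss(y_all, oof))
print('OOF AUC:', roc_auc_score(y_all, oof))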

Feature importance

pd.DataFrame({
    'column': X_train.columns,
    'importance':lgbm.feature_importance()
}).sort_values(by='importance')

Save the submission CSV

def f(x):
    # Clipping to [0.1, 0.9] is disabled (kept in the docstring); only round to three decimals
    """
    if x < 0.1:
        return 0.1
    if x > 0.9:
        return 0.9
    """
    return round(x * 1000) / 1000

test['result'] = y_pred
test['result'] = test['result'].map(f)
result = pd.DataFrame(test['result'])
result.to_csv('submission1.csv')
result
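
Before uploading, it is worth confirming that the rows line up with the template, since the competition requires the same customer_id order as test1.csv. A small sanity-check sketch (it assumes test1.csv exposes a customer_id column, as the dataset description states):

import pandas as pd

# Check that the generated file has the same customers, in the same order, as the template
template = pd.read_csv('test1.csv')
submission = pd.read_csv('submission1.csv')
assert len(template) == len(submission), 'row count mismatch'
assert (template['customer_id'].values == submission['customer_id'].values).all(), 'customer_id order differs'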


Complete code

#!/usr/bin/env python
# coding: utf-8

import datetime
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Downcast numeric columns to the smallest dtype that fits their value range, to save memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

df = reduce_mem_usage(pd.read_csv('train.csv'))

print(df.info())
print(df.shape) #(2306871, 29)


print(df['goods_id'].value_counts())


# Label-encode goods_id
df['goods_id'] = pd.factorize(df['goods_id'])[0]
df['goods_id'].value_counts()





# Preprocess the data: build per-customer features
def preprocess(raw, train='train'):
    # Fill missing gender with 0
    data = pd.DataFrame(raw.groupby('customer_id')['customer_gender'].last().fillna(0))
    # New columns: goods-related features (last action)
    data[['goods_id_last', 'goods_status_last', 'goods_price_last', 'goods_has_discount_last',
          'goods_list_time_last', 'goods_delist_time_last']] = \
        raw.groupby('customer_id')[['goods_id', 'goods_status', 'goods_price', 'goods_has_discount',
                                    'goods_list_time', 'goods_delist_time']].last()
    # New columns: order-related features (last action)
    data[['order_total_num_last', 'order_amount_last', 'order_total_payment_last', 'order_total_discount_last',
          'order_pay_time_last', 'order_status_last', 'order_count_last', 'is_customer_rate_last',
          'order_detail_status_last', 'order_detail_goods_num_last', 'order_detail_amount_last',
          'order_detail_payment_last', 'order_detail_discount']] = \
        raw.groupby('customer_id')[['order_total_num', 'order_amount', 'order_total_payment', 'order_total_discount',
                                    'order_pay_time', 'order_status', 'order_count', 'is_customer_rate',
                                    'order_detail_status', 'order_detail_goods_num', 'order_detail_amount',
                                    'order_detail_payment', 'order_detail_discount']].last()
    # Statistics of the original goods price: mean, min, max, std
    data[['goods_price_mean', 'goods_price_min', 'goods_price_max', 'goods_price_std']] = \
        raw.groupby('customer_id')['goods_price'].agg([
            ('goods_price_mean', 'mean'),
            ('goods_price_min', 'min'),
            ('goods_price_max', 'max'),
            ('goods_price_std', 'std')])
    # Statistics of the amount actually paid per order detail: mean, min, max, std
    data[['order_detail_payment_mean', 'order_detail_payment_min', 'order_detail_payment_max', 'order_detail_payment_std']] = \
        raw.groupby('customer_id')['order_detail_payment'].agg([
            ('order_detail_payment_mean', 'mean'),
            ('order_detail_payment_min', 'min'),
            ('order_detail_payment_max', 'max'),
            ('order_detail_payment_std', 'std')])
    # Number of orders placed by the user
    data['count'] = raw.groupby('customer_id')['order_id'].nunique()
    # Number of distinct goods bought by the user
    #data['goods_count'] = raw.groupby('customer_id')['goods_id'].nunique()
    # Total number of items ordered (order_total_num)
    data['order_total_sum'] = raw.groupby('customer_id')['order_total_num'].sum()
    # User's province
    data['customer_province'] = raw.groupby('customer_id')['customer_province'].last()
    # User's city
    data['customer_city'] = raw.groupby('customer_id')['customer_city'].last()
    # Review behaviour: ratio and total count
    data[['is_customer_rate_ratio', 'is_customer_rate_sum']] = raw.groupby('customer_id')['is_customer_rate'].agg([
        ('is_customer_rate_ratio', 'mean'),
        ('is_customer_rate_sum', 'sum')
    ])
    # Total number of goods bought by the user
    data['order_detail_total_num'] = raw.groupby('customer_id')['order_detail_goods_num'].sum()
    # Discount-eligibility statistics (sum, mean)
    data[['goods_has_discount_sum', 'goods_has_discount_mean']] = raw.groupby('customer_id')['goods_has_discount'].agg([
        ('goods_has_discount_sum', 'sum'),
        ('goods_has_discount_mean', 'mean')
    ])
    # Order payment statistics (sum, mean)
    data[['order_total_payment_sum', 'order_total_payment_mean']] = raw.groupby('customer_id')['order_total_payment'].agg([
        ('order_total_payment_sum', 'sum'),
        ('order_total_payment_mean', 'mean')
    ])
    # Split a timestamp string into month, day, weekday and hour
    def time2multi(x):
        t = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        # For 2013-01-01: day = 1, weekday() = 1 (Tuesday)
        return pd.Series([t.month, t.day, t.weekday(), t.hour])

    # Reference start time: 2013-01-01
    t_str = '2013-01-01 00:00:00'
    t = datetime.datetime.strptime(t_str, '%Y-%m-%d %H:%M:%S')

    # Years between the latest listing time and the reference start time (2013-01-01 00:00:00)
    data['goods_list_time_diff'] = data['goods_list_time_last'].map(
        lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days / 365)
    # Years between the latest delisting time and the reference start time (2013-01-01 00:00:00)
    data['goods_delist_time_diff'] = data['goods_delist_time_last'].map(
        lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days / 365)
    # How long the goods were on display
    data['goods_diff'] = data['goods_delist_time_diff'] - data['goods_list_time_diff']

    # Components of the last payment time
    data[['order_pay_time_last_m', 'order_pay_time_last_d', 'order_pay_time_last_week', 'order_pay_time_last_h']] = \
        data['order_pay_time_last'].apply(time2multi)
    data['order_pay_time_last_diff'] = data['order_pay_time_last'].map(
        lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days / 365)
    return data

# Data before August is the training window; August determines the label
train_raw = df[df['order_pay_time'] <= '2013-07-31 23:59:59']
train_raw = preprocess(train_raw)

# Customers who paid for an order after 2013-07-31, i.e. bought in August
label_raw = set(df[df['order_pay_time'] > '2013-07-31 23:59:59']['customer_id'].dropna())
# label = 1 if the user bought in August, otherwise 0
# (only users with at least one order appear in the data)
train_raw['label'] = train_raw.index.map(lambda x: int(x in label_raw))

# Build the prediction features from the full history
test = preprocess(df)

import pickle
with open('test.pkl', 'wb') as file:
    pickle.dump(test, file)
with open('train_raw.pkl', 'wb') as file:
    pickle.dump(train_raw, file)



# The raw timestamp columns have already been expanded into separate components, so drop them
train_data = train_raw.drop(['goods_list_time_last', 'goods_delist_time_last', 'order_pay_time_last'], axis=1)





train_data['customer_province'].fillna('0', inplace=True)
train_data['customer_city'].fillna('0', inplace=True)



from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_data['customer_province'] = le.fit_transform(train_data['customer_province'])
train_data['customer_city'] = le.fit_transform(train_data['customer_city'])
# The encoded province/city columns are not used in this baseline, so drop them again
train_data = train_data.drop(['customer_province', 'customer_city'], axis=1)




# Categorical features passed to LightGBM
cate1 = ['goods_id_last', 'goods_status_last', 'order_status_last', 'customer_gender',
         'order_detail_status_last', 'order_pay_time_last_h', 'order_pay_time_last_week']




# Split the training set

# 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True)
y_pred = 0
for train_index, test_index in kf.split(train_data):
    # Train/validation split for this fold
    X_train, X_valid = train_data.drop('label', axis=1).iloc[train_index], train_data.drop('label', axis=1).iloc[test_index]
    y_train, y_valid = np.array(train_data['label'])[train_index], np.array(train_data['label'])[test_index]
    param = {
        'num_leaves': 121,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.05,
        'metric': 'binary_logloss'
    }
    # Train with LightGBM
    trn_data = lgb.Dataset(X_train, y_train)
    val_data = lgb.Dataset(X_valid, y_valid)
    lgbm = lgb.train(param, trn_data, valid_sets=[trn_data, val_data], num_boost_round=10000,
                     early_stopping_rounds=100, verbose_eval=50, categorical_feature=cate1)
    # Average the five fold models' predictions on the test features
    test = test[X_train.columns]
    y_pred = y_pred + lgbm.predict(test) * 0.2


pd.DataFrame({
  'column': X_train.columns,
  'importance':lgbm.feature_importance()
}).sort_values(by='importance')





def f(x):
    # Clipping to [0.1, 0.9] is disabled (kept in the docstring); only round to three decimals
    """
    if x < 0.1:
        return 0.1
    if x > 0.9:
        return 0.9
    """
    return round(x * 1000) / 1000

test['result'] = y_pred
test['result'] = test['result'].map(f)
result = pd.DataFrame(test['result'])
result.to_csv('submission_1.csv')

Submission score

[screenshot of the submission score]
