# Mount Google Drive so the dataset CSV under /content/drive is readable
# (Colab-only; prompts for authorization on first run).
from google.colab import drive
drive.mount('/content/drive')
# 导入相应的包 (import the required packages)
# Imports for data handling, visualization, and modeling.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta
import pandas_profiling as ppf  # NOTE(review): deprecated package (now ydata-profiling); unused below?
import datetime
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import warnings

# Silence library warnings so notebook output stays readable.
warnings.filterwarnings('ignore')
# %matplotlib inline  # IPython magic — valid only inside a notebook cell, not plain Python
# 数据的读入 (read in the data)
# Load the raw training set from Drive; `full.head()` previews the first
# rows (the preview is only displayed when run in a notebook cell).
full = pd.read_csv('/content/drive/My Drive/Colab Notebooks/DC/round1_diac2019_train.csv')
full.head()
# --- Missing-value imputation ---

# String-typed columns: fill missing values with the sentinel "None".
cols = ["customer_province", "customer_city"]
for col in cols:
    full[col].fillna("None", inplace=True)

# Numeric flag columns: a missing value is treated as 0 (no / not rated).
cols1 = ["is_customer_rate", "customer_gender", "is_member_actived"]
for col in cols1:
    full[col].fillna(0, inplace=True)

# ID-like columns: fill missing values with the column mode.
cols2 = ["customer_id", "member_status"]
for col in cols2:
    full[col].fillna(full[col].mode()[0], inplace=True)
# 将字符类型转换为数值类型 (convert string categories to numeric codes)
# --- Encode customer_province into frequency buckets ---
# 0 = rare regions, 1 = most frequent, 4 = least frequent mainland group.
customer_province_cold =['香港特别行政区','台湾','柔佛']
customer_province_hot1 =['广东省','浙江省','江苏省','上海','北京','None']
customer_province_hot2 =['四川省','湖北省','山东省','河南省','福建省','安徽省','湖南省']
customer_province_hot3 =['辽宁省','江西省','重庆','河北省','陕西省','云南省']
customer_province_hot4 =['广西壮族自治区','天津','黑龙江省','贵州省','吉林省','山西省','内蒙古自治区','新疆维吾尔自治区','甘肃省','海南省','西藏自治区','宁夏回族自治区','青海省']

# The groups are disjoint, so a single vectorized pass per bucket is
# equivalent to assigning each province name one at a time.
for bucket, provinces in (
    (0, customer_province_cold),
    (1, customer_province_hot1),
    (2, customer_province_hot2),
    (3, customer_province_hot3),
    (4, customer_province_hot4),
):
    full.loc[full.customer_province.isin(provinces), 'customer_province'] = bucket
# --- Encode customer_city into frequency buckets ---
# 1/2/3 = popularity tiers; everything else (computed before any
# re-assignment) becomes the "cold" bucket 0.
full_customer_city = full['customer_city'].drop_duplicates().dropna().values.tolist()
customer_city_hot1 =['上海市','None','北京市','广州市','杭州市','成都市','武汉市','深圳市']
customer_city_hot2 =['南京市','重庆市','苏州市','天津市','长沙市','郑州市','西安市']
customer_city_hot3 =['合肥市','宁波市','福州市','昆明市','温州市','东莞市','青岛市','沈阳市','厦门市','佛山市','南昌市','无锡市','济南市','哈尔滨市']
customer_city_cold =list(set(full_customer_city)-set(customer_city_hot1)-set(customer_city_hot2)-set(customer_city_hot3))

# Buckets are disjoint, so one vectorized .isin assignment per bucket
# matches the original per-city loop exactly.
for bucket, cities in (
    (1, customer_city_hot1),
    (2, customer_city_hot2),
    (3, customer_city_hot3),
    (0, customer_city_cold),
):
    full.loc[full.customer_city.isin(cities), 'customer_city'] = bucket
def make_label(labels, train3):
    """Attach each customer's label to the feature frame.

    Parameters
    ----------
    labels : pandas.DataFrame
        Must contain 'customer_id' and 'label' columns; a customer may
        appear on multiple rows.
    train3 : pandas.DataFrame
        Feature frame keyed by 'customer_id'.

    Returns
    -------
    pandas.DataFrame
        `train3` left-joined with the per-customer max label (a customer
        is positive if any of its rows is positive); customers absent
        from `labels` get NaN.
    """
    # max over duplicate rows — the original .agg({'label': 'max'}) nested
    # renamer was removed in pandas >= 1.0; plain .max() is equivalent here.
    label = labels.groupby('customer_id', as_index=False)['label'].max()
    data = pd.merge(train3, label, on='customer_id', how='left', copy=False)
    print(data.shape)
    return data
# Build the labeled training frame and copy the test features.
# NOTE(review): train_history / train3 / test3 are defined in a part of the
# notebook not shown here — confirm they exist before this cell runs.
train = make_label(train_history, train3)
test = test3.copy()
print(test.shape)
# --- Build the label vector and feature matrix for modeling ---
train_copy = train.copy()
y = train_copy.pop('label')  # target; pop() also drops it from the features

# Use every column except the identifier as a feature.
feature = [x for x in train_copy.columns if x not in ['customer_id']]
X = train_copy[feature]

# Standardize features to zero mean / unit variance.
from sklearn.preprocessing import StandardScaler
ss_X = StandardScaler()
X_scaled = ss_X.fit_transform(X)

# Split into train / validation halves (fixed seed for reproducibility).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_scaled, y, test_size=0.5, random_state=33)