#MACHINE LEARNING tips
导入库
1)设置显示 DataFrame 时的最大列数和最大行数(None 表示不限制)
# Show every column and every row when a DataFrame is printed;
# None removes the display limit.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
1相关性分析
# Pairwise correlation between the numeric features, then print every
# feature's correlation with 'price' in descending order.
price_numeric = data_train[numeric_features]
correlation = price_numeric.corr()
print(correlation['price'].sort_values(ascending=False), '\n')
2删除列
# Remove the 'price' column from price_numeric in place.
del price_numeric['price']
3对每个数字特征分布可视化
# Reshape to long format (columns 'variable' and 'value') so that each
# numeric feature can be drawn in its own panel.
f = pd.melt(data_train, value_vars=numeric_features)
g = sns.FacetGrid(f, col='variable', col_wrap=2, sharex=False, sharey=False)
# FacetGrid.map expects an axes-level plotting function; sns.displot is
# figure-level and builds its own grid, so histplot is used instead.
g = g.map(sns.histplot, 'value')
4 Onehotencoder
# One-hot encode the listed categorical columns; the originals are replaced
# by one indicator column per category.
# NOTE(review): 'fueltype' was an unquoted name in the notes; it is assumed
# to be a column name like the other two -- confirm the exact spelling.
data = pd.get_dummies(data, columns=['model', 'brand', 'fueltype'])
5 featuretools库的使用方法
import featuretools as ft
7进行特征工程时提前合并数据集
# Stack the training and test frames so every feature-engineering step is
# applied to both at once. DataFrame.append was removed in pandas 2.0;
# pd.concat is its replacement and keeps the original row labels.
df = pd.concat([data, test_data], sort=False)
# Split back into train and test only after feature processing is finished
# (the original note mentions 5-fold cross-validation at this point).
# `~` inverts the boolean mask: keep the rows whose label is NOT null.
df_train = df[~df['y1_is_purchase'].isnull()]
df_train = df_train.reset_index(drop=True)
# Rows without a label -- presumably the test rows; verify that test_data
# carries no 'y1_is_purchase' values.
df_test = df[df['y1_is_purchase'].isnull()]
方法2
# Tag each frame with its origin (1 = training rows, 0 = test rows) so the
# combined frame can be split again later, then stack them with a fresh
# 0..n-1 index.
for frame, is_train in ((train, 1), (test, 0)):
    frame['train'] = is_train
data = pd.concat([train, test], ignore_index=True, sort=False)
8普通列特征创建
1
# Ratio of nprem_vlp to si_vlp; the +1 avoids division by zero when
# si_vlp is 0.
df['vlp_ratio'] = df['nprem_vlp'] / (df['si_vlp'] + 1)
2平方项创建
# Squared term of bt_ratio as an extra feature.
df['bt_ratio2'] = df['bt_ratio'] ** 2
9时间格式处理
# Float entries (e.g. NaN for missing values) become 0; any other entry has
# its last character dropped and the remainder parsed as an int.
# NOTE(review): presumably the values are strings with a one-character unit
# suffix -- confirm against the data.
df['birth_month'] = df['birth_month'].apply(
    lambda x: 0 if isinstance(x, float) else int(x[:-1])
)
10 相邻编号的列之间存在关联时,用循环批量构造比值特征
# For each consecutive suffix pair (16, 17) ... (19, 20), add the ratio of
# the later column to the earlier one; the +1 avoids division by zero.
for i in range(16, 20):
    earlier, later = str(i), str(i + 1)
    df[earlier + later + 'nprem_ratio'] = (
        df['suiche_nonauto_nprem_' + later]
        / (df['suiche_nonauto_nprem_' + earlier] + 1)
    )
    df[earlier + later + 'amount_ratio'] = (
        df['suiche_nonauto_amount_' + later]
        / (df['suiche_nonauto_amount_' + earlier] + 1)
    )
11 计数编码:统计每个类别取值在数据中出现的次数,并把该次数作为新特征合并回原表
# Count encoding: for each key column, count how many rows share each value
# and attach that count to every row as '<column>count'.
# NOTE(review): the name pattern may originally have been '{}_count' with
# '_'.join(...) (underscores are often lost in rich-text notes) -- confirm
# before relying on the column names.
for f in [['dpt'], ['client_no'], ['trademark_cn'], ['brand_cn'], ['make_cn'], ['series']]:
    # size() yields one row per distinct key; reset_index turns the key back
    # into an ordinary column next to the count.
    df_temp = df.groupby(f).size().reset_index()
    df_temp.columns = f + ['{}count'.format(''.join(f))]
    # Join on the key column(s) explicitly instead of on whatever column
    # names the two frames happen to share.
    df = df.merge(df_temp, on=f, how='left')
12 分组统计特征:先在一张表上按键分组并聚合,再把聚合结果按相同的键左连接到另一张表上
def stat(df, df_merge, group_by, agg):
    """Aggregate ``df`` per group and left-join the result onto ``df_merge``.

    Args:
        df: DataFrame the statistics are computed from.
        df_merge: DataFrame that receives the new columns.
        group_by: List of column names to group by; also used as join keys.
        agg: Dict mapping a column name to a list of aggregation method
            names, e.g. ``{'y1_is_purchase': ['mean']}``. A single method
            name given as a plain string is also accepted.

    Returns:
        The merge result: ``df_merge`` plus one column per (column, method)
        pair, named ``'<group_by joined by _>_<column>_<method>'``. Rows
        whose key does not occur in ``df`` get NaN in the new columns.
    """
    group = df.groupby(group_by).agg(agg)

    # Flatten the aggregated column labels into plain strings, in the same
    # order in which the aggregation produced them.
    prefix = '_'.join(group_by)
    columns = []
    for on, methods in agg.items():
        if isinstance(methods, str):
            # A bare string would otherwise be iterated character by
            # character and yield the wrong number of column names.
            methods = [methods]
        for method in methods:
            columns.append('{}_{}_{}'.format(prefix, on, method))
    group.columns = columns

    # Turn the group keys back into ordinary columns so they can be joined on.
    group.reset_index(inplace=True)
    df_merge = df_merge.merge(group, on=group_by, how='left')

    # Release the intermediate frame before returning.
    del group
    gc.collect()
    return df_merge
def statis_feat(df_know, df_unknow):
    """Add per-group mean-label columns to ``df_unknow``.

    For each of 'p1_census_register' and 'dpt', the mean of
    'y1_is_purchase' is computed per group on ``df_know`` and joined onto
    ``df_unknow`` through ``stat``.

    Args:
        df_know: DataFrame the group means are computed from; it must
            contain 'y1_is_purchase' and both key columns.
        df_unknow: DataFrame that receives the new columns.

    Returns:
        ``df_unknow`` with one added column per key column.
    """
    for f in tqdm(['p1_census_register', 'dpt']):
        df_unknow = stat(df_know, df_unknow, [f], {'y1_is_purchase': ['mean']})
    return df_unknow
13缺失值
1)信息统计
# Number of missing values in each column.
data.isna().sum()
2)可视化
# Fraction of missing values per column, restricted to columns that have
# any, sorted ascending and drawn as a bar chart.
missing = data_train.isnull().sum() / len(data_train)
missing = missing[missing > 0]
missing = missing.sort_values()
# .plot is an accessor: .plot.bar() draws bars, whereas .plot() would draw a
# line plot and return an Axes object.
missing.plot.bar()
3)某一列的缺失值如果存在特殊符号标识进行替换
# Treat the '-' placeholder as a real missing value. The result is assigned
# back instead of using inplace=True on a selected column, which does not
# reliably update the parent frame.
data['date'] = data['date'].replace('-', np.nan)
14特征工程后直接删除某些列
# Drop the 'mmm' column from data in place.
del data['mmm']
15特征处理时从邮编中提取城市信息
# Derive 'city' by dropping the last three characters of the region code's
# string form.
# NOTE(review): assumes the leading characters identify the city -- confirm
# the code format.
data['city'] = data['regioncode'].apply(lambda x: str(x)[:-3])