# Basic exploration and cleaning of the training frame.
# (Original lines had note numbers fused onto the code and curly Unicode
# quotes, which made them invalid Python — both fixed here.)

# 1-3. Overview, duplicate count, missing-value count (exploratory output).
df_train_base.info()
df_train_base.duplicated().sum()
df_train_base.isnull().sum()

# 4. Impute missing ages with the median.
#    FIX: assign the result back instead of fillna(inplace=True) on a
#    column selection — under pandas copy-on-write that in-place call
#    may not modify df_train_base at all.
df_train_base['age'] = df_train_base['age'].fillna(df_train_base['age'].median())

# 5. Drop any remaining rows containing NaN.
df_train_base.dropna(inplace=True)

# 6. Inspect the distinct values of the name column (exploratory output).
df_train_base['name'].unique()
# 7. Encode gender as an integer and drop the unused `number` column.
#    BUG FIX: the original discarded the result of .map(), so the
#    encoding was never applied — assign it back to the column.
df_train_base['gender'] = df_train_base['gender'].map({
    "Male": 1,
    "Female": 0,
})
df_train_base.drop('number', axis=1, inplace=True)

# 8. Re-number the rows after the drops above.
#    BUG FIX: reset_index() returns a new frame; the original threw the
#    result away. Keep it.
df_train_base = df_train_base.reset_index(drop=True)
9. 类型转换 (type conversion): xxx = xxx.astype(int)
10. 数据统计
# 10. Per-id aggregates: number of clicks per id, and total transaction
# amount per id.
df_tr = df_train.groupby(['id'])['click'].agg('count')
df_tx = df_train_trx.groupby(['id'])['amt'].agg('sum')
截取 'A00001' → '00001' (strip the non-digit prefix from the id)
# Strip every non-digit character from the id ('A00001' → 1), then cast
# to int.
# FIX: use a raw string for the regex — '\D' in a plain string is an
# invalid escape sequence (SyntaxWarning on Python 3.12+, error later).
df_train['id'] = df_train['id'].replace(r'\D', '', regex=True).astype(int)
时间探索
截取天数:2023-08-26 12:57:23
# Time exploration: extract the day-of-month from timestamps formatted
# like '2023-08-26 12:57:23'.
# BUG FIX: the original sliced [10:13], which yields ' 12' — the HOUR,
# not the day the note asks for. The day lives at characters [8:10]
# ('26'). TODO confirm the timestamp format against the real data.
df_train['day'] = df_train['time'].astype(str).str[8:10]
df_train['day'] = df_train['day'].astype(int)

# Event count per (id, day) — exploratory display, then kept as a frame.
df_train.groupby(['id', 'day'])['day'].agg(['count'])
tim = df_train.groupby(['id', 'day'])['day'].agg(['count']).reset_index()

# Sort so each id's highest-count day comes first, then take the first
# row per id (= the most active day for that id).
# BUG FIX: the original read from an undefined `df_train_time`; the
# sorted counts live in `tim`.
tim = tim.sort_values(['id', 'count'], ascending=False)
df_train_time = tim.groupby('id').first().reset_index()
11. Merge
# 11. Join the two intermediate frames on id (inner join, pandas default).
df_train = train_tmp1.merge(df_tmp2, on=['id'])
12. AutoML --autogluon探索合适模型
# 12. AutoML: let AutoGluon search for a suitable tabular model on a
# 50k-row subsample of the training data.
from autogluon.tabular import TabularDataset, TabularPredictor

label = 'label'                       # target column name
save_path = 'agModels-predictClass'   # where fitted models are stored
subsample_size = 50000

train_da = TabularDataset(df_train)
train_da = train_da.sample(n=subsample_size, random_state=40)
predictor = TabularPredictor(label=label, path=save_path).fit(train_da)
13. 优化参数
# 13. Coarse sweep over n_estimators (1, 11, ..., 191) for LightGBM with
# 10-fold cross-validation.
# FIX: the original loop body had lost all indentation and was not valid
# Python — reformatted. Repeated max(scorel) also hoisted to a local.
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier

scorel = []
for i in range(0, 200, 10):
    rfc = LGBMClassifier(n_estimators=i + 1,
                         n_jobs=-1,
                         random_state=90)
    score = cross_val_score(rfc, train_data, train_target, cv=10).mean()
    scorel.append(score)

# Best mean CV score and the n_estimators value that produced it.
best = max(scorel)
print(best, scorel.index(best) * 10 + 1)

# NOTE(review): assumes `plt` (matplotlib.pyplot) is imported earlier in
# the file — confirm.
plt.figure(figsize=[20, 5])
plt.plot(range(1, 201, 10), scorel)
=============================================================
# Grid-search LightGBM regressor hyper-parameters with 10-fold CV,
# scored by negative mean squared error.
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor

param_grid = [
    {
        'learning_rate': [0.01, 0.03, 0.05, 0.02],
        'n_estimators': [25, 50, 75, 100],
    },
]

lightgbm_reg = LGBMRegressor()
grid_search = GridSearchCV(
    lightgbm_reg,
    param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print(grid_search.best_params_)
==============================================================
# Grid-search random-forest regressor hyper-parameters with 10-fold CV,
# scored by negative mean squared error.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [
    {
        'max_features': [12, 18, 24, 32],
        'max_depth': [24, 32, 40],
        'min_samples_leaf': [18, 24, 32],
        'min_samples_split': [24, 32, 40],
    },
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print(grid_search.best_params_)
# Final classifier with the tuned number of trees.
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and fitting chain.
model = RandomForestClassifier(n_estimators=181, random_state=42).fit(X_train, y_train)