一、预处理
1、数据概览
# Data overview: inspect structure, dtypes, distributions and missing values.
df.info()                               # column dtypes + non-null counts + memory
df.describe()                           # summary statistics for numeric columns
df.head()                               # peek at the first rows
df.dtypes                               # dtypes is an attribute, not a method (calling it raised TypeError)
df.shape                                # shape is an attribute, not a method (calling it raised TypeError)
df['user_age_level'].hist()             # distribution of a numeric column
df.isnull().sum()                       # missing-value count per column
df['n_null'] = df.isnull().sum(axis=1)  # missing-value count per row
df['user_age_level'].value_counts()     # value frequencies for one column
df['user_age_level'].unique()           # distinct values of one column
2、缺失值填充
# Replace the sentinel value -1 with NaN so pandas treats it as missing.
for feature in columns:
    # bug fix: df['feature'] looked up a literal column named "feature";
    # we want the loop variable.
    df.loc[df[feature] == -1, feature] = np.nan

# Several ways to fill missing values:
# bug fix: fillna(..., inplace=True) returns None, so mode_df was None;
# drop inplace to capture the filled frame.
mode_df = df.fillna(df.mode().iloc[0])      # fill with each column's mode
median_df = df.fillna(df.median())          # fill with each column's median
# bug fix: original masked on df.age but wrote user_age_level, via chained
# assignment (SettingWithCopyWarning); use .loc on one consistent column.
df.loc[df['user_age_level'].isnull(), 'user_age_level'] = 0
# Missing values can also be imputed by training a model to predict them.
3、连续特征规范化处理
# Normalize / scale continuous features.
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

scaler = StandardScaler()                    # zero mean, unit variance (standard normal)
# bug fix: feature_range is a keyword argument, not a call.
scaler = MinMaxScaler(feature_range=(0, 1))  # rescale into [0, 1] (any fixed range works)
# bug fix: the norm is 'l2' (letter l), not the digits '12'.
scaler = Normalizer(norm='l2')               # scale each sample to unit L2 norm
# bug fix: Series has no .shape(-1, 1) method; scalers need a 2-D
# (n_samples, 1) array, so use .values.reshape(-1, 1). Also restores
# the closing parenthesis the original line was missing.
df['shop_review_positive_rate'] = StandardScaler().fit_transform(
    df['shop_review_positive_rate'].values.reshape(-1, 1))
df['shop_review_positive_rate'] = df['shop_review_positive_rate'].rank()  # rank transform
4、类别型特征处理
# One-hot encode a categorical column with pandas dummies
# (dummy_na=True adds an indicator column for NaN).
data = pd.get_dummies(df, columns=['user_gender_id'], dummy_na=True)

# Or with sklearn encoders.
# bug fix: typos LabelEncodern -> LabelEncoder, OneHotcoder -> OneHotEncoder.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy import sparse

df['user_gender_id'] = LabelEncoder().fit_transform(df['user_gender_id'])
# OneHotEncoder requires a 2-D input, hence the reshape(-1, 1).
data = sparse.hstack((df, OneHotEncoder().fit_transform(
    df['user_gender_id'].values.reshape(-1, 1))))
5、文本向量化
from sklearn.feature_extraction.text import CountVectorizer

# bug fix: the source column is 'item_category_list' (typo 'itemcategory_list').
# Join the ';'-separated categories with a SPACE so CountVectorizer can
# tokenize them — ''.join fused every category into one indivisible token.
df['item_category_list'] = df['item_category_list'].apply(lambda x: ' '.join(x.split(';')))
item_category_list = CountVectorizer().fit_transform(df['item_category_list'])
# hstack concatenates horizontally; vstack (not "wstack") concatenates vertically.
df = sparse.hstack((item_category_list, df))
6、连续特征离散化
# Discretize a continuous feature into fixed bins, then integer-encode the bins.
# bug fix: the original assigned pd.cut(...).head(), which keeps only the
# first 5 rows and leaves every other row NaN.
df['pv_bins'] = pd.cut(df['item_pv_level'], bins=[0, 5, 10, 15, 20])
# Cast to str first: LabelEncoder cannot sort a mix of Interval objects and NaN.
df['pv_bins'] = LabelEncoder().fit_transform(df['pv_bins'].astype(str))
7、特征二值化
# Binarize: values <= threshold are encoded as 0, values > threshold as 1.
from sklearn.preprocessing import Binarizer

# bug fix: method is fit_transform (typo "fit_transfrom" raised AttributeError).
# reshape(-1, 1): unknown number of rows, exactly one column — Binarizer
# requires a 2-D input.
df['item_pv_level'] = Binarizer(threshold=10).fit_transform(
    df['item_pv_level'].values.reshape(-1, 1))
二、模型评估与参数搜索
# Model evaluation and hyper-parameter search: 5-fold cross-validated
# grid search over logistic-regression settings.
model = LogisticRegression()
param_grid = {
    'max_iter': [20, 50, 100],
    'C': [1e-3, 1e-2, 1e-1, 1],
}
grid_search = GridSearchCV(model, param_grid, n_jobs=8, verbose=1, cv=5)
# bug fix: grid_search_fit was undefined — the call is grid_search.fit(...).
grid_search.fit(train_x, train_y)
best_parameters = grid_search.best_estimator_.get_params()
for para, val in best_parameters.items():  # dict iterates directly; list() was redundant
    print(para, val)
# Refit a fresh model with the best hyper-parameters on the full training set.
model = LogisticRegression(max_iter=best_parameters['max_iter'], C=best_parameters['C'])
model.fit(train_x, train_y)