Kaggle Study Notes
Machine Learning: Data Preprocessing
Handling missing values in numerical data
# 1. Drop rows where the target (SalePrice) is missing, then separate the target
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)
# 2. Keep only the numerical columns
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])
# Count the missing values in each column
missing_val_count_by_column = X_train.isnull().sum()
print(missing_val_count_by_column[missing_val_count_by_column > 0])
# Drop columns that contain missing values
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
# Impute missing values (mean imputation by default)
import pandas as pd
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(imputer.transform(X_valid))
# Imputation removed the column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
Handling categorical data:
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
# Columns whose category sets match between train and valid can be safely encoded
good_label_cols = [col for col in object_cols if set(X_train[col]) == set(X_valid[col])]
bad_label_cols = list(set(object_cols)-set(good_label_cols))
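The encoding step itself is not shown above; a minimal sketch using sklearn's OrdinalEncoder (the label_X_* names are illustrative; the problematic columns are dropped first):
from sklearn.preprocessing import OrdinalEncoder
# Drop the categorical columns that cannot be safely encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)
# Apply ordinal encoding to the remaining categorical columns
ordinal_encoder = OrdinalEncoder()
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])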
# Show the number of unique categories in each column
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))
sorted(d.items(), key=lambda x: x[1])
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))
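As with ordinal encoding, the one-hot step is not shown; a minimal sketch (OH_* names are illustrative; older sklearn versions use sparse=False instead of sparse_output=False):
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the low-cardinality categorical columns
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
# One-hot encoding removed the index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
# Drop all original categorical columns and append the encoded ones
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)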
# Select categorical columns with relatively low cardinality
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]
# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
Python Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])
# Define model
model = RandomForestRegressor(n_estimators=100, random_state=0)
# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
# Preprocessing of training data, fit model
clf.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)
print('MAE:', mean_absolute_error(y_valid, preds))
# Cross-validation
from sklearn.model_selection import cross_val_score
scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
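Each fold contributes one MAE value; averaging them gives a single score for comparing pipelines:
print("Average MAE:", scores.mean())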
Feature Engineering
Handling timestamp data
# With timestamp data, train/validation splits should respect time order (train on the past, validate on the future); a split sketch follows the feature code below
clicks = click_data.copy()
clicks['day'] = clicks['click_time'].dt.day.astype('uint8')
# Fill in the rest
clicks['hour'] = clicks['click_time'].dt.hour.astype('uint8')
clicks['minute'] = clicks['click_time'].dt.minute.astype('uint8')
clicks['second'] = clicks['click_time'].dt.second.astype('uint8')
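A minimal time-ordered split, assuming clicks can be sorted by click_time (valid_fraction and the variable names are illustrative; the most recent rows are held out):
valid_fraction = 0.1
clicks_sorted = clicks.sort_values('click_time')
valid_rows = int(len(clicks_sorted) * valid_fraction)
# Train on the oldest data, validate and test on the newest
train = clicks_sorted[:-valid_rows * 2]
valid = clicks_sorted[-valid_rows * 2:-valid_rows]
test = clicks_sorted[-valid_rows:]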
Label encoding for categorical data
from sklearn import preprocessing
cat_features = ['ip', 'app', 'device', 'os', 'channel']
label_encoder = preprocessing.LabelEncoder()
# Create new columns in clicks using preprocessing.LabelEncoder()
for feature in cat_features:
    encoded = label_encoder.fit_transform(clicks[feature])
    clicks[feature + "_labels"] = encoded
# The category_encoders package provides additional categorical encodings
import category_encoders as ce
cat_features = ['category', 'currency', 'country']
# Create the encoder
count_enc = ce.CountEncoder()
# Transform the features, rename the columns with the _count suffix, and join to dataframe
count_encoded = count_enc.fit_transform(ks[cat_features])
data = ks.join(count_encoded.add_suffix("_count"))
# Create the encoder
target_enc = ce.TargetEncoder(cols=cat_features)
# Fit on the training set only, so validation targets do not leak into the encoding
target_enc.fit(train[cat_features], train['outcome'])
# Transform the features, rename the columns with _target suffix, and join to dataframe
train_TE = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid_TE = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))
CatBoost encoding works like target encoding but often scores slightly better; performance here is compared using a LightGBM model
# remove IP from the encoded features
cat_features = ['app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)
# Create the CatBoost encoder
cb_enc = ce.CatBoostEncoder(cols=cat_features)
# Learn encoding from the training set
cb_enc.fit(train[cat_features], train['is_attributed'])
# Apply encoding to the train and validation sets as new columns
# Make sure to add `_cb` as a suffix to the new columns
train_encoded = train.join(cb_enc.transform(train[cat_features]).add_suffix("_cb"))
valid_encoded = valid.join(cb_enc.transform(valid[cat_features]).add_suffix("_cb"))
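A hypothetical check of the claim above, training a small LightGBM model on the encoded columns (feature_cols and the AUC metric are assumptions, not part of the original notes):
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
feature_cols = [c + "_cb" for c in cat_features]
dtrain = lgb.Dataset(train_encoded[feature_cols], label=train_encoded['is_attributed'])
# A small model, used only to compare encodings
bst = lgb.train({'objective': 'binary'}, dtrain, num_boost_round=100)
preds = bst.predict(valid_encoded[feature_cols])
print("Validation AUC:", roc_auc_score(valid_encoded['is_attributed'], preds))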
Feature generation
# Build interaction features from pairs of related categorical columns
import itertools
cat_features = ['ip', 'app', 'device', 'os', 'channel']
interactions = pd.DataFrame(index=clicks.index)
for col1, col2 in itertools.combinations(cat_features, 2):
    col_new_name = '_'.join([col1, col2])
    new_value = clicks[col1].map(str) + "_" + clicks[col2].map(str)
    encoder = preprocessing.LabelEncoder()
    interactions[col_new_name] = encoder.fit_transform(new_value)
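The interaction columns live in their own frame; assuming the goal is to use them alongside the originals, they can simply be joined back:
clicks = clicks.join(interactions)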
def count_past_events(series, time_window='6H'):
    # Swap index and values so a time-based rolling window can be used
    series = pd.Series(series.index, index=series)
    # Count events in the preceding time_window, excluding the current one
    past_events = series.rolling(time_window).count() - 1
    return past_events
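A hypothetical call, assuming clicks['click_time'] holds datetimes sorted in ascending order (a time-based rolling window requires a monotonic index):
past_6h = count_past_events(clicks['click_time'])
clicks['events_past_6h'] = past_6h.values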
# Time since the previous click from the same IP (flagged as unverified in the original notes)
def time_diff(series):
    """Return the time since the previous timestamp, in seconds."""
    return series.diff().dt.total_seconds()
timedeltas = clicks.groupby('ip')['click_time'].transform(time_diff)