EDA
数据总览
预测目标为
TotalTimeStopped_p20, TotalTimeStopped_p50, TotalTimeStopped_p80, DistanceToFirstStop_p20, DistanceToFirstStop_p50 and DistanceToFirstStop_p80
Feature Engineering
构造特征
# Road Encoding
road_encoding = {'Street': 0, 'St': 0, 'Avenue': 1, 'Ave': 1, 'Boulevard': 2, 'Road': 3,
'Drive': 4, 'Lane': 5, 'Tunnel': 6, 'Highway': 7, 'Way': 8, 'Parkway': 9,
'Parking': 10, 'Oval': 11, 'Square': 12, 'Place': 13, 'Bridge': 14}
def encode(x):
if pd.isna(x):
return 0
for road in road_encoding.keys():
if road in x:
return road_encoding[road]
return 0
for par in [train, test]:
par['EntryType'] = par['EntryStreetName'].apply(encode)
par['ExitType'] = par['ExitStreetName'].apply(encode)
par['EntryType_1'] = pd.Series(par['EntryStreetName'].str.split().str.get(0))
par['ExitType_1'] = pd.Series(par['ExitStreetName'].str.split().str.get(0))
par['EntryType_2'] = pd.Series(par['EntryStreetName'].str.split().str.get(1))
par['ExitType_2'] = pd.Series(par['ExitStreetName'].str.split().str.get(1))
par.loc[par['EntryType_1'].isin(par['EntryType_1'].value_counts()[par['EntryType_1'].value_counts()<=500].index), 'EntryType_1'] = 'Other'
par.loc[par['ExitType_1'].isin(par['ExitType_1'].value_counts()[par['ExitType_1'].value_counts()<=500].index), 'ExitType_1'] = 'Other'
par.loc[par['EntryType_2'].isin(par['EntryType_2'].value_counts()[par['EntryType_2'].value_counts()<=500].index), 'EntryType_2'] = 'Other'
par.loc[par['ExitType_2'].isin(par['ExitType_2'].value_counts()[par['ExitType_2'].value_counts()<=500].index), 'ExitType_2'] = 'Other'
# The cardinal directions can be expressed using the equation: θ/π
# Where θ is the angle between the direction we want to encode and the north compass direction, measured clockwise.
directions = {'N': 0, 'NE': 1/4, 'E': 1/2, 'SE': 3/4, 'S': 1, 'SW': 5/4, 'W': 3/2, 'NW': 7/4}
train['EntryHeading'] = train['EntryHeading'].map(directions)
train['ExitHeading'] = train['ExitHeading'].map(directions)
test['EntryHeading'] = test['EntryHeading'].map(directions)
test['ExitHeading'] = test['ExitHeading'].map(directions)
# EntryStreetName == ExitStreetName ?
# EntryHeading == ExitHeading ?
for par in [train, test]:
par["same_street_exact"] = (par["EntryStreetName"] == par["ExitStreetName"]).astype(int)
par["same_heading_exact"] = (par["EntryHeading"] == par["ExitHeading"]).astype(int)
# We have some intersection id that are in more than one city, it is a good idea to feature cross them
for par in [train, test]:
par['Intersection'] = par['IntersectionId'].astype(str) + '_' + par['City'].astype(str)
# Add temperature (°F) of each city by month
monthly_av = {'Atlanta1': 43, 'Atlanta5': 69, 'Atlanta6': 76, 'Atlanta7': 79, 'Atlanta8': 78, 'Atlanta9': 73,
'Atlanta10': 62, 'Atlanta11': 53, 'Atlanta12': 45, 'Boston1': 30, 'Boston5': 59, 'Boston6': 68,
'Boston7': 74, 'Boston8': 73, 'Boston9': 66, 'Boston10': 55,'Boston11': 45, 'Boston12': 35,
'Chicago1': 27, 'Chicago5': 60, 'Chicago6': 70, 'Chicago7': 76, 'Chicago8': 76, 'Chicago9': 68,
'Chicago10': 56, 'Chicago11': 45, 'Chicago12': 32, 'Philadelphia1': 35, 'Philadelphia5': 66,
'Philadelphia6': 76, 'Philadelphia7': 81, 'Philadelphia8': 79, 'Philadelphia9': 72, 'Philadelphia10': 60,
'Philadelphia11': 49, 'Philadelphia12': 40}
for par in [train, test]:
# Concatenating the city and month into one variable
par['city_month'] = par["City"].astype(str) + par["Month"].astype(str)
# Creating a new column by mapping the city_month variable to it's corresponding average monthly temperature
par["average_temp"] = par['city_month'].map(monthly_av)
# Add climate data
monthly_rainfall = {'Atlanta1': 5.02, 'Atlanta5': 3.95, 'Atlanta6': 3.63, 'Atlanta7': 5.12, 'Atlanta8': 3.67, 'Atlanta9': 4.09,
'Atlanta10': 3.11, 'Atlanta11': 4.10, 'Atlanta12': 3.82, 'Boston1': 3.92, 'Boston5': 3.24, 'Boston6': 3.22,
'Boston7': 3.06, 'Boston8': 3.37, 'Boston9': 3.47, 'Boston10': 3.79,'Boston11': 3.98, 'Boston12': 3.73,
'Chicago1': 1.75, 'Chicago5': 3.38, 'Chicago6': 3.63, 'Chicago7': 3.51, 'Chicago8': 4.62, 'Chicago9': 3.27,
'Chicago10': 2.71, 'Chicago11': 3.01, 'Chicago12': 2.43, 'Philadelphia1': 3.52, 'Philadelphia5': 3.88,
'Philadelphia6': 3.29, 'Philadelphia7': 4.39, 'Philadelphia8': 3.82, 'Philadelphia9':3.88 ,
'Philadelphia10': 2.75, 'Philadelphia11': 3.16, 'Philadelphia12': 3.31}
monthly_snowfall = {'Atlanta1': 0.6, 'Atlanta5': 0, 'Atlanta6': 0, 'Atlanta7': 0, 'Atlanta8': 0, 'Atlanta9': 0,
'Atlanta10': 0, 'Atlanta11': 0, 'Atlanta12': 0.2, 'Boston1': 12.9, 'Boston5': 0, 'Boston6': 0,
'Boston7': 0, 'Boston8': 0, 'Boston9': 0, 'Boston10': 0,'Boston11': 1.3, 'Boston12': 9.0,
'Chicago1': 11.5, 'Chicago5': 0, 'Chicago6': 0, 'Chicago7': 0, 'Chicago8': 0, 'Chicago9': 0,
'Chicago10': 0, 'Chicago11': 1.3, 'Chicago12': 8.7, 'Philadelphia1': 6.5, 'Philadelphia5': 0,
'Philadelphia6': 0, 'Philadelphia7': 0, 'Philadelphia8': 0, 'Philadelphia9':0 , 'Philadelphia10': 0,
'Philadelphia11': 0.3, 'Philadelphia12': 3.4}
monthly_daylight = {'Atlanta1': 10, 'Atlanta5': 14, 'Atlanta6': 14, 'Atlanta7': 14, 'Atlanta8': 13, 'Atlanta9': 12,
'Atlanta10': 11, 'Atlanta11': 10, 'Atlanta12': 10, 'Boston1': 9, 'Boston5': 15, 'Boston6': 15,
'Boston7': 15, 'Boston8': 14, 'Boston9': 12, 'Boston10': 11,'Boston11': 10, 'Boston12': 9,
'Chicago1': 10, 'Chicago5': 15, 'Chicago6': 15, 'Chicago7': 15, 'Chicago8': 14, 'Chicago9': 12,
'Chicago10': 11, 'Chicago11': 10, 'Chicago12': 9, 'Philadelphia1': 10, 'Philadelphia5': 14,
'Philadelphia6': 15, 'Philadelphia7': 15, 'Philadelphia8': 14, 'Philadelphia9':12 , 'Philadelphia10': 11,
'Philadelphia11': 10, 'Philadelphia12': 9}
monthly_sunshine = {'Atlanta1': 5.3, 'Atlanta5': 9.3, 'Atlanta6': 9.5, 'Atlanta7': 8.8, 'Atlanta8': 8.3, 'Atlanta9': 7.6,
'Atlanta10': 7.7, 'Atlanta11': 6.2, 'Atlanta12': 5.3, 'Boston1': 5.3, 'Boston5': 8.6, 'Boston6': 9.6,
'Boston7': 9.7, 'Boston8': 8.9, 'Boston9': 7.9, 'Boston10': 6.7,'Boston11': 4.8, 'Boston12': 4.6,
'Chicago1': 4.4, 'Chicago5': 9.1, 'Chicago6': 10.4, 'Chicago7': 10.3, 'Chicago8': 9.1, 'Chicago9': 7.6,
'Chicago10': 6.2, 'Chicago11': 3.6, 'Chicago12': 3.4, 'Philadelphia1': 5.0, 'Philadelphia5': 7.9,
'Philadelphia6': 9.0, 'Philadelphia7': 8.9, 'Philadelphia8': 8.4, 'Philadelphia9':7.9 ,
'Philadelphia10': 6.6, 'Philadelphia11': 5.2, 'Philadelphia12': 4.4}
for par in [train, test]:
# Creating a new column by mapping the city_month variable to it's corresponding average monthly rainfall
par["average_rainfall"] = par['city_month'].map(monthly_rainfall)
# Creating a new column by mapping the city_month variable to it's corresponding average monthly snowfall
par['average_snowfall'] = par['city_month'].map(monthly_snowfall)
# Creating a new column by mapping the city_month variable to it's corresponding average monthly daylight
par["average_daylight"] = par['city_month'].map(monthly_daylight)
# Creating a new column by mapping the city_month variable to it's corresponding average monthly sunsine
par["average_sunshine"] = par['city_month'].map(monthly_sunshine)
# drop city month
train.drop('city_month', axis=1, inplace=True)
test.drop('city_month', axis=1, inplace=True)
# Add feature is day
train['is_day'] = train['Hour'].apply(lambda x: 1 if 5 < x < 20 else 0)
test['is_day'] = test['Hour'].apply(lambda x: 1 if 5 < x < 20 else 0)
# fill NaN categories
train.fillna(-999, inplace = True)
test.fillna(-999, inplace = True)
# distance from the center of the city
def add_distance(df):
df_center = pd.DataFrame({"Atlanta":[33.753746, -84.386330],
"Boston":[42.361145, -71.057083],
"Chicago":[41.881832, -87.623177],
"Philadelphia":[39.952583, -75.165222]})
df["CenterDistance"] = df.apply(lambda row: math.sqrt((df_center[row.City][0] - row.Latitude) ** 2 +
(df_center[row.City][1] - row.Longitude) ** 2) , axis=1)
add_distance(train)
add_distance(test)
# frequency encode
def encode_FE(df1, df2, cols):
for col in cols:
df = pd.concat([df1[col],df2[col]])
vc = df.value_counts(dropna=True, normalize=True).to_dict()
nm = col+'_FE'
df1[nm] = df1[col].map(vc)
df1[nm] = df1[nm].astype('float32')
df2[nm] = df2[col].map(vc)
df2[nm] = df2[nm].astype('float32')
print(nm,', ',end='')
# COMBINE FEATURES
def encode_CB(col1, col2 , df1 = train, df2 = test):
nm = col1+'_'+col2
df1[nm] = df1[col1].astype(str)+'_'+df1[col2].astype(str)
df2[nm] = df2[col1].astype(str)+'_'+df2[col2].astype(str)
print(nm,', ',end='')
# group aggregations nunique
def encode_AG2(main_columns, agg_col, train_df = train, test_df = test):
for main_column in main_columns:
for col in agg_col:
comb = pd.concat([train_df[[col]+[main_column]],test_df[[col]+[main_column]]],axis=0)
mp = comb.groupby(col)[main_column].agg(['nunique'])['nunique'].to_dict()
train_df[col+'_'+main_column+'_ct'] = train_df[col].map(mp).astype('float32')
test_df[col+'_'+main_column+'_ct'] = test_df[col].map(mp).astype('float32')
print(col+'_'+main_column+'_ct, ',end='')
def encode_AG(main_columns, agg_col, aggregations=['mean'], train_df = train, test_df = test, fillna=True, usena=False):
# aggregation of main agg_cols
for main_column in main_columns:
for col in agg_col:
for agg_type in aggregations:
new_col_name = main_column+'_'+col+'_'+agg_type
temp_df = pd.concat([train_df[[col, main_column]], test_df[[col,main_column]]])
if usena: temp_df.loc[temp_df[main_column]==-1,main_column] = np.nan
temp_df = temp_df.groupby([col])[main_column].agg([agg_type]).reset_index().rename(
columns={agg_type: new_col_name})
temp_df.index = list(temp_df[col])
temp_df = temp_df[new_col_name].to_dict()
train_df[new_col_name] = train_df[col].map(temp_df).astype('float32')
test_df[new_col_name] = test_df[col].map(temp_df).astype('float32')
if fillna:
train_df[new_col_name].fillna(-1,inplace=True)
test_df[new_col_name].fillna(-1,inplace=True)
print("'"+new_col_name+"'",', ',end='')
# Frequency encode
encode_FE(train, test, ['Hour', 'Month', 'EntryType', 'ExitType', 'EntryType_1', 'EntryType_2', 'ExitType_1', 'ExitType_2', 'Intersection', 'City'])
# Agreggations of main columns
encode_AG(['Longitude', 'Latitude', 'CenterDistance', 'EntryHeading', 'ExitHeading'], ['Hour', 'Weekend', 'Month', 'Intersection'], ['mean', 'std'])
# bucketize lat and lon
temp_df = pd.concat([train[['Latitude', 'Longitude']], test[['Latitude', 'Longitude']]]).reset_index(drop = True)
temp_df['Latitude_B'] = pd.cut(temp_df['Latitude'], 30)
temp_df['Longitude_B'] = pd.cut(temp_df['Longitude'], 30)
# feature cross lat and lon
temp_df['Latitude_B_Longitude_B'] = temp_df['Latitude_B'].astype(str) + '_' + temp_df['Longitude_B'].astype(str)
train['Latitude_B'] = temp_df.loc[:(train.shape[0]), 'Latitude_B']
test['Latitude_B'] = temp_df.loc[(train.shape[0]):, 'Latitude_B']
train['Longitude_B'] = temp_df.loc[:(train.shape[0]), 'Longitude_B']
test['Longitude_B'] = temp_df.loc[(train.shape[0]):, 'Longitude_B']
train['Latitude_B_Longitude_B'] = temp_df.loc[:(train.shape[0]), 'Latitude_B_Longitude_B']
test['Latitude_B_Longitude_B'] = temp_df.loc[(train.shape[0]):, 'Latitude_B_Longitude_B']
# feature crosses hour with month
encode_CB('Hour', 'Month')
# group aggregations nunique
encode_AG2(['Intersection', 'Latitude_B_Longitude_B'], ['Hour', 'Month'])
# label encode
for i,f in enumerate(train.columns):
if (np.str(train[f].dtype)=='category')|(train[f].dtype=='object'):
df_comb = pd.concat([train[f],test[f]],axis=0)
df_comb,_ = df_comb.factorize(sort=True)
if df_comb.max()>32000: print(f,'needs int32')
train[f] = df_comb[:len(train)].astype('int16')
test[f] = df_comb[len(train):].astype('int16')
得到特征
Hour_FE , Month_FE , EntryType_FE , ExitType_FE , EntryType_1_FE , EntryType_2_FE , ExitType_1_FE , ExitType_2_FE , Intersection_FE , City_FE , ‘Longitude_Hour_mean’ , ‘Longitude_Hour_std’ , ‘Longitude_Weekend_mean’ , ‘Longitude_Weekend_std’ , ‘Longitude_Month_mean’ , ‘Longitude_Month_std’ , ‘Longitude_Intersection_mean’ , ‘Longitude_Intersection_std’ , ‘Latitude_Hour_mean’ , ‘Latitude_Hour_std’ , ‘Latitude_Weekend_mean’ , ‘Latitude_Weekend_std’ , ‘Latitude_Month_mean’ , ‘Latitude_Month_std’ , ‘Latitude_Intersection_mean’ , ‘Latitude_Intersection_std’ , ‘CenterDistance_Hour_mean’ , ‘CenterDistance_Hour_std’ , ‘CenterDistance_Weekend_mean’ , ‘CenterDistance_Weekend_std’ , ‘CenterDistance_Month_mean’ , ‘CenterDistance_Month_std’ , ‘CenterDistance_Intersection_mean’ , ‘CenterDistance_Intersection_std’ , ‘EntryHeading_Hour_mean’ , ‘EntryHeading_Hour_std’ , ‘EntryHeading_Weekend_mean’ , ‘EntryHeading_Weekend_std’ , ‘EntryHeading_Month_mean’ , ‘EntryHeading_Month_std’ , ‘EntryHeading_Intersection_mean’ , ‘EntryHeading_Intersection_std’ , ‘ExitHeading_Hour_mean’ , ‘ExitHeading_Hour_std’ , ‘ExitHeading_Weekend_mean’ , ‘ExitHeading_Weekend_std’ , ‘ExitHeading_Month_mean’ , ‘ExitHeading_Month_std’ , ‘ExitHeading_Intersection_mean’ , ‘ExitHeading_Intersection_std’ , Hour_Month , Hour_Intersection_ct, Month_Intersection_ct, Hour_Latitude_B_Longitude_B_ct, Month_Latitude_B_Longitude_B_ct,
特征选择
param = {'max_depth': 20,
'learning_rate': 0.1,
'objective': 'regression',
'boosting_type': 'gbdt',
'verbose': 1,
'metric': 'rmse',
'seed': 42,
'n_jobs': 12}
def run_lgb(train, target):
original_columns = ['IntersectionId', 'Latitude', 'Longitude', 'EntryStreetName','ExitStreetName', 'EntryHeading',
'ExitHeading', 'Hour', 'Weekend', 'Month', 'City', 'EntryType', 'ExitType']
train_columns = list(train.columns[13:])
usefull_columns = []
not_usefull_columns = []
best_score = 0
train_tmp = train[original_columns]
print('Training with {} features'.format(train_tmp.shape[1]))
x_train, x_val, y_train, y_val = train_test_split(train_tmp, target, test_size = 0.2, random_state = 42)
xg_train = lgb.Dataset(x_train, label = y_train)
xg_valid = lgb.Dataset(x_val, label= y_val)
clf = lgb.train(param, xg_train, 100000, valid_sets = [xg_train, xg_valid], verbose_eval = 3000,
early_stopping_rounds = 100)
predictions = clf.predict(x_val)
rmse_score = np.sqrt(mean_squared_error(y_val, predictions))
print("RMSE baseline val score: ", rmse_score)
best_score = rmse_score
for num, i in enumerate(train_columns):
train_tmp = train[original_columns + usefull_columns + [i]]
print('Training with {} features'.format(train_tmp.shape[1]))
x_train, x_val, y_train, y_val = train_test_split(train_tmp, target, test_size = 0.2, random_state = 42)
xg_train = lgb.Dataset(x_train, label = y_train)
xg_valid = lgb.Dataset(x_val, label= y_val)
clf = lgb.train(param, xg_train, 100000, valid_sets = [xg_train, xg_valid], verbose_eval = 3000,
early_stopping_rounds = 100)
predictions = clf.predict(x_val)
rmse_score = np.sqrt(mean_squared_error(y_val, predictions))
print("RMSE val score: ", rmse_score)
if rmse_score < best_score:
print('Column {} is usefull'.format(i))
best_score = rmse_score
usefull_columns.append(i)
else:
print('Column {} is not usefull'.format(i))
not_usefull_columns.append(i)
print('Best rmse score for iteration {} is {}'.format(num + 1, best_score))
return usefull_columns, not_usefull_columns
usefull_columns, not_usefull_columns = run_lgb(train, target)
得到有效特征
usefull_columns=[‘same_heading_exact’, ‘Intersection’, ‘Hour_FE’,
‘Intersection_FE’, ‘City_FE’,
‘Longitude_Hour_std’, ‘Longitude_Weekend_std’,
‘Longitude_Month_mean’, ‘Longitude_Intersection_mean’,
‘Latitude_Weekend_std’, ‘CenterDistance_Month_std’,
‘ExitHeading_Intersection_mean’, ‘ExitHeading_Intersection_std’]
final_features = usefull_columns + [‘IntersectionId’, ‘Latitude’, ‘Longitude’, ‘EntryStreetName’,‘ExitStreetName’,
‘EntryHeading’, ‘ExitHeading’, ‘Hour’, ‘Weekend’, ‘Month’, ‘City’, ‘EntryType’, ‘ExitType’]
参考大佬1
参考大佬2
参考大佬3