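# 机器学习算法讲堂(一) 十分钟入门机器学习算法竞赛
# (Machine Learning Algorithms Lecture 1: a ten-minute introduction to ML competitions)
# 比赛地址 (competition page): https://www.kaggle.com/c/new-york-city-taxi-fare-prediction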
import pandas as pd
import numpy as np
# Load only the first million training rows to keep memory use manageable.
# read_csv reference:
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
file = pd.read_csv('./data/train.csv', nrows=1_000_000)
print(file.head())
print(file.shape)
# Discard every row that contains at least one missing value.
file = file.dropna(axis=0, how='any')
def clean_df(df):
    """Return the rows of ``df`` with a positive fare and 1-9 passengers.

    A bounding-box filter on the pickup/dropoff coordinates (longitude in
    (-80, -70), latitude in (35, 45)) is intentionally not applied here; it
    was disabled in the original implementation.
    """
    fare_is_positive = df["fare_amount"] > 0
    passengers = df["passenger_count"]
    passenger_count_ok = (passengers > 0) & (passengers < 10)
    return df[fare_is_positive & passenger_count_ok]
# Apply the row filter, then report the surviving row count and frame shape.
file = clean_df(file)
print(len(file.index))
print(file.shape)
def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the great-circle (haversine) distance in km between two points.

    All four arguments are coordinates in decimal degrees. Each may be a
    scalar, a NumPy array or a pandas Series; the usual NumPy/pandas
    broadcasting rules apply, so a Series of pickups can be compared with a
    single fixed point.
    """
    # Earth radius (km)
    R_earth = 6371
    # Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(
        np.radians, [pickup_lat, pickup_lon, dropoff_lat, dropoff_lon])
    # Differences along the lat, lon dimensions
    dlat = dropoff_lat - pickup_lat
    dlon = dropoff_lon - pickup_lon
    # Haversine term
    a = (np.sin(dlat / 2.0) ** 2
         + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon / 2.0) ** 2)
    # Mathematically 0 <= a <= 1, but floating-point rounding can land a hair
    # outside that range (e.g. for near-antipodal points), which would make
    # sqrt/arcsin return NaN. Clamp it; genuine NaN inputs still propagate.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    return 2 * R_earth * np.arcsin(np.sqrt(a))
def add_airport_dist(dataset):
    """
    Add landmark-distance, trip-geometry and airport-flag columns to ``dataset``.

    ``dataset`` must contain ``pickup_latitude``, ``pickup_longitude``,
    ``dropoff_latitude`` and ``dropoff_longitude`` (decimal degrees). The
    frame is modified in place and also returned.

    Columns added, in this order:
      * ``jfk_dist``, ``ewr_dist``, ``lga_dist``, ``washington_dist``: the
        smaller of the pickup and dropoff haversine distances (km) to
        JFK: John F. Kennedy International Airport,
        EWR: Newark Liberty International Airport,
        LGA: LaGuardia Airport, and Washington Square.
      * ``longitude_distance``, ``latitude_distance``: absolute coordinate
        differences between pickup and dropoff.
      * ``distance_travelled`` and four sin/cos derived columns.
      * ``jfk``, ``lga``, ``ewr``: 1 when the pickup or the dropoff lies
        inside the airport's bounding box, otherwise 0.
    """
    # Reference points as (latitude, longitude) in decimal degrees.
    jfk_coord = (40.639722, -73.778889)
    ewr_coord = (40.6925, -74.168611)
    lga_coord = (40.77725, -73.872611)
    # Washington Square Park lies at 40°43'51"N 73°59'51"W. The value used
    # previously, (40.4351, -73.5951), appears to be that degrees/minutes/
    # seconds reading typed as a decimal, which puts the point tens of km
    # away from the park. Converted to decimal degrees here.
    washington_square = (40.730833, -73.9975)

    pickup_lat = dataset['pickup_latitude']
    dropoff_lat = dataset['dropoff_latitude']
    pickup_lon = dataset['pickup_longitude']
    dropoff_lon = dataset['dropoff_longitude']

    def nearest_endpoint_dist(coord):
        """Smaller of the pickup and dropoff distances (km) to ``coord``."""
        from_pickup = sphere_dist(pickup_lat, pickup_lon, coord[0], coord[1])
        from_dropoff = sphere_dist(coord[0], coord[1], dropoff_lat, dropoff_lon)
        # Row-wise min over a two-column frame skips a NaN on one side.
        return pd.concat([from_pickup, from_dropoff], axis=1).min(axis=1)

    def touches_box(lon_min, lon_max, lat_min, lat_max):
        """1 where the pickup or dropoff is inside the (inclusive) box, else 0."""
        pickup_inside = ((pickup_lon >= lon_min) & (pickup_lon <= lon_max) &
                         (pickup_lat <= lat_max) & (pickup_lat >= lat_min))
        dropoff_inside = ((dropoff_lon >= lon_min) & (dropoff_lon <= lon_max) &
                          (dropoff_lat <= lat_max) & (dropoff_lat >= lat_min))
        return (pickup_inside | dropoff_inside).astype('int64')

    dataset['jfk_dist'] = nearest_endpoint_dist(jfk_coord)
    dataset['ewr_dist'] = nearest_endpoint_dist(ewr_coord)
    dataset['lga_dist'] = nearest_endpoint_dist(lga_coord)
    dataset['washington_dist'] = nearest_endpoint_dist(washington_square)

    dataset['longitude_distance'] = (pickup_lon - dropoff_lon).abs()
    dataset['latitude_distance'] = (pickup_lat - dropoff_lat).abs()

    # Straight distance (in degrees, not km)
    dataset['distance_travelled'] = (dataset['longitude_distance'] ** 2 +
                                     dataset['latitude_distance'] ** 2) ** .5
    # NOTE(review): the sin/cos features are built from the PRODUCT of the
    # squared differences, not their sum, so they are functions of
    # longitude_distance * latitude_distance rather than of
    # distance_travelled. Kept as-is; confirm whether this is intended.
    lon_lat_product = (dataset['longitude_distance'] ** 2 *
                       dataset['latitude_distance'] ** 2) ** .5
    dataset['distance_travelled_sin'] = np.sin(lon_lat_product)
    dataset['distance_travelled_cos'] = np.cos(lon_lat_product)
    dataset['distance_travelled_sin_sqrd'] = np.sin(lon_lat_product) ** 2
    dataset['distance_travelled_cos_sqrd'] = np.cos(lon_lat_product) ** 2

    # Airport flags: bounding boxes given as lon_min, lon_max, lat_min, lat_max.
    dataset['jfk'] = touches_box(-73.7841, -73.7721, 40.6213, 40.6613)
    dataset['lga'] = touches_box(-73.8870, -73.8580, 40.7680, 40.7800)
    dataset['ewr'] = touches_box(-74.192, -74.172, 40.676, 40.708)
    return dataset
def add_datetime_info(dataset):
    """Parse ``pickup_datetime`` and add calendar feature columns.

    The ``pickup_datetime`` column (strings shaped like
    ``"2015-01-27 13:08:24 UTC"``) is replaced by parsed timestamps, and the
    columns ``hour``, ``day``, ``month``, ``weekday`` and ``year`` are added
    in that order. ``dataset`` is modified in place and returned.
    """
    stamps = pd.to_datetime(dataset['pickup_datetime'],
                            format="%Y-%m-%d %H:%M:%S UTC")
    dataset['pickup_datetime'] = stamps
    for part in ('hour', 'day', 'month', 'weekday', 'year'):
        dataset[part] = getattr(stamps.dt, part)
    return dataset
# --- Training frame: calendar features, landmark features, haversine distance.
file = add_airport_dist(add_datetime_info(file))
# The raw timestamp is dropped once its parts have been extracted.
file = file.drop(columns=['pickup_datetime'])
file['distance'] = sphere_dist(
    file['pickup_latitude'],
    file['pickup_longitude'],
    file['dropoff_latitude'],
    file['dropoff_longitude'],
)
file.head()

# --- Test frame: exactly the same feature pipeline as the training frame.
test_file = pd.read_csv('./data/test.csv')
test_file = add_airport_dist(add_datetime_info(test_file))
test_file = test_file.drop(columns=['pickup_datetime'])
test_file['distance'] = sphere_dist(
    test_file['pickup_latitude'],
    test_file['pickup_longitude'],
    test_file['dropoff_latitude'],
    test_file['dropoff_longitude'],
)
test_file.head()
import datetime as dt
from sklearn.model_selection import train_test_split
import xgboost as xgb
import os
# Separate the regression target from the feature columns.
y = file['fare_amount']
train_x = file.drop(columns=['fare_amount'])
new_test = test_file

from sklearn.preprocessing import LabelEncoder

# Label-encode every datetime/object column, fitting each encoder on the
# union of train and test values so both frames share the same integer codes.
for c in train_x.columns:
    if train_x[c].dtype not in ('datetime64[ns]', 'object'):
        continue
    train_values = list(train_x[c].values)
    test_values = list(test_file[c].values)
    lbl = LabelEncoder()
    lbl.fit(train_values + test_values)
    train_x[c] = lbl.transform(train_values)
    test_file[c] = lbl.transform(test_values)
print(test_file.head())

# Hold out 1% of the training rows as a validation split.
x_train, x_test, y_train, y_test = train_test_split(
    train_x, y, test_size=0.01, random_state=0)
'''
for x in range(0,len(x_train['pickup_datetime'])):
try:
time = ''
for time_ac in str(x_train['pickup_datetime'].loc[x]):
if time_ac <= '9' and time_ac >= '0':
time = time + time_ac
x_train['pickup_datetime'].loc[x] = time
except:
x_train['pickup_datetime'].loc[x] = 0
x_train['pickup_datetime'].astype('int64')
'''
# Inspect the training split: column dtypes, then the first few rows.
print(x_train.dtypes)
# `head` has to be called; printing the bare attribute shows the bound
# method's repr instead of the first rows.
print(x_train.head())
'''
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)
eta = 0.1
max_depth = 8
subsample = 0.8
colsample_bytree = 0.8
print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
params = {
"objective": "reg:linear",
"booster" : "gbtree",
"eval_metric": "rmse",
"eta": eta,
"max_depth": max_depth,
"subsample": subsample,
"colsample_bytree": colsample_bytree,
"silent": 1,
"seed": 19960429
}
watchlist = [(dtrain,'train'),(dtest,'val')]
num_round = 3000
early_stopping_rounds=50
bst = xgb.train(params, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)
'''
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import matplotlib.pylab as plt
# `time` is used for the runtime report below, but the file's own
# `import time` only appears after this section, so running the script top to
# bottom raised NameError. Import it here so this section stands on its own.
import time

# Keep Relevant Variables..
trainshape = train_x.shape
testshape = test_file.shape

# LGBM Dataset Formatting.
# free_raw_data=False keeps the raw frame reachable as `dtrain.data`, which is
# used below to predict on each validation fold.
dtrain = lgb.Dataset(train_x, label=y, free_raw_data=False)
print("Light Gradient Boosting Regressor: ")
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 7,
    'learning_rate': .1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
folds = KFold(n_splits=5, shuffle=True, random_state=1)
# Test predictions averaged over the folds / out-of-fold train predictions.
fold_preds = np.zeros(testshape[0])
oof_preds = np.zeros(trainshape[0])
dtrain.construct()

# Fit 5 Folds
modelstart = time.time()
# Split on the feature frame the Dataset was built from (same row count as
# `file`, so the fold indices are unchanged).
for trn_idx, val_idx in folds.split(train_x):
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keywords are
    # understood to have been removed from `lgb.train` in LightGBM 4.x in
    # favour of callbacks — confirm against the installed version.
    clf = lgb.train(
        params=lgbm_params,
        train_set=dtrain.subset(trn_idx),
        valid_sets=dtrain.subset(val_idx),
        num_boost_round=17000,
        early_stopping_rounds=250,
        verbose_eval=500
    )
    oof_preds[val_idx] = clf.predict(dtrain.data.iloc[val_idx])
    # Each fold contributes an equal share to the final test prediction.
    fold_preds += clf.predict(test_file) / folds.n_splits
    # Validation RMSE for this fold.
    print(mean_squared_error(y.iloc[val_idx], oof_preds[val_idx]) ** .5)
    # lgb.plot_importance(clf, max_num_features=30)
print("Model Runtime: %0.2f Minutes" % ((time.time() - modelstart) / 60))
import time
# Only needed by the disabled XGBoost prediction on the next line; kept so the
# script's behaviour is otherwise unchanged.
Ttest = xgb.DMatrix(test_file)
# ypred = bst.predict(Ttest)
ypred = fold_preds
# Re-read the raw test file to recover the original `key` column for the
# submission (columns of `test_file` may have been label-encoded earlier).
new_test = pd.read_csv('./data/test.csv')
output = pd.DataFrame({'key': new_test['key'], 'fare_amount': ypred})
print(output.head())
# Timestamp the output file so successive runs do not overwrite each other.
# Named `timestamp` rather than `dt` so the `import datetime as dt` alias
# is not rebound to a string.
timestamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
output.to_csv('.//data//ans' + timestamp + '.csv', index=False)