# New York City Taxi Trip Duration纽约出租车大数据探索
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
train=pd.read_csv(r"D:\2018_BigData\Python\Kaggle_learning\New York City Taxi Trip Duration\train.csv")
train.head()
| id | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | store_and_fwd_flag | trip_duration |
---|
0 | id2875421 | 2 | 2016-03-14 17:24:55 | 2016-03-14 17:32:30 | 1 | -73.982155 | 40.767937 | -73.964630 | 40.765602 | N | 455 |
---|
1 | id2377394 | 1 | 2016-06-12 00:43:35 | 2016-06-12 00:54:38 | 1 | -73.980415 | 40.738564 | -73.999481 | 40.731152 | N | 663 |
---|
2 | id3858529 | 2 | 2016-01-19 11:35:24 | 2016-01-19 12:10:48 | 1 | -73.979027 | 40.763939 | -74.005333 | 40.710087 | N | 2124 |
---|
3 | id3504673 | 2 | 2016-04-06 19:32:31 | 2016-04-06 19:39:40 | 1 | -74.010040 | 40.719971 | -74.012268 | 40.706718 | N | 429 |
---|
4 | id2181028 | 2 | 2016-03-26 13:30:55 | 2016-03-26 13:38:10 | 1 | -73.973053 | 40.793209 | -73.972923 | 40.782520 | N | 435 |
---|
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id 1458644 non-null object
vendor_id 1458644 non-null int64
pickup_datetime 1458644 non-null object
dropoff_datetime 1458644 non-null object
passenger_count 1458644 non-null int64
pickup_longitude 1458644 non-null float64
pickup_latitude 1458644 non-null float64
dropoff_longitude 1458644 non-null float64
dropoff_latitude 1458644 non-null float64
store_and_fwd_flag 1458644 non-null object
trip_duration 1458644 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB
train.describe()
| vendor_id | passenger_count | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | trip_duration |
---|
count | 1.458644e+06 | 1.458644e+06 | 1.458644e+06 | 1.458644e+06 | 1.458644e+06 | 1.458644e+06 | 1.458644e+06 |
---|
mean | 1.534950e+00 | 1.664530e+00 | -7.397349e+01 | 4.075092e+01 | -7.397342e+01 | 4.075180e+01 | 9.594923e+02 |
---|
std | 4.987772e-01 | 1.314242e+00 | 7.090186e-02 | 3.288119e-02 | 7.064327e-02 | 3.589056e-02 | 5.237432e+03 |
---|
min | 1.000000e+00 | 0.000000e+00 | -1.219333e+02 | 3.435970e+01 | -1.219333e+02 | 3.218114e+01 | 1.000000e+00 |
---|
25% | 1.000000e+00 | 1.000000e+00 | -7.399187e+01 | 4.073735e+01 | -7.399133e+01 | 4.073588e+01 | 3.970000e+02 |
---|
50% | 2.000000e+00 | 1.000000e+00 | -7.398174e+01 | 4.075410e+01 | -7.397975e+01 | 4.075452e+01 | 6.620000e+02 |
---|
75% | 2.000000e+00 | 2.000000e+00 | -7.396733e+01 | 4.076836e+01 | -7.396301e+01 | 4.076981e+01 | 1.075000e+03 |
---|
max | 2.000000e+00 | 9.000000e+00 | -6.133553e+01 | 5.188108e+01 | -6.133553e+01 | 4.392103e+01 | 3.526282e+06 |
---|
pd.set_option('display.max_columns', None)
train[["passenger_count","trip_duration"]].describe()
| passenger_count | trip_duration |
---|
count | 1.458644e+06 | 1.458644e+06 |
---|
mean | 1.664530e+00 | 9.594923e+02 |
---|
std | 1.314242e+00 | 5.237432e+03 |
---|
min | 0.000000e+00 | 1.000000e+00 |
---|
25% | 1.000000e+00 | 3.970000e+02 |
---|
50% | 1.000000e+00 | 6.620000e+02 |
---|
75% | 2.000000e+00 | 1.075000e+03 |
---|
max | 9.000000e+00 | 3.526282e+06 |
---|
import datetime
from datetime import datetime
train.pickup_datetime = train.pickup_datetime.apply(lambda x:datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
train["pickup_yearmonth"] = train["pickup_datetime"].map(lambda x: 100*x.year + x.month)
train.head(1)
| id | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | store_and_fwd_flag | trip_duration | pickup_yearmonth |
---|
0 | id2875421 | 2 | 2016-03-14 17:24:55 | 2016-03-14 17:32:30 | 1 | -73.982155 | 40.767937 | -73.96463 | 40.765602 | N | 455 | 201603 |
---|
train["pickup_yearmonth"].value_counts()
201603 256189
201604 251645
201605 248487
201602 238300
201606 234316
201601 229707
Name: pickup_yearmonth, dtype: int64
train["pickup_month"] = train["pickup_datetime"].map(lambda x: x.month)
train.head(1)
| id | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | store_and_fwd_flag | trip_duration | pickup_yearmonth | pickup_month |
---|
0 | id2875421 | 2 | 2016-03-14 17:24:55 | 2016-03-14 17:32:30 | 1 | -73.982155 | 40.767937 | -73.96463 | 40.765602 | N | 455 | 201603 | 3 |
---|
train["pickup_week"]=train.pickup_datetime.apply(lambda x: x.weekday())
train.head(2)
| id | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | store_and_fwd_flag | trip_duration | pickup_yearmonth | pickup_month | pickup_week |
---|
0 | id2875421 | 2 | 2016-03-14 17:24:55 | 2016-03-14 17:32:30 | 1 | -73.982155 | 40.767937 | -73.964630 | 40.765602 | N | 455 | 201603 | 3 | 0 |
---|
1 | id2377394 | 1 | 2016-06-12 00:43:35 | 2016-06-12 00:54:38 | 1 | -73.980415 | 40.738564 | -73.999481 | 40.731152 | N | 663 | 201606 | 6 | 6 |
---|
train["pickup_day"]=train.pickup_datetime.apply(lambda x: x.day)
train.head(2)
| id | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | store_and_fwd_flag | trip_duration | pickup_yearmonth | pickup_month | pickup_week | pickup_day |
---|
0 | id2875421 | 2 | 2016-03-14 17:24:55 | 2016-03-14 17:32:30 | 1 | -73.982155 | 40.767937 | -73.964630 | 40.765602 | N | 455 | 201603 | 3 | 0 | 14 |
---|
1 | id2377394 | 1 | 2016-06-12 00:43:35 | 2016-06-12 00:54:38 | 1 | -73.980415 | 40.738564 | -73.999481 | 40.731152 | N | 663 | 201606 | 6 | 6 | 12 |
---|
train["pickup_date"]=train.pickup_datetime.values.astype("datetime64[D]")
train.head(2)
| id | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | store_and_fwd_flag | trip_duration | pickup_yearmonth | pickup_month | pickup_week | pickup_day | pickup_date |
---|
0 | id2875421 | 2 | 2016-03-14 17:24:55 | 2016-03-14 17:32:30 | 1 | -73.982155 | 40.767937 | -73.964630 | 40.765602 | N | 455 | 201603 | 3 | 0 | 14 | 2016-03-14 |
---|
1 | id2377394 | 1 | 2016-06-12 00:43:35 | 2016-06-12 00:54:38 | 1 | -73.980415 | 40.738564 | -73.999481 | 40.731152 | N | 663 | 201606 | 6 | 6 | 12 | 2016-06-12 |
---|
month_trip=train.groupby(['pickup_month'])["trip_duration"].agg(["sum","mean","count"])
month_trip=month_trip.reset_index()
month_trip.head(2)
| pickup_month | sum | mean | count |
---|
0 | 1 | 211875608 | 922.373319 | 229707 |
---|
1 | 2 | 219433897 | 920.830453 | 238300 |
---|
month_trip.rename(columns={'sum':'month_sum_trip_dur','mean':'month_avg_trip_dur','count':"month_trip_times"}, inplace = True)
month_trip.head(2)
| pickup_month | month_sum_trip_dur | month_avg_trip_dur | month_trip_times |
---|
0 | 1 | 211875608 | 922.373319 | 229707 |
---|
1 | 2 | 219433897 | 920.830453 | 238300 |
---|
date_trip=train.groupby(['pickup_date'])["trip_duration"].agg(["sum","mean","count"])
date_trip=date_trip.reset_index()
date_trip.rename(columns={'sum':'date_sum_trip_duration','mean':'date_avg_trip_dur','count':"date_trip_times"}, inplace = True)
date_trip.head()
| pickup_date | date_sum_trip_duration | date_avg_trip_dur | date_trip_times |
---|
0 | 2016-01-01 | 6593910 | 920.679978 | 7162 |
---|
1 | 2016-01-02 | 5470632 | 840.084767 | 6512 |
---|
2 | 2016-01-03 | 5874410 | 924.667086 | 6353 |
---|
3 | 2016-01-04 | 5723773 | 851.118662 | 6725 |
---|
4 | 2016-01-05 | 10484304 | 1455.344808 | 7204 |
---|
day_trip=train.groupby(['pickup_day'])["trip_duration"].agg(["sum","mean","count"])
day_trip=day_trip.reset_index()
day_trip.rename(columns={'sum':'day_sum_trip_duration','mean':'day_avg_trip_dur','count':"day_trip_times"}, inplace = True)
day_trip.head()
| pickup_day | day_sum_trip_duration | day_avg_trip_dur | day_trip_times |
---|
0 | 1 | 44656812 | 958.053978 | 46612 |
---|
1 | 2 | 44354937 | 928.860299 | 47752 |
---|
2 | 3 | 46806173 | 976.247221 | 47945 |
---|
3 | 4 | 47050568 | 947.549451 | 49655 |
---|
4 | 5 | 51193213 | 1020.272899 | 50176 |
---|
plt.figure(figsize=(16,18))
plt.subplot(321)
ax1= plt.plot(month_trip.pickup_month, month_trip.month_trip_times, color="green", alpha=0.8, label='trip times',marker='*')
plt.title("Monthly Trip Times")
plt.ylabel('# of trip times', fontsize=12)
plt.xlabel('Pickup_Month', fontsize=12)
plt.subplot(322)
ax2= plt.plot(month_trip.pickup_month, month_trip.month_avg_trip_dur, color="green",alpha=0.8,marker='o')
plt.title("Monthly Trip Duration")
plt.ylabel('Avg of trip druration', fontsize=12)
plt.xlabel('Pickup_Month', fontsize=12)
plt.subplot(312)
ax2= plt.plot(date_trip.pickup_date, date_trip.date_trip_times, color="red", alpha=0.8,marker='o')
plt.title("Date Trip Time")
plt.ylabel('Date trip time', fontsize=12)
plt.xlabel('Pickup_Date', fontsize=12)
plt.subplot(313)
ax2= plt.plot(day_trip.pickup_day, day_trip.day_trip_times, color="red", alpha=0.8,marker='*')
plt.title("Day Trip Time")
plt.ylabel('Day trip time', fontsize=12)
plt.xlabel('Pickup_Day', fontsize=12)
plt.show()
date_trip[date_trip.date_trip_times<6000]
| pickup_date | date_sum_trip_duration | date_avg_trip_dur | date_trip_times |
---|
22 | 2016-01-23 | 1691754 | 1026.549757 | 1648 |
---|
23 | 2016-01-24 | 3052107 | 902.189477 | 3383 |
---|
150 | 2016-05-30 | 4568228 | 820.148654 | 5570 |
---|
x = train.groupby(['pickup_day'])["passenger_count"].agg(["mean"])
x = x.reset_index()
day_trip["avg_passenger_count"] = x["mean"]
day_trip.head()
| pickup_day | day_sum_trip_duration | day_avg_trip_dur | day_trip_times | avg_passenger_count |
---|
0 | 1 | 44656812 | 958.053978 | 46612 | 1.682871 |
---|
1 | 2 | 44354937 | 928.860299 | 47752 | 1.666611 |
---|
2 | 3 | 46806173 | 976.247221 | 47945 | 1.655355 |
---|
3 | 4 | 47050568 | 947.549451 | 49655 | 1.643279 |
---|
4 | 5 | 51193213 | 1020.272899 | 50176 | 1.670301 |
---|
plt.figure(figsize=(12,12))
plt.subplot(211)
ax2= plt.plot(day_trip.pickup_day, day_trip.avg_passenger_count, color="grey", alpha=0.8,marker='o')
plt.title("Day Trip Passenger")
plt.ylabel('Day trip passenger', fontsize=12)
plt.xlabel('Pickup_Day', fontsize=12)
plt.subplot(212)
ax2= plt.plot(day_trip.pickup_day, day_trip.day_trip_times, color="grey", alpha=0.9,marker='*')
plt.title("Day Trip Time")
plt.ylabel('Day trip time', fontsize=12)
plt.xlabel('Pickup_Day', fontsize=12)
plt.show()
train["pickup_hour"]=train.pickup_datetime.apply(lambda x: x.hour)
train.head()
| id | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | store_and_fwd_flag | trip_duration | pickup_yearmonth | pickup_month | pickup_week | pickup_day | pickup_date | pickup_hour |
---|
0 | id2875421 | 2 | 2016-03-14 17:24:55 | 2016-03-14 17:32:30 | 1 | -73.982155 | 40.767937 | -73.964630 | 40.765602 | N | 455 | 201603 | 3 | 0 | 14 | 2016-03-14 | 17 |
---|
1 | id2377394 | 1 | 2016-06-12 00:43:35 | 2016-06-12 00:54:38 | 1 | -73.980415 | 40.738564 | -73.999481 | 40.731152 | N | 663 | 201606 | 6 | 6 | 12 | 2016-06-12 | 0 |
---|
2 | id3858529 | 2 | 2016-01-19 11:35:24 | 2016-01-19 12:10:48 | 1 | -73.979027 | 40.763939 | -74.005333 | 40.710087 | N | 2124 | 201601 | 1 | 1 | 19 | 2016-01-19 | 11 |
---|
3 | id3504673 | 2 | 2016-04-06 19:32:31 | 2016-04-06 19:39:40 | 1 | -74.010040 | 40.719971 | -74.012268 | 40.706718 | N | 429 | 201604 | 4 | 2 | 6 | 2016-04-06 | 19 |
---|
4 | id2181028 | 2 | 2016-03-26 13:30:55 | 2016-03-26 13:38:10 | 1 | -73.973053 | 40.793209 | -73.972923 | 40.782520 | N | 435 | 201603 | 3 | 5 | 26 | 2016-03-26 | 13 |
---|
week_trip = train.groupby(["pickup_month",'pickup_week','pickup_day','pickup_hour'])["trip_duration"].agg(["mean","count"])
week_trip = week_trip.reset_index()
week_trip.head(2)
| pickup_month | pickup_week | pickup_day | pickup_hour | mean | count |
---|
0 | 1 | 0 | 4 | 0 | 656.771186 | 118 |
---|
1 | 1 | 0 | 4 | 1 | 703.086957 | 92 |
---|
week_trip.rename(columns={'mean':'week_avg_trip_dur','count':"week_trip_times"}, inplace = True)
print(week_trip.shape)
week_trip.head(2)
(4359, 6)
| pickup_month | pickup_week | pickup_day | pickup_hour | week_avg_trip_dur | week_trip_times |
---|
0 | 1 | 0 | 4 | 0 | 656.771186 | 118 |
---|
1 | 1 | 0 | 4 | 1 | 703.086957 | 92 |
---|
x1 = train.groupby(["pickup_month",'pickup_week','pickup_day','pickup_hour'])["passenger_count"].agg(["mean"])
x1 = x1.reset_index()
week_trip["avg_passenger_count"] = x1["mean"]
week_trip.head()
| pickup_month | pickup_week | pickup_day | pickup_hour | week_avg_trip_dur | week_trip_times | avg_passenger_count |
---|
0 | 1 | 0 | 4 | 0 | 656.771186 | 118 | 1.593220 |
---|
1 | 1 | 0 | 4 | 1 | 703.086957 | 92 | 1.673913 |
---|
2 | 1 | 0 | 4 | 2 | 692.085106 | 47 | 1.574468 |
---|
3 | 1 | 0 | 4 | 3 | 738.500000 | 32 | 1.250000 |
---|
4 | 1 | 0 | 4 | 4 | 644.000000 | 50 | 1.700000 |
---|
plt.figure(figsize=(10,8))
sns.swarmplot(x="pickup_week", y="week_trip_times", hue="pickup_month", data=week_trip)
plt.show()
plt.figure(figsize=(10,5))
sns.boxplot(x="pickup_week", y="avg_passenger_count", hue="pickup_month", data=week_trip)
plt.show()
plt.figure(figsize=(16,6))
sns.swarmplot(x="pickup_hour", y="week_trip_times", data=week_trip)
plt.show()
plt.figure(figsize=(16,6))
sns.stripplot(x="pickup_hour", y="avg_passenger_count", data=week_trip)
plt.show()
plt.figure(figsize=(16,6))
y=week_trip["week_avg_trip_dur"][week_trip["week_avg_trip_dur"]<=8000]
daytripdur=sns.boxplot(x="pickup_hour", y=y, data=week_trip)
plt.show()
plt.figure(figsize=(16,6))
daytripdur=sns.boxplot(x="pickup_hour", y=y, data=week_trip)
plt.show()
from math import sin,radians,cos,asin,sqrt
def haversine(lon1, lat1, lon2, lat2):
lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
dlon = lon2 - lon1
dlat = lat2 - lat1
a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
c = 2 * asin(sqrt(a))
r = 6371
return c * r
plt.figure(figsize=(16,6))
y=week_trip["week_avg_trip_dur"][week_trip["week_avg_trip_dur"]<=8000]
daytripdur=sns.boxplot(x="pickup_hour", y=y, data=week_trip)
plt.show()