包含全部示例的代码仓库见GIthub
1 导入库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
2 导入数据
data = pd.read_csv('dataset/bike.csv')
data.head()
# output
MemberType TripDurationSec BikeID StartDay StartDateTime StartDate StartTime StartStationName StartStationID EndDay EndDateTime EndDate EndTime EndStationName EndStationID
0 Subscriber 235.0 C00198 Tuesday 11/27/2012 05:17:00 PM 11/27/2012 17:17 Market St & ML King Blvd 1325.0 Tuesday 11/27/2012 05:21:00 PM 11/27/2012 17:21 Market St & Cowart St 1311.0
1 Subscriber 101.0 C00156 Tuesday 12/11/2012 04:10:00 PM 12/11/2012 16:10 Market St & ML King Blvd 1325.0 Tuesday 12/11/2012 04:12:00 PM 12/11/2012 16:12 Market St & W 12th St 1309.0
2 Subscriber 478.0 C00092 Wednesday 11/21/2012 04:17:00 PM 11/21/2012 16:17 100 Walnut St 1319.0 Wednesday 11/21/2012 04:25:00 PM 11/21/2012 16:25 200 River St Outdoor Chattanooga 1299.0
3 Customer 2399.0 C00061 Friday 11/23/2012 12:39:00 AM 11/23/2012 00:39 215 Broad St 1303.0 Friday 11/23/2012 01:19:00 AM 11/23/2012 01:19 215 Broad St 1303.0
4 Customer 3412.0 C00030 Saturday 12/08/2012 07:49:00 PM 12/08/2012 19:49 High St & E 2nd St 1321.0 Saturday 12/08/2012 08:46:00 PM 12/08/2012 20:46 High St & E 2nd St 1321.0
异常值
data[data.StartDay != data.EndDay].dropna(how='all')
# output
MemberType TripDurationSec BikeID StartDay StartDateTime StartDate StartTime StartStationName StartStationID EndDay EndDateTime EndDate EndTime EndStationName EndStationID
266 Subscriber 2469.0 C00040 Friday 12/07/2012 11:50:00 PM 12/08/2012 23:50 Oak St & University Greenway 1335.0 Saturday 12/08/2012 12:31:00 AM 12/08/2012 00:31 Oak St & University Greenway 1335.0
433 Customer 2536.0 C00304 Saturday 12/01/2012 11:17:00 PM 12/02/2012 23:17 100 Manufacturers Rd 1297.0 Sunday 12/02/2012 12:00:00 AM 12/02/2012 00:00 100 Manufacturers Rd 1297.0
452 Subscriber 3754.0 C00107 Sunday 11/25/2012 11:00:00 PM 11/26/2012 23:00 800 E 8th St 1333.0 Monday 11/26/2012 12:03:00 AM 11/26/2012 00:03 800 E 8th St 1333.0
465 Customer 1184.0 C00116 Thursday 12/06/2012 11:40:00 PM 12/07/2012 23:40 Broad St & W 4th St 1343.0 Friday 12/07/2012 12:00:00 AM 12/07/2012 00:00 215 Broad St 1303.0
515 Subscriber 315.0 C00114 Tuesday 11/20/2012 11:57:00 PM 11/21/2012 23:57 High St & E 2nd St 1321.0 Wednesday 11/21/2012 12:02:00 AM 11/21/2012 00:02 215 Broad St 1303.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
165831 Customer 168501.0 C00107 Wednesday 07/08/2015 10:36:00 PM 07/09/2015 22:36 Market & E 4th Street 1307.0 Friday 07/10/2015 09:25:00 PM 07/11/2015 21:25 SpringHill Suites 1420.0
165920 Customer 1557.0 C00036 Sunday 02/17/2013 11:48:00 PM 02/18/2013 23:48 200 River St Outdoor Chattanooga 1299.0 Monday 02/18/2013 12:14:00 AM 02/18/2013 00:14 800 E 8th St 1333.0
166050 Customer 18649.0 C00287 Tuesday 07/22/2014 09:03:00 PM 07/23/2014 21:03 100 Walnut St 1319.0 Wednesday 07/23/2014 02:14:00 AM 07/23/2014 02:14 Broad St & W 4th St 1343.0
166157 Customer 2359.0 C00115 Tuesday 04/21/2015 11:44:00 PM 04/22/2015 23:44 Market St & ML King Blvd 1325.0 Wednesday 04/22/2015 12:23:00 AM 04/22/2015 00:23 100 Manufacturers Rd 1297.0
166286 Subscriber 3067.0 C00205 Friday 05/31/2013 11:14:00 PM 06/01/2013 23:14 E Main St & Rossville Ave 1341.0 Saturday 06/01/2013 12:05:00 AM 06/01/2013 00:05 Broad St & W 10th St 1363.0
删除缺失值
data.dropna(how='all', inplace=True)
data.info()
# output
<class 'pandas.core.frame.DataFrame'>
Int64Index: 163914 entries, 0 to 166308
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MemberType 163914 non-null object
1 TripDurationSec 163914 non-null float64
2 BikeID 163914 non-null object
3 StartDay 163914 non-null object
4 StartDateTime 163914 non-null object
5 StartDate 163914 non-null object
6 StartTime 163914 non-null object
7 StartStationName 163914 non-null object
8 StartStationID 163914 non-null float64
9 EndDay 163912 non-null object
10 EndDateTime 163912 non-null object
11 EndDate 163912 non-null object
12 EndTime 163912 non-null object
13 EndStationName 163912 non-null object
14 EndStationID 163912 non-null float64
dtypes: float64(3), object(12)
memory usage: 20.0+ MB
data.head()
# output
MemberType TripDurationSec BikeID StartDay StartDateTime StartDate StartTime StartStationName StartStationID EndDay EndDateTime EndDate EndTime EndStationName EndStationID
0 Subscriber 235.0 C00198 Tuesday 11/27/2012 05:17:00 PM 11/27/2012 17:17 Market St & ML King Blvd 1325.0 Tuesday 11/27/2012 05:21:00 PM 11/27/2012 17:21 Market St & Cowart St 1311.0
1 Subscriber 101.0 C00156 Tuesday 12/11/2012 04:10:00 PM 12/11/2012 16:10 Market St & ML King Blvd 1325.0 Tuesday 12/11/2012 04:12:00 PM 12/11/2012 16:12 Market St & W 12th St 1309.0
2 Subscriber 478.0 C00092 Wednesday 11/21/2012 04:17:00 PM 11/21/2012 16:17 100 Walnut St 1319.0 Wednesday 11/21/2012 04:25:00 PM 11/21/2012 16:25 200 River St Outdoor Chattanooga 1299.0
3 Customer 2399.0 C00061 Friday 11/23/2012 12:39:00 AM 11/23/2012 00:39 215 Broad St 1303.0 Friday 11/23/2012 01:19:00 AM 11/23/2012 01:19 215 Broad St 1303.0
4 Customer 3412.0 C00030 Saturday 12/08/2012 07:49:00 PM 12/08/2012 19:49 High St & E 2nd St 1321.0 Saturday 12/08/2012 08:46:00 PM 12/08/2012 20:46 High St & E 2nd St 1321.0
data.MemberType.unique()
# output
array(['Subscriber', 'Customer', 'Dependent'], dtype=object)
data.MemberType.value_counts()
# output
Customer 113229
Subscriber 50629
Dependent 56
Name: MemberType, dtype: int64
删除企业用户
data = data[data.MemberType != 'Dependent']
data.MemberType.unique()
# output
array(['Subscriber', 'Customer'], dtype=object)
3 骑行时间分析
data.groupby('MemberType')['TripDurationSec'].mean()
# output
MemberType
Customer 17816.349645
Subscriber 1325.880484
Name: TripDurationSec, dtype: float64
data.TripDurationSec.mean()
# output
12721.108868654566
data.TripDurationSec.max()
# output
1351369984.0
删除最大值,可能是异常值
data = data[data.TripDurationSec != data.TripDurationSec.max()]
data.TripDurationSec.max()
# output
56988700.0
data = data[data.TripDurationSec < 3*3600]
data.groupby('MemberType')['TripDurationSec'].mean()
# output
MemberType
Customer 2163.449926
Subscriber 870.746617
Name: TripDurationSec, dtype: float64
data.groupby('MemberType')['TripDurationSec'].mean().plot.bar(rot=0) # rot=0 x轴文字旋转0度
4 骑行开始日期分析
data.StartDay.value_counts().plot.bar(rot=20)
data.groupby(['StartDay', 'MemberType']).size().plot.bar(rot=30)
# size 对nan值也会计数,count 不会对nan值计数
data.groupby(['StartDay', 'MemberType']).size().unstack()
# output
MemberType Customer Subscriber
StartDay
Friday 13744 8165
Monday 11850 6930
Saturday 33328 6485
Sunday 25850 5409
Thursday 8779 7827
Tuesday 9193 7615
Wednesday 8312 7975
data.groupby(['StartDay', 'MemberType']).size().unstack().plot.bar(rot=30)
5 每周中骑行时间最长3次骑行记录
data.head()
# output
MemberType TripDurationSec BikeID StartDay StartDateTime StartDate StartTime StartStationName StartStationID EndDay EndDateTime EndDate EndTime EndStationName EndStationID
0 Subscriber 235.0 C00198 Tuesday 11/27/2012 05:17:00 PM 11/27/2012 17:17 Market St & ML King Blvd 1325.0 Tuesday 11/27/2012 05:21:00 PM 11/27/2012 17:21 Market St & Cowart St 1311.0
1 Subscriber 101.0 C00156 Tuesday 12/11/2012 04:10:00 PM 12/11/2012 16:10 Market St & ML King Blvd 1325.0 Tuesday 12/11/2012 04:12:00 PM 12/11/2012 16:12 Market St & W 12th St 1309.0
2 Subscriber 478.0 C00092 Wednesday 11/21/2012 04:17:00 PM 11/21/2012 16:17 100 Walnut St 1319.0 Wednesday 11/21/2012 04:25:00 PM 11/21/2012 16:25 200 River St Outdoor Chattanooga 1299.0
3 Customer 2399.0 C00061 Friday 11/23/2012 12:39:00 AM 11/23/2012 00:39 215 Broad St 1303.0 Friday 11/23/2012 01:19:00 AM 11/23/2012 01:19 215 Broad St 1303.0
4 Customer 3412.0 C00030 Saturday 12/08/2012 07:49:00 PM 12/08/2012 19:49 High St & E 2nd St 1321.0 Saturday 12/08/2012 08:46:00 PM 12/08/2012 20:46 High St & E 2nd St 1321.0
# apply 对每一个分组应用
data.groupby('StartDay').apply(lambda x:x.sort_values('TripDurationSec', ascending=False)[:3])
# output
MemberType TripDurationSec BikeID StartDay StartDateTime StartDate StartTime StartStationName StartStationID EndDay EndDateTime EndDate EndTime EndStationName EndStationID
StartDay
Friday 31135 Customer 10799.0 C00273 Friday 08/30/2013 06:57:00 PM 08/30/2013 18:57 Broad St & W 4th St 1343.0 Friday 08/30/2013 09:57:00 PM 08/31/2013 21:57 Broad St & W 6th St 1317.0
128836 Customer 10789.0 C00285 Friday 08/30/2013 06:57:00 PM 08/30/2013 18:57 Broad St & W 4th St 1343.0 Friday 08/30/2013 09:56:00 PM 08/31/2013 21:56 Broad St & W 6th St 1317.0
128622 Customer 10773.0 C00022 Friday 11/06/2015 03:59:00 PM 11/06/2015 15:59 Carter St & W 12th St 1315.0 Friday 11/06/2015 06:58:00 PM 11/06/2015 18:58 Carter St & W 12th St 1315.0
Monday 31307 Customer 10776.0 C00082 Monday 09/01/2014 05:01:00 PM 09/01/2014 17:01 200 River St Outdoor Chattanooga 1299.0 Monday 09/01/2014 08:01:00 PM 09/01/2014 20:01 High St & E 2nd St 1321.0
69086 Customer 10769.0 C00237 Monday 02/03/2014 08:40:00 AM 02/03/2014 08:40 Carter St & W 12th St 1315.0 Monday 02/03/2014 11:40:00 AM 02/03/2014 11:40 Carter St & W 12th St 1315.0
106891 Customer 10761.0 C00276 Monday 11/19/2012 03:19:00 PM 11/19/2012 15:19 Power Alley & Riverfront Pkwy 1323.0 Monday 11/19/2012 06:18:00 PM 11/19/2012 18:18 Power Alley & Riverfront Pkwy 1323.0
Saturday 18496 Subscriber 10798.0 C00168 Saturday 08/01/2015 12:56:00 PM 08/01/2015 12:56 100 Walnut St 1319.0 Saturday 08/01/2015 03:56:00 PM 08/01/2015 15:56 High St & E 2nd St 1321.0
99145 Customer 10784.0 C00266 Saturday 03/16/2013 03:18:00 PM 03/16/2013 15:18 Power Alley & Riverfront Pkwy 1323.0 Saturday 03/16/2013 06:17:00 PM 03/16/2013 18:17 Chestnut St & Riverfront Pkwy 1313.0
69506 Customer 10764.0 C00034 Saturday 06/07/2014 12:12:00 PM 06/07/2014 12:12 Broad St & W 8th St 1345.0 Saturday 06/07/2014 03:12:00 PM 06/07/2014 15:12 Broad St & W 8th St 1345.0
Sunday 130253 Customer 10799.0 C00109 Sunday 06/14/2015 03:38:00 PM 06/14/2015 15:38 100 Walnut St 1319.0 Sunday 06/14/2015 06:38:00 PM 06/14/2015 18:38 100 Walnut St 1319.0
95625 Customer 10798.0 C00031 Sunday 04/13/2014 12:55:00 PM 04/13/2014 12:55 215 Broad St 1303.0 Sunday 04/13/2014 03:55:00 PM 04/13/2014 15:55 215 Broad St 1303.0
106953 Customer 10793.0 C00065 Sunday 10/05/2014 04:07:00 PM 10/05/2014 16:07 200 River St Outdoor Chattanooga 1299.0 Sunday 10/05/2014 07:07:00 PM 10/05/2014 19:07 200 River St Outdoor Chattanooga 1299.0
Thursday 148040 Customer 10775.0 C00177 Thursday 06/27/2013 01:29:00 PM 06/27/2013 13:29 215 Broad St 1303.0 Thursday 06/27/2013 04:28:00 PM 06/27/2013 16:28 215 Broad St 1303.0
97762 Customer 10724.0 C00156 Thursday 10/16/2014 02:00:00 PM 10/16/2014 14:00 Market St & W 12th St 1309.0 Thursday 10/16/2014 04:59:00 PM 10/16/2014 16:59 Market & E 4th Street 1307.0
162425 Customer 10707.0 C00179 Thursday 10/16/2014 02:01:00 PM 10/16/2014 14:01 Market St & W 12th St 1309.0 Thursday 10/16/2014 04:59:00 PM 10/16/2014 16:59 Market & E 4th Street 1307.0
Tuesday 137688 Customer 10797.0 C00256 Tuesday 06/23/2015 11:38:00 AM 06/23/2015 11:38 Broad St & W 6th St 1317.0 Tuesday 06/23/2015 02:38:00 PM 06/23/2015 14:38 Broad St & W 6th St 1317.0
33892 Customer 10784.0 C00147 Tuesday 06/23/2015 11:38:00 AM 06/23/2015 11:38 Broad St & W 6th St 1317.0 Tuesday 06/23/2015 02:38:00 PM 06/23/2015 14:38 Broad St & W 6th St 1317.0
165460 Customer 10780.0 C00196 Tuesday 09/30/2014 10:28:00 AM 09/30/2014 10:28 Carter St & W 12th St 1315.0 Tuesday 09/30/2014 01:28:00 PM 09/30/2014 13:28 Carter St & W 12th St 1315.0
Wednesday 111292 Customer 10735.0 C00076 Wednesday 09/03/2014 02:56:00 PM 09/03/2014 14:56 Power Alley & Riverfront Pkwy 1323.0 Wednesday 09/03/2014 05:55:00 PM 09/03/2014 17:55 ML King Blvd & Lindsay St 1327.0
22621 Customer 10725.0 C00053 Wednesday 06/17/2015 04:20:00 PM 06/17/2015 16:20 Market & E 4th Street 1307.0 Wednesday 06/17/2015 07:19:00 PM 06/17/2015 19:19 Market & E 4th Street 1307.0
14618 Customer 10708.0 C00004 Wednesday 09/03/2014 02:56:00 PM 09/03/2014 14:56 Power Alley & Riverfront Pkwy 1323.0 Wednesday 09/03/2014 05:55:00 PM 09/03/2014 17:55 ML King Blvd & Lindsay St 1327.0
6 透视表 改变表的行和列
data.head()
# output
MemberType TripDurationSec BikeID StartDay StartDateTime StartDate StartTime StartStationName StartStationID EndDay EndDateTime EndDate EndTime EndStationName EndStationID
0 Subscriber 235.0 C00198 Tuesday 11/27/2012 05:17:00 PM 11/27/2012 17:17 Market St & ML King Blvd 1325.0 Tuesday 11/27/2012 05:21:00 PM 11/27/2012 17:21 Market St & Cowart St 1311.0
1 Subscriber 101.0 C00156 Tuesday 12/11/2012 04:10:00 PM 12/11/2012 16:10 Market St & ML King Blvd 1325.0 Tuesday 12/11/2012 04:12:00 PM 12/11/2012 16:12 Market St & W 12th St 1309.0
2 Subscriber 478.0 C00092 Wednesday 11/21/2012 04:17:00 PM 11/21/2012 16:17 100 Walnut St 1319.0 Wednesday 11/21/2012 04:25:00 PM 11/21/2012 16:25 200 River St Outdoor Chattanooga 1299.0
3 Customer 2399.0 C00061 Friday 11/23/2012 12:39:00 AM 11/23/2012 00:39 215 Broad St 1303.0 Friday 11/23/2012 01:19:00 AM 11/23/2012 01:19 215 Broad St 1303.0
4 Customer 3412.0 C00030 Saturday 12/08/2012 07:49:00 PM 12/08/2012 19:49 High St & E 2nd St 1321.0 Saturday 12/08/2012 08:46:00 PM 12/08/2012 20:46 High St & E 2nd St 1321.0
data.pivot_table(index='MemberType', columns='StartDay', values='TripDurationSec',aggfunc=np.mean)
# output
StartDay Friday Monday Saturday Sunday Thursday Tuesday Wednesday
MemberType
Customer 2063.681970 2155.297637 2164.195151 2272.959807 2082.032464 2128.341238 2121.301973
Subscriber 826.752113 815.751659 1022.755127 1131.069144 803.214897 816.553381 781.432853
groupby
可以达到同样效果
data.groupby(['StartDay', 'MemberType'])['TripDurationSec'].mean().unstack().T
# output
StartDay Friday Monday Saturday Sunday Thursday Tuesday Wednesday
MemberType
Customer 2063.681970 2155.297637 2164.195151 2272.959807 2082.032464 2128.341238 2121.301973
Subscriber 826.752113 815.751659 1022.755127 1131.069144 803.214897 816.553381 781.432853
7 交叉表
pd.crosstab(index=data.MemberType, columns=data.StartDay) # 顶级方法,返回个数
# output
StartDay Friday Monday Saturday Sunday Thursday Tuesday Wednesday
MemberType
Customer 13744 11850 33328 25850 8779 9193 8312
Subscriber 8165 6930 6485 5409 7827 7615 7975
pd.crosstab(index=data.MemberType, columns=data.StartDay).T.plot.bar(rot=30)