【Pandas】3.2.共享单车骑行数据分析

包含全部示例的代码仓库见GIthub

1 导入库

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

2 导入数据

data = pd.read_csv('dataset/bike.csv')
data.head()
# output
MemberType	TripDurationSec	BikeID	StartDay	StartDateTime	StartDate	StartTime	StartStationName	StartStationID	EndDay	EndDateTime	EndDate	EndTime	EndStationName	EndStationID
0	Subscriber	235.0	C00198	Tuesday	11/27/2012 05:17:00 PM	11/27/2012	17:17	Market St & ML King Blvd	1325.0	Tuesday	11/27/2012 05:21:00 PM	11/27/2012	17:21	Market St & Cowart St	1311.0
1	Subscriber	101.0	C00156	Tuesday	12/11/2012 04:10:00 PM	12/11/2012	16:10	Market St & ML King Blvd	1325.0	Tuesday	12/11/2012 04:12:00 PM	12/11/2012	16:12	Market St & W 12th St	1309.0
2	Subscriber	478.0	C00092	Wednesday	11/21/2012 04:17:00 PM	11/21/2012	16:17	100 Walnut St	1319.0	Wednesday	11/21/2012 04:25:00 PM	11/21/2012	16:25	200 River St Outdoor Chattanooga	1299.0
3	Customer	2399.0	C00061	Friday	11/23/2012 12:39:00 AM	11/23/2012	00:39	215 Broad St	1303.0	Friday	11/23/2012 01:19:00 AM	11/23/2012	01:19	215 Broad St	1303.0
4	Customer	3412.0	C00030	Saturday	12/08/2012 07:49:00 PM	12/08/2012	19:49	High St & E 2nd St	1321.0	Saturday	12/08/2012 08:46:00 PM	12/08/2012	20:46	High St & E 2nd St	1321.0

异常值

data[data.StartDay != data.EndDay].dropna(how='all')
# output
	MemberType	TripDurationSec	BikeID	StartDay	StartDateTime	StartDate	StartTime	StartStationName	StartStationID	EndDay	EndDateTime	EndDate	EndTime	EndStationName	EndStationID
266	Subscriber	2469.0	C00040	Friday	12/07/2012 11:50:00 PM	12/08/2012	23:50	Oak St & University Greenway	1335.0	Saturday	12/08/2012 12:31:00 AM	12/08/2012	00:31	Oak St & University Greenway	1335.0
433	Customer	2536.0	C00304	Saturday	12/01/2012 11:17:00 PM	12/02/2012	23:17	100 Manufacturers Rd	1297.0	Sunday	12/02/2012 12:00:00 AM	12/02/2012	00:00	100 Manufacturers Rd	1297.0
452	Subscriber	3754.0	C00107	Sunday	11/25/2012 11:00:00 PM	11/26/2012	23:00	800 E 8th St	1333.0	Monday	11/26/2012 12:03:00 AM	11/26/2012	00:03	800 E 8th St	1333.0
465	Customer	1184.0	C00116	Thursday	12/06/2012 11:40:00 PM	12/07/2012	23:40	Broad St & W 4th St	1343.0	Friday	12/07/2012 12:00:00 AM	12/07/2012	00:00	215 Broad St	1303.0
515	Subscriber	315.0	C00114	Tuesday	11/20/2012 11:57:00 PM	11/21/2012	23:57	High St & E 2nd St	1321.0	Wednesday	11/21/2012 12:02:00 AM	11/21/2012	00:02	215 Broad St	1303.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
165831	Customer	168501.0	C00107	Wednesday	07/08/2015 10:36:00 PM	07/09/2015	22:36	Market & E 4th Street	1307.0	Friday	07/10/2015 09:25:00 PM	07/11/2015	21:25	SpringHill Suites	1420.0
165920	Customer	1557.0	C00036	Sunday	02/17/2013 11:48:00 PM	02/18/2013	23:48	200 River St Outdoor Chattanooga	1299.0	Monday	02/18/2013 12:14:00 AM	02/18/2013	00:14	800 E 8th St	1333.0
166050	Customer	18649.0	C00287	Tuesday	07/22/2014 09:03:00 PM	07/23/2014	21:03	100 Walnut St	1319.0	Wednesday	07/23/2014 02:14:00 AM	07/23/2014	02:14	Broad St & W 4th St	1343.0
166157	Customer	2359.0	C00115	Tuesday	04/21/2015 11:44:00 PM	04/22/2015	23:44	Market St & ML King Blvd	1325.0	Wednesday	04/22/2015 12:23:00 AM	04/22/2015	00:23	100 Manufacturers Rd	1297.0
166286	Subscriber	3067.0	C00205	Friday	05/31/2013 11:14:00 PM	06/01/2013	23:14	E Main St & Rossville Ave	1341.0	Saturday	06/01/2013 12:05:00 AM	06/01/2013	00:05	Broad St & W 10th St	1363.0

删除缺失值

data.dropna(how='all', inplace=True)
data.info()
# output
<class 'pandas.core.frame.DataFrame'>
Int64Index: 163914 entries, 0 to 166308
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   MemberType        163914 non-null  object 
 1   TripDurationSec   163914 non-null  float64
 2   BikeID            163914 non-null  object 
 3   StartDay          163914 non-null  object 
 4   StartDateTime     163914 non-null  object 
 5   StartDate         163914 non-null  object 
 6   StartTime         163914 non-null  object 
 7   StartStationName  163914 non-null  object 
 8   StartStationID    163914 non-null  float64
 9   EndDay            163912 non-null  object 
 10  EndDateTime       163912 non-null  object 
 11  EndDate           163912 non-null  object 
 12  EndTime           163912 non-null  object 
 13  EndStationName    163912 non-null  object 
 14  EndStationID      163912 non-null  float64
dtypes: float64(3), object(12)
memory usage: 20.0+ MB
data.head()
# output
	MemberType	TripDurationSec	BikeID	StartDay	StartDateTime	StartDate	StartTime	StartStationName	StartStationID	EndDay	EndDateTime	EndDate	EndTime	EndStationName	EndStationID
0	Subscriber	235.0	C00198	Tuesday	11/27/2012 05:17:00 PM	11/27/2012	17:17	Market St & ML King Blvd	1325.0	Tuesday	11/27/2012 05:21:00 PM	11/27/2012	17:21	Market St & Cowart St	1311.0
1	Subscriber	101.0	C00156	Tuesday	12/11/2012 04:10:00 PM	12/11/2012	16:10	Market St & ML King Blvd	1325.0	Tuesday	12/11/2012 04:12:00 PM	12/11/2012	16:12	Market St & W 12th St	1309.0
2	Subscriber	478.0	C00092	Wednesday	11/21/2012 04:17:00 PM	11/21/2012	16:17	100 Walnut St	1319.0	Wednesday	11/21/2012 04:25:00 PM	11/21/2012	16:25	200 River St Outdoor Chattanooga	1299.0
3	Customer	2399.0	C00061	Friday	11/23/2012 12:39:00 AM	11/23/2012	00:39	215 Broad St	1303.0	Friday	11/23/2012 01:19:00 AM	11/23/2012	01:19	215 Broad St	1303.0
4	Customer	3412.0	C00030	Saturday	12/08/2012 07:49:00 PM	12/08/2012	19:49	High St & E 2nd St	1321.0	Saturday	12/08/2012 08:46:00 PM	12/08/2012	20:46	High St & E 2nd St	1321.0
data.MemberType.unique()
# output
array(['Subscriber', 'Customer', 'Dependent'], dtype=object)

data.MemberType.value_counts()
# output
Customer      113229
Subscriber     50629
Dependent         56
Name: MemberType, dtype: int64

删除企业用户

data = data[data.MemberType != 'Dependent']
data.MemberType.unique()
# output
array(['Subscriber', 'Customer'], dtype=object)

3 骑行时间分析

data.groupby('MemberType')['TripDurationSec'].mean()
# output
MemberType
Customer      17816.349645
Subscriber     1325.880484
Name: TripDurationSec, dtype: float64

data.TripDurationSec.mean()
# output
12721.108868654566

data.TripDurationSec.max()
# output
1351369984.0

删除最大值,可能是异常值

data = data[data.TripDurationSec != data.TripDurationSec.max()]
data.TripDurationSec.max()
# output
56988700.0

data = data[data.TripDurationSec < 3*3600]
data.groupby('MemberType')['TripDurationSec'].mean()
# output
MemberType
Customer      2163.449926
Subscriber     870.746617
Name: TripDurationSec, dtype: float64

data.groupby('MemberType')['TripDurationSec'].mean().plot.bar(rot=0)  # rot=0 x轴文字旋转0度

在这里插入图片描述

4 骑行开始日期分析

data.StartDay.value_counts().plot.bar(rot=20)

在这里插入图片描述

data.groupby(['StartDay', 'MemberType']).size().plot.bar(rot=30)
# size 对nan值也会计数,count 不会对nan值计数

在这里插入图片描述

data.groupby(['StartDay', 'MemberType']).size().unstack()
# output
MemberType	Customer	Subscriber
StartDay		
Friday	13744	8165
Monday	11850	6930
Saturday	33328	6485
Sunday	25850	5409
Thursday	8779	7827
Tuesday	9193	7615
Wednesday	8312	7975
data.groupby(['StartDay', 'MemberType']).size().unstack().plot.bar(rot=30)

在这里插入图片描述

5 每周中骑行时间最长3次骑行记录

data.head()
# output
MemberType	TripDurationSec	BikeID	StartDay	StartDateTime	StartDate	StartTime	StartStationName	StartStationID	EndDay	EndDateTime	EndDate	EndTime	EndStationName	EndStationID
0	Subscriber	235.0	C00198	Tuesday	11/27/2012 05:17:00 PM	11/27/2012	17:17	Market St & ML King Blvd	1325.0	Tuesday	11/27/2012 05:21:00 PM	11/27/2012	17:21	Market St & Cowart St	1311.0
1	Subscriber	101.0	C00156	Tuesday	12/11/2012 04:10:00 PM	12/11/2012	16:10	Market St & ML King Blvd	1325.0	Tuesday	12/11/2012 04:12:00 PM	12/11/2012	16:12	Market St & W 12th St	1309.0
2	Subscriber	478.0	C00092	Wednesday	11/21/2012 04:17:00 PM	11/21/2012	16:17	100 Walnut St	1319.0	Wednesday	11/21/2012 04:25:00 PM	11/21/2012	16:25	200 River St Outdoor Chattanooga	1299.0
3	Customer	2399.0	C00061	Friday	11/23/2012 12:39:00 AM	11/23/2012	00:39	215 Broad St	1303.0	Friday	11/23/2012 01:19:00 AM	11/23/2012	01:19	215 Broad St	1303.0
4	Customer	3412.0	C00030	Saturday	12/08/2012 07:49:00 PM	12/08/2012	19:49	High St & E 2nd St	1321.0	Saturday	12/08/2012 08:46:00 PM	12/08/2012	20:46	High St & E 2nd St	1321.0
# apply 对每一个分组应用
data.groupby('StartDay').apply(lambda x:x.sort_values('TripDurationSec', ascending=False)[:3])
# output
MemberType	TripDurationSec	BikeID	StartDay	StartDateTime	StartDate	StartTime	StartStationName	StartStationID	EndDay	EndDateTime	EndDate	EndTime	EndStationName	EndStationID
StartDay																
Friday	31135	Customer	10799.0	C00273	Friday	08/30/2013 06:57:00 PM	08/30/2013	18:57	Broad St & W 4th St	1343.0	Friday	08/30/2013 09:57:00 PM	08/31/2013	21:57	Broad St & W 6th St	1317.0
        128836	Customer	10789.0	C00285	Friday	08/30/2013 06:57:00 PM	08/30/2013	18:57	Broad St & W 4th St	1343.0	Friday	08/30/2013 09:56:00 PM	08/31/2013	21:56	Broad St & W 6th St	1317.0
        128622	Customer	10773.0	C00022	Friday	11/06/2015 03:59:00 PM	11/06/2015	15:59	Carter St & W 12th St	1315.0	Friday	11/06/2015 06:58:00 PM	11/06/2015	18:58	Carter St & W 12th St	1315.0
Monday	31307	Customer	10776.0	C00082	Monday	09/01/2014 05:01:00 PM	09/01/2014	17:01	200 River St Outdoor Chattanooga	1299.0	Monday	09/01/2014 08:01:00 PM	09/01/2014	20:01	High St & E 2nd St	1321.0
        69086	Customer	10769.0	C00237	Monday	02/03/2014 08:40:00 AM	02/03/2014	08:40	Carter St & W 12th St	1315.0	Monday	02/03/2014 11:40:00 AM	02/03/2014	11:40	Carter St & W 12th St	1315.0
        106891	Customer	10761.0	C00276	Monday	11/19/2012 03:19:00 PM	11/19/2012	15:19	Power Alley & Riverfront Pkwy	1323.0	Monday	11/19/2012 06:18:00 PM	11/19/2012	18:18	Power Alley & Riverfront Pkwy	1323.0
Saturday	18496	Subscriber	10798.0	C00168	Saturday	08/01/2015 12:56:00 PM	08/01/2015	12:56	100 Walnut St	1319.0	Saturday	08/01/2015 03:56:00 PM	08/01/2015	15:56	High St & E 2nd St	1321.0
        99145	Customer	10784.0	C00266	Saturday	03/16/2013 03:18:00 PM	03/16/2013	15:18	Power Alley & Riverfront Pkwy	1323.0	Saturday	03/16/2013 06:17:00 PM	03/16/2013	18:17	Chestnut St & Riverfront Pkwy	1313.0
        69506	Customer	10764.0	C00034	Saturday	06/07/2014 12:12:00 PM	06/07/2014	12:12	Broad St & W 8th St	1345.0	Saturday	06/07/2014 03:12:00 PM	06/07/2014	15:12	Broad St & W 8th St	1345.0
Sunday	130253	Customer	10799.0	C00109	Sunday	06/14/2015 03:38:00 PM	06/14/2015	15:38	100 Walnut St	1319.0	Sunday	06/14/2015 06:38:00 PM	06/14/2015	18:38	100 Walnut St	1319.0
        95625	Customer	10798.0	C00031	Sunday	04/13/2014 12:55:00 PM	04/13/2014	12:55	215 Broad St	1303.0	Sunday	04/13/2014 03:55:00 PM	04/13/2014	15:55	215 Broad St	1303.0
       106953	Customer	10793.0	C00065	Sunday	10/05/2014 04:07:00 PM	10/05/2014	16:07	200 River St Outdoor Chattanooga	1299.0	Sunday	10/05/2014 07:07:00 PM	10/05/2014	19:07	200 River St Outdoor Chattanooga	1299.0
Thursday	148040	Customer	10775.0	C00177	Thursday	06/27/2013 01:29:00 PM	06/27/2013	13:29	215 Broad St	1303.0	Thursday	06/27/2013 04:28:00 PM	06/27/2013	16:28	215 Broad St	1303.0
       97762	Customer	10724.0	C00156	Thursday	10/16/2014 02:00:00 PM	10/16/2014	14:00	Market St & W 12th St	1309.0	Thursday	10/16/2014 04:59:00 PM	10/16/2014	16:59	Market & E 4th Street	1307.0
       162425	Customer	10707.0	C00179	Thursday	10/16/2014 02:01:00 PM	10/16/2014	14:01	Market St & W 12th St	1309.0	Thursday	10/16/2014 04:59:00 PM	10/16/2014	16:59	Market & E 4th Street	1307.0
Tuesday	137688	Customer	10797.0	C00256	Tuesday	06/23/2015 11:38:00 AM	06/23/2015	11:38	Broad St & W 6th St	1317.0	Tuesday	06/23/2015 02:38:00 PM	06/23/2015	14:38	Broad St & W 6th St	1317.0
        33892	Customer	10784.0	C00147	Tuesday	06/23/2015 11:38:00 AM	06/23/2015	11:38	Broad St & W 6th St	1317.0	Tuesday	06/23/2015 02:38:00 PM	06/23/2015	14:38	Broad St & W 6th St	1317.0
        165460	Customer	10780.0	C00196	Tuesday	09/30/2014 10:28:00 AM	09/30/2014	10:28	Carter St & W 12th St	1315.0	Tuesday	09/30/2014 01:28:00 PM	09/30/2014	13:28	Carter St & W 12th St	1315.0
Wednesday	111292	Customer	10735.0	C00076	Wednesday	09/03/2014 02:56:00 PM	09/03/2014	14:56	Power Alley & Riverfront Pkwy	1323.0	Wednesday	09/03/2014 05:55:00 PM	09/03/2014	17:55	ML King Blvd & Lindsay St	1327.0
            22621	Customer	10725.0	C00053	Wednesday	06/17/2015 04:20:00 PM	06/17/2015	16:20	Market & E 4th Street	1307.0	Wednesday	06/17/2015 07:19:00 PM	06/17/2015	19:19	Market & E 4th Street	1307.0
            14618	Customer	10708.0	C00004	Wednesday	09/03/2014 02:56:00 PM	09/03/2014	14:56	Power Alley & Riverfront Pkwy	1323.0	Wednesday	09/03/2014 05:55:00 PM	09/03/2014	17:55	ML King Blvd & Lindsay St	1327.0

6 透视表 改变表的行和列

data.head()
# output
MemberType	TripDurationSec	BikeID	StartDay	StartDateTime	StartDate	StartTime	StartStationName	StartStationID	EndDay	EndDateTime	EndDate	EndTime	EndStationName	EndStationID
0	Subscriber	235.0	C00198	Tuesday	11/27/2012 05:17:00 PM	11/27/2012	17:17	Market St & ML King Blvd	1325.0	Tuesday	11/27/2012 05:21:00 PM	11/27/2012	17:21	Market St & Cowart St	1311.0
1	Subscriber	101.0	C00156	Tuesday	12/11/2012 04:10:00 PM	12/11/2012	16:10	Market St & ML King Blvd	1325.0	Tuesday	12/11/2012 04:12:00 PM	12/11/2012	16:12	Market St & W 12th St	1309.0
2	Subscriber	478.0	C00092	Wednesday	11/21/2012 04:17:00 PM	11/21/2012	16:17	100 Walnut St	1319.0	Wednesday	11/21/2012 04:25:00 PM	11/21/2012	16:25	200 River St Outdoor Chattanooga	1299.0
3	Customer	2399.0	C00061	Friday	11/23/2012 12:39:00 AM	11/23/2012	00:39	215 Broad St	1303.0	Friday	11/23/2012 01:19:00 AM	11/23/2012	01:19	215 Broad St	1303.0
4	Customer	3412.0	C00030	Saturday	12/08/2012 07:49:00 PM	12/08/2012	19:49	High St & E 2nd St	1321.0	Saturday	12/08/2012 08:46:00 PM	12/08/2012	20:46	High St & E 2nd St	1321.0
data.pivot_table(index='MemberType', columns='StartDay', values='TripDurationSec',aggfunc=np.mean)
# output
StartDay	Friday	Monday	Saturday	Sunday	Thursday	Tuesday	Wednesday
MemberType							
Customer	2063.681970	2155.297637	2164.195151	2272.959807	2082.032464	2128.341238	2121.301973
Subscriber	826.752113	815.751659	1022.755127	1131.069144	803.214897	816.553381	781.432853

groupby可以达到同样效果

data.groupby(['StartDay', 'MemberType'])['TripDurationSec'].mean().unstack().T
# output
StartDay	Friday	Monday	Saturday	Sunday	Thursday	Tuesday	Wednesday
MemberType							
Customer	2063.681970	2155.297637	2164.195151	2272.959807	2082.032464	2128.341238	2121.301973
Subscriber	826.752113	815.751659	1022.755127	1131.069144	803.214897	816.553381	781.432853

7 交叉表

pd.crosstab(index=data.MemberType, columns=data.StartDay) # 顶级方法,返回个数
# output
StartDay	Friday	Monday	Saturday	Sunday	Thursday	Tuesday	Wednesday
MemberType							
Customer	13744	11850	33328	25850	8779	9193	8312
Subscriber	8165	6930	6485	5409	7827	7615	7975

pd.crosstab(index=data.MemberType, columns=data.StartDay).T.plot.bar(rot=30)

在这里插入图片描述

  • 14
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

LouHerGetUp

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值