【问题1】pandas时间序列01
现在我们有2015到2017年25万条911的紧急电话的数据(注:下方 df.info() 的输出显示实际读取的 911.csv 共 639898 条记录,按月统计的输出覆盖 2015-12 至 2020-05,与题目描述的数据规模不完全一致)
(1)请统计出这些数据中“不同类型的紧急情况的次数”
import pandas as pd
import numpy as np
# Load the 911 call records and take a first look at their contents.
df = pd.read_csv('./code2/911.csv')

print('\n【df.head()】')
print(df.head())

print('\n【df.info()】')
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit an extra "None" line after the summary.
df.info()
【df.head()】
lat lng desc \
0 40.297876 -75.581294 REINDEER CT & DEAD END; NEW HANOVER; Station ...
1 40.258061 -75.264680 BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...
2 40.121182 -75.351975 HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...
3 40.116153 -75.343513 AIRY ST & SWEDE ST; NORRISTOWN; Station 308A;...
4 40.251492 -75.603350 CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; S...
zip title timeStamp twp \
0 19525.0 EMS: BACK PAINS/INJURY 2015-12-10 17:10:52 NEW HANOVER
1 19446.0 EMS: DIABETIC EMERGENCY 2015-12-10 17:29:21 HATFIELD TOWNSHIP
2 19401.0 Fire: GAS-ODOR/LEAK 2015-12-10 14:39:21 NORRISTOWN
3 19401.0 EMS: CARDIAC EMERGENCY 2015-12-10 16:47:36 NORRISTOWN
4 NaN EMS: DIZZINESS 2015-12-10 16:56:52 LOWER POTTSGROVE
addr e
0 REINDEER CT & DEAD END 1
1 BRIAR PATH & WHITEMARSH LN 1
2 HAWS AVE 1
3 AIRY ST & SWEDE ST 1
4 CHERRYWOOD CT & DEAD END 1
【df.info()】
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639898 entries, 0 to 639897
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 lat 639898 non-null float64
1 lng 639898 non-null float64
2 desc 639898 non-null object
3 zip 562221 non-null float64
4 title 639898 non-null object
5 timeStamp 639898 non-null object
6 twp 639619 non-null object
7 addr 639898 non-null object
8 e 639898 non-null int64
dtypes: float64(3), int64(1), object(5)
memory usage: 43.9+ MB
None
import pandas as pd
import numpy as np
def split_category(titles):
    """Return the category part of each call title.

    Titles look like ``'EMS: BACK PAINS/INJURY'``; the category is the text
    in front of the first ``': '`` separator.

    Args:
        titles: pandas Series of title strings.

    Returns:
        A Series with the same index as ``titles`` holding the category of
        each title.
    """
    return titles.str.split(': ').str[0]


def fill_indicators(zeros_df, categories):
    """Mark each row's category with 1 in an all-zero indicator frame.

    Args:
        zeros_df: DataFrame with one column per category. It is modified in
            place.
        categories: pandas Series of per-row category labels, in the same
            row order as ``zeros_df``.

    Returns:
        ``zeros_df`` itself, so the call can be chained.
    """
    for cast in zeros_df.columns:
        # Compare the extracted category for equality instead of calling
        # ``title.str.contains(cast)``: ``contains`` searches the whole
        # title, so a category name that also shows up in another
        # category's description would be counted twice.
        mask = (categories == cast).to_numpy()
        # One ``.loc`` assignment; the chained form ``frame[col][mask] = 1``
        # may write to a temporary object instead of the frame.
        zeros_df.loc[mask, cast] = 1
    return zeros_df


if __name__ == '__main__':
    df = pd.read_csv('./code2/911.csv')

    # Each title splits into [category, description].
    print(df['title'].str.split(': '))
    categories = split_category(df['title'])
    # Sorting gives the same column order on every run; iterating a plain
    # set of strings does not.
    cast_list = sorted(categories.dropna().unique())
    print(cast_list)

    print('**(1)**'*10)
    # One row per call, one column per category, everything starts at 0.
    temp_zeros_df = np.zeros((df.shape[0], len(cast_list)))
    print(temp_zeros_df)
    zeros_df = pd.DataFrame(temp_zeros_df, columns=cast_list)
    print(zeros_df)

    print('**(2)**'*10)
    # Show the string accessor and the boolean row mask for the first
    # category only, as an illustration.
    for cast in cast_list[:1]:
        print(df['title'].str)
        print(categories == cast)

    fill_indicators(zeros_df, categories)
    print(zeros_df)

    # Slower alternative (walks the rows and sets one cell per row), kept
    # for reference only:
    #     for i in range(df.shape[0]):
    #         zeros_df.loc[i, categories[i]] = 1
    #     print(zeros_df)

    print('**(3)**'*10)
    # Column sums of the indicator frame are the number of calls per category.
    print(zeros_df.sum(axis=0))
0 [EMS, BACK PAINS/INJURY]
1 [EMS, DIABETIC EMERGENCY]
2 [Fire, GAS-ODOR/LEAK]
3 [EMS, CARDIAC EMERGENCY]
4 [EMS, DIZZINESS]
...
639893 [Fire, GAS-ODOR/LEAK]
639894 [EMS, RESPIRATORY EMERGENCY]
639895 [Fire, FIRE ALARM]
639896 [EMS, UNRESPONSIVE SUBJECT]
639897 [Fire, FIRE INVESTIGATION]
Name: title, Length: 639898, dtype: object
['Traffic', 'EMS', 'Fire']
**(1)****(1)****(1)****(1)****(1)****(1)****(1)****(1)****(1)****(1)**
[[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
...
[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]]
Traffic EMS Fire
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
4 0.0 0.0 0.0
... ... ... ...
639893 0.0 0.0 0.0
639894 0.0 0.0 0.0
639895 0.0 0.0 0.0
639896 0.0 0.0 0.0
639897 0.0 0.0 0.0
[639898 rows x 3 columns]
**(2)****(2)****(2)****(2)****(2)****(2)****(2)****(2)****(2)****(2)**
<pandas.core.strings.StringMethods object at 0x000001A468D71040>
0 False
1 False
2 False
3 False
4 False
...
639893 False
639894 False
639895 False
639896 False
639897 False
Name: title, Length: 639898, dtype: bool
Traffic EMS Fire
0 0.0 1.0 0.0
1 0.0 1.0 0.0
2 0.0 0.0 1.0
3 0.0 1.0 0.0
4 0.0 1.0 0.0
... ... ... ...
639893 0.0 0.0 1.0
639894 0.0 1.0 0.0
639895 0.0 0.0 1.0
639896 0.0 1.0 0.0
639897 0.0 0.0 1.0
[639898 rows x 3 columns]
**(3)****(3)****(3)****(3)****(3)****(3)****(3)****(3)****(3)****(3)**
Traffic 223395.0
EMS 320333.0
Fire 96177.0
dtype: float64
import pandas as pd
import numpy as np
# Count calls per category with groupby: tag every call with its category,
# then count the titles that fall into each category.
df = pd.read_csv('./code2/911.csv')

# 'EMS: BACK PAINS/INJURY' splits into ['EMS', 'BACK PAINS/INJURY'];
# the first piece is the category.
title_parts = df['title'].str.split(': ').tolist()
df['cate'] = [parts[0] for parts in title_parts]
print(df.head())

# Number of non-null titles in each category.
grouped = df.groupby(by='cate')['title'].count()
print(grouped)
lat lng desc \
0 40.297876 -75.581294 REINDEER CT & DEAD END; NEW HANOVER; Station ...
1 40.258061 -75.264680 BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...
2 40.121182 -75.351975 HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...
3 40.116153 -75.343513 AIRY ST & SWEDE ST; NORRISTOWN; Station 308A;...
4 40.251492 -75.603350 CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; S...
zip title timeStamp twp \
0 19525.0 EMS: BACK PAINS/INJURY 2015-12-10 17:10:52 NEW HANOVER
1 19446.0 EMS: DIABETIC EMERGENCY 2015-12-10 17:29:21 HATFIELD TOWNSHIP
2 19401.0 Fire: GAS-ODOR/LEAK 2015-12-10 14:39:21 NORRISTOWN
3 19401.0 EMS: CARDIAC EMERGENCY 2015-12-10 16:47:36 NORRISTOWN
4 NaN EMS: DIZZINESS 2015-12-10 16:56:52 LOWER POTTSGROVE
addr e cate
0 REINDEER CT & DEAD END 1 EMS
1 BRIAR PATH & WHITEMARSH LN 1 EMS
2 HAWS AVE 1 Fire
3 AIRY ST & SWEDE ST 1 EMS
4 CHERRYWOOD CT & DEAD END 1 EMS
cate
EMS 320326
Fire 96177
Traffic 223395
Name: title, dtype: int64
【问题2】(2)统计不同月份的电话次数的变化情况----折线图
import pandas as pd
from matplotlib import pyplot as plt
# Monthly call volume, drawn as a line chart.
df = pd.read_csv('./911-Copy1.csv')

# resample() works on a datetime index: parse the timestamp strings and
# make them the row index.
df['timeStamp'] = pd.to_datetime(df['timeStamp'])
df = df.set_index('timeStamp')

# One bucket per month; count the titles that fall into each bucket.
# NOTE(review): pandas 2.2+ deprecates the 'M' alias in favour of 'ME' --
# confirm the target pandas version before switching.
count_by_month = df.resample('M')['title'].count()
print(df.head())
print(count_by_month)

# y values are the monthly counts; x tick labels are the bucket dates.
_y = count_by_month.values
_x = [stamp.strftime('%Y%m%d') for stamp in count_by_month.index]
positions = range(len(_x))

plt.figure(figsize=(20, 8), dpi=80)
plt.plot(positions, _y, label='title')
plt.xticks(positions, _x, rotation=45)
plt.legend(loc='best')
plt.show()
lat lng \
timeStamp
2015-12-10 17:10:52 40.297876 -75.581294
2015-12-10 17:29:21 40.258061 -75.264680
2015-12-10 14:39:21 40.121182 -75.351975
2015-12-10 16:47:36 40.116153 -75.343513
2015-12-10 16:56:52 40.251492 -75.603350
desc \
timeStamp
2015-12-10 17:10:52 REINDEER CT & DEAD END; NEW HANOVER; Station ...
2015-12-10 17:29:21 BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...
2015-12-10 14:39:21 HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...
2015-12-10 16:47:36 AIRY ST & SWEDE ST; NORRISTOWN; Station 308A;...
2015-12-10 16:56:52 CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; S...
zip title twp \
timeStamp
2015-12-10 17:10:52 19525.0 EMS: BACK PAINS/INJURY NEW HANOVER
2015-12-10 17:29:21 19446.0 EMS: DIABETIC EMERGENCY HATFIELD TOWNSHIP
2015-12-10 14:39:21 19401.0 Fire: GAS-ODOR/LEAK NORRISTOWN
2015-12-10 16:47:36 19401.0 EMS: CARDIAC EMERGENCY NORRISTOWN
2015-12-10 16:56:52 NaN EMS: DIZZINESS LOWER POTTSGROVE
addr e
timeStamp
2015-12-10 17:10:52 REINDEER CT & DEAD END 1
2015-12-10 17:29:21 BRIAR PATH & WHITEMARSH LN 1
2015-12-10 14:39:21 HAWS AVE 1
2015-12-10 16:47:36 AIRY ST & SWEDE ST 1
2015-12-10 16:56:52 CHERRYWOOD CT & DEAD END 1
timeStamp
2015-12-31 7916
2016-01-31 13096
2016-02-29 11396
2016-03-31 11059
2016-04-30 11287
2016-05-31 11374
2016-06-30 11732
2016-07-31 12088
2016-08-31 11904
2016-09-30 11669
2016-10-31 12502
2016-11-30 12091
2016-12-31 12162
2017-01-31 11605
2017-02-28 10267
2017-03-31 11684
2017-04-30 11056
2017-05-31 11719
2017-06-30 12333
2017-07-31 11768
2017-08-31 11753
2017-09-30 11332
2017-10-31 12337
2017-11-30 11548
2017-12-31 12941
2018-01-31 13123
2018-02-28 11165
2018-03-31 14923
2018-04-30 11240
2018-05-31 12551
2018-06-30 12106
2018-07-31 12549
2018-08-31 12315
2018-09-30 12338
2018-10-31 12976
2018-11-30 14097
2018-12-31 12144
2019-01-31 12304
2019-02-28 11556
2019-03-31 12441
2019-04-30 11845
2019-05-31 12823
2019-06-30 12322
2019-07-31 13166
2019-08-31 12387
2019-09-30 11874
2019-10-31 13425
2019-11-30 12446
2019-12-31 12529
2020-01-31 12208
2020-02-29 11043
2020-03-31 9920
2020-04-30 8243
2020-05-31 7220
Freq: M, Name: title, dtype: int64