练习1
通过starbucks_store_worldwide.csv数据,获取中国每个省份的分布数量
代码实现
#!/usr/bin/env python
#-*-coding: utf-8-*-
#@Time : 2020/9/11 15:49
#@Author : GodSpeed
#@File : pandas第五次作业01.py
#@Software : PyCharm
import numpy as np
import pandas as pd
'''
练习1
通过starbucks_store_worldwide.csv数据,获取中国每个省份的分布数量
'''
# 作业01 先截取目标数据[国家为中国的数据],再进行统计
def task01_method(buck_store_w):
    """Count Starbucks stores per Chinese province and save the result.

    Args:
        buck_store_w: DataFrame holding at least the columns ``Country``,
            ``State/Province`` and ``Store Number``.

    Side effects:
        Prints the rows whose ``Country`` is ``CN`` and the per-province
        counts, then writes the counts to ``task01_dist_prov_stores.csv``
        in the current working directory.
    """
    # 1. Keep only the rows whose Country is CN.
    china_stores = buck_store_w[buck_store_w['Country'] == 'CN']
    print(china_stores)
    # 2. Group the filtered rows (not the worldwide frame) by State/Province
    #    and count the 'Store Number' entries, so only Chinese provinces
    #    appear in the result.
    dist_prov_stores = china_stores.groupby(by='State/Province')['Store Number'].count()
    print(dist_prov_stores)
    # 3. Save the per-province counts.
    dist_prov_stores.to_csv('task01_dist_prov_stores.csv')
if __name__ == '__main__':
    # Load starbucks_store_worldwide.csv, keeping only the three columns the
    # exercise needs.
    buck_store_w = pd.read_csv(
        'starbucks_store_worldwide.csv',
        usecols=['Store Number', 'State/Province', 'Country'],
    )
    print(buck_store_w.head())
    # Sample output recorded by the author:
    #      Store Number State/Province Country
    #   0  47370-257954              7      AD
    #   1  22331-212325             AJ      AE
    #   2  47089-256771             AJ      AE
    #   3  22126-218024             AZ      AE
    #   4  17127-178586             AZ      AE

    # Inspect the frame. DataFrame.info() prints its report itself and
    # returns None, so it is called without print() to avoid an extra
    # "None" line in the output.
    buck_store_w.info()
    # Sample output recorded by the author:
    #   <class 'pandas.core.frame.DataFrame'>
    #   RangeIndex: 25600 entries, 0 to 25599
    #   Data columns (total 3 columns):
    #    #   Column          Non-Null Count  Dtype
    #   ---  ------          --------------  -----
    #    0   Store Number    25600 non-null  object
    #    1   State/Province  25600 non-null  object
    #    2   Country         25600 non-null  object
    #   dtypes: object(3)
    #   memory usage: 300.1+ KB

    task01_method(buck_store_w)
结果
练习2
统计出911数据中不同月份的电话次数,并直观体现数据的变化情况
代码实现
#!/usr/bin/env python
#-*-coding: utf-8-*-
#@Time : 2020/9/13 23:17
#@Author : GodSpeed
#@File : pandas第五次作业最终修正版本.py
#@Software : PyCharm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
'''
练习2
统计出911数据中不同月份的电话次数,并直观体现数据的变化情况
'''
# Global matplotlib font settings applied to every figure in this script.
# NOTE(review): SimHei is presumably chosen so the Chinese plot title
# renders correctly — confirm the font is installed on the target machine.
font = dict(family='simhei', weight='bold', size='12')
plt.rc('font', **font)
def calls_911_months():
    """Plot the number of 911 calls per calendar month as a line chart.

    Reads ``911.csv`` from the current working directory, converts the
    ``timeStamp`` column to datetimes, counts the rows that fall in each
    month and shows a line chart with one ``YYYY-MM`` tick label per month.

    Side effects:
        Opens a matplotlib window via ``plt.show()``.
    """
    # 1. Load the 911 call log. The author's recorded info() output lists
    #    249737 rows and 9 columns (lat, lng, desc, zip, title, timeStamp,
    #    twp, addr, e), with timeStamp stored as object (string).
    data_911 = pd.read_csv('911.csv')

    # 2. Convert timeStamp from object to datetime64[ns].
    #    No explicit ``format`` is passed: a partial pattern such as
    #    '%Y-%m' does not describe a full date-time string and is rejected
    #    by strict format parsing in recent pandas releases.
    #    NOTE(review): assumes the column holds full date-time strings
    #    (e.g. 'YYYY-MM-DD HH:MM:SS') — confirm against the CSV.
    data_911['timeStamp'] = pd.to_datetime(data_911['timeStamp'])

    # 3. Use timeStamp as the row index so the rows can be resampled by time.
    data_911.set_index('timeStamp', inplace=True)

    # Downsample to one bin per month; "MS" labels each bin with the month
    # start. Counting 'lat' (a column with no missing values in the recorded
    # info() output) gives the number of calls per month as a Series.
    count_m = data_911['lat'].resample('MS').count()
    # Sample result recorded by the author:
    #   2015-12-01     7916
    #   2016-01-01    13096
    #   2016-02-01    11396
    #   ...
    #   2017-08-01    11753
    #   2017-09-01     7276
    #   Freq: MS, Name: lat, dtype: int64

    # A line chart shows the change over time (a histogram would show the
    # distribution instead). x is the time axis, y the call counts.
    x = count_m.index
    y = count_m.values

    # Plotting directly against the datetime index lets matplotlib pick its
    # own ticks (2016-01, 2016-04, ...). To get one tick per month, plot
    # against integer positions and label each position explicitly.
    positions = range(len(x))

    # Every index element is a Timestamp; format it as year-month so the
    # labels do not carry the time-of-day part.
    x_f = [stamp.strftime("%Y-%m") for stamp in x]

    # xticks takes the tick positions and their labels; the labels are
    # rotated because one label per month is too dense to read horizontally.
    plt.xticks(positions, x_f, rotation=45)
    plt.plot(positions, y)
    plt.grid()  # draw grid lines at the x and y tick positions
    plt.title("911每月来电次数情况")
    plt.show()
# Run the monthly 911 call analysis only when executed as a script.
if __name__ == "__main__":
    calls_911_months()
结果:
总结收获: 1.复习了时间序列:文件中的时间数据可能是数值,也可能是字符串,当不是时间类型时要先转换为时间序列; 2.时间格式化用strftime("%Y-%m"); 3.当坐标轴刻度不满足要求时,通过xticks设置刻度和刻度标签; 4.折线图显示变化,直方图显示数据的分布情况。