import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time

# Load the IMDB movie data and take the runtime column as a NumPy array.
temp_data = pd.read_csv("./IMDB-Movie-Data.csv")
data = temp_data["Runtime (Minutes)"].values

# Histogram of runtimes in fixed 5-minute bins.
# The bin edges are built explicitly and reused as x ticks, so every bar
# sits exactly between two ticks. Deriving only a bin *count* with
# (max - min) // 5 yields 5-minute bins solely when the range is an exact
# multiple of 5, and yields 0 bins (an error in hist) when the range is < 5.
bin_width = 5
lowest = int(np.floor(data.min()))
highest = int(np.ceil(data.max()))
# Ceiling division so the last edge is >= the maximum; always at least 1 bin.
num_bins = max(1, (highest - lowest + bin_width - 1) // bin_width)

x_ticks = range(lowest, lowest + num_bins * bin_width + 1, bin_width)

plt.figure(figsize=(20, 8), dpi=80)
plt.hist(data, list(x_ticks))  # a sequence is interpreted as bin edges
plt.xticks(x_ticks)
plt.grid(alpha=0.4, linestyle="--")

plt.show()

# Try merging the sparse head and tail of the distribution into wide bins.
# The list below holds bin *edges*, not a bin count: one wide bin from 66 to
# 81, 5-minute bins with edges 81, 86, ..., 166, and one wide bin from 166
# to 192.
# NOTE(review): 66 and 192 are hard-coded for this data set. The original
# author's note says 192 was chosen so the maximum value 191 is included;
# confirm against the data if the CSV changes.
num_bins_2 = [66, *range(81, 171, 5), 192]

plt.figure(dpi=80, figsize=(20, 8))
plt.hist(data, bins=num_bins_2)
plt.xticks(num_bins_2)
plt.grid(linestyle="--", alpha=0.6)

plt.show()

# 对字符串进行分裂及统计 (splitting strings and counting occurrences)

# Split each movie's comma-separated "Genre" string into a list of genres.
temp_list = temp_data["Genre"].str.split(",")
# Every distinct genre name that occurs in the column (missing values skipped).
genre_set = {genre for genres in temp_list.dropna() for genre in genres}

# Indicator matrix: one row per movie, one column per genre, 1 where the
# movie has that genre and 0 elsewhere.
# str.get_dummies compares whole comma-separated tokens. Marking rows with
# str.contains(genre) instead performs regex *substring* matching, so a genre
# whose name is contained in another one (e.g. "Music" inside "Musical")
# gets over-counted and regex metacharacters in a name are misinterpreted.
# Columns come back in sorted order, which also makes the layout
# deterministic (a set has no defined order).
# Cast to float so the frame keeps a float dtype for any downstream code.
count_frame = temp_data["Genre"].str.get_dummies(sep=",").astype(float)

# Number of movies per genre, ascending, for plotting.
sum_frame = count_frame.sum().sort_values()

plt.figure(figsize=(20, 8), dpi=80)
plt.bar(sum_frame.index, sum_frame.values, width=0.6, color="orange")

plt.show()

# 利用groupby进行分组统计 (grouped statistics with groupby)

# Load the worldwide Starbucks store list and group the rows by country code.
read_data = pd.read_csv("./starbucks_store_worldwide.csv")
group_data = read_data.groupby("Country")

# group_data is a DataFrameGroupBy; iterating it yields
# (group name, DataFrame of that group's rows) pairs.

# Count the non-null "Brand" entries in each group. "Country" becomes the
# index of the resulting Series.
# NOTE(review): the original author picked "Brand" as a column assumed to
# have no missing values, so this equals the store count — confirm.
count_group_data = group_data["Brand"].count()
print(count_group_data.head())

# Output:
# Country
# AD      1
# AE    144
# AR    108
# AT     18
# AU     22
# Name: Brand, dtype: int64

# Grouping by more than one key

# Number of stores in each province of China.
# The order of the keys matters: it fixes the level order of the row index.
group_data_1 = read_data.groupby(["Country", "State/Province"])
# Per-group non-null counts; rows carry a (Country, State/Province) MultiIndex.
count_group_data_1 = group_data_1.count()
# Selecting "CN" on the first index level leaves a Series keyed by province.
data = count_group_data_1["Brand"].loc["CN"]

plt.figure(dpi=80, figsize=(20, 8))
plt.bar(data.index, data.values, color="orange")

plt.show()

# The ten countries with the most stores, largest first.
data = (
    read_data.groupby("Country")["Brand"]
    .count()
    .sort_values(ascending=False)
    .head(10)
)

plt.figure(dpi=80, figsize=(20, 8))
plt.bar(data.index, data.values, width=0.5, color="orange")

plt.show()

pandas之统计

对字符串进行分裂及统计¶

利用groupby进行分组统计¶

“相关推荐”对你有帮助么？