Pandas(数据的合并、分组、聚合)

最新推荐文章于 2022-05-17 11:18:50 发布

weixin_54475711

最新推荐文章于 2022-05-17 11:18:50 发布

阅读量185

点赞数

本文链接：https://blog.csdn.net/weixin_54475711/article/details/115352703

版权

对于这一组电影数据，如果我们想了解 rating、runtime 的分布情况，应该如何呈现数据？

# coding=utf-8
import pandas as pd
from matplotlib import pyplot as plt
file_path = "./IMDB-Movie-Data.csv"

# Load the movie table; only the "Runtime (Minutes)" column is used below.
df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())

# Goal: show how movie runtimes are distributed, drawn as a histogram.
runtime_data = df["Runtime (Minutes)"].values

max_runtime = runtime_data.max()
min_runtime = runtime_data.min()

# Width of every histogram bin, in the same unit as the column (minutes).
bin_width = 5

span = max_runtime - min_runtime
print(span)

# Ceiling division: enough fixed-width bins to reach max_runtime even when
# the span is not a multiple of bin_width. At least one bin is kept so the
# edge list below always has two entries, even if all runtimes are equal.
num_bin = max(1, int(-(-span // bin_width)))

# Explicit bin edges. The same list drives both the histogram and the x-axis
# ticks, so every tick sits exactly on a bin boundary and no tick is drawn
# beyond the last bin.
_x = [min_runtime + k * bin_width for k in range(num_bin + 1)]

# Figure size
plt.figure(figsize=(20, 8), dpi=80)
plt.hist(runtime_data, _x)
plt.xticks(_x)

plt.show()

import numpy as np
from matplotlib import pyplot as plt

# NOTE(review): 电影评分数据 ("movie rating data") is not defined anywhere in
# the visible source; it looks like a placeholder for the actual rating
# values and must be bound before this script can run -- confirm.
runtime_data = np.array([电影评分数据])
min_runtime = runtime_data.min()
max_runtime = runtime_data.max()
print(min_runtime, max_runtime)

# Unequal-width bin edges: a first bin [1.9, 3.5), followed by 0.5-wide bins.
# Edges keep being added while the last one is still <= the maximum, so the
# final edge always lies above the largest value. (The original note says
# hist treats each bin as a left-closed, right-open interval.)
num_bin_list = [1.9, 3.5]
while num_bin_list[-1] <= max_runtime:
    num_bin_list.append(num_bin_list[-1] + 0.5)
print(num_bin_list)

# Figure size
plt.figure(figsize=(20, 8), dpi=80)
plt.hist(runtime_data, bins=num_bin_list)

# Put the x ticks on the bin edges so the labels line up with the bars.
plt.xticks(num_bin_list)

plt.show()

对于这一组电影数据，如果我们希望统计电影分类(genre)的情况，应该如何处理数据？
思路：重新构造一个全为0的数组，列名为分类，如果某一条数据中分类出现过，就让0变为1

将字符串类别转化为数字时，一般采用以下方式：先创建一个全为0的DataFrame（列名为各个分类），某个分类在一行数据中出现，就把该行对应列的0改为1；最后沿其中一个维度求和（这里按列求和，axis=0），得到的总和就是每个分类出现的次数。

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
# Widen pandas' console output so printed frames are not truncated.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.width", 1000)
pd.set_option("display.max_colwidth", 1000)

# Forward slash instead of ".\I...": "\I" is an invalid escape sequence in a
# normal string literal, and a backslash separator only works on Windows.
file_path = "./IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# One list of genre names per movie: [[...], [...], ...]
temp_list = df["Genre"].str.split(",").tolist()

# Distinct genre names. Sorting fixes the column order, which would
# otherwise follow set iteration order and change between runs.
genre_list = sorted({genre for genres in temp_list for genre in genres})

# Indicator table: one row per movie, one column per genre, all zeros.
zeros_df = pd.DataFrame(
    np.zeros((df.shape[0], len(genre_list))),
    columns=genre_list,
)
# print(zeros_df)

# Mark the genres of each movie with 1. A single assignment sets several
# columns of one row, e.g. zeros_df.loc[0, ["Sci-Fi", "Musical"]] = 1.
# Row labels are 0..n-1 because zeros_df was built without an explicit index.
for row, genres in enumerate(temp_list):
    zeros_df.loc[row, genres] = 1
# print(zeros_df.head(3))

# Column sums give the number of movies in each genre; sort ascending.
genre_count = zeros_df.sum(axis=0).sort_values()
# print(genre_count)
_x = genre_count.index
_y = genre_count.values

# Bar chart: one bar per genre, labelled with the genre name.
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(_x)), _y, width=0.4, color="orange")
plt.xticks(range(len(_x)), _x)
plt.show()

数据的分组和聚合

import pandas as pd
import numpy as np

# Widen pandas' console output so printed frames are not truncated.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.width", 1000)
pd.set_option("display.max_colwidth", 1000)

# Forward slash instead of ".\s...": "\s" is an invalid escape sequence in a
# normal string literal, and a backslash separator only works on Windows.
file_path = "./starbucks_store_worldwide.csv"

df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())

# Group the stores by country. The result is a DataFrameGroupBy object; it
# can be iterated as (group key, sub-DataFrame) pairs:
# for country, stores in grouped:
#     print(country)
#     print("-" * 100)
#     print(stores, type(stores))
#     print("*" * 100)
# The rows of one country can also be selected directly:
# df[df["Country"] == "US"]
grouped = df.groupby(by="Country")
# print(grouped)

# Aggregation: count the "Brand" entries of every country.
# print(grouped.count())
country_count = grouped["Brand"].count()
# print(country_count)
# print(country_count["US"])
# print(country_count["CN"])

# Number of stores in each province of China. The grouping key is
# "State/Province" to match that goal (the earlier version grouped by
# "City"). "Brand" is selected before counting so only that one column is
# aggregated.
china_data = df[df["Country"] == "CN"]
# print(china_data)
grouped = china_data.groupby(by="State/Province")["Brand"].count()
# print(grouped)

# Grouping by several keys, selecting "Brand" with single brackets, returns
# a Series:
# grouped = df["Brand"].groupby(by=[df["Country"], df["State/Province"]]).count()
# print(grouped)
# print(type(grouped))

# Grouping by several keys, selecting ["Brand"] with a list, returns a
# DataFrame. The two commented forms are alternative spellings.
grouped1 = df[["Brand"]].groupby(by=[df["Country"], df["State/Province"]]).count()
# grouped2 = df.groupby(by=[df["Country"], df["State/Province"]])[["Brand"]].count()
# grouped3 = df.groupby(by=[df["Country"], df["State/Province"]]).count()[["Brand"]]

# print(grouped1, type(grouped1))
# print("*" * 100)
# print(grouped2, type(grouped2))
# print("*" * 100)
# print(grouped3, type(grouped3))

# Inspect the index produced by grouping on two keys.
print(grouped1.index)

数据的索引

weixin_54475711

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Pandas(数据的合并、分组、聚合)

对于这一组电影数据，如果我们想rating，runtime的分布情况，应该如何呈现数据？# coding=utf-8import pandas as pdfrom matplotlib import pyplot as pltfile_path = "./IMDB-Movie-Data.csv"df = pd.read_csv(file_path)#print(df.head(1))#print(df.info())#rating,runtime分布情况#选择图形，直方图#准备数据
复制链接

扫一扫