pandas数据处理
对数据进行分类统计排序
算法设计逻辑
- 从全部数据中选取电影分类数据Gerne
- 将gerne数据处理成电影类型名称集合
- 构造values全为0的 电影数量 x 电影类型的 DataFrame
- 将各符合电影种类的电影类型对应位置Values设为1
- 各类电影所在列值相加,得到各类电影的总数
- 画图
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
file_path = './IMDB-Movie-Data.csv'
df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())
# print(df['Genre'])
#统计分类的列表
temp_list = df['Genre'].str.split(',').tolist()
# print('temp_list')
# print(temp_list)
gerne_list = list( set([i for j in temp_list for i in j ]))
# print('gerne_list')
# print(gerne_list)
zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(gerne_list))),columns = gerne_list)
print('zeros_df')
print(zeros_df)
for i in range(df.shape[0]):
zeros_df.loc[i,temp_list[i]] = 1
# print(zeros_df.head(4))
gerne_count = zeros_df.sum(axis=0)
# print('gerne_count')
# print(gerne_count)
gerne_count = gerne_count.sort_values(ascending=True)
# print('gerne_count')
# print(gerne_count)
_x = gerne_count.index
_y = gerne_count.values
plt.figure(figsize=(20,8),dpi=80)
plt.bar(range(len(_x)),_y)
plt.xticks(range(len(_x)),_x)
plt.show()
数据的合并
使用join,merge合并数据
合并数据时,join()中没有的数据用NAN代替
import numpy as np
import pandas as pd
# list1 = np.array([[1,2,3],[4,4,5],[6,346,63],[235,564,5]])
# print(list1)
# print(list1.shape[0])
# print(list1.shape[1])
df1 = pd.DataFrame(np.ones((2,4)),index = ['A','B'],columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3,3)),index =['A','B','C'],columns=list('xyz'))
df3 = pd.DataFrame(np.zeros((3,3)),columns = list('fax'))
print('df1')
print(df1)
print('df2')
print(df2)
print('df3')
print(df3)
t1 = df1.join(df2)
t2 = df2.join(df1)
t3 = df1.merge(df3,on='a')
print(t1)
print(t2)
print(t3)
使用merge合并数据时,数据将按照指定的方式进行合并
df4.loc['A','c']= 'a'
df4.loc['B','c']= 'c'
df4.loc['C','c']= 'c'
print('df4')
print(df4)
df5.loc['A','x']='c'
df5.loc['B','x']='d'
print('df5')
print(df5)
t1 = df4.merge(df5,left_on='c',right_on='x',how='inner')
print('t1')
print(t1)
how=‘inner’: 取交集(默认连接方式)
how=‘outer’:取并集
how=‘left’:按照左侧数据连接
how=‘right’:按照右侧数据连接
分组和聚合
import pandas as pd
import numpy as np
file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)
print(df.head(1))
print(df.info())
grouped = df.groupby(by='Country')
print('grouped')
print(grouped)
# for i,j in grouped:
# print(i)
# print('-'*100)
# print(j)
# print("*"*100)
# us_data = df[ df['Country'] =='US']
# # print(us_data)
#
# country_count = grouped['Brand'].count()
# print(country_count['US'])
# print(country_count['CN'])
# 统计中国每个省份的店铺的数量
# china_data = df[df['Country'] == 'CN']
# grouped_china = china_data.groupby(by = 'State/Province').count()['Brand']
#
# print(grouped_china)
# 数据按照多个条件进行分组
multi_grouped = df['Brand'].groupby(by = [df['Country'],df['State/Province']]).count()
print(multi_grouped,type(multi_grouped))
print("*"*100)
multi_grouped1 = df[['Brand']].groupby(by = [df['Country'],df['State/Province']]).count()
multi_grouped2 = df.groupby(by = [df['Country'],df['State/Province']])[['Brand']].count()
multi_grouped3 = df.groupby(by = [df['Country'],df['State/Province']]).count()[['Brand']]
print(multi_grouped1,type(multi_grouped1))
print("*"*100)
print(multi_grouped2,type(multi_grouped2))
print("*"*100)
print(multi_grouped3,type(multi_grouped3))
print("*"*100)
# 索引的方法和属性
print(multi_grouped3.index)
关于groupby用法详解,找到了一篇不错的文章: link.
索引和复合索引
import pandas as pd
import numpy as np
df1 = pd.DataFrame([[100,1,1,1],[1,1,1,1]],index=['A','B'],columns=list('abcd'))
print('--------------1---------------')
print(df1)
df1.index = ['X','Y']
print('--------------2----------------')
print(df1)
print('--------------3----------------')
print(df1.reindex(list('Xdef'))) #reindex不会改变df1的索引
print('---------------4----------------')
print(df1.set_index(['c','d'],drop=False))#set_index不会改变df1的值
a = pd.DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','one','two','two','two'],'d':list('hjklmno')})
print('---------------5----------------')
print(a)
# a1是Dataframe
a1 = a.set_index(['c','d'])
print('----------a1-----------')
print(a1)
a2 = a1['a']
print('--------------a2-----------')
print(a2)
a3 = a2['one']
a4 = a2['two']['m']
print('-----------a3----------')
print(a3)
print('-----------a4----------')
print(a4)
# c是series
c = a.set_index(['c','d'])['a']
## cc是DataFrame
cc = a.set_index(['c','d'])[['a']]
print('--------------c------------')
print(c)
print('--------------cc-----------')
print(cc)
#series取索引,直接取
print(c['one','k'])
#Dataframe取索引,使用标签
print('Dataframe取值')
print(a1.loc['one'].loc['j'])
# a1交换里外层索引
c1 = a1.swaplevel()
print('c1')
print(c1)
分组,聚合及索引的应用
Task1 将店铺中数量排名前25的省份及店铺数量画图表示
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager
my_font = font_manager.FontProperties(fname="C:\Windows\Fonts\STXINGKA.TTF")
file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)
data1 = df.groupby(by='Country').count()['Brand'].sort_values(ascending=False)[:10]
# print(data1)
data2 = df[df['Country'] == "CN"].groupby(by='City').count()['Brand'].sort_values(ascending=False)[:25]
# print(data2)
_x1 = data1.index
_y1 = data1.values
_x2 = data2.index
_y2 = data2.values
plt.figure(figsize=(18,15),dpi=80)
# plt.bar(range(len(_x1)),_y1)
# plt.xticks(range(len(_x1)),_x1)
# plt.show()
plt.barh(range(len(_x2)),_y2,height=0.3,color='orange')
plt.yticks(range(len(_x2)),_x2,fontproperties=my_font)
plt.show()
Task2 不同年份书籍数量以及不同年份书籍的平均评分情况
import pandas as pd
from matplotlib import pyplot as plt
file_path = './books.csv'
df = pd.read_csv(file_path)
print(df.head(1))
print(df.info())
# Task1-------------------------
data1 = df[pd.notnull(df['original_publication_year'])] #处理缺失数据:删掉缺失
booknumber_peryear = data1.groupby(by='original_publication_year').count()['id']
print('每年书籍')
print(booknumber_peryear)
#-----------------------------------
data2 = df[pd.notnull(df['average_rating'])] #删掉缺失数据
average_rating = data2['average_rating'].groupby(by=data2['original_publication_year']).mean()
print('average_rating')
print(average_rating)
_x = average_rating.index
_y = average_rating.values
plt.figure(figsize=(20,8),dpi=80)
plt.plot(range(len(_x)),_y)
plt.xticks(list(range(len(_x)))[::10],_x[::10].astype(int),rotation=45)
plt.show()