字符串离散化的案例
对于这一组电影数据,如果我们希望统计电影分类(genre)的情况,应该如何处理数据?
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
file_path = "./IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)
#统计分类的列表
temp_list = df["Genre"].str.split(",").tolist() #[[],[],[]]
# print(temp_list)
genre_list = list(set([i for j in temp_list for i in j]))
# print(genre_list)
#构造全为0的数组
zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(genre_list))),columns = genre_list)
#print(zeros_df)
#Give 1 to each genre appear in the list
for i in range(df.shape[0]):
zeros_df.loc[i,temp_list[i]] = 1
# print(zeros_df.head(3))
#统计每个电影的数量和 axis = 0 算列
genre_count = zeros_df.sum(axis = 0)
# print(genre_count)
#排序
genre_sort_list = genre_count.sort_values()
# print(genre_sort_list)
#draw
_x = genre_sort_list.index
_y = genre_sort_list.values
plt.figure(figsize=(20,8),dpi=80)
plt.bar(range(len(_x)),_y,width=0.4,color="orange")
plt.xticks(range(len(_x)),_x)
plt.show()
数据的合并和分组聚合
数据合并之join
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.ones((2,4)),index = ["A","B"],columns = list("abcd"))
# print(df1)
df2 = pd.DataFrame(np.zeros((3,3)),index = ["A","B","C"],columns = list("xyz"))
# print(df2)
'''
a b c d
A 1.0 1.0 1.0 1.0
B 1.0 1.0 1.0 1.0
x y z
A 0.0 0.0 0.0
B 0.0 0.0 0.0
C 0.0 0.0 0.0
'''
print(df1.join(df2))
'''
a b c d x y z
A 1.0 1.0 1.0 1.0 0.0 0.0 0.0
B 1.0 1.0 1.0 1.0 0.0 0.0 0.0
'''
print(df2.join(df1))
'''
x y z a b c d
A 0.0 0.0 0.0 1.0 1.0 1.0 1.0
B 0.0 0.0 0.0 1.0 1.0 1.0 1.0
C 0.0 0.0 0.0 NaN NaN NaN NaN
'''
数据合并之merge
import pandas as pd
import numpy as np
t1 = pd.DataFrame(np.ones((2,5)),index = ["A","B"],columns = list("abcde"))
t1.loc["A","c"] = "c"
t1.loc["B","c"] = "d"
# print(t1)
t2 = pd.DataFrame(np.zeros((3,4)),index = ["A","B","C"],columns = list("wxyz"))
t2.loc["A","y"] = "a"
t2.loc["B","y"] = "b"
t2.loc["C","y"] = "c"
# print(t2)
'''
t1:
a b c d e
A 1.0 1.0 c 1.0 1.0
B 1.0 1.0 d 1.0 1.0
t2:
w x y z
A 0.0 0.0 a 0.0
B 0.0 0.0 b 0.0
C 0.0 0.0 c 0.0
'''
'''
left_on 左边以c列作为key
right_on 右边以y列作为key
默认合并是inner,交集
outer 并集,以NaN补全
left:以左边为基准
right:以右边为基准
'''
result1 = t1.merge(t2,left_on = "c",right_on = "y")
result2 = t1.merge(t2,left_on = "c",right_on = "y",how = "inner")
result3 = t1.merge(t2,left_on = "c",right_on = "y",how = "outer")
result4 = t1.merge(t2,left_on = "c",right_on = "y",how = "left")
result5 = t1.merge(t2,left_on = "c",right_on = "y",how = "right")
print(result1)
print(result2)
print(result3)
print(result4)
print(result5)
数据分组聚合
现在我们有一组关于全球星巴克店铺的统计数据,如果我想知道美国的星巴克数量和中国的哪个多,或者我想知道中国每个省份星巴克的数量的情况,那么应该怎么办?
import pandas as pd
import numpy as np
file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
# print(df.head(1))
# grouped = df.groupby(by="Country")
# # print(grouped)
# country_count = grouped["Brand"].count()
# print(country_count)
# print(country_count["US"])
# print(country_count["CN"])
#中国每个省店铺的数量
# china_data = df[df["Country"] == "CN"]
# grouped = china_data.groupby(by = "State/Province").count()["Brand"]
# print(grouped)
# print(type(grouped))
t2 = df.groupby(by=["Country","State/Province"])[["Country"]].count()
print(t2)
print(type(t2))
数据的索引学习
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.ones((2,4)),index = ["A","B"],columns = list("abcd"))
#获取index:df.index
#print(df1.index)
#指定index :df.index = ['x','y']
# df1.index = ['x','y']
print(df1)
#重新设置index : df.reindex(list("abcedf")) 实则是对dataframe进行取行
print(df1.reindex(["A","f"]))
#指定某一列作为index :df.set_index("Country",drop=False),drop为假表示之前列充当索引的列名称依然保存
print(df1.set_index("a",drop=False))
print(df1.set_index("a",drop=True))
print(df1.set_index(["a","d"],drop=False))
print("*"*100)
#返回index的唯一值:df.set_index("Country").index.unique()
print(df1.set_index("a").index.unique())
练习1
# 使用matplotlib呈现出店铺总数排名前10的国家
import pandas as pd
from matplotlib import pyplot as plt
file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
data_pre10 = df.groupby("Country")["Brand"].count().sort_values(ascending = False)[:10]
# print(data_pre10)
_x = data_pre10.index
_y = data_pre10.values
plt.figure(figsize=(20,8),dpi=80)
plt.bar(range(len(_x)),_y,width = 0.4)
plt.xticks(range(len(_x)),_x,)
plt.show()
plt.savefig("140.jpg")
练习2
# 现在我们有全球排名靠前的10000本书的数据,那么请统计一下下面几个问题:
# 1不同年份书的数量
# 2不同年份书的平均评分情况
import pandas as pd
from matplotlib import pyplot as plt
file_path = "./books.csv"
df = pd.read_csv(file_path)
#去除没数据的
data1 = df[pd.notnull(df["original_publication_year"])]
#统计数量
grouped1 = data1.groupby(by="original_publication_year").count()["title"]
#统计评分
grouped2 = data1["average_rating"].groupby(by=data1["original_publication_year"]).mean()
print(grouped1)
print(grouped2)