# Bar chart of how many movies in the IMDB data set belong to each genre.
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

file_path = "IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Every "Genre" cell is a comma-separated list of genres.  get_dummies()
# expands it into one 0/1 indicator column per distinct genre in a single
# vectorised call.  Compared with filling an all-zero frame row by row, it
# - avoids a Python-level loop of label-based assignments,
# - yields columns in sorted order, so the result is reproducible from run
#   to run (a set of strings has no stable order),
# - turns a missing Genre into an all-zero row instead of raising.
genre_flags = df["Genre"].str.get_dummies(sep=",")
# print(genre_flags.head(3))

# Summing each column counts the movies per genre; sort ascending so the bars
# grow from left to right.
genre_count = genre_flags.sum(axis=0).sort_values()
# print(genre_count)

plt.figure(figsize=(20, 8), dpi=80)

# Labels and heights are both taken from the sorted Series so they stay
# aligned; a separately built genre list would not follow the sort.
_x = genre_count.index
_y = genre_count.values
positions = range(len(_x))
plt.bar(positions, _y)
plt.xticks(positions, _x)

plt.show()
# Walk-through of building DataFrames and selecting data through a two-level
# row index (set_index with two columns, swaplevel, chained .loc).
import pandas as pd
import numpy as np

# A 2x4 frame of ones and a 3x3 frame of zeros with overlapping row labels.
df1 = pd.DataFrame(
    np.ones((2, 4)),
    index=["A", "B"],
    columns=["a", "b", "c", "d"],
)
print(df1)
df2 = pd.DataFrame(
    np.zeros((3, 3)),
    index=["A", "B", "C"],
    columns=["x", "y", "z"],
)
# print(df2)

# Notes kept from earlier experiments (all disabled):
#
# .join combines frames on their row index.
# tem_test1 = df1.join(df2)
# print(tem_test1)
# tem_test2 = df2.join(df1)
# print(tem_test2)
#
# .merge combines frames on column values.
# df3 = pd.DataFrame(np.arange(9).reshape((3, 3)), columns=list("fax"))
# print(df3)
# # df1.loc["A", "a"] = 100
# # print(df1)
# If df1 is modified so that column "a" contains a single 1, the merge below
# yields one row; unmodified, column "a" is 1 in both rows A and B, so merging
# on "a" yields two rows.
# print(df1.merge(df3, on="a"))
#
# print(df1.index)
# df1.index = ["a", "b"]
# print(df1)
# print("*" * 100)
# reindex picks the rows labelled ["a", "f"] out of the data: row "a" exists
# and keeps its values, row "f" does not exist and shows up as NaN.
# print(df1.reindex(["a", "f"]))
# print(df1)
# print("*" * 100)
# print(df1.set_index("a"))
# print(df1.set_index("a").index)
# print("*" * 100)
# print(df1.set_index("a", drop=False))
# print(df1["d"].unique())
# print(df1.set_index("a").index.unique())
# print(len(df1.set_index("a").index))
# print(df1.set_index(["a", "b"]))
# print(df1.set_index(["a", "b"]).index)

_separator = "*" * 100

# Seven-row frame whose "c" and "d" columns become the index levels below.
a = pd.DataFrame(
    {
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["one"] * 3 + ["two"] * 4,
        "d": list("hjklmno"),
    }
)
print(a)

# Two-level row index: outer level "c", inner level "d".
b = a.set_index(["c", "d"])
print(b)

# Column "a" as a Series that carries the same two-level index.
c = b["a"]
print(c)
# Select by outer label, then by inner label.
print(c["one"]["j"])
print(_separator)
print(c["one"])

# Same column, but with the levels in the opposite order ("d" outer).
_indexed_d_then_c = a.set_index(["d", "c"])
d = _indexed_d_then_c["a"]
print(d)
# swaplevel() exchanges the two levels so "c" is the outer one again.
print(d.swaplevel())
print(_separator)

# Reaching the inner label "h": either drill down level by level, or swap the
# levels first so "h" can be looked up directly on the outer level.
print(b.loc["one"].loc["h"])
print(b.swaplevel().loc["h"])
# Groups the store directory by country and state/province, counts the
# "Brand" entries in each group, and prints the resulting two-level index.
import pandas as pd
import numpy as np

file_path = "./directory.csv"
df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())

# Notes kept from earlier experiments (all disabled):
#
# grouped = df.groupby(by="Country")
# print(grouped)
# A DataFrameGroupBy can be iterated as (key, sub-frame) pairs.
# for i, j in grouped:
#     print(i)
#     print("-" * 100)
#     print(j)
#     print("*" * 100)
# df[df["Country"] == "US"]
#
# It can also be aggregated.
# country_count = grouped["Brand"].count()
# print(country_count["US"])
# print(country_count["CN"])
#
# Number of stores in each province of China.
# china_data = df[df["Country"] == "CN"]
# grouped = china_data.groupby(by="State/Province").count()["Brand"]
# print(grouped)
#
# Grouping by several keys: df["Brand"] is a Series, not a DataFrame, so the
# keys have to be passed as Series such as df["Country"].  The result is a
# Series with a two-level index.
# grouped = df["Brand"].groupby(by=[df["Country"], df["State/Province"]]).count()
# print(grouped)

# Selecting with a list ([["Brand"]]) keeps the result a DataFrame rather than
# a Series.  Equivalent spellings of the same grouping:
#   df[["Brand"]].groupby(by=[df["Country"], df["State/Province"]]).count()
#   df.groupby(by=[df["Country"], df["State/Province"]])[["Brand"]].count()
#   df.groupby(by=[df["Country"], df["State/Province"]]).count()[["Brand"]]
_group_keys = ["Country", "State/Province"]
grouped1 = df.groupby(by=_group_keys)[["Brand"]].count()

# Inspect the index of the grouped result.
print(grouped1.index)
# Prints, for every country in the store directory, the number of non-null
# entries in each column.
import pandas as pd
from matplotlib import pyplot as plt

file_path = "./directory.csv"
df = pd.read_csv(file_path)

_by_country = df.groupby(by="Country")
# Disabled refinement: keep only the ten largest "Brand" counts by appending
#   ["Brand"].sort_values(ascending=False)[:10]
data1 = _by_country.count()
print(data1)

# Disabled plotting of the counts as a bar chart:
# _x = data1.index
# _y = data1.values
# print(len(_x))
# # print(range(len(_x)))
# plt.figure(figsize=(20, 8), dpi=80)
# # plt.bar(range(len(_x)), _y)
# # plt.xticks(range(len(_x)), _x)
# # # plt.show()