字符串的离散化
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Load the IMDB movie table; show every column when a frame is printed.
file_path = "./datasets_IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Each "Genre" cell is a comma-separated string; split it into one list per movie.
temp_list = df["Genre"].str.split(",").tolist()
# Distinct genre names over all movies (a set, so the order is arbitrary).
genre_list = list({genre for genres in temp_list for genre in genres})

# Indicator table: one row per movie (df.shape[0]), one column per genre,
# all zeros to start with; columns= names the columns after the genres.
zeros_df = pd.DataFrame(
    np.zeros((df.shape[0], len(genre_list))),
    columns=genre_list,
)
# print(zeros_df)

# Put a 1 in every genre column that the movie in that row belongs to.
for row_idx, genres in enumerate(temp_list):
    zeros_df.loc[row_idx, genres] = 1

# Summing each column gives the number of movies per genre.
genre_count = zeros_df.sum(axis=0)
print(genre_count)

# Sort ascending so the bars grow from left to right.
genre_count = genre_count.sort_values()
_x = genre_count.index
_y = genre_count.values

# Bar chart: one bar per genre, labelled with the genre name.
plt.figure(figsize=(20, 10), dpi=100)
bar_positions = range(len(_x))
plt.bar(bar_positions, _y, color="red")
plt.xticks(bar_positions, _x)
plt.show()
War 13.0
Horror 119.0
Mystery 106.0
Animation 49.0
Family 51.0
Action 303.0
Fantasy 101.0
Crime 150.0
Romance 141.0
Adventure 259.0
History 29.0
Music 16.0
Sport 18.0
Drama 513.0
Thriller 195.0
Western 7.0
Biography 81.0
Musical 5.0
Comedy 279.0
Sci-Fi 120.0
dtype: float64
数据合并
join:默认情况下它是把行索引相同的数据合并在一起
In [8]: t=pd.DataFrame(np.arange(12).reshape((3,4)),index=list("ABC"),columns=list("WXYZ"))
In [9]: t
Out[9]:
W X Y Z
A 0 1 2 3
B 4 5 6 7
C 8 9 10 11
In [17]: t1=pd.DataFrame(np.arange(9).reshape(3,3),index=list("ABC"))
In [19]: t1
Out[19]:
0 1 2
A 0 1 2
B 3 4 5
C 6 7 8
In [20]: t.join(t1)
Out[20]:
W X Y Z 0 1 2
A 0 1 2 3 0 1 2
B 4 5 6 7 3 4 5
C 8 9 10 11 6 7 8
merge:按照指定的列把数据按照一定的方式合并到一起
In [50]: t
Out[50]:
W X Y Z
A 0 1 2 3
B 4 5 6 7
C 8 9 10 11
In [51]: t1
Out[51]:
J K L
A 0 1 2
B 3 4 5
C 6 7 8
In [52]: t.merge(t1,left_on="W",right_on="J")
Out[52]:
W X Y Z J K L
0 0 1 2 3 0 1 2
数据分组聚合
grouped=df.groupby(by="columns_name")
grouped是一个DataFrameGroupBy对象,是可迭代的
grouped中的每一个元素是一个元组
元组里面是(索引(分组的值),分组之后的DataFrame)
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Load the Starbucks store table; show every column when a frame is printed.
file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)
# print(df.head())
# print(df.info())

# --- Example: group by a single column -----------------------------------
# grouped = df.groupby(by="Country")
# print(grouped)          # a DataFrameGroupBy object
#
# The groupby object can be iterated; each item is (group key, sub-frame):
# for i, j in grouped:
#     print(i)
#     print("*" * 100)
#     print(j, type(j))
#     print("*" * 100)
#
# Equivalent manual selection of one group:
# df[df["Country"] == "US"]
#
# Calling an aggregation method on the groups:
# country_count = grouped["Brand"].count()
# print(country_count["US"])
# print(country_count["CN"])

# --- Number of stores in each Chinese province ---------------------------
# Single-key variant:
# china_data = df[df["Country"] == "CN"]
# grouped = china_data.groupby(by="State/Province").count()["Brand"]

# Keep only the Chinese rows, then group on country and province together;
# counting "Brand" yields a Series indexed by (Country, State/Province).
china_rows = df[df["Country"] == "CN"]
grouped = china_rows.groupby(by=["Country", "State/Province"])["Brand"].count()
print(grouped)
print(type(grouped))

# --- Example: grouping on several keys, keeping a DataFrame result --------
# t1 = df[["Country"]].groupby(by=[df["Country"], df["State/Province"]]).count()
# print(t1)
# print(type(t1))
# t2 = df.groupby(by=[df["Country"], df["State/Province"]])[["Country"]].count()
# print(t2)
# print(type(t2))
索引和复合索引
简单的索引操作:
- 获取index:df.index
- 指定Index:df.index=['x','y']
- 重新设置index:df.reindex(list("abcdef"))
- 指定某一列作为Index:df.set_index("Country",drop=False)
- 返回index的唯一值:df.set_index("Country").index.unique()
In [5]: df1 = pd.DataFrame(np.arange(8).reshape((2, 4)), index=list("ab"),columns = list("abcd"))
In [6]: df1
Out[6]:
a b c d
a 0 1 2 3
b 4 5 6 7
In [7]: df1.reindex(["a","f"])
Out[7]:
a b c d
a 0.0 1.0 2.0 3.0
f NaN NaN NaN NaN
In [9]: df1.reindex(["a","f"])
Out[9]:
a b c d
a 0.0 1.0 2.0 3.0
f NaN NaN NaN NaN
In [10]: df1.set_index("b")
Out[10]:
a c d
b
1 0 2 3
5 4 6 7
In [11]: df1.set_index("c").index
Out[11]: Int64Index([2, 6], dtype='int64', name='c')
In [12]: df1.set_index("a",drop=False)
Out[12]:
a b c d
a
0 0 1 2 3
4 4 5 6 7
In [13]: df1["c"].unique()
Out[13]: array([2, 6], dtype=int64)
In [14]: df1["d"].unique()
Out[14]: array([3, 7], dtype=int64)
In [15]: df1.set_index(["a","c"])
Out[15]:
b d
a c
0 2 1 3
4 6 5 7
In [16]: df1.set_index(["a","c"]).index
Out[16]:
MultiIndex([(0, 2),
(4, 6)],
names=['a', 'c'])
In [17]: df1.set_index(["a","c","d"],drop=False).index
Out[17]:
MultiIndex([(0, 2, 3),
(4, 6, 7)],
names=['a', 'c', 'd'])
In [18]: df1.set_index(["a","c","d"],drop=False)
Out[18]:
a b c d
a c d
0 2 3 0 1 2 3
4 6 7 4 5 6 7
In [12]: a=pd.DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':list("hjklmno")})
In [13]: a
Out[13]:
a b c d
0 0 7 one h
1 1 6 one j
2 2 5 one k
3 3 4 two l
4 4 3 two m
5 5 2 two n
6 6 1 two o
In [14]: a.set_index(["c","d"])
Out[14]:
a b
c d
one h 0 7
j 1 6
k 2 5
two l 3 4
m 4 3
n 5 2
o 6 1
Series复合索引
In [13]: a
Out[13]:
a b c d
0 0 7 one h
1 1 6 one j
2 2 5 one k
3 3 4 two l
4 4 3 two m
5 5 2 two n
6 6 1 two o
In [14]: a.set_index(["c","d"])
Out[14]:
a b
c d
one h 0 7
j 1 6
k 2 5
two l 3 4
m 4 3
n 5 2
o 6 1
In [15]: b= a.set_index(["c","d"])
In [16]: b
Out[16]:
a b
c d
one h 0 7
j 1 6
k 2 5
two l 3 4
m 4 3
n 5 2
o 6 1
In [17]: c=b["a"]
In [18]: c
Out[18]:
c d
one h 0
j 1
k 2
two l 3
m 4
n 5
o 6
Name: a, dtype: int64
In [19]: type(c)
Out[19]: pandas.core.series.Series
In [20]: c["one"]["j"]
Out[20]: 1
In [21]: c["one"]
Out[21]:
d
h 0
j 1
k 2
Name: a, dtype: int64
In [22]: b.loc["one"].loc["h"]
Out[22]:
a 0
b 7
Name: h, dtype: int64
In [23]: x=a.set_index(["c","d"])["a"]
In [24]: x
Out[24]:
c d
one h 0
j 1
k 2
two l 3
m 4
n 5
o 6
Name: a, dtype: int64
In [25]: type(x)
Out[25]: pandas.core.series.Series
练习
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Load the Starbucks store table; show every column when a frame is printed.
file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Goal: use matplotlib to show the 10 countries with the most stores.
# Data preparation: count rows per country, largest first, keep the top 10.
stores_per_country = df.groupby(by="Country")["Brand"].count()
data1 = stores_per_country.sort_values(ascending=False).head(10)
_x = data1.index
# print(range(len(_x)))
_y = data1.values

# Plot one bar per country.
plt.figure(figsize=(20, 10), dpi=100)
bar_positions = range(len(_x))
plt.bar(bar_positions, _y, color="orange")
# Label the x axis with the country codes.
plt.xticks(bar_positions, _x)
# Display the figure.
plt.show()
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
# Select the KaiTi font for labels and keep the minus sign rendering as ASCII.
plt.rcParams['font.sans-serif'] = ['KaiTi']
mpl.rcParams["axes.unicode_minus"] = False

file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Goal: use matplotlib to show the number of stores per Chinese city.
china_data = df[df["Country"] == "CN"]
# print(china_data.head())

# Count rows per city, largest first, and keep the 15 biggest.
stores_per_city = china_data.groupby(by="City")["Brand"].count()
Data = stores_per_city.sort_values(ascending=False).head(15)
_x = Data.index
print(_x)
_y = Data.values

# Vertical bar chart, one narrow bar per city.
plt.figure(figsize=(20, 10), dpi=100)
bar_positions = range(len(_x))
plt.bar(bar_positions, _y, width=0.3, color="orange")
# Label the x axis with the city names.
plt.xticks(bar_positions, _x)
# Display the figure.
plt.show()
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
# Select the KaiTi font for labels and keep the minus sign rendering as ASCII.
plt.rcParams['font.sans-serif'] = ['KaiTi']
mpl.rcParams["axes.unicode_minus"] = False

file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Goal: use matplotlib to show the number of stores per Chinese city.
china_data = df[df["Country"] == "CN"]
# print(china_data.head())

# Count rows per city, largest first, and keep the 20 biggest.
stores_per_city = china_data.groupby(by="City")["Brand"].count()
Data = stores_per_city.sort_values(ascending=False).head(20)
_x = Data.index
# print(_x)
_y = Data.values

# Horizontal bar chart, one thin bar per city.
plt.figure(figsize=(20, 10), dpi=100)
bar_positions = range(len(_x))
plt.barh(bar_positions, _y, height=0.3, color="orange")
# The bars are horizontal, so the city names label the y axis.
plt.yticks(bar_positions, _x)
# Display the figure.
plt.show()
from matplotlib import pyplot as plt
import pandas as pd
import matplotlib as mlp
# Select the KaiTi font for labels and keep the minus sign rendering as ASCII.
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.rcParams["axes.unicode_minus"] = False

file_path = "./books.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Goal: use matplotlib to show how many books were published in each year.
# print(df.head(1))
# print(df.info())

# Drop the rows whose publication year is missing.
has_year = pd.notnull(df["original_publication_year"])
data1 = df[has_year]

# Count rows per publication year and order the years by that count, descending.
books_per_year = data1.groupby(by="original_publication_year")["books_count"].count()
data2 = books_per_year.sort_values(ascending=False)
# print(data2)

# Keep only the 20 years with the most books.
_x = data2.index[:20]
_y = data2.values[:20]

# Horizontal bar chart with the years as y-axis labels.
plt.figure(figsize=(20, 10), dpi=100)
bar_positions = range(len(_x))
plt.barh(bar_positions, _y, height=0.3, color="orange")
plt.yticks(bar_positions, _x)
plt.show()
from matplotlib import pyplot as plt
import pandas as pd
import matplotlib as mlp
# Select the KaiTi font for labels and keep the minus sign rendering as ASCII.
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.rcParams["axes.unicode_minus"] = False

file_path = "./books.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Goal: use matplotlib to show the mean book rating for each publication year.
# Drop the rows whose publication year is NaN.
has_year = pd.notnull(df["original_publication_year"])
data1 = df[has_year]

# Mean of "average_rating" within each publication year (index = year).
data2 = data1.groupby(by="original_publication_year")["average_rating"].mean()
# print(data2)
_x = data2.index
_y = data2.values

# Line chart over the year positions; only every 10th year gets a tick label
# so the rotated labels stay readable.
plt.figure(figsize=(20, 10), dpi=100)
positions = list(range(len(_x)))
plt.plot(positions, _y, color="red")
plt.xticks(positions[::10], _x[::10], rotation=45)
plt.show()
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib as mlp
# Select the KaiTi font for labels and keep the minus sign rendering as ASCII.
mlp.rcParams['font.sans-serif'] = ['KaiTi']
mlp.rcParams["axes.unicode_minus"] = False
pd.set_option('display.max_columns', None)

file_path = "./911.csv"
df = pd.read_csv(file_path)
# print(df.head(2))
# print(df.info())

# Extract the category: the part of "title" that precedes the first ": ".
title_parts = df["title"].str.split(": ")
temp_list = title_parts.tolist()
row_category = title_parts.str[0]
# set() removes duplicates, leaving one entry per distinct category;
# rows with a missing title have no category and are skipped.
cata_list = list(set(row_category.dropna()))
# print(cata_list)
print(df.head(5))

# Indicator table: one row per call, one column per category, all zeros.
zeros_df = pd.DataFrame(
    np.zeros((df.shape[0], len(cata_list))),
    columns=cata_list,
)

# Mark each row under its own category.
# A single .loc[row_mask, column] assignment is used instead of the chained
# zeros_df[cate][mask] = 1, which writes to a temporary object and leaves
# zeros_df unchanged when pandas Copy-on-Write is active.
# The mask compares the extracted category exactly, rather than searching the
# whole title with str.contains (a regex substring match that can also hit
# text after the ": ").
for cate in cata_list:
    zeros_df.loc[row_category == cate, cate] = 1
# print(zeros_df)

# Column sums give the number of calls per category.
sum_one = zeros_df.sum(axis=0)
print(sum_one)
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib as mlp
# Select the KaiTi font for labels and keep the minus sign rendering as ASCII.
mlp.rcParams['font.sans-serif'] = ['KaiTi']
mlp.rcParams["axes.unicode_minus"] = False
pd.set_option('display.max_columns', None)

file_path = "./911.csv"
df = pd.read_csv(file_path)
# print(df.head(2))
# print(df.info())

# Extract the category: the part of "title" that precedes the first ": ".
title_parts = df["title"].str.split(": ")
temp_list = title_parts.tolist()
row_category = title_parts.str[0]
# One category per row, duplicates kept (no set() here) so it lines up with df.
cata_list = row_category.tolist()
# print(cata_list)

# Store the category as a new column. Assigning the Series derived from df
# keeps it aligned with df's own index, instead of going through a list ->
# NumPy reshape -> one-column DataFrame, which only lines up when df has the
# default 0..n-1 index and fails outright on a missing title.
df["cate"] = row_category

# Number of calls per category; only "title" is aggregated rather than every
# column of the frame.
Data = df.groupby(by="cate")["title"].count()
print(Data)
cate
EMS 320326
Fire 96177
Traffic 223395
Name: title, dtype: int64