文章目录
1 什么是pandas
numpy能够帮助我们处理数值,但是pandas除了处理数值之外(基于numpy),还能够帮助我们处理其他类型的数据
pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
2 Series
2.1 Series创建
2.1.1 通过列表创建
import pandas as pd
a = pd.Series([1,2,32,121,43,12])
print(a)
# 0 1
# 1 2
# 2 32
# 3 121
# 4 43
# 5 12
# dtype: int64
print(type(a))
# <class 'pandas.core.series.Series'>
t2 = pd.Series([1,2,3,4,5],index=list('ABCDE'))
print(t2)
# A 1
# B 2
# C 3
# D 4
# E 5
# dtype: int64
2.1.2 通过字典创建
字典的值有一个为字符串,该Series的dtype就为object
import pandas as pd
temp_dict = {'name':'alex','age':18,'tel':10086}
t3 = pd.Series(temp_dict)
print(t3)
# name alex
# age 18
# tel 10086
# dtype: object
2.2 索引和切片
2.2.1 按照位置和index取值
import pandas as pd

temp_dict = {'name': 'alex', 'age': 18, 'tel': 10086}
t3 = pd.Series(temp_dict)
print(t3['age'])  # 18, looked up by index label
# Positional lookups go through .iloc: plain t3[1] on a label index relies on
# a positional fallback that recent pandas releases deprecate.
print(t3.iloc[1])  # 18, looked up by position
# Consecutive rows by position; equivalent to t3[:'age'] because label slices
# include the end label.
print(t3[:2])
# name alex
# age 18
# dtype: object
print('*'*100)
# Non-consecutive rows by label; equivalent to t3.iloc[[0, 2]].
print(t3[['name', 'tel']])
# name alex
# tel 10086
# dtype: object
2.2.2 布尔索引
import pandas as pd
a = pd.Series([1,2,32,121,43,12])
print(a[a>30])
# 2 32
# 3 121
# 4 43
# dtype: int64
2.2.3 陌生Series取值
ndarray 支持for循环和list转化以及索引切片
t.index 也支持for循环和list转化以及索引切片
两者都是可迭代对象,都具有__iter__方法
import pandas as pd

temp_dict = {'name': 'alex', 'age': 18, 'tel': 10086}
t3 = pd.Series(temp_dict)
print(t3.index)  # Index(['name', 'age', 'tel'], dtype='object')
# The index is iterable; the loop body must be indented under the for line.
for i in t3.index:
    print(i)
# name
# age
# tel
print(type(t3.index))  # <class 'pandas.core.indexes.base.Index'>
print(list(t3.index)[:2])  # ['name', 'age']
print(t3.values)  # ['alex' 18 10086]
print(type(t3.values))  # <class 'numpy.ndarray'>
2.3 读取外部数据
现在假设我们有一组关于狗的名字的统计数据,那么为了观察这组数据的情况,我们应该怎么做呢?
我们的这组数据存在csv中,我们直接使用pd.read_csv即可
和我们想象的有些差别,我们以为他会是一个Series类型,但是他是一个DataFrame
import pandas as pd
#pandas读取csv中的文件
df = pd.read_csv('dogNames2.csv')
print(df)
# Row_Labels Count_AnimalName
# 0 RENNY 1
# 1 DEEDEE 2
# 2 GLADIATOR 1
# 3 NESTLE 1
# 4 NYKE 1
# ... ... ...
# 4159 ALEXXEE 1
# 4160 HOLLYWOOD 1
# 4161 JANGO 2
# 4162 SUSHI MAE 1
# 4163 GHOST 3
#
# [4164 rows x 2 columns]
3 DataFrame
3.1 创建DataFrame
DataFrame对象既有行索引,又有列索引
行索引,表明不同行,横向索引,叫index,0轴,axis=0
列索引,表明不同列,纵向索引,叫columns,1轴,axis=1
3.1.1 通过numpy创建
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(12).reshape((3,4)),index = list('abc'),columns=list('WXYZ'))
print(df)
# W X Y Z
# a 0 1 2 3
# b 4 5 6 7
# c 8 9 10 11
3.1.2 通过字典创建
import pandas as pd

d1 = {'name': ['alex', 'peter'], 'age': [20, 32], 'tel': [10086, 10010]}
# Every column must hold the same number of values.
df1 = pd.DataFrame(d1)
print(df1)
# name age tel
# 0 alex 20 10086
# 1 peter 32 10010
print(type(df1))  # <class 'pandas.core.frame.DataFrame'>
# A list of dicts may omit keys; the missing cells become NaN, which also
# turns the affected numeric columns into floats.
t2 = [
    {'name': 'alex', 'age': 32, 'tel': 10086},
    {'name': 'peter', 'tel': 10010},
    {'name': 'john', 'age': 34},
]
df2 = pd.DataFrame(t2)
print(df2)
# name age tel
# 0 alex 32.0 10086.0
# 1 peter NaN 10010.0
# 2 john 34.0 NaN
3.2 DataFrame 属性和方法
import pandas as pd
t2 = [{'name':'alex','age':32,'tel':10086},{'name':'peter','tel':10010},{'name':'john','age':34}]
df2 = pd.DataFrame(t2)
# name age tel
# 0 alex 32.0 10086.0
# 1 peter NaN 10010.0
# 2 john 34.0 NaN
print(df2.index)
#RangeIndex(start=0, stop=3, step=1)
print(df2.columns)
# Index(['name', 'age', 'tel'], dtype='object')
print(df2.values)
# [['alex' 32.0 10086.0]
# ['peter' nan 10010.0]
# ['john' 34.0 nan]]
print(df2.shape) #(3, 3)
print(df2.ndim) #2
import pandas as pd

# Rows may omit keys; the missing cells show up as NaN.
t2 = [
    {'name': 'alex', 'age': 32, 'tel': 10086},
    {'name': 'peter', 'tel': 10010},
    {'name': 'john', 'age': 34},
]
df2 = pd.DataFrame(t2)
# name age tel
# 0 alex 32.0 10086.0
# 1 peter NaN 10010.0
# 2 john 34.0 NaN
print(df2.head(2))  # first two rows
print(df2.tail(2))  # last two rows
# info() writes its summary to stdout itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
df2.info()
print(df2.describe())  # summary statistics
3.3 排序
import pandas as pd
df = pd.read_csv('dogNames2.csv')
#dataFrame中排序的方法
df = df.sort_values(by='Count_AnimalName',ascending=False) #默认升序
print(df.head(5))
# Row_Labels Count_AnimalName
# 858 BELLA 112
# 4134 MAX 82
# 3273 LUCY 82
# 843 BUDDY 79
# 433 SADIE 77
3.4 索引和切片
3.4.1 取行或者列
import pandas as pd
df = pd.read_csv('dogNames2.csv')
#dataFrame中排序的方法
df = df.sort_values(by='Count_AnimalName',ascending=False) #默认升序
# pandas取行或列的注意点
# 方括号写数字,表示取行,对行进行操作
# 写字符串,表示取列索引,对列进行操作
print(df[:20]) #取前20行
print(df['Row_Labels']) #取具体某一列,类型为Series
3.4.2 loc 和 iloc
1.df.loc 通过标签索引行数据
2.df.iloc 通过位置获取行数据
3.4.2.1 loc 取某一行或列
import numpy as np
import pandas as pd

# Small labelled frame for the lookups below; the snippet needs its own frame
# because the labels 'a' and 'Z' do not exist in the dog-names data.
df = pd.DataFrame(np.arange(12).reshape((3, 4)), index=list('abc'), columns=list('WXYZ'))
# W X Y Z
# a 0 1 2 3
# b 4 5 6 7
# c 8 9 10 11
print(df.loc['a', :], type(df.loc['a', :]))  # one row, returned as a Series
print(df.loc['a'], type(df.loc['a']))  # one row, returned as a Series
print(df.loc[:, 'Z'], type(df.loc[:, 'Z']))  # one column, returned as a Series
3.4.2.2 loc 取多行或多列
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(12).reshape((3,4)),index = list('abc'),columns=list('WXYZ'))
# W X Y Z
# a 0 1 2 3
# b 4 5 6 7
# c 8 9 10 11
print(df.loc[['a','c']]) #取不连续的多行
# W X Y Z
# a 0 1 2 3
# c 8 9 10 11
print(df.loc[:,['W','Z']]) #取不连续的多列
# W Z
# a 0 3
# b 4 7
# c 8 11
print(df.loc[['a','c'],['W','Z']]) #取不连续的多行,多列
# W Z
# a 0 3
# c 8 11
print(df.loc['a':'c','W':'Y']) #取连续的多行多列,冒号在loc里是闭合的
# W X Y
# a 0 1 2
# b 4 5 6
# c 8 9 10
3.4.2.3 iloc取某一行或列
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(12).reshape((3,4)),index = list('abc'),columns=list('WXYZ'))
# W X Y Z
# a 0 1 2 3
# b 4 5 6 7
# c 8 9 10 11
print(df.iloc[1])
# W 4
# X 5
# Y 6
# Z 7
print(df.iloc[:,1])
# a 1
# b 5
# c 9
3.4.2.4 iloc 取多行或多列
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(12).reshape((3,4)),index = list('abc'),columns=list('WXYZ'))
# W X Y Z
# a 0 1 2 3
# b 4 5 6 7
# c 8 9 10 11
print(df.iloc[[0,2]]) # non-consecutive rows, selected by position
# W X Y Z
# a 0 1 2 3
# c 8 9 10 11
print(df.iloc[:,[0,3]]) # non-consecutive columns, selected by position
# W Z
# a 0 3
# b 4 7
# c 8 11
print(df.iloc[[0,2],[0,3]]) # non-consecutive rows and columns
# W Z
# a 0 3
# c 8 11
print(df.iloc[0:2,0:3]) # consecutive rows and columns; iloc slices exclude the end position
# W X Y
# a 0 1 2
# b 4 5 6
3.4.2.5 赋值
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(12).reshape((3,4)),index = list('abc'),columns=list('WXYZ'))
# W X Y Z
# a 0 1 2 3
# b 4 5 6 7
# c 8 9 10 11
df.iloc[[0,2],[0,3]] = np.nan #直接自动赋值成nan
print(df)
# W X Y Z
# a NaN 1 2 NaN
# b 4.0 5 6 7.0
# c NaN 9 10 NaN
3.4.3 布尔索引
回到之前狗的名字的问题上,假如我们想找到所有的使用次数超过80的狗的名字,应该怎么选择?
import pandas as pd
df = pd.read_csv('dogNames2.csv')
df = df[df['Count_AnimalName']>80]
print(df)
# Row_Labels Count_AnimalName
# 858 BELLA 112
# 3273 LUCY 82
# 4134 MAX 82
多个条件and &; or |
import pandas as pd
df = pd.read_csv('dogNames2.csv')
df = df[(df['Count_AnimalName']>70)&(df['Count_AnimalName']<200)].sort_values(by = 'Count_AnimalName',ascending=False)
print(df)
# Row_Labels Count_AnimalName
# 858 BELLA 112
# 3273 LUCY 82
# 4134 MAX 82
# 843 BUDDY 79
# 433 SADIE 77
回到之前狗的名字的问题上,假如我们想找到所有的使用次数超过70并且名字的字符串的长度大于4的狗的名字,应该怎么选择?
3.5 缺失数据处理
我们的数据缺失通常有两种情况:
一种就是空,None等,在pandas是NaN(和np.nan一样)
另一种是我们让其为0,蓝色框中
判断数据是否为NaN:pd.isnull(df),pd.notnull(df)
处理方式1:删除NaN所在的行列dropna (axis=0, how='any', inplace=False)
处理方式2:填充数据,t.fillna(t.mean()),t.fillna(t.median()),t.fillna(0)
处理为0的数据:t[t==0]=np.nan
当然并不是每次为0的数据都需要处理
计算平均值等情况,nan是不参与计算的,但是0会
3.5.1 pd.isnull()进行布尔索引
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(12).reshape((3,4)),index = list('abc'),columns=list('WXYZ'))
df.iloc[[0,2],[0,3]] = np.nan #直接自动赋值成nan
print(df)
df1 = pd.isnull(df)
print(df1)
# W X Y Z
# a True False False True
# b False False False False
# c True False False True
print(df[pd.notnull(df['W'])])
# W X Y Z
# b 4.0 5 6 7.0
3.5.2 dropna()删除含有nan的行
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(12).reshape((3,4)),index = list('abc'),columns=list('WXYZ'))
df.iloc[[0,2],[0,3]] = np.nan #直接自动赋值成nan
print(df)
# W X Y Z
# a NaN 1 2 NaN
# b 4.0 5 6 7.0
# c NaN 9 10 NaN
df.dropna(axis=0,how= 'any',inplace=True)
#删除为nan的行,how默认为any;改为all,全部为nan才删除该行
#inplace=True 原地修改,不需要重新赋值给变量;默认为False
print(df)
# W X Y Z
# b 4.0 5 6 7.0
3.5.3 填充数据
3.5.3.1 直接填充某个具体数值
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(12).reshape((3,4)),index = list('abc'),columns=list('WXYZ'))
df.iloc[[0,2],[0,3]] = np.nan #直接自动赋值成nan
print(df)
# W X Y Z
# a NaN 1 2 NaN
# b 4.0 5 6 7.0
# c NaN 9 10 NaN
df = df.fillna(100)
print(df)
# W X Y Z
# a 100.0 1 2 100.0
# b 4.0 5 6 7.0
# c 100.0 9 10 100.0
3.5.3.2 填充均值
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(12).reshape((3,4)),index = list('abc'),columns=list('WXYZ'))
df.iloc[[0,2],[0,3]] = np.nan #直接自动赋值成nan
print(df)
# W X Y Z
# a NaN 1 2 NaN
# b 4.0 5 6 7.0
# c NaN 9 10 NaN
df = df.fillna(df.mean()) #填充均值
print(df)
# W X Y Z
# a 4.0 1 2 7.0
# b 4.0 5 6 7.0
# c 4.0 9 10 7.0
3.5.3.3 某一列填充均值
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(12).reshape((3,4)),index = list('abc'),columns=list('WXYZ'))
df.iloc[[0,2],[0,3]] = np.nan #直接自动赋值成nan
print(df)
# W X Y Z
# a NaN 1 2 NaN
# b 4.0 5 6 7.0
# c NaN 9 10 NaN
df['W'] = df['W'].fillna(df['W'].mean())
print(df)
# W X Y Z
# a 4.0 1 2 NaN
# b 4.0 5 6 7.0
# c 4.0 9 10 NaN
3.6 数据统计
import pandas as pd
from matplotlib import pyplot as plt  # plt is used below for the histogram

df = pd.read_csv('datasets_IMDB-Movie-Data.csv')
# Average rating, rounded to two decimals.
print(round(df['Rating'].mean(),2))
# Number of distinct directors, computed in two equivalent ways.
print(len(set(df['Director'].tolist())))
print(len(df['Director'].unique()))
# Number of distinct actors: each cell holds a comma-separated list of names.
temp_actors_list = df['Actors'].str.split(',').tolist()
# Strip every name so that "X" and " X" (space left after the comma) are not
# counted as two different actors.
actors_list = [i.strip() for j in temp_actors_list for i in j]
actors_num = len(set(actors_list))
print(actors_num)
# Runtime distribution, drawn as a histogram.
runtime_data = df['Runtime (Minutes)'].values
max_runtime = runtime_data.max()
min_runtime = runtime_data.min()
# Number of bins for a bin width of 5 minutes.
num_bin = (max_runtime-min_runtime)//5
# Figure size and resolution.
plt.figure(figsize=(20,8),dpi=80)
plt.hist(runtime_data,num_bin)
# One tick per bin edge.
plt.xticks(range(min_runtime,max_runtime+5,5))
plt.show()