Pandas —— 数据分析处理库
安装Pandas: pip install pandas
import pandas as pd
pd.show_versions() # 显示当前版本信息
读取数据
读数据
pd.read_csv()
: 读取csv类型数据
df = pd.read_csv('./data/titanic.csv')
df.head(n)
: 显示前n条数据
df.head(6) # head()可以读取前几条数据,指定前几条都可以
df.info()
: 返回数据的总体信息
df.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
'''
数据项的索引
df.index
: 数据项
df.columns
: 指标
df.index # RangeIndex(start=0, stop=891, step=1)
df.columns
'''
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
'''
df.dtypes
: 数据类型
df.dtypes
'''
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
'''
索引
索引单个column
age = df['Age']
age[:5]
'''
age = df['Age']
age[:5]
age = df['Age']
age[:5]
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
Name: Age, dtype: float64
'''
索引多个column
df[['Age','Fare']][:5]
set_index
: 默认index是0,1,2…, 也可以用set_index来指定某一个指标为index
df = df.set_index('Name')
df['Age'][:5]
'''
Name
Braund, Mr. Owen Harris 22.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer) 38.0
Heikkinen, Miss. Laina 26.0
Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0
Allen, Mr. William Henry 35.0
Name: Age, dtype: float64
'''
age = df['Age']
age['Allen, Mr. William Henry'] #35.0
age=age+10
age[:5]
'''
Name
Braund, Mr. Owen Harris 32.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer) 48.0
Heikkinen, Miss. Laina 36.0
Futrelle, Mrs. Jacques Heath (Lily May Peel) 45.0
Allen, Mr. William Henry 45.0
Name: Age, dtype: float64
'''
iloc
:用position 来定位
df.iloc[0] # 取出第一条数据
df.iloc[0:5] # 展示前5行的信息
df.iloc[0:5,1:3] # 展示前5行,第2,3列的信息
loc
:用label 来定位
df = df.set_index('Name')
df.loc['Heikkinen, Miss. Laina'] #展示Heikkinen, Miss. Laina的所有信息
df.loc['Heikkinen, Miss. Laina','Fare'] #展示Heikkinen, Miss. Laina的fare信息
df.loc['Heikkinen, Miss. Laina':'Allen, Mr. William Henry',:] # 展示Heikkinen, Miss. Laina到Allen, Mr. William Henry的所有信息
df.loc['Heikkinen, Miss. Laina','Fare'] = 1000 # 修改信息
bool类型的索引
df['Fare'] > 40 # 对于每一条数据返回True或者False
df[df['Fare'] > 40][:5] #返回Fare大于40的前5条数据
df[df['Sex'] == 'male'][:5] #返回男性的前5条数据
df.loc[df['Sex'] == 'male','Age'].mean() #返回男性年龄的平均值
(df['Age'] > 70).sum()#年龄大于70的人数
数据处理
数值运算
import pandas as pd
df = pd.DataFrame([[1,2,3],[4,5,6]],index = ['a','b'],columns = ['A','B','C'])
df.sum()
: 求和运算
# 默认按照列求和
df.sum()
'''
A 5
B 7
C 9
dtype: int64
'''
# axis=1 按照行求和
df.sum(axis = 1)
'''
a 6
b 15
dtype: int64
'''
# axis = 'columns' 按照行求和
df.sum(axis = 'columns')
'''
a 6
b 15
dtype: int64
'''
df.mean()
:均值
# 默认按照列求均值
df.mean()
'''
A 2.5
B 3.5
C 4.5
dtype: float64
'''
# axis=1 按照行求均值
df.mean(axis = 1)
'''
a 2.0
b 5.0
dtype: float64
'''
df.min()
,df.max()
: 求最大值,最小值
df.min()
df.max()
'''
A 4
B 5
C 6
dtype: int64
'''
统计特性
df.describe()
: 描述性统计
df.describe() # 描述性统计
median
: 求中位数
df.median()
'''
A 2.5
B 3.5
C 4.5
dtype: float64
'''
df.cov()
:返回一个协方差矩阵
df.cov()
df.corr()
: 返回一个相关系数矩阵
df.corr()
value_counts()
: 按照统计数量进行排序
#默认降序
df['Age'].value_counts()
'''
24.00 30
22.00 27
18.00 26
19.00 25
...
0.42 1
Name: Age, Length: 88, dtype: int64
'''
#升序
df['Age'].value_counts(ascending = True)
'''
0.42 1
...
18.00 26
22.00 27
24.00 30
'''
# 划分区间
df['Age'].value_counts(ascending = True,bins = 5)
'''
(64.084, 80.0] 11
(48.168, 64.084] 69
(0.339, 16.336] 100
(32.252, 48.168] 188
(16.336, 32.252] 346
Name: Age, dtype: int64
'''
dataframe结构
创
pd.DataFrame(data= ,index= ,columns = )
: 指定data, index , columns
data = [[1,2,3],[4,5,6]]
index = ['a','b']
columns = ['A','B','C']
df = pd.DataFrame(data=data,index=index,columns = columns)
df = pd.DataFrame(data,index = )
: 将columns嵌入data中
import numpy as np
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data,index = labels)
df.head()
'''
age animal priority visits
a 2.5 cat yes 1
b 3.0 cat yes 3
c 0.5 snake no 2
d NaN dog yes 3
e 5.0 dog no 2
'''
查
df[]
: 索引
df['A']
'''
a 1
b 4
Name: A, dtype: int64
'''
df.iloc[]
: 通过位置索引
df.iloc[0]
'''
A 1
B 2
C 3
Name: a, dtype: int64
'''
df.loc[]
: 通过index,columns索引
df.loc['a']
'''
A 1
B 2
C 3
Name: a, dtype: int64
'''
df.loc['a']['A'] #1
改
# 修改数据项
df.loc['a']['A'] = 150
'''
A B C
a 150 2 3
b 4 5 6
'''
# 修改index
df.index = ['f','g']
'''
A B C
f 150 2 3
g 4 5 6
'''
增
增添一行数据
df.loc['c'] = [1,2,3]
'''
A B C
f 150 2 3
g 4 5 6
c 1 2 3
'''
增添一列指标
data = [[1,2,3],[4,5,6]]
index = ['j','k']
columns = ['A','B','C']
df2 = pd.DataFrame(data=data,index=index,columns = columns)
'''
A B C
j 1 2 3
k 4 5 6
'''
df2['Q'] = [10,11]
'''
A B C Q
j 1 2 3 10
k 4 5 6 11
'''
pd.concat()
: 拼接两组数据
# axis = 0 竖着拼接
df3 = pd.concat([df,df2],axis = 0)
'''
A B C
f 150 2 3
g 4 5 6
c 1 2 3
j 1 2 3
k 4 5 6
'''
df4 = pd.DataFrame([[10,11],[12,13]],index=['j','k'],columns=['D','E'])
'''
D E
j 10 11
k 12 13
'''
# axis = 1 横着拼接
df5 = pd.concat([df2,df4],axis = 1)
'''
A B C Q D E
j 1 2 3 10 10 11
k 4 5 6 11 12 13
'''
删
df.drop()
: 删除一行或者一列的数据
df5.drop(['j'],axis=0,inplace = True)
'''
A B C Q D E
k 4 5 6 11 12 13
'''
del
:删除一列数据
del df5['Q']
'''
A B C D E
k 4 5 6 12 13
'''
df5.drop(['A','B','C'],axis = 1,inplace = True)
'''
D E
k 12 13
'''
groupby操作
合并
构建一个key值有重复的df结构
df = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
'data':[0,5,10,5,10,15,10,15,20]})
比较麻烦的方法进行分组
for key in ['A','B','C']:
print (key,df[df['key'] == key].sum())
用groupby 进行分组
df.groupby('key').sum() #key值相同的数据项相加合并
用groupby 可以进行单个或者多个依据分组
import pandas as pd
import numpy as np
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
'''
A B C D
0 foo one -0.785250 1.010458
1 bar one 2.549941 1.704677
2 foo two -0.255153 -0.603249
3 bar three -0.954625 0.117662
4 foo two -0.548512 0.648127
5 bar two -0.642762 -1.111568
6 foo one 0.870697 0.556371
7 foo three 0.839937 0.798669
'''
# 单个分组依据
grouped = df.groupby('A')
grouped.count()
'''
B C D
A
bar 3 3 3
foo 5 5 5
'''
#多个分组依据
grouped = df.groupby(['A','B'])
grouped.count()
'''
C D
A B
bar one 1 1
three 1 1
two 1 1
foo one 2 2
three 1 1
two 2 2
'''
groupby 中可以传入自定义函数
def get_letter_type(letter):
    """Classify a column label: 'a' when it is a vowel, 'b' otherwise.

    Used as a grouping key with df.groupby(get_letter_type, axis=1),
    which calls it once per column label.
    """
    # Case-insensitive vowel membership test; everything else falls in group 'b'.
    return 'a' if letter.lower() in 'aeiou' else 'b'
grouped = df.groupby(get_letter_type,axis = 1)
grouped.count().iloc[0]
'''
a 1
b 3
Name: 0, dtype: int64
'''
有关groupby的其他操作
first()
,last()
: 显示第一次或者最后一次出现各个组的对应值
s = pd.Series([1,2,3,1,2,3],[8,7,5,8,7,5])
'''
8 1
7 2
5 3
8 1
7 2
5 3
dtype: int64
'''
grouped = s.groupby(level = 0)
grouped.first() #自动升序分组:5,7,8 显示第一次出现5,7,8的值
'''
5 3
7 2
8 1
dtype: int64
'''
grouped.last() #显示最后一次出现5,7,8的值
'''
5 3
7 2
8 1
dtype: int64
'''
grouped = s.groupby(level = 0,sort =False)
grouped.first()
'''
8 1
7 2
5 3
dtype: int64
'''
grouped.sum()
: 同组内的数值求和
grouped.sum()
'''
5 6
7 4
8 2
dtype: int64
'''
get_group()
: 得到该组的所有数据
df2 = pd.DataFrame({'X':['A','B','A','B'],'Y':[1,2,3,4]})
'''
X Y
0 A 1
1 B 2
2 A 3
3 B 4
'''
df2.groupby(['X']).get_group('A')
'''
X Y
0 A 1
2 A 3
'''
df2.groupby(['X']).get_group('B')
'''
X Y
1 B 2
3 B 4
'''
pd.MultiIndex.from_arrays(arrays,names = )
: 多重索引分组
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
index = pd.MultiIndex.from_arrays(arrays,names = ['first','second']) # 多重索引
s = pd.Series(np.random.randn(8),index = index)
'''
first second
bar one -0.877562
two -1.296007
baz one 1.026419
two 0.445126
foo one 0.044509
two 0.271037
qux one -1.686649
two 0.914649
dtype: float64
'''
#level =0 按照一级标签进行分组
grouped = s.groupby(level =0)
grouped.sum()
'''
first
bar -2.173569
baz 1.471545
foo 0.315545
qux -0.772001
dtype: float64
'''
#level =1 按照二级标签进行分组
grouped = s.groupby(level = 1)
grouped.sum()
'''
second
one -1.493284
two 0.334805
dtype: float64
'''
#level =first 按照一级标签进行分组
grouped = s.groupby(level = 'first')
grouped.sum()
'''
first
bar -2.173569
baz 1.471545
foo 0.315545
qux -0.772001
dtype: float64
'''
普通方法 对已经分好的组进行按组操作
df = pd.read_csv('./data/titanic.csv')
df.groupby('Sex')['Age'].mean() # 分别计算男性和女性的年龄平均值
aggregate()
: 对已经分好的组进行按组单个操作
import numpy as np
df.groupby('key').aggregate(np.mean) # key值相同的数据项以均值合并
'''
df=
A B C D
0 foo one -0.785250 1.010458
1 bar one 2.549941 1.704677
2 foo two -0.255153 -0.603249
3 bar three -0.954625 0.117662
4 foo two -0.548512 0.648127
5 bar two -0.642762 -1.111568
6 foo one 0.870697 0.556371
7 foo three 0.839937 0.798669
'''
# 多重依据分组
grouped = df.groupby(['A','B'])
grouped.aggregate(np.sum)
'''
C D
A B
bar one 2.549941 1.704677
three -0.954625 0.117662
two -0.642762 -1.111568
foo one 0.085447 1.566829
three 0.839937 0.798669
two -0.803665 0.044878
'''
# as_index = False
grouped = df.groupby(['A','B'],as_index = False)
grouped.aggregate(np.sum)
'''
A B C D
0 bar one 2.549941 1.704677
1 bar three -0.954625 0.117662
2 bar two -0.642762 -1.111568
3 foo one 0.085447 1.566829
4 foo three 0.839937 0.798669
5 foo two -0.803665 0.044878
'''
# reset_index()
df.groupby(['A','B']).sum().reset_index()
'''
A B C D
0 bar one 2.549941 1.704677
1 bar three -0.954625 0.117662
2 bar two -0.642762 -1.111568
3 foo one 0.085447 1.566829
4 foo three 0.839937 0.798669
5 foo two -0.803665 0.044878
'''
#组合个数
grouped = df.groupby(['A','B'])
grouped.size()
'''
A B
bar one 1
three 1
two 1
foo one 2
three 1
two 2
dtype: int64
'''
agg()
: 对已经分好的组进行按组多个操作
grouped = df.groupby('A')
grouped['C'].agg([np.sum,np.mean,np.std])
'''
sum mean std
A
bar 0.952553 0.317518 1.939613
foo 0.121719 0.024344 0.781542
'''
grouped['C'].agg({'res_sum':np.sum,'res_mean':np.mean,'res_std':np.std})
'''
res_std res_mean res_sum
A
bar 1.939613 0.317518 0.952553
foo 0.781542 0.024344 0.121719
'''
索引
创建数据
s = pd.Series(np.arange(5),index = np.arange(5)[::-1],dtype='int64')
'''
4 0
3 1
2 2
1 3
0 4
dtype: int64
'''
isin
: 判断数组中的值是否在列表中
# 返回值是布尔类型
s.isin([1,3,4])
'''
4 False
3 True
2 False
1 True
0 True
dtype: bool
'''
# 通过mask索引
s[s.isin([1,3,4])]
'''
3 1
1 3
0 4
dtype: int64
'''
pd.MultiIndex.from_product
: 多重依据分类
s2 = pd.Series(np.arange(6),index = pd.MultiIndex.from_product([[0,1],['a','b','c']]))
'''
0 a 0
b 1
c 2
1 a 3
b 4
c 5
dtype: int32
'''
iloc
: 通过位置索引
s2.iloc[s2.index.isin([(1,'a'),(0,'b')])]
'''
0 b 1
1 a 3
dtype: int32
'''
其他方法索引
s
'''
4 0
3 1
2 2
1 3
0 4
dtype: int64
'''
s[s>2]
'''
1 3
0 4
dtype: int64
'''
df.select
: 传入lambda 函数进行索引
dates = pd.date_range('20171124',periods=8)
df = pd.DataFrame(np.random.randn(8,4),index=dates,columns=['A','B','C','D'])
'''
A B C D
2017-11-24 -1.690231 -0.338101 -1.071022 1.084637
2017-11-25 1.292291 0.449885 0.468264 0.637102
2017-11-26 0.602494 -0.591658 0.301893 -1.050524
2017-11-27 -1.132170 1.310110 0.552812 0.370947
2017-11-28 -0.113234 -0.859983 0.381977 0.371730
2017-11-29 0.616029 0.209225 1.879964 0.179152
2017-11-30 0.554969 -0.656240 2.449274 -0.302113
2017-12-01 0.700342 1.068990 -0.572698 0.577581
'''
df.select(lambda x:x=='A',axis='columns')
'''
A
2017-11-24 -1.690231
2017-11-25 1.292291
2017-11-26 0.602494
2017-11-27 -1.132170
2017-11-28 -0.113234
2017-11-29 0.616029
2017-11-30 0.554969
2017-12-01 0.700342
'''
df.where()
: 通过设置条件进行索引, 并进行相关操作
df.where(df < 0) # 小于0的位置被置为NaN
df.where(df < 0,-df) # 小于0的值被置为相反数
df.query()
: 通过设置条件进行索引
df = pd.DataFrame(np.random.rand(10,3),columns = list('abc'))
'''
a b c
0 0.760971 0.910097 0.480540
1 0.242792 0.778342 0.852441
2 0.147953 0.449719 0.539780
3 0.519164 0.936192 0.402399
4 0.365343 0.148621 0.176917
5 0.837852 0.283028 0.527734
6 0.729312 0.066871 0.747968
7 0.502851 0.462246 0.116735
8 0.472404 0.517753 0.945877
9 0.962282 0.300276 0.258252
'''
df.query('(a<b)') #列出a<b的数据项
df.query('(a<b) & (b<c)')
Series
创
pd.Series(data = ,index = )
data = [10,11,12]
index = ['a','b','c']
s = pd.Series(data = data,index = index)
'''
a 10
b 11
c 12
dtype: int64
'''
查
# 用[]进行切片和索引
s[0] #10
s[0:2]
'''
a 10
b 11
dtype: int64
'''
# 用mask进行索引
mask = [True,False,True]
s[mask]
'''
a 10
c 12
dtype: int64
'''
# 用loc和iloc 进行索引
s.loc['b'] #11
s.iloc[1] # 11
改
# 直接通过索引[] 赋值修改
s1 = s.copy()
s1['a'] = 100
'''
a 100
b 11
c 12
dtype: int64
'''
# 通过replace 进行修改
s1.replace(to_replace = 100,value = 101,inplace = True)
'''
a 101
b 11
c 12
dtype: int64
'''
# 直接赋值对索引进行修改
s1.index # Index(['a', 'b', 'c'], dtype='object')
s1.index = ['a','b','d']
'''
a 101
b 11
d 12
dtype: int64
'''
# 通过rename进行修改
s1.rename(index = {'a':'A'},inplace = True)
'''
A 101
b 11
d 12
dtype: int64
'''
增
s.append()
: 对两个Series进行合并
data = [100,110]
index = ['h','k']
s2 = pd.Series(data = data,index = index)
'''
h 100
k 110
dtype: int64
'''
s3 = s1.append(s2)
s3['j'] = 500
'''
A 101
b 11
d 12
j 500
h 100
k 110
dtype: int64
'''
s1.append(s2,ignore_index = True)
'''
0 101
1 11
2 12
3 500
4 100
5 110
dtype: int64
'''
删
用del
或者drop
进行删除操作
s1
'''
A 101
b 11
d 12
j 500
dtype: int64
'''
del s1['A']
'''
b 11
d 12
j 500
dtype: int64
'''
s1.drop(['b','d'],inplace = True)
'''
j 500
dtype: int64
'''
合并
merge
import pandas as pd
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
'''
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K2
3 A3 B3 K3
'''
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
'''
C D key
0 C0 D0 K0
1 C1 D1 K1
2 C2 D2 K2
3 C3 D3 K3
'''
res = pd.merge(left, right)
'''
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
'''
res = pd.merge(left, right, on = 'key')
'''
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
'''
left = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],
'key2': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
'''
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K1 K1
2 A2 B2 K2 K2
3 A3 B3 K3 K3
'''
right = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],
'key2': ['K0', 'K1', 'K2', 'K4'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
'''
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K1
2 C2 D2 K2 K2
3 C3 D3 K3 K4
'''
#默认交集
res = pd.merge(left, right, on = ['key1', 'key2'])
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K1 K1 C1 D1
2 A2 B2 K2 K2 C2 D2
'''
# how = 'outer'并集
res = pd.merge(left, right, on = ['key1', 'key2'], how = 'outer')
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K1 K1 C1 D1
2 A2 B2 K2 K2 C2 D2
3 A3 B3 K3 K3 NaN NaN
4 NaN NaN K3 K4 C3 D3
'''
# indicator = True 添加提示信息
res = pd.merge(left, right, on = ['key1', 'key2'], how = 'outer', indicator = True)
'''
A B key1 key2 C D _merge
0 A0 B0 K0 K0 C0 D0 both
1 A1 B1 K1 K1 C1 D1 both
2 A2 B2 K2 K2 C2 D2 both
3 A3 B3 K3 K3 NaN NaN left_only
4 NaN NaN K3 K4 C3 D3 right_only
'''
# 按照'left'的进行合并
res = pd.merge(left, right, how = 'left')
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K1 K1 C1 D1
2 A2 B2 K2 K2 C2 D2
3 A3 B3 K3 K3 NaN NaN
'''
# 按照'right'的进行合并
res = pd.merge(left, right, how = 'right')
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K1 K1 C1 D1
2 A2 B2 K2 K2 C2 D2
3 NaN NaN K3 K4 C3 D3
'''
join
left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'key': ['K0', 'K1', 'K0', 'K1']})
'''
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K0
3 A3 B3 K1
'''
right = pd.DataFrame({'C': ['C0', 'C1'],
'D': ['D0', 'D1']},
index=['K0', 'K1'])
'''
C D
K0 C0 D0
K1 C1 D1
'''
result = left.join(right, on='key')
'''
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K0 C0 D0
3 A3 B3 K1 C1 D1
'''
显示格式设置
pd.set_option
: 控制显示的格式,如最大行数,列数,宽度,小数位数
# display.max_rows 显示的最大行数
pd.get_option('display.max_rows') #60
pd.set_option('display.max_rows',6)
pd.Series(index = range(0,100))
'''
0 NaN
1 NaN
2 NaN
..
97 NaN
98 NaN
99 NaN
Length: 100, dtype: float64
'''
#display.max_columns 显示的最大列数
pd.get_option('display.max_columns') #20
pd.set_option('display.max_columns',30)
pd.DataFrame(columns = range(0,30))
# display.max_colwidth 显示的最大宽度
pd.get_option('display.max_colwidth') #50
pd.set_option('display.max_colwidth',100)
pd.Series(index = ['A'],data=['t'*70])
# display.precision 设置显示的小数位数
pd.get_option('display.precision') #6
pd.Series(data = [1.23456789236546])
'''
0 1.234568
dtype: float64
'''
pd.set_option('display.precision',5)
pd.Series(data = [1.23456789236546])
'''
0 1.23457
dtype: float64
'''
特殊表格
数据透视表
数据透视表用来做数据透视,可以通过一个或多个键分组聚合DataFrame中的数据,通过aggfunc参数决定聚合类型,是groupby的高级功能。
pd.pivot_table(data, values = None,index = None,columns = None,aggfunc =‘mean’,fill_value = None,margin = False,dropna = True,margins_name =‘All’ )
data:DataFrame对象
values:要聚合的列或列的列表
index:数据透视表的index,从原数据的列中筛选
columns:数据透视表的columns,从原数据的列中筛选
aggfunc:用于聚合的函数,默认为numpy.mean,支持numpy计算方法
fill_value: 用于替换缺失值的值
margin: 添加所有行/列
dropna: 不包括条目为 NaN的列,默认为True
margin_name: 当margin为True时,将包含总计的行/列的名称
import pandas as pd
example = pd.DataFrame({'Month': ["January", "January", "January", "January",
"February", "February", "February", "February",
"March", "March", "March", "March"],
'Category': ["Transportation", "Grocery", "Household", "Entertainment",
"Transportation", "Grocery", "Household", "Entertainment",
"Transportation", "Grocery", "Household", "Entertainment"],
'Amount': [74., 235., 175., 100., 115., 240., 225., 125., 90., 260., 200., 120.]})
pivot(index =,columns=,values =)
: 返回数据透视表
example_pivot = example.pivot(index = 'Category',columns= 'Month',values = 'Amount')
'''
Month February January March
Category
Entertainment 125.0 100.0 120.0
Grocery 240.0 235.0 260.0
Household 225.0 175.0 200.0
Transportation 115.0 74.0 90.0
'''
sum
:求和
# axis = 1 按照行求和
example_pivot.sum(axis = 1)
'''
Category
Entertainment 345.0
Grocery 735.0
Household 600.0
Transportation 279.0
dtype: float64
'''
# axis = 0 按照列求和
example_pivot.sum(axis = 0)
'''
Month
February 705.0
January 584.0
March 670.0
dtype: float64
'''
pivot_table
: 对数据透视表进行相关操作
# 默认值求平均
df = pd.read_csv('./data/titanic.csv')
df.pivot_table(index = 'Sex',columns='Pclass',values='Fare')
'''
Pclass 1 2 3
Sex
female 106.125798 21.970121 16.118810
male 67.226127 19.741782 12.661633
'''
# aggfunc='max' 求最大值
df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='max')
'''
最大值
Pclass 1 2 3
Sex
female 512.3292 65.0 69.55
male 512.3292 73.5 69.55
'''
# aggfunc='count' 统计个数
df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='count')
'''
统计数目
Pclass 1 2 3
Sex
female 94 76 144
male 122 108 347
'''
交叉表
默认情况下,crosstab是用来计算因子的频率表。
pd.crosstab()参数:
pd.crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, dropna=True, normalize=False)
index:要在行中分组的值
columns:要在列中分组的值
values:根据因子聚合的值数组,需指定aggfunc
aggfunc:如指定,还需指定value
normalize:将所有值除以值的总和进行归一化 ,为True时候显示百分比
pd.crosstab()
:
# 默认计算因子的频率表。
pd.crosstab(index = df['Sex'],columns = df['Pclass'])
'''
Pclass 1 2 3
Sex
female 94 76 144
male 122 108 347
'''
# aggfunc='mean' 求平均值
df.pivot_table(index = 'Pclass',columns='Sex',values='Survived',aggfunc='mean')
'''
Sex female male
Pclass
1 0.968085 0.368852
2 0.921053 0.157407
3 0.500000 0.135447
'''
# 设置条件进行筛选
df['Underaged'] = df['Age'] <= 18
df.pivot_table(index = 'Underaged',columns='Sex',values='Survived',aggfunc='mean')
'''
Sex female male
Underaged
False 0.760163 0.167984
True 0.676471 0.338028
'''
时间操作
时间转换
datetime.datetime(year=,month=,day=,hour=,minute=)
%matplotlib notebook
import datetime
dt = datetime.datetime(year=2017,month=11,day=24,hour=10,minute=30)
'''
datetime.datetime(2017, 11, 24, 10, 30)
'''
pd.Timestamp
: 设置时间
pd.Timedelta
: 时间增量
pd.to_datetime
: 时间格式转换
import pandas as pd
ts = pd.Timestamp('2017-11-24')
ts.month
ts.day
ts + pd.Timedelta('5 days')# Timestamp('2017-11-29 00:00:00')
pd.to_datetime('2017-11-24') #Timestamp('2017-11-24 00:00:00')
pd.to_datetime('24/11/2017')# Timestamp('2017-11-24 00:00:00')
时间序列
to_datetime()
: 传入的参数是时间序列
s = pd.Series(['2017-11-24 00:00:00','2017-11-25 00:00:00','2017-11-26 00:00:00'])
'''
0 2017-11-24 00:00:00
1 2017-11-25 00:00:00
2 2017-11-26 00:00:00
dtype: object
'''
ts = pd.to_datetime(s)
'''
0 2017-11-24
1 2017-11-25
2 2017-11-26
dtype: datetime64[ns]
'''
dt.hour
: 通过小时进行序列的索引
dt.weekday
: 通过星期进行序列的索引
ts.dt.hour
'''
0 0
1 0
2 0
dtype: int64
'''
ts.dt.weekday
'''
0 4
1 5
2 6
dtype: int64
'''
pd.date_range(start=,periods = ,freq = )
:生成时间序列
pd.Series(pd.date_range(start='2017-11-24',periods = 10,freq = '12H'))
'''
0 2017-11-24 00:00:00
1 2017-11-24 12:00:00
2 2017-11-25 00:00:00
3 2017-11-25 12:00:00
4 2017-11-26 00:00:00
5 2017-11-26 12:00:00
6 2017-11-27 00:00:00
7 2017-11-27 12:00:00
8 2017-11-28 00:00:00
9 2017-11-28 12:00:00
dtype: datetime64[ns]
'''
时间序列的截取的方式
data = pd.read_csv('./data/flowdata.csv')
data.head()
'''
Time L06_347 LS06_347 LS06_348
0 2009-01-01 00:00:00 0.137417 0.097500 0.016833
1 2009-01-01 03:00:00 0.131250 0.088833 0.016417
2 2009-01-01 06:00:00 0.113500 0.091250 0.016750
3 2009-01-01 09:00:00 0.135750 0.091500 0.016250
4 2009-01-01 12:00:00 0.140917 0.096167 0.017000
'''
data['Time'] = pd.to_datetime(data['Time'])
data = data.set_index('Time')
'''
L06_347 LS06_347 LS06_348
Time
2009-01-01 00:00:00 0.137417 0.097500 0.016833
2009-01-01 03:00:00 0.131250 0.088833 0.016417
2009-01-01 06:00:00 0.113500 0.091250 0.016750
2009-01-01 09:00:00 0.135750 0.091500 0.016250
...
'''
# 时间序列截取方式
data[pd.Timestamp('2012-01-01 09:00'):pd.Timestamp('2012-01-01 19:00')] #截取一段数据
data[('2012-01-01 09:00'):('2012-01-01 19:00')]
data.tail(10) #截取最后10个数据
data[data.index.month == 1]
data[(data.index.hour > 8) & (data.index.hour <12)]
data.between_time('08:00','12:00')
时间抽样
data.resample
: 对时间进行抽样
data.resample('D').mean().head() #间隔一天取平均值
data.resample('D').max().head()
data.resample('3D').mean().head()
data.resample('M').mean().head()
data.resample('M').mean().plot()
排序
sort_values
import pandas as pd
data = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c'],
'data':[4,3,2,1,12,3,4,5,7]})
'''
data group
0 4 a
1 3 a
2 2 a
3 1 b
4 12 b
5 3 b
6 4 c
7 5 c
8 7 c
'''
#按照group降序的基础上,data升序
data.sort_values(by=['group','data'],ascending = [False,True],inplace=True)
'''
data group
6 4 c
7 5 c
8 7 c
3 1 b
5 3 b
4 12 b
2 2 a
1 3 a
0 4 a
'''
data = pd.DataFrame({'k1':['one']*3+['two']*4,
'k2':[3,2,1,3,3,4,4]})
'''
k1 k2
0 one 3
1 one 2
2 one 1
3 two 3
4 two 3
5 two 4
6 two 4
'''
# 根据关键字进行排序
data.sort_values(by='k2')
'''
2 one 1
1 one 2
0 one 3
3 two 3
4 two 3
5 two 4
6 two 4
'''
去重
drop_duplicates()
: 去重操作
# k1, k2 的值都相等时才除去
data.drop_duplicates()
'''
k1 k2
0 one 3
1 one 2
2 one 1
3 two 3
5 two 4
'''
# 只针对k1去重
data.drop_duplicates(subset='k1')
'''
k1 k2
0 one 3
3 two 3
'''
映射
apply
data = pd.DataFrame({'food':['A1','A2','B1','B2','B3','C1','C2'],'data':[1,2,3,4,5,6,7]})
'''
data food
0 1 A1
1 2 A2
2 3 B1
3 4 B2
4 5 B3
5 6 C1
6 7 C2
'''
def food_map(series):
    """Map a row's food code (e.g. 'A1', 'B3') to its category letter.

    Intended for df.apply(..., axis='columns'), which passes each row as a
    Series. Returns 'A', 'B' or 'C' for known codes, and None for any code
    not in the table (same as the implicit fall-through of an if/elif chain).
    """
    categories = {
        'A1': 'A', 'A2': 'A',
        'B1': 'B', 'B2': 'B', 'B3': 'B',
        'C1': 'C', 'C2': 'C',
    }
    # .get() yields None for unknown codes, preserving the original contract.
    return categories.get(series['food'])
data['food_map'] = data.apply(food_map,axis = 'columns')
'''
data food food_map
0 1 A1 A
1 2 A2 A
2 3 B1 B
3 4 B2 B
4 5 B3 B
5 6 C1 C
6 7 C2 C
'''
map
food2Upper = {
'A1':'A',
'A2':'A',
'B1':'B',
'B2':'B',
'B3':'B',
'C1':'C',
'C2':'C'
}
data['upper'] = data['food'].map(food2Upper)
'''
data food food_map upper
0 1 A1 A A
1 2 A2 A A
2 3 B1 B B
3 4 B2 B B
4 5 B3 B B
5 6 C1 C C
6 7 C2 C C
'''
assign
: 添加一列
import numpy as np
df = pd.DataFrame({'data1':np.random.randn(5),
'data2':np.random.randn(5)})
df2 = df.assign(ration = df['data1']/df['data2'])
'''
data1 data2 ration
0 -1.069925 -0.186540 5.735617
1 0.636127 0.020425 31.143814
2 0.366197 -0.102836 -3.560992
3 -0.975327 0.451201 -2.161624
4 -1.562407 -2.436845 0.641160
'''
drop
: 去除一列
df2.drop('ration',axis='columns',inplace=True)
'''
data1 data2
0 -1.069925 -0.186540
1 0.636127 0.020425
2 0.366197 -0.102836
3 -0.975327 0.451201
4 -1.562407 -2.436845
'''
分类
pd.cut
: 返回每个元素对应的区间
ages = [15,18,20,21,22,34,41,52,63,79]
bins = [10,40,80]
bins_res = pd.cut(ages,bins)
'''
[(10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (40, 80], (40, 80], (40, 80], (40, 80]]
'''
labels
: 返回每个元素对应区间的标号
bins_res.labels
'''
array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int8)
'''
value_counts
: 返回每个区间对应的个数
pd.value_counts(bins_res)
'''
(10, 40] 6
(40, 80] 4
dtype: int64
'''
pd.cut(ages,[10,30,50,80])
'''
[(10, 30], (10, 30], (10, 30], (10, 30], (10, 30], (30, 50], (30, 50], (50, 80], (50, 80], (50, 80]]
'''
group_names = ['Yonth','Mille','Old']
pd.value_counts(pd.cut(ages,[10,20,50,80],labels=group_names))
'''
Mille 4
Old 3
Yonth 3
dtype: int64
'''
判空
nan
: 表示数据为空
df = pd.DataFrame([range(3),[0, np.nan,0],[0,0,np.nan],range(3)])
'''
0 1 2
0 0 1.0 2.0
1 0 NaN 0.0
2 0 0.0 NaN
3 0 1.0 2.0
'''
isnull()
: 返回值为布尔类型,若为nan则返回true, 否则返回false
df.isnull()
'''
0 1 2
0 False False False
1 False True False
2 False False True
3 False False False
'''
isnull().any()
: 返回值为布尔类型,若该列(行)数据中存在nan则返回true, 否则返回false
# 默认判断每一列的数据
df.isnull().any()
'''
0 False
1 True
2 True
dtype: bool
'''
# axis = 1 判断每一行的数据
df.isnull().any(axis = 1)
'''
0 False
1 True
2 True
3 False
dtype: bool
'''
df.fillna(n)
: 将值为nan的数据用n填充
df.fillna(5)
'''
0 1 2
0 0 1.0 2.0
1 0 5.0 0.0
2 0 0.0 5.0
3 0 1.0 2.0
'''
df[df.isnull().any(axis = 1)]
'''
0 1 2
1 0 NaN 0.0
2 0 0.0 NaN
'''
字符串
import pandas as pd
import numpy as np
s = pd.Series(['A','b','B','gaer','AGER',np.nan])
'''
0 A
1 b
2 B
3 gaer
4 AGER
5 NaN
dtype: object
'''
大小写转换
str.lower()
, str.upper()
: 转换为小写,大写字母格式
s.str.lower()
'''
0 a
1 b
2 b
3 gaer
4 ager
5 NaN
dtype: object
'''
s.str.upper()
'''
0 A
1 B
2 B
3 GAER
4 AGER
5 NaN
dtype: object
'''
长度
str.len()
: 求字符串的长度
s.str.len()
'''
0 1.0
1 1.0
2 1.0
3 4.0
4 4.0
5 NaN
dtype: float64
'''
去除空格
str.strip()
: 去除字符串左右两边的空格。
str.lstrip()
: 去除字符串左边的空格。
str.rstrip()
: 去除字符串右边的空格。
index = pd.Index([' tang',' yu ','di'])
Index([' tang', ' yu ', 'di'], dtype='object')
index.str.strip()
'''
Index(['tang', 'yu', 'di'], dtype='object')
'''
index.str.lstrip()
'''
Index(['tang', 'yu ', 'di'], dtype='object')
'''
index.str.rstrip()
'''
Index([' tang', ' yu', 'di'], dtype='object')
'''
替换
str.replace()
: 将字符串中的某一个字符用另外一个字符填充
df = pd.DataFrame(np.random.randn(3,2),columns = ['A a','B b'],index = range(3))
'''
A a B b
0 -1.392628 1.020082
1 0.866707 0.654731
2 -0.320871 1.360513
'''
df.columns = df.columns.str.replace(' ','_')
'''
A_a B_b
0 -1.392628 1.020082
1 0.866707 0.654731
2 -0.320871 1.360513
'''
分割
str.split()
: 按照分隔符对字符串进行分割
s = pd.Series(['a_b_C','c_d_e','f_g_h'])
'''
0 a_b_C
1 c_d_e
2 f_g_h
dtype: object
'''
# 将字符串按照分隔符分割为list
s.str.split('_')
'''
0 [a, b, C]
1 [c, d, e]
2 [f, g, h]
dtype: object
'''
# expand = True 将字符串按照分隔符分割为多个columns
s.str.split('_',expand = True)
'''
0 1 2
0 a b C
1 c d e
2 f g h
'''
# 指定分割的次数
s.str.split('_',expand = True,n=1)
'''
0 1
0 a b_C
1 c d_e
2 f g_h
'''
str.contains()
: 检查字符串中是否包含某个子串,返回值为布尔类型
s = pd.Series(['A','Aas','Afgew','Ager','Agre','Ager'])
'''
0 A
1 Aas
2 Afgew
3 Ager
4 Agre
5 Ager
dtype: object
'''
s.str.contains('Ag')
'''
0 False
1 False
2 False
3 True
4 True
5 True
dtype: bool
'''
str.get_dummies(sep = )
: 按照分割符sep 将字符串分割成多个columns
s = pd.Series(['a','a|b','a|c'])
'''
0 a
1 a|b
2 a|c
dtype: object
'''
s.str.get_dummies(sep = '|')
'''
a b c
0 1 0 0
1 1 1 0
2 1 0 1
'''
绘图
折线图
单条折线图
%matplotlib inline
import pandas as pd
import numpy as np
s = pd.Series(np.random.randn(10),index = np.arange(0,100,10))
s.plot()
多条折线图
df = pd.DataFrame(np.random.randn(10, 4).cumsum(0),
index = np.arange(0, 100, 10),
columns = ['A', 'B', 'C', 'D'])
df.plot()
柱状图
水平和竖直柱状图
import matplotlib.pyplot as plt
fig,axes = plt.subplots(2,1) #子图
data = pd.Series(np.random.rand(16),index=list('abcdefghijklmnop'))
data.plot(ax = axes[0],kind='bar') #竖直柱状图
data.plot(ax = axes[1],kind='barh') #水平柱状图
多个指标柱状图
df = pd.DataFrame(np.random.rand(6, 4),
index = ['one', 'two', 'three', 'four', 'five', 'six'],
columns = pd.Index(['A', 'B', 'C', 'D'], name = 'Genus'))
df.plot(kind='bar')
tips = pd.read_csv('tips.csv')
tips.total_bill.plot(kind='hist',bins=50)
'''
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
'''
直方图
tips = pd.read_csv('tips.csv')
tips.head()
tips.total_bill.plot(kind='hist',bins=50)
散点图
macro = pd.read_csv('macrodata.csv')
macro.head()
data = macro[['quarter','realgdp','realcons']]
data.plot.scatter('quarter','realgdp')
pd.scatter_matrix(data,color='g',alpha=0.3)
大数据处理
向下类型转换
import pandas as pd
gl = pd.read_csv('game_logs.csv')
gl.head()
gl.info(memory_usage='deep') # 占用内存大小
for dtype in ['float64','object','int64']: #查看每种类型平均占用内存大小
selected_dtype = gl.select_dtypes(include=[dtype])
mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
mean_usage_mb = mean_usage_b / 1024 ** 2
print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))
import numpy as np
int_types = ["uint8", "int8", "int16","int32","int64"]
for it in int_types:
print(np.iinfo(it)) #查看每种类型表示数的范围大小
def mem_usage(pandas_obj): #计算内存占用
    """Return the deep memory footprint of a DataFrame or Series as 'X.XX MB'.

    DataFrame.memory_usage(deep=True) returns one value per column (plus the
    index), so it must be summed; Series.memory_usage(deep=True) is already a
    single scalar.
    """
    if isinstance(pandas_obj, pd.DataFrame):
        total_bytes = pandas_obj.memory_usage(deep=True).sum()
    else:
        # anything that is not a DataFrame is assumed to be a Series
        total_bytes = pandas_obj.memory_usage(deep=True)
    # bytes -> megabytes, rendered with two decimal places
    return "{:03.2f} MB".format(total_bytes / 1024 ** 2)
gl_int = gl.select_dtypes(include=['int64'])
converted_int = gl_int.apply(pd.to_numeric,downcast='unsigned') #转换数据类型
print(mem_usage(gl_int)) # 7.87 MB
print(mem_usage(converted_int))# 1.48 MB
去重
gl_obj = gl.select_dtypes(include = ['object']).copy()
gl_obj.describe() #重复值过多
dow = gl_obj.day_of_week
dow.head()
dow_cat = dow.astype('category')
dow_cat.head()
'''
0 Thu
1 Fri
2 Sat
3 Mon
4 Tue
Name: day_of_week, dtype: category
Categories (7, object): [Fri, Mon, Sat, Sun, Thu, Tue, Wed]
'''
dow_cat.head(10).cat.codes#object类型转换为category
'''
0 4
1 0
2 2
3 1
4 5
5 4
6 2
7 2
8 1
9 5
dtype: int8
'''
时间压缩
date = optimized_gl.date
date[:5]
'''
0 18710504
1 18710505
2 18710506
3 18710508
4 18710509
Name: date, dtype: uint32
'''
print (mem_usage(date)) #0.66MB
optimized_gl['date'] = pd.to_datetime(date,format='%Y%m%d')
print (mem_usage(optimized_gl['date'])) #1.31 MB
optimized_gl['date'][:5]
'''
0 1871-05-04
1 1871-05-05
2 1871-05-06
3 1871-05-08
4 1871-05-09
Name: date, dtype: datetime64[ns]
'''