Day 2
Pandas基本操作
groupby操作
- 小示例
A 0
B 15
C 5
A 10
B 15
C 20
groupby三步走:
1、split:分块。A分到一起,B分到一起,C分到一起。
2、apply:对每个分块分别求和。
3、combine:汇总–>A:10 B:30 C:25
import pandas as pd
# Toy frame: three keys ('A', 'B', 'C'), each appearing three times.
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
                   'data': [0, 5, 10, 5, 10, 15, 10, 5, 15]})
df
# Manual aggregation: filter the rows belonging to each key, then sum them.
# Summing the filtered frame also "sums" the string column, which is why
# the recorded output below shows key as 'AAA' / 'BBB' / 'CCC'.
for key in ['A', 'B', 'C']:
    print(key, df[df['key'] == key].sum())
'''
A key AAA
data 15
dtype: object
B key BBB
data 20
dtype: object
C key CCC
data 40
dtype: object
'''
# The same aggregation expressed with groupby
df.groupby('key').sum()
# Alternatively, pass a numpy reduction as the aggregation function.
# NOTE(review): recent pandas releases emit a FutureWarning for np.sum here
# and recommend the string 'sum' instead -- confirm against the installed version.
import numpy as np
df.groupby('key').aggregate(np.sum)
- 使用titanic数据集示例
import pandas as pd
df = pd.read_csv('data/titanic.csv')
# Group once by sex and reuse the grouping for both summaries.
by_sex = df.groupby('Sex')
# Average survival rate for each sex
by_sex['Survived'].mean()
'''
Sex
female 0.742038
male 0.188908
Name: Survived, dtype: float64
'''
# Summary statistics of age for each sex
by_sex['Age'].describe()
数值计算
import pandas as pd
# `index` supplies the row labels, `columns` supplies the column (attribute) labels.
rows = [[1, 2, 3], [4, 5, 6]]
df = pd.DataFrame(data=rows, index=list('ab'), columns=list('ABC'))
df
df.sum(axis='index')  # column totals; identical to the default df.sum()
'''
A 5
B 7
C 9
dtype: int64
'''
df.sum(axis='columns')  # row totals; identical to df.sum(axis=1)
'''
a 6
b 15
dtype: int64
'''
df.mean(axis='columns')  # min(), max() and median() are called the same way
# Pairwise (two-variable) statistics
df = pd.read_csv('data/titanic.csv')
# Covariance between the numeric columns.
# numeric_only=True is required on pandas >= 2.0: the default there no longer
# drops text columns (such as 'Sex') silently, so a bare df.cov() raises.
df.cov(numeric_only=True)
# Correlation coefficients between the numeric columns (same reasoning).
df.corr(numeric_only=True)
# Counting the values of a feature
# Number of non-missing Age entries
age = df['Age']
age.count()
#714
# Frequency of each distinct age
age.value_counts(ascending=False)  # most frequent first (the default order)
'''
24.00 30
22.00 27
18.00 26
19.00 25
30.00 25
..
55.50 1
70.50 1
66.00 1
23.50 1
0.42 1
Name: Age, Length: 88, dtype: int64
'''
age.value_counts(ascending=True)  # least frequent first
'''
0.42 1
23.50 1
66.00 1
70.50 1
55.50 1
..
30.00 25
19.00 25
18.00 26
22.00 27
24.00 30
Name: Age, Length: 88, dtype: int64
'''
# Bucket the ages into 5 bins and count how many fall into each bin
age.value_counts(bins=5, ascending=True)
'''
(64.084, 80.0] 11
(48.168, 64.084] 69
(0.339, 16.336] 100
(32.252, 48.168] 188
(16.336, 32.252] 346
Name: Age, dtype: int64
'''
# Number of passengers in each cabin class
pclass = df['Pclass']
pclass.value_counts(ascending=True)
'''
2 184
1 216
3 491
Name: Pclass, dtype: int64
'''
对象操作
- Series结构的增删改查
import pandas as pd
# Build a small labelled Series used by the query / update / insert examples.
data = [10, 11, 12]
index = ['a', 'b', 'c']
s = pd.Series(data=data, index=index)
s
'''
a 10
b 11
c 12
dtype: int64
'''
# --- Query ---
s[0:2]  # positional slice: the first two entries
'''
a 10
b 11
dtype: int64
'''
s[[True, False, True]]  # boolean-mask selection
'''
a 10
c 12
dtype: int64
'''
s.loc['b']  # lookup by label
#11
s.iloc[2]  # lookup by position
#12
# --- Update ---
s1 = s.copy()  # work on a copy so that `s` stays untouched
s1['a'] = 100
s1
'''
a 100
b 11
c 12
dtype: int64
'''
# inplace=False returns a new Series; s1 itself is left unchanged
s1.replace(to_replace=100, value=101, inplace=False)
s1
'''
a 100
b 11
c 12
dtype: int64
'''
# inplace=True modifies s1 itself
s1.replace(to_replace=100, value=101, inplace=True)
s1
'''
a 101
b 11
c 12
dtype: int64
'''
s1.index = ['a', 'b', 'd']  # replace every label at once
s1
'''
a 101
b 11
d 12
dtype: int64
'''
s1.rename(index={'a': 'A'}, inplace=True)  # relabel a single entry
s1
'''
A 101
b 11
d 12
dtype: int64
'''
# --- Insert ---
s2 = pd.Series([100, 500], index=['g', 'h'])
s2
'''
g 100
h 500
dtype: int64
'''
# Series.append was removed in pandas 2.0; pd.concat is its replacement and,
# like append, returns a new Series without modifying s1.
pd.concat([s1, s2])
'''
A 101
b 11
d 12
g 100
h 500
dtype: int64
'''
s1['j'] = 500  # assigning to a new label adds the entry to s1 in place
s1
'''
A 101
b 11
d 12
j 500
dtype: int64
'''
pd.concat([s1, s2], ignore_index=False)  # keep the original labels
'''
A 101
b 11
d 12
j 500
g 100
h 500
dtype: int64
'''
pd.concat([s1, s2], ignore_index=True)  # discard the labels and renumber from 0
'''
0 101
1 11
2 12
3 500
4 100
5 500
dtype: int64
'''
# --- Delete ---
# The recorded outputs below include 'A 101': that entry is still present in
# s1 at this point, since nothing in these examples removes it.
del s1['b']  # removes the entry from s1 in place
s1
'''
A 101
d 12
j 500
dtype: int64
'''
s1.drop('d', inplace=False)  # returns a new Series; s1 itself is unchanged
'''
A 101
j 500
dtype: int64
'''
- DataFrame结构的增删改查
# Build a 2x3 frame used by the query / update / insert examples.
data = [[1, 2, 3], [4, 5, 6]]
index = ['a', 'b']
columns = ['A', 'B', 'C']
df = pd.DataFrame(data=data, index=index, columns=columns)
df
# --- Query ---
df.iloc[0]  # first row, selected by position
'''
A 1
B 2
C 3
Name: a, dtype: int64
'''
df.loc['a']  # the same row, selected by label
'''
A 1
B 2
C 3
Name: a, dtype: int64
'''
# --- Update ---
# Use a single .loc call with (row, column). The chained form
# df.loc['a']['A'] = 150 may assign into a temporary object and leave df
# unchanged (it is a no-op under pandas copy-on-write).
df.loc['a', 'A'] = 150
df
df.index = ['f', 'g']  # replace the row labels
df
# --- Insert ---
df.loc['c'] = [1, 2, 3]  # assigning to a new row label appends that row
df
# A second 2x3 frame with rows a/b and columns A/B/C
data = [[1, 2, 3], [4, 5, 6]]
index, columns = ['a', 'b'], ['A', 'B', 'C']
df2 = pd.DataFrame(data, index=index, columns=columns)
# Row-wise concatenation: df2 is stacked underneath df
df3 = pd.concat([df, df2], axis=0)
df3
# Column-wise concatenation: df2 is placed beside df
df3 = pd.concat([df, df2], axis=1)
df3
# Assigning to a new column name adds that column
df2['yu'] = [10, 11]
df2
df4 = pd.DataFrame(data=[[10, 11], [12, 13]], index=['a', 'b'], columns=['D', 'E'])
df4
df5 = pd.concat([df2, df4], axis=1)
df5
# --- Delete ---
df5.drop(index=['a'], inplace=True)  # remove row 'a'
df5
df5.drop(columns=['A'], inplace=True)  # remove a single column
df5
df5.drop(columns=['yu', 'D', 'E'], inplace=True)  # remove several columns at once
df5