Pandas索引结构¶
import pandas as pd
df = pd.read_csv('./data/titanic.csv')
df['Age'][:5]
df['Age'][:5]
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
Name: Age, dtype: float64
df[['Age','Fare']][:5]
df[['Age','Fare']][:5]
Age Fare
0 22.0 7.2500
1 38.0 71.2833
2 26.0 7.9250
3 35.0 53.1000
4 35.0 8.0500
loc 用label来去定位
iloc 用position来去定位
df.iloc[0]
df.iloc[0]
PassengerId 1
Survived 0
Pclass 3
Name Braund, Mr. Owen Harris
Sex male
Age 22
SibSp 1
Parch 0
Ticket A/5 21171
Fare 7.25
Cabin NaN
Embarked S
Name: 0, dtype: object
df.iloc[0:5]
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
df.iloc[0:5,1:3]
Survived Pclass
0 0 3
1 1 1
2 1 3
3 1 1
4 0 3
df = df.set_index('Name')
df = df.set_index('Name')
df.loc['Heikkinen, Miss. Laina']
PassengerId 3
Survived 1
Pclass 3
Sex female
Age 26
SibSp 0
Parch 0
Ticket STON/O2. 3101282
Fare 7.925
Cabin NaN
Embarked S
Name: Heikkinen, Miss. Laina, dtype: object
df.loc['Heikkinen, Miss. Laina','Fare']
7.9249999999999998
df.loc['Heikkinen, Miss. Laina':'Allen, Mr. William Henry',:]
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked
Name
Heikkinen, Miss. Laina 3 1 3 female 26.0 0 0 STON/O2. 3101282 7.925 NaN S
Futrelle, Mrs. Jacques Heath (Lily May Peel) 4 1 1 female 35.0 1 0 113803 53.100 C123 S
Allen, Mr. William Henry 5 0 3 male 35.0 0 0 373450 8.050 NaN S
df.loc['Heikkinen, Miss. Laina','Fare'] = 1000
df.head()
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked
Name
Braund, Mr. Owen Harris 1 0 3 male 22.0 1 0 A/5 21171 7.2500 NaN S
Cumings, Mrs. John Bradley (Florence Briggs Thayer) 2 1 1 female 38.0 1 0 PC 17599 71.2833 C85 C
Heikkinen, Miss. Laina 3 1 3 female 26.0 0 0 STON/O2. 3101282 1000.0000 NaN S
Futrelle, Mrs. Jacques Heath (Lily May Peel) 4 1 1 female 35.0 1 0 113803 53.1000 C123 S
Allen, Mr. William Henry 5 0 3 male 35.0 0 0 373450 8.0500 NaN S
#要被遗弃了,别用了
df.ix['Heikkinen, Miss. Laina','Fare']
1000.0
bool类型的索引
df['Fare'] > 40
df['Fare'] > 40
Name
Braund, Mr. Owen Harris False
Cumings, Mrs. John Bradley (Florence Briggs Thayer) True
Heikkinen, Miss. Laina True
Futrelle, Mrs. Jacques Heath (Lily May Peel) True
Allen, Mr. William Henry False
Moran, Mr. James False
McCarthy, Mr. Timothy J True
Palsson, Master. Gosta Leonard False
Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) False
Nasser, Mrs. Nicholas (Adele Achem) False
Sandstrom, Miss. Marguerite Rut False
Bonnell, Miss. Elizabeth False
Saundercock, Mr. William Henry False
Andersson, Mr. Anders Johan False
Vestrom, Miss. Hulda Amanda Adolfina False
Hewlett, Mrs. (Mary D Kingcome) False
Rice, Master. Eugene False
Williams, Mr. Charles Eugene False
Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele) False
Masselmani, Mrs. Fatima False
Fynney, Mr. Joseph J False
Beesley, Mr. Lawrence False
McGowan, Miss. Anna "Annie" False
Sloper, Mr. William Thompson False
Palsson, Miss. Torborg Danira False
Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson) False
Emir, Mr. Farred Chehab False
Fortune, Mr. Charles Alexander True
O'Dwyer, Miss. Ellen "Nellie" False
Todoroff, Mr. Lalio False
...
Giles, Mr. Frederick Edward False
Swift, Mrs. Frederick Joel (Margaret Welles Barron) False
Sage, Miss. Dorothy Edith "Dolly" True
Gill, Mr. John William False
Bystrom, Mrs. (Karolina) False
Duran y More, Miss. Asuncion False
Roebling, Mr. Washington Augustus II True
van Melkebeke, Mr. Philemon False
Johnson, Master. Harold Theodor False
Balkic, Mr. Cerin False
Beckwith, Mrs. Richard Leonard (Sallie Monypeny) True
Carlsson, Mr. Frans Olof False
Vander Cruyssen, Mr. Victor False
Abelson, Mrs. Samuel (Hannah Wizosky) False
Najib, Miss. Adele Kiamie "Jane" False
Gustafsson, Mr. Alfred Ossian False
Petroff, Mr. Nedelio False
Laleff, Mr. Kristo False
Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) True
Shelley, Mrs. William (Imanita Parrish Hall) False
Markun, Mr. Johann False
Dahlberg, Miss. Gerda Ulrika False
Banfield, Mr. Frederick James False
Sutehall, Mr. Henry Jr False
Rice, Mrs. William (Margaret Norton) False
Montvila, Rev. Juozas False
Graham, Miss. Margaret Edith False
Johnston, Miss. Catherine Helen "Carrie" False
Behr, Mr. Karl Howell False
Dooley, Mr. Patrick False
Name: Fare, Length: 891, dtype: bool
df[df['Fare'] > 40][:5]
df[df['Fare'] > 40][:5]
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked
Name
Cumings, Mrs. John Bradley (Florence Briggs Thayer) 2 1 1 female 38.0 1 0 PC 17599 71.2833 C85 C
Heikkinen, Miss. Laina 3 1 3 female 26.0 0 0 STON/O2. 3101282 1000.0000 NaN S
Futrelle, Mrs. Jacques Heath (Lily May Peel) 4 1 1 female 35.0 1 0 113803 53.1000 C123 S
McCarthy, Mr. Timothy J 7 0 1 male 54.0 0 0 17463 51.8625 E46 S
Fortune, Mr. Charles Alexander 28 0 1 male 19.0 3 2 19950 263.0000 C23 C25 C27 S
df[df['Sex'] == 'male'][:5]
df[df['Sex'] == 'male'][:5]
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked
Name
Braund, Mr. Owen Harris 1 0 3 male 22.0 1 0 A/5 21171 7.2500 NaN S
Allen, Mr. William Henry 5 0 3 male 35.0 0 0 373450 8.0500 NaN S
Moran, Mr. James 6 0 3 male NaN 0 0 330877 8.4583 NaN Q
McCarthy, Mr. Timothy J 7 0 1 male 54.0 0 0 17463 51.8625 E46 S
Palsson, Master. Gosta Leonard 8 0 3 male 2.0 3 1 349909 21.0750 NaN S
df.loc[df['Sex'] == 'male','Age'].mean()
df.loc[df['Sex'] == 'male','Age'].mean()
30.72664459161148
(df['Age'] > 70).sum()
(df['Age'] > 70).sum()
5
import pandas as pd
df = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
'data':[0,5,10,5,10,15,10,15,20]})
df
import pandas as pd
df = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
'data':[0,5,10,5,10,15,10,15,20]})
df
data key
0 0 A
1 5 B
2 10 C
3 5 A
4 10 B
5 15 C
6 10 A
7 15 B
8 20 C
for key in ['A','B','C']:
print (key,df[df['key'] == key].sum())
for key in ['A','B','C']:
print (key,df[df['key'] == key].sum())
A data 15
key AAA
dtype: object
B data 30
key BBB
dtype: object
C data 45
key CCC
dtype: object
df.groupby('key').sum()
df.groupby('key').sum()
data
key
A 15
B 30
C 45
import numpy as np
df.groupby('key').aggregate(np.mean)
import numpy as np
df.groupby('key').aggregate(np.mean)
data
key
A 5
B 10
C 15
df = pd.read_csv('./data/titanic.csv')
df.groupby('Sex')['Age'].mean()
df.groupby('Sex')['Age'].mean()
Sex
female 27.915709
male 30.726645
Name: Age, dtype: float64
df.groupby('Sex')['Survived'].mean()
df.groupby('Sex')['Survived'].mean()
Sex
female 0.742038
male 0.188908
Name: Survived, dtype: float64
对象的增删改查
import pandas as pd
Series结构的增删改查
data = [10,11,12]
index = ['a','b','c']
s = pd.Series(data = data,index = index)
s
data = [10,11,12]
index = ['a','b','c']
s = pd.Series(data = data,index = index)
s
a 10
b 11
c 12
dtype: int64
查操作
s[0]
10
s[0:2]
s[0:2]
a 10
b 11
dtype: int64
mask = [True,False,True]
s[mask]
mask = [True,False,True]
s[mask]
a 10
c 12
dtype: int64
s.loc['b']
s.loc['b']
11
s.iloc[1]
11
改操作
s1 = s.copy()
s1['a'] = 100
s1
s1 = s.copy()
s1['a'] = 100
s1
a 100
b 11
c 12
dtype: int64
s1.replace(to_replace = 100,value = 101,inplace = True)
s1.replace(to_replace = 100,value = 101,inplace = True)
s1
a 101
b 11
c 12
dtype: int64
s1.index
s1.index
Index(['a', 'b', 'c'], dtype='object')
s1.index = ['a','b','d']
s1.index = ['a','b','d']
s1
a 101
b 11
d 12
dtype: int64
s1.rename(index = {'a':'A'},inplace = True)
s1.rename(index = {'a':'A'},inplace = True)
s1
A 101
b 11
d 12
dtype: int64
增操作
data = [100,110]
index = ['h','k']
s2 = pd.Series(data = data,index = index)
s2
data = [100,110]
index = ['h','k']
s2 = pd.Series(data = data,index = index)
s2
h 100
k 110
dtype: int64
s3 = s1.append(s2)
s3 = s1.append(s2)
s3['j'] = 500
s3['j'] = 500
s3
A 101
b 11
d 12
j 500
h 100
k 110
dtype: int64
s1.append(s2,ignore_index = False)
s1.append(s2,ignore_index = True)
s1.append(s2,ignore_index = True)
0 101
1 11
2 12
3 500
4 100
5 110
dtype: int64
删操作
s1
s1
A 101
b 11
d 12
j 500
dtype: int64
del s1['A']
del s1['A']
s1
b 11
d 12
j 500
dtype: int64
s1.drop(['b','d'],inplace = True)
s1.drop(['b','d'],inplace = True)
s1
j 500
dtype: int64
DataFrame结构的增删改查
data = [[1,2,3],[4,5,6]]
index = ['a','b']
columns = ['A','B','C']
df = pd.DataFrame(data=data,index=index,columns = columns)
df
data = [[1,2,3],[4,5,6]]
index = ['a','b']
columns = ['A','B','C']
df = pd.DataFrame(data=data,index=index,columns = columns)
df
A B C
a 1 2 3
b 4 5 6
查操作是类似的
df['A']
a 1
b 4
Name: A, dtype: int64
df.iloc[0]
A 1
B 2
C 3
Name: a, dtype: int64
df.loc['a']
A 1
B 2
C 3
Name: a, dtype: int64
改操作
df.loc['a']['A']
1
df.loc['a']['A'] = 150
df
A B C
a 150 2 3
b 4 5 6
df.index = ['f','g']
df
A B C
f 150 2 3
g 4 5 6
增操作
df.loc['c'] = [1,2,3]
df
A B C
f 150 2 3
g 4 5 6
c 1 2 3
data = [[1,2,3],[4,5,6]]
index = ['j','k']
columns = ['A','B','C']
df2 = pd.DataFrame(data=data,index=index,columns = columns)
df2
A B C
j 1 2 3
k 4 5 6
df3 = pd.concat([df,df2],axis = 0)
df3
A B C
f 150 2 3
g 4 5 6
c 1 2 3
j 1 2 3
k 4 5 6
df2['Tang'] = [10,11]
df2
A B C Tang
j 1 2 3 10
k 4 5 6 11
df4 = pd.DataFrame([[10,11],[12,13]],index=['j','k'],columns=['D','E'])
df4
D E
j 10 11
k 12 13
df5 = pd.concat([df2,df4],axis = 1)
df5
A B C Tang D E
j 1 2 3 10 10 11
k 4 5 6 11 12 13
删操作
df5.drop(['j'],axis=0,inplace = True)
df5
A B C Tang D E
k 4 5 6 11 12 13
del df5['Tang']
df5
A B C D E
k 4 5 6 12 13
df5.drop(['A','B','C'],axis = 1,inplace = True)
df5
D E
k 12 13
7、Pandas merge操作
7.1 pycharm读取
import pandas as pd
#左表 A和B两个特征
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
res = pd.merge(left, right)
print(res)
res = pd.merge(left, right, on = 'key')
print(res)
left = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],
'key2': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],
'key2': ['K0', 'K1', 'K2', 'K4'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print (left)
print (right)
#当上面两个表不同的时候还可以merge吗? 默认结果下丢数据了
res = pd.merge(left, right, on = ['key1', 'key2'])
print(res)
#不丢数据
res = pd.merge(left, right, on = ['key1', 'key2'], how = 'outer')
print(res)
res = pd.merge(left, right, on = ['key1', 'key2'], how = 'outer', indicator = True)
print(res)
res = pd.merge(left, right, how = 'left')
print(res)
res = pd.merge(left, right, how = 'right')
print(res)
#join操作
left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'key': ['K0', 'K1', 'K0', 'K1']})
right = pd.DataFrame({'C': ['C0', 'C1'],
'D': ['D0', 'D1']},
index=['K0', 'K1'])
print(left)
print(right)
result = left.join(right, on='key')
print(result)