常见DataFrame数据处理
import numpy as np
import pandas as pd
from numpy.random import randn
np.random.seed(101)
df = pd.DataFrame(randn(5,4),index='A B C D E'.split(),columns='W X Y Z'.split())
df
| W | X | Y | Z |
---|
A | 0.907969 | 0.503826 | 0.651118 | -0.319318 |
---|
B | -0.848077 | 0.605965 | -2.018168 | 0.740122 |
---|
C | 0.528813 | -0.589001 | 0.188695 | -0.758872 |
---|
D | -0.933237 | 0.955057 | 0.190794 | 1.978757 |
---|
E | 2.605967 | 0.683509 | 0.302665 | 1.693723 |
---|
df['W']
A 0.907969
B -0.848077
C 0.528813
D -0.933237
E 2.605967
Name: W, dtype: float64
df[['Y','Z']]
| Y | Z |
---|
A | 0.651118 | -0.319318 |
---|
B | -2.018168 | 0.740122 |
---|
C | 0.188695 | -0.758872 |
---|
D | 0.190794 | 1.978757 |
---|
E | 0.302665 | 1.693723 |
---|
df[1:3]
| W | X | Y | Z |
---|
B | -0.848077 | 0.605965 | -2.018168 | 0.740122 |
---|
C | 0.528813 | -0.589001 | 0.188695 | -0.758872 |
---|
df.loc[['A','B'],['Y','Z']]
| Y | Z |
---|
A | 0.651118 | -0.319318 |
---|
B | -2.018168 | 0.740122 |
---|
df.iloc[1]
W -0.848077
X 0.605965
Y -2.018168
Z 0.740122
Name: B, dtype: float64
df.iloc[1:3]
| W | X | Y | Z |
---|
B | -0.848077 | 0.605965 | -2.018168 | 0.740122 |
---|
C | 0.528813 | -0.589001 | 0.188695 | -0.758872 |
---|
df>0
| W | X | Y | Z |
---|
A | True | True | True | False |
---|
B | False | True | False | True |
---|
C | True | False | True | False |
---|
D | False | True | True | True |
---|
E | True | True | True | True |
---|
type(df>0)
pandas.core.frame.DataFrame
df[df>0]
| W | X | Y | Z |
---|
A | 0.907969 | 0.503826 | 0.651118 | NaN |
---|
B | NaN | 0.605965 | NaN | 0.740122 |
---|
C | 0.528813 | NaN | 0.188695 | NaN |
---|
D | NaN | 0.955057 | 0.190794 | 1.978757 |
---|
E | 2.605967 | 0.683509 | 0.302665 | 1.693723 |
---|
df[df['W']>0]
| W | X | Y | Z |
---|
A | 0.907969 | 0.503826 | 0.651118 | -0.319318 |
---|
C | 0.528813 | -0.589001 | 0.188695 | -0.758872 |
---|
E | 2.605967 | 0.683509 | 0.302665 | 1.693723 |
---|
df[df['W']>0][['Z','Y']]
| Z | Y |
---|
A | -0.319318 | 0.651118 |
---|
C | -0.758872 | 0.188695 |
---|
E | 1.693723 | 0.302665 |
---|
df[(df['W']>0) & (df['Y'] > 0.5)]
| W | X | Y | Z |
---|
A | 0.907969 | 0.503826 | 0.651118 | -0.319318 |
---|
df.W
A 0.907969
B -0.848077
C 0.528813
D -0.933237
E 2.605967
Name: W, dtype: float64
df['new'] = df['W'] + df['Y']
df
| W | X | Y | Z | new |
---|
A | 0.907969 | 0.503826 | 0.651118 | -0.319318 | 1.559087 |
---|
B | -0.848077 | 0.605965 | -2.018168 | 0.740122 | -2.866245 |
---|
C | 0.528813 | -0.589001 | 0.188695 | -0.758872 | 0.717509 |
---|
D | -0.933237 | 0.955057 | 0.190794 | 1.978757 | -0.742443 |
---|
E | 2.605967 | 0.683509 | 0.302665 | 1.693723 | 2.908633 |
---|
df.drop('new',axis = 1,inplace=True)
df
| W | X | Y | Z |
---|
A | 0.907969 | 0.503826 | 0.651118 | -0.319318 |
---|
B | -0.848077 | 0.605965 | -2.018168 | 0.740122 |
---|
C | 0.528813 | -0.589001 | 0.188695 | -0.758872 |
---|
D | -0.933237 | 0.955057 | 0.190794 | 1.978757 |
---|
E | 2.605967 | 0.683509 | 0.302665 | 1.693723 |
---|
df.drop('E',axis=0,inplace = True)
df
| W | X | Y | Z |
---|
A | 0.907969 | 0.503826 | 0.651118 | -0.319318 |
---|
B | -0.848077 | 0.605965 | -2.018168 | 0.740122 |
---|
C | 0.528813 | -0.589001 | 0.188695 | -0.758872 |
---|
D | -0.933237 | 0.955057 | 0.190794 | 1.978757 |
---|
df.loc['A']
W 0.907969
X 0.503826
Y 0.651118
Z -0.319318
Name: A, dtype: float64
df.loc[['A','B'],['W','Y']]
| W | Y |
---|
A | 0.907969 | 0.651118 |
---|
B | -0.848077 | -2.018168 |
---|
df.reset_index()
| index | W | X | Y | Z |
---|
0 | A | 0.907969 | 0.503826 | 0.651118 | -0.319318 |
---|
1 | B | -0.848077 | 0.605965 | -2.018168 | 0.740122 |
---|
2 | C | 0.528813 | -0.589001 | 0.188695 | -0.758872 |
---|
3 | D | -0.933237 | 0.955057 | 0.190794 | 1.978757 |
---|
df.reindex()
| index | new | W | X | Y | Z |
---|
0 | 0 | SH | 0.907969 | 0.503826 | 0.651118 | -0.319318 |
---|
1 | 1 | BJ | -0.848077 | 0.605965 | -2.018168 | 0.740122 |
---|
2 | 2 | CD | 0.528813 | -0.589001 | 0.188695 | -0.758872 |
---|
3 | 3 | WU | -0.933237 | 0.955057 | 0.190794 | 1.978757 |
---|
newcolume = 'SH BJ CD WU'.split()
df['new'] = newcolume
df
| index | new | W | X | Y | Z |
---|
0 | 0 | SH | 0.907969 | 0.503826 | 0.651118 | -0.319318 |
---|
1 | 1 | BJ | -0.848077 | 0.605965 | -2.018168 | 0.740122 |
---|
2 | 2 | CD | 0.528813 | -0.589001 | 0.188695 | -0.758872 |
---|
3 | 3 | WU | -0.933237 | 0.955057 | 0.190794 | 1.978757 |
---|
df.set_index('new')
| index | W | X | Y | Z |
---|
new | | | | | |
---|
SH | 0 | 0.907969 | 0.503826 | 0.651118 | -0.319318 |
---|
BJ | 1 | -0.848077 | 0.605965 | -2.018168 | 0.740122 |
---|
CD | 2 | 0.528813 | -0.589001 | 0.188695 | -0.758872 |
---|
WU | 3 | -0.933237 | 0.955057 | 0.190794 | 1.978757 |
---|
df
| index | new | W | X | Y | Z |
---|
0 | 0 | SH | 0.907969 | 0.503826 | 0.651118 | -0.319318 |
---|
1 | 1 | BJ | -0.848077 | 0.605965 | -2.018168 | 0.740122 |
---|
2 | 2 | CD | 0.528813 | -0.589001 | 0.188695 | -0.758872 |
---|
3 | 3 | WU | -0.933237 | 0.955057 | 0.190794 | 1.978757 |
---|
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index
MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
df = pd.DataFrame(np.random.randn(6,2),index=hier_index,columns=['A','B'])
df
| | A | B |
---|
G1 | 1 | -1.706086 | -1.159119 |
---|
2 | -0.134841 | 0.390528 |
---|
3 | 0.166905 | 0.184502 |
---|
G2 | 1 | 0.807706 | 0.072960 |
---|
2 | 0.638787 | 0.329646 |
---|
3 | -0.497104 | -0.754070 |
---|
df.loc['G1']
| A | B |
---|
1 | -1.706086 | -1.159119 |
---|
2 | -0.134841 | 0.390528 |
---|
3 | 0.166905 | 0.184502 |
---|
df.index.names
FrozenList([None, None])
df.index.names = ['Group','Num']
df
| | A | B |
---|
Group | Num | | |
---|
G1 | 1 | -1.706086 | -1.159119 |
---|
2 | -0.134841 | 0.390528 |
---|
3 | 0.166905 | 0.184502 |
---|
G2 | 1 | 0.807706 | 0.072960 |
---|
2 | 0.638787 | 0.329646 |
---|
3 | -0.497104 | -0.754070 |
---|
df.xs('G1')
| A | B |
---|
Num | | |
---|
1 | -1.706086 | -1.159119 |
---|
2 | -0.134841 | 0.390528 |
---|
3 | 0.166905 | 0.184502 |
---|
df.xs(['G1',1])
A -1.706086
B -1.159119
Name: (G1, 1), dtype: float64
df.xs(1,level='Num')
| A | B |
---|
Group | | |
---|
G1 | -1.706086 | -1.159119 |
---|
G2 | 0.807706 | 0.072960 |
---|
处理缺失数据##
import numpy as np
import pandas as pd
df = pd.DataFrame({'A':[1,2,np.nan],
'B':[5,np.nan,np.nan],
'C':[1,2,3]})
df
| A | B | C |
---|
0 | 1.0 | 5.0 | 1 |
---|
1 | 2.0 | NaN | 2 |
---|
2 | NaN | NaN | 3 |
---|
df.dropna()
df.dropna(axis=1)
df.dropna(thresh=2)
df.fillna(value='FILL VALUE')
| A | B | C |
---|
0 | 1 | 5 | 1 |
---|
1 | 2 | FILL VALUE | 2 |
---|
2 | FILL VALUE | FILL VALUE | 3 |
---|
df['A'].fillna(value=df['A'].mean())
0 1.0
1 2.0
2 1.5
Name: A, dtype: float64