pandas简单操作(2):处理丢失数据、数据运算、数据合并、数据分组等

pandas简单操作(2):处理丢失数据、数据运算、数据合并、数据分组等

input

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as plt
dates = pd.date_range(start ='20160301',periods = 6)
df = pd.DataFrame(np.random.randn(6,4), index = datas, columns = list('ABCD'))

input

df1 = df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1

output

ABCDE
2016-03-01-0.5899890.781182-1.539535-0.564109NaN
2016-03-02-0.0136721.247939-0.0795881.121198NaN
2016-03-030.502634-0.022906-0.558544-0.057238NaN
2016-03-041.131296-0.7221591.8382511.780309NaN

input

df1.loc[dates[1:3],'E'] = 2
df1

output

ABCDE
2016-03-01-0.5899890.781182-1.539535-0.564109NaN
2016-03-02-0.0136721.247939-0.0795881.1211982.0
2016-03-030.502634-0.022906-0.558544-0.0572382.0
2016-03-041.131296-0.7221591.8382511.780309NaN

input

df1.dropna() #把空数据扔掉

output

ABCDE
2016-03-02-0.0136721.247939-0.0795881.1211982.0
2016-03-030.502634-0.022906-0.558544-0.0572382.0

input

df1.fillna(value=5) #把空数据替换掉

output

ABCDE
2016-03-01-0.5899890.781182-1.539535-0.5641095.0
2016-03-02-0.0136721.247939-0.0795881.1211982.0
2016-03-030.502634-0.022906-0.558544-0.0572382.0
2016-03-041.131296-0.7221591.8382511.7803095.0

input

pd.isnull(df1) #判断空数据

output

ABCDE
2016-03-01FalseFalseFalseFalseTrue
2016-03-02FalseFalseFalseFalseFalse
2016-03-03FalseFalseFalseFalseFalse
2016-03-04FalseFalseFalseFalseTrue

input

 pd.isnull(df1).any() #判断每一列有没有空数据

output

A    False
B    False
C    False
D    False
E     True
dtype: bool

input

pd.isnull(df1).any().any() #判断整个表有没有空数据
True

input

df1.mean() #空数据是不参与计算的

output

A    0.257567
B    0.321014
C   -0.084854
D    0.570040
E    2.000000
dtype: float64

input

 df1.mean(axis=1)  #按行求平均值

output

2016-03-01   -0.478113
2016-03-02    0.855175
2016-03-03    0.372789
2016-03-04    1.006924
Freq: D, dtype: float64

input

s = pd.Series([1,3,5,np.nan,6,8],index=datas).shift(2) #创建一个序列
s
2016-03-01    NaN
2016-03-02    NaN
2016-03-03    1.0
2016-03-04    3.0
2016-03-05    5.0
2016-03-06    NaN
Freq: D, dtype: float64

input

df

output

ABCD
2016-03-01-0.5899890.781182-1.539535-0.564109
2016-03-02-0.0136721.247939-0.0795881.121198
2016-03-030.502634-0.022906-0.558544-0.057238
2016-03-041.131296-0.7221591.8382511.780309
2016-03-050.167379-1.1786040.554787-1.018810
2016-03-06-1.870176-0.2777850.946150-1.130179

input

df.sub(s,axis='index') #df的每一列都减去s这个序列

output

ABCD
2016-03-01NaNNaNNaNNaN
2016-03-02NaNNaNNaNNaN
2016-03-03-0.497366-1.022906-1.558544-1.057238
2016-03-04-1.868704-3.722159-1.161749-1.219691
2016-03-05-4.832621-6.178604-4.445213-6.018810
2016-03-06NaNNaNNaNNaN

input

 df

output

ABCD
2016-03-01-0.5899890.781182-1.539535-0.564109
2016-03-02-0.0136721.247939-0.0795881.121198
2016-03-030.502634-0.022906-0.558544-0.057238
2016-03-041.131296-0.7221591.8382511.780309
2016-03-050.167379-1.1786040.554787-1.018810
2016-03-06-1.870176-0.2777850.946150-1.130179

input

df.apply(np.cumsum) #累加 e.g.第一列的第三个数字是前两个累加,第四个是前三个累加

output

ABCD
2016-03-01-0.5899890.781182-1.539535-0.564109
2016-03-02-0.6036612.029122-1.6191230.557089
2016-03-03-0.1010272.006216-2.1776670.499851
2016-03-041.0302691.284057-0.3394172.280160
2016-03-051.1976480.1054530.2153701.261350
2016-03-06-0.672528-0.1723321.1615200.131171

input

df.apply(lambda x : x.max()-x.min()) #每一列返回每一列的最大值减去最小值

output

A    3.001471
B    2.426544
C    3.377786
D    2.910488
dtype: float64

input

def _sum(x):
    print(type(x))
    return x.sum()
df.apply(_sum)   #了解apply函数的用法
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>





A   -0.672528
B   -0.172332
C    1.161520
D    0.131171
dtype: float64

input

 s = pd.Series(np.random.randint(10,20,size=20)) #创建一个序列
 s

output

0     12
1     10
2     12
3     19
4     18
5     12
6     14
7     10
8     14
9     13
10    14
11    14
12    17
13    18
14    14
15    15
16    12
17    19
18    15
19    10
dtype: int64

input

s.value_counts() #value_counts(): 统计每个数出现了多少次

output

14    5
12    4
10    3
19    2
18    2
15    2
17    1
13    1
dtype: int64

input

s.mode() #产生了最多的数字。0是标签,最多的数是14

output

0    14
dtype: int64

input

df = pd.DataFrame(np.random.randn(10,4),columns = list('ABCD'))
df

output

ABCD
00.336937-1.5363051.0653850.443815
10.258999-0.4922260.0316240.293882
21.170411-0.7696400.0319390.410354
3-0.664322-0.767787-0.1734392.197244
41.8748291.2131790.697452-0.463159
52.366758-1.818289-0.567515-0.429355
61.1135810.370977-1.5148330.768817
7-0.481723-0.342043-0.2610160.402664
81.5857740.170456-0.9501960.780025
90.950888-0.6994420.0769221.226366

input

df.iloc[:3] #取前三行

output

ABCD
00.336937-1.5363051.0653850.443815
10.258999-0.4922260.0316240.293882
21.170411-0.7696400.0319390.410354

input

df.iloc[3:7]

output

ABCD
3-0.664322-0.767787-0.1734392.197244
41.8748291.2131790.697452-0.463159
52.366758-1.818289-0.567515-0.429355
61.1135810.370977-1.5148330.768817

input

df.iloc[7:]

output

ABCD
7-0.481723-0.342043-0.2610160.402664
81.5857740.170456-0.9501960.780025
90.950888-0.6994420.0769221.226366

input

df1 = pd .concat([df.iloc[:3],df.iloc[3:7],df.iloc[7:]]) #concat:合并

input

df == df1 #看看合并后的和原来的是否相等

output

ABCD
0TrueTrueTrueTrue
1TrueTrueTrueTrue
2TrueTrueTrueTrue
3TrueTrueTrueTrue
4TrueTrueTrueTrue
5TrueTrueTrueTrue
6TrueTrueTrueTrue
7TrueTrueTrueTrue
8TrueTrueTrueTrue
9TrueTrueTrueTrue

input

(df == df1).all().all() #比上面那个看起来简单
True

input

left = pd.DataFrame({'key':['foo','foo'],'lval':[1,2]})
right = pd.DataFrame({'key':['foo','foo'],'rval':[4,5]})

input

left
keylval
0foo1
1foo2

input

right

output

keyrval
0foo4
1foo5

input

pd.merge(left,right,on='key') # merge:通过中介key来把两个表合起来

output

keylvalrval
0foo14
1foo15
2foo24
3foo25

input

s = pd.Series(np.random.randint(1,5,size=4),index=list('ABCD'))
s

output

A    2
B    4
C    2
D    3
dtype: int64

input

df.append(s, ignore_index=True) #和merge功能差不多

output

ABCD
00.336937-1.5363051.0653850.443815
10.258999-0.4922260.0316240.293882
21.170411-0.7696400.0319390.410354
3-0.664322-0.767787-0.1734392.197244
41.8748291.2131790.697452-0.463159
52.366758-1.818289-0.567515-0.429355
61.1135810.370977-1.5148330.768817
7-0.481723-0.342043-0.2610160.402664
81.5857740.170456-0.9501960.780025
90.950888-0.6994420.0769221.226366
102.0000004.0000002.0000003.000000

input

df = pd.DataFrame({'A' : ['foo','bar','foo','bar',
                       'foo','bar','foo','foo'],
                  'B' : ['one','one','two','three',
                       'two','two','one','three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df

output

ABCD
0fooone1.3286731.462918
1barone0.9419170.617067
2footwo1.401242-0.877970
3barthree-0.9137391.528186
4footwo-0.8433091.093093
5bartwo0.5720831.853178
6fooone-0.7258310.983818
7foothree0.006845-0.916164

input

df.groupby('A').sum( ) #把A进行分组,分完组后进行一定的计算,这里是求和

output

CD
A
bar0.6002613.998431
foo1.1676201.745695

input

df.groupby(['A','B']).sum()

output

CD
AB
barone0.9419170.617067
three-0.9137391.528186
two0.5720831.853178
fooone0.6028422.446736
three0.006845-0.916164
two0.5579330.215123

input

df.groupby(['B','A']).sum()

output

CD
BA
onebar0.9419170.617067
foo0.6028422.446736
threebar-0.9137391.528186
foo0.006845-0.916164
twobar0.5720831.853178
foo0.5579330.215123
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值