Pandas学习(二)

import pandas as pd
import numpy as np
#处理丢失数据
dates = pd.date_range('20180107',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index = dates,columns=['A','B','C','D'])
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan

print df
             A     B     C   D
2018-01-07   0   NaN   2.0   3
2018-01-08   4   5.0   NaN   7
2018-01-09   8   9.0  10.0  11
2018-01-10  12  13.0  14.0  15
2018-01-11  16  17.0  18.0  19
2018-01-12  20  21.0  22.0  23
#1.直接丢弃
print df.dropna(axis=0,how='any') #how={'any','all'}
             A     B     C   D
2018-01-09   8   9.0  10.0  11
2018-01-10  12  13.0  14.0  15
2018-01-11  16  17.0  18.0  19
2018-01-12  20  21.0  22.0  23
print df.dropna(axis=0,how='all') #只有整行全部为NaN时才丢掉该行
             A     B     C   D
2018-01-07   0   NaN   2.0   3
2018-01-08   4   5.0   NaN   7
2018-01-09   8   9.0  10.0  11
2018-01-10  12  13.0  14.0  15
2018-01-11  16  17.0  18.0  19
2018-01-12  20  21.0  22.0  23
#2.填充默认数据
print df.fillna(value=0)
             A     B     C   D
2018-01-07   0   0.0   2.0   3
2018-01-08   4   5.0   0.0   7
2018-01-09   8   9.0  10.0  11
2018-01-10  12  13.0  14.0  15
2018-01-11  16  17.0  18.0  19
2018-01-12  20  21.0  22.0  23
print df.isnull() #查看是否有缺失数据(all)
                A      B      C      D
2018-01-07  False   True  False  False
2018-01-08  False  False   True  False
2018-01-09  False  False  False  False
2018-01-10  False  False  False  False
2018-01-11  False  False  False  False
2018-01-12  False  False  False  False
print np.any(df.isnull()==True) #查看是否有缺失数据(boolean)
True
#pandas数据导入
#read_csv
#read_excel
#read_sql
#read_pickle #python自带的序列化格式(pickle)
#read_json
#read_html
#....
#导出
#to_csv
#...
#pandas合并:concatenating
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
#上下合并
res = pd.concat([df1,df2,df3],axis=0)
print res
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
#但是index没变,所以我们把index改成连续
res1 = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print res1
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0
#join, ['inner','outer']
df4 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df5 = pd.DataFrame(np.ones((3,4))*2,columns=['c','d','e','f'])
#全连接(outer),缺失的部分用NaN填充
print pd.concat([df4,df5],axis=0)
     a    b    c    d    e    f
0  1.0  1.0  1.0  1.0  NaN  NaN
1  1.0  1.0  1.0  1.0  NaN  NaN
2  1.0  1.0  1.0  1.0  NaN  NaN
0  NaN  NaN  2.0  2.0  2.0  2.0
1  NaN  NaN  2.0  2.0  2.0  2.0
2  NaN  NaN  2.0  2.0  2.0  2.0
print pd.concat([df4,df5],join='outer') #默认
     a    b    c    d    e    f
0  1.0  1.0  1.0  1.0  NaN  NaN
1  1.0  1.0  1.0  1.0  NaN  NaN
2  1.0  1.0  1.0  1.0  NaN  NaN
0  NaN  NaN  2.0  2.0  2.0  2.0
1  NaN  NaN  2.0  2.0  2.0  2.0
2  NaN  NaN  2.0  2.0  2.0  2.0
print pd.concat([df4,df5],join='inner',ignore_index=True) 
     c    d
0  1.0  1.0
1  1.0  1.0
2  1.0  1.0
3  2.0  2.0
4  2.0  2.0
5  2.0  2.0
#join_axes 左右合并(注意:join_axes 参数在 pandas 1.0 中已被移除,新版本可先 concat 再用 reindex 指定索引)
df6 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'],index=[1,2,3])
df7 = pd.DataFrame(np.ones((3,4))*2,columns=['c','d','e','f'],index=[2,3,4])
print pd.concat([df6,df7],axis=1)
     a    b    c    d    c    d    e    f
1  1.0  1.0  1.0  1.0  NaN  NaN  NaN  NaN
2  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
3  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
4  NaN  NaN  NaN  NaN  2.0  2.0  2.0  2.0
print pd.concat([df4,df5],axis=1,join_axes=[df6.index])
     a    b    c    d    c    d    e    f
1  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
2  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
3  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
#append(注意:DataFrame.append 在 pandas 2.0 中已被移除,新版本请改用 pd.concat)
res3 = df1.append(df2,ignore_index=True)
print res3
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
#合并 merge
#merge two df by keys
left = pd.DataFrame({'key':['K0','K1','K2','K3'],
                    'A':['A0','A1','A2','A3'],
                    'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],
                    'C':['C0','C1','C2','C3'],
                    'D':['D0','D1','D2','D3']})
print left
    A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K2
3  A3  B3  K3
print right
    C   D key
0  C0  D0  K0
1  C1  D1  K1
2  C2  D2  K2
3  C3  D3  K3
#基于key合并
fin = pd.merge(left,right,on='key')
print fin
    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K2  C2  D2
3  A3  B3  K3  C3  D3
#如果有两个keys
left1 = pd.DataFrame({'key1':['K0','K0','K1','K2'],
                      'key2':['K0','K1','K1','K1'],
                    'A':['A0','A1','A2','A3'],
                    'B':['B0','B1','B2','B3']})
right1 = pd.DataFrame({'key1':['K0','K1','K1','K3'],
                       'key2':['K1','K1','K1','K1'],
                    'C':['C0','C1','C2','C3'],
                    'D':['D0','D1','D2','D3']})
print left1
    A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K1
3  A3  B3   K2   K1
print right1
    C   D key1 key2
0  C0  D0   K0   K1
1  C1  D1   K1   K1
2  C2  D2   K1   K1
3  C3  D3   K3   K1
#how = ['left','right','inner','outer']
print pd.merge(left1,right1,on=['key1','key2'],how='inner') #默认
    A   B key1 key2   C   D
0  A1  B1   K0   K1  C0  D0
1  A2  B2   K1   K1  C1  D1
2  A2  B2   K1   K1  C2  D2
print pd.merge(left1,right1,on=['key1','key2'],how='outer')
     A    B key1 key2    C    D
0   A0   B0   K0   K0  NaN  NaN
1   A1   B1   K0   K1   C0   D0
2   A2   B2   K1   K1   C1   D1
3   A2   B2   K1   K1   C2   D2
4   A3   B3   K2   K1  NaN  NaN
5  NaN  NaN   K3   K1   C3   D3
print pd.merge(left1,right1,on=['key1','key2'],how='left')
    A   B key1 key2    C    D
0  A0  B0   K0   K0  NaN  NaN
1  A1  B1   K0   K1   C0   D0
2  A2  B2   K1   K1   C1   D1
3  A2  B2   K1   K1   C2   D2
4  A3  B3   K2   K1  NaN  NaN
print pd.merge(left1,right1,on=['key1','key2'],how='right')
     A    B key1 key2   C   D
0   A1   B1   K0   K1  C0  D0
1   A2   B2   K1   K1  C1  D1
2   A2   B2   K1   K1  C2  D2
3  NaN  NaN   K3   K1  C3  D3
#indicator
#indicator=True #显示合并方式
#indicator='column' #自定义指示列的名字
#merge by index
#处理overlapping 修改列名
boys = pd.DataFrame({'K':['K0','K1','K2','K3'],'age':[23,22,25,22]})
girls = pd.DataFrame({'K':['K0','K0','K2','K4'],'age':[20,21,18,22]})
print boys
    K  age
0  K0   23
1  K1   22
2  K2   25
3  K3   22
ores = pd.merge(boys,girls,on='K',suffixes=['_boys','_girls'],how='inner')
print ores
    K  age_boys  age_girls
0  K0        23         20
1  K0        23         21
2  K2        25         18
# 数据可视化
import matplotlib.pyplot as plt
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()

这里写图片描述

data = pd.DataFrame(np.random.randn(1000,4),index = np.arange(1000),
                   columns = list("ABCD"))
data = data.cumsum()
print data.head()
          A         B         C         D
0  1.453690  0.685309 -0.324429 -1.226143
1  1.537888  1.641124 -1.528630 -1.622637
2  1.837019  1.439954 -0.889837  0.819762
3  2.798952  1.561226 -1.000018  1.174287
4  3.129973  1.915419 -1.451802 -0.052203
data.plot()
plt.show()

这里写图片描述

ax = data.plot.scatter(x='A',y='B',color='Blue',label='Class1')
plt.show()

这里写图片描述

axx = data.plot.scatter(x='A',y='B',color='Blue',label='Class1')
data.plot.scatter(x='A',y='C',color='Red',label='Class2',ax = axx)
plt.show()

这里写图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值