# -*- coding: utf-8 -*-
"""
Created on Fri Jan 25 15:08:25 2019
@author: ZengWei
"""
import pandas as pd
import numpy as np
# Part 1: basic operations
# A Series that contains one missing value (NaN).
s = pd.Series([1, 3, 6, np.nan, 44, 1])

# Six consecutive dates starting at 2016-01-01, used as the row index of df.
dates = pd.date_range('20160101', periods=6)
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)

# With no index/columns given, pandas falls back to 0..n-1 labels.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))

# Building a frame from a mapping: scalars are broadcast to the length of
# the array-like values, and every column keeps its own dtype.
df2 = pd.DataFrame(dict(
    A=1,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    D=np.array([3, 3, 3, 3], dtype='int32'),
    E=pd.Categorical(['test', 'train', 'test', 'train']),
    F='foo',
))

# Inspection helpers. These are bare expressions: their results are only
# echoed in an interactive console and are discarded when run as a script.
df2.dtypes      # dtype of each column
df2.index       # row labels
df2.columns     # column labels
df2.values      # underlying data as a NumPy array
df2.describe()  # summary statistics of the numeric columns
df2.T           # transpose

# Sorting returns a new frame; df2 itself is left untouched.
df2.sort_index(axis=1, ascending=False)  # by column label, descending
df2.sort_index(axis=0, ascending=False)  # by row label, descending
df2.sort_values(by='E')                  # by the values of column E
# Part 2: selecting data
# df has columns 'a'..'d' and a daily DatetimeIndex starting 2016-01-01, so
# every label used below has to match those names and dates.

# A single column, by key and by attribute access.
print(df['a'], df.a)
# Row slices: by position, and by date label (label slices include the end).
print(df[0:3], df['20160102':'20160104'])

# Select by label: loc
print(df.loc['20160102'])
print(df.loc[:, ['a', 'b']])

# Select by position: iloc
print(df.iloc[3:5, 1:3])
print(df.iloc[[1, 3, 5], 1:3])

# Mixed selection (rows by position, columns by label). The old `.ix`
# indexer was removed in pandas 1.0; translate the positions into labels
# and use `.loc` instead.
print(df.loc[df.index[:3], ['a', 'c']])

# Boolean indexing: keep the rows whose 'a' value is positive. (The data is
# standard-normal, so a threshold like 8 would never match anything.)
print(df[df.a > 0])
# Part 3: assigning values
# Locate the target first, then assign to it.

# A single cell, by position and by label. The labels must exist in df
# (dates start at 2016-01-01, columns are 'a'..'d'); assigning through
# `.loc` with an unknown label would silently add a new row/column instead.
df.iloc[2, 2] = 11
df.loc['20160101', 'a'] = 22

# Conditional assignment to one column. Use a single `.loc` call rather than
# the chained form `df.b[mask] = 0`, which may write to a temporary copy.
# This runs before the row-wise assignment below, because that one resets
# 'a' as well and would leave no row matching the condition.
df.loc[df.a > 0, 'b'] = 0
# Conditional assignment to whole rows: every column of a matching row is
# set to 0.
df[df.a > 0] = 0

# Add columns: a scalar is broadcast to every row, a Series is aligned on
# the index.
df['F'] = np.nan
df['E'] = pd.Series([1, 2, 3, 4, 5, 6],
                    index=pd.date_range('20160101', periods=6))
# Part 4: handling missing data

# Drop rows (axis=0) that contain a NaN. how='any' drops a row as soon as
# one value is missing; how='all' only drops rows that are entirely NaN.
print(df.dropna(axis=0, how='any'))
# Element-wise mask of the missing values.
print(df.isnull())
# True if at least one value in the whole frame is missing. Reducing the
# underlying boolean array gives a single bool directly, without relying on
# NumPy-to-pandas dispatch or a redundant `== True` comparison.
print(df.isnull().values.any())
# Replace every missing value with 0 (returns a new frame).
print(df.fillna(value=0))
# Part 5: import / export
# Read a CSV file into a DataFrame, then serialise that frame as a pickle.
# Both paths are relative to the current working directory.
data = pd.read_csv(filepath_or_buffer='finename.csv')
data.to_pickle(path='some.pickle')
# Part 6: combining with concat
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
# Stack the frames vertically and renumber the rows 0..8.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

# join: 'outer' (default) keeps the union of the columns and fills the gaps
# with NaN; 'inner' keeps only the columns shared by every frame.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'],
                   index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'],
                   index=[2, 3, 4])
res1 = pd.concat([df1, df2], join='outer')
res2 = pd.concat([df1, df2], join='inner', ignore_index=True)

# Side-by-side concat restricted to df1's rows. The `join_axes` argument was
# removed in pandas 1.0; reindexing the other frame first gives the same
# result.
res3 = pd.concat([df1, df2.reindex(df1.index)], axis=1)

# Appending rows. `DataFrame.append` was removed in pandas 2.0, so use
# concat. It accepts any number of frames, e.g.
# pd.concat([df1, df3, df2], ignore_index=True).
res4 = pd.concat([df1, df2], ignore_index=True)

# Appending a Series as one extra row: wrap it in a one-row frame so its
# index labels line up with df1's columns.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res5 = pd.concat([df1, pd.DataFrame([s1])], ignore_index=True)
# Part 7: combining with merge

# Two frames that share a single key column.
left = pd.DataFrame(dict(
    key=['K0', 'K1', 'K2', 'K3'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))
right = pd.DataFrame(dict(
    key=['K0', 'K1', 'K2', 'K3'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))
# Rows are paired up wherever the value of 'key' matches.
res = left.merge(right, on='key')

# Merging on two key columns at once.
left = pd.DataFrame(dict(
    key1=['K0', 'K0', 'K1', 'K2'],
    key2=['K0', 'K1', 'K0', 'K1'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))
right = pd.DataFrame(dict(
    key1=['K0', 'K1', 'K1', 'K2'],
    key2=['K0', 'K0', 'K0', 'K0'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))
# `how` is one of 'left', 'right', 'outer', 'inner'. With 'inner', only
# (key1, key2) pairs present in both frames survive.
res = left.merge(right, on=['key1', 'key2'], how='inner')
# Part 8: visualisation
import matplotlib.pyplot as plt

# Line plot of a Series: the running sum of 1000 random draws.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
data.plot()
plt.show()

# Line plot of a DataFrame: one line per column A-D.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['A', 'B', 'C', 'D'],
).cumsum()
data.plot()
plt.show()

# Other plot methods available under `.plot`:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'pie'

# Two scatter series on the same axes: the first call creates the axes and
# the second call draws onto them via `ax=`.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class 1')
data.plot.scatter(x='A', y='C', color='DarkGreen', label='Class 2', ax=ax)
plt.show()