# test1: creating a DataFrame and inspecting it
import pandas as pd
import numpy as np
# Build a small mixed-dtype frame. The row labels come from the Series in
# column 'C'; the scalar entries are repeated for every row.
row_labels = ['one', 'two', 'three', 'four']
column_data = {
    'A': 1.,
    'B': pd.Timestamp('20190326'),
    'C': pd.Series(1., index=row_labels, dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': 'fool',
}
df2 = pd.DataFrame(column_data)

# Print the frame itself, then its metadata and a few derived views.
for view in (
    df2,
    df2.dtypes,
    df2.index,
    df2.columns,
    df2.values,
    df2.describe(),
    df2.T,
    df2.sort_index(axis=1, ascending=False),  # columns in descending label order
    df2.sort_index(axis=0, ascending=False),  # rows in descending label order
):
    print(view)
# test2: selecting data
import numpy as np
import pandas as pd
dates = pd.date_range('20130101', periods=6)
values = np.arange(24).reshape(6, 4)
df = pd.DataFrame(values, index=dates, columns=['A', 'B', 'C', 'D'])
print(df)

# Column selection: item access and attribute access return the same column.
print(df['A'])
print(df.A)

# Row slicing with []: first by position, then by date label.
print(df[:3])
print(df['20130101':'20130103'])

# Select by label: loc
print(df.loc['20130101'])  # one row
print(df.loc[:, 'A'])  # one column, all rows
print(df.loc['20130101', ['A', 'B']])  # one row, two columns

# Select by position: iloc
print(df.iloc[3])  # fourth row
print(df.iloc[3, 1])  # single cell
print(df.iloc[1:3, 1:2])  # sub-frame

# Boolean indexing: keep the rows where column A is greater than 8.
a_above_8 = df.A > 8
print(df[a_above_8])
# test3: setting values
import numpy as np
import pandas as pd
dates = pd.date_range('20190326', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)

# Set a single cell by position (third row, fourth column).
df.iloc[2, 3] = 111
print(df)

# Set a single cell by label.
df.loc['20190326', 'A'] = 123
print(df)

# Set every column to 0 in the rows where column A is greater than 16.
df[df.A > 16] = 0
print(df)

# Set only column B to 0 in the rows where column A is greater than 13.
# One .loc call selects rows and column together. The chained form
# df.B[mask] = 0 assigns through an intermediate Series, which triggers
# SettingWithCopyWarning and does not update df under Copy-on-Write.
df.loc[df.A > 13, 'B'] = 0
print(df)

# Add column F filled with NaN (cells can also be assigned one at a time).
df['F'] = np.nan
print(df)

# Add column E from a Series; its values are matched to rows by index label.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20190326', periods=6))
print(df)
# test4: handling missing data
import numpy as np
import pandas as pd
dates = pd.date_range('20190327', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])

# Integer columns cannot hold NaN. Cast the two columns that receive missing
# values to float up front instead of relying on implicit upcasting during
# assignment, which pandas deprecated in 2.1.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)

# Element-wise missing-value mask: True where a value is missing, else False.
print(df.isnull())

# True if at least one value anywhere in the frame is missing.
print(df.isnull().values.any())

# Drop every row (axis=0) that contains any NaN. how is 'any' or 'all';
# with how='all' a row (or column) is dropped only when all of its
# elements are NaN.
print(df.dropna(axis=0, how='any'))

# Replace NaN with 0.
print(df.fillna(value=0))
# test5: concatenating DataFrames
import numpy as np
import pandas as pd
# Concatenating
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
print(df1)
print(df2)
print(df3)

# Stack the frames vertically. ignore_index=True renumbers the rows, which
# avoids the repeated row labels the inputs would otherwise contribute.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)

# join: 'inner' or 'outer'
df4 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=['1', '2', '3'])
df5 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=['2', '3', '4'])
print(df4)
print(df5)

# join='outer' is the default: same-named columns are stacked vertically and
# cells a frame has no value for are filled with NaN.
res2 = pd.concat([df4, df5], join='outer')
print(res2)

# join='inner' keeps only the columns both frames share and drops the rest.
# The repeated row labels can be handled with ignore_index=True.
res3 = pd.concat([df4, df5], join='inner')
print(res3)

# Side-by-side concatenation (axis=1) restricted to df4's row labels: rows
# that exist only in df5 are dropped, and df4 rows missing from df5 get NaN.
# The join_axes argument that used to do this was removed from pd.concat,
# so df5 is reindexed to df4's index before concatenating.
res4 = pd.concat([df4, df5.reindex(df4.index)], axis=1)
print(res4)
# test6: merging DataFrames
import numpy as np
import pandas as pd
# Merging two DataFrames by one or more key columns
left = pd.DataFrame({
    'Key': ['k0', 'k1', 'k2', 'k3'],
    'A': ['a0', 'a1', 'a2', 'a3'],
    'B': ['b0', 'b1', 'b2', 'b3'],
})
right = pd.DataFrame({
    'Key': ['k0', 'k1', 'k2', 'k3'],
    'C': ['c0', 'c1', 'c2', 'c3'],
    'D': ['d0', 'd1', 'd2', 'd3'],
})
print(left)
print(right)
res = pd.merge(left, right, on='Key')
print(res)

# Two key columns
left1 = pd.DataFrame({
    'Key1': ['k0', 'k0', 'k1', 'k2'],
    'Key2': ['k0', 'k1', 'k0', 'k1'],
    'A': ['a0', 'a1', 'a2', 'a3'],
    'B': ['b0', 'b1', 'b2', 'b3'],
})
right1 = pd.DataFrame({
    'Key1': ['k0', 'k1', 'k1', 'k2'],
    'Key2': ['k0', 'k0', 'k0', 'k0'],
    'C': ['c0', 'c1', 'c2', 'c3'],
    'D': ['d0', 'd1', 'd2', 'd3'],
})
print(left1)
print(right1)

# how defaults to 'inner': key pairs present on both sides are kept and the
# others are dropped. left1 has one (k1, k0) row while right1 has two, so the
# A/B values of that left1 row are repeated once for each matching C/D pair.
#
# how is one of 'left', 'right', 'inner', 'outer':
#   how='outer'  keep everything; undefined cells are filled with NaN
#   how='left'   follow left; undefined cells are filled with NaN
#   how='right'  follow right; undefined cells are filled with NaN
key_cols = ['Key1', 'Key2']
res1 = pd.merge(left1, right1, on=key_cols)
print(res1)

# indicator=True adds an extra column named _merge whose values are
# both, left_only or right_only.
res2 = pd.merge(left1, right1, on=key_cols, how='outer', indicator=True)
print(res2)

# Give the indicator column a custom name.
res3 = pd.merge(left1, right1, on=key_cols, how='outer', indicator='indicator_column')
print(res3)

# left_index and right_index: with both set to True, the index of each of
# the two frames is taken into account for the merge.
res4 = pd.merge(left1, right1, left_index=True, right_index=True, how='outer')
print(res4)

# Overlapping column names: when both frames have a column with the same
# name, a suffix can be appended to tell the two apart.
boys = pd.DataFrame({
    'K': ['k0', 'k1', 'k2'],
    'age': [1, 2, 3],
})
girls = pd.DataFrame({
    'K': ['k0', 'k0', 'k3'],
    'age': [4, 5, 6],
})
print(boys)
print(girls)
res5 = pd.merge(boys, girls, on='K', suffixes=['_boys', '_girls'])
print(res5)
# test7: plotting with pandas and matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Plot data

# Series: 1000 random values indexed 0..999, accumulated into a running sum.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
data.plot()
plt.show()

# With the plt module, plt.plot(...) shows data as lines and
# plt.scatter(x=..., y=...) shows it as points; pandas draws the same kinds
# of chart directly from the data.
# Other plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'

# DataFrame
data1 = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD"),
)
# Print the first 3 rows; without a row count head() defaults to 5.
print(data1.head(3))
data2 = data1.cumsum()
data1.plot()
data2.plot()
plt.show()

# A scatter plot takes exactly two attributes, so x and y must be specified.
data2.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
plt.show()

# Several groups of data on one scatter plot: the Axes returned by the first
# call is passed to the second call through ax=.
a = data2.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
data1.plot.scatter(x='A', y='C', color='DarkGreen', label='Class2', ax=a)
plt.show()