1、pandas基本介绍
1.1 series
https://www.cnblogs.com/songzhixue/p/11341440.html
1.2periods
https://blog.csdn.net/starter_____/article/details/81432956
1.3DataFrame
https://blog.csdn.net/wei_lin/article/details/93492252
1.4Categoricals
能够有效地编码并显著性地提高了文本分类的效率。
https://blog.csdn.net/FrankieHello/article/details/81220116
import numpy as np
import pandas as pd
series
s = pd.Series([1,3,6,np.nan,44,1]) #np.nan相当于none
print(s)
dates = pd.date_range('20200411',periods=6) #periods=6是`在这里插入代码片`生成6个数据
print(dates)
#随机生成一个6行4列的数,行索引(index)是dates,列索引(columns)是columns
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print('df',df)
#没规定索引名称时默认为数字
df1 = pd.DataFrame(np.random.randn(12).reshape((3,4)))
print('df1',df1)
也可以用字典生成df
df2 = pd.DataFrame({
'a':1.,
'b':pd.Timestamp('20200411'),
'c':pd.Series(1,index=list(range(4)),dtype='float64'),
'd':np.array([3]*4,dtype='int32'),
'e':pd.Categorical(['test','train','test','train']),
'f':'foo'
})
print(df2)
#查看dtype
print(df2.dtypes)
#查看所有列的序号和dtype
print(df2.index)
#查看所有行的序号和dtype
print(df2.columns) #查看所有行的序号和dtype
打印值的名字(values)
print(df2.values)
#discribe只能运算数字形式的(和,平均值,方差,最小值,最大值)
print(df2.describe())
转置
print(df2.T)
#sort排序:按照index(行)
print(df2.sort_index(axis=1,ascending=False)) #ascending=False以倒叙排序
#sort排序:按照index(列)
print(df2.sort_index(axis=0,ascending=True)) #ascending=True以正叙排序
#sort排序:按照values
print(df2.sort_values(by='e')) #按e的值排序
———————————————————————————
2、pandas选择数据
import numpy as np
import pandas as pd
dates = pd.date_range('20200411',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['a','b','c','d']) #从0-23共24个数生成6行4的序列,列标(index)是dates,行表(columns)是a,b,c,d
print(df['a'],df.a) #这两种输出方法相同
#按照切片选择
print(df[0:3],df['20200411':'20200413'])
#select by label(按标签选):loc
print(df.loc['20200411']) #按行选择
print(df.loc[:,['a','b']]) #按列选择
print(df.loc['20200411',['a','b']]) #选择行为20200411的a列和b列
#select by position(按位置选): iloc
print(df.iloc[3]) #取第三行
print(df.iloc[3,1]) #取第三行第一列
print(df.iloc[3:5,1:3]) #取第三行到第五行的第一列到第三列
print(df.iloc[[1,3,5],1:3]) #逐个筛选,选取第1,3,5行的第一列到第三列
#mixed selection: ix 标签和位置结合起来筛选
print(df.ix[:3,['a','c']]) #!!!报错原因:在pandas的1.0.0版本开始,移除了Series.ix and DataFrame.ix 方法
#boolean indexing 是否筛选
print(df)
print(df[df.a>8]) #筛选df中a列大于8的数
————————————————————————————
3、pandas设置值
import numpy as np
import pandas as pd
dates = pd.date_range('20200411',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['a','b','c','d']) #从0-23共24个数生成6行4的序列,列标(index)是dates,行表(columns)是a,b,c,d
# 3.1 将第二行第二列的值改为111
df.iloc[2,2] = 1111
print('df按位置更改后',df)
# 3.2 通过标签更改
df.loc['20200411','b'] = 222
print('df按标签更改后',df)
# 3.3 此时更改的是全部值
df[df.a>4] = 0
print('通过比较更改1',df)
# 3.4 此时只更改a列中符合a大于4的数
df.a[df.a>4] = 0
print('通过比较只更改a列',df)
# 3.5 此时只更改b列中符合a大于4的数
df.b[df.a>4] = 0
print('通过此时只更改b列',df)
# 3.6 加一个空列f为空
df['f'] = np.nan
print(df)
# 3.7 加一个非空新列
df['e'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('20200411',periods=6)) #添加一个列名为e的内容为1,2,3,4,5,6的列
print(df)
————————————————————————————
4、pandas处理丢失数据
import numpy as np
import pandas as pd
dates = pd.date_range('20200411',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['a','b','c','d']) #从0-23共24个数生成6行4的序列,列标(index)是dates,行表(columns)是a,b,c,d
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)
# 4.1 how={'any','all'} 默认how=any
print(df.dropna(axis=0,how='any'))
print(df.dropna(axis=1,how='any')) #any是只有存在一个nan就删除一整列
print(df.dropna(axis=0,how='all')) #all是删除全是nan的一整行
print(df.dropna(axis=1,how='all')) #all是删除全是nan的一整列
# 4.2 df.fillna(value=?) 将nan值填上
print(df.fillna(value=0)) #将nan值填为0
print(df.fillna(value=1)) #将nan值填为1
# 4.3 检查全文是否含有缺失数据 结果true是有nan,false是没有空
# print(df.isnull())
#查看全文是否有缺失数据,因为缺失为true,所以更简单快捷
print(np.any(df.isnull()) ==True)
————————————————————————————
5、pandas导入导出数据
read_csv
read_excel
read_hdf
read_sql
read_json
read_msgpack(experimental)
read_html
read_gbq(experimental)
read_stata
read_sas
read_clipboard
read_pickle
eg:
import pandas as pd
data = pd.read_csv(‘位置/名称.csv’)
data = pd.read_hdf(‘位置/名称’)
———————————————————————————
6、pandas合并concat
import numpy as np
import pandas as pd
# concatenating 合并
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d']) #值全为0
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d']) #值全为1
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d']) #值全为2
print('df1',df1)
print('df2',df2)
print('df3',df3)
#6.1 竖向合并
res1 = pd.concat([df1,df2,df3],axis=0)
print('竖向合并',res1)
#6.2 竖向合并,并更改列标
res2 = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print('更改列标',res2)
#6.3 join,['inner','outer']
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d']) #值全为0
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e']) #值全为1
print(df1)
print(df2)
# outer是存异
res1 = pd.concat([df1,df2])
res1 = pd.concat([df1,df2],join='outer') #默认outer,所以不写也行
print(res1)
# inner是求同
res2 = pd.concat([df1,df2],join='inner') #默认outer,所以不写也行
print(res2)
res3 = pd.concat([df1,df2],join='inner',ignore_index=True) #ignore_index=True:处理序号问题,按true是按序排列
print(res3)
#6.4join_axes
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3]) #值全为0
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4]) #值全为1
res = pd.concat([df1,df2],axis=1,join_axes=[df1.index])
print(res)
#6.4 appten
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d']) #值全为0
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e']) #值全为1
res1 = df1.append(df2,ignore_index=True)
s1 = pd.Series([1,2,3,4],index=['a','b','c','d'])
print(res1)
res2 = df1.append(s1,ignore_index=True)
print(res2)
———————————————————————————
7、pandas合并merge
import numpy as np
import pandas as pd
left1 = pd.DataFrame({'key':['k0','k1','k2','k3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right1 = pd.DataFrame({'key':['k0','k1','k2','k3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left1)
print(right1)
#7.1merging two df by key/keys.(may be used in database)
res1 = pd.merge(left1,right1,on='key')
# print(res1)
#consider two keys
left2 = pd.DataFrame({'key1':['k0','k0','k1','k2'],
'key2':['k0','k0','k0','k1'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right2 = pd.DataFrame({'key1':['k0','k1','k1','k2'],
'key2':['k0','k0','k0','k0'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left2)
print(right2)
#7.2how = ['left','right','outer','inner']
res2 = pd.merge(left2,right2,on=['key1','key2'])
res2 = pd.merge(left2,right2,on=['key1','key2'],how='inner') #默认为inner
print(res2)
res3 = pd.merge(left2,right2,on=['key1','key2'],how='outer')
print(res3)
#7.3indicator
df1 = pd.DataFrame({'col1':[0,1],'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,3],'col_right':[2,2,2]})
print(df1)
print(df2)
res1 = pd.merge(df1,df2,on='col1',how='outer',indicator=True)
print(res1)
#give the indicator a custom name
res2 = pd.merge(df1,df2,on='col1',how='outer',indicator='indicator_column') #自定义名称
print(res2)
#7.4 merged by index
left = pd.DataFrame({ 'A':['A0','A1','A2'],
'B':['B0','B1','B2']},
index=['k0','k1','k2'])
right= pd.DataFrame({ 'C':['C0','C3','C3'],
'D':['D0','D2','D3']},
index=['k0','k2','k3'])
print(left)
print(right)
#ledt_index and right_index
res1 = pd.merge(left,right,left_index=True,right_index=True,how='outer') #存异
res2 = pd.merge(left,right,left_index=True,right_index=True,how='inner') #求同
print(res1)
print(res2)
#7.5 handle overlapping
boys = pd.DataFrame({'k':['k0','k1','k3'],'age':[1,2,3]})
print(boys)
girls = pd.DataFrame({'k':['k0','k0','k3'],'age':[4,5,6]})
print(girls)
res1 = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='inner') #求同
print(res1)
res2 = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='outer') #存异
print(res2)