pandas基础知识

1、pandas基本介绍

1.1 series
https://www.cnblogs.com/songzhixue/p/11341440.html
1.2periods
https://blog.csdn.net/starter_____/article/details/81432956
1.3DataFrame
https://blog.csdn.net/wei_lin/article/details/93492252
1.4Categoricals
能够有效地编码并显著性地提高了文本分类的效率。
https://blog.csdn.net/FrankieHello/article/details/81220116
import numpy as np
import pandas as pd

series

s = pd.Series([1,3,6,np.nan,44,1])       #np.nan相当于none
print(s)
dates = pd.date_range('20200411',periods=6)     #periods=6是`在这里插入代码片`生成6个数据
print(dates)
#随机生成一个6行4列的数,行索引(index)是dates,列索引(columns)是columns
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print('df',df)
#没规定索引名称时默认为数字
df1 = pd.DataFrame(np.random.randn(12).reshape((3,4)))
print('df1',df1)

也可以用字典生成df

df2 = pd.DataFrame({
    'a':1.,
    'b':pd.Timestamp('20200411'),
    'c':pd.Series(1,index=list(range(4)),dtype='float64'),
    'd':np.array([3]*4,dtype='int32'),
    'e':pd.Categorical(['test','train','test','train']),
    'f':'foo'
})
print(df2)
#查看dtype
print(df2.dtypes)
#查看所有列的序号和dtype
print(df2.index)
#查看所有行的序号和dtype
print(df2.columns)  #查看所有行的序号和dtype

打印值的名字(values)

print(df2.values)
#discribe只能运算数字形式的(和,平均值,方差,最小值,最大值)
print(df2.describe())

转置

print(df2.T)
#sort排序:按照index(行)
print(df2.sort_index(axis=1,ascending=False))  #ascending=False以倒叙排序
#sort排序:按照index(列)
print(df2.sort_index(axis=0,ascending=True))  #ascending=True以正叙排序
#sort排序:按照values
print(df2.sort_values(by='e'))   #按e的值排序

———————————————————————————

2、pandas选择数据

import numpy as np
import pandas as pd
dates = pd.date_range('20200411',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['a','b','c','d'])  #从0-23共24个数生成6行4的序列,列标(index)是dates,行表(columns)是a,b,c,d
print(df['a'],df.a)   #这两种输出方法相同
#按照切片选择
print(df[0:3],df['20200411':'20200413'])
#select by label(按标签选):loc
print(df.loc['20200411'])   #按行选择
print(df.loc[:,['a','b']])  #按列选择
print(df.loc['20200411',['a','b']])  #选择行为20200411的a列和b列
#select by position(按位置选): iloc
print(df.iloc[3])  #取第三行
print(df.iloc[3,1])  #取第三行第一列
print(df.iloc[3:5,1:3])  #取第三行到第五行的第一列到第三列
print(df.iloc[[1,3,5],1:3])  #逐个筛选,选取第1,3,5行的第一列到第三列
#mixed selection: ix  标签和位置结合起来筛选
print(df.ix[:3,['a','c']])   #!!!报错原因:在pandas的1.0.0版本开始,移除了Series.ix and DataFrame.ix 方法
#boolean indexing 是否筛选
print(df)
print(df[df.a>8])   #筛选df中a列大于8的数

————————————————————————————

3、pandas设置值

import numpy as np
import pandas as pd
dates = pd.date_range('20200411',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['a','b','c','d'])  #从0-23共24个数生成6行4的序列,列标(index)是dates,行表(columns)是a,b,c,d
# 3.1 将第二行第二列的值改为111
df.iloc[2,2] = 1111   
print('df按位置更改后',df)
# 3.2 通过标签更改
df.loc['20200411','b'] = 222  
print('df按标签更改后',df)
# 3.3 此时更改的是全部值
df[df.a>4] = 0   
print('通过比较更改1',df)
# 3.4 此时只更改a列中符合a大于4的数
df.a[df.a>4] = 0   
print('通过比较只更改a列',df)
# 3.5 此时只更改b列中符合a大于4的数
df.b[df.a>4] = 0   
print('通过此时只更改b列',df)
# 3.6 加一个空列f为空
df['f'] = np.nan
print(df)
# 3.7 加一个非空新列
df['e'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('20200411',periods=6))    #添加一个列名为e的内容为1,2,3,4,5,6的列
print(df)

————————————————————————————

4、pandas处理丢失数据

import numpy as np
import pandas as pd
dates = pd.date_range('20200411',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['a','b','c','d'])  #从0-23共24个数生成6行4的序列,列标(index)是dates,行表(columns)是a,b,c,d
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)
# 4.1 how={'any','all'}  默认how=any
print(df.dropna(axis=0,how='any'))  
print(df.dropna(axis=1,how='any'))    #any是只有存在一个nan就删除一整列
print(df.dropna(axis=0,how='all'))    #all是删除全是nan的一整行
print(df.dropna(axis=1,how='all'))    #all是删除全是nan的一整列
# 4.2 df.fillna(value=?) 将nan值填上
print(df.fillna(value=0))   #将nan值填为0
print(df.fillna(value=1))   #将nan值填为1
# 4.3 检查全文是否含有缺失数据  结果true是有nan,false是没有空
# print(df.isnull())
#查看全文是否有缺失数据,因为缺失为true,所以更简单快捷
print(np.any(df.isnull()) ==True)

————————————————————————————

5、pandas导入导出数据

read_csv
read_excel
read_hdf
read_sql
read_json
read_msgpack(experimental)
read_html
read_gbq(experimental)
read_stata
read_sas
read_clipboard
read_pickle
eg:
import pandas as pd
data = pd.read_csv(‘位置/名称.csv’)
data = pd.read_hdf(‘位置/名称’)

———————————————————————————

6、pandas合并concat

import numpy as np
import pandas as pd
# concatenating  合并
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])  #值全为0
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])  #值全为1
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])  #值全为2
print('df1',df1)
print('df2',df2)
print('df3',df3)
#6.1 竖向合并
res1 = pd.concat([df1,df2,df3],axis=0)
print('竖向合并',res1)
#6.2 竖向合并,并更改列标
res2 = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print('更改列标',res2)
#6.3 join,['inner','outer']
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])  #值全为0
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'])  #值全为1
print(df1)
print(df2)
# outer是存异
res1 = pd.concat([df1,df2])
res1 = pd.concat([df1,df2],join='outer')   #默认outer,所以不写也行
print(res1)
# inner是求同
res2 = pd.concat([df1,df2],join='inner')   #默认outer,所以不写也行
print(res2)
res3 = pd.concat([df1,df2],join='inner',ignore_index=True)   #ignore_index=True:处理序号问题,按true是按序排列
print(res3)
#6.4join_axes
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])  #值全为0
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])  #值全为1
res = pd.concat([df1,df2],axis=1,join_axes=[df1.index])
print(res)
#6.4 appten
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])  #值全为0
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'])  #值全为1
res1 = df1.append(df2,ignore_index=True)
s1 = pd.Series([1,2,3,4],index=['a','b','c','d'])
print(res1)
res2 = df1.append(s1,ignore_index=True)
print(res2)

———————————————————————————

7、pandas合并merge

import numpy as np
import pandas as pd
left1 = pd.DataFrame({'key':['k0','k1','k2','k3'],
                      'A':['A0','A1','A2','A3'],
                      'B':['B0','B1','B2','B3']})
right1 = pd.DataFrame({'key':['k0','k1','k2','k3'],
                      'C':['C0','C1','C2','C3'],
                      'D':['D0','D1','D2','D3']})
print(left1)
print(right1)
#7.1merging two df by key/keys.(may be used in database)
res1 = pd.merge(left1,right1,on='key')
# print(res1)
#consider two keys
left2 = pd.DataFrame({'key1':['k0','k0','k1','k2'],
                      'key2':['k0','k0','k0','k1'],
                      'A':['A0','A1','A2','A3'],
                      'B':['B0','B1','B2','B3']})

right2 = pd.DataFrame({'key1':['k0','k1','k1','k2'],
                       'key2':['k0','k0','k0','k0'],
                       'C':['C0','C1','C2','C3'],
                       'D':['D0','D1','D2','D3']})
print(left2)
print(right2)
#7.2how = ['left','right','outer','inner']
res2 = pd.merge(left2,right2,on=['key1','key2'])
res2 = pd.merge(left2,right2,on=['key1','key2'],how='inner')  #默认为inner
print(res2)
res3 = pd.merge(left2,right2,on=['key1','key2'],how='outer')  
print(res3)
#7.3indicator
df1 = pd.DataFrame({'col1':[0,1],'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,3],'col_right':[2,2,2]})
print(df1)
print(df2)
res1 = pd.merge(df1,df2,on='col1',how='outer',indicator=True)
print(res1)
#give the indicator a custom name
res2 = pd.merge(df1,df2,on='col1',how='outer',indicator='indicator_column')   #自定义名称
print(res2)
#7.4 merged by index
left = pd.DataFrame({ 'A':['A0','A1','A2'],
                      'B':['B0','B1','B2']},
                    index=['k0','k1','k2'])

right= pd.DataFrame({ 'C':['C0','C3','C3'],
                      'D':['D0','D2','D3']},
                    index=['k0','k2','k3'])
print(left)
print(right)
#ledt_index and right_index
res1 = pd.merge(left,right,left_index=True,right_index=True,how='outer')  #存异
res2 = pd.merge(left,right,left_index=True,right_index=True,how='inner')  #求同
print(res1)
print(res2)
#7.5 handle overlapping
boys = pd.DataFrame({'k':['k0','k1','k3'],'age':[1,2,3]})
print(boys)
girls = pd.DataFrame({'k':['k0','k0','k3'],'age':[4,5,6]})
print(girls)
res1 = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='inner')  #求同
print(res1)
res2 = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='outer')  #存异
print(res2)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值