一.pandas 简介
如果说 numpy 相当于列表,那么 pandas 就相当于字典,只是比 numpy 高级了一点点~
import numpy as np
import pandas as pd

# Build a one-dimensional labelled array; one entry is deliberately missing.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])
print(s)
看一下输出,自动加上序号,还有数据类型
创建一个行索引
# Six consecutive daily timestamps starting on 2020-08-16, to be used as row labels.
dates = pd.date_range(start='20200816', periods=6)
print(dates)
创建一个有名字的numpy数组
# Without index/columns arguments the frame gets default labels 0, 1, 2, ...
df = pd.DataFrame(np.arange(12).reshape(3, 4))
# Rebuild df with explicit labels: rows are the dates, columns are a-d.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
print(df)
完整代码
import numpy as np
import pandas as pd

# A Series is a one-dimensional labelled array; one value here is missing.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)

# Six consecutive daily timestamps, used below as the row index.
dates = pd.date_range('20200816', periods=6)
print(dates)

# 6x4 frame of random values: rows labelled by date, columns labelled a-d.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)

# The bare expressions below only display a value in an interactive session
# or notebook; wrap them in print() to see output when run as a script.
df.dtypes  # dtype of each column
df.index  # row labels
df.columns  # column labels
df.values  # the underlying data as a NumPy array
df.describe()  # summary statistics for each column
df.T  # transpose
df.sort_index(axis=1, ascending=False)  # sort the column labels in descending order
# Sort the rows by the values of one column. The column has to exist in df:
# the previous by='E' raised KeyError because the columns are a-d.
df.sort_values(by='a')
二.pandas 数据选择
import numpy as np
import pandas as pd

dates = pd.date_range('20200816', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])

# Two equivalent ways to pull a single column out of the frame.
df['a']
df.a

# Row selection by slicing: first by position, then by date label.
df[0:3]
df['2020-08-16':'2020-08-20']

# Select by label: loc
df.loc['2020-08-20']  # the row labelled 2020-08-20
df.loc[:, ['a', 'b']]  # every row of columns a and b
df.loc['2020-08-18', ['a', 'b']]  # columns a and b of the row 2020-08-18

# Select by position: iloc (positions are 0-based)
df.iloc[3]  # row at position 3, i.e. the fourth row
df.iloc[3, 1]  # row position 3, column position 1
df.iloc[3:5, 1:3]  # row positions 3-4 and column positions 1-2 (end excluded)
df.iloc[[1, 3, 5], 1:3]  # non-contiguous row positions 1, 3 and 5

# Mixed selection: rows by position, columns by label. The old `.ix`
# indexer was removed in pandas 1.0, so translate the positions to labels
# and go through loc instead.
df.loc[df.index[:3], ['a', 'c']]

# Boolean indexing: keep only the rows that satisfy a condition.
df[df.a > 8]
三.pandas 设置值
import numpy as np
import pandas as pd

dates = pd.date_range('20200816', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])

df.iloc[2, 2] = 111  # set a single cell by position
# Conditional update restricted to column a. A single .loc call writes into
# df itself, whereas the chained form df.a[mask] = 0 can end up assigning to
# a temporary object and leave df untouched.
df.loc[df.a > 4, 'a'] = 0
df.loc['2020-08-17', 'b'] = 222  # set a single cell by label
df['f'] = np.nan  # add a new column filled with NaN
# Add a column from a Series. The assignment aligns on the index, so the
# Series must carry the same dates as df; with a different date range
# (previously '20200716') every value in the new column came out as NaN.
df['e'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)
print(df)
四.pandas 处理丢失数据
import numpy as np
import pandas as pd

dates = pd.date_range('20200816', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])

# Knock out two cells to simulate missing data. iloc takes a (row, column)
# pair of integers; the previous `df.iloc[0.1]` was a typo that raised.
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan

# how='any' drops a row as soon as one value in it is missing;
# how='all' drops it only when every value in it is missing.
print(df.dropna(axis=0, how='any'))
print(df.fillna(value=0))  # replace every missing value with 0
df.isnull()  # boolean mask marking the missing cells
df.isnull().values.any()  # True when at least one cell in the whole frame is missing
五.pandas 数据导入Pycharm
import pandas as pd

# Raw string: the backslashes of the Windows path are kept literally instead
# of being parsed as (invalid) escape sequences such as \C and \_.
data = pd.read_csv(r'D:\ChromeCoreDownloads\新冠疫情预测\_us-counties.csv')
print(data)
效果
六.pandas 合并
(1)concat 合并
import numpy as np
import pandas as pd

# Concatenating: three 3x4 frames filled with 0, 1 and 2, all with columns a-d.
df1, df2, df3 = (
    pd.DataFrame(np.ones((3, 4)) * fill, columns=['a', 'b', 'c', 'd'])
    for fill in (0, 1, 2)
)
# Stack the frames on top of each other along the row axis.
res = pd.concat([df1, df2, df3], axis=0)
print(res)
此时的打印输出
加上 ignore_index=True 参数后,行索引就会按顺序重新编号
# Stack the three frames along the rows and renumber the row labels from 0.
res = pd.concat(
    [df1, df2, df3],
    axis=0,
    ignore_index=True,
)
join属性
import numpy as np
import pandas as pd

# join accepts 'inner' or 'outer'.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(df1)
print(df2)
# 'outer' is the default: keep the union of the columns and fill the cells
# that one of the frames does not have with NaN.
res = pd.concat([df1, df2], join='outer')
# 'inner': keep only the columns that both frames share.
res = pd.concat([df1, df2], join='inner')
# Side-by-side concatenation restricted to df1's row labels. The join_axes
# keyword was removed in pandas 1.0; reindexing the result is the replacement.
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
print(res)
添加数据
import numpy as np
import pandas as pd

df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
# Put df2 below df1. DataFrame.append was removed in pandas 2.0, so use
# pd.concat, which produces the same result.
res = pd.concat([df1, df2], ignore_index=True)
# Add a single row at the bottom: turn the Series into a one-row frame
# (its index becomes the column labels) and concatenate it.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
(2)merge 合并
merging two df by key/keys.(may be used in database)
import numpy as np
import pandas as pd

# Two tables that carry the same values in their shared 'key' column.
left = pd.DataFrame(
    {
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
right = pd.DataFrame(
    {
        'key': ['K0', 'K1', 'K2', 'K3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)
# Database-style join of the two tables on the 'key' column.
res = left.merge(right, on='key')
print(left)
print(right)
print(res)
效果(看图比描述更清晰~)
consider two keys
import numpy as np
import pandas as pd

# Both tables are identified by the pair (key1, key2).
left = pd.DataFrame(
    {
        'key1': ['K0', 'K0', 'K1', 'K2'],
        'key2': ['K0', 'K1', 'K0', 'K1'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
right = pd.DataFrame(
    {
        'key1': ['K0', 'K1', 'K1', 'K2'],
        'key2': ['K0', 'K1', 'K0', 'K1'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)
# Join on both key columns; how may be 'left', 'right', 'inner' or 'outer'.
res = left.merge(right, how='inner', on=['key1', 'key2'])
print(left)
print(right)
print(res)
indicator
import pandas as pd

df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
# indicator=True adds a '_merge' column telling where each row came from
# (left_only / right_only / both).
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
# Give the indicator column a custom name by passing a string instead of
# True, and keep the result (the earlier call repeated indicator=True and
# threw its return value away).
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(df1)
print(df2)
print(res)
增加合并属性
import numpy as np
import pandas as pd

# Here the join keys K0-K3 live in the row index rather than in a column.
left = pd.DataFrame(
    {
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    },
    index=['K0', 'K1', 'K2', 'K3'],
)
right = pd.DataFrame(
    {
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    },
    index=['K0', 'K1', 'K2', 'K3'],
)
# Outer join that matches rows through the index of both frames.
res = left.merge(right, how='outer', left_index=True, right_index=True)
print(left)
print(right)
print(res)
区分名字相同但是内涵不同的数据
import pandas as pd

# Both tables have an 'age' column that means something different in each.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
# The suffixes tell the two overlapping 'age' columns apart in the result.
res = boys.merge(
    girls,
    how='inner',
    on='k',
    suffixes=['_boy', '_girl'],
)
print(boys)
print(girls)
print(res)