pandas
pandas基本介绍
import pandas as pd
import numpy as np
s = pd.Series([1,3,6,np.nan,44,1])
运行结果:
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
创建一个大的矩阵
定义一个自定义的列名的矩阵
#日期的范围
dates = pd.date_range('20210626',periods=6)
print(dates)
```bash
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d']) #index=dates日期定义的行名;columns=['a','b','c','d'] 定义的列名
print(df)
运行结果:
```bash
DatetimeIndex(['2021-06-26', '2021-06-27', '2021-06-28', '2021-06-29',
'2021-06-30', '2021-07-01'],
dtype='datetime64[ns]', freq='D')
#自定义的行和列
a b c d
2021-06-26 -0.991276 0.455919 -0.480912 1.666879
2021-06-27 0.221047 -1.013943 0.807503 0.276951
2021-06-28 1.248941 -0.320192 0.914366 0.557270
2021-06-29 0.883774 1.112702 -0.517360 -0.651265
2021-06-30 -0.159575 -0.514005 -0.380291 0.466350
2021-07-01 0.684210 -1.297312 0.116049 1.216697
没有自定义
df1 = pd.DataFrame(np.range(12).reshape((3,4)))
print(df1)
运行结果:
#默认行列名都是数字
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
创建一个矩阵字典
df2 = pd.DataFrame({'A':1,
'B':pd.Timestamp('20210612'),
'C':pd.Series(1,index=list(range(4)),dtype='float32'),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(['test','train','text','train']),
'F':'foo'})
print(df2)
#相关属性
df2.dtypes #多少维数据,和对应列的类型
df2.index #对应行的序号和类型
df2.columns#对应列的序号和类型
df2.values#对应的值
df2.describe()#描述的只能是数值类型,平均值这些
df2.T #矩阵转置
df2.sort_index(axis=1,ascending=False)#axis=1列->倒序列排序
df2.sort_index(axis=0,ascending=False)#axis=1行->倒序列排序
df2.sort_values(by='E')#对E列进行排序
运行结果:
#df2.dtypes
A int64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
#df2.index
Int64Index([0, 1, 2, 3], dtype='int64')
#df2.columns
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
#df2.values
array([[1, Timestamp('2021-06-12 00:00:00'), 1.0, 3, 'test', 'foo'],
[1, Timestamp('2021-06-12 00:00:00'), 1.0, 3, 'train', 'foo'],
[1, Timestamp('2021-06-12 00:00:00'), 1.0, 3, 'text', 'foo'],
[1, Timestamp('2021-06-12 00:00:00'), 1.0, 3, 'train', 'foo']],
dtype=object)
#df2.describe()
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
#df2.T
0 1 2 3
A 1 1 1 1
B 2021-06-12 00:00:00 2021-06-12 00:00:00 2021-06-12 00:00:00 2021-06-12 00:00:00
C 1.0 1.0 1.0 1.0
D 3 3 3 3
E test train text train
F foo foo foo foo
#df2.sort_index(axis=1,ascending=False)
F E D C B A
0 foo test 3 1.0 2021-06-12 1
1 foo train 3 1.0 2021-06-12 1
2 foo text 3 1.0 2021-06-12 1
3 foo train 3 1.0 2021-06-12 1
#df2.sort_index(axis=1,ascending=False)
A B C D E F
0 1 2021-06-12 1.0 3 test foo
1 1 2021-06-12 1.0 3 train foo
2 1 2021-06-12 1.0 3 text foo
3 1 2021-06-12 1.0 3 train foo
A B C D E F
3 1 2021-06-12 1.0 3 train foo
2 1 2021-06-12 1.0 3 text foo
1 1 2021-06-12 1.0 3 train foooy
0 1 2021-06-12 1.0 3 test foo
#df2.sort_values(by='E')
A B C D E F
0 1 2021-06-12 1.0 3 test foo
2 1 2021-06-12 1.0 3 text foo
1 1 2021-06-12 1.0 3 train foo
3 1 2021-06-12 1.0 3 train foo
选择数据(筛选数据)
#select by label:loc 根据标签筛选
print(df.loc('2021-06-26 '))#根据标签行名筛选
print(df.loc('2021-06-26',['A','B']))
#select by position:iloc 根据位置筛选
print(df.iloc(1,3,5),1:3)#(1,3,5)行,1~3列筛选
#mixed selection:ix 综合两种方式
print(df.ix[:3],['A','C'])
#根据values值大小筛选
print(df[df.A<8])#对应A列小于把的值
#修改值
df.A[df.A>4]=0#修改A列值
df.B[df.A>4]=0#修改B列的值
#新增一空值列
df['F']=np.nan
NaN数据的处理,丢失数据处理
dates = pd.date_range('20210626',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['a','b','c','d'])
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df.dropna(axis=0,how='any')) #how = {'any','all'} 如果how=any,表示任意含有NaN的,就删除;all表示全部为NaN才删除
print(df.fillna(value=0))#填上所有的NaN,value=0填入NaN的位置
print(df.isnull())#是否为NaN返回数值为True/False
print(df.any(df.isnull()) == True)# 如果含有一个就为True
导入导出
#导入
read_cvs
read_excel
read_hdf
read_sql
read_json
read_msgpack(experimental)
read_html
read_gbq(experimental)
read_stata
read_sas
read_clipboard
read_pickle
#导出,跟导入一样,只不过换掉关键字
to_csv
import pandas as pd
#导入
data = pd.read_csv('student.csv')
print(data)
#保存
data.to_prickle('student.pickle')
DataFrame的合并
concatenating
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
res = pd.concat([df1,df2,df3],axis=0,igonre_index=True)#列,igonre_index=True序号自动改变
print(res)
join,[‘inner’,‘outer’]
df1 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'],index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
res = pd.concat([df1,df2])#没有匹配的部分NaN填充,join=‘outer’默认
res = pd.concat([df1,ef2],join='inner',ignore_index=True)#取出没有匹配部分,类似与交集
join_axes
df1 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'],index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
pd.concat([df1,ef2],axis=1,join_axes=[df1.index],ignore_index=True)
append追加
df1 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'],index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
df3 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
res = df1.append(df2,ignore_index=True)
res = def1.append([de2,def3])#append多个
#append Series
s1 = pd.Series([1,2,3,4],index=['a','b','c','d'])
res = def1.append(s1,ignore_index=True)
DataFrame的合并merge
merging two df by key/keys.(may be used in database)
left = pd.DataFrame({'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right= pd.DataFrame({'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(right)
print(left)
res = pd.merge(left,right,on='key')#merge合并,on='key'
print(res)
运行结果
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
#根据key进行匹配合并
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
consider two keys
#改变即可
on = ['key1','key2']
merge默认的join=‘inner’,而concat默认的是outer
indicator
df1 = pd.DataFrame({'col':[0,1],'col_left':['a','b']})
df2 = pd.DataFrame({'col':[1,2,2],'col_left':[2,2,2]})
print(df1)
print(df2)
res = pd.merge(df1,df2,on='col',how='outer',indicator=True)
print(res)
运行结果:
col col_left
0 0 a
1 1 b
col col_left
0 1 2
1 2 2
2 2 2
#give the indicator custom name
col col_left_x col_left_y _merge#indicator之后出现
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
df1 = pd.DataFrame({'col':[0,1],'col_left':['a','b']})
df2 = pd.DataFrame({'col':[1,2,2],'col_left':[2,2,2]})
print(df1)
print(df2)
res = pd.merge(df1,df2,left_index=True,right_index=True,how='outer')
print(res)
运行结果:
col col_left
0 0 a
1 1 b
col col_left
0 1 2
1 2 2
2 2 2
col_x col_left_x col_y col_left_y
0 0.0 a 1 2
1 1.0 b 2 2
2 NaN NaN 2 2
plot画图
记得先安装matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#Series数据
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()
#DataFrame数据
data = pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list("ABCD"))
data = data.cumsum()
print(data.head())
data.plot()
plt.show()
运行结果: