DataFrame
# pd.DataFrame() builds a DataFrame; the `columns` argument names each column.
# numpy must be imported here as well: the random data below is generated with np.
import numpy as np
import pandas as pd
df=pd.DataFrame(np.random.randn(12).reshape(3,4),columns=['a','b','c','d'])
df
a b c d
0 0.927449 0.461756 0.395449 0.294089
1 -1.296071 -0.780128 0.513385 1.608865
2 -0.071200 -1.105894 -0.568288 -0.216750
# Build a DataFrame from a mapping: each key becomes a column label and
# each value supplies that column's data.
df1 = pd.DataFrame(dict(A=np.arange(4), B=np.arange(4, 8)))
df1
A B
0 0 4
1 1 5
2 2 6
3 3 7
# The .columns attribute shows the column labels.
# The .values attribute shows the underlying data.
df1.columns
Index(['A', 'B'], dtype='object')
# Display the DataFrame's data as a NumPy array.
df.values
array([[-0.6023384 , 0.67498799, -0.68959674, -0.18298839],
[ 1.26339357, -1.39439048, -0.67637389, 1.24420078],
[-2.48233953, -1.42888899, -0.75211092, -0.28422075]])
# .describe() summarises every column: count, mean, std, min,
# the 25%/50%/75% quantiles (50% is the median) and max.
df.describe()
a b c d
count 3.000000 3.000000 3.000000 3.000000
mean -0.607095 -0.716097 -0.706027 0.258997
std 1.872871 1.204839 0.040454 0.854711
min -2.482340 -1.428889 -0.752111 -0.284221
25% -1.542339 -1.411640 -0.720854 -0.233605
50% -0.602338 -1.394390 -0.689597 -0.182988
75% 0.330528 -0.359701 -0.682985 0.530606
max 1.263394 0.674988 -0.676374 1.244201
# A DataFrame can also be transposed with .T
# The `index` argument supplies the row labels.
df = pd.DataFrame(
    np.random.randn(3, 4),
    index=list('abc'),
    columns=['a1', 'b1', 'c1', 'd1'],
)
df
a1 b1 c1 d1
a -0.723560 -1.363361 -0.290657 -0.915854
b 1.750881 0.885029 -0.935353 -0.771946
c -0.074274 0.634728 -0.465676 -1.204649
# Look up one column by its label, first with item access and then
# with attribute access.
print(df['a1'])
print(df.a1)
a -0.723560
b 1.750881
c -0.074274
Name: a1, dtype: float64
# Select rows with a positional slice (here the first two rows).
print(df[0:2])
a b c d
0 0.927449 0.461756 0.395449 0.294089
1 -1.296071 -0.780128 0.513385 1.608865
loc()方法
# Select one row by its index label.
print(df.loc['a'])
a1 0.157265
b1 -0.091589
c1 0.678737
d1 -0.091182
Name: a, dtype: float64
# Select a column, or a single element of it, by label.
# Passing the column label inside a list selects that column and keeps
# the result as a one-column DataFrame.
print(df.loc[:,['a1']])
# A single value, addressed by row label and column label.
print(df.loc['b','a1'])
a1
a 0.157265
b 0.827624
c -1.627656
0.8276240510180705
iloc()方法选择数据
# Select data by integer row/column position.
# A single value: row position 0, column position 0.
print(df.iloc[0,0])
# A slice: the first two rows and the first two columns.
print(df.iloc[0:2,0:2])
-0.7235602120079216
a1 b1
a -0.723560 -1.363361
b 1.750881 0.885029
# Overwrite selected values in place.
# Assign by integer position (row 0, column 0) and by label (row 'b', column 'd1').
df.iloc[0,0]=0
df.loc['b','d1']=0
# Conditional assignment: set the positive entries of column 'a1' to 0.
# One .loc call with a boolean row mask and the column label writes to df
# itself. The chained form df.a1[mask] = 0 assigns through an intermediate
# object, which triggers SettingWithCopyWarning and leaves df unchanged
# when pandas Copy-on-Write is enabled.
df.loc[df.a1>0,'a1']=0
df
a1 b1 c1 d1
a 0.000000 -0.091589 0.678737 -0.091182
b 0.000000 -0.283218 -0.408538 1.446401
c -1.627656 0.469021 -0.024202 -0.788833
# .fillna() fills missing entries with the given value (0 here).
# The result is only displayed; it is not assigned back to df.
df.fillna(value=0)
a1 b1 c1 d1
a 0.000000 -1.363361 -0.290657 -0.915854
b 0.000000 0.885029 -0.935353 0.000000
c -0.074274 0.634728 -0.465676 -1.204649
plot画图总结
# As usual, import the plotting package first.
import matplotlib.pyplot as plt
# Call .plot() on the data, then plt.show() to display the figure.
# NOTE(review): this import repeats the one two lines above.
import matplotlib.pyplot as plt
# Visualise data with plot: a Series of 1000 random values indexed 0..999.
data=pd.Series(np.random.randn(1000),index=np.arange(1000))
print(data)
# Replace the raw values with their running total before plotting.
data=data.cumsum()
data.plot()
plt.show()
#对于DataFrame添加了index,与columns的数据绘图
散点图
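# plot.scatter(x='', y='', color='', label='', ax=)
# The ax argument draws the new plot on an existing Axes, merging two plots into one figure.
# xlim() sets the x-axis range.
# ylim() sets the y-axis range.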
# Scatter plot.
# plot.scatter takes column labels for x and y, so the data must be a
# DataFrame with named columns; a Series has neither the columns nor a
# scatter plot method. Build one with columns 'A', 'B' and 'C'.
data=pd.DataFrame(np.random.randn(1000,3),index=np.arange(1000),columns=['A','B','C'])
# First class: column 'A' against column 'B'; keep the returned Axes.
scatter_plot=data.plot.scatter(x='A',y='B',color='Blue',label='Class1')
# Second class: column 'A' against column 'C', drawn on the same Axes via ax=.
# c is a single fixed colour, so no colormap or colorbar is requested:
# both only have an effect when c holds per-point numeric values.
data.plot(kind="scatter",x="A",y="C",s=50,c='Red',alpha=0.5,label='Class2',ax=scatter_plot)
plt.xlim((-10,10))  # x-axis range
plt.ylim((-10,10))  # y-axis range
plt.show()
数据合并(concat,join)
1.concat
# Reminder: axis=0 stacks DataFrames vertically (row-wise), axis=1 joins
# them side by side (column-wise).
# ignore_index=True renumbers the combined index 0, 1, 2, ...
df1, df2, df3 = (
    pd.DataFrame(np.full((3, 4), float(fill)), columns=list('abcd'))
    for fill in range(3)
)
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)  # axis=0 is the default
print(df1, df2, df3, res, sep='\n')
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
2.join
# concat's join argument decides how differing columns are combined:
#   'inner' keeps only the columns present in every input;
#   'outer' (the default) keeps all columns and fills the gaps with NaN.
df1 = pd.DataFrame(np.zeros((3, 4)), index=[1, 2, 3], columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)), index=[2, 3, 4], columns=list('bcde'))
print(df1, df2, sep='\n')
# inner: only the shared columns b, c, d survive.
res1 = pd.concat([df1, df2], ignore_index=True, join='inner')
print(res1)
# outer: every column is kept; cells with no source data become NaN.
res = pd.concat([df1, df2], ignore_index=True, join='outer')
print(res)
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
b c d e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
5 NaN 1.0 1.0 1.0 1.0
join_axes合并
# Concatenate column-wise and keep only the rows of df1's index.
# The join_axes argument of pd.concat was removed in pandas 1.0, so the
# result is reindexed to df1.index instead, which selects the same rows.
# With axis=1 and no reindex, the union of both indexes is kept and cells
# without data are filled with NaN.
import numpy as np
import pandas as pd
df1=pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
df2=pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
res=pd.concat([df2,df1],axis=1).reindex(df1.index)
res1=pd.concat([df1,df2],axis=1)
print(res)
print(res1)
b c d e a b c d
1 NaN NaN NaN NaN 0.0 0.0 0.0 0.0
2 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
append添加数据
# Add a Series to a DataFrame as one new row.
# Series(data, index=...): each index label names the column that the
# matching data value goes into.
df1=pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
# A Series holding one row of data.
s1=pd.Series([1,2,3,4],index=['a','b','c','d'])
# DataFrame.append was removed in pandas 2.0. Turn the Series into a
# one-row DataFrame and concatenate it; ignore_index=True renumbers the
# rows 0..3 as append(..., ignore_index=True) did.
rs=pd.concat([df1,s1.to_frame().T],ignore_index=True)
print(rs)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0