1、pandas选择数据
import pandas as pd
import numpy as np
s=pd.Series([1,3,6,np.nan,44,1])
print(s)
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
dates=pd.date_range('20160101',periods=6)
print(dates)
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06'],
dtype='datetime64[ns]', freq='D')
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print(df)
a b c d
2016-01-01 0.092622 1.038269 -0.363460 -0.064857
2016-01-02 2.134920 1.099559 -0.549231 -0.093023
2016-01-03 -1.961073 0.260070 0.495155 0.884979
2016-01-04 2.400581 -0.719417 0.510193 -1.416291
2016-01-05 2.049382 -0.697941 0.810225 0.732032
2016-01-06 -0.633934 -0.465445 -0.857863 -0.374819
df1=pd.DataFrame(np.arange(12).reshape((3,4)))
print(df1)
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
df2=pd.DataFrame({'A':1,
'B':pd.Timestamp('20180701'),
'C':np.array([3]*4,dtype="int32"),
'D':pd.Series(1,index=list(range(4)),dtype="float32"),
'E':pd.Categorical(["hello","boys","and","girls"]),
'F':'foo'})
print(df2)
A B C D E F
0 1 2018-07-01 3 1.0 hello foo
1 1 2018-07-01 3 1.0 boys foo
2 1 2018-07-01 3 1.0 and foo
3 1 2018-07-01 3 1.0 girls foo
print(df2.dtypes)
A int64
B datetime64[ns]
C int32
D float32
E category
F object
dtype: object
print(df2.index)
Int64Index([0, 1, 2, 3], dtype='int64')
print(df2.columns)
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
print(df2.values)
[[1 Timestamp('2018-07-01 00:00:00') 3 1.0 'hello' 'foo']
[1 Timestamp('2018-07-01 00:00:00') 3 1.0 'boys' 'foo']
[1 Timestamp('2018-07-01 00:00:00') 3 1.0 'and' 'foo']
[1 Timestamp('2018-07-01 00:00:00') 3 1.0 'girls' 'foo']]
print(df2.describe())
A C D
count 4.0 4.0 4.0
mean 1.0 3.0 1.0
std 0.0 0.0 0.0
min 1.0 3.0 1.0
25% 1.0 3.0 1.0
50% 1.0 3.0 1.0
75% 1.0 3.0 1.0
max 1.0 3.0 1.0
print(df2.T)
A 1 ... 1
B 2018-07-01 00:00:00 ... 2018-07-01 00:00:00
C 3 ... 3
D 1 ... 1
E hello ... girls
F foo ... foo
[6 rows x 4 columns]
print(df2.sort_index(axis=1,ascending=False))
F E D C B A
0 foo hello 1.0 3 2018-07-01 1
1 foo boys 1.0 3 2018-07-01 1
2 foo and 1.0 3 2018-07-01 1
3 foo girls 1.0 3 2018-07-01 1
print(df2.sort_values(by='E'))
A B C D E F
2 1 2018-07-01 3 1.0 and foo
1 1 2018-07-01 3 1.0 boys foo
3 1 2018-07-01 3 1.0 girls foo
0 1 2018-07-01 3 1.0 hello foo
dates=pd.date_range('20160101',periods=6)
df1=pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df1)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 8 9 10 11
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
print(df1['A'],df1.A)
2016-01-01 0
2016-01-02 4
2016-01-03 8
2016-01-04 12
2016-01-05 16
2016-01-06 20
Freq: D, Name: A, dtype: int32 2016-01-01 0
2016-01-02 4
2016-01-03 8
2016-01-04 12
2016-01-05 16
2016-01-06 20
Freq: D, Name: A, dtype: int32
print(df1[0:1],df1['20160101':'20160103'])#第一行,前三行
A B C D
2016-01-01 0 1 2 3
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 8 9 10 11
#select by label:loc
print(df1.loc['20160101'])#第一行
A 0
B 1
C 2
D 3
Name: 2016-01-01 00:00:00, dtype: int32
print(df1.loc[:,['A','B']])
A B
2016-01-01 0 1
2016-01-02 4 5
2016-01-03 8 9
2016-01-04 12 13
2016-01-05 16 17
2016-01-06 20 21
print(df1.loc['20160101',['A','B']])
A 0
B 1
Name: 2016-01-01 00:00:00, dtype: int32
#select by position:iloc
print(df1.iloc[3:5,1:3])#4-5行,2-3列
B C
2016-01-04 13 14
2016-01-05 17 18
#mixed selection:ix
print(df1.ix[:3,['A','C']])
A C
2016-01-01 0 2
2016-01-02 4 6
2016-01-03 8 10
#boolean indexing
print(df1[df1.A>8])
A B C D
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
2、pandas设置值
df1=pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
df1.iloc[2,2]=1111
print(df1)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 8 9 1111 11
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
df1.loc['20160102','B']=2222
print(df1)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 2222 6 7
2016-01-03 8 9 1111 11
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
df1[df1.A>4]=0#3-6行A列大于4,全部改为0
print(df1)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 2222 6 7
2016-01-03 0 0 0 0
2016-01-04 0 0 0 0
2016-01-05 0 0 0 0
2016-01-06 0 0 0 0
df1.B[df1.A>2]=55#第二行A大于2,此行B列为55
print(df1)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 55 6 7
2016-01-03 0 0 0 0
2016-01-04 0 0 0 0
2016-01-05 0 0 0 0
2016-01-06 0 0 0 0
df1['F']=np.nan
print(df1)
A B C D F
2016-01-01 0 1 2 3 NaN
2016-01-02 4 55 6 7 NaN
2016-01-03 0 0 0 0 NaN
2016-01-04 0 0 0 0 NaN
2016-01-05 0 0 0 0 NaN
2016-01-06 0 0 0 0 NaN
df1['E']=pd.Series([1,2,3,4,5,6],index=pd.date_range('20160101',periods=6))
print(df1)
A B C D F E
2016-01-01 0 1 2 3 NaN 1
2016-01-02 4 55 6 7 NaN 2
2016-01-03 0 0 0 0 NaN 3
2016-01-04 0 0 0 0 NaN 4
2016-01-05 0 0 0 0 NaN 5
2016-01-06 0 0 0 0 NaN 6
3、pandas处理丢失数据
df1=pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
df1.iloc[0,1]=np.nan
df1.iloc[1,2]=np.nan
print(df1)
A B C D
2016-01-01 0 NaN 2.0 3
2016-01-02 4 5.0 NaN 7
2016-01-03 8 9.0 10.0 11
2016-01-04 12 13.0 14.0 15
2016-01-05 16 17.0 18.0 19
2016-01-06 20 21.0 22.0 23
print(df1.dropna(axis=0,how='any'))#去掉前两行
A B C D
2016-01-03 8 9.0 10.0 11
2016-01-04 12 13.0 14.0 15
2016-01-05 16 17.0 18.0 19
2016-01-06 20 21.0 22.0 23
print(df1.fillna(value=0))#nan的值赋为0
A B C D
2016-01-01 0 0.0 2.0 3
2016-01-02 4 5.0 0.0 7
2016-01-03 8 9.0 10.0 11
2016-01-04 12 13.0 14.0 15
2016-01-05 16 17.0 18.0 19
2016-01-06 20 21.0 22.0 23
print(df1.isnull())
A B C D
2016-01-01 False True False False
2016-01-02 False False True False
2016-01-03 False False False False
2016-01-04 False False False False
2016-01-05 False False False False
2016-01-06 False False False False
print(np.any(df1.isnull())==True)#B列和C列均有空值的
A False
B True
C True
D False
dtype: bool
4、pandas导入和导出
data=pd.read_csv('hello.csv')
data.to_csv('bye.csv')
5、pandas 合并concat
df1=pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2=pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
res=pd.concat([df1,df2,df3],axis=0,ignore_index=True)#列合并,重新进行排序
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
#jion,['inner','outer']
df1=pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
df2=pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
print(df1)
print(df2)
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
b c d e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
res=pd.concat([df1,df2],join='outer')#没有的数据用Nan填充,outer为默认,类sql
print(res)
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
res1=pd.concat([df1,df2],join='inner',ignore_index=True)
print(res1)
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
res2=pd.concat([df1,df2],axis=1,join_axes=[df1.index])
print(res2)
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
res4=df1.append(df2,ignore_index=True)#加两个,[df2,df3]
print(res4)
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
5 NaN 1.0 1.0 1.0 1.0
s1=pd.Series([1,2,3,4],index=['a','b','c','d'])
res=df1.append(s1,ignore_index=True)
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
6、pandas 合并merge
left1=pd.DataFrame({'key1':['K0','K1','K2','K3'],
'key2':['K0','K1','K0','K1'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right1=pd.DataFrame({'key1':['K0','K1','K2','K3'],
'key2':['K0','K0','K0','K0'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
key1 key2 A B
0 K0 K0 A0 B0
1 K1 K1 A1 B1
2 K2 K0 A2 B2
3 K3 K1 A3 B3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K2 K0 C2 D2
3 K3 K0 C3 D3
#how=['right','left','outer','inner']
res=pd.merge(left1,right1,on=['key1','key2'])#key为链接键,默认是inner
print(res)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K2 K0 A2 B2 C2 D2
res1=pd.merge(left1,right1,left_index=True,right_index=True,how='outer')
print(res1)
key1_x key2_x A B key1_y key2_y C D
0 K0 K0 A0 B0 K0 K0 C0 D0
1 K1 K1 A1 B1 K1 K0 C1 D1
2 K2 K0 A2 B2 K2 K0 C2 D2
3 K3 K1 A3 B3 K3 K0 C3 D3
res2=pd.merge(left1,right1,how='outer')
print(res2)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K1 A1 B1 NaN NaN
2 K2 K0 A2 B2 C2 D2
3 K3 K1 A3 B3 NaN NaN
4 K1 K0 NaN NaN C1 D1
5 K3 K0 NaN NaN C3 D3
df3=pd.DataFrame({'col':[0,1],'col_left':['a','b']})
df4=pd.DataFrame({'col':[1,2,2],'col_left':[2,2,2]})
col col_left
0 0 a
1 1 b
col col_left
0 1 2
1 2 2
2 2 2
res1=pd.merge(df3,df4,on='col',how='outer',indicator=True)#默认False
res2=pd.merge(df3,df4,on='col',how='outer',indicator="indicator_column")
print(res1)
col col_left_x col_left_y _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
print(res2)
col col_left_x col_left_y indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
boys=pd.DataFrame({'k':['K0','K1','K2'],'age':[4,5,6]})
girls=pd.DataFrame({'k':['K0','K0','K23'],'age':[7,8,9]})
k age
0 K0 4
1 K1 5
2 K2 6
k age
0 K0 7
1 K0 8
2 K23 9
res=pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='inner')
print(res)
k age_boy age_girl
0 K0 4 7
1 K0 4 8
7、plt画图
data=pd.Series(np.random.randn(1000),index=np.arange(1000))
data=data.cumsum()
data.plot()
plt.show()
data=pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list("ABCD"))
print(data.head())#默认输出前5行
data=data.cumsum()
data.plot()
plt.show()#A、B、C、D四条线
data=pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list("ABCD"))
ax=data.plot.scatter(x='A',y='B',color='r',label='class1')
data.plot.scatter(x='A',y='C',color='g',label='class2',ax=ax)#ax : matplotlib axes object, default None
plt.show()#散点图,两分类
kind : str
‘line’ : line plot (default)
‘bar’ : vertical bar plot
‘barh’ : horizontal bar plot
‘hist’ : histogram
‘box’ : boxplot
‘kde’ : Kernel Density Estimation plot
‘density’ : same as ‘kde’
‘area’ : area plot
‘pie’ : pie plot
‘scatter’ : scatter plot
‘hexbin’ : hexbin plot
8、排序
x=np.random.randint(1,100,10)
print(x)
[59 83 25 6 74 66 41 18 32 7]
y=np.sort(x)
print(y)
[ 6 7 18 25 32 41 59 66 74 83]
print(x)
[59 83 25 6 74 66 41 18 32 7]
y=x.sort()
print(y)
None
print(x)
[ 6 7 18 25 32 41 59 66 74 83]
9、散点图
import matplotlib.pyplot as plt
imort numpy as np
N=1000
x=np.random.randn(N)
y=x+np.random.randn(N)*0.5
plt.scatter(x,y,s=500,c='r',marker='<',alpha=0.5)#s=100面积,c颜色,alph透明度
plt.show()
10、折线图
import matplotlib.dates as mdates
#opentime,closetime,volume=np.loadtxt('**.csv',delimiter='',skiprows=1,usecols=(1,4,6),unpack=True)#unpack=True表示把1,4,6三列放在不同的数据列中
plt.plot_date(date,opentime)#散点图
plt.plot_date(date,opentime,"-",linestyle="--",color="g",marker="<")#折线图
plt.show()
11、条线图
N=5
y=[20,10,30,25,15]
index=np.arange(N)
plt.bar(left=index,height=y,color='b',width=0.5)
plt.show()
plt.bar(left=0,bottom=index,width=y,color='b',height=0.5,orientation='horizontal')#plt.barh(left=0,bottom=index,width=y)功能一致
plt.show()
index=np.arange(4)
sales_BJ=[10,14,22,32]
sales_SH=[15,13,21,42]
bar_width=0.3
plt.bar(index,sales_BJ,bar_width,color='b')
plt.bar(index+bar_width,sales_SH,bar_width,color='r')
plt.show()
plt.bar(index,sales_BJ,bar_width,color='b')
plt.bar(index,sales_SH,bar_width,color='r',bottom=sales_BJ)#层叠
plt.show()