pandas_import numpy as np import pandas as pd-CSDN博客

本文链接：https://blog.csdn.net/u014299421/article/details/81264731

1、pandas选择数据

import pandas as pd
import numpy as np  

s=pd.Series([1,3,6,np.nan,44,1])
print(s)

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64

dates=pd.date_range('20160101',periods=6)
print(dates)

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print(df)

                   a         b         c         d
2016-01-01  0.092622  1.038269 -0.363460 -0.064857
2016-01-02  2.134920  1.099559 -0.549231 -0.093023
2016-01-03 -1.961073  0.260070  0.495155  0.884979
2016-01-04  2.400581 -0.719417  0.510193 -1.416291
2016-01-05  2.049382 -0.697941  0.810225  0.732032
2016-01-06 -0.633934 -0.465445 -0.857863 -0.374819

df1=pd.DataFrame(np.arange(12).reshape((3,4)))
print(df1)

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11

df2=pd.DataFrame({'A':1,
                  'B':pd.Timestamp('20180701'),
                  'C':np.array([3]*4,dtype="int32"),
                  'D':pd.Series(1,index=list(range(4)),dtype="float32"),
                  'E':pd.Categorical(["hello","boys","and","girls"]),
                  'F':'foo'})

print(df2)
   A          B  C    D      E    F
0  1 2018-07-01  3  1.0  hello  foo
1  1 2018-07-01  3  1.0   boys  foo
2  1 2018-07-01  3  1.0    and  foo
3  1 2018-07-01  3  1.0  girls  foo

print(df2.dtypes)
A             int64
B    datetime64[ns]
C             int32
D           float32
E          category
F            object
dtype: object

print(df2.index)
Int64Index([0, 1, 2, 3], dtype='int64')

print(df2.columns)
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

print(df2.values)
[[1 Timestamp('2018-07-01 00:00:00') 3 1.0 'hello' 'foo']
 [1 Timestamp('2018-07-01 00:00:00') 3 1.0 'boys' 'foo']
 [1 Timestamp('2018-07-01 00:00:00') 3 1.0 'and' 'foo']
 [1 Timestamp('2018-07-01 00:00:00') 3 1.0 'girls' 'foo']]

print(df2.describe())
         A    C    D
count  4.0  4.0  4.0
mean   1.0  3.0  1.0
std    0.0  0.0  0.0
min    1.0  3.0  1.0
25%    1.0  3.0  1.0
50%    1.0  3.0  1.0
75%    1.0  3.0  1.0
max    1.0  3.0  1.0


print(df2.T)
A                    1         ...                             1
B  2018-07-01 00:00:00         ...           2018-07-01 00:00:00
C                    3         ...                             3
D                    1         ...                             1
E                hello         ...                         girls
F                  foo         ...                           foo

[6 rows x 4 columns]

print(df2.sort_index(axis=1,ascending=False))
     F      E    D  C          B  A
0  foo  hello  1.0  3 2018-07-01  1
1  foo   boys  1.0  3 2018-07-01  1
2  foo    and  1.0  3 2018-07-01  1
3  foo  girls  1.0  3 2018-07-01  1

print(df2.sort_values(by='E'))
   A          B  C    D      E    F
2  1 2018-07-01  3  1.0    and  foo
1  1 2018-07-01  3  1.0   boys  foo
3  1 2018-07-01  3  1.0  girls  foo
0  1 2018-07-01  3  1.0  hello  foo

dates=pd.date_range('20160101',periods=6)
df1=pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df1)

             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23

print(df1['A'],df1.A)
2016-01-01     0
2016-01-02     4
2016-01-03     8
2016-01-04    12
2016-01-05    16
2016-01-06    20
Freq: D, Name: A, dtype: int32 2016-01-01     0
2016-01-02     4
2016-01-03     8
2016-01-04    12
2016-01-05    16
2016-01-06    20
Freq: D, Name: A, dtype: int32

print(df1[0:1],df1['20160101':'20160103'])#第一行，前三行
            A  B  C  D
2016-01-01  0  1  2  3   

            A  B   C   D
2016-01-01  0  1   2   3
2016-01-02  4  5   6   7
2016-01-03  8  9  10  11   

#select by label:loc
print(df1.loc['20160101'])#第一行

A    0
B    1
C    2
D    3
Name: 2016-01-01 00:00:00, dtype: int32

print(df1.loc[:,['A','B']])
             A   B
2016-01-01   0   1
2016-01-02   4   5
2016-01-03   8   9
2016-01-04  12  13
2016-01-05  16  17
2016-01-06  20  21
      
print(df1.loc['20160101',['A','B']])
A    0
B    1
Name: 2016-01-01 00:00:00, dtype: int32

#select by position:iloc
print(df1.iloc[3:5,1:3])#4-5行，2-3列

             B   C
2016-01-04  13  14
2016-01-05  17  18


#mixed selection:ix
print(df1.ix[:3,['A','C']])

            A   C
2016-01-01  0   2
2016-01-02  4   6
2016-01-03  8  10

#boolean indexing
print(df1[df1.A>8])

             A   B   C   D
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23

2、pandas设置值

df1=pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])

df1.iloc[2,2]=1111
print(df1)

             A   B     C   D
2016-01-01   0   1     2   3
2016-01-02   4   5     6   7
2016-01-03   8   9  1111  11
2016-01-04  12  13    14  15
2016-01-05  16  17    18  19
2016-01-06  20  21    22  23

df1.loc['20160102','B']=2222
print(df1)

             A     B     C   D
2016-01-01   0     1     2   3
2016-01-02   4  2222     6   7
2016-01-03   8     9  1111  11
2016-01-04  12    13    14  15
2016-01-05  16    17    18  19
2016-01-06  20    21    22  23

df1[df1.A>4]=0#3-6行A列大于4，全部改为0
print(df1)

            A     B  C  D
2016-01-01  0     1  2  3
2016-01-02  4  2222  6  7
2016-01-03  0     0  0  0
2016-01-04  0     0  0  0
2016-01-05  0     0  0  0
2016-01-06  0     0  0  0


df1.B[df1.A>2]=55#第二行A大于2，此行B列为55
print(df1)

            A   B  C  D
2016-01-01  0   1  2  3
2016-01-02  4  55  6  7
2016-01-03  0   0  0  0
2016-01-04  0   0  0  0
2016-01-05  0   0  0  0
2016-01-06  0   0  0  0

df1['F']=np.nan
print(df1)

            A   B  C  D   F
2016-01-01  0   1  2  3 NaN
2016-01-02  4  55  6  7 NaN
2016-01-03  0   0  0  0 NaN
2016-01-04  0   0  0  0 NaN
2016-01-05  0   0  0  0 NaN
2016-01-06  0   0  0  0 NaN

df1['E']=pd.Series([1,2,3,4,5,6],index=pd.date_range('20160101',periods=6))
print(df1)

            A   B  C  D   F  E
2016-01-01  0   1  2  3 NaN  1
2016-01-02  4  55  6  7 NaN  2
2016-01-03  0   0  0  0 NaN  3
2016-01-04  0   0  0  0 NaN  4
2016-01-05  0   0  0  0 NaN  5
2016-01-06  0   0  0  0 NaN  6

3、pandas处理丢失数据

df1=pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
df1.iloc[0,1]=np.nan
df1.iloc[1,2]=np.nan
print(df1)
             A     B     C   D
2016-01-01   0   NaN   2.0   3
2016-01-02   4   5.0   NaN   7
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23

print(df1.dropna(axis=0,how='any'))#去掉前两行

             A     B     C   D
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23

print(df1.fillna(value=0))#nan的值赋为0
             A     B     C   D
2016-01-01   0   0.0   2.0   3
2016-01-02   4   5.0   0.0   7
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23

print(df1.isnull())
               A      B      C      D
2016-01-01  False   True  False  False
2016-01-02  False  False   True  False
2016-01-03  False  False  False  False
2016-01-04  False  False  False  False
2016-01-05  False  False  False  False
2016-01-06  False  False  False  False

print(np.any(df1.isnull())==True)#B列和C列均有空值的
A    False
B     True
C     True
D    False
dtype: bool

4、pandas导入和导出

data=pd.read_csv('hello.csv')
data.to_csv('bye.csv')

5、pandas 合并concat

df1=pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2=pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)

     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0

res=pd.concat([df1,df2,df3],axis=0,ignore_index=True)#列合并,重新进行排序
print(res)

     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0

#jion,['inner','outer']
df1=pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
df2=pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
print(df1)
print(df2)

     a    b    c    d
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0
     b    c    d    e
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0

res=pd.concat([df1,df2],join='outer')#没有的数据用Nan填充,outer为默认，类sql
print(res)
     a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
res1=pd.concat([df1,df2],join='inner',ignore_index=True)
print(res1)
     b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
5  1.0  1.0  1.0

res2=pd.concat([df1,df2],axis=1,join_axes=[df1.index])
print(res2)
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0

res4=df1.append(df2,ignore_index=True)#加两个,[df2,df3]
print(res4)
     a    b    c    d    e
0  0.0  0.0  0.0  0.0  NaN
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
5  NaN  1.0  1.0  1.0  1.0

s1=pd.Series([1,2,3,4],index=['a','b','c','d'])
res=df1.append(s1,ignore_index=True)
print(res)

     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0

6、pandas 合并merge

left1=pd.DataFrame({'key1':['K0','K1','K2','K3'],
                   'key2':['K0','K1','K0','K1'],
                   'A':['A0','A1','A2','A3'],
                   'B':['B0','B1','B2','B3']})

right1=pd.DataFrame({'key1':['K0','K1','K2','K3'],
                     'key2':['K0','K0','K0','K0'],
                   'C':['C0','C1','C2','C3'],
                   'D':['D0','D1','D2','D3']})
  key1 key2   A   B
0   K0   K0  A0  B0
1   K1   K1  A1  B1
2   K2   K0  A2  B2
3   K3   K1  A3  B3
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K2   K0  C2  D2
3   K3   K0  C3  D3

#how=['right','left','outer','inner']
res=pd.merge(left1,right1,on=['key1','key2'])#key为链接键,默认是inner
print(res)
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K2   K0  A2  B2  C2  D2

res1=pd.merge(left1,right1,left_index=True,right_index=True,how='outer')
print(res1)
  key1_x key2_x   A   B key1_y key2_y   C   D
0     K0     K0  A0  B0     K0     K0  C0  D0
1     K1     K1  A1  B1     K1     K0  C1  D1
2     K2     K0  A2  B2     K2     K0  C2  D2
3     K3     K1  A3  B3     K3     K0  C3  D3
res2=pd.merge(left1,right1,how='outer')
print(res2)
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K1   K1   A1   B1  NaN  NaN
2   K2   K0   A2   B2   C2   D2
3   K3   K1   A3   B3  NaN  NaN
4   K1   K0  NaN  NaN   C1   D1
5   K3   K0  NaN  NaN   C3   D3

df3=pd.DataFrame({'col':[0,1],'col_left':['a','b']})
df4=pd.DataFrame({'col':[1,2,2],'col_left':[2,2,2]})

   col col_left
0    0        a
1    1        b
   col  col_left
0    1         2
1    2         2
2    2         2

res1=pd.merge(df3,df4,on='col',how='outer',indicator=True)#默认False
res2=pd.merge(df3,df4,on='col',how='outer',indicator="indicator_column")
print(res1)
   col col_left_x  col_left_y      _merge
0    0          a         NaN   left_only
1    1          b         2.0        both
2    2        NaN         2.0  right_only
3    2        NaN         2.0  right_only
print(res2)
   col col_left_x  col_left_y indicator_column
0    0          a         NaN        left_only
1    1          b         2.0             both
2    2        NaN         2.0       right_only
3    2        NaN         2.0       right_only

boys=pd.DataFrame({'k':['K0','K1','K2'],'age':[4,5,6]})
girls=pd.DataFrame({'k':['K0','K0','K23'],'age':[7,8,9]})

    k  age
0  K0    4
1  K1    5
2  K2    6
     k  age
0   K0    7
1   K0    8
2  K23    9

res=pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='inner')
print(res)
    k  age_boy  age_girl
0  K0        4         7
1  K0        4         8

7、plt画图

data=pd.Series(np.random.randn(1000),index=np.arange(1000))
data=data.cumsum()
data.plot()
plt.show()

data=pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list("ABCD"))
print(data.head())#默认输出前5行

data=data.cumsum()
data.plot()
plt.show()#A、B、C、D四条线

data=pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list("ABCD"))
ax=data.plot.scatter(x='A',y='B',color='r',label='class1')
data.plot.scatter(x='A',y='C',color='g',label='class2',ax=ax)#ax : matplotlib axes object, default None
plt.show()#散点图，两分类

kind : str
‘line’ : line plot (default)
‘bar’ : vertical bar plot
‘barh’ : horizontal bar plot
‘hist’ : histogram
‘box’ : boxplot
‘kde’ : Kernel Density Estimation plot
‘density’ : same as ‘kde’
‘area’ : area plot
‘pie’ : pie plot
‘scatter’ : scatter plot
‘hexbin’ : hexbin plot

8、排序

x=np.random.randint(1,100,10)
print(x)
[59 83 25  6 74 66 41 18 32  7]
y=np.sort(x)
print(y)
[ 6  7 18 25 32 41 59 66 74 83]
print(x)
[59 83 25  6 74 66 41 18 32  7]
y=x.sort()
print(y)
None
print(x)
[ 6  7 18 25 32 41 59 66 74 83]

9、散点图

import matplotlib.pyplot as plt
imort numpy as np

N=1000
x=np.random.randn(N)
y=x+np.random.randn(N)*0.5
plt.scatter(x,y,s=500,c='r',marker='<',alpha=0.5)#s=100面积,c颜色，alph透明度
plt.show()

散点图

10、折线图

import matplotlib.dates as mdates
#opentime,closetime,volume=np.loadtxt('**.csv',delimiter='',skiprows=1,usecols=(1,4,6),unpack=True)#unpack=True表示把1,4,6三列放在不同的数据列中


plt.plot_date(date,opentime)#散点图
plt.plot_date(date,opentime,"-",linestyle="--",color="g",marker="<")#折线图
plt.show()

11、条线图

N=5
y=[20,10,30,25,15]
index=np.arange(N)
plt.bar(left=index,height=y,color='b',width=0.5)
plt.show()
plt.bar(left=0,bottom=index,width=y,color='b',height=0.5,orientation='horizontal')#plt.barh(left=0,bottom=index,width=y)功能一致
plt.show()

条线图1

条线图2

index=np.arange(4)
sales_BJ=[10,14,22,32]
sales_SH=[15,13,21,42]

bar_width=0.3
plt.bar(index,sales_BJ,bar_width,color='b')
plt.bar(index+bar_width,sales_SH,bar_width,color='r')
plt.show()


plt.bar(index,sales_BJ,bar_width,color='b')
plt.bar(index,sales_SH,bar_width,color='r',bottom=sales_BJ)#层叠
plt.show()

条线图3