【model02】pandas_pandasmodel-CSDN博客

本文链接：https://blog.csdn.net/cs_xuhuan/article/details/82858028

这篇博客详细介绍了Pandas库在数据处理中的基本操作，包括数据选择、设置值、处理缺失数据、数据导入导出，以及如何利用Pandas进行数据合并（concat和merge）和plot画图。

摘要由CSDN通过智能技术生成

基本介绍

import pandas as pd
import numpy as np

# Basic pandas walkthrough: build a Series and a few DataFrames, then
# inspect their metadata and sort them.

# A 1-D labelled array; np.nan marks a missing value.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)

# Six consecutive dates starting at 2016-01-01, used as the row index of df.
dates = pd.date_range('20160101', periods=6)
print(dates)

# 6x4 frame of standard-normal samples with date rows and columns a-d.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
print(df)

# With no index/columns given, pandas assigns default integer labels.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df1)

# A frame built from a dict: each key becomes a column, and scalar values
# ('A', 'B', 'F') are broadcast to the length of the array-like columns.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
print(df2)
print(df2.dtypes)      # dtype of each column
print(df2.index)       # row labels
print(df2.columns)     # column labels
print(df2.values)      # underlying data as a NumPy array
print(df2.describe())  # summary statistics
print(df2.T)           # transpose

# ascending=False sorts the labels in descending order:
# axis=1 reorders the columns, axis=0 reorders the rows.
print(df2.sort_index(axis=1, ascending=False))
print(df2.sort_index(axis=0, ascending=False))

# Sort the rows by the values in column E. sort_values returns a new frame
# (df2 itself is left unchanged), so the result must be kept and printed;
# a bare expression statement only displays in an interactive session.
df2_sorted_by_e = df2.sort_values(by='E')
print(df2_sorted_by_e)

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')
                   a         b         c         d
2016-01-01 -1.487623  0.888221 -1.506509 -0.776984
2016-01-02 -0.040059  0.262338  1.658207 -0.713937
2016-01-03 -1.316003  1.942315  1.483371  2.473241
2016-01-04 -0.014973  0.465519  2.120004  1.569695
2016-01-05 -2.075723 -1.336707 -0.409277 -0.431981
2016-01-06 -0.776415 -0.293674  0.240641 -0.402298
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
Int64Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0
                     0                    1                    2  \
A                    1                    1                    1   
B  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00   
C                    1                    1                    1   
D                    3                    3                    3   
E                 test                train                 test   
F                  foo                  foo                  foo   

                     3  
A                    1  
B  2013-01-02 00:00:00  
C                    1  
D                    3  
E                train  
F                  foo  
     F      E  D    C          B    A
0  foo   test  3  1.0 2013-01-02  1.0
1  foo  train  3  1.0 2013-01-02  1.0
2  foo   test  3  1.0 2013-01-02  1.0
3  foo  train  3  1.0 2013-01-02  1.0
     A          B    C  D      E    F
3  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
0  1.0 2013-01-02  1.0  3   test  foo

	A	B	C	D	E	F
0	1.0	2013-01-02	1.0	3	test	foo
2	1.0	2013-01-02	1.0	3	test	foo
1	1.0	2013-01-02	1.0	3	train	foo
3	1.0	2013-01-02	1.0	3	train	foo

选择数据

import pandas as pd
import numpy as np

# Selecting data: a 6x4 integer frame (values 0..23) indexed by six
# consecutive dates starting at 2013-01-01, with columns A-D.
dates = pd.date_range(start='20130101', periods=6)
df = pd.DataFrame(
    data=np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)

# Column A twice: once by key lookup, once by attribute access.
print(df['A'], '\n', df.A)

# Row slices: the first three rows by position, then an inclusive range
# of date labels.
print(df.iloc[0:3], '\n', df.loc['20130102':'20130104'])

# Select by label: loc
print(df.loc['20130102'])
# All rows, columns A and B only.
print(df[['A', 'B']])
# A single row restricted to columns A and B.
print(df.loc['20130102', ['A', 'B']])

# Select by position: iloc (fourth row).
print(df.iloc[3, :])
prin