Python3-pandas

最新推荐文章于 2024-07-10 23:40:12 发布

风吴痕

最新推荐文章于 2024-07-10 23:40:12 发布

阅读量1.2k

点赞数

分类专栏： python 文章标签： python

python 专栏收录该内容

95 篇文章 0 订阅

订阅专栏

参考：

1、http://pandas.pydata.org/pandas-docs/stable/10min.html

2、http://pandas.pydata.org/pandas-docs/stable/tutorials.html

10 Minutes to pandas

Object Creation

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Build a Series from a plain list. pandas supplies a default integer index,
# and the np.nan entry makes the result float64 (see the output below).
s = pd.Series([1, 3, 5, np.nan, 6, 8])
# print(s)
'''
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
'''

# Build a DataFrame from a NumPy array: rows are labelled with six
# consecutive days, columns with the letters A-D.
dates = pd.date_range('20130101', periods=6)
# print(dates)
'''
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
'''

random_values = np.random.randn(6, 4)
df = pd.DataFrame(random_values, index=dates, columns=list('ABCD'))
# print(df)
'''
                   A         B         C         D
2013-01-01 -2.106954  1.731281  0.252205 -1.257373
2013-01-02 -1.297739 -0.144438  1.405233  0.128684
2013-01-03 -1.515715  0.068778  0.313289  0.127228
2013-01-04 -0.028522  0.732110 -0.289821 -0.145544
2013-01-05  0.505480  0.918043  0.159986 -0.480223
2013-01-06  0.237698 -0.030478  0.920267  1.040430
'''

# Build a DataFrame from a dict mapping column name -> column content.
# Scalars ('A', 'B', 'F') are repeated for every row of the frame.
column_data = {
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
}
df2 = pd.DataFrame(column_data)
# print(df2)
'''
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
'''
# Every column keeps its own dtype.
print(df2.dtypes)
'''
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
'''

Viewing Data

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Sample frame: six consecutive days (rows) by four random columns A-D.
dates = pd.date_range('20130101', periods=6)
random_values = np.random.randn(6, 4)
df = pd.DataFrame(random_values, index=dates, columns=list('ABCD'))
# print(df)
'''
                   A         B         C         D
2013-01-01 -0.144929  0.516025  1.414747  0.956931
2013-01-02 -0.114975 -1.999099 -1.305006 -1.263897
2013-01-03  0.409725 -0.829613  0.010113 -1.991170
2013-01-04  1.282872 -1.338264 -3.314622  0.447457
2013-01-05  0.499045  0.326436  0.117400 -0.188779
2013-01-06 -0.701257 -1.522831 -1.270148 -1.496934
'''

# First two rows.
first_rows = df.head(2)
print(first_rows)
'''
                   A         B         C         D
2013-01-01 -0.144929  0.516025  1.414747  0.956931
2013-01-02 -0.114975 -1.999099 -1.305006 -1.263897
'''

# Last three rows.
last_rows = df.tail(3)
print(last_rows)
'''
                   A         B         C         D
2013-01-04  1.282872 -1.338264 -3.314622  0.447457
2013-01-05  0.499045  0.326436  0.117400 -0.188779
2013-01-06 -0.701257 -1.522831 -1.270148 -1.496934
'''

# The three building blocks of a frame: row labels, column labels and the
# underlying NumPy data.
print(df.index)    # row labels
print(df.columns)  # column labels
raw_values = df.values
print(raw_values)        # the data itself
print(type(raw_values))  # a numpy ndarray

# Quick statistical summary of every column.
print(df.describe())

# Transpose: rows become columns and vice versa.
print(df.T)

# Sort along an axis by its labels; axis=1 with ascending=False orders the
# columns D, C, B, A.
print(df.sort_index(axis=1, ascending=False))

# Sort the rows by the values in column B.
print(df.sort_values(by='B'))

Selection

Getting

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Sample frame: six consecutive days (rows) by four random columns A-D.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

# NOTE: the expressions below are evaluated and their results discarded;
# wrap one in print(...) to see it when running this as a script.

# ---- Plain [] indexing ----
df['A']                    # column A, returned as a Series
df[0:3]                    # first three rows (positional slice, end excluded)
df['20130102':'20130104']  # rows by label slice: BOTH endpoints are included

# ---- Selection by label: .loc / .at ----
df.loc[dates[0]]                           # the first row, as a Series
df.loc[:, ['A', 'B']]                      # every row, columns A and B
df.loc['20130102':'20130104', ['A', 'B']]  # rows 2013-01-02..2013-01-04 inclusive, columns A and B
df.loc['20130102', ['A', 'B']]             # the single row 2013-01-02, columns A and B
df.loc[dates[0], 'A']                      # scalar: first row, column A
df.at[dates[0], 'A']                       # same scalar through the scalar accessor .at

# ---- Selection by position: .iloc / .iat (end of a slice is excluded) ----
df.iloc[3]                    # the fourth row (position 3)
df.iloc[3:5, 0:2]             # rows [3, 5), columns [0, 2)
df.iloc[[1, 2, 4], [0, 2]]    # rows 1, 2, 4 and columns 0, 2
df.iloc[1:3, :]               # rows [1, 3), every column
df.iloc[:, 1:3]               # every row, columns [1, 3)
df.iloc[1, 1]                 # scalar at row 1, column 1
df.iat[1, 1]                  # same scalar through the scalar accessor .iat

# ---- Boolean indexing ----
df[df['A'] > 0]  # rows where column A is positive; all columns are kept
df[df > 0]       # same shape as df; entries that are not positive become NaN

# isin() filters on membership: keep the rows whose E value is 'two' or 'four'.
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2[df2['E'].isin(['two', 'four'])]

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Sample frame: six consecutive days (rows) by four random columns A-D.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

# Assigning a Series as a new column aligns it on the index. s1 starts one
# day later than df, so the first row of F is NaN and s1's last value is
# not used (see the output below).
shifted_days = pd.date_range('20130102', periods=6)
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=shifted_days)
df['F'] = s1

# print(df)
'''
                   A         B         C         D    F
2013-01-01 -0.447267 -1.585050 -0.388142  0.378795  NaN
2013-01-02 -0.045593 -0.876191 -1.910121  0.365615  1.0
2013-01-03  2.137712 -0.384255 -0.940755  1.987526  2.0
2013-01-04 -1.662465 -0.264535 -0.315382 -0.448721  3.0
2013-01-05  0.568981 -0.448420 -0.294313  1.914237  4.0
2013-01-06  1.867176  0.646454  1.968858 -0.290831  5.0
'''
# Set a single value by label: first row, column A becomes 0.
df.at[dates[0], 'A'] = 0

# Set a single value by position: row 0, column 1 (column B) becomes 0.
df.iat[0, 1] = 0

# Overwrite a whole column with a NumPy array: every entry of D becomes 5.
df.loc[:, 'D'] = np.full(len(df), 5)

print(df)
'''
                   A         B         C  D    F
2013-01-01  0.000000  0.000000  0.038916  5  NaN
2013-01-02  0.730072  1.325252 -0.166603  5  1.0
2013-01-03  0.727788  1.001638 -0.293277  5  2.0
2013-01-04  1.801973  0.814501 -0.145767  5  3.0
2013-01-05 -0.245231 -0.060449 -0.244515  5  4.0
2013-01-06  0.116800  0.115574 -0.300012  5  5.0
'''
# Conditional assignment on a copy: every positive entry is replaced by its
# negation; the other entries are left untouched.
df2 = df.copy()
df2[df2 > 0] = -df2

Missing Data

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Sample frame with a partially missing column F: s1 starts one day after
# df, so index alignment leaves F empty (NaN) on the first day.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
df['F'] = s1

# reindex builds a new frame restricted to the first four days and extended
# with a new column E, which starts out as NaN everywhere.
wanted_columns = list(df.columns) + ['E']
df1 = df.reindex(index=dates[0:4], columns=wanted_columns)
# Label slices include both endpoints, so E is set on the first two days.
df1.loc[dates[0]:dates[1], 'E'] = 1

print(df1)
'''
                   A         B         C         D    F    E
2013-01-01  1.777087 -1.400922 -0.719084 -0.025552  NaN  1.0
2013-01-02  0.577299  1.455149 -1.410174 -0.331292  1.0  1.0
2013-01-03  0.078308  0.173087 -1.370062 -1.497506  2.0  NaN
2013-01-04  1.032272  0.805309  1.447437  0.027947  3.0  NaN
'''
# Drop every row that has at least one missing value.
complete_rows = df1.dropna(how='any')
print(complete_rows)
'''
                   A         B         C         D    F    E
2013-01-02  0.577299  1.455149 -1.410174 -0.331292  1.0  1.0
'''

# Replace every missing value with 5.
filled = df1.fillna(value=5)
print(filled)
'''
                   A         B         C         D    F    E
2013-01-01  1.777087 -1.400922 -0.719084 -0.025552  5.0  1.0
2013-01-02  0.577299  1.455149 -1.410174 -0.331292  1.0  1.0
2013-01-03  0.078308  0.173087 -1.370062 -1.497506  2.0  5.0
2013-01-04  1.032272  0.805309  1.447437  0.027947  3.0  5.0
'''

# Boolean mask that is True wherever a value is missing.
missing_mask = pd.isnull(df1)
print(missing_mask)
'''
                A      B      C      D      F      E
2013-01-01  False  False  False  False   True  False
2013-01-02  False  False  False  False  False  False
2013-01-03  False  False  False  False  False   True
2013-01-04  False  False  False  False  False   True
'''

Operations

Stats

Operations in general exclude missing data.

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Sample frame: six consecutive days (rows) by four random columns A-D.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

# Operations in general exclude missing data.
# print(df.dropna())

# Means along each axis (results are computed and discarded here).
df.mean(axis=0)  # one mean per column; same as df.mean()
df.mean(axis=1)  # one mean per row

print(df)
'''
                   A         B         C         D
2013-01-01 -0.677487 -0.807600 -0.487288  0.025362
2013-01-02 -0.647777  1.900635  0.372034  0.785723
2013-01-03  0.893734  0.837393  1.175039 -0.235330
2013-01-04 -0.334574 -0.250514  0.474364  1.041698
2013-01-05  0.264945  1.679856 -0.716078  1.298507
2013-01-06  0.051315  1.352663  0.678500  2.070729
'''
# shift(2) moves the values two rows down: the first two slots become NaN
# and the last two original values fall off the end.
unshifted = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates)
s = unshifted.shift(2)

print(s)
'''
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64
'''

# Subtract s from every column, matching on the row labels. Rows where s is
# NaN come out as NaN in every column.
print(df.sub(s, axis='index'))  # axis='index' is the same as axis=0
'''
                   A         B         C         D
2013-01-01       NaN       NaN       NaN       NaN
2013-01-02       NaN       NaN       NaN       NaN
2013-01-03 -0.106266 -0.162607  0.175039 -1.235330
2013-01-04 -3.334574 -3.250514 -2.525636 -1.958302
2013-01-05 -4.735055 -3.320144 -5.716078 -3.701493
2013-01-06       NaN       NaN       NaN       NaN
'''
# apply() runs a function on each column; here a running total.
print(df.apply(np.cumsum))
'''
                   A         B         C         D
2013-01-01 -0.677487 -0.807600 -0.487288  0.025362
2013-01-02 -1.325264  1.093034 -0.115254  0.811084
2013-01-03 -0.431530  1.930427  1.059785  0.575754
2013-01-04 -0.766105  1.679913  1.534150  1.617452
2013-01-05 -0.501159  3.359769  0.818072  2.915959
2013-01-06 -0.449844  4.712432  1.496572  4.986688
'''


def _value_range(column):
    """Return the spread (maximum minus minimum) of one column."""
    return column.max() - column.min()


# A function that reduces a column to a scalar yields one value per column.
print(df.apply(_value_range))
'''
A    1.571221
B    2.708235
C    1.891117
D    2.306059
dtype: float64
'''

Histogramming

See more at Histogramming and Discretization

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt


# Ten random integers drawn from 0..6 (the upper bound 7 is excluded).
draws = np.random.randint(0, 7, size=10)
s = pd.Series(draws)
print(s)
'''
0    2
1    4
2    0
3    5
4    6
5    3
6    5
7    2
8    3
9    6
dtype: int32
'''
# value_counts() tallies how many times each distinct value occurs.
counts = s.value_counts()
print(counts)
'''
6    2
5    2
3    2
2    2
4    1
0    1
dtype: int64
'''

String Methods

# The .str accessor applies a string method to every element; the missing
# value (np.nan) is passed through unchanged.
words = ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']
s = pd.Series(words)
print(s)

lowered = s.str.lower()  # every letter in lower case
print(lowered)

uppered = s.str.upper()  # every letter in upper case
print(uppered)

Merge

Concat

See the Merging section

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Ten rows of random data with default integer row and column labels.
df = pd.DataFrame(np.random.randn(10, 4))

# Break the frame into three consecutive row chunks: 3 + 4 + 3 rows.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
first_piece = pieces[0]
print(first_piece)
'''
          0         1         2         3
0 -0.524490  0.940037  0.866867 -0.030973
1  0.679116  1.187920  1.519773 -2.596930
2 -0.526557  0.436916 -1.804245  0.058277
'''

# Gluing the chunks back together row-wise reproduces the original frame.
rebuilt = pd.concat(pieces)
print(rebuilt)
'''
          0         1         2         3
0 -0.524490  0.940037  0.866867 -0.030973
1  0.679116  1.187920  1.519773 -2.596930
2 -0.526557  0.436916 -1.804245  0.058277
3 -1.204341  0.771885  0.474900 -0.308840
4 -0.018233 -0.405723 -0.344591 -0.454778
5 -1.255896  0.352891  0.231837 -0.802345
6  0.777226  0.252132 -0.252539 -0.779598
7  1.726603  0.210850  0.118263 -0.082848
8 -0.507362 -0.265372 -0.468006  0.997232
9  0.825417 -1.098757 -0.920184  0.227833
'''

Join

SQL style merges. See the Database style joining

import pandas as pd

import numpy as np


# Both frames use the key 'foo' twice, so the merge pairs every left row
# with every right row: 2 x 2 = 4 result rows.
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

print(left)
'''
   key  lval
0  foo     1
1  foo     2
'''
print(right)
'''
   key  rval
0  foo     4
1  foo     5
'''
# SQL-style join on the shared 'key' column.
merged = left.merge(right, on='key')
print(merged)
'''
   key  lval  rval
0  foo     1     4
1  foo     1     5
2  foo     2     4
3  foo     2     5
'''

Another example that can be given is:

import pandas as pd

import numpy as np


# Here every key occurs once per frame, so each left row finds exactly one
# partner on the right and the result has two rows.
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

print(left)
'''
   key  lval
0  foo     1
1  bar     2
'''
print(right)
'''
   key  rval
0  foo     4
1  bar     5
'''
# SQL-style join on the shared 'key' column.
merged = left.merge(right, on='key')
print(merged)
'''
   key  lval  rval
0  foo     1     4
1  bar     2     5
'''

Append

Append rows to a dataframe. See the Appending

import pandas as pd

import numpy as np


# Four rows of random data in columns A-D, with default integer row labels.
df = pd.DataFrame(np.random.randn(4, 4), columns=['A', 'B', 'C', 'D'])
print(df)
'''
          A         B         C         D
0 -0.131941  0.687743  0.096007  0.421632
1  1.299803  0.878197  0.734132 -0.685885
2  1.578180  0.371533  0.914458  0.603601
3  0.844306  1.265807  0.039494  1.894346
'''
# The row at position 3 (the fourth and last row), returned as a Series.
s = df.iloc[3]

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is its replacement. The Series is first turned into a one-row
# frame (to_frame() gives a single column, .T flips it into a row), and
# ignore_index=True renumbers the result 0..4.
appended = pd.concat([df, s.to_frame().T], ignore_index=True)
print(appended)
'''
          A         B         C         D
0 -0.131941  0.687743  0.096007  0.421632
1  1.299803  0.878197  0.734132 -0.685885
2  1.578180  0.371533  0.914458  0.603601
3  0.844306  1.265807  0.039494  1.894346
4  0.844306  1.265807  0.039494  1.894346
'''

Grouping

By "group by" we mean a process involving one or more of the following steps:

1. Splitting the data into groups based on some criteria
2. Applying a function to each group independently
3. Combining the results into a data structure

See the Grouping section

import pandas as pd

import numpy as np


# Two label columns (A, B) to group on and two numeric columns (C, D) to
# aggregate.
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})

print(df)
'''
     A      B         C         D
0  foo    one -0.805533  0.878175
1  bar    one  0.587196  1.019560
2  foo    two  0.428103 -0.288053
3  bar  three  0.085747  0.479096
4  foo    two -0.460235 -0.323406
5  bar    two -1.180654 -1.925048
6  foo    one  1.866390 -0.750412
7  foo  three  0.146175 -0.692545
'''

# Sum the numeric columns for each distinct value of A.
# The numeric columns are selected explicitly: since pandas 2.0, sum() no
# longer drops non-numeric columns by default, so a bare
# df.groupby('A').sum() would also "sum" (concatenate) the strings in B and
# no longer match the output below.
sum_by_a = df.groupby('A')[['C', 'D']].sum()
print(sum_by_a)
'''
           C         D
A                     
bar -0.50771 -0.426392   # -0.50771=0.587196+0.085747-1.180654
foo  1.17490 -1.176241
'''

# Grouping by several columns produces a hierarchical (MultiIndex) result
# with one row per (A, B) combination that occurs in the data.
sum_by_ab = df.groupby(['A', 'B'])[['C', 'D']].sum()
print(sum_by_ab)
'''
                  C         D
A   B                        
bar one    0.587196  1.019560
    three  0.085747  0.479096
    two   -1.180654 -1.925048
foo one    1.060857  0.127763
    three  0.146175 -0.692545
    two   -0.032132 -0.611460
'''

Reshaping

See the sections on Hierarchical Indexing and Reshaping .

Stack

import pandas as pd
import numpy as np

# Pair up the two label lists element by element to get the
# (first, second) tuples that make up a two-level row index.
first_level = ['bar', 'bar', 'baz', 'baz',
               'foo', 'foo', 'qux', 'qux']
second_level = ['one', 'two', 'one', 'two',
                'one', 'two', 'one', 'two']
tuples = list(zip(first_level, second_level))

print(tuples)
'''
[('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')]
'''
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df)
'''
                     A         B
first second                    
bar   one    -3.084166 -1.439038
      two     0.692997 -0.267035
baz   one    -0.183196  0.616568
      two     0.055133 -0.214597
foo   one     0.503489  0.046369
      two     0.978864 -0.185691
qux   one     0.945434 -0.199447
      two     0.839038  0.193106
'''

# Keep only the first four rows (the 'bar' and 'baz' groups).
df2 = df.iloc[:4]

print(df2)
'''
                     A         B
first second                    
bar   one    -3.084166 -1.439038
      two     0.692997 -0.267035
baz   one    -0.183196  0.616568
      two     0.055133 -0.214597
'''
# stack() folds the column labels into a new innermost index level, turning
# the 4x2 frame into a Series of 8 values.
stacked = df2.stack()
print(stacked)
'''
first  second   
bar    one     A   -3.084166
               B   -1.439038
       two     A    0.692997
               B   -0.267035
baz    one     A   -0.183196
               B    0.616568
       two     A    0.055133
               B   -0.214597
dtype: float64
'''
# unstack() is the inverse; with no argument it moves the LAST index level
# back into the columns.
print(stacked.unstack())
'''
                     A         B
first second                    
bar   one    -3.084166 -1.439038
      two     0.692997 -0.267035
baz   one    -0.183196  0.616568
      two     0.055133 -0.214597
'''
# Passing a level number picks which level becomes the columns:
# level 1 is 'second' ...
print(stacked.unstack(1))
'''
second        one       two
first                      
bar   A -3.084166  0.692997
      B -1.439038 -0.267035
baz   A -0.183196  0.055133
      B  0.616568 -0.214597
'''
# ... and level 0 is 'first'.
print(stacked.unstack(0))
'''
first          bar       baz
second                      
one    A -3.084166 -0.183196
       B -1.439038  0.616568
two    A  0.692997  0.055133
       B -0.267035 -0.214597
'''

Pivot Tables

See the section on Pivot Tables .

import pandas as pd
import numpy as np

# Twelve rows: three label columns (A, B, C) built by repeating short
# patterns, plus two columns of random numbers (D, E).
raw_columns = {
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
}
df = pd.DataFrame(raw_columns)

print(df)
'''
        A  B    C         D         E
0     one  A  foo -0.082127 -0.526810
1     one  B  foo -0.657548 -1.120610
2     two  C  foo  0.332124 -0.180561
3   three  A  bar  1.181363  0.078891
4     one  B  bar -0.787990  0.427932
5     one  C  bar  0.897438  0.834303
6     two  A  foo -0.156936  1.245256
7   three  B  foo -0.453510  0.209916
8     one  C  foo  0.936943  0.219536
9     one  A  bar -0.972587 -1.417912
10    two  B  bar  1.456860 -0.198735
11  three  C  bar -0.750585  0.108095
'''
# Spread the D values into a table: one row per (A, B) pair, one column per
# value of C. Combinations that never occur in the data show up as NaN.
table = df.pivot_table(values='D', index=['A', 'B'], columns=['C'])
print(table)
'''
C             bar       foo
A     B                    
one   A -0.972587 -0.082127
      B -0.787990 -0.657548
      C  0.897438  0.936943
three A  1.181363       NaN
      B       NaN -0.453510
      C -0.750585       NaN
two   A       NaN -0.156936
      B  1.456860       NaN
      C       NaN  0.332124
'''

Time Series

See the Time Series section

import pandas as pd
import numpy as np

# Frequency aliases: pandas 2.2 deprecated the upper-case aliases 'S' and
# 'H' and the month-end alias 'M' for date ranges. The spellings used below
# ('s', 'h', '5min', and an explicit MonthEnd offset object) are accepted
# by both older and newer pandas releases.

# Resampling: 100 one-second samples folded into five-minute bins. All 100
# seconds fall into the first bin, so the result has a single row.
rng = pd.date_range('1/1/2012', periods=100, freq='s')

ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

print(ts.resample('5min').sum())

# Time zone representation: start from timestamps without a time zone ...
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
print(ts)

# ... declare them to be UTC ...
ts_utc = ts.tz_localize('UTC')
print(ts_utc)

# ... and convert to another time zone.
print(ts_utc.tz_convert('US/Eastern'))

# Converting between time span representations: five month-end timestamps.
# An offset object is passed instead of the deprecated 'M' string alias.
rng = pd.date_range('1/1/2012', periods=5, freq=pd.offsets.MonthEnd())
ts = pd.Series(np.random.randn(len(rng)), index=rng)
print(ts)

# Timestamps -> monthly periods ...
ps = ts.to_period()
print(ps)

# ... and periods -> timestamps (start of each period by default).
print(ps.to_timestamp())

# Period arithmetic: quarters of a fiscal year ending in November are
# relabelled as "9 o'clock on the first day of the month after the quarter
# ends": last month of the quarter, +1 month, first hour of that month,
# +9 hours.
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('h', 's') + 9
# The original evaluated ts.head() without printing it, which shows nothing
# when run as a script.
print(ts.head())

Categoricals

For full docs, see the categorical introduction and the API documentation .

import pandas as pd
import numpy as np

# Six students with a raw letter grade each.
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})

print(df)
'''
   id raw_grade
0   1         a
1   2         b
2   3         b
3   4         a
4   5         a
5   6         e
'''
# Convert the raw grades to a categorical data type.
df["grade"] = df["raw_grade"].astype("category")

print(df["grade"])
'''
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
'''
print(df)
'''
   id raw_grade grade
0   1         a     a
1   2         b     b
2   3         b     b
3   4         a     a
4   5         a     a
5   6         e     e
'''
# Rename the categories to more meaningful names. The new names are matched
# to the existing categories by position (a, b, e -> very good, good,
# very bad). Assigning to Series.cat.categories was removed in pandas 2.0;
# rename_categories returns a new Series, so the result is assigned back.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
print(df["grade"])
'''
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]
'''
# Reorder the categories and add the ones that are currently unused
# ("bad", "medium"); the values themselves do not change.
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
print(df["grade"])
'''
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]
'''
# Sorting follows the order of the categories, not alphabetical order.
print(df.sort_values(by="grade"))
'''
   id raw_grade      grade
5   6         e   very bad
1   2         b       good
2   3         b       good
0   1         a  very good
3   4         a  very good
4   5         a  very good
'''
# Grouping by a categorical column: observed=False keeps categories that
# have no rows ("bad", "medium") in the result with a count of 0. It is
# passed explicitly because the default for this parameter differs between
# pandas versions.
print(df.groupby("grade", observed=False).size())
'''
grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64
'''

Plotting

Plotting docs.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# A random walk: 1000 daily random steps accumulated with cumsum().
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()

# Figure '1' holds two stacked subplots (2 rows x 1 column) showing the same
# series drawn in two ways.
plt.figure('1');
plt.subplot(211)  # top panel: pandas' own plotting method
ts.plot()

plt.subplot(212)  # bottom panel: the series handed to pyplot directly
plt.plot(ts)
# plt.legend(loc='best');plt.show()


# Four random walks sharing the index of ts, one per column.
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                      columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
# Plot all four columns with a legend, then display the figures.
# NOTE(review): DataFrame.plot() may open a figure of its own, which would
# leave the one created by plt.figure() empty -- confirm.
plt.figure(); df.plot(); plt.legend(loc='best');plt.show()

Getting Data In/Out

CSV

Writing to a csv file

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ts is only used to supply a four-day DatetimeIndex for the frame below.
ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4))
df = pd.DataFrame(np.random.randn(4, 4), index=ts.index, columns=list('ABCD'))

csv_path = 'foo.csv'

# Write the frame to a CSV file; the index is written as the first column.
df.to_csv(csv_path)

# Read the file back. Without further options the index column comes back
# as an ordinary column named "Unnamed: 0" (see the output below).
datas = pd.read_csv(csv_path)

print(df)
'''
                   A         B         C         D
2000-01-01 -0.886796  0.150827  1.891757  0.703912
2000-01-02 -0.174584 -2.120584  0.251963 -1.786527
2000-01-03 -0.190375  0.603245  0.965307  0.259912
2000-01-04  0.615358  0.432191  0.781446  0.883223
'''
print(datas)
'''
   Unnamed: 0         A         B         C         D
0  2000-01-01 -0.886796  0.150827  1.891757  0.703912
1  2000-01-02 -0.174584 -2.120584  0.251963 -1.786527
2  2000-01-03 -0.190375  0.603245  0.965307  0.259912
3  2000-01-04  0.615358  0.432191  0.781446  0.883223
'''
# Drop that first column by position to keep just the data columns A-D.
datas2 = datas.iloc[:, 1:]
print(datas2)
'''
          A         B         C         D
0 -0.886796  0.150827  1.891757  0.703912
1 -0.174584 -2.120584  0.251963 -1.786527
2 -0.190375  0.603245  0.965307  0.259912
3  0.615358  0.432191  0.781446  0.883223
'''

HDF5

Reading and writing to HDFStores

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ts is only used to supply a four-day DatetimeIndex for the frame below.
ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4))
df = pd.DataFrame(np.random.randn(4, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])

# Write the frame to an HDF5 store under the key 'df'.
# `key` is passed by keyword: newer pandas releases deprecate passing any
# to_hdf argument other than the path positionally.
# NOTE(review): HDF5 support relies on the optional PyTables package --
# confirm it is installed before running this section.
df.to_hdf('foo.h5', key='df')

# Read the object stored under the same key back from the HDF5 store.
datas = pd.read_hdf('foo.h5', key='df')

print(datas)

Excel

Reading and writing to MS Excel

Writing to an excel file

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ts is only used to supply a four-day DatetimeIndex for the frame below.
ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4))
df = pd.DataFrame(np.random.randn(4, 4), index=ts.index, columns=list('ABCD'))

# Write the frame to the sheet "Sheet1" of an Excel workbook.
df.to_excel('foo.xlsx', sheet_name='Sheet1')

# Read that sheet back. index_col=None keeps every column as data, and
# cells containing the text "NA" are treated as missing values.
datas = pd.read_excel(
    'foo.xlsx',
    sheet_name='Sheet1',
    index_col=None,
    na_values=['NA'],
)

print(datas)

Gotchas

If you are trying an operation and you see an exception like:

 
>>> if pd.Series([False, True, False]):
...     print("I was true")
Traceback
    ...
ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().
 
 

See Comparisons for an explanation and what to do.

See Gotchas as well.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# A Series has no single truth value, so `if ds:` raises the ValueError
# quoted above. Test the elements one at a time instead (or reduce the
# Series explicitly with ds.any() / ds.all()).
ds = pd.Series([False, True, False])
print(ds)

# Print "true" once for every element that is False. A plain loop replaces
# the original list comprehension, which was used only for its print side
# effect and built a throwaway list of None values.
for flag in ds:
    if not flag:
        print("true")