Python3-pandas

参考:

1、http://pandas.pydata.org/pandas-docs/stable/10min.html

2、http://pandas.pydata.org/pandas-docs/stable/tutorials.html

10 Minutes to pandas


10 Minutes to pandas

Object Creation

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Creating a Series
s = pd.Series([1,3,5,np.nan,6,8])
# print(s)
'''
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
'''

# Creating a DataFrame by passing a numpy array
dates = pd.date_range('20130101', periods=6)
# print(dates)
'''
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
'''

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
# print(df)
'''
                   A         B         C         D
2013-01-01 -2.106954  1.731281  0.252205 -1.257373
2013-01-02 -1.297739 -0.144438  1.405233  0.128684
2013-01-03 -1.515715  0.068778  0.313289  0.127228
2013-01-04 -0.028522  0.732110 -0.289821 -0.145544
2013-01-05  0.505480  0.918043  0.159986 -0.480223
2013-01-06  0.237698 -0.030478  0.920267  1.040430
'''

# Build a DataFrame from a dict whose values mix scalars (repeated on every
# row) with length-4 array-likes.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train"] * 2),
        'F': 'foo',
    }
)
# print(df2)
'''
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
'''
# Having specific dtypes
print(df2.dtypes)
'''
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
'''

Viewing Data


import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Creating a DataFrame by passing a numpy array
dates = pd.date_range('20130101', periods=6)

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
# print(df)
'''
                   A         B         C         D
2013-01-01 -0.144929  0.516025  1.414747  0.956931
2013-01-02 -0.114975 -1.999099 -1.305006 -1.263897
2013-01-03  0.409725 -0.829613  0.010113 -1.991170
2013-01-04  1.282872 -1.338264 -3.314622  0.447457
2013-01-05  0.499045  0.326436  0.117400 -0.188779
2013-01-06 -0.701257 -1.522831 -1.270148 -1.496934
'''

# Each sample-output literal has to open on its own line: a ''' written after
# a '#' is swallowed by the comment, which leaves the output rows below it as
# invalid code.
print(df.head(2))  # view the first 2 rows
'''
                   A         B         C         D
2013-01-01 -0.144929  0.516025  1.414747  0.956931
2013-01-02 -0.114975 -1.999099 -1.305006 -1.263897
'''

print(df.tail(3))  # view the last 3 rows
'''
                   A         B         C         D
2013-01-04  1.282872 -1.338264 -3.314622  0.447457
2013-01-05  0.499045  0.326436  0.117400 -0.188779
2013-01-06 -0.701257 -1.522831 -1.270148 -1.496934
'''

# Display the index, columns, and the underlying numpy data
print(df.index)  # row labels

print(df.columns)  # column labels

print(df.values)  # the underlying values

print(type(df.values))  # <class 'numpy.ndarray'>

print(df.describe())  # Describe shows a quick statistic summary of your data

print(df.T)  # Transposing your data

print(df.sort_index(axis=1, ascending=False))  # sort along axis 1 (the column labels), descending

print(df.sort_values(by='B'))  # sort the rows by the values in column B

Selection


Getting

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Creating a DataFrame by passing a numpy array
dates = pd.date_range('20130101', periods=6)

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

df['A'] # select column A

df[0:3] # the first 3 rows (positional slice, end excluded)

df['20130102':'20130104'] # rows labelled 2013-01-02 through 2013-01-04; label slices include BOTH endpoints

df.loc[dates[0]] # the first row, selected by its label

df.loc[:,['A','B']] # all rows, columns A and B

df.loc['20130102':'20130104',['A','B']] # rows 2013-01-02 through 2013-01-04 (both endpoints included), columns A and B

df.loc['20130102',['A','B']] # row '20130102', columns A and B

df.loc[dates[0],'A'] # scalar at the first row, column A
df.at[dates[0],'A'] # same scalar via the label-based scalar accessor
df.iloc[3] # the 4th row (position 3)
df.iloc[3:5,0:2] # row positions [3,5), column positions [0,2)

df.iloc[[1,2,4],[0,2]] # row positions 1,2,4 and column positions 0,2

df.iloc[1:3,:] # row positions [1,3), all columns

df.iloc[:,1:3] # all rows, column positions [1,3)

df.iloc[1,1] # scalar at row position 1, column position 1

df.iat[1,1] # same scalar via the position-based scalar accessor

df[df.A > 0] # rows where column A is positive; every column (A, B, C, D) is kept

df[df > 0] # same shape as df; entries that are not > 0 are replaced by NaN

df2 = df.copy()

df2['E'] = ['one', 'one','two','three','four','three']


df2[df2['E'].isin(['two','four'])] # rows whose E value is 'two' or 'four'
 


import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Creating a DataFrame by passing a numpy array
dates = pd.date_range('20130101', periods=6)

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))

df['F']=s1

# print(df)
'''
                   A         B         C         D    F
2013-01-01 -0.447267 -1.585050 -0.388142  0.378795  NaN
2013-01-02 -0.045593 -0.876191 -1.910121  0.365615  1.0
2013-01-03  2.137712 -0.384255 -0.940755  1.987526  2.0
2013-01-04 -1.662465 -0.264535 -0.315382 -0.448721  3.0
2013-01-05  0.568981 -0.448420 -0.294313  1.914237  4.0
2013-01-06  1.867176  0.646454  1.968858 -0.290831  5.0
'''
# Setting values by label
df.at[dates[0],'A'] = 0 # row dates[0], column A -> 0

# Setting values by position
df.iat[0,1] = 0 # row position 0, column position 1 (column B here) -> 0

# Setting by assigning with a numpy array
df.loc[:, 'D'] = np.array([5] * len(df)) # overwrite every value in column D with 5

print(df)
'''
                   A         B         C  D    F
2013-01-01  0.000000  0.000000  0.038916  5  NaN
2013-01-02  0.730072  1.325252 -0.166603  5  1.0
2013-01-03  0.727788  1.001638 -0.293277  5  2.0
2013-01-04  1.801973  0.814501 -0.145767  5  3.0
2013-01-05 -0.245231 -0.060449 -0.244515  5  4.0
2013-01-06  0.116800  0.115574 -0.300012  5  5.0
'''
df2 = df.copy()
df2[df2 > 0] = -df2 # negate every positive value (done on a copy, so df is untouched)



Missing Data

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Six consecutive days starting 2013-01-01 serve as the row labels.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

# s1 starts one day later than df, so aligning it on the index leaves the
# first row of column F missing.
s1 = pd.Series(list(range(1, 7)), index=pd.date_range('20130102', periods=6))
df['F'] = s1

# Reindex to the first four dates and add a brand-new column E, which starts
# out entirely missing; then fill E for the first two dates only.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
df1.loc[dates[0]:dates[1], 'E'] = 1

print(df1)
'''
                   A         B         C         D    F    E
2013-01-01  1.777087 -1.400922 -0.719084 -0.025552  NaN  1.0
2013-01-02  0.577299  1.455149 -1.410174 -0.331292  1.0  1.0
2013-01-03  0.078308  0.173087 -1.370062 -1.497506  2.0  NaN
2013-01-04  1.032272  0.805309  1.447437  0.027947  3.0  NaN
'''
# To drop any rows that have missing data.
print(df1.dropna(how='any'))
'''
                   A         B         C         D    F    E
2013-01-02  0.577299  1.455149 -1.410174 -0.331292  1.0  1.0
'''

print(df1.fillna(value=5)) # Filling missing data
'''
                   A         B         C         D    F    E
2013-01-01  1.777087 -1.400922 -0.719084 -0.025552  5.0  1.0
2013-01-02  0.577299  1.455149 -1.410174 -0.331292  1.0  1.0
2013-01-03  0.078308  0.173087 -1.370062 -1.497506  2.0  5.0
2013-01-04  1.032272  0.805309  1.447437  0.027947  3.0  5.0
'''

print(pd.isnull(df1)) # To get the boolean mask where values are nan
'''
                A      B      C      D      F      E
2013-01-01  False  False  False  False   True  False
2013-01-02  False  False  False  False  False  False
2013-01-03  False  False  False  False  False   True
2013-01-04  False  False  False  False  False   True
'''

Operations

Stats

Operations in general exclude missing data.

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Creating a DataFrame by passing a numpy array
dates = pd.date_range('20130101', periods=6)

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

# Operations in general exclude missing data.
# print(df.dropna())

df.mean(0) # same as df.mean(): the mean of each column

df.mean(1) # the mean of each row

print(df)
'''
                   A         B         C         D
2013-01-01 -0.677487 -0.807600 -0.487288  0.025362
2013-01-02 -0.647777  1.900635  0.372034  0.785723
2013-01-03  0.893734  0.837393  1.175039 -0.235330
2013-01-04 -0.334574 -0.250514  0.474364  1.041698
2013-01-05  0.264945  1.679856 -0.716078  1.298507
2013-01-06  0.051315  1.352663  0.678500  2.070729
'''
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)

print(s)
'''
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64
'''

print(df.sub(s, axis=0)) #  df.sub(s, axis='index')
'''
                   A         B         C         D
2013-01-01       NaN       NaN       NaN       NaN
2013-01-02       NaN       NaN       NaN       NaN
2013-01-03 -0.106266 -0.162607  0.175039 -1.235330
2013-01-04 -3.334574 -3.250514 -2.525636 -1.958302
2013-01-05 -4.735055 -3.320144 -5.716078 -3.701493
2013-01-06       NaN       NaN       NaN       NaN
'''
print(df.apply(np.cumsum)) # Applying functions to the data
'''
                   A         B         C         D
2013-01-01 -0.677487 -0.807600 -0.487288  0.025362
2013-01-02 -1.325264  1.093034 -0.115254  0.811084
2013-01-03 -0.431530  1.930427  1.059785  0.575754
2013-01-04 -0.766105  1.679913  1.534150  1.617452
2013-01-05 -0.501159  3.359769  0.818072  2.915959
2013-01-06 -0.449844  4.712432  1.496572  4.986688
'''
print(df.apply(lambda x: x.max() - x.min()))
'''
A    1.571221
B    2.708235
C    1.891117
D    2.306059
dtype: float64
'''

Histogramming

See more at  Histogramming and Discretization

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt


s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
'''
0    2
1    4
2    0
3    5
4    6
5    3
6    5
7    2
8    3
9    6
dtype: int32
'''
print(s.value_counts()) # count how many times each value occurs
'''
6    2
5    2
3    2
2    2
4    1
0    1
dtype: int64
'''

String Methods

s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s)

print(s.str.lower()) # lower-case every string

print(s.str.upper()) # upper-case every string

Merge

Concat

See the  Merging section

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

df = pd.DataFrame(np.random.randn(10, 4))

pieces = [df[:3], df[3:7], df[7:]] # break it into pieces
print(pieces[0])
'''
          0         1         2         3
0 -0.524490  0.940037  0.866867 -0.030973
1  0.679116  1.187920  1.519773 -2.596930
2 -0.526557  0.436916 -1.804245  0.058277
'''

print(pd.concat(pieces))
'''
          0         1         2         3
0 -0.524490  0.940037  0.866867 -0.030973
1  0.679116  1.187920  1.519773 -2.596930
2 -0.526557  0.436916 -1.804245  0.058277
3 -1.204341  0.771885  0.474900 -0.308840
4 -0.018233 -0.405723 -0.344591 -0.454778
5 -1.255896  0.352891  0.231837 -0.802345
6  0.777226  0.252132 -0.252539 -0.779598
7  1.726603  0.210850  0.118263 -0.082848
8 -0.507362 -0.265372 -0.468006  0.997232
9  0.825417 -1.098757 -0.920184  0.227833
'''

Join

SQL style merges. See the  Database style joining

import pandas as pd

import numpy as np


# Both frames repeat the same join key, so merging on it pairs every left row
# with every right row.
left = pd.DataFrame(
    {
        'key': ['foo'] * 2,
        'lval': [1, 2],
    }
)
right = pd.DataFrame(
    {
        'key': ['foo'] * 2,
        'rval': [4, 5],
    }
)

print(left)
'''
   key  lval
0  foo     1
1  foo     2
'''
print(right)
'''
   key  rval
0  foo     4
1  foo     5
'''
print(pd.merge(left, right, on='key'))
'''
   key  lval  rval
0  foo     1     4
1  foo     1     5
2  foo     2     4
3  foo     2     5
'''
Another example that can be given is:

import pandas as pd

import numpy as np


left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})

right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

print(left)
'''
   key  lval
0  foo     1
1  bar     2
'''
print(right)
'''
   key  rval
0  foo     4
1  bar     5
'''
print(pd.merge(left, right, on='key'))
'''
   key  lval  rval
0  foo     1     4
1  bar     2     5
'''

Append

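Append rows to a DataFrame. See the Appending section. Note that DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows.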

import pandas as pd

import numpy as np


df = pd.DataFrame(np.random.randn(4, 4), columns=['A','B','C','D'])
print(df)
'''
          A         B         C         D
0 -0.131941  0.687743  0.096007  0.421632
1  1.299803  0.878197  0.734132 -0.685885
2  1.578180  0.371533  0.914458  0.603601
3  0.844306  1.265807  0.039494  1.894346
'''
s = df.iloc[3]  # the row at position 3 (the 4th row), as a Series
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# The Series is turned into a one-row frame first so that it is added as a
# row rather than as a column.
appended = pd.concat([df, s.to_frame().T], ignore_index=True)
print(appended)
'''
          A         B         C         D
0 -0.131941  0.687743  0.096007  0.421632
1  1.299803  0.878197  0.734132 -0.685885
2  1.578180  0.371533  0.914458  0.603601
3  0.844306  1.265807  0.039494  1.894346
4  0.844306  1.265807  0.039494  1.894346
'''

Grouping

  • Splitting the data into groups based on some criteria
  • Applying a function to each group independently
  • Combining the results into a data structure
See the  Grouping section

import pandas as pd

import numpy as np


df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                           'foo', 'bar', 'foo', 'foo'],
                    'B' : ['one', 'one', 'two', 'three',
                           'two', 'two', 'one', 'three'],
                    'C' : np.random.randn(8),
                    'D' : np.random.randn(8)})

print(df)
'''
     A      B         C         D
0  foo    one -0.805533  0.878175
1  bar    one  0.587196  1.019560
2  foo    two  0.428103 -0.288053
3  bar  three  0.085747  0.479096
4  foo    two -0.460235 -0.323406
5  bar    two -1.180654 -1.925048
6  foo    one  1.866390 -0.750412
7  foo  three  0.146175 -0.692545
'''

# Sum C and D within each group of column A. The numeric columns are selected
# explicitly: since pandas 2.0 a bare .sum() no longer drops the string column
# B but concatenates its values, which would not match the output shown below.
sum_by_a = df.groupby('A')[['C', 'D']].sum()
print(sum_by_a)
'''
           C         D
A                     
bar -0.50771 -0.426392   # -0.50771=0.587196+0.085747-1.180654
foo  1.17490 -1.176241
'''

# With both A and B used as group keys, only the numeric columns C and D are
# left to aggregate.
print(df.groupby(['A','B']).sum())
'''
                  C         D
A   B                        
bar one    0.587196  1.019560
    three  0.085747  0.479096
    two   -1.180654 -1.925048
foo one    1.060857  0.127763
    three  0.146175 -0.692545
    two   -0.032132 -0.611460
'''

Reshaping

See the sections on  Hierarchical Indexing  and  Reshaping .

Stack

import pandas as pd
import numpy as np

# Pair each first-level label with a second-level label: every first-level
# label appears once with 'one' and once with 'two'.
first_level = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
second_level = ['one', 'two'] * 4
tuples = list(zip(first_level, second_level))

print(tuples)
'''
[('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')]
'''
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df)
'''
                     A         B
first second                    
bar   one    -3.084166 -1.439038
      two     0.692997 -0.267035
baz   one    -0.183196  0.616568
      two     0.055133 -0.214597
foo   one     0.503489  0.046369
      two     0.978864 -0.185691
qux   one     0.945434 -0.199447
      two     0.839038  0.193106
'''

df2 = df[:4]

print(df2)
'''
                     A         B
first second                    
bar   one    -3.084166 -1.439038
      two     0.692997 -0.267035
baz   one    -0.183196  0.616568
      two     0.055133 -0.214597
'''
stacked = df2.stack()
print(stacked)
'''
first  second   
bar    one     A   -3.084166
               B   -1.439038
       two     A    0.692997
               B   -0.267035
baz    one     A   -0.183196
               B    0.616568
       two     A    0.055133
               B   -0.214597
dtype: float64
'''
print(stacked.unstack())
'''
                     A         B
first second                    
bar   one    -3.084166 -1.439038
      two     0.692997 -0.267035
baz   one    -0.183196  0.616568
      two     0.055133 -0.214597
'''
print(stacked.unstack(1))
'''
second        one       two
first                      
bar   A -3.084166  0.692997
      B -1.439038 -0.267035
baz   A -0.183196  0.055133
      B  0.616568 -0.214597
'''
print(stacked.unstack(0))
'''
first          bar       baz
second                      
one    A -3.084166 -0.183196
       B -1.439038  0.616568
two    A  0.692997  0.055133
       B -0.267035 -0.214597
'''

Pivot Tables

See the section on  Pivot Tables .

import pandas as pd
import numpy as np

# Twelve rows: three repeating label columns (A, B, C) plus two columns of
# random values (D, E).
df = pd.DataFrame(
    {
        'A': ['one', 'one', 'two', 'three'] * 3,
        'B': list('ABC') * 4,
        'C': (['foo'] * 3 + ['bar'] * 3) * 2,
        'D': np.random.randn(12),
        'E': np.random.randn(12),
    }
)

print(df)
'''
        A  B    C         D         E
0     one  A  foo -0.082127 -0.526810
1     one  B  foo -0.657548 -1.120610
2     two  C  foo  0.332124 -0.180561
3   three  A  bar  1.181363  0.078891
4     one  B  bar -0.787990  0.427932
5     one  C  bar  0.897438  0.834303
6     two  A  foo -0.156936  1.245256
7   three  B  foo -0.453510  0.209916
8     one  C  foo  0.936943  0.219536
9     one  A  bar -0.972587 -1.417912
10    two  B  bar  1.456860 -0.198735
11  three  C  bar -0.750585  0.108095
'''
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']))
'''
C             bar       foo
A     B                    
one   A -0.972587 -0.082127
      B -0.787990 -0.657548
      C  0.897438  0.936943
three A  1.181363       NaN
      B       NaN -0.453510
      C -0.750585       NaN
two   A       NaN -0.156936
      B  1.456860       NaN
      C       NaN  0.332124
'''

Time Series

See the  Time Series section

import pandas as pd
import numpy as np

# 100 one-second timestamps. The lower-case 's' alias is used because the
# upper-case 'S' spelling is deprecated in recent pandas releases.
rng = pd.date_range('1/1/2012', periods=100, freq='s')

ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

# Downsample the per-second data into 5-minute bins and sum each bin.
print(ts.resample('5min').sum())

# Time zone representation
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
print(ts)

ts_utc = ts.tz_localize('UTC')
print(ts_utc)

# Convert to another time zone
print(ts_utc.tz_convert('US/Eastern'))

# Converting between time span representations
# An offset object is passed instead of the 'M' string alias: the string
# spelling for month-end was renamed to 'ME' in recent pandas releases, while
# pd.offsets.MonthEnd() means the same thing in every version.
rng = pd.date_range('1/1/2012', periods=5, freq=pd.offsets.MonthEnd())
ts = pd.Series(np.random.randn(len(rng)), index=rng)
print(ts)

ps = ts.to_period()
print(ps)

print(ps.to_timestamp())

# Quarterly periods whose year ends in November, converted to 9am on the
# first day of the month that follows each quarter end.
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('h', 's') + 9
print(ts.head())  # a bare ts.head() displays nothing when run as a script

Categoricals

For full docs, see the  categorical introduction and the  API documentation .

import pandas as pd
import numpy as np

df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})

print(df)
'''
   id raw_grade
0   1         a
1   2         b
2   3         b
3   4         a
4   5         a
5   6         e
'''
df["grade"] = df["raw_grade"].astype("category")

print(df["grade"]) # Convert the raw grades to a categorical data type.
'''
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
'''
print(df)
'''
   id raw_grade grade
0   1         a     a
1   2         b     b
2   3         b     b
3   4         a     a
4   5         a     a
5   6         e     e
'''
# Rename the categories to more meaningful names. Assigning to
# Series.cat.categories was removed in pandas 2.0; rename_categories returns a
# new Series, so the result is stored back into the column.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
print(df["grade"])
'''
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]
'''
# Reorder the categories and, at the same time, add the ones that are missing.
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
print(df["grade"])
'''
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]
'''
# Sorting follows the order of the categories, not the lexical order of the
# labels.
print(df.sort_values(by="grade"))
'''
   id raw_grade      grade
5   6         e   very bad
1   2         b       good
2   3         b       good
0   1         a  very good
3   4         a  very good
4   5         a  very good
'''
# observed=False keeps the empty categories ("bad", "medium") in the result
# regardless of the pandas version's default for this parameter.
grade_sizes = df.groupby("grade", observed=False).size()
print(grade_sizes)
'''
grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64
'''

Plotting

Plotting  docs.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()

plt.figure('1');
plt.subplot(211)
ts.plot()

plt.subplot(212)
plt.plot(ts)
# plt.legend(loc='best');plt.show()


# Four columns of cumulative sums of random steps, sharing the index of ts.
df = pd.DataFrame(
    np.random.randn(1000, 4),
    index=ts.index,
    columns=['A', 'B', 'C', 'D'],
).cumsum()

# Open a figure, plot the frame, place the legend, then display everything.
plt.figure()
df.plot()
plt.legend(loc='best')
plt.show()

Getting Data In/Out

CSV

Writing to a csv file

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4))
df = pd.DataFrame(np.random.randn(4, 4), index=ts.index,
                      columns=['A', 'B', 'C', 'D'])

# Writing to a csv file
df.to_csv('foo.csv')

# Reading from a csv file
datas=pd.read_csv('foo.csv')

print(df)
'''
                   A         B         C         D
2000-01-01 -0.886796  0.150827  1.891757  0.703912
2000-01-02 -0.174584 -2.120584  0.251963 -1.786527
2000-01-03 -0.190375  0.603245  0.965307  0.259912
2000-01-04  0.615358  0.432191  0.781446  0.883223
'''
print(datas)
'''
   Unnamed: 0         A         B         C         D
0  2000-01-01 -0.886796  0.150827  1.891757  0.703912
1  2000-01-02 -0.174584 -2.120584  0.251963 -1.786527
2  2000-01-03 -0.190375  0.603245  0.965307  0.259912
3  2000-01-04  0.615358  0.432191  0.781446  0.883223
'''
datas2=datas.iloc[:,1:]
print(datas2)
'''
          A         B         C         D
0 -0.886796  0.150827  1.891757  0.703912
1 -0.174584 -2.120584  0.251963 -1.786527
2 -0.190375  0.603245  0.965307  0.259912
3  0.615358  0.432191  0.781446  0.883223
'''

HDF5

Reading and writing to  HDFStores

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4))
df = pd.DataFrame(np.random.randn(4, 4), index=ts.index,
                      columns=['A', 'B', 'C', 'D'])

# Writing to an HDF5 store (presumably needs the optional PyTables package —
# TODO confirm it is installed). The key is passed by keyword because passing
# it positionally to to_hdf is deprecated in recent pandas releases.
df.to_hdf('foo.h5', key='df')

# Reading from an HDF5 store
datas=pd.read_hdf('foo.h5', key='df')

print(datas)

Excel

Reading and writing to MS Excel

Writing to an excel file

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4))
df = pd.DataFrame(np.random.randn(4, 4), index=ts.index,
                      columns=['A', 'B', 'C', 'D'])

# Writing to an Excel file
df.to_excel('foo.xlsx', sheet_name='Sheet1')

# Reading from an Excel file
datas=pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])

print(datas)

Gotchas

If you are trying an operation and you see an exception like:

>>> if pd.Series([False, True, False]):
    print("I was true")
Traceback
    ...
ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().

See Comparisons for an explanation and what to do.

See Gotchas as well.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

ds=pd.Series([False, True, False])
print(ds)
# A Series cannot be used directly in an `if`; test its elements one by one
# (or reduce it first with ds.any() / ds.all()). A plain loop is used because
# printing is a side effect, and "true" is reported for the elements that are
# actually True rather than for the False ones.
for flag in ds:
    if flag:
        print("true")



  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值