import pandas as pd
from pandas import Series, DataFrame
import numpy as np
Series基础
# Create a Series of four integers, labelling each value with a string index.
obj = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
obj
d 4
b 7
a -5
c 3
dtype: int64
# Label the index axis and the Series itself; both names appear in the repr.
obj.index.name = 'index'
obj.name = 'Series Name'
obj
index
d 4
b 7
a -5
c 3
Name: Series Name, dtype: int64
# Inspect the two halves of the Series: its labels and its underlying values.
print('左边的index:', obj.index)
print('右边的values:', obj.values)
# Select a single element by label.
print('索引a:', obj['a'])
# `in` tests membership against the index labels, not the values.
print('判断:', 'b' in obj)
# A list of labels selects several elements in the requested order.
print('\n索引b/d/c: \n', obj[['b', 'd', 'c']])
# A boolean mask keeps only the elements where the condition holds.
print('\n大于0的:\n', obj[obj > 0])
左边的index: Index(['d', 'b', 'a', 'c'], dtype='object', name='index')
右边的values: [ 4 7 -5 3]
索引a: -5
判断: True
索引b/d/c:
index
b 7
d 4
c 3
Name: Series Name, dtype: int64
大于0的:
index
d 4
b 7
c 3
Name: Series Name, dtype: int64
# Element-wise missing-value check on the strictly positive entries.
obj[obj > 0].isnull()
index
d False
b False
c False
Name: Series Name, dtype: bool
DataFrame基础
# Column-oriented input: each key is a column label and each list holds that
# column's values (all lists have the same length, one entry per row).
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
# Build a DataFrame from the dict; rows get the default integer index.
frame = pd.DataFrame(data=data)
frame
pop
state
year
0
1.5
Ohio
2000
1
1.7
Ohio
2001
2
3.6
Ohio
2002
3
2.4
Nevada
2001
4
2.9
Nevada
2002
5
3.2
Nevada
2003
# Rebuild the frame with an explicit column order, then preview the first
# three rows.
df = pd.DataFrame(frame, columns=['year', 'state', 'pop'])
df.head(3)
year
state
pop
0
2000
Ohio
1.5
1
2001
Ohio
1.7
2
2002
Ohio
3.6
# Preview the last three rows.
df.tail(3)
year
state
pop
3
2001
Nevada
2.4
4
2002
Nevada
2.9
5
2003
Nevada
3.2
# Build a second frame from the same dict with custom row labels.
# 'debt' is not a key of `data`, so that column is created and filled with NaN.
df2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
df2
year
state
pop
debt
one
2000
Ohio
1.5
NaN
two
2001
Ohio
1.7
NaN
three
2002
Ohio
3.6
NaN
four
2001
Nevada
2.4
NaN
five
2002
Nevada
2.9
NaN
six
2003
Nevada
3.2
NaN
# Bracket access returns the column as a Series; show its first three entries.
df2['year'].head(3)
one 2000
two 2001
three 2002
Name: year, dtype: int64
# Attribute-style access to the same 'year' column.
df2.year
one 2000
two 2001
three 2002
four 2001
five 2002
six 2003
Name: year, dtype: int64
注意:df2[column] 适用于任何列名;而 df2.column 这种属性访问方式要求列名是合法的 Python 变量名,并且不能与 DataFrame 已有的属性或方法重名(例如本例中的 'pop' 列:df2.pop 得到的是 DataFrame.pop 方法,而不是该列)。
# Select one row by its index label; the result is a Series keyed by column.
df2.loc['three']
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
# Positional slice: rows at positions 2, 3 and 4 (the end bound is exclusive).
df2.iloc[2:5]
year
state
pop
debt
three
2002
Ohio
3.6
NaN
four
2001
Nevada
2.4
NaN
five
2002
Nevada
2.9
NaN
# Positional slice with a step: every second row, starting from the first.
df2.iloc[::2]
year
state
pop
debt
one
2000
Ohio
1.5
NaN
three
2002
Ohio
3.6
NaN
five
2002
Nevada
2.9
NaN
# Overwrite the all-NaN 'debt' column with the floats 0.0 through 5.0; the
# array length matches the number of rows.
df2['debt'] = np.arange(6.0)
df2
year
state
pop
debt
one
2000
Ohio
1.5
0.0
two
2001
Ohio
1.7
1.0
three
2002
Ohio
3.6
2.0
four
2001
Nevada
2.4
3.0
five
2002
Nevada
2.9
4.0
six
2003
Nevada
3.2
5.0
# Assigning to a new label creates the column: a boolean flag that is True
# for the rows whose state is 'Ohio'.
df2['eastern'] = (df2['state'] == 'Ohio')
df2
year
state
pop
debt
eastern
one
2000
Ohio
1.5
0.0
True
two
2001
Ohio
1.7
1.0
True
three
2002
Ohio
3.6
2.0
True
four
2001
Nevada
2.4
3.0
False
five
2002
Nevada
2.9
4.0
False
six
2003
Nevada
3.2
5.0
False
# Remove the 'eastern' column from df2 in place.
del df2['eastern']
# NOTE(review): this inspects `df`, not `df2` (the frame that was just
# modified) — confirm which frame was meant to be shown here.
df.columns
Index(['year', 'state', 'pop'], dtype='object')
# Transpose: index labels become columns and column labels become the index.
df2.T
one
two
three
four
five
six
year
2000
2001
2002
2001
2002
2003
state
Ohio
Ohio
Ohio
Nevada
Nevada
Nevada
pop
1.5
1.7
3.6
2.4
2.9
3.2
debt
0
1
2
3
4
5
# Name both axes; the names are shown in the header of the rendered frame.
# NOTE(review): '行名' literally means "row name" although it is assigned to
# the column axis — '列名' ("column name") may have been intended; confirm.
df2.columns.name = '行名'
df2.index.name = '索引名'
df2
行名
year
state
pop
debt
索引名
one
2000
Ohio
1.5
0.0
two
2001