import pandas as pd
import numpy as np
import matplotlib.pyplot as pltx
In [3]:
# Daily index of six consecutive days starting on 2013-01-01.
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates
Out[3]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
In [4]:
# 6x4 frame of standard-normal samples: one row per entry of `dates`,
# columns labelled A through D. Values differ on every run (no seed is set).
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df
Out[4]:
| | A | B | C | D |
---|---|---|---|---|
2013-01-01 | 0.043660 | 0.914219 | 1.364281 | 0.960460 |
2013-01-02 | 0.245818 | 0.582317 | 0.456372 | -0.734680 |
2013-01-03 | -0.997398 | -0.476202 | 0.967015 | 0.089730 |
2013-01-04 | -1.132148 | 0.867161 | 0.458086 | 0.797743 |
2013-01-05 | -1.226727 | 1.524988 | -1.980305 | 0.694533 |
2013-01-06 | 1.695086 | 0.796078 | -0.688947 | -0.910752 |
In [5]:
# Build a frame from a dict of column specs. Scalars ('A', 'B', 'F') are
# broadcast to the length set by the array-like columns (four rows), and
# each column keeps its own dtype.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
df2
Out[5]:
| | A | B | C | D | E | F |
---|---|---|---|---|---|---|
0 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
1 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
2 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
3 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
In [6]:
# Show the dtype of every column of df2 (each column has its own).
df2.dtypes
Out[6]:
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
In [11]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()
Out[11]:
| | A | B | C | D |
---|---|---|---|---|
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
mean | -0.228618 | 0.701427 | 0.096084 | 0.149506 |
std | 1.131678 | 0.657413 | 1.229257 | 0.810483 |
min | -1.226727 | -0.476202 | -1.980305 | -0.910752 |
25% | -1.098461 | 0.635757 | -0.402617 | -0.528577 |
50% | -0.476869 | 0.831620 | 0.457229 | 0.392131 |
75% | 0.195279 | 0.902454 | 0.839783 | 0.771940 |
max | 1.695086 | 1.524988 | 1.364281 | 0.960460 |
In [12]:
# Transpose: the column labels A-D become rows and the dates become columns.
df.T
Out[12]:
| | 2013-01-01 00:00:00 | 2013-01-02 00:00:00 | 2013-01-03 00:00:00 | 2013-01-04 00:00:00 | 2013-01-05 00:00:00 | 2013-01-06 00:00:00 |
---|---|---|---|---|---|---|
A | 0.043660 | 0.245818 | -0.997398 | -1.132148 | -1.226727 | 1.695086 |
B | 0.914219 | 0.582317 | -0.476202 | 0.867161 | 1.524988 | 0.796078 |
C | 1.364281 | 0.456372 | 0.967015 | 0.458086 | -1.980305 | -0.688947 |
D | 0.960460 | -0.734680 | 0.089730 | 0.797743 | 0.694533 | -0.910752 |
In [13]:
# Sort along axis 1 (the column labels) in descending order: D, C, B, A.
# The result is only displayed, not assigned back to df.
df.sort_index(axis=1, ascending=False)
Out[13]:
| | D | C | B | A |
---|---|---|---|---|
2013-01-01 | 0.960460 | 1.364281 | 0.914219 | 0.043660 |
2013-01-02 | -0.734680 | 0.456372 | 0.582317 | 0.245818 |
2013-01-03 | 0.089730 | 0.967015 | -0.476202 | -0.997398 |
2013-01-04 | 0.797743 | 0.458086 | 0.867161 | -1.132148 |
2013-01-05 | 0.694533 | -1.980305 | 1.524988 | -1.226727 |
2013-01-06 | -0.910752 | -0.688947 | 0.796078 | 1.695086 |
In [14]:
# Reorder the rows by the values in column B, smallest first.
df.sort_values(by='B')
Out[14]:
| | A | B | C | D |
---|---|---|---|---|
2013-01-03 | -0.997398 | -0.476202 | 0.967015 | 0.089730 |
2013-01-02 | 0.245818 | 0.582317 | 0.456372 | -0.734680 |
2013-01-06 | 1.695086 | 0.796078 | -0.688947 | -0.910752 |
2013-01-04 | -1.132148 | 0.867161 | 0.458086 | 0.797743 |
2013-01-01 | 0.043660 | 0.914219 | 1.364281 | 0.960460 |
2013-01-05 | -1.226727 | 1.524988 | -1.980305 | 0.694533 |
In [15]:
# Apply np.cumsum to each column, giving a running total down the rows.
df.apply(np.cumsum)
Out[15]:
| | A | B | C | D |
---|---|---|---|---|
2013-01-01 | 0.043660 | 0.914219 | 1.364281 | 0.960460 |
2013-01-02 | 0.289478 | 1.496535 | 1.820653 | 0.225781 |
2013-01-03 | -0.707920 | 1.020334 | 2.787668 | 0.315511 |
2013-01-04 | -1.840068 | 1.887495 | 3.245754 | 1.113254 |
2013-01-05 | -3.066794 | 3.412483 | 1.265449 | 1.807786 |
2013-01-06 | -1.371708 | 4.208561 | 0.576502 | 0.897035 |
In [16]:
# Per-column spread: the lambda receives one column at a time and returns
# its maximum minus its minimum, so the result has one value per column.
df.apply(lambda x: x.max() - x.min())
Out[16]:
A 2.921813
B 2.001190
C 3.344586
D 1.871212
dtype: float64
In [18]:
# Ten random integers in the range 0..6 (the upper bound 7 is exclusive).
s = pd.Series(data=np.random.randint(low=0, high=7, size=10))
s
Out[18]:
0 3
1 3
2 3
3 4
4 3
5 1
6 6
7 3
8 1
9 4
dtype: int32
In [19]:
# Count how often each distinct value occurs in s, most frequent first.
s.value_counts()
Out[19]:
3 5
4 2
1 2
6 1
dtype: int64
In [20]:
# Mixed-case strings with one missing entry. The vectorised .str accessor
# lower-cases every string and leaves the NaN as NaN.
s = pd.Series(
    data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
)
s.str.lower()
Out[20]:
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8