import pandas as pd
import numpy as np
# Eight fruit labels: the four-item pattern below repeated twice.
values = pd.Series(2 * ['apple', 'orange', 'apple', 'apple'])
values
0 apple
1 orange
2 apple
3 apple
4 apple
5 orange
6 apple
7 apple
dtype: object
# Distinct labels in `values`, in order of first appearance.
pd.unique(values)
array(['apple', 'orange'], dtype=object)
# Frequency of each distinct label. The Series method is used because the
# top-level pd.value_counts() function is deprecated in recent pandas.
values.value_counts()
apple 6
orange 2
dtype: int64
# Dimension-table representation: `values` holds integer keys and `dim`
# holds the distinct labels those keys index into.
dim = pd.Series(['apple', 'orange'])
values = pd.Series(2 * [0, 1, 0, 0])
values
0 0
1 1
2 0
3 0
4 0
5 1
6 0
7 0
dtype: int64
# Display the two-entry lookup Series of distinct labels.
dim
0 apple
1 orange
dtype: object
# Rebuild the label sequence by using the integer keys in `values` as
# positions into `dim`.
dim.take(values)
0 apple
1 orange
0 apple
0 apple
0 apple
1 orange
0 apple
0 apple
dtype: object
# Sample frame: one row per basket with a fruit label, a random integer
# count drawn from [3, 15) and a random weight drawn from [0, 4).
fruits = 2 * ['apple', 'orange', 'apple', 'apple']
N = len(fruits)
df = pd.DataFrame(
    {
        'fruits': fruits,
        'basket_id': np.arange(N),
        'count': np.random.randint(3, 15, size=N),
        'weight': np.random.uniform(0, 4, size=N),
    },
    # Explicit column order for display.
    columns=['basket_id', 'fruits', 'count', 'weight'],
)
df
basket_id fruits count weight 0 0 apple 10 2.679414 1 1 orange 8 2.278047 2 2 apple 9 0.087745 3 3 apple 6 2.028924 4 4 apple 11 1.704697 5 5 orange 6 1.352336 6 6 apple 11 2.940028 7 7 apple 4 2.798046
# Categorical copy of the 'fruits' column; `df` itself is not modified here.
fruit_cat = df['fruits'].astype('category')
fruit_cat
0 apple
1 orange
2 apple
3 apple
4 apple
5 orange
6 apple
7 apple
Name: fruits, dtype: category
Categories (2, object): [apple, orange]
# Pull out the array backing the categorical Series and inspect its type.
c = fruit_cat.values
type(c)
pandas.core.arrays.categorical.Categorical
# The distinct category labels.
c.categories
Index(['apple', 'orange'], dtype='object')
# Integer code per element; each code is a position in `c.categories`.
c.codes
array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)
# Convert the column to categorical by assigning the result back.
df['fruits'] = df['fruits'].astype('category')
df['fruits']
0 apple
1 orange
2 apple
3 apple
4 apple
5 orange
6 apple
7 apple
Name: fruits, dtype: category
Categories (2, object): [apple, orange]
# Build a Categorical directly from a plain Python sequence of labels.
my_categories = pd.Categorical(
    ['foo', 'bar', 'baz', 'foo', 'bar']
)
my_categories
[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]
# Build a Categorical from already-encoded data: each entry of `codes`
# is a position in `categories`.
codes = [0, 1, 2, 0, 0, 1]
categories = ['foo', 'bar', 'baz']
my_cats_2 = pd.Categorical.from_codes(codes=codes, categories=categories)
my_cats_2
[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo, bar, baz]
# 1000 standard-normal samples; peek at the first five.
draws = np.random.randn(1000)
draws[:5]
array([ 1.41984629, 0.25818437, -0.78979829, 0.69114415, 0.58610681])
# Split the samples into four quantile-based bins.
bins = pd.qcut(draws, q=4)
bins
[(0.714, 3.115], (0.0138, 0.714], (-2.7239999999999998, -0.658], (0.0138, 0.714], (0.0138, 0.714], ..., (-2.7239999999999998, -0.658], (0.714, 3.115], (0.0138, 0.714], (0.0138, 0.714], (0.0138, 0.714]]
Length: 1000
Categories (4, interval[float64]): [(-2.7239999999999998, -0.658] < (-0.658, 0.0138] < (0.0138, 0.714] < (0.714, 3.115]]
# Same four-way quantile split, with named labels instead of intervals.
bins = pd.qcut(draws, q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
bins
[Q4, Q3, Q1, Q3, Q3, ..., Q1, Q4, Q3, Q3, Q3]
Length: 1000
Categories (4, object): [Q1 < Q2 < Q3 < Q4]
# Wrap the bin labels in a named Series so the grouped result carries a
# 'quartile' column once the index is reset.
bins = pd.Series(bins, name='quartile')
# observed=False is spelled out because recent pandas warns when a
# categorical grouper relies on the default, which is slated to change.
results = (
    pd.Series(draws)
    .groupby(bins, observed=False)
    .agg(['count', 'min', 'max'])
    .reset_index()
)
results
quartile count min max 0 Q1 250 -2.722817 -0.669126 1 Q2 250 -0.654161 0.011138 2 Q3 250 0.016389 0.713528 3 Q4 250 0.714217 3.115205
# Memory comparison set-up: N random draws, plus N string labels stored
# both as a plain Series and as a categorical Series.
N = 100000
draws = pd.Series(np.random.randn(N))
repeats = N // 4
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * repeats)
categories = labels.astype('category')
# Footprint of the plain (non-categorical) labels.
labels.memory_usage()
800080
# Footprint of the categorical version of the same labels.
categories.memory_usage()
100272
# Letters a-d repeated twice, then the same data as a categorical Series.
s = pd.Series(list('abcd') * 2)
cat_s = s.astype('category')
cat_s
0 a
1 b
2 c
3 d
4 a
5 b
6 c
7 d
dtype: category
Categories (4, object): [a, b, c, d]
# Integer codes, reached through the Series `.cat` accessor.
cat_s.cat.codes
0 0
1 1
2 2
3 3
4 0
5 1
6 2
7 3
dtype: int8
# Category labels, reached through the Series `.cat` accessor.
cat_s.cat.categories
Index(['a', 'b', 'c', 'd'], dtype='object')
# Declare the full category set, including 'e', which does not occur in
# the data; set_categories returns a new Series.
actual_categories = list('abcde')
cat_s2 = cat_s.cat.set_categories(actual_categories)
cat_s2
0 a
1 b
2 c
3 d
4 a
5 b
6 c
7 d
dtype: category
Categories (5, object): [a, b, c, d, e]
# Counts per declared category.
cat_s2.value_counts()
d 2
c 2
b 2
a 2
e 0
dtype: int64
# Create the categorical Series in one step via dtype=, then expand it
# into one indicator column per category.
cat_s = pd.Series(list('abcd') * 2, dtype='category')
pd.get_dummies(cat_s)
a b c d 0 1 0 0 0 1 0 1 0 0 2 0 0 1 0 3 0 0 0 1 4 1 0 0 0 5 0 1 0 0 6 0 0 1 0 7 0 0 0 1
# Twelve rows cycling through keys a, b, c with float values 0.0 .. 11.0.
# The float stop is written as `12.0`: the original `12 .` (space before
# the dot) is not a float literal and fails to parse.
df = pd.DataFrame({
    'key': ['a', 'b', 'c'] * 4,
    'value': np.arange(12.0),
})
df
key value 0 a 0.0 1 b 1.0 2 c 2.0 3 a 3.0 4 b 4.0 5 c 5.0 6 a 6.0 7 b 7.0 8 c 8.0 9 a 9.0 10 b 10.0 11 c 11.0
# Group the 'value' column by 'key' and show the per-key mean.
g = df.groupby('key')['value']
g.mean()
key
a 4.5
b 5.5
c 6.5
Name: value, dtype: float64
# Broadcast each group's mean back onto that group's rows.
g.transform(lambda grp: grp.mean())
0 4.5
1 5.5
2 6.5
3 4.5
4 5.5
5 6.5
6 4.5
7 5.5
8 6.5
9 4.5
10 5.5
11 6.5
Name: value, dtype: float64
# Same per-group mean, requested by aggregation name instead of a lambda.
g.transform('mean')
0 4.5
1 5.5
2 6.5
3 4.5
4 5.5
5 6.5
6 4.5
7 5.5
8 6.5
9 4.5
10 5.5
11 6.5
Name: value, dtype: float64
# The transform function may return a same-sized result: double each value.
g.transform(lambda grp: grp * 2)
0 0.0
1 2.0
2 4.0
3 6.0
4 8.0
5 10.0
6 12.0
7 14.0
8 16.0
9 18.0
10 20.0
11 22.0
Name: value, dtype: float64
# Rank values within each group, largest first.
g.transform(lambda grp: grp.rank(ascending=False))
0 4.0
1 4.0
2 4.0
3 3.0
4 3.0
5 3.0
6 2.0
7 2.0
8 2.0
9 1.0
10 1.0
11 1.0
Name: value, dtype: float64
# Fifteen one-minute timestamps starting 2017-05-20 00:00, paired with
# the integers 0 .. N-1.
N = 15
times = pd.date_range(start='2017-05-20 00:00', periods=N, freq='1min')
df = pd.DataFrame({
    'time': times,
    'values': np.arange(N),
})
df
time values 0 2017-05-20 00:00:00 0 1 2017-05-20 00:01:00 1 2 2017-05-20 00:02:00 2 3 2017-05-20 00:03:00 3 4 2017-05-20 00:04:00 4 5 2017-05-20 00:05:00 5 6 2017-05-20 00:06:00 6 7 2017-05-20 00:07:00 7 8 2017-05-20 00:08:00 8 9 2017-05-20 00:09:00 9 10 2017-05-20 00:10:00 10 11 2017-05-20 00:11:00 11 12 2017-05-20 00:12:00 12 13 2017-05-20 00:13:00 13 14 2017-05-20 00:14:00 14
# Index by time, then count the rows falling in each 5-minute window.
(
    df.set_index('time')
    .resample('5min')
    .count()
)
values time 2017-05-20 00:00:00 5 2017-05-20 00:05:00 5 2017-05-20 00:10:00 5
# Three rows per timestamp, one for each of the keys a, b, c, with float
# values 0.0 .. 3N-1. The multiplier is written as `3.0`: the original
# `3 .` (space before the dot) is not a float literal and fails to parse.
df2 = pd.DataFrame({
    'time': times.repeat(3),
    'key': np.tile(['a', 'b', 'c'], N),
    'value': np.arange(N * 3.0),
})
df2
time key value 0 2017-05-20 00:00:00 a 0.0 1 2017-05-20 00:00:00 b 1.0 2 2017-05-20 00:00:00 c 2.0 3 2017-05-20 00:01:00 a 3.0 4 2017-05-20 00:01:00 b 4.0 5 2017-05-20 00:01:00 c 5.0 6 2017-05-20 00:02:00 a 6.0 7 2017-05-20 00:02:00 b 7.0 8 2017-05-20 00:02:00 c 8.0 9 2017-05-20 00:03:00 a 9.0 10 2017-05-20 00:03:00 b 10.0 11 2017-05-20 00:03:00 c 11.0 12 2017-05-20 00:04:00 a 12.0 13 2017-05-20 00:04:00 b 13.0 14 2017-05-20 00:04:00 c 14.0 15 2017-05-20 00:05:00 a 15.0 16 2017-05-20 00:05:00 b 16.0 17 2017-05-20 00:05:00 c 17.0 18 2017-05-20 00:06:00 a 18.0 19 2017-05-20 00:06:00 b 19.0 20 2017-05-20 00:06:00 c 20.0 21 2017-05-20 00:07:00 a 21.0 22 2017-05-20 00:07:00 b 22.0 23 2017-05-20 00:07:00 c 23.0 24 2017-05-20 00:08:00 a 24.0 25 2017-05-20 00:08:00 b 25.0 26 2017-05-20 00:08:00 c 26.0 27 2017-05-20 00:09:00 a 27.0 28 2017-05-20 00:09:00 b 28.0 29 2017-05-20 00:09:00 c 29.0 30 2017-05-20 00:10:00 a 30.0 31 2017-05-20 00:10:00 b 31.0 32 2017-05-20 00:10:00 c 32.0 33 2017-05-20 00:11:00 a 33.0 34 2017-05-20 00:11:00 b 34.0 35 2017-05-20 00:11:00 c 35.0 36 2017-05-20 00:12:00 a 36.0 37 2017-05-20 00:12:00 b 37.0 38 2017-05-20 00:12:00 c 38.0 39 2017-05-20 00:13:00 a 39.0 40 2017-05-20 00:13:00 b 40.0 41 2017-05-20 00:13:00 c 41.0 42 2017-05-20 00:14:00 a 42.0 43 2017-05-20 00:14:00 b 43.0 44 2017-05-20 00:14:00 c 44.0
# Group by key and by 5-minute window on the time index, then sum.
# pd.Grouper(freq=...) replaces pd.TimeGrouper, which emitted
# "FutureWarning: pd.TimeGrouper is deprecated and will be removed;
# Please use pd.Grouper(freq=...)" and is gone from later pandas releases.
time_key = pd.Grouper(freq='5min')
resampled = (
    df2.set_index('time')
    .groupby(['key', time_key])
    .sum()
)
resampled
value key time a 2017-05-20 00:00:00 30.0 2017-05-20 00:05:00 105.0 2017-05-20 00:10:00 180.0 b 2017-05-20 00:00:00 35.0 2017-05-20 00:05:00 110.0 2017-05-20 00:10:00 185.0 c 2017-05-20 00:00:00 40.0 2017-05-20 00:05:00 115.0 2017-05-20 00:10:00 190.0