使用函数或映射进行值替换
# Build a small demo frame column by column: one row per person,
# with the person's name and age.
df = pd.DataFrame(
    {
        'name': ['jeff', 'herry', 'chris', 'culry'],
        'age': [18, 20, 25, 38],
    }
)
df
    name  age
0   jeff   18
1  herry   20
2  chris   25
3  culry   38
# Per-person record keyed by the values found in df['name'];
# slot 0 feeds the 'pet' column and slot 1 feeds the 'pet_name' column.
info = {
    'jeff': ['dog', 3],
    'herry': ['cat', 2],
    'chris': ['cat', 3],
    'culry': ['cat', 1],
}
# Series.map calls the function once per name. A name that is missing
# from `info` raises KeyError inside the lambda.
df['pet'] = df['name'].map(lambda person: info[person][0])
df['pet_name'] = df['name'].map(lambda person: info[person][1])
df
    name  age  pet  pet_name
0   jeff   18  dog         3
1  herry   20  cat         2
2  chris   25  cat         3
3  culry   38  cat         1
分箱
import numpy as np
import pandas as pd
# Draw 30 random integer ages from the half-open range [4, 100).
# No seed is set, so every run produces a different sample.
ages = np.random.randint(low=4, high=100, size=30)
ages
array([60, 64, 98, 63, 73, 75, 62, 42, 43, 18, 70, 35, 32, 87, 4, 78, 78,
37, 61, 47, 95, 62, 54, 90, 41, 48, 29, 27, 61, 91])
按照指定的边界值来分箱
# Bin edges 10, 20, ..., 100: nine bins, each ten wide.
bins = list(range(10, 101, 10))
# right=False makes every bin closed on the left and open on the right,
# i.e. [10, 20), [20, 30), ...
cutdata = pd.cut(ages, bins=bins, right=False)
cutdata
[[60, 70), [60, 70), [90, 100), [60, 70), [70, 80), ..., [40, 50), [20, 30), [20, 30), [60, 70), [90, 100)]
Length: 30
Categories (9, interval[int64]): [[10, 20) < [20, 30) < [30, 40) < [40, 50) ... [60, 70) < [70, 80) < [80, 90) < [90, 100)]
# Show the categories (one interval per bin) of the binned result.
cutdata.categories
IntervalIndex([[10, 20), [20, 30), [30, 40), [40, 50), [50, 60), [60, 70), [70, 80), [80, 90), [90, 100)],
closed='left',
dtype='interval[int64]')
# Bin `ages` on the edges in `bins` (left-closed, right-open) and name the
# bins '0', '1', ... instead of showing each interval.
# pd.cut needs exactly one label per bin, i.e. len(bins) - 1 labels, so the
# count is derived from `bins` rather than hard-coded; changing the edges
# then keeps the labels in step instead of raising ValueError.
cutdata = pd.cut(
    ages,
    bins,
    right=False,
    labels=[str(i) for i in range(len(bins) - 1)],
)
cutdata
['5', '5', '8', '5', '6', ..., '3', '1', '1', '5', '8']
Length: 30
Categories (9, object): ['0' < '1' < '2' < '3' ... '5' < '6' < '7' < '8']
# Integer position of each value's category; -1 marks a value that fell
# outside every bin and therefore has no category.
cutdata.codes
array([ 5, 5, 8, 5, 6, 6, 5, 3, 3, 0, 6, 2, 2, 7, -1, 6, 6,
2, 5, 3, 8, 5, 4, 8, 3, 3, 1, 1, 5, 8], dtype=int8)
# Count how many values landed in each category.
cutdata.value_counts()
0    1
1    2
2    3
3    5
4    1
5    7
6    5
7    1
8    4
dtype: int64
按照指定的分位数进行分箱
import matplotlib. pyplot as plt
# Quantile-based binning: the edges are taken at the listed quantiles of
# `ages` (0%, 25%, 50%, 75%, 100%), so the four bins are quartiles.
qcutdata = pd.qcut(
    x=ages,
    q=[0, 0.25, 0.5, 0.75, 1],
)
qcutdata
[(41.25, 61.0], (61.0, 74.5], (74.5, 98.0], (61.0, 74.5], (61.0, 74.5], ..., (41.25, 61.0], (3.999, 41.25], (3.999, 41.25], (41.25, 61.0], (74.5, 98.0]]
Length: 30
Categories (4, interval[float64]): [(3.999, 41.25] < (41.25, 61.0] < (61.0, 74.5] < (74.5, 98.0]]