import pandas as pd
import numpy as np
# Employer labels that the demo frame samples from.
company = ["A", "B", "C", "D"]

# Ten random rows.  The three draws are made in column order
# (company index, salary, age).
data = pd.DataFrame(
    {
        "company": [
            company[idx] for idx in np.random.randint(0, len(company), 10)
        ],
        "salary": np.random.randint(5, 50, 10),
        "age": np.random.randint(20, 30, 10),
    }
)
data
company salary age 0 A 10 21 1 B 12 21 2 D 41 23 3 B 44 22 4 B 36 21 5 C 22 22 6 B 27 28 7 D 14 20 8 B 18 25 9 A 33 26
# Group rows by employer and materialise the (key, sub-frame) pairs.
group = data.groupby(by="company")
list(group)

# Same again, keyed on the (company, age) pair.
group = data.groupby(["company", "age"])
list(group)

# Mean of every remaining column within each company.
data.groupby(by="company").mean()
salary age company A 21.5 23.5 B 27.4 23.4 C 22.0 22.0 D 27.5 21.5
# Per-company mean salary, indexed by company label.
temp = data.groupby(by="company")["salary"].mean()
# Look each row's company up in that Series to attach its group mean.
data["avg_salary"] = data["company"].map(temp)
data
company salary age avg_salary 0 A 10 21 21.5 1 B 12 21 27.4 2 D 41 23 27.5 3 B 44 22 27.4 4 B 36 21 27.4 5 C 22 22 22.0 6 B 27 28 27.4 7 D 14 20 27.5 8 B 18 25 27.4 9 A 33 26 21.5
# transform() returns a result aligned to the original rows, so the
# per-company mean age can be assigned straight back as a column.
data["avg_age"] = (
    data.groupby(by="company")["age"]
    .transform("mean")
)
data
company salary age avg_salary avg_age 0 A 10 21 21.5 23.5 1 B 12 21 27.4 23.4 2 D 41 23 27.5 21.5 3 B 44 22 27.4 23.4 4 B 36 21 27.4 23.4 5 C 22 22 22.0 22.0 6 B 27 28 27.4 23.4 7 D 14 20 27.5 21.5 8 B 18 25 27.4 23.4 9 A 33 26 21.5 23.5
def get_min(x):
    """Return the row of *x* that comes first when sorted by ``age`` ascending.

    Args:
        x: DataFrame with an ``age`` column.

    Returns:
        The first row of the age-sorted frame, as a Series.
    """
    return x.sort_values("age").iloc[0]
# One row per company: whatever get_min returns for that company's sub-frame.
min_age = data.groupby(by="company").apply(get_min)
min_age
company salary age avg_salary avg_age company A A 10 21 21.5 23.5 B B 12 21 27.4 23.4 C C 22 22 22.0 22.0 D D 14 20 27.5 21.5
# Broadcast each company's minimum age back onto every row.
# Use the string alias "min" (as the avg_age step does with "mean") rather
# than the Python builtin `min`: the alias dispatches to pandas' own grouped
# minimum, and newer pandas releases warn when handed the builtin callable.
data["min_age"] = data.groupby("company")["age"].transform("min")
data
company salary age avg_salary avg_age min_age 0 A 10 21 21.5 23.5 21 1 B 12 21 27.4 23.4 21 2 D 41 23 27.5 21.5 20 3 B 44 22 27.4 23.4 21 4 B 36 21 27.4 23.4 21 5 C 22 22 22.0 22.0 22 6 B 27 28 27.4 23.4 21 7 D 14 20 27.5 21.5 20 8 B 18 25 27.4 23.4 21 9 A 33 26 21.5 23.5 21
# Rebuild the demo frame with 100 random rows; draws are made in column
# order (company index, salary, age).
data = pd.DataFrame(
    {
        "company": [
            company[idx] for idx in np.random.randint(0, len(company), 100)
        ],
        "salary": np.random.randint(5, 50, 100),
        "age": np.random.randint(20, 30, 100),
    }
)
data
company salary age 0 A 37 26 1 C 5 29 2 B 36 22 3 A 43 21 4 B 44 29 ... ... ... ... 95 A 11 29 96 D 48 25 97 B 27 27 98 B 37 26 99 A 37 25
100 rows × 3 columns
# Work on the first ten rows while trying things out.
test = data.head(n=10)
test
company salary age 0 A 37 26 1 C 5 29 2 B 36 22 3 A 43 21 4 B 44 29 5 D 46 21 6 C 16 27 7 B 28 26 8 D 28 26 9 D 40 23
# Oldest row per company.  idxmax() yields index *labels*, so select with
# .loc rather than positional .iloc (the two only coincide for a default
# 0..n-1 index).  The grouped idxmax replaces the per-group lambda.
test.loc[test.groupby("company")["age"].idxmax()]
company salary age 0 A 37 26 4 B 44 29 1 C 5 29 8 D 28 26
# Oldest row per company on the full frame.  idxmax() yields index *labels*,
# so select with .loc rather than positional .iloc (the two only coincide
# for a default 0..n-1 index).
data.loc[data.groupby("company")["age"].idxmax()]
company salary age 82 A 40 29 4 B 44 29 1 C 5 29 48 D 25 29
# Index label of the max-age row within each company.
data.groupby(by="company").apply(lambda grp: grp["age"].idxmax())
company
A 82
B 4
C 1
D 48
dtype: int64
def get_max(x):
    """Return the index label of the row in *x* with the largest ``age``."""
    ages = x["age"]
    return ages.idxmax()
# Same selection using the named helper.  get_max returns index *labels*
# (it calls idxmax), so the lookup must be label-based .loc, not positional
# .iloc.  The helper is passed directly; wrapping it in a lambda adds nothing.
data.loc[data.groupby("company").apply(get_max)]
company salary age 82 A 40 29 4 B 44 29 1 C 5 29 48 D 25 29
# Rank salaries within each company, highest salary first.  With
# method="min", tied salaries all receive the lowest rank of the tie.
# rank() produces floats, hence the cast to int64.
data["rank"] = (
    data.groupby(by="company")["salary"]
    .rank(method="min", ascending=False)
    .astype(np.int64)
)
data
company salary age rank 0 A 37 26 7 1 C 5 29 21 2 B 36 22 5 3 A 43 21 2 4 B 44 29 1 ... ... ... ... ... 95 A 11 29 22 96 D 48 25 1 97 B 27 27 13 98 B 37 26 4 99 A 37 25 7
100 rows × 4 columns
# Rows ranked first within their company.
data[data["rank"].eq(1)]
company salary age rank 4 B 44 29 1 10 C 49 27 1 12 A 47 25 1 34 C 49 27 1 96 D 48 25 1
# Convert salary to text so the .str accessor is available, then show the
# first ten rows whose salary contains the digit "2".
data["salary"] = data["salary"].astype("str")
data[data["salary"].str.contains("2")].head(10)
company salary age 3 D 20 24 5 D 27 23 6 D 29 25 8 D 22 23 9 A 24 20 10 D 42 22 12 C 25 23 16 D 27 21 22 D 22 28 27 A 24 20
# Restore salary to an integer dtype and print the frame summary.
data["salary"] = data["salary"].astype("int")
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 company 100 non-null object
1 salary 100 non-null int32
2 age 100 non-null int32
dtypes: int32(2), object(1)
memory usage: 1.7+ KB
def convet_numter(x: str) -> str:
    """Strip thousands separators and dollar signs from a string.

    Args:
        x: Text such as ``"$1,234"``.

    Returns:
        ``x`` with every ``,`` and ``$`` character removed.
    """
    # The body previously referenced an undefined name `val`, which raised
    # NameError on every call; the value to clean is the parameter `x`.
    return x.replace(",", "").replace("$", "")
# Order by salary, breaking ties by age, both descending.
data.sort_values(by=["salary", "age"], ascending=False)
company salary age 82 D 49 28 67 A 49 24 69 C 48 29 49 B 48 25 34 D 46 25 ... ... ... ... 60 B 7 23 68 C 5 27 38 A 5 22 65 D 5 21 90 C 5 20
100 rows × 3 columns
def sort_va(x):
    """Return *x* ordered by its ``salary`` column, largest first."""
    descending = x.sort_values(by="salary", ascending=False)
    return descending
# Sort each company's rows by salary (descending) via sort_va, then discard
# the index produced by the grouped apply in favour of a fresh 0..n-1 one.
data.groupby(by="company").apply(sort_va).reset_index(drop=True)
company salary age 0 A 49 24 1 A 43 21 2 A 41 20 3 A 40 26 4 A 38 20 ... ... ... ... 95 D 14 22 96 D 11 22 97 D 11 27 98 D 11 26 99 D 5 21
100 rows × 3 columns
# Sort each company's rows by age (descending), then renumber the result.
(
    data.groupby(by="company")
    .apply(lambda grp: grp.sort_values(by="age", ascending=False))
    .reset_index(drop=True)
)
company salary age 0 A 28 29 1 A 26 29 2 A 24 28 3 A 15 28 4 A 8 28 ... ... ... ... 95 D 23 21 96 D 27 21 97 D 46 21 98 D 37 20 99 D 17 20
100 rows × 3 columns
# Small frame whose second column holds "-"-separated two-part strings.
df = pd.DataFrame(
    {
        '姓名': ['张 三', '李 四', '王 五'],
        '所在地': ['北京-东城区', '上海-黄浦区', '广州-白云区'],
    }
)
df
姓名 所在地 0 张 三 北京-东城区 1 李 四 上海-黄浦区 2 王 五 广州-白云区
# Split each "-"-separated string into separate columns (expand=True returns
# a DataFrame whose columns are labelled 0, 1, ...).  Item access is used for
# the column because its name is not a plain ASCII identifier.
temp = df["所在地"].str.split("-", expand=True)
# Display the split only after computing it; evaluating `temp` first would
# show a value left over from an earlier step (or fail if none exists).
temp
# Place the split columns alongside the original frame.
pd.concat([df, temp], axis=1)
姓名 所在地 0 1 0 张 三 北京-东城区 北京 东城区 1 李 四 上海-黄浦区 上海 黄浦区 2 王 五 广州-白云区 广州 白云区