import pandas as pd
import numpy as np
df = pd.read_csv('Data/Game_of_Thrones_Script.csv')
df.head()
  Release Date    Season    Episode     Episode Title          Name                                           Sentence
0    2011/4/17  Season 1  Episode 1  Winter is Coming  waymar royce  What do you expect? They're savages. One lot s...
1    2011/4/17  Season 1  Episode 1  Winter is Coming          will  I've never seen wildlings do a thing like this...
2    2011/4/17  Season 1  Episode 1  Winter is Coming  waymar royce                            How close did you get?
3    2011/4/17  Season 1  Episode 1  Winter is Coming          will                           Close as any man would.
4    2011/4/17  Season 1  Episode 1  Winter is Coming         gared                  We should head back to the wall.
df['Name'].nunique()
564
df['Name'].value_counts().index[0]
'tyrion lannister'
df_words = df.assign(Words=df['Sentence'].apply(lambda x: len(x.split()))).sort_values(by='Name')
df_words.head()
      Release Date    Season     Episode             Episode Title            Name                                           Sentence  Words
276      2011/4/17  Season 1   Episode 1          Winter is Coming         a voice                       It's Maester Luwin, my lord.      5
3012     2011/6/19  Season 1  Episode 10            Fire and Blood  addam marbrand                ls it true about Stannis and Renly?      7
3017     2011/6/19  Season 1  Episode 10            Fire and Blood  addam marbrand                                    Kevan Lannister      2
13610     2014/6/8  Season 4   Episode 9  The Watchers on the Wall           aemon  And what is it that couldn't wait until mornin...     10
13614     2014/6/8  Season 4   Episode 9  The Watchers on the Wall           aemon  Oh, no need. I know my way around this library...     48
L_count = []
N_words = list(zip(df_words['Name'], df_words['Words']))
for i in N_words:
    if i == N_words[0]:
        # first row: start the running count
        L_count.append(i[1])
        last = i[0]
    else:
        # same speaker as the previous row: extend the running total, otherwise restart
        L_count.append(L_count[-1] + i[1] if i[0] == last else i[1])
        last = i[0]
df_words['Count'] = L_count
df_words['Name'][df_words['Count'].idxmax()]
'tyrion lannister'
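The running-count loop gives the right answer, but the same result falls out of a plain grouped sum; a minimal sketch over the df_words frame built above:

# total words per character; idxmax() returns the name with the largest sum
df_words.groupby('Name')['Words'].sum().idxmax()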
df = pd.read_csv('data/Kobe_data.csv', index_col='shot_id')
df.head()
                action_type combined_shot_type  game_event_id   game_id      lat  loc_x  loc_y       lon  minutes_remaining  period  ...  shot_made_flag       shot_type         shot_zone_area  shot_zone_basic   shot_zone_range     team_id           team_name   game_date    matchup opponent
shot_id
1                 Jump Shot          Jump Shot             10  20000012  33.9723    167     72 -118.1028                 10       1  ...             NaN  2PT Field Goal          Right Side(R)        Mid-Range         16-24 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
2                 Jump Shot          Jump Shot             12  20000012  34.0443   -157      0 -118.4268                 10       1  ...             0.0  2PT Field Goal           Left Side(L)        Mid-Range          8-16 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
3                 Jump Shot          Jump Shot             35  20000012  33.9093   -101    135 -118.3708                  7       1  ...             1.0  2PT Field Goal   Left Side Center(LC)        Mid-Range         16-24 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
4                 Jump Shot          Jump Shot             43  20000012  33.8693    138    175 -118.1318                  6       1  ...             0.0  2PT Field Goal  Right Side Center(RC)        Mid-Range         16-24 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
5         Driving Dunk Shot               Dunk            155  20000012  34.0443      0      0 -118.2698                  6       2  ...             1.0  2PT Field Goal              Center(C)  Restricted Area  Less Than 8 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR

[5 rows x 24 columns]
pd.Series(list(zip(df['action_type'], df['combined_shot_type']))).value_counts().index[0]
('Jump Shot', 'Jump Shot')
pd.Series(list(list(zip(*(pd.Series(list(zip(df['game_id'], df['opponent'])))
                          .unique()).tolist()))[1])).value_counts().index[0]
'SAS'
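The nested zip/unzip can be sidestepped: deduplicating on game_id leaves one row per game, and the opponents can then be counted directly; a minimal sketch, assuming each game_id maps to a single opponent:

# one row per game, then the most frequent opponent across games
df.drop_duplicates(subset='game_id')['opponent'].value_counts().index[0]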
df = pd.read_csv('data/UFO.csv')
df.rename(columns={'duration (seconds)': 'duration'}, inplace=True)
df['duration'] = df['duration'].astype('float')  # assign back, otherwise the cast is discarded
df.head()
           datetime     shape  duration   latitude   longitude
0  10/10/1949 20:30  cylinder    2700.0  29.883056  -97.941111
1  10/10/1949 21:00     light    7200.0  29.384210  -98.581082
2  10/10/1955 17:00    circle      20.0  53.200000   -2.916667
3  10/10/1956 21:00    circle      20.0  28.978333  -96.645833
4  10/10/1960 20:00     light     900.0  21.418056 -157.803611
df.query('duration > 60')['shape'].value_counts().index[0]
'light'
bins_long = np.linspace(-180, 180, 13).tolist()  # twelve 30-degree longitude bands
bins_la = np.linspace(-90, 90, 11).tolist()      # ten 18-degree latitude bands
cuts_long = pd.cut(df['longitude'], bins=bins_long)
df['cuts_long'] = cuts_long
cuts_la = pd.cut(df['latitude'], bins=bins_la)
df['cuts_la'] = cuts_la
df.head()
           datetime     shape  duration   latitude   longitude         cuts_long       cuts_la
0  10/10/1949 20:30  cylinder    2700.0  29.883056  -97.941111   (-120.0, -90.0]  (18.0, 36.0]
1  10/10/1949 21:00     light    7200.0  29.384210  -98.581082   (-120.0, -90.0]  (18.0, 36.0]
2  10/10/1955 17:00    circle      20.0  53.200000   -2.916667      (-30.0, 0.0]  (36.0, 54.0]
3  10/10/1956 21:00    circle      20.0  28.978333  -96.645833   (-120.0, -90.0]  (18.0, 36.0]
4  10/10/1960 20:00     light     900.0  21.418056 -157.803611  (-180.0, -150.0]  (18.0, 36.0]
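With both interval columns in place, the most-sighted cell of the longitude/latitude grid can be read off a two-key group; a minimal sketch:

# sightings per (latitude band, longitude band) cell; idxmax() picks the busiest
df.groupby(['cuts_la', 'cuts_long']).size().idxmax()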
df = pd.read_csv('data/Pokemon.csv')
df.head()
   #                   Name Type 1  Type 2  Total  HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  Generation  Legendary
0  1              Bulbasaur  Grass  Poison    318  45      49       49       65       65     45           1      False
1  2                Ivysaur  Grass  Poison    405  60      62       63       80       80     60           1      False
2  3               Venusaur  Grass  Poison    525  80      82       83      100      100     80           1      False
3  3  VenusaurMega Venusaur  Grass  Poison    625  80     100      123      122      120     80           1      False
4  4             Charmander   Fire     NaN    309  39      52       43       60       50     65           1      False
df['Type 2'].count() / df.shape[0]
0.5175
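The same proportion can be computed from a boolean mask, since the mean of a mask is the fraction of True values; a minimal sketch:

# fraction of rows with a non-missing second type
df['Type 2'].notna().mean()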
df.query('Total >= 580')['Legendary'].value_counts(normalize=True)
True 0.575221
False 0.424779
Name: Legendary, dtype: float64
df[df['Type 1'] == 'Fighting'].sort_values(by='Attack', ascending=False).iloc[:3]
       #                 Name    Type 1 Type 2  Total   HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  Generation  Legendary
498  448  LucarioMega Lucario  Fighting  Steel    625   70     145       88      140       70    112           4      False
594  534           Conkeldurr  Fighting    NaN    505  105     140       95       55       65     45           5      False
74    68              Machamp  Fighting    NaN    505   90     130       80       65       85     55           1      False
# stat range = max minus min of the six base stats (columns 5 through 10)
df['range'] = df.iloc[:, 5:11].max(axis=1) - df.iloc[:, 5:11].min(axis=1)
attribute = df[['Type 1', 'range']].set_index('Type 1')
max_range = 0
result = ''
for i in attribute.index.unique():
    temp = attribute.loc[i, :].mean()  # mean range for this primary type
    if temp.values[0] > max_range:
        max_range = temp.values[0]
        result = i
result
'Steel'
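The explicit loop is equivalent to a grouped mean followed by idxmax; a minimal sketch over the range column built above:

# mean stat range per primary type, then the type with the largest mean
df.groupby('Type 1')['range'].mean().idxmax()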
df.query('Legendary == True')['Type 1'].value_counts(normalize=True).index[0]
'Psychic'
attribute = df.query('Legendary == True')[['Type 1', 'Total']].set_index('Type 1')
max_value = 0
result = ''
for i in attribute.index.unique():
    temp = float(attribute.loc[i, :].mean())  # mean Total for this primary type
    if temp > max_value:
        max_value = temp
        result = i
result
'Normal'
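Again the loop collapses to one grouped expression, this time restricted to Legendary rows; a minimal sketch:

# mean Total per primary type among Legendary Pokemon
df.query('Legendary == True').groupby('Type 1')['Total'].mean().idxmax()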
df = pd.read_csv('data/Diamonds.csv')
df.head()
   carat color  depth  price
0   0.23     E   61.5    326
1   0.21     E   59.8    326
2   0.23     E   56.9    327
3   0.29     I   62.4    334
4   0.31     J   63.3    335
df_r = df.query('carat > 1')['price']
df_r.max() - df_r.min()
17561
bins = df['depth'].quantile(np.linspace(0, 1, 6)).tolist()
cuts = pd.cut(df['depth'], bins=bins)
df['cuts'] = cuts
df.head()
   carat color  depth  price          cuts
0   0.23     E   61.5    326  (60.8, 61.6]
1   0.21     E   59.8    326  (43.0, 60.8]
2   0.23     E   56.9    327  (43.0, 60.8]
3   0.29     I   62.4    334  (62.1, 62.7]
4   0.31     J   63.3    335  (62.7, 79.0]
color_result = df.groupby('cuts')['color'].describe()
color_result
              count unique top  freq
cuts
(43.0, 60.8]  11294      7   E  2259
(60.8, 61.6]  11831      7   G  2593
(61.6, 62.1]  10403      7   G  2247
(62.1, 62.7]  10137      7   G  2193
(62.7, 79.0]  10273      7   G  2000
df['均重价格'] = df['price'] / df['carat']  # 均重价格 = price per carat
color_result['top'] == [i[1] for i in df.groupby(['cuts', 'color'])['均重价格']
                        .mean().groupby(['cuts']).idxmax().values]
cuts
(43.0, 60.8] False
(60.8, 61.6] False
(61.6, 62.1] False
(62.1, 62.7] True
(62.7, 79.0] True
Name: top, dtype: bool
df = df.drop(columns='均重价格')
cuts = pd.cut(df['carat'], bins=[0, 0.5, 1, 1.5, 2, np.inf])
df['cuts'] = cuts
df.head()
   carat color  depth  price        cuts
0   0.23     E   61.5    326  (0.0, 0.5]
1   0.21     E   59.8    326  (0.0, 0.5]
2   0.23     E   56.9    327  (0.0, 0.5]
3   0.29     I   62.4    334  (0.0, 0.5]
4   0.31     J   63.3    335  (0.0, 0.5]
def f(nums):
    # length of the longest strictly increasing run of consecutive elements
    if not nums:
        return 0
    res = 1
    cur_len = 1
    for i in range(1, len(nums)):
        if nums[i-1] < nums[i]:
            cur_len += 1
            res = max(cur_len, res)
        else:
            cur_len = 1
    return res

for name, group in df.groupby('cuts'):
    group = group.sort_values(by='depth')
    s = group['price']
    print(name, f(s.tolist()))
(0.0, 0.5] 8
(0.5, 1.0] 8
(1.0, 1.5] 7
(1.5, 2.0] 11
(2.0, inf] 7
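The helper f can also be expressed in pandas alone: every position where the price fails to increase starts a new run, so a cumulative sum of that break mask labels the runs, and the longest run is the largest label group. A minimal sketch for a price Series s already sorted by depth (for instance the s inside the loop above):

# True marks the start of a new run (including the first element, whose diff is NaN)
runs = (~(s.diff() > 0)).cumsum()
s.groupby(runs).size().max()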
for name, group in df[['carat', 'price', 'color']].groupby('color'):
    # design matrix: a row of ones for the intercept plus the carat values
    L1 = np.array([np.ones(group.shape[0]), group['carat']]).reshape(2, group.shape[0])
    L2 = group['price']
    # normal equations: beta = (X X^T)^{-1} X y
    result = (np.linalg.inv(L1.dot(L1.T)).dot(L1)).dot(L2).reshape(2, 1)
    print('For color %s, the intercept is %f and the slope is %f' % (name, result[0], result[1]))
For color D, the intercept is -2361.017152 and the slope is 8408.353126
For color E, the intercept is -2381.049600 and the slope is 8296.212783
For color F, the intercept is -2665.806191 and the slope is 8676.658344
For color G, the intercept is -2575.527643 and the slope is 8525.345779
For color H, the intercept is -2460.418046 and the slope is 7619.098320
For color I, the intercept is -2878.150356 and the slope is 7761.041169
For color J, the intercept is -2920.603337 and the slope is 7094.192092
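The hand-rolled normal equations agree with np.polyfit, which fits a degree-1 polynomial and returns its coefficients highest power first; a minimal sketch for one color group (for instance the group variable left by the loop above):

# slope comes first, intercept second, for a straight-line fit
slope, intercept = np.polyfit(group['carat'], group['price'], 1)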
df = pd.read_csv('data/Drugs.csv')
df.head()
   YYYY State           COUNTY  SubstanceName  DrugReports
0  2010    VA         ACCOMACK   Propoxyphene            1
1  2010    OH            ADAMS       Morphine            9
2  2010    PA            ADAMS      Methadone            2
3  2010    VA  ALEXANDRIA CITY         Heroin            5
4  2010    PA        ALLEGHENY  Hydromorphone            5
idx = pd.IndexSlice
for i in range(2010, 2018):
    # county with the most reports in year i; idxmax()[0] is a (COUNTY, YYYY) tuple
    county = df.groupby(['COUNTY', 'YYYY']).sum().loc[idx[:, i], :].idxmax()[0][0]
    state = df.query('COUNTY == "%s"' % county)['State'].iloc[0]
    state_true = df.groupby(['State', 'YYYY']).sum().loc[idx[:, i], :].idxmax()[0][0]
    if state == state_true:
        print('In %d, %s county had the most reports, and its state %s also had the most' % (i, county, state))
    else:
        print('In %d, %s county had the most reports, but its state %s did not; %s had the most' % (i, county, state, state_true))
In 2010, PHILADELPHIA county had the most reports, and its state PA also had the most
In 2011, PHILADELPHIA county had the most reports, but its state PA did not; OH had the most
In 2012, PHILADELPHIA county had the most reports, but its state PA did not; OH had the most
In 2013, PHILADELPHIA county had the most reports, but its state PA did not; OH had the most
In 2014, PHILADELPHIA county had the most reports, but its state PA did not; OH had the most
In 2015, PHILADELPHIA county had the most reports, but its state PA did not; OH had the most
In 2016, HAMILTON county had the most reports, and its state OH also had the most
In 2017, HAMILTON county had the most reports, and its state OH also had the most
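One caveat: groupby(...).sum() over a frame that still contains string columns relies on older pandas behavior; recent versions want the aggregation restricted to the numeric column. A minimal sketch of the county lookup for one year i, assuming pandas 1.5 or later:

# sum only the numeric column, slice out year i, and take the top county
county = df.groupby(['COUNTY', 'YYYY'])['DrugReports'].sum().xs(i, level='YYYY').idxmax()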
df_b = df[(df['YYYY'].isin([2014, 2015])) & (df['SubstanceName'] == 'Heroin')]
df_add = df_b.groupby(['YYYY', 'State']).sum()
(df_add.loc[2015] - df_add.loc[2014]).idxmax()
DrugReports OH
dtype: object
df_b = df[(df['YYYY'].isin([2014, 2015])) & (df['State'] == 'OH')]
df_add = df_b.groupby(['YYYY', 'SubstanceName']).sum()
display((df_add.loc[2015] - df_add.loc[2014]).idxmax())
display((df_add.loc[2015] / df_add.loc[2014]).idxmax())
DrugReports Heroin
dtype: object
DrugReports Acetyl fentanyl
dtype: object
df = pd.read_csv('data/Drugs.csv', index_col=['State', 'COUNTY']).sort_index()
df.head()
              YYYY  SubstanceName  DrugReports
State COUNTY
KY    ADAIR   2010      Methadone            1
      ADAIR   2010    Hydrocodone            6
      ADAIR   2011      Oxycodone            4
      ADAIR   2011  Buprenorphine            3
      ADAIR   2011       Morphine            2
result = pd.pivot_table(df, index=['State', 'COUNTY', 'SubstanceName'],
                        columns='YYYY',
                        values='DrugReports', fill_value='-').reset_index().rename_axis(columns={'YYYY': ''})
result.head()
  State COUNTY  SubstanceName 2010 2011 2012 2013 2014 2015 2016 2017
0    KY  ADAIR  Buprenorphine    -    3    5    4   27    5    7   10
1    KY  ADAIR        Codeine    -    -    1    -    -    -    -    1
2    KY  ADAIR       Fentanyl    -    -    1    -    -    -    -    -
3    KY  ADAIR         Heroin    -    -    1    2    -    1    -    2
4    KY  ADAIR    Hydrocodone    6    9   10   10    9    7   11    3
result_melted = result.melt(id_vars=result.columns[:3], value_vars=result.columns[-8:],
                            var_name='YYYY', value_name='DrugReports').query('DrugReports != "-"')
result2 = result_melted.sort_values(by=['State', 'COUNTY', 'YYYY',
                                        'SubstanceName']).reset_index().drop(columns='index')
# swap the YYYY and SubstanceName columns to match the original column order
cols = list(result2.columns)
a, b = cols.index('SubstanceName'), cols.index('YYYY')
cols[b], cols[a] = cols[a], cols[b]
result2 = result2[cols].astype({'DrugReports': 'int', 'YYYY': 'int'})
result2.head()
  State COUNTY  YYYY  SubstanceName  DrugReports
0    KY  ADAIR  2010    Hydrocodone            6
1    KY  ADAIR  2010      Methadone            1
2    KY  ADAIR  2011  Buprenorphine            3
3    KY  ADAIR  2011    Hydrocodone            9
4    KY  ADAIR  2011       Morphine            2
df_tidy = df.reset_index().sort_values(by=result2.columns[:4].tolist()).reset_index().drop(columns='index')
df_tidy.head()
  State COUNTY  YYYY  SubstanceName  DrugReports
0    KY  ADAIR  2010    Hydrocodone            6
1    KY  ADAIR  2010      Methadone            1
2    KY  ADAIR  2011  Buprenorphine            3
3    KY  ADAIR  2011    Hydrocodone            9
4    KY  ADAIR  2011       Morphine            2
df_tidy.equals(result2)
True
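equals checks dtypes as well as values; when a round trip like this drifts, pd.testing.assert_frame_equal reports exactly where; a minimal sketch:

# raises with a precise diff if the reshaped frame and the tidy original diverge
pd.testing.assert_frame_equal(df_tidy, result2)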
df = pd.read_csv('data/Earthquake.csv')
# column names are Chinese: 方向 = direction, 日期 = date, 时间 = time,
# 深度 = depth, 烈度 = intensity, 经度 = longitude, 维度 = latitude, 距离 = distance
df = df.sort_values(by=df.columns.tolist()[:3]).sort_index(axis=1).reset_index().drop(columns='index')
df.head()
           方向          日期           时间    深度   烈度    经度    维度   距离
0  south_east  1912.08.09  12:29:00 AM  16.0  6.7  27.2  40.6  4.3
1  south_west  1912.08.10  12:23:00 AM  15.0  6.0  27.1  40.6  2.0
2  south_west  1912.08.10  12:30:00 AM  15.0  5.2  27.1  40.6  2.0
3  south_east  1912.08.11  12:19:04 AM  30.0  4.9  27.2  40.6  4.3
4  south_west  1912.08.11  12:20:00 AM  15.0  4.5  27.1  40.6  2.0
result = pd.pivot_table(df, index=['日期', '时间', '维度', '经度'],
                        columns='方向',
                        values=['烈度', '深度', '距离'], fill_value='-').stack(level=0).rename_axis(index={None: '地震参数'})
result.head(6)
方向                                      east north north_east north_west south south_east south_west west
日期        时间          维度    经度   地震参数
1912.08.09  12:29:00 AM  40.6  27.2  深度     -     -          -          -     -         16          -    -
                                     烈度     -     -          -          -     -        6.7          -    -
                                     距离     -     -          -          -     -        4.3          -    -
1912.08.10  12:23:00 AM  40.6  27.1  深度     -     -          -          -     -          -         15    -
                                     烈度     -     -          -          -     -          -          6    -
                                     距离     -     -          -          -     -          -          2    -
df_result = result.unstack().stack(0)[(~(result.unstack().stack(0) == '-')).any(axis=1)].reset_index()
df_result.columns.name = None
df_result = df_result.sort_index(axis=1).astype({'深度': 'float64', '烈度': 'float64', '距离': 'float64'})
df_result.head()
           方向          日期           时间    深度   烈度    经度    维度   距离
0  south_east  1912.08.09  12:29:00 AM  16.0  6.7  27.2  40.6  4.3
1  south_west  1912.08.10  12:23:00 AM  15.0  6.0  27.1  40.6  2.0
2  south_west  1912.08.10  12:30:00 AM  15.0  5.2  27.1  40.6  2.0
3  south_east  1912.08.11  12:19:04 AM  30.0  4.9  27.2  40.6  4.3
4  south_west  1912.08.11  12:20:00 AM  15.0  4.5  27.1  40.6  2.0
df_result.astype({'深度': 'float64', '烈度': 'float64', '距离': 'float64'}, copy=False).dtypes
方向 object
日期 object
时间 object
深度 float64
烈度 float64
经度 float64
维度 float64
距离 float64
dtype: object
df.equals(df_result)
True