Cloning into 'Feature-Engineering-Made-Easy'...
remote: Enumerating objects: 63, done.
remote: Total 63 (delta 0), reused 0 (delta 0), pack-reused 63
Unpacking objects: 100% (63/63), done.
Checking out files: 100% (62/62), done.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures drawn below.
plt.style.use('fivethirtyeight')
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
import pandas.util.testing as tm
import zipfile
# Name of the CSV to extract; the repository ships it as "<Dataset>.zip".
Dataset = "GlobalLandTemperaturesByCity.csv"

# Will unzip the files so that you can see them.
archive_path = "/content/Feature-Engineering-Made-Easy/data/" + Dataset + ".zip"
with zipfile.ZipFile(archive_path, "r") as z:
    # Extract every archive member into the current working directory.
    z.extractall(".")
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
# Convert 'Biweekly High Rate' from dollar-prefixed strings to floats.
# The vectorised .str accessor removes the literal '$' (regex=False, because
# '$' is a regex anchor) and passes missing values through as NaN, whereas a
# per-value lambda calling value.replace would raise on them.
salary_ranges['Biweekly High Rate'] = (
    salary_ranges['Biweekly High Rate']
    .str.replace('$', '', regex=False)
    .astype(float)
)

# Store Grade as strings so it is handled as a label, not a number.
salary_ranges['Grade'] = salary_ranges['Grade'].astype(str)

# Preview the cleaned frame.
salary_ranges.head()
SetID
Job Code
Eff Date
Sal End Date
Salary SetID
Sal Plan
Grade
Step
Biweekly High Rate
Biweekly Low Rate
Union Code
Extended Step
Pay Type
0
COMMN
0109
07/01/2009 12:00:00 AM
06/30/2010 12:00:00 AM
COMMN
SFM
00000
1
0.0
$0.00
330
0
C
1
COMMN
0110
07/01/2009 12:00:00 AM
06/30/2010 12:00:00 AM
COMMN
SFM
00000
1
15.0
$15.00
323
0
D
2
COMMN
0111
07/01/2009 12:00:00 AM
06/30/2010 12:00:00 AM
COMMN
SFM
00000
1
25.0
$25.00
323
0
D
3
COMMN
0112
07/01/2009 12:00:00 AM
06/30/2010 12:00:00 AM
COMMN
SFM
00000
1
50.0
$50.00
323
0
D
4
COMMN
0114
07/01/2009 12:00:00 AM
06/30/2010 12:00:00 AM
COMMN
SFM
00000
1
100.0
$100.00
323
0
M
# Bar chart of the 20 grades with the highest mean 'Biweekly High Rate'.
(
    salary_ranges
    .groupby('Grade')['Biweekly High Rate']
    .mean()
    .sort_values(ascending=False)
    .head(20)
    .plot.bar()
)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd020094400>
# Preview the first rows of salary_ranges (head() defaults to five rows).
salary_ranges.head()
SetID
Job Code
Eff Date
Sal End Date
Salary SetID
Sal Plan
Grade
Step
Biweekly High Rate
Biweekly Low Rate
Union Code
Extended Step
Pay Type
0
COMMN
0109
07/01/2009 12:00:00 AM
06/30/2010 12:00:00 AM
COMMN
SFM
00000
1
0.0
$0.00
330
0
C
1
COMMN
0110
07/01/2009 12:00:00 AM
06/30/2010 12:00:00 AM
COMMN
SFM
00000
1
15.0
$15.00
323
0
D
2
COMMN
0111
07/01/2009 12:00:00 AM
06/30/2010 12:00:00 AM
COMMN
SFM
00000
1
25.0
$25.00
323
0
D
3
COMMN
0112
07/01/2009 12:00:00 AM
06/30/2010 12:00:00 AM
COMMN
SFM
00000
1
50.0
$50.00
323
0
D
4
COMMN
0114
07/01/2009 12:00:00 AM
06/30/2010 12:00:00 AM
COMMN
SFM
00000
1
100.0
$100.00
323
0
M
# Wide figure showing the 20 grades with the highest mean 'Biweekly High Rate'.
fig, ax = plt.subplots(figsize=(15, 5))
(
    salary_ranges
    .groupby('Grade')[['Biweekly High Rate']]
    .mean()
    .sort_values('Biweekly High Rate', ascending=False)
    .head(20)
    .plot(kind='bar', stacked=False, ax=ax, color='darkorange')
)
ax.set_title('Top 20 Grade by Mean Biweekly High Rate')
Text(0.5, 1.0, 'Top 20 Grade by Mean Biweekly High Rate')
# Wide figure showing the 20 grades with the lowest mean 'Biweekly High Rate'
# (the tail of the descending ranking).
fig, ax = plt.subplots(figsize=(15, 5))
(
    salary_ranges
    .groupby('Grade')[['Biweekly High Rate']]
    .mean()
    .sort_values('Biweekly High Rate', ascending=False)
    .tail(20)
    .plot(kind='bar', stacked=False, ax=ax, color='darkorange')
)
ax.set_title('Bottom 20 Grade by Mean Biweekly High Rate')
Text(0.5, 1.0, 'Bottom 20 Grade by Mean Biweekly High Rate')
# Mean 'Biweekly High Rate' per grade, ordered from highest to lowest.
sorted_df = (
    salary_ranges
    .groupby('Grade')[['Biweekly High Rate']]
    .mean()
    .sort_values(by='Biweekly High Rate', ascending=False)
)
sorted_df.head()
Biweekly High Rate
Grade
9186F
12120.77
0390F
11255.00
0140H
10843.00
0140F
10630.00
0395F
10376.00
# First row of sorted_df by position; because sorted_df is sorted in
# descending order, this is the grade with the highest mean rate.
sorted_df.iloc[0]
Biweekly High Rate 12120.77
Name: 9186F, dtype: float64