特征工程1

第二章 数据等级总结

import os 

os.listdir()
['.config', 'sample_data']
!git clone https://github.com/**********/Feature-Engineering-Made-Easy.git
Cloning into 'Feature-Engineering-Made-Easy'...
remote: Enumerating objects: 63, done.
remote: Total 63 (delta 0), reused 0 (delta 0), pack-reused 63
Unpacking objects: 100% (63/63), done.
Checking out files: 100% (62/62), done.
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline 
plt.style.use('fivethirtyeight')
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
customer = pd.read_csv('/content/Feature-Engineering-Made-Easy/data/2013_SFO_Customer_survey.csv')
customer.shape
(3535, 95)
customer.head()
RESPNUMCCGIDRUNINTDATEGATESTRATAPEAKMETHODAIRLINEFLIGHTDESTDESTGEODESTMARKARRTIMEDEPTIMEQ2PURP1Q2PURP2Q2PURP3Q2PURP4Q2PURP5Q2PURP6Q3GETTO1Q3GETTO2Q3GETTO3Q3GETTO4Q3GETTO5Q3GETTO6Q3PARKQ4BAGSQ4BUYQ4FOODQ4WIFIQ5FLYPERYRQ6TENURESAQQ7A_ARTQ7B_FOODQ7C_SHOPSQ7D_SIGNSQ7E_WALK...Q9C_CLNRENTQ9D_CLNFOODQ9E_CLNBATHQ9F_CLNWHOLEQ9COM1Q9COM2Q9COM3Q10SAFEQ10COM1Q10COM2Q10COM3Q11A_USEWEBQ11B_USESFOAPPQ11C_USEOTHAPPQ11D_USESOCMEDQ11E_USEWIFIQ12COM1Q12COM2Q12COM3Q13_WHEREDEPARTQ13_RATEGETTOQ14A_FINDQ14B_SECURITYQ15_PROBLEMSQ15COM1Q15COM2Q15COM3Q16_REGIONQ17_CITYQ17_ZIPQ17_COUNTRYHOMEQ18_AGEQ19_SEXQ20_INCOMEQ21_HIFLYERQ22A_USESJCQ22B_USEOAKLANGWEIGHT
011121521211121143749118:34 AM9:25 AM18.0NaNNaNNaNNaN210.0NaNNaNNaNNaNNaN212262.0134333...3344NaNNaNNaN51.0NaNNaN22222NaNNaNNaN53332NaNNaNNaN1SAN FRANCISCO94131.0US121122110.553675
122121521211121143749118:00 AM9:25 AM18.0NaNNaNNaNNaN210.0NaNNaNNaNNaNNaN222264.0144444...6444NaNNaNNaN51.0NaNNaN22223NaNNaNNaN23552NaNNaNNaN1CONCORD94521.0US561032110.553675
233121521211121143749117:00 AM9:25 AM18.0NaNNaNNaNNaN210.0NaNNaNNaNNaNNaN222244.0134424...3333NaNNaNNaN31.0NaNNaN22222NaNNaNNaN53332NaNNaNNaN1SAN FRANCISCO94134.0US142232210.553675
344121521211121143749117:30 AM9:25 AM18.0NaNNaNNaNNaN110.0NaNNaNNaNNaN1.0121234.0233344...5555NaNNaNNaN5NaNNaNNaN22222NaNNaNNaN53552NaNNaNNaN1NaNNaNUS9041222210.553675
455121521211121143749116:30 AM9:25 AM18.0NaNNaNNaNNaN810.0NaNNaNNaNNaNNaN211123.0233235...555587.0NaNNaN55.0NaNNaN222211.05.0NaN35432NaNNaNNaN3HUNTINGTON BEACH92646.0US1031310110.553675

5 rows × 95 columns

定序等级

art_rating = customer['Q7A_ART']
art_rating.describe()
count    3535.000000
mean        4.300707
std         1.341445
min         0.000000
25%         3.000000
50%         4.000000
75%         5.000000
max         6.000000
Name: Q7A_ART, dtype: float64
# Keep only ratings in the closed range [1, 5], then store them as strings.
valid_rating = art_rating.between(1, 5)
art_ratings = art_rating[valid_rating].astype(str)
art_ratings.describe()
count     2656
unique       5
top          4
freq      1066
Name: Q7A_ART, dtype: object
art_ratings.value_counts().plot(kind = 'bar');

在这里插入图片描述

art_ratings.value_counts().plot(kind = 'box')

在这里插入图片描述

定距等级

import zipfile

Dataset = "GlobalLandTemperaturesByCity.csv"
data_dir = "/content/Feature-Engineering-Made-Easy/data/"
extract_dir = "/content"

# Unzip into an explicit directory so the read below does not depend on the
# notebook's current working directory.
with zipfile.ZipFile(data_dir + Dataset + ".zip", "r") as z:
    z.extractall(extract_dir)

# Read the extracted CSV from the same directory it was unpacked into.
climate = pd.read_csv(extract_dir + "/" + Dataset)
climate.head()
dtAverageTemperatureAverageTemperatureUncertaintyCityCountryLatitudeLongitude
01743-11-016.0681.737ÅrhusDenmark57.05N10.33E
11743-12-01NaNNaNÅrhusDenmark57.05N10.33E
21744-01-01NaNNaNÅrhusDenmark57.05N10.33E
31744-02-01NaNNaNÅrhusDenmark57.05N10.33E
41744-03-01NaNNaNÅrhusDenmark57.05N10.33E
climate.dropna(axis = 0,inplace = True)
climate.shape
(8235082, 7)
climate.isnull().sum()
dt                               0
AverageTemperature               0
AverageTemperatureUncertainty    0
City                             0
Country                          0
Latitude                         0
Longitude                        0
dtype: int64
climate.AverageTemperature.plot(kind='hist');

在这里插入图片描述

climate.dt.head()
0    1743-11-01
5    1744-04-01
6    1744-05-01
7    1744-06-01
8    1744-07-01
Name: dt, dtype: object
# Parse the date strings in the 'dt' column into datetime values.
climate['dt'] = pd.to_datetime(climate['dt'])
climate['dt'].head()
0   1743-11-01
5   1744-04-01
6   1744-05-01
7   1744-06-01
8   1744-07-01
Name: dt, dtype: datetime64[ns]
# Extract the calendar year with the vectorized datetime accessor instead of
# calling a Python lambda once per row.
climate['year'] = climate['dt'].dt.year
climate.head()
dtAverageTemperatureAverageTemperatureUncertaintyCityCountryLatitudeLongitudeyear
01743-11-016.0681.737ÅrhusDenmark57.05N10.33E1743
51744-04-015.7883.624ÅrhusDenmark57.05N10.33E1744
61744-05-0110.6441.283ÅrhusDenmark57.05N10.33E1744
71744-06-0114.0511.347ÅrhusDenmark57.05N10.33E1744
81744-07-0116.0821.396ÅrhusDenmark57.05N10.33E1744
# ĺŞçœ‹çžŽĺ›˝
climate_sub_us = climate.loc[climate['Country']== 'United States']
climate_sub_us.head()
dtAverageTemperatureAverageTemperatureUncertaintyCityCountryLatitudeLongitudeyear
475551820-01-012.1013.217AbileneUnited States32.95N100.53W1820
475561820-02-016.9262.853AbileneUnited States32.95N100.53W1820
475571820-03-0110.7672.395AbileneUnited States32.95N100.53W1820
475581820-04-0117.9892.202AbileneUnited States32.95N100.53W1820
475591820-05-0121.8092.036AbileneUnited States32.95N100.53W1820
# Derive the century from the year (e.g. 1820 -> 19). Floor division is
# vectorized, and assign() returns a new DataFrame, so nothing is written
# into a filtered subset and no SettingWithCopyWarning is raised.
climate_sub_us = climate_sub_us.assign(Century=climate_sub_us['year'] // 100 + 1)

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
# Draw one 20-bin histogram of average temperature per century; the x and y
# axes are shared across the panels.
climate_sub_us['AverageTemperature'].hist(by=climate_sub_us['Century'],
 sharex=True, sharey=True, 
 figsize=(10, 10),
 bins=20);

在这里插入图片描述

climate_sub_us.groupby('Century')['AverageTemperature'].mean().plot(kind = 'line');

在这里插入图片描述

century_changes = climate_sub_us.groupby('Century')['AverageTemperature'].mean()
century_changes
Century
18    12.073243
19    13.662870
20    14.386622
21    15.197692
Name: AverageTemperature, dtype: float64
# Plot two columns at the interval level: year against average temperature.
fig, ax = plt.subplots(figsize=(12, 5))
x, y = climate_sub_us['year'], climate_sub_us['AverageTemperature']
ax.scatter(x, y)
plt.show()

在这里插入图片描述

# Select the temperature column before aggregating: averaging every column
# raises on the text columns in pandas >= 2.0 and is wasted work otherwise.
climate_sub_us.groupby('year')['AverageTemperature'].mean().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fd022540c50>

在这里插入图片描述

# Smooth the yearly curve with a rolling mean over a 10-row window.
# The temperature column is selected before aggregating so the mean is not
# attempted on the text columns (which raises in pandas >= 2.0).
climate_sub_us.groupby('year')['AverageTemperature'].mean().rolling(10).mean().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fd022121fd0>

在这里插入图片描述

定比等级

salary_ranges = pd.read_csv('/content/Feature-Engineering-Made-Easy/data/Salary_Ranges_by_Job_Classification.csv')

# Strip the dollar sign with the vectorized string accessor (literal match,
# not a regex) and convert to float. Unlike a per-value lambda calling
# str.replace, this passes missing values through instead of raising.
salary_ranges['Biweekly High Rate'] = (
    salary_ranges['Biweekly High Rate']
    .str.replace('$', '', regex=False)
    .astype(float)
)
# Store Grade as strings.
salary_ranges['Grade'] = salary_ranges['Grade'].astype(str)
salary_ranges.head()
SetIDJob CodeEff DateSal End DateSalary SetIDSal PlanGradeStepBiweekly High RateBiweekly Low RateUnion CodeExtended StepPay Type
0COMMN010907/01/2009 12:00:00 AM06/30/2010 12:00:00 AMCOMMNSFM0000010.0$0.003300C
1COMMN011007/01/2009 12:00:00 AM06/30/2010 12:00:00 AMCOMMNSFM00000115.0$15.003230D
2COMMN011107/01/2009 12:00:00 AM06/30/2010 12:00:00 AMCOMMNSFM00000125.0$25.003230D
3COMMN011207/01/2009 12:00:00 AM06/30/2010 12:00:00 AMCOMMNSFM00000150.0$50.003230D
4COMMN011407/01/2009 12:00:00 AM06/30/2010 12:00:00 AMCOMMNSFM000001100.0$100.003230M
# Mean biweekly high rate per grade, sorted descending; chart the first 20.
(
    salary_ranges.groupby('Grade')['Biweekly High Rate']
    .mean()
    .sort_values(ascending=False)
    .head(20)
    .plot(kind='bar')
)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd020094400>

在这里插入图片描述

salary_ranges.head()
SetIDJob CodeEff DateSal End DateSalary SetIDSal PlanGradeStepBiweekly High RateBiweekly Low RateUnion CodeExtended StepPay Type
0COMMN010907/01/2009 12:00:00 AM06/30/2010 12:00:00 AMCOMMNSFM0000010.0$0.003300C
1COMMN011007/01/2009 12:00:00 AM06/30/2010 12:00:00 AMCOMMNSFM00000115.0$15.003230D
2COMMN011107/01/2009 12:00:00 AM06/30/2010 12:00:00 AMCOMMNSFM00000125.0$25.003230D
3COMMN011207/01/2009 12:00:00 AM06/30/2010 12:00:00 AMCOMMNSFM00000150.0$50.003230D
4COMMN011407/01/2009 12:00:00 AM06/30/2010 12:00:00 AMCOMMNSFM000001100.0$100.003230M
# Bar chart of the 20 grades with the highest mean biweekly high rate.
fig, ax = plt.subplots(figsize=(15, 5))
(
    salary_ranges.groupby('Grade')[['Biweekly High Rate']]
    .mean()
    .sort_values('Biweekly High Rate', ascending=False)
    .head(20)
    .plot.bar(stacked=False, ax=ax, color='darkorange')
)
ax.set_title('Top 20 Grade by Mean Biweekly High Rate')
Text(0.5, 1.0, 'Top 20 Grade by Mean Biweekly High Rate')

在这里插入图片描述


# Bar chart of the 20 grades with the lowest mean biweekly high rate
# (the tail of the descending sort).
fig, ax = plt.subplots(figsize=(15, 5))
(
    salary_ranges.groupby('Grade')[['Biweekly High Rate']]
    .mean()
    .sort_values('Biweekly High Rate', ascending=False)
    .tail(20)
    .plot.bar(stacked=False, ax=ax, color='darkorange')
)
ax.set_title('Bottom 20 Grade by Mean Biweekly High Rate')
Text(0.5, 1.0, 'Bottom 20 Grade by Mean Biweekly High Rate')

在这里插入图片描述

# Mean biweekly high rate per grade as a one-column DataFrame, highest first.
sorted_df = (
    salary_ranges.groupby('Grade')[['Biweekly High Rate']]
    .mean()
    .sort_values('Biweekly High Rate', ascending=False)
)
sorted_df.head()
Biweekly High Rate
Grade
9186F12120.77
0390F11255.00
0140H10843.00
0140F10630.00
0395F10376.00
sorted_df.iloc[0]
Biweekly High Rate    12120.77
Name: 9186F, dtype: float64
# Ratio of the highest to the lowest mean rate. Two-axis .iloc keeps both
# lookups purely positional; chaining [0] onto a row Series whose index is
# the column name relies on deprecated integer-fallback indexing.
sorted_df.iloc[0, 0] / sorted_df.iloc[-1, 0]
13.931919540229886

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值