数值特征

离散值处理

import pandas as pd
import numpy as np
# Load the video-game sales table; the CSV is decoded as ISO-8859-1.
vg_df = pd.read_csv('datasets/vgsales.csv', encoding = "ISO-8859-1")
# Preview the descriptive columns for the rows at positions 1 through 6.
vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[1:7]
NamePlatformYearGenrePublisher
1Super Mario Bros.NES1985.0PlatformNintendo
2Mario Kart WiiWii2008.0RacingNintendo
3Wii Sports ResortWii2009.0SportsNintendo
4Pokemon Red/Pokemon BlueGB1996.0Role-PlayingNintendo
5TetrisGB1989.0PuzzleNintendo
6New Super Mario Bros.DS2006.0PlatformNintendo
# Collect the distinct genre names (np.unique returns them sorted).
genres = np.unique(vg_df['Genre'])
genres
array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

LabelEncoder

from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the genre column and obtain the integer code of every row.
gle = LabelEncoder()
genre_labels = gle.fit_transform(vg_df['Genre'])
# Position i in classes_ is the genre that was assigned code i.
genre_mappings = dict(enumerate(gle.classes_))
genre_mappings
{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}
# Store the integer codes next to the original genre strings.
vg_df['GenreLabel'] = genre_labels
# Preview rows 1-6 to compare each genre with its code.
vg_df[['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']].iloc[1:7]
NamePlatformYearGenreGenreLabel
1Super Mario Bros.NES1985.0Platform4
2Mario Kart WiiWii2008.0Racing6
3Wii Sports ResortWii2009.0Sports10
4Pokemon Red/Pokemon BlueGB1996.0Role-Playing7
5TetrisGB1989.0Puzzle5
6New Super Mario Bros.DS2006.0Platform4

Map

# Load the Pokemon dataset and preview its first five rows.
poke_df = pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
poke_df.head()
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison318454949656545Gen 1False
12IvysaurGrassPoison405606263808060Gen 1False
23VenusaurGrassPoison52580828310010080Gen 1False
33VenusaurMega VenusaurGrassPoison6258010012312212080Gen 1False
44CharmanderFireNaN309395243605065Gen 1False
# Shuffle every row (frac=1) with a fixed seed so the order is reproducible,
# then replace the old index with a fresh 0..n-1 index.
poke_df = poke_df.sample(random_state=1, frac=1).reset_index(drop=True)

# List the distinct generation values.
np.unique(poke_df['Generation'])
array(['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'], dtype=object)
# Ordinal encoding: each generation name is paired with its rank 1..6.
gen_ord_map = dict(zip(
    ('Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'),
    range(1, 7),
))

# Translate every generation string through the mapping.
poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)
poke_df[['Name', 'Generation', 'GenerationLabel']].iloc[4:10]
NameGenerationGenerationLabel
4OctilleryGen 22
5HelioptileGen 66
6DialgaGen 44
7DeoxysDefense FormeGen 33
8RapidashGen 11
9SwannaGen 55

One-hot Encoding

# Preview the two categorical columns that are one-hot encoded next (rows 4-9).
poke_df[['Name', 'Generation', 'Legendary']].iloc[4:10]
NameGenerationLegendary
4OctilleryGen 2False
5HelioptileGen 6False
6DialgaGen 4True
7DeoxysDefense FormeGen 3True
8RapidashGen 1False
9SwannaGen 5False
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Step 1: integer-encode the generation strings with LabelEncoder
gen_le = LabelEncoder()
gen_labels = gen_le.fit_transform(poke_df['Generation'])
poke_df['Gen_Label'] = gen_labels

poke_df_sub = poke_df[['Name', 'Generation', 'Gen_Label', 'Legendary']]

# Step 2: one-hot encode the integer labels with OneHotEncoder
# (.toarray() turns the encoder's result into a dense array)
gen_ohe = OneHotEncoder()
gen_feature_arr = gen_ohe.fit_transform(poke_df[['Gen_Label']]).toarray()
# Name the one-hot columns after the original generation strings
gen_feature_labels = list(gen_le.classes_)

# Combine the encoded features with the selected original columns in one dataframe
gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)
poke_df_ohe = pd.concat([poke_df_sub, gen_features], axis=1)
poke_df_ohe.head()
['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6']
NameGenerationGen_LabelLegendaryGen 1Gen 2Gen 3Gen 4Gen 5Gen 6
0CharizardMega Charizard YGen 10False1.00.00.00.00.00.0
1AbomasnowGen 43False0.00.00.01.00.00.0
2SentretGen 21False0.01.00.00.00.00.0
3LitleoGen 65False0.00.00.00.00.01.0
4OctilleryGen 21False0.01.00.00.00.00.0
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Integer-encode the Pokemon generation strings
gen_le = LabelEncoder()
gen_labels = gen_le.fit_transform(poke_df['Generation'])
poke_df['Gen_Label'] = gen_labels

# Integer-encode the Pokemon legendary flag
leg_le = LabelEncoder()
leg_labels = leg_le.fit_transform(poke_df['Legendary'])
poke_df['Lgnd_Label'] = leg_labels

# Keep each original column side by side with its encoded counterpart; preview rows 4-9
poke_df_sub = poke_df[['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]
poke_df_sub.iloc[4:10]
NameGenerationGen_LabelLegendaryLgnd_Label
4OctilleryGen 21False0
5HelioptileGen 65False0
6DialgaGen 43True1
7DeoxysDefense FormeGen 32True1
8RapidashGen 10False0
9SwannaGen 54False0
# One-hot encode the generation codes; columns are named after the original classes.
gen_feature_labels = list(gen_le.classes_)
gen_ohe = OneHotEncoder()
gen_feature_arr = gen_ohe.fit_transform(poke_df[['Gen_Label']]).toarray()
print(gen_feature_labels)
gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)

# One-hot encode the legendary codes; columns are named 'Legendary_<class>'.
leg_feature_labels = ['Legendary_' + str(flag) for flag in leg_le.classes_]
leg_ohe = OneHotEncoder()
leg_feature_arr = leg_ohe.fit_transform(poke_df[['Lgnd_Label']]).toarray()
print(leg_feature_labels)
leg_features = pd.DataFrame(leg_feature_arr, columns=leg_feature_labels)
['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6']
['Legendary_False', 'Legendary_True']
# Put the original columns and both sets of one-hot columns side by side.
poke_df_ohe = pd.concat([poke_df_sub, gen_features, leg_features], axis=1)
# Display order: generation columns with their dummies, then the legendary ones.
columns = (
    ['Name', 'Generation', 'Gen_Label']
    + gen_feature_labels
    + ['Legendary', 'Lgnd_Label']
    + leg_feature_labels
)
poke_df_ohe[columns].iloc[4:10]
NameGenerationGen_LabelGen 1Gen 2Gen 3Gen 4Gen 5Gen 6LegendaryLgnd_LabelLegendary_FalseLegendary_True
4OctilleryGen 210.01.00.00.00.00.0False01.00.0
5HelioptileGen 650.00.00.00.00.01.0False01.00.0
6DialgaGen 430.00.00.01.00.00.0True10.01.0
7DeoxysDefense FormeGen 320.00.01.00.00.00.0True10.01.0
8RapidashGen 101.00.00.00.00.00.0False01.00.0
9SwannaGen 540.00.00.00.01.00.0False01.00.0

Get Dummy

# Dummy coding: drop_first=True omits the first category's column, so 'Gen 1'
# is represented by zeros in all remaining columns.
gen_dummy_features = pd.get_dummies(poke_df['Generation'], drop_first=True)
pd.concat([poke_df[['Name', 'Generation']], gen_dummy_features], axis=1).iloc[4:10]
NameGenerationGen 2Gen 3Gen 4Gen 5Gen 6
4OctilleryGen 210000
5HelioptileGen 600001
6DialgaGen 400100
7DeoxysDefense FormeGen 301000
8RapidashGen 100000
9SwannaGen 500010
# Full one-hot coding: one indicator column per generation.
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
pd.concat([poke_df[['Name', 'Generation']], gen_onehot_features], axis=1).iloc[4:10]
NameGenerationGen 1Gen 2Gen 3Gen 4Gen 5Gen 6
4OctilleryGen 2010000
5HelioptileGen 6000001
6DialgaGen 4000100
7DeoxysDefense FormeGen 3001000
8RapidashGen 1100000
9SwannaGen 5000010

# Same one-hot coding, with every generated column name prefixed by 'one-hot_'.
gen_onehot_features = pd.get_dummies(poke_df['Generation'],prefix = 'one-hot')
pd.concat([poke_df[['Name', 'Generation']], gen_onehot_features], axis=1).iloc[4:10]
NameGenerationone-hot_Gen 1one-hot_Gen 2one-hot_Gen 3one-hot_Gen 4one-hot_Gen 5one-hot_Gen 6
4OctilleryGen 2010000
5HelioptileGen 6000001
6DialgaGen 4000100
7DeoxysDefense FormeGen 3001000
8RapidashGen 1100000
9SwannaGen 5000010
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import scipy.stats as spstats

# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Plot styling: 'classic' style, fully transparent figure background,
# 6x4 inch figures at 100 dpi.
mpl.style.reload_library()
mpl.style.use('classic')
mpl.rcParams['figure.facecolor'] = (1, 1, 1, 0)
mpl.rcParams['figure.figsize'] = [6.0, 4.0]
mpl.rcParams['figure.dpi'] = 100
# Reload the Pokemon data from disk and preview the first five rows.
poke_df = pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
poke_df.head()
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison318454949656545Gen 1False
12IvysaurGrassPoison405606263808060Gen 1False
23VenusaurGrassPoison52580828310010080Gen 1False
33VenusaurMega VenusaurGrassPoison6258010012312212080Gen 1False
44CharmanderFireNaN309395243605065Gen 1False
# Preview three raw numeric columns.
poke_df[['HP', 'Attack', 'Defense']].head()
HPAttackDefense
0454949
1606263
2808283
380100123
4395243
# Summary statistics (count, mean, std, min, quartiles, max) of the same columns.
poke_df[['HP', 'Attack', 'Defense']].describe()
HPAttackDefense
count800.000000800.000000800.000000
mean69.25875079.00125073.842500
std25.53466932.45736631.183501
min1.0000005.0000005.000000
25%50.00000055.00000050.000000
50%65.00000075.00000070.000000
75%80.000000100.00000090.000000
max255.000000190.000000230.000000
# Load the per-user song listen counts and preview the first ten rows.
popsong_df = pd.read_csv('datasets/song_views.csv', encoding='utf-8')
popsong_df.head(10)
user_idsong_idtitlelisten_count
0b6b799f34a204bd928ea014c243ddad6d0be4f8fSOBONKR12A58A7A7E0You're The One2
1b41ead730ac14f6b6717b9cf8859d5579f3f8d4dSOBONKR12A58A7A7E0You're The One0
24c84359a164b161496d05282707cecbd50adbfc4SOBONKR12A58A7A7E0You're The One0
3779b5908593756abb6ff7586177c966022668b06SOBONKR12A58A7A7E0You're The One0
4dd88ea94f605a63d9fc37a214127e3f00e85e42dSOBONKR12A58A7A7E0You're The One0
568f0359a2f1cedb0d15c98d88017281db79f9bc6SOBONKR12A58A7A7E0You're The One0
6116a4c95d63623a967edf2f3456c90ebbf964e6fSOBONKR12A58A7A7E0You're The One17
745544491ccfcdc0b0803c34f201a6287ed4e30f8SOBONKR12A58A7A7E0You're The One0
8e701a24d9b6c59f5ac37ab28462ca82470e27cfbSOBONKR12A58A7A7E0You're The One68
9edc8b7b1fd592a3b69c3d823a742e1a064abec95SOBONKR12A58A7A7E0You're The One0

二值特征

# Binary "listened at least once" flag derived from the raw listen counts.
# np.array() copies the column, so the original counts stay untouched.
watched = np.array(popsong_df['listen_count'])
watched = np.where(watched >= 1, 1, watched)
popsong_df['watched'] = watched
popsong_df.head(10)
user_idsong_idtitlelisten_countwatched
0b6b799f34a204bd928ea014c243ddad6d0be4f8fSOBONKR12A58A7A7E0You're The One21
1b41ead730ac14f6b6717b9cf8859d5579f3f8d4dSOBONKR12A58A7A7E0You're The One00
24c84359a164b161496d05282707cecbd50adbfc4SOBONKR12A58A7A7E0You're The One00
3779b5908593756abb6ff7586177c966022668b06SOBONKR12A58A7A7E0You're The One00
4dd88ea94f605a63d9fc37a214127e3f00e85e42dSOBONKR12A58A7A7E0You're The One00
568f0359a2f1cedb0d15c98d88017281db79f9bc6SOBONKR12A58A7A7E0You're The One00
6116a4c95d63623a967edf2f3456c90ebbf964e6fSOBONKR12A58A7A7E0You're The One171
745544491ccfcdc0b0803c34f201a6287ed4e30f8SOBONKR12A58A7A7E0You're The One00
8e701a24d9b6c59f5ac37ab28462ca82470e27cfbSOBONKR12A58A7A7E0You're The One681
9edc8b7b1fd592a3b69c3d823a742e1a064abec95SOBONKR12A58A7A7E0You're The One00
from sklearn.preprocessing import Binarizer

# Same binarisation done with scikit-learn's Binarizer.
# NOTE(review): Binarizer is expected to map values above the threshold to 1 and
# the rest to 0, so 0.9 acts like ">= 1" for integer counts — confirm in the docs.
bn = Binarizer(threshold=0.9)
# The column is wrapped in a list to form a single-row 2-D input; [0] takes that row back out.
pd_watched = bn.transform([popsong_df['listen_count']])[0]
popsong_df['pd_watched'] = pd_watched
popsong_df.head(10)
user_idsong_idtitlelisten_countwatchedpd_watched
0b6b799f34a204bd928ea014c243ddad6d0be4f8fSOBONKR12A58A7A7E0You're The One211
1b41ead730ac14f6b6717b9cf8859d5579f3f8d4dSOBONKR12A58A7A7E0You're The One000
24c84359a164b161496d05282707cecbd50adbfc4SOBONKR12A58A7A7E0You're The One000
3779b5908593756abb6ff7586177c966022668b06SOBONKR12A58A7A7E0You're The One000
4dd88ea94f605a63d9fc37a214127e3f00e85e42dSOBONKR12A58A7A7E0You're The One000
568f0359a2f1cedb0d15c98d88017281db79f9bc6SOBONKR12A58A7A7E0You're The One000
6116a4c95d63623a967edf2f3456c90ebbf964e6fSOBONKR12A58A7A7E0You're The One1711
745544491ccfcdc0b0803c34f201a6287ed4e30f8SOBONKR12A58A7A7E0You're The One000
8e701a24d9b6c59f5ac37ab28462ca82470e27cfbSOBONKR12A58A7A7E0You're The One6811
9edc8b7b1fd592a3b69c3d823a742e1a064abec95SOBONKR12A58A7A7E0You're The One000

多项式特征

# Reload the Pokemon data and keep only the two columns used for polynomial features.
poke_df = pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
atk_def = poke_df[['Attack', 'Defense']]
atk_def.head()
AttackDefense
04949
16263
28283
3100123
45243
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion of (Attack, Defense) without a bias column;
# interaction_only=False keeps the squared terms as well as the cross term.
pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
res = pf.fit_transform(atk_def)
# Show the first five transformed rows.
res[:5]
array([[    49.,     49.,   2401.,   2401.,   2401.],
       [    62.,     63.,   3844.,   3906.,   3969.],
       [    82.,     83.,   6724.,   6806.,   6889.],
       [   100.,    123.,  10000.,  12300.,  15129.],
       [    52.,     43.,   2704.,   2236.,   1849.]])
# Label the five generated columns: the two inputs, then Attack squared,
# the Attack*Defense cross term and Defense squared.
intr_features = pd.DataFrame(res, columns=['Attack', 'Defense', 'Attack^2', 'Attack x Defense', 'Defense^2'])
intr_features.head(5)
AttackDefenseAttack^2Attack x DefenseDefense^2
049.049.02401.02401.02401.0
162.063.03844.03906.03969.0
282.083.06724.06806.06889.0
3100.0123.010000.012300.015129.0
452.043.02704.02236.01849.0

binning特征

# Load the FCC 2016 coder survey subset and preview the columns used for binning.
fcc_survey_df = pd.read_csv('datasets/fcc_2016_coder_survey_subset.csv', encoding='utf-8')
fcc_survey_df[['ID.x', 'EmploymentField', 'Age', 'Income']].head()
ID.xEmploymentFieldAgeIncome
0cef35615d61b202f1dc794ef2746df14office and administrative support28.032000.0
1323e5a113644d18185c743c241407754food and beverage22.015000.0
2b29a1027e5cd062e654a63764157461dfinance19.048000.0
304a11e4bcb573a1261eb0d9948d32637arts, entertainment, sports, or media26.043000.0
49368291c93d5d5f5c8cdb1a575e18beceducation20.06000.0
# Histogram of developer ages, drawn on the axes that were just created.
fig, ax = plt.subplots()
fcc_survey_df['Age'].hist(color='#A9C5D3')
ax.set_title('Developer Age Histogram', fontsize=12)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
Text(0,0.5,'Frequency')

![output_35_1](output_35_1.png)

Binning based on rounding

Age Range: Bin
---------------
 0 -  9  : 0
10 - 19  : 1
20 - 29  : 2
30 - 39  : 3
40 - 49  : 4
50 - 59  : 5
60 - 69  : 6
  ... and so on
# Bin index is the decade of the age: floor(age / 10).
fcc_survey_df['Age_bin_round'] = np.floor(fcc_survey_df['Age'].values / 10.)
fcc_survey_df[['ID.x', 'Age', 'Age_bin_round']].iloc[1071:1076]
ID.xAgeAge_bin_round
10716a02aa4618c99fdb3e24de522a09943117.01.0
1072f0e5e47278c5f248fe861c5f7214c07a38.03.0
10736e14f6d0779b7e424fa3fdd9e4bd3bf921.02.0
1074c2654c07dc929cdf3dad4d1aec4ffbb353.05.0
1075f07449fc9339b2e57703ec788623252335.03.0

分位数切分

# Preview age and income for rows 4-8 before quantile binning.
fcc_survey_df[['ID.x', 'Age', 'Income']].iloc[4:9]
ID.xAgeIncome
49368291c93d5d5f5c8cdb1a575e18bec20.06000.0
5dd0e77eab9270e4b67c19b0d6bbf621b34.040000.0
67599c0aa0419b59fd11ffede98a3665d23.032000.0
76dff182db452487f07a47596f314bddc35.040000.0
89dc233f8ed1c6eb2432672ab4bb3924933.080000.0
# Histogram of developer income using 30 bins.
fig, ax = plt.subplots()
fcc_survey_df['Income'].hist(bins=30, color='#A9C5D3')
ax.set_title('Developer Income Histogram', fontsize=12)
ax.set_xlabel('Developer Income', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
Text(0,0.5,'Frequency')

![output_40_1](output_40_1.png)

# Quartile cut points: minimum, 25th, 50th and 75th percentiles, and maximum.
quantile_list = [0, .25, .5, .75, 1.]
quantiles = fcc_survey_df['Income'].quantile(quantile_list)
quantiles
0.00      6000.0
0.25     20000.0
0.50     37000.0
0.75     60000.0
1.00    200000.0
Name: Income, dtype: float64
# Income histogram with the quartile cut points overlaid.
fig, ax = plt.subplots()
fcc_survey_df['Income'].hist(bins=30, color='#A9C5D3', ax=ax)

# One red vertical line per cut point; the handle of the last line drawn
# is reused for the single legend entry.
for quantile in quantiles:
    qvl = ax.axvline(quantile, color='r')
ax.legend([qvl], ['Quantiles'], fontsize=10)

ax.set_title('Developer Income Histogram with Quantiles', fontsize=12)
ax.set_xlabel('Developer Income', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
Text(0,0.5,'Frequency')

![output_42_1](output_42_1.png)

# Readable names for the four quartile bins.
quantile_labels = ['0-25Q', '25-50Q', '50-75Q', '75-100Q']
# The same quartile binning twice: first keeping the interval itself ...
fcc_survey_df['Income_quantile_range'] = pd.qcut(fcc_survey_df['Income'], 
                                                 q=quantile_list)
# ... then replacing each interval with its readable label.
fcc_survey_df['Income_quantile_label'] = pd.qcut(fcc_survey_df['Income'], 
                                                 q=quantile_list, labels=quantile_labels)
fcc_survey_df[['ID.x', 'Age', 'Income', 
               'Income_quantile_range', 'Income_quantile_label']].iloc[4:9]
ID.xAgeIncomeIncome_quantile_rangeIncome_quantile_label
49368291c93d5d5f5c8cdb1a575e18bec20.06000.0(5999.999, 20000.0]0-25Q
5dd0e77eab9270e4b67c19b0d6bbf621b34.040000.0(37000.0, 60000.0]50-75Q
67599c0aa0419b59fd11ffede98a3665d23.032000.0(20000.0, 37000.0]25-50Q
76dff182db452487f07a47596f314bddc35.040000.0(37000.0, 60000.0]50-75Q
89dc233f8ed1c6eb2432672ab4bb3924933.080000.0(60000.0, 200000.0]75-100Q

对数变换 (Box-Cox)

# Natural log of (1 + income); the +1 keeps an income of zero finite.
fcc_survey_df['Income_log'] = np.log((1+ fcc_survey_df['Income']))
fcc_survey_df[['ID.x', 'Age', 'Income', 'Income_log']].iloc[4:9]
ID.xAgeIncomeIncome_log
49368291c93d5d5f5c8cdb1a575e18bec20.06000.08.699681
5dd0e77eab9270e4b67c19b0d6bbf621b34.040000.010.596660
67599c0aa0419b59fd11ffede98a3665d23.032000.010.373522
76dff182db452487f07a47596f314bddc35.040000.010.596660
89dc233f8ed1c6eb2432672ab4bb3924933.080000.011.289794
# Mean of the log-transformed income, rounded to two decimals for the annotation.
income_log_mean = np.round(np.mean(fcc_survey_df['Income_log']), 2)

# Histogram of the log-transformed income with a red line marking the mean.
fig, ax = plt.subplots()
fcc_survey_df['Income_log'].hist(bins=30, color='#A9C5D3')
plt.axvline(income_log_mean, color='r')
ax.set_title('Developer Income Histogram after Log Transform', fontsize=12)
ax.set_xlabel('Developer Income (log scale)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
# The mean label is placed at hard-coded data coordinates (x=11.5, y=450).
ax.text(11.5, 450, r'$\mu$='+str(income_log_mean), fontsize=10)
Text(11.5,450,'$\\mu$=10.43')

![output_46_1](output_46_1.png)

日期相关特征

import datetime
import numpy as np
import pandas as pd
from dateutil.parser import parse
import pytz
import numpy as np
import pandas as pd
# Four sample timestamp strings, each carrying a different UTC offset.
time_stamps = [
    '2015-03-08 10:30:00.360000+00:00',
    '2017-07-13 15:45:05.755000-07:00',
    '2012-01-20 22:30:00.254000+05:30',
    '2016-12-25 00:30:00.000000+10:00',
]
# Single-column frame holding the raw strings.
df = pd.DataFrame({'Time': time_stamps})
df
Time
02015-03-08 10:30:00.360000+00:00
12017-07-13 15:45:05.755000-07:00
22012-01-20 22:30:00.254000+05:30
32016-12-25 00:30:00.000000+10:00
# Parse every raw string into a pandas Timestamp that keeps its own UTC offset.
ts_objs = np.array([pd.Timestamp(raw_ts) for raw_ts in df['Time']])
df['TS_obj'] = ts_objs
ts_objs
array([Timestamp('2015-03-08 10:30:00.360000+0000', tz='UTC'),
       Timestamp('2017-07-13 15:45:05.755000-0700', tz='pytz.FixedOffset(-420)'),
       Timestamp('2012-01-20 22:30:00.254000+0530', tz='pytz.FixedOffset(330)'),
       Timestamp('2016-12-25 00:30:00+1000', tz='pytz.FixedOffset(600)')], dtype=object)
# Derive calendar features from each parsed timestamp.
# Each attribute is read element-wise with apply() on the individual
# Timestamp objects stored in the 'TS_obj' column.
df['Year'] = df['TS_obj'].apply(lambda d: d.year)
df['Month'] = df['TS_obj'].apply(lambda d: d.month)
df['Day'] = df['TS_obj'].apply(lambda d: d.day)
# Day of week as an integer, Monday=0 ... Sunday=6.
df['DayOfWeek'] = df['TS_obj'].apply(lambda d: d.dayofweek)
# day_name() replaces the deprecated Timestamp.weekday_name attribute,
# which raises AttributeError on pandas versions where it has been removed.
df['DayName'] = df['TS_obj'].apply(lambda d: d.day_name())
df['DayOfYear'] = df['TS_obj'].apply(lambda d: d.dayofyear)
df['WeekOfYear'] = df['TS_obj'].apply(lambda d: d.weekofyear)
df['Quarter'] = df['TS_obj'].apply(lambda d: d.quarter)

# Show the raw string next to the derived calendar features.
df[['Time', 'Year', 'Month', 'Day', 'Quarter',
    'DayOfWeek', 'DayName', 'DayOfYear', 'WeekOfYear']]
e:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: FutureWarning: `weekday_name` is deprecated and will be removed in a future version. Use `day_name` instead
  """
TimeYearMonthDayQuarterDayOfWeekDayNameDayOfYearWeekOfYear
02015-03-08 10:30:00.360000+00:0020153816Sunday6710
12017-07-13 15:45:05.755000-07:00201771333Thursday19428
22012-01-20 22:30:00.254000+05:30201212014Friday203
32016-12-25 00:30:00.000000+10:002016122546Sunday36051

时间相关特征

# Derive time-of-day features from each parsed timestamp.
df['Hour'] = df['TS_obj'].apply(lambda d: d.hour)
df['Minute'] = df['TS_obj'].apply(lambda d: d.minute)
df['Second'] = df['TS_obj'].apply(lambda d: d.second)
df['MUsecond'] = df['TS_obj'].apply(lambda d: d.microsecond)   # microseconds (not milliseconds)
df['UTC_offset'] = df['TS_obj'].apply(lambda d: d.utcoffset()) # offset from UTC as a timedelta

# Show the raw string next to the derived time features.
df[['Time', 'Hour', 'Minute', 'Second', 'MUsecond', 'UTC_offset']]
TimeHourMinuteSecondMUsecondUTC_offset
02015-03-08 10:30:00.360000+00:001030036000000:00:00
12017-07-13 15:45:05.755000-07:0015455755000-1 days +17:00:00
22012-01-20 22:30:00.254000+05:302230025400005:30:00
32016-12-25 00:30:00.000000+10:000300010:00:00

按照早晚切分时间

# Hour boundaries for the five day parts; starting at -1 lets hour 0 fall in the first bin.
hour_bins = [-1, 5, 11, 16, 21, 23]
bin_names = ['Late Night', 'Morning', 'Afternoon', 'Evening', 'Night']
# With pd.cut's default right-closed intervals the bins are
# (-1, 5], (5, 11], (11, 16], (16, 21] and (21, 23].
df['TimeOfDayBin'] = pd.cut(df['Hour'], 
                            bins=hour_bins, labels=bin_names)
df[['Time', 'Hour', 'TimeOfDayBin']]
TimeHourTimeOfDayBin
02015-03-08 10:30:00.360000+00:0010Morning
12017-07-13 15:45:05.755000-07:0015Afternoon
22012-01-20 22:30:00.254000+05:3022Night
32016-12-25 00:30:00.000000+10:000Late Night
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Wency(王斯-CUEB)

我不是要饭的

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值