def cut_col(data, col_name, cut_list):
    """Collapse rare ("long-tail") values of one column into count buckets.

    Each value of ``data[col_name]`` is replaced by the label
    ``'count_<box>'`` for the first ``box`` in ``cut_list`` such that the
    number of rows holding that value is ``<= box``.  Values that occur
    more often than every threshold are kept unchanged.

    Args:
        data: DataFrame containing the column ``col_name``.  Not modified.
        col_name: Name of the column to process.
        cut_list: Iterable of count thresholds, checked in the given order
            (the first threshold that is satisfied wins).

    Returns:
        A Series carrying the same index as ``data``, so that
        ``data[col_name] = cut_col(data, col_name, cut_list)`` assigns every
        row correctly regardless of how ``data`` is indexed.
    """
    print('cutting', col_name)
    values = data[col_name]
    # Per-row frequency of the row's own value.  Mapping (instead of merging
    # a counts frame back in) keeps the original index and avoids the
    # index-level/column-label name clash raised by newer pandas versions.
    # Missing values get a NaN count, which never satisfies `<=`, so they
    # are left untouched.
    counts = values.map(values.value_counts())

    result = values.copy()
    # Rows that have not yet been assigned to a bucket.
    unassigned = pd.Series(True, index=values.index)
    for box in cut_list:
        hit = unassigned & (counts <= box)
        if hit.any():
            # Cast to object only when a label is actually written, so a
            # column with no rare values keeps its original dtype.
            result = result.astype(object).where(~hit, 'count_' + str(box))
            unassigned = unassigned & ~hit
    return result
# Maps a tuple of column names to the list of count thresholds that is
# passed to cut_col() for every column in that tuple.
cut_col_dict = {
    (
        'pkgname',
        'ver',
        'reqrealip',
        'adidmd5',
        'imeimd5',
        'openudidmd5',
        'macmd5',
        'model',
        'make',
    ): [3],
    ('ip',): [3, 5, 10],
}

# Replace each configured column of `data` with its bucketed version.
# NOTE(review): `data` is not defined in this snippet; it is presumably a
# DataFrame loaded earlier -- confirm against the surrounding code.
for cut_cols in cut_col_dict:
    cut_list = cut_col_dict[cut_cols]
    for col in cut_cols:
        data[col] = cut_col(data, col, cut_list)
数据挖掘-去长尾操作
最新推荐文章于 2021-09-23 15:32:52 发布