import featuretools as ft
# Create the EntitySet and register the training observations as the 'obs'
# entity, using 'index' as its index and 'time' as its time index.
es = ft.EntitySet(id='engines').entity_from_dataframe(
    entity_id='obs',
    dataframe=train,
    index='index',
    time_index='time',
)

# Set up the relationship between the two tables: derive an 'engines' entity
# from 'obs', keyed on 'engine_no'.
es.normalize_entity(
    base_entity_id='obs',
    new_entity_id='engines',
    index='engine_no',
)

# Deep feature synthesis (DFS) is the step that generates the new features;
# the target here is the 'engines' entity.
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='engines',
    agg_primitives=['min', 'max', 'mean', 'count', 'sum'],
    trans_primitives=['cum_mean', 'cum_sum'],
    max_depth=1,
    n_jobs=-1,
    verbose=1,
)
# Test data: read the observations and call reset_index() so the row index
# becomes a regular column (used as index='index' below).
test = pd.read_csv('../input/test_obs.csv').reset_index()

# Mirror the training setup: an 'obs' entity plus an 'engines' entity derived
# from it on 'engine_no'.
test_es = ft.EntitySet(id='test_engines').entity_from_dataframe(
    entity_id='obs',
    dataframe=test,
    index='index',
    time_index='time',
)
test_es.normalize_entity(
    base_entity_id='obs',
    new_entity_id='engines',
    index='engine_no',
)

# The feature definitions already exist, so DFS is not run again;
# calculate_feature_matrix evaluates that feature list on the test EntitySet.
test_feature_matrix = ft.calculate_feature_matrix(
    feature_names,
    entityset=test_es,
    n_jobs=-1,
    verbose=1,
)
# Feature selection (helper defined elsewhere; called with a correlation
# threshold of 0.9).
feature_matrix = feature_selection(
    feature_matrix,
    correlation_threshold=0.9,
)
# Restrict the test matrix to the columns of the selected training matrix.
test_feature_matrix = test_feature_matrix[feature_matrix.columns]

# Evaluate feature_matrix with cross-validation (per the original note;
# evaluate() is defined elsewhere and not visible here).
preds, fi = evaluate(
    feature_matrix,
    train_labels,
    test_feature_matrix,
    test_labels,
)

# Plot the ten most important features.
norm_fi = plot_feature_importances(fi, 10, color='red')
# Source page: "python-featuretools-feature-selection"
# (page footer: latest recommended article published 2024-08-12 09:15:00)