需求:给出一个公式对帖子进行粗排
16个特征:浏览数view_cnt,点击率ctr,搜索词和文章的相关性text_score,评论数comment_cnt,创建时间create_hours....
第一步:合并训练和测试数据
拉取最近7天的帖子的点击记录,和16个特征。把前1天的数据合并为test测试集,把前2~7天合并为训练集。
第二步:特征处理
特征实例:
0 93 0 5 None None 0.11201316033651 0 0 0 0 55.56929 3153 0.029495 0.0 0.0 0.0
0 83 0 5 None None 0.15016227641681 1 0 0 0 55.281693 1765 0.04702 0.000566572 0.0 0.0
说明:第一列为label是否点击,剩下16列为特征列。最终我们得到两个文件一个test,一个train。
第三步:训练模型
"""
import pandas as pd
import numpy as np
from sklearn import tree
import pydot_ng as pydot
from sklearn.externals.six import StringIO
from sklearn.metrics import roc_auc_score
import re
# Names of the 16 feature columns, in the exact column order of the train/test
# TSV dumps (the leading, unnamed column in those files is the binary click label).
feature_names = ["view_cnt", "pgc_yn", "post_type", "ctr", "ctr_min_est", "newsort", "up_cnt", "ding_cnt",
"comment_cnt", "post_video_yn", "text_score", "create_hours", "view_cnt_avg", "up_cnt_avg",
"ding_cnt_avg", "comment_cnt_avg"]
def read_file(in_file):
    """Load a tab-separated label+feature file into a DataFrame.

    The file has 17 tab-separated columns: a binary click label followed by
    the 16 features listed in ``feature_names``; the literal string "None"
    marks a missing value and becomes NaN.

    Side effects: writes the pairwise correlation matrix to ``./corr`` and
    prints, for each feature, its NA count and decile distribution.

    Parameters
    ----------
    in_file : str
        Path to the TSV file (e.g. ``./train`` or ``./test``).

    Returns
    -------
    pandas.DataFrame
        Columns ``['label'] + feature_names``.
    """
    df = pd.read_csv(in_file, sep='\t', names=['label'] + feature_names, na_values="None")
    # Persist the correlation matrix for offline feature inspection.
    df.corr().to_csv("corr", sep='\t')
    # Decile breakpoints 0%,10%,...,100% — invariant, so hoisted out of the loop.
    percentiles = np.arange(0.0, 1.1, 0.1)
    # Print the distribution of each feature: NA count plus deciles.
    for fea in feature_names:
        print(f"\n{fea}************************")
        na_num = df[fea].isna().sum()
        print(f"空值数量{na_num}")
        print(df[fea].astype(float).describe(percentiles))
    return df
def extract_seg(infile, fea):
    """Parse a graphviz ``.dot`` dump of a fitted decision tree and return
    the sorted, de-duplicated split thresholds used for feature ``fea``.

    Node labels in ``export_graphviz`` output look like
    ``label="fea <= 12.5\\ngini ...`` (a literal backslash-n, not a newline);
    the numeric threshold between ``<=`` and that ``\\n`` is extracted.

    Parameters
    ----------
    infile : str
        Path to the .dot file produced by ``tree.export_graphviz``.
    fea : str
        Feature name to look for in node labels.

    Returns
    -------
    list[float]
        Ascending, unique split thresholds (empty if ``fea`` never splits).
    """
    # re.escape guards against regex metacharacters in the feature name;
    # '\\\\' in the raw pattern matches the single literal backslash that
    # graphviz uses as an embedded newline inside node labels.
    pattern = re.compile(rf'label="{re.escape(fea)} <=.*\\ngini')
    segs = set()
    with open(infile, 'r') as f:
        # Stream line by line instead of slurping the whole file.
        for line in f:
            for match in pattern.findall(line):
                # e.g. 'label="ctr <= 0.05\ngini' -> 0.05
                segs.add(float(match.split('<=')[1].split('\\')[0]))
    return sorted(segs)
def get_segs(train_df, test_df, fea, max_depth=4, min_samples_leaf=1000):
    """Fit a one-feature decision tree and recover its split thresholds.

    Trains a shallow ``DecisionTreeClassifier`` on the single column ``fea``
    against the click label, exports the tree to ``./model/<fea>.dot``,
    parses the split thresholds back out of the dot text, and scores the
    tree on the held-out day to judge the feature's standalone usefulness.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with a ``label`` column and the feature column ``fea``.
    fea : str
        Feature to bin.
    max_depth, min_samples_leaf : int
        Tree regularization knobs (few, well-populated bins).

    Returns
    -------
    tuple[float, list[float]]
        (test AUC of the single-feature tree, sorted split thresholds).
    """
    clf = tree.DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    # Drop rows where this feature is missing: the tree cannot handle NaN.
    train_valid = train_df.loc[train_df[fea].notna()]
    test_valid = test_df.loc[test_df[fea].notna()]
    clf.fit(train_valid[fea].values.reshape(-1, 1), train_valid["label"])
    dot_file = f"./model/{fea}.dot"
    # Make sure the output directory exists before export_graphviz opens the file.
    import os
    os.makedirs("./model", exist_ok=True)
    # export_graphviz accepts a path directly — no need for the deprecated
    # sklearn.externals.six StringIO round-trip.
    tree.export_graphviz(clf, out_file=dot_file, feature_names=[fea])
    segs = extract_seg(dot_file, fea)
    # Score the single-feature tree on the held-out day (probability of click).
    test_pred = clf.predict_proba(test_valid[fea].values.reshape(-1, 1))[:, 1]
    test_auc = roc_auc_score(test_valid["label"], test_pred)
    segs_length = len(segs)
    print(f"fea: \"{fea}\"\n auc: {test_auc}, segs_length: {segs_length}, segs: {segs}\n")
    return test_auc, segs
def write_segs(train_df, test_df, segs_file):
    """Compute split thresholds for every feature and persist the useful ones.

    For each feature in ``feature_names``, fits a single-feature tree via
    ``get_segs``; features whose test AUC is <= 0.515 (barely better than
    random) are dropped, the rest are written one per line as
    ``<feature>`` TAB ``<thresholds>``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames returned by ``read_file``.
    segs_file : str
        Output path for the kept thresholds.
    """
    # Context manager guarantees the file is closed even if get_segs raises.
    with open(segs_file, 'w') as segs_out:
        for fea in feature_names:
            test_auc, segs = get_segs(train_df, test_df, fea, max_depth=4, min_samples_leaf=1000)
            if test_auc <= 0.515:
                # AUC barely above 0.5: the feature alone has no ranking power.
                print(f"************{fea}:{test_auc} is dropped*************\n")
            else:
                segs_out.write(f"{fea}\t{segs}\n")
if __name__ == "__main__":
    # Load the 6-day training dump and the 1-day holdout, then derive the
    # per-feature split thresholds and persist the useful ones to ./segs.
    training = read_file("./train")
    holdout = read_file("./test")
    write_segs(training, holdout, './segs')