需求:给出一个公式对帖子进行粗排
16个特征:浏览数view_cnt,点击率ctr,搜索词和文章的相关性text_score,评论数comment_cnt,创建时间create_hours....
第一步:合并训练和测试数据
拉取最近7天的帖子的点击记录,和16个特征。把前1天的数据合并为test测试集,把前2~7天合并为训练集。
第二步:特征处理
特征实例:
0 93 0 5 None None 0.11201316033651 0 0 0 0 55.56929 3153 0.029495 0.0 0.0 0.0
0 83 0 5 None None 0.15016227641681 1 0 0 0 55.281693 1765 0.04702 0.000566572 0.0 0.0
说明:第一列为label是否点击,剩下16列为特征列。最终我们得到两个文件一个test,一个train。
第三步:训练模型
"""
import pandas as pd
import numpy as np
from sklearn import tree
import pydot_ng as pydot
from sklearn.externals.six import StringIO
from sklearn.metrics import roc_auc_score
import re
# Names of the 16 feature columns, in the exact column order of the train/test
# TSV dumps (the leading, unnamed column in those files is the binary click label).
feature_names = ["view_cnt", "pgc_yn", "post_type", "ctr", "ctr_min_est", "newsort", "up_cnt", "ding_cnt",
"comment_cnt", "post_video_yn", "text_score", "create_hours", "view_cnt_avg", "up_cnt_avg",
"ding_cnt_avg", "comment_cnt_avg"]
def read_file(in_file):
    """Load a tab-separated label+feature file into a DataFrame.

    The file has 17 tab-separated columns: a binary click label followed by
    the 16 features listed in ``feature_names``; the literal string "None"
    marks a missing value and becomes NaN.

    Side effects: writes the pairwise correlation matrix to ``./corr`` and
    prints, for each feature, its NA count and decile distribution.

    Parameters
    ----------
    in_file : str
        Path to the TSV file (e.g. ``./train`` or ``./test``).

    Returns
    -------
    pandas.DataFrame
        Columns ``['label'] + feature_names``.
    """
    df = pd.read_csv(in_file, sep='\t', names=['label'] + feature_names, na_values="None")
    # Persist the correlation matrix for offline feature inspection.
    df.corr().to_csv("corr", sep='\t')
    # Decile breakpoints 0%,10%,...,100% — invariant, so hoisted out of the loop.
    percentiles = np.arange(0.0, 1.1, 0.1)
    # Print the distribution of each feature: NA count plus deciles.
    for fea in feature_names:
        print(f"\n{fea}************************")
        na_num = df[fea].isna().sum()
        print(f"空值数量{na_num}")
        print(df[fea].astype(float).describe(percentiles))
    return df
def extract_seg(infile, fea):
    """Parse a graphviz ``.dot`` dump of a fitted decision tree and return
    the sorted, de-duplicated split thresholds used for feature ``fea``.

    Node labels in ``export_graphviz`` output look like
    ``label="fea <= 12.5\\ngini ...`` (a literal backslash-n, not a newline);
    the numeric threshold between ``<=`` and that ``\\n`` is extracted.

    Parameters
    ----------
    infile : str
        Path to the .dot file produced by ``tree.export_graphviz``.
    fea : str
        Feature name to look for in node labels.

    Returns
    -------
    list[float]
        Ascending, unique split thresholds (empty if ``fea`` never splits).
    """
    # re.escape guards against regex metacharacters in the feature name;
    # '\\\\' in the raw pattern matches the single literal backslash that
    # graphviz uses as an embedded newline inside node labels.
    pattern = re.compile(rf'label="{re.escape(fea)} <=.*\\ngini')
    segs = set()
    with open(infile, 'r') as f:
        # Stream line by line instead of slurping the whole file.
        for line in f:
            for match in pattern.findall(line):
                # e.g. 'label="ctr <= 0.05\ngini' -> 0.05
                segs.add(float(match.split('<=')[1].split('\\')[0]))
    return sorted(segs)
def get_segs(train_df, test_df, fea, max_depth=4, min_samples_leaf=1000):
    """Fit a one-feature decision tree and recover its split thresholds.

    Trains a shallow ``DecisionTreeClassifier`` on the single column ``fea``
    against the click label, exports the tree to ``./model/<fea>.dot``,
    parses the split thresholds back out of the dot text, and scores the
    tree on the held-out day to judge the feature's standalone usefulness.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with a ``label`` column and the feature column ``fea``.
    fea : str
        Feature to bin.
    max_depth, min_samples_leaf : int
        Tree regularization knobs (few, well-populated bins).

    Returns
    -------
    tuple[float, list[float]]
        (test AUC of the single-feature tree, sorted split thresholds).
    """
    clf = tree.DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    # Drop rows where this feature is missing: the tree cannot handle NaN.
    train_valid = train_df.loc[train_df[fea].notna()]
    test_valid = test_df.loc[test_df[fea].notna()]
    clf.fit(train_valid[fea].values.reshape(-1, 1), train_valid["label"])
    dot_file = f"./model/{fea}.dot"
    # Make sure the output directory exists before export_graphviz opens the file.
    import os
    os.makedirs("./model", exist_ok=True)
    # export_graphviz accepts a path directly — no need for the deprecated
    # sklearn.externals.six StringIO round-trip.
    tree.export_graphviz(clf, out_file=dot_file, feature_names=[fea])
    segs = extract_seg(dot_file, fea)
    # Score the single-feature tree on the held-out day (probability of click).
    test_pred = clf.predict_proba(test_valid[fea].values.reshape(-1, 1))[:, 1]
    test_auc = roc_auc_score(test_valid["label"], test_pred)
    segs_length = len(segs)
    print(f"fea: \"{fea}\"\n auc: {test_auc}, segs_length: {segs_length}, segs: {segs}\n")
    return test_auc, segs
def write_segs(train_df, test_df, segs_file):
    """Compute split thresholds for every feature and persist the useful ones.

    For each feature in ``feature_names``, fits a single-feature tree via
    ``get_segs``; features whose test AUC is <= 0.515 (barely better than
    random) are dropped, the rest are written one per line as
    ``<feature>`` TAB ``<thresholds>``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames returned by ``read_file``.
    segs_file : str
        Output path for the kept thresholds.
    """
    # Context manager guarantees the file is closed even if get_segs raises.
    with open(segs_file, 'w') as segs_out:
        for fea in feature_names:
            test_auc, segs = get_segs(train_df, test_df, fea, max_depth=4, min_samples_leaf=1000)
            if test_auc <= 0.515:
                # AUC barely above 0.5: the feature alone has no ranking power.
                print(f"************{fea}:{test_auc} is dropped*************\n")
            else:
                segs_out.write(f"{fea}\t{segs}\n")
if __name__ == "__main__":
    # Load the 6-day training dump and the 1-day holdout, then derive the
    # per-feature split thresholds and persist the useful ones to ./segs.
    training = read_file("./train")
    holdout = read_file("./test")
    write_segs(training, holdout, './segs')