1. 训练排序模型
训练排序模型并把模型存放到"model.dat"中:
from sklearn import datasets
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
import xgboost as xgb
# Load the feature matrix and labels from the svmlight-format training file.
data_X, data_Y = datasets.load_svmlight_file("data/model_in.txt")

# Read the group sizes, one integer per line; each value is the number of
# consecutive training rows that belong to one ranking list. Blank lines
# (e.g. a trailing newline at the end of the file) are skipped.
with open("data/model_in.group") as fin:
    data_groups = [int(line) for line in fin if line.strip()]

# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`, so the
# two values below (0.1 vs 0.07) conflict. Confirm which one is intended and
# remove the other.
model = xgb.XGBRanker(
    booster='gbtree',
    objective='rank:pairwise',
    random_state=30,
    learning_rate=0.1,
    colsample_bytree=0.8,
    eta=0.07,
    max_depth=5,
    n_estimators=110,
    subsample=0.75,
)

# Fit the pairwise ranker on the grouped samples and persist it to disk.
model.fit(data_X, data_Y, group=data_groups)
model_file = "model.dat"
model.save_model(model_file)
训练样本的形式:
1 qid:0 0:37.000000 1:0.000000 2:nan
0 qid:0 0:73.000000 1:0.000000 2:nan
1 qid:1 0:57.000000 1:-27.128151 2:0.582392
0 qid:1 0:66.000000 1:-35.469261 2:nan
1 qid:2 0:100.000000 1:0.000000 2:0.842665
0 qid:2 0:10.000000 1:0.000000 2:nan
0 qid:3 0:10.000000 1:-21.327538 2:nan
1 qid:3 0:100.000000 1:0.000000 2:0.000000
1 qid:4 0:90.000000 1:0.000000 2:nan
0 qid:4 0:10.000000 1:-21.327538 2:nan
0 qid:5 0:10.000000 1:0.000000 2:nan
1 qid:5 0:100.000000 1:0.000000 2:1.000000
1 qid:6 0:100.000000 1:0.000000 2:nan
0 qid:6 0:0.000000 1:0.000000 2:nan
1 qid:7 0:100.000000 1:0.000000 2:1.000000
0 qid:7 0:10.000000 1:0.000000 2:1.000000
0 qid:8 0:10.000000 1:0.000000 2:nan
1 qid:8 0:95.000000 1:0.000000 2:1.000000
1 qid:9 0:100.000000 1:0.000000 2:nan
0 qid:9 0:10.000000 1:0.000000 2:1.000000
0 qid:9 0:10.000000 1:0.000000 2:1.000000
0 qid:10 0:54.000000 1:-26.621315 2:nan
1 qid:10 0:54.000000 1:0.000000 2:nan
1 qid:11 0:90.000000 1:0.000000 2:nan
0 qid:11 0:10.000000 1:0.000000 2:1.000000
1 qid:12 0:90.000000 1:0.000000 2:1.000000
0 qid:12 0:54.000000 1:-15.469261 2:0.586860
第一列是label;第二列是qid,qid相同的样本属于同一个rank列表;其后的各列都是特征,格式为"特征编号:特征值",特征值为nan表示该特征缺失。
group文件的形式:
2
2
2
2
2
2
2
2
2
3
2
2
2
group文件的每一行表示对应的rank列表里面有几个case,行的顺序与训练样本中qid出现的顺序一致(例如上面qid:9有3条样本,对应group文件第10行的3)。
2. 预测
加载"model.dat"中的模型,按qid分组对待预测样本打分:
import xgboost as xgb
import sys
import numpy as np
from xgboost import DMatrix
import pandas as pd
# Load the ranking model from "model.dat".
bst = xgb.Booster()
bst.load_model("model.dat")

# Parse the prediction input. Each non-blank line is split on tabs; the first
# field is skipped (presumably the label, as in the training file -- confirm)
# and every remaining field is read as "name:value". The first remaining
# value is the qid, the values after it are the features.
datas = []
with open("data/predict/model_in.fix") as fin:
    for line in fin:
        if not line.strip():
            continue  # ignore blank lines instead of producing an empty row
        elems = line.rstrip().split('\t')
        feats = np.array([float(x.split(':')[1]) for x in elems[1:]], dtype=float)
        datas.append(feats)

if not datas:
    raise ValueError("no samples found in data/predict/model_in.fix")

# One column name per feature, in file order. Position 0 of each row is the
# qid, so there is one feature fewer than values per row.
feature_list = ["F_%d" % i for i in range(len(datas[0]) - 1)]

# Build the columns: "qid" from position 0 and F_k from position k + 1.
# Every feature is kept, in the order it appears in the file, and the qid is
# stored only in the "qid" column. This assumes the prediction file uses the
# same feature layout as the training file.
data_dict = {"qid": [row[0] for row in datas]}
for col, name in enumerate(feature_list, start=1):
    data_dict[name] = [row[col] for row in datas]

df = pd.DataFrame(data_dict)
def predict(model, df):
    """Score the rows of *df* with *model*, ignoring the ``qid`` column.

    Args:
        model: Object whose ``predict`` method accepts a ``DMatrix``
            (here the loaded booster).
        df: DataFrame of samples; every column except ``qid`` is passed to
            the model as a feature, in column order.

    Returns:
        Whatever ``model.predict`` returns for the feature matrix.
    """
    features = df.loc[:, ~df.columns.isin(['qid'])]
    # NaN marks a missing feature value. ``np.nan`` is used because the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    return model.predict(DMatrix(np.array(features), missing=np.nan))
# Score each query's samples separately by applying predict() to every
# qid group, then print the per-group results.
per_query = df.groupby('qid')
predictions = per_query.apply(lambda group: predict(bst, group))
print(predictions)
代码地址:https://github.com/liaohit/rank/tree/master/xgb_rank