本文链接：https://blog.csdn.net/wzl1997/article/details/89575597

Factorization Machine

1.训练模型

# coding:UTF-8
'''
Date:20180426
@author: zhilongwang
'''
import numpy as np
from random import normalvariate  # 正态分布


def loadDataSet(data):
    '''导入训练数据
    input:  data(string)训练数据
    output: dataMat(list)特征
            labelMat(list)标签
    '''
    dataMat = []
    labelMat = []
    fr = open(data)  # 打开文件
    for line in fr.readlines():
        lines = line.strip().split("\t")
        lineArr = []

        for i in range(len(lines) - 1):
            lineArr.append(float(lines[i]))
        dataMat.append(lineArr)

        labelMat.append(float(lines[-1]) * 2 - 1)  # 转换成{-1,1}
    fr.close()
    return dataMat, labelMat


def sigmoid(inx):
    return 1.0 / (1 + np.exp(-inx))


def initialize_v(n, k):
    '''初始化交叉项
    input:  n(int)特征的个数
            k(int)FM模型的超参数
    output: v(mat):交叉项的系数权重
    '''
    v = np.mat(np.zeros((n, k)))

    for i in range(n):
        for j in range(k):
            # 利用正态分布生成每一个权重
            v[i, j] = normalvariate(0, 0.2)
    return v


def stocGradAscent(dataMatrix, classLabels, k, max_iter, alpha):
    '''利用随机梯度下降法训练FM模型
    input:  dataMatrix(mat)特征
            classLabels(mat)标签
            k(int)v的维数
            max_iter(int)最大迭代次数
            alpha(float)学习率
    output: w0(float),w(mat),v(mat):权重
    '''
    m, n = np.shape(dataMatrix)
    # 1、初始化参数
    w = np.zeros((n, 1))  # 其中n是特征的个数
    w0 = 0  # 偏置项
    v = initialize_v(n, k)  # 初始化V

    # 2、训练
    for it in range(max_iter):
        for x in range(m):  # 随机优化，对每一个样本而言的
            inter_1 = dataMatrix[x] * v
            inter_2 = np.multiply(dataMatrix[x], dataMatrix[x]) * \
                      np.multiply(v, v)  # multiply对应元素相乘
            # 完成交叉项
            interaction = np.sum(np.multiply(inter_1, inter_1) - inter_2) / 2.
            p = w0 + dataMatrix[x] * w + interaction  # 计算预测的输出
            loss = sigmoid(classLabels[x] * p[0, 0]) - 1

            w0 = w0 - alpha * loss * classLabels[x]
            for i in range(n):
                if dataMatrix[x, i] != 0:
                    w[i, 0] = w[i, 0] - alpha * loss * classLabels[x] * dataMatrix[x, i]

                    for j in range(k):
                        v[i, j] = v[i, j] - alpha * loss * classLabels[x] * \
                                  (dataMatrix[x, i] * inter_1[0, j] - \
                                   v[i, j] * dataMatrix[x, i] * dataMatrix[x, i])

        # 计算损失函数的值
        if it % 1000 == 0:
            print("\t------- iter: ", it, " , cost: ", getCost(getPrediction(np.mat(dataMatrix), w0, w, v), classLabels))

    # 3、返回最终的FM模型的参数
    return w0, w, v


def getCost(predict, classLabels):
    '''计算预测准确性
    input:  predict(list)预测值
            classLabels(list)标签
    output: error(float)计算损失函数的值
    '''
    m = len(predict)
    error = 0.0
    for i in range(m):
        error -= np.log(sigmoid(predict[i] * classLabels[i]))
    return error


def getPrediction(dataMatrix, w0, w, v):
    '''得到预测值
    input:  dataMatrix(mat)特征
            w(int)常数项权重
            w0(int)一次项权重
            v(float)交叉项权重
    output: result(list)预测的结果
    '''
    m = np.shape(dataMatrix)[0]
    result = []
    for x in range(m):
        inter_1 = dataMatrix[x] * v
        inter_2 = np.multiply(dataMatrix[x], dataMatrix[x]) * \
                  np.multiply(v, v)  # multiply对应元素相乘
        # 完成交叉项
        interaction = np.sum(np.multiply(inter_1, inter_1) - inter_2) / 2.
        p = w0 + dataMatrix[x] * w + interaction  # 计算预测的输出
        pre = sigmoid(p[0, 0])
        result.append(pre)
    return result


def getAccuracy(predict, classLabels):
    '''计算预测准确性
    input:  predict(list)预测值
            classLabels(list)标签
    output: float(error) / allItem(float)错误率
    '''
    m = len(predict)
    allItem = 0
    error = 0
    for i in range(m):
        allItem += 1
        if float(predict[i]) < 0.5 and classLabels[i] == 1.0:
            error += 1
        elif float(predict[i]) >= 0.5 and classLabels[i] == -1.0:
            error += 1
        else:
            continue
    return float(error) / allItem


def save_model(file_name, w0, w, v):
    '''保存训练好的FM模型
    input:  file_name(string):保存的文件名
            w0(float):偏置项
            w(mat):一次项的权重
            v(mat):交叉项的权重
    '''
    f = open(file_name, "w")
    # 1、保存w0
    f.write(str(w0) + "\n")
    # 2、保存一次项的权重
    w_array = []
    m = np.shape(w)[0]
    for i in range(m):
        w_array.append(str(w[i, 0]))
    f.write("\t".join(w_array) + "\n")
    # 3、保存交叉项的权重
    m1, n1 = np.shape(v)
    for i in range(m1):
        v_tmp = []
        for j in range(n1):
            v_tmp.append(str(v[i, j]))
        f.write("\t".join(v_tmp) + "\n")
    f.close()


if __name__ == "__main__":
    # 1、导入训练数据
    print("---------- 1.load data ---------")
    dataTrain, labelTrain = loadDataSet("data.txt")
    print("---------- 2.learning ---------")
    # 2、利用随机梯度训练FM模型
    w0, w, v = stocGradAscent(np.mat(dataTrain), labelTrain, 3, 10000, 0.01)
    predict_result = getPrediction(np.mat(dataTrain), w0, w, v)  # 得到训练的准确性
    print("----------training accuracy: %f" % (1 - getAccuracy(predict_result, labelTrain)))
    print("---------- 3.save result ---------")
    # 3、保存训练好的FM模型
    save_model("weights", w0, w, v)

2.测试模型

# coding:UTF-8

import numpy as np
from FM_train import getPrediction
import matplotlib.pyplot as plt



def loadDataSet(data):
    '''导入测试数据集
    input:  data(string)测试数据
    output: dataMat(list)特征
    '''
    dataMat = []
    fr = open(data)  # 打开文件
    for line in fr.readlines():
        lines = line.strip().split("\t")
        lineArr = []

        for i in range(len(lines)):
            lineArr.append(float(lines[i]))
        dataMat.append(lineArr)

    fr.close()
    return np.mat(dataMat)


def loadModel(model_file):
    '''导入FM模型
    input:  model_file(string)FM模型
    output: w0, np.mat(w).T, np.mat(v)FM模型的参数
    '''
    f = open(model_file)
    line_index = 0
    w0 = 0.0
    w = []
    v = []
    for line in f.readlines():
        lines = line.strip().split("\t")
        if line_index == 0:  # w0
            w0 = float(lines[0].strip())
        elif line_index == 1:  # w
            for x in lines:
                w.append(float(x.strip()))
        else:
            v_tmp = []
            for x in lines:
                v_tmp.append(float(x.strip()))
            v.append(v_tmp)
        line_index += 1
    f.close()
    return w0, np.mat(w).T, np.mat(v)


def save_result(file_name, result):
    '''保存最终的预测结果
    input:  file_name(string)需要保存的文件名
            result(mat):对测试数据的预测结果
    '''
    f = open(file_name, "w")
    f.write("\n".join(str(x) for x in result))
    f.close()


if __name__ == "__main__":
    # 1、导入测试数据
    dataTest = loadDataSet("test_data.txt")
    # 2、导入FM模型
    w0, w, v = loadModel("weights")
    # 3、预测
    result = getPrediction(dataTest, w0, w, v)
    # 4、保存最终的预测结果
    save_result("predict_result", result)

    dataTest = dataTest.T
    plt.plot(dataTest[0][0, 0:100], dataTest[1][0, 0:100], 'g-s')
    plt.plot(dataTest[0][0, 100:200], dataTest[1][0, 100:200], 'r-s')
    plt.show()