机器学习-逻辑回归算法推理+代码复现

最新推荐文章于 2025-01-09 21:22:26 发布

LiBin1103

最新推荐文章于 2025-01-09 21:22:26 发布

阅读量1.9k

点赞数 5

分类专栏：学习心得

本文链接：https://blog.csdn.net/Linux_bin/article/details/106551232

版权

学习心得专栏收录该内容

14 篇文章

订阅专栏

1、算法推理：

这里算法推理和代码复现原型参考：https://blog.csdn.net/zouxy09/article/details/20319673

2、算法应用背景-课题设计：

企业财务风险预警是企业风险预警系统的一个重要组成部分，它能有效的预知部分财务风险。本课题将风险公司记为ST，非风险公司记为非ST，ST判断标准如下：

1.）连续两年年报显示净利润为负值

2）净资产收益率、总资产净利润率为负值。

其影响特征变量如下：

注: 现金比率= 货币资金÷ 流动负债;

盈利现金比率= 经营活动的现金净流量÷ 净利润;

主营业务鲜明程度= 主营业务利润÷ |净利润|;

请利用2018年的财务指标相关数据，构建基于BP神经网络的财务风险预警模型。

3、代码复现

3.1 train & predict

from numpy import *
import matplotlib.pyplot as plt
import time
import numpy as np
from data import Dataset
import pickle
# calculate the sigmoid function
#算法推理来自于：https://blog.csdn.net/zouxy09/article/details/20319673
#commit code to https://github.com/Linby1103/Pre-warning.git

class LR(object):
    def __init__(self,train_data,label_):

        self.traindata=train_data
        self.label=label_

    def sigmoid(self,z):
        return 1.0 / (1 + exp(-z))

    def train(self,opts,weight_path="weights.pkl"):
        # train a logistic regression model using some optional optimize algorithm
        # traindata is a mat datatype, each row stands for one sample
        #label is mat datatype too, each row is the corresponding label
        #opts is optimize option include step and maximum number of iterations
        # calculate training time
        startTime = time.time()

        numSamples, numFeatures = shape(self.traindata)
        alpha = opts['alpha'];
        maxIter = opts['maxIter']
        weights = ones((numFeatures, 1))


        # optimize through gradient descent algorilthm
        for k in range(maxIter):
            if opts['optimizeType'] == 'gradDescent':  # gradient descent algorilthm
                output = self.sigmoid(self.traindata * weights)
                error = self.label - output  
                weights = weights + alpha * self.traindata.transpose() * error  # (y-err)*x  BP
            elif opts['optimizeType'] == 'stocGradDescent':  # stochastic gradient descent
                for i in range(numSamples):
                    output = self.sigmoid(self.traindata[i, :] * weights)#注意这traindata是转化为mat类型的数据，所以这里用* 和用dot做矩阵的乘法是一样的
                    error = self.label[i, 0] - output
                    weights = weights + alpha * self.traindata[i, :].transpose() * error
            elif opts['optimizeType'] == 'smoothStocGradDescent':  # smooth stochastic gradient descent
                # randomly select samples to optimize for reducing cycle fluctuations
                dataIndex = range(numSamples)
                for i in range(numSamples):
                    alpha = 4.0 / (1.0 + k + i) + 0.01
                    randIndex = int(random.uniform(0, len(dataIndex)))
                    output = self.sigmoid(self.traindata[randIndex, :] * weights)
                    error = self.label[randIndex, 0] - output
                    weights = weights + alpha * self.traindata[randIndex, :].transpose() * error
                    del (dataIndex[randIndex])  # during one interation, delete the optimized sample
            else:
                raise NameError('Not support optimize method type!')

        print('training finished!  %fs!' % (time.time() - startTime))

        try:
            pickf = open(weight_path, 'wb')
            print("Save weight to {}".format(weight_path))
            data = {"weight": weights}
            pickle.dump(data, pickf)
        except Exception as e:
            print("Write weight to {} fail! {}".format(pickf,e))
            pickf.close()
        finally:
            pickf.close()

    # test  trained Logistic Regression model given test set
    def predict(self,weights, test_x, test_y):
        numSamples, numFeatures = shape(test_x)
        matchCount = 0
        result=[]
        for i in range(numSamples):
            prob = 1 if self.sigmoid(test_x[i, :] * weights)[0, 0] > 0.5 else 0
            result.append(prob)
            if prob == bool(test_y[i, 0]):
                matchCount += 1
        accuracy = float(matchCount) / numSamples
        print("accuracy={}".format(accuracy))
        return result
    def load_weight(self,path):
        weight={}
        try:
             weight = pickle.load(open(path,'rb'))["weight"]
             print(weight.shape)
        except Exception as e:
            print("Load weight faild! {}".format(e))
        finally:
            return weight



def loadData():
    train_x = []
    train_y = []
    fileIn = open('./data/testdata.txt')
    for line in fileIn.readlines():
        lineArr = line.strip().split()
        train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
        train_y.append(float(lineArr[2]))
    return mat(train_x), mat(train_y).transpose()

if __name__=="__main__":
    STATUS = ['非ST','ST']
    save_model="weights.pkl"

    train, label = Dataset.GetDatafromDict()
    trainarrayx=mat(train)
    labelarray=mat(label).transpose()
    logistreg = LR(trainarrayx,labelarray)
    # print(trainarrayx.shape,labelarray.shape)
    opts={"alpha":0.001,"maxIter":200,"optimizeType":"gradDescent"}
    logistreg.train(opts,save_model)
    weights=logistreg.load_weight(save_model)
    res=logistreg.predict(weights,trainarrayx,labelarray)

    for i in range(len(res)):
        print("预测结果 :{}---原始数据:{}".format(STATUS[res[i]], STATUS[label[i]]))

3.2 Create dataset-数据来源于excel

import xlrd
import os
import numpy as np


class ExcelReade(object):
    def __init__(self, excel_name, sheet_name):
        """
        # 我把excel放在工程包的当前文件夹中：
        # 1.需要先获取到工程文件的地址
        # 2.再找到excel的文件地址（比写死的绝对路径灵活）

            os.path.relpath(__file__)
            1.根据系统获取绝对路径
            2.会根据电脑系统自动匹配路径：mac路径用/,windows路径用\
            3.直接使用__file__方法是不会自动适配环境的
        """

        self.excel_path = excel_name
        # 打开指定的excel文件
        self.date = xlrd.open_workbook(self.excel_path)
        # 找到指定的sheet页
        self.table = self.date.sheet_by_name(sheet_name)
        self.rows = self.table.nrows  # 获取总行数
        self.cols = self.table.ncols  # 获取总列数

    def data_dict(self):
        if self.rows <= 1:
            print("总行数小于等于1，路径：", end='')
            print(self.excel_path)
            return False
        else:
            # 将列表的第一行设置为字典的key值
            keys = self.table.row_values(0)
            # 定义一个数组
            data = []
            # 从第二行开始读取数据，循环rows（总行数）-1次
            for i in range(1, self.rows):
                # 循环内定义一个字典，每次循环都会清空
                dict = {}
                # 从第一列开始读取数据，循环cols（总列数）次
                for j in range(0, self.cols):
                    # 将value值关联同一列的key值
                    dict[keys[j]] = self.table.row_values(i)[j] if self.table.row_values(i)[j] != "NULL" else "0"
                # 将关联后的字典放到数组里
                data.append(dict)


            return data
#Stkcd	净利润2017	净利润2018	净资产收益率	总资产净利润率	x1	x2	x3	x4	x5	x6	x7	x8	x9	x10	x11	x12	x13	x14

def GetDatafromDict():
    start = ExcelReade('./data/dataset.xlsx', 'Sheet1')
    data = start.data_dict()

    train=[]
    label=[]

    for i in range(len(data)):
        array=[]
        array.append(float(data[i]['净利润2017']))
        array.append(float(data[i]['净利润2018']))
        array.append(float(data[i]['净资产收益率']))
        array.append(float(data[i]['总资产净利润率']))
        array.append(float(data[i]['x1']))
        array.append(float(data[i]['x2']))
        array.append(float(data[i]['x3']))
        array.append(float(data[i]['x4']))
        array.append(float(data[i]['x5']))
        array.append(float(data[i]['x6']))
        array.append(float(data[i]['x7']))
        array.append(float(data[i]['x8']))
        array.append(float(data[i]['x9']))
        array.append(float(data[i]['x10']))
        array.append(float(data[i]['x11']))
        array.append(float(data[i]['x12']))
        array.append(float(data[i]['x13']))
        array.append(float(data[i]['x14']))
        train.append(array)
        if float(data[i]['净利润2017'])*float(data[i]['净利润2018'])<0:
            label.append(0)
        else:
            label.append(1)
    return np.array(train) ,np.array(label)



if __name__ == '__main__':
    start = ExcelReade('C:/Users/91324/Desktop/datasset.xlsx','Sheet1')
    data = start.data_dict()
    GetDatafromDict(data)

数据集，代码上传至：https://github.com/Linby1103/Pre-warning.git