用于回归分析的自定义库

最新推荐文章于 2024-10-09 14:34:06 发布

AlexCookie

最新推荐文章于 2024-10-09 14:34:06 发布

阅读量63

点赞数 2

文章标签：自然语言处理

本文链接：https://blog.csdn.net/qq_47991812/article/details/116718344

版权

#ml_algorithms
from linear_regression import *
from  logistic_regression import *
import numpy as np
import pandas as pd
# Dispatch table: algorithm name -> [hypothesis function, cost function].
# gradient_descend() unpacks each entry in exactly this order.
Algorithms = dict(
    LINEAR=[linear_hypothesis, linear_cost],
    LOGISTIC=[logistic_hypothesis, logistic_cost],
)

import pickle
#保存和读取训练好的决策树模型
def storeTree(tree, filename):
    """Serialize a trained decision tree to disk with pickle.

    Args:
        tree: The object to save (e.g. the nested dict built by createTree).
        filename: Path of the file to write; an existing file is overwritten.
    """
    # pickle needs a binary, writable handle. The previous code passed the
    # tree itself as the open() mode, which fails before anything is written.
    with open(filename, 'wb') as f:
        pickle.dump(tree, f)

def loadTree(filename):
    """Load a previously pickled decision tree from disk.

    Args:
        filename: Path of a file containing a pickled object.

    Returns:
        The unpickled object.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open files
    # that come from a trusted source.
    # The context manager closes the handle even if unpickling raises.
    with open(filename, 'rb') as f:
        return pickle.load(f)

def Entropy(x):
    """Binary entropy in bits: -x*log2(x) - (1-x)*log2(1-x).

    Args:
        x: Probability (scalar or array-like supporting arithmetic).

    Returns:
        The entropy of a two-outcome distribution with probabilities
        x and 1-x. By the usual convention 0*log2(0) is taken as 0, so
        the result is 0 at x == 0 and x == 1 instead of NaN.
    """
    p = x
    q = 1 - x
    # Replace exact zeros inside the logarithm with 1 (log2(1) == 0); the
    # zero factor in front then yields a clean 0 instead of 0 * -inf = NaN.
    # Values outside [0, 1] are left alone and still produce NaN.
    safe_p = np.where(p != 0, p, 1)
    safe_q = np.where(q != 0, q, 1)
    return (-p) * np.log2(safe_p) - q * np.log2(safe_q)

def calcEntropy(df, label='label'):
    """Return the Shannon entropy (in bits) of one column of a DataFrame.

    Args:
        df: DataFrame holding the samples.
        label: Name of the column whose value distribution is measured.

    Returns:
        Sum over distinct values of -p * log2(p), where p is the value's
        count divided by the number of rows in df.
    """
    total_rows = len(df)
    # Relative frequency of every distinct value in the column.
    shares = df[label].value_counts() / float(total_rows)
    terms = -shares * np.log2(shares)
    return sum(terms)

def createTestData():
    """Build the five-row toy dataset used to exercise the decision tree.

    Returns:
        DataFrame with two binary feature columns ('no surfing',
        'flippers') and a 'label' column holding 'yes' / 'no'.
    """
    header = ['no surfing', 'flippers', 'label']
    rows = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    return pd.DataFrame(rows, columns=header)

def splitDataFrame(df, feature, value):
    """Select the rows where `feature` equals `value`, dropping that column.

    Args:
        df: Source DataFrame.
        feature: Name of the column to filter on; it is removed from the result.
        value: Value the column must equal for a row to be kept.

    Returns:
        The matching rows with every column except `feature`.
    """
    remaining = list(df.columns)
    remaining.remove(feature)
    matches = df[feature] == value
    return df.loc[matches, remaining]

def chooseBestFeature(df, label='label'):
    """Pick the feature column with the largest information gain (ID3).

    Args:
        df: DataFrame containing the feature columns and the label column.
        label: Name of the label column.

    Returns:
        The name of the feature whose split yields the highest information
        gain. If several features tie, the first one in column order wins.
        Returns '' only when df has no feature columns at all.
    """
    features = df.columns.to_list()
    features.remove(label)

    # Entropy of the label column over all samples.
    totalEntropy = calcEntropy(df, label)
    sampleCount = float(len(df))

    # Start below any reachable gain so that a feature is still chosen when
    # every split has zero gain; returning '' in that case would make the
    # caller index a non-existent column.
    bestInfoGain = float('-inf')
    bestFeature = ''
    for f in features:
        newEntropy = 0
        for value in df[f].unique():
            subs = splitDataFrame(df, f, value)
            prob = len(subs) / sampleCount  # weight of this subset
            # Conditional entropy; the label name must be forwarded so
            # that a non-default label column is honoured.
            newEntropy += prob * calcEntropy(subs, label)
        infoGain = totalEntropy - newEntropy
        if infoGain > bestInfoGain:  # track the best gain and its feature
            bestInfoGain = infoGain
            bestFeature = f
    return bestFeature

def majorityCnt(classList):
    """Return the most frequent label in `classList`.

    Args:
        classList: Series (or other 1-D sequence) of class labels.

    Returns:
        The label with the highest count.
    """
    # value_counts() is indexed by label, so idxmax() yields the label itself.
    # The previous code used the position of the largest count as a positional
    # index into the raw label list, which returned an unrelated element.
    return pd.Series(classList).value_counts().idxmax()

def createTree(df, label='label'):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        df: DataFrame with feature columns plus the label column.
        label: Name of the label column.

    Returns:
        Either a single class label (leaf) or a dict of the form
        {feature_name: {feature_value: subtree_or_label, ...}}.
    """
    classList = df[label]  # all labels in the current subset

    # Stop condition 1: the subset is pure (a single class remains).
    if len(classList.unique()) == 1:
        return classList.iloc[0]

    # Stop condition 2: only the label column is left, so no feature can be
    # split on any more; fall back to the most common label.
    if df.shape[1] == 1:
        return majorityCnt(classList)

    bestFeature = chooseBestFeature(df, label)
    # Guard against a selector that could not name a usable feature
    # (e.g. it returned ''); indexing df with it would raise KeyError.
    if bestFeature == label or bestFeature not in df.columns:
        return majorityCnt(classList)

    myTree = {bestFeature: {}}

    # One branch per distinct value of the chosen feature. The label name is
    # forwarded so that non-default label columns work at every depth.
    for value in df[bestFeature].unique():
        subset = splitDataFrame(df, bestFeature, value)
        myTree[bestFeature][value] = createTree(subset, label)
    return myTree

def classifier_id3(inputTree, testVec):
    """Classify one sample by walking a nested-dict decision tree.

    Args:
        inputTree: Tree of the form {feature: {value: subtree_or_label}}.
        testVec: Mapping-like object (dict, pandas Series, ...) indexed by
            feature name.

    Returns:
        The label stored at the leaf that is reached, or None when the
        sample's value for a tested feature has no branch in the tree.
    """
    firstStr = next(iter(inputTree))  # feature tested at this node
    secondDict = inputTree[firstStr]
    observed = testVec[firstStr]
    for key, subtree in secondDict.items():
        if observed == key:
            if isinstance(subtree, dict):
                return classifier_id3(subtree, testVec)
            return subtree
    # No branch matched: previously this fell through to an unbound local
    # and raised UnboundLocalError.
    return None


def mean_normalization(x):
    """Centre each column on its mean and scale it by its value range.

    Args:
        x: Array-like exposing mean/max/min with an `axis` argument.

    Returns:
        (x - column mean) / (column max - column min), computed along axis 0.
    """
    centred = x - x.mean(axis=0)
    spread = x.max(axis=0) - x.min(axis=0)
    return centred / spread

def add_bias(x):
    """Prepend a column of ones (the bias / intercept term) to `x`.

    Args:
        x: Array whose first dimension is the number of samples.

    Returns:
        A new array with a leading all-ones column followed by the columns of x.
    """
    bias_column = np.ones((x.shape[0], 1))
    return np.hstack([bias_column, x])

def gradient_descend(iters, alpha, x, y, algo='LINEAR'):
    """Fit parameters with batch gradient descent.

    Args:
        iters: Number of update steps to run.
        alpha: Learning rate.
        x: Design matrix, one row per sample and one column per parameter.
        y: Target values, one per sample.
        algo: Key into `Algorithms` selecting the hypothesis/cost pair.

    Returns:
        Tuple (T, costs, hypothesis): the fitted parameter vector, the cost
        recorded before each update, and the hypothesis function used.
    """
    hypothesis, cost = Algorithms[algo]
    costs = []
    T = np.zeros((x.shape[1]))
    m = x.shape[0]
    for _ in range(iters):
        costs.append(cost(T, x, y))
        residual = (hypothesis(T, x) - y).reshape(-1, 1)
        # Sum over samples only (axis=0) so each parameter gets its own
        # partial derivative. Summing over everything collapsed the gradient
        # into a single scalar that was applied to all parameters alike.
        gradient = np.sum(residual * x, axis=0) / m
        T = T - alpha * gradient
    return T, costs, hypothesis

def normal_eq(x, y):
    """Solve least squares in closed form via the normal equation.

    Args:
        x: Design matrix, one row per sample.
        y: Target values.

    Returns:
        Parameter vector theta = pinv(x^T x) x^T y.
    """
    # The pseudo-inverse equals the ordinary inverse when x^T x is invertible
    # and still gives the minimum-norm solution when it is singular
    # (e.g. duplicated or collinear features), where inv() would raise.
    return np.linalg.pinv(x.T @ x) @ x.T @ y

#特征值的标准归一化处理
class standard_scalar:
    """Standardize features to zero mean and unit standard deviation.

    The statistics are learned from the training data by fit_transform()
    and re-used by transform() on other data. Both methods modify their
    argument in place and also return it.
    """

    def __init__(self):
        # Was misspelled `__int__`, so it never ran as a constructor.
        # None marks the scaler as not fitted yet.
        self.mean = None
        self.std = None

    def fit_transform(self, data):
        """Learn per-column mean/std from `data` and standardize it in place.

        Args:
            data: Training data exposing mean/std with an `axis` argument
                and supporting slice assignment.

        Returns:
            The same `data` object, now standardized.
        """
        self.mean = data.mean(axis=0)
        self.std = data.std(axis=0)
        data[::] = (data - self.mean) / self.std
        return data

    def transform(self, data):
        """Standardize `data` in place with the statistics learned earlier.

        Args:
            data: Data with the same column layout as the training data.

        Returns:
            The same `data` object, now standardized.

        Raises:
            AttributeError: If fit_transform() has not been called yet.
        """
        if self.mean is None or self.std is None:
            raise AttributeError(
                'standard_scalar is not fitted; call fit_transform() first')
        data[::] = (data - self.mean) / self.std
        return data

if __name__ == '__main__':
    # Smoke test: build the ID3 tree from the toy dataset twice and print
    # the predicted label for one sample each time.
    testVec = pd.Series([1, 1], index=['no surfing', 'flippers'])

    # First pass: classify with a freshly built tree.
    df = createTestData()
    tree = createTree(df)
    first_prediction = classifier_id3(createTree(df), testVec)
    print(first_prediction)

    # Second pass: rebuild the data and the tree, then classify again.
    df = createTestData()
    storelabels = df['label']  # keep a reference to the label column
    trainTree = createTree(df)
    classlabel = classifier_id3(trainTree, testVec)
    print(classlabel)

import numpy as np

def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied element-wise.

    Args:
        z: Scalar or array.

    Returns:
        Value(s) in the interval (0, 1) with the same shape as z.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def logistic_hypothesis(t, x):
    """Logistic-regression hypothesis: sigmoid of the linear score x @ t.

    Args:
        t: Parameter vector.
        x: Design matrix (or a single feature row).

    Returns:
        sigmoid(x @ t).
    """
    return sigmoid(x @ t)
def train_test_split(features, targets, ratio=0.8):
    """Randomly split samples into a training part and a test part.

    Args:
        features: 2-D array, one row per sample.
        targets: Array of targets aligned with the rows of `features`.
        ratio: Fraction of samples assigned to the training part.

    Returns:
        Tuple (train_features, train_targets, test_features, test_targets).
    """
    n_samples = features.shape[0]
    cut = int(n_samples * ratio)
    # One random ordering of the row indices; the first `cut` go to training.
    shuffled = np.random.permutation(n_samples)
    train_idx, test_idx = shuffled[:cut], shuffled[cut:]
    return (features[train_idx, :], targets[train_idx],
            features[test_idx, :], targets[test_idx])
'''
import tensorflow.keras as keras
keras.regularizers.l1(0.)  # L1范式正则
keras.regularizers.l2(0.)  # L2范式正则
keras.regularizers.l1_l2(l1=0.01, l2=0.01)  # L1范式，L2范式同时正则
'''

def logistic_cost(t, x, y, reg_lambda=0.001):
    """L2-regularized cross-entropy cost for logistic regression.

    Args:
        t: Parameter vector.
        x: Design matrix, one row per sample.
        y: Binary targets (0 or 1), one per sample.
        reg_lambda: L2 regularization strength (default 0.001, the value
            that was previously hard-coded).

    Returns:
        (-y.log(h) - (1-y).log(1-h) + reg_lambda/2 * t.t) / number of samples,
        where h = logistic_hypothesis(t, x).
    """
    h = logistic_hypothesis(t, x)
    positive_term = (-y) @ np.log(h)
    negative_term = (1 - y) @ np.log(1 - h)
    # The penalty is ADDED to the data term. It used to be multiplied into
    # negative_term, which wiped that term out whenever t was all zeros and
    # distorted it otherwise.
    penalty = (reg_lambda / 2) * (t @ t.T)
    return (positive_term - negative_term + penalty) / len(x)