#ml_algorithms
from linear_regression import *
from logistic_regression import *
import numpy as np
import pandas as pd
# Registry of supported algorithms: each name maps to a two-element list
# [hypothesis function, cost function]; gradient_descend() unpacks an entry
# in exactly that order.
Algorithms = {
    'LINEAR': [linear_hypothesis, linear_cost],
    'LOGISTIC': [logistic_hypothesis, logistic_cost],
}
import pickle
# Save and load a trained decision-tree model.
def storeTree(tree, filename):
    """Serialize ``tree`` to the file ``filename`` using pickle.

    Args:
        tree: The object to save (any picklable object, e.g. the nested
            dict returned by ``createTree``).
        filename: Path of the file to write; an existing file is overwritten.
    """
    # pickle produces bytes, so the file must be opened in binary write mode.
    # (The previous code passed ``tree`` itself as the mode argument, which
    # raised a TypeError before anything was written.)
    with open(filename, 'wb') as f:
        pickle.dump(tree, f)
def loadTree(filename):
    """Load and return a pickled object (e.g. a decision tree) from a file.

    Args:
        filename: Path of a file containing a pickle stream.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that come
    # from a trusted source.
    # The ``with`` block closes the file even if unpickling raises.
    with open(filename, 'rb') as f:
        return pickle.load(f)
def _plog2p(p):
    """Return ``p * log2(p)`` with the limit value 0 used at ``p == 0``."""
    # Where p is 0 the boolean mask adds 1, so log2 sees 1 (-> 0) instead of
    # 0 (-> -inf); the product is then 0 rather than NaN. Other entries are
    # unchanged because adding False adds nothing.
    return p * np.log2(p + (p == 0))


def Entropy(x):
    """Return the binary entropy ``-x*log2(x) - (1-x)*log2(1-x)`` in bits.

    Args:
        x: A probability (scalar or array-like supporting arithmetic).

    Returns:
        The entropy with the same shape as ``x``. At the end points ``x == 0``
        and ``x == 1`` the result is 0 instead of NaN.
    """
    return -(_plog2p(x) + _plog2p(1 - x))
def calcEntropy(df, label='label'):
    """Return the entropy (in bits) of the values in column ``label`` of ``df``.

    Args:
        df: DataFrame containing the column ``label``.
        label: Name of the column whose value distribution is measured.

    Returns:
        The sum of ``-p * log2(p)`` over the relative frequency ``p`` of each
        distinct value counted by ``value_counts()``.
    """
    # Total number of rows.
    n_rows = len(df)
    # Relative frequency of every distinct label value.
    frequencies = df[label].value_counts() * 1.0 / n_rows
    per_class_terms = -frequencies * np.log2(frequencies)
    return sum(per_class_terms)
def createTestData():
    """Build the five-row toy data set used by the decision-tree demo.

    Returns:
        A DataFrame with the feature columns ``'no surfing'`` and
        ``'flippers'`` and the class column ``'label'``.
    """
    rows = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    header = ['no surfing', 'flippers', 'label']
    return pd.DataFrame(rows, columns=header)
def splitDataFrame(df, feature, value):
    """Select the rows where ``df[feature] == value`` and drop that column.

    Args:
        df: Source DataFrame.
        feature: Name of the column to split on.
        value: Value of ``feature`` that selected rows must have.

    Returns:
        The matching rows with every column except ``feature``.
    """
    remaining = df.columns.to_list()
    remaining.remove(feature)
    row_mask = df[feature] == value
    return df.loc[row_mask, remaining]
def chooseBestFeature(df, label='label'):
    """Pick the feature column whose split gives the largest information gain.

    Args:
        df: DataFrame whose columns are the candidate features plus ``label``.
        label: Name of the class-label column.

    Returns:
        The name of the best feature, or ``''`` when no feature has a
        strictly positive information gain (this includes the case where
        ``df`` has no feature columns).
    """
    features = df.columns.to_list()
    features.remove(label)
    # Entropy of the whole sample set, before any split.
    totalEntropy = calcEntropy(df, label)
    bestInfoGain = 0
    bestFeature = ''
    n_rows = float(len(df))
    for f in features:
        # Conditional entropy: entropy of each subset weighted by its size.
        newEntropy = 0
        for value in df[f].unique():
            subs = splitDataFrame(df, f, value)
            prob = len(subs) / n_rows
            # ``label`` must be forwarded; otherwise a non-default label
            # column name would make calcEntropy look up 'label' and fail.
            newEntropy += prob * calcEntropy(subs, label)
        infoGain = totalEntropy - newEntropy
        # Keep the feature with the largest gain seen so far.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = f
    return bestFeature
def majorityCnt(classList):
    """Return the most frequent value in ``classList``.

    Args:
        classList: pandas Series of class labels.

    Returns:
        The label with the highest count.
    """
    # value_counts() is indexed by the distinct labels, so idxmax() yields the
    # label itself. The previous code used the *position* of the maximum
    # inside the counts to index ``classList``, which returned an unrelated
    # element (in practice the first one).
    return classList.value_counts().idxmax()
def createTree(df, label='label'):
    """Recursively build a decision tree from ``df``.

    Args:
        df: DataFrame holding the feature columns and the column ``label``.
        label: Name of the class-label column.

    Returns:
        Either a single class label (leaf) or a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.
    """
    classList = df[label]  # all labels present in the current data
    # Stop condition 1: every remaining sample has the same label.
    if len(classList.unique()) == 1:
        return classList.iloc[0]
    # Stop condition 2: only the label column is left, so no feature can be
    # split on any more; fall back to the majority label.
    if df.shape[1] == 1:
        return majorityCnt(classList)
    bestFeature = chooseBestFeature(df, label)
    # chooseBestFeature returns '' when no feature has positive information
    # gain; splitting further is pointless (and df[''] would raise), so emit
    # a majority-vote leaf instead.
    if bestFeature == '':
        return majorityCnt(classList)
    myTree = {bestFeature: {}}
    # Build one subtree per distinct value of the chosen feature.
    for value in df[bestFeature].unique():
        # Forward ``label`` so non-default label columns survive recursion.
        myTree[bestFeature][value] = createTree(
            splitDataFrame(df, bestFeature, value), label)
    return myTree
def classifier_id3(inputTree, testVec):
    """Classify one sample by walking a decision tree.

    Args:
        inputTree: Nested dict ``{feature: {feature_value: subtree_or_label}}``.
        testVec: Mapping-like object indexed by feature name (e.g. a pandas
            Series or a dict) holding the sample's feature values.

    Returns:
        The class label stored at the reached leaf, or ``None`` when the
        sample's value for a feature has no branch in the tree (previously
        this case raised UnboundLocalError).
    """
    # The single top-level key is the feature tested at this node.
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    sample_value = testVec[firstStr]
    for key, branch in secondDict.items():
        if sample_value == key:
            # A dict is an inner node: keep descending. Anything else is a leaf.
            if isinstance(branch, dict):
                return classifier_id3(branch, testVec)
            return branch
    # No branch matches the sample's value for this feature.
    return None
def mean_normalization(x):
    """Scale each feature to ``(x - mean) / (max - min)`` along axis 0.

    Args:
        x: Array-like object providing ``mean``, ``max`` and ``min`` with an
            ``axis`` argument (e.g. a numpy array).

    Returns:
        The normalized data. A feature whose values are all equal (zero
        range) is mapped to 0 instead of producing NaN from 0/0.
    """
    centered = x - x.mean(axis=0)
    span = x.max(axis=0) - x.min(axis=0)
    # Adding the boolean mask turns a zero range into 1 and leaves every
    # other range untouched, avoiding a division by zero.
    span = span + (span == 0)
    return centered / span
def add_bias(x):
    """Prepend a column of ones to ``x``.

    Args:
        x: Array whose first dimension is the number of rows.

    Returns:
        A new array ``[1 | x]`` with one extra leading column.
    """
    n_rows = x.shape[0]
    bias_column = np.ones((n_rows, 1))
    return np.concatenate((bias_column, x), axis=1)
def gradient_descend(iters, alpha, x, y, algo='LINEAR'):
    """Fit model parameters with batch gradient descent.

    Args:
        iters: Number of update steps.
        alpha: Learning rate.
        x: Feature matrix; ``x.shape[0]`` samples, ``x.shape[1]`` parameters.
        y: Target values, one per sample.
        algo: Either a key of ``Algorithms`` or a ``(hypothesis, cost)`` pair
            of callables invoked as ``hypothesis(T, x)`` and ``cost(T, x, y)``.

    Returns:
        A tuple ``(T, costs, hypothesis)``: the fitted parameter vector, the
        cost recorded before each update, and the hypothesis function used.
    """
    if isinstance(algo, str):
        hypothesis, cost = Algorithms[algo]
    else:
        hypothesis, cost = algo
    costs = []
    T = np.zeros(x.shape[1])
    m = x.shape[0]
    features = np.asarray(x)
    for _ in range(iters):
        costs.append(cost(T, x, y))
        residual = np.asarray(hypothesis(T, x) - y).reshape(-1)
        # One partial derivative per parameter: x^T (h - y) / m. The previous
        # code summed over *all* elements, collapsing the gradient to a single
        # scalar that was applied identically to every parameter.
        gradient = features.T @ residual / m
        T = T - alpha * gradient
    return T, costs, hypothesis
def normal_eq(x, y):
    """Solve the least-squares problem ``x @ theta ~= y`` in closed form.

    Args:
        x: Feature matrix.
        y: Target values.

    Returns:
        The least-squares parameter vector ``theta``.
    """
    # pinv(x) equals inv(x.T @ x) @ x.T whenever x.T @ x is invertible, but
    # it also works when that matrix is singular (e.g. duplicated features),
    # where the explicit inverse raised LinAlgError.
    return np.linalg.pinv(x) @ y
# Standardization (z-score normalization) of feature values.
class standard_scalar:
    """Standardize features to ``(data - mean) / std`` along axis 0.

    The statistics are learned from the training set by ``fit_transform`` and
    reused for other data by ``transform``. Both methods overwrite ``data``
    in place and also return it.
    """

    def __init__(self):
        # ``None`` marks the scaler as not fitted yet.
        self.mean = None
        self.std = None

    def fit_transform(self, data):
        """Learn mean and std from ``data`` and standardize it in place.

        Args:
            data: Training data providing ``mean``/``std`` with an ``axis``
                argument and supporting slice assignment.

        Returns:
            The same ``data`` object, now standardized.
        """
        self.mean = data.mean(axis=0)
        std = data.std(axis=0)
        # A constant feature has std 0; adding the boolean mask replaces that
        # 0 by 1 so the feature becomes 0 instead of NaN.
        self.std = std + (std == 0)
        data[:] = (data - self.mean) / self.std
        return data

    def transform(self, data):
        """Standardize ``data`` in place with the previously learned statistics.

        Args:
            data: Data to standardize (e.g. a test set).

        Returns:
            The same ``data`` object, now standardized.

        Raises:
            AttributeError: If called before ``fit_transform``.
        """
        if self.mean is None or self.std is None:
            raise AttributeError(
                'standard_scalar.transform() called before fit_transform()')
        data[:] = (data - self.mean) / self.std
        return data
if __name__ == '__main__':
    # Demo: train a decision tree on the toy data and classify one sample.
    testVec = pd.Series([1, 1], index=['no surfing', 'flippers'])

    df = createTestData()
    tree = createTree(df)
    print(classifier_id3(tree, testVec))

    # Train again on a fresh copy of the data and classify the same sample.
    df = createTestData()
    trainTree = createTree(df)
    classlabel = classifier_id3(trainTree, testVec)
    print(classlabel)
import numpy as np
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``.

    Args:
        z: Scalar or array; the function is applied element-wise.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator
def logistic_hypothesis(t, x):
    """Return ``sigmoid(x @ t)``, the logistic-regression prediction.

    Args:
        t: Parameter vector.
        x: Feature matrix (or a single feature vector).
    """
    return sigmoid(x @ t)
def train_test_split(features, targets, ratio=0.8, random_state=None):
    """Randomly split samples into a training part and a test part.

    Args:
        features: 2-D array with one sample per row.
        targets: Array with one target per sample, aligned with ``features``.
        ratio: Fraction of the samples placed in the training part; must lie
            in ``[0, 1]``.
        random_state: Optional seed for a reproducible shuffle. With the
            default ``None`` the global numpy random state is used.

    Returns:
        A tuple ``(train_features, train_targets, test_features,
        test_targets)``.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= ratio <= 1:
        raise ValueError('ratio must be between 0 and 1, got %r' % (ratio,))
    m = features.shape[0]
    sep = int(m * ratio)
    if random_state is None:
        indices = np.random.permutation(m)
    else:
        # A private generator keeps the global random state untouched.
        indices = np.random.RandomState(random_state).permutation(m)
    train_indices, test_indices = indices[:sep], indices[sep:]
    return (features[train_indices, :], targets[train_indices],
            features[test_indices, :], targets[test_indices])
'''
import tensorflow.keras as keras
keras.regularizers.l1(0.) # L1-norm regularization
keras.regularizers.l2(0.) # L2-norm regularization
keras.regularizers.l1_l2(l1=0.01, l2=0.01) # combined L1-norm and L2-norm regularization
'''
def logistic_cost(t, x, y, reg_lambda=0.001):
    """Return the L2-regularized logistic-regression cost.

    Computes ``(-y.log(h) - (1-y).log(1-h) + reg_lambda/2 * t.t) / m`` with
    ``h = logistic_hypothesis(t, x)`` and ``m = len(x)``.

    Args:
        t: Parameter vector.
        x: Feature matrix.
        y: Target vector (presumably 0/1 class labels - confirm with callers).
        reg_lambda: Strength of the L2 penalty on ``t``.

    Returns:
        The average regularized cost.
    """
    h = logistic_hypothesis(t, x)
    # Keep predictions strictly inside (0, 1) so log() never sees 0.
    eps = np.finfo(float).eps
    h = np.clip(h, eps, 1 - eps)
    positive_term = (-y) @ np.log(h)
    negative_term = (1 - y) @ np.log(1 - h)
    # The penalty is *added* to the log-loss. The previous code multiplied it
    # into the negative-class term, which removed that term entirely whenever
    # ``t`` was all zeros.
    penalty = (reg_lambda / 2) * (t @ t.T)
    return (positive_term - negative_term + penalty) / len(x)