# A piece of homework code:
# random forest regression.
# (The original poster describes themselves as a beginner.)
from sklearn import datasets
import sklearn.ensemble
import numpy as np
from typing import Union
import math
from sklearn.model_selection import train_test_split
# Load the diabetes dataset through the imported `datasets` module.
# NOTE: this assignment rebinds the name `datasets` from that module to the
# loaded dataset object, so the module is no longer reachable by this name.
datasets = datasets.load_diabetes()
# "data" holds the inputs and "target" holds the values to predict.
X, y = datasets["data"], datasets["target"]
class MetaLearner(object):
    """Single regression tree grown by greedy variance reduction.

    Training data is handled internally as one 2-D array ``x_y`` whose last
    column is the target and whose other columns are the features.

    A fitted tree is either a leaf (a plain number, the predicted value) or a
    one-entry dict ``{(attr, thre): {"<": subtree, ">=": subtree}}``: samples
    with ``x[attr] < thre`` follow ``"<"``, all others follow ``">="``.
    """

    def __init__(self,
                 min_samples: int = 5,
                 min_gain: float = 0,
                 max_depth: int = 20,
                 max_leaves: Union[int, None] = None,
                 verbose: bool = False,
                 ):
        """Store the growth limits.

        Args:
            min_samples: a node holding this many rows or fewer becomes a leaf.
            min_gain: stored on the instance; tree growth does not currently
                enforce it.
            max_depth: nodes at this depth (root is 0) become leaves.
            max_leaves: stored on the instance; tree growth does not currently
                enforce it.
            verbose: when true, print the recursion depth and the reason a
                branch stopped growing while fitting.
        """
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.min_gain = min_gain
        self.max_leaves = max_leaves
        self.verbose = verbose
        self.node = None  # root of the fitted tree, set by fit()

    def cal_val(self, x_y):
        """Return the population variance of the target column (0 if empty)."""
        targets = x_y[:, -1]
        if len(targets) == 0:
            return 0.0
        return np.var(targets)

    def find_best_thre(self, x_y, attr):
        """Find the threshold on column ``attr`` that minimises child variance.

        Every distinct value of the column is tried as a threshold; rows with
        a value strictly below it form one child and the rest form the other.

        Returns:
            ``(threshold, gain)`` where ``gain`` is the parent variance minus
            the size-weighted child variance of the best threshold. If no
            threshold yields a comparable score (e.g. NaN targets) the result
            is ``(None, -inf)``.

        Raises:
            ValueError: if ``x_y`` has no rows.
        """
        total = len(x_y)
        if total == 0:
            raise ValueError("find_best_thre() requires at least one sample")

        parent_val = self.cal_val(x_y)
        column = x_y[:, attr]
        # Repeated values give identical splits, so try each distinct value
        # once, in order of first appearance (keeps "first best wins" ties).
        _, first_seen = np.unique(column, return_index=True)

        best_val = math.inf
        best_thre = None
        for thre in column[np.sort(first_seen)]:
            below = column < thre
            n_below = int(np.count_nonzero(below))
            split_val = (
                (n_below / total) * self.cal_val(x_y[below])
                + ((total - n_below) / total) * self.cal_val(x_y[~below])
            )
            if split_val < best_val:
                best_val = split_val
                best_thre = thre

        if best_thre is None:
            return None, -math.inf
        # The gain must be measured against the best split, not the last one
        # that happened to be evaluated.
        return best_thre, parent_val - best_val

    def find_ave(self, x_y):
        """Return the mean of the target column.

        Raises:
            ValueError: if ``x_y`` has no rows.
        """
        targets = x_y[:, -1]
        if len(targets) == 0:
            raise ValueError("find_ave() requires at least one sample")
        return targets.mean()

    def fit(self, X: np.ndarray, y: np.ndarray) -> Union[dict, float]:
        """Grow the tree on ``X`` / ``y`` and return its root.

        The root is also stored in ``self.node`` for later use by predict().

        Raises:
            ValueError: if ``X`` is not 2-D, is empty, or its row count does
                not match ``y``.
        """
        X = np.asarray(X, dtype=float)
        targets = np.asarray(y, dtype=float).reshape(-1, 1)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array")
        if len(X) == 0:
            raise ValueError("cannot fit on an empty data set")
        if len(targets) != len(X):
            raise ValueError("X and y must contain the same number of samples")

        x_y = np.hstack((X, targets))
        self.node = self.tree(x_y, list(range(X.shape[1])), 0)
        return self.node

    def tree(self, x_y, attr_list, depth):
        """Recursively build the subtree for the rows in ``x_y``.

        Args:
            x_y: rows reaching this node, target in the last column.
            attr_list: column indices that may be used for splitting.
            depth: depth of this node (root is 0).

        Returns:
            A leaf value or a one-entry split dict (see the class docstring).

        Raises:
            ValueError: if ``x_y`` has no rows.
        """
        if len(x_y) == 0:
            raise ValueError("tree() requires at least one sample")
        if self.verbose:
            print(depth)

        targets = x_y[:, -1]
        if depth >= self.max_depth:
            if self.verbose:
                print("shut down: depth")
            return self.find_ave(x_y)
        if np.all(targets == targets[0]):
            # Pure node: every row carries the same target value.
            return targets[0]
        if len(x_y) <= self.min_samples:
            if self.verbose:
                print("shut down: too little")
            return self.find_ave(x_y)

        best_gain = -math.inf
        best_attr = None
        best_thre = None
        for attr in attr_list:
            thre, gain = self.find_best_thre(x_y, attr)
            if gain > best_gain:
                best_gain, best_thre, best_attr = gain, thre, attr
        if best_attr is None:
            # No attribute offered a usable split.
            return self.find_ave(x_y)

        goes_left = x_y[:, best_attr] < best_thre
        left_rows = x_y[goes_left]
        right_rows = x_y[~goes_left]
        if len(left_rows) == 0 or len(right_rows) == 0:
            # A split that keeps every row on one side makes no progress;
            # recursing on the same rows would only end at max_depth with this
            # same mean, so return it directly.
            return self.find_ave(x_y)

        left = self.tree(left_rows, attr_list, depth + 1)
        right = self.tree(right_rows, attr_list, depth + 1)

        # Identical children make the split pointless: collapse it.
        if isinstance(left, dict) == isinstance(right, dict) and left == right:
            return left
        return {(best_attr, best_thre): {"<": left, ">=": right}}

    def predictone(self, node, x):
        """Walk ``node`` for one sample ``x`` and return the leaf value."""
        while isinstance(node, dict):
            (attr, thre), branches = next(iter(node.items()))
            # Must mirror the strict "<" used when the tree was grown.
            node = branches["<"] if x[attr] < thre else branches[">="]
        return node

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return one prediction per row of ``X``.

        Raises:
            RuntimeError: if fit() has not been called yet.
        """
        root = getattr(self, "node", None)
        if root is None:
            raise RuntimeError("fit() must be called before predict()")
        return np.array([self.predictone(root, row) for row in X])
class trees(MetaLearner):
    """Ensemble that averages several MetaLearner trees.

    One tree is trained per training split; predictions of all trees are
    averaged on one randomly chosen test split.
    """

    def __init__(self, X_train_list, y_train_list, X_test_list, y_test_list):
        """Store the parallel lists of train/test splits.

        Index ``i`` of each list is expected to describe the same split.
        """
        super().__init__()
        self.X_train_list = X_train_list
        self.y_train_list = y_train_list
        self.X_test_list = X_test_list
        self.y_test_list = y_test_list

    def vote(self, min_samples, max_depth):
        """Train one tree per split and score the averaged prediction.

        Args:
            min_samples: forwarded to every MetaLearner as ``min_samples``.
            max_depth: forwarded to every MetaLearner as ``max_depth``.

        Returns:
            ``(pre, ac)``: ``pre`` is the list of averaged predictions for the
            randomly chosen test split and ``ac`` is their mean squared error
            against that split's targets.

        Raises:
            ValueError: if there are no splits or the four lists differ in
                length.
        """
        num = len(self.X_train_list)
        if num == 0:
            raise ValueError("vote() needs at least one train/test split")
        if not (len(self.y_train_list) == len(self.X_test_list)
                == len(self.y_test_list) == num):
            raise ValueError("train/test split lists must have equal length")

        learners = []
        for X_train, y_train in zip(self.X_train_list, self.y_train_list):
            # Keyword arguments: passed positionally, max_depth would land in
            # MetaLearner's second parameter (min_gain) instead.
            learner = MetaLearner(min_samples=min_samples, max_depth=max_depth)
            learner.fit(X_train, y_train)
            learners.append(learner)

        # Evaluate on one test split picked at random from this instance's
        # own splits (not from a module-level variable).
        flag = np.random.randint(0, num)
        X_test = self.X_test_list[flag]
        y_test = np.asarray(self.y_test_list[flag])

        per_tree = np.array([learner.predict(X_test) for learner in learners])
        averaged = per_tree.mean(axis=0)
        pre = list(averaged)
        ac = np.mean((averaged - y_test) ** 2)
        return pre, ac
# Draw the random_state values for the splits from a shuffled 0..49 range.
seed_list = list(range(50))
np.random.shuffle(seed_list)

# Build 10 train/test splits of (X, y), one per seed, kept in parallel lists.
X_train_list, X_test_list = [], []
y_train_list, y_test_list = [], []
for seed in seed_list[:10]:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=seed
    )
    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

# Train the ensemble and print its predictions followed by the error score.
m = trees(X_train_list, y_train_list, X_test_list, y_test_list)
pre, ac = m.vote(min_samples=5, max_depth=20)
print(pre)
print(ac)