调库(随机森林、决策树分类鸢尾花)
from sklearn.datasets import load_iris
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt, warnings
warnings.filterwarnings('ignore')
# Library-based demo: fit a CART decision tree and a random forest on the
# iris data set, then report micro-averaged precision on a held-out split.
features, labels = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.33, random_state=42
)

# Single CART decision tree.
tree_clf = DecisionTreeClassifier().fit(X_train, y_train)
tree_pred = tree_clf.predict(X_test)
print('cart决策树:\n', precision_score(y_test, tree_pred, average='micro'))
print(tree_clf.classes_)
print(tree_clf.feature_importances_)

# Ensemble of trees on the same split.
forest_clf = RandomForestClassifier().fit(X_train, y_train)
forest_pred = forest_clf.predict(X_test)
print(forest_pred)
print('随机森林:\n', precision_score(y_test, forest_pred, average='micro'))
print(forest_clf.feature_importances_)
'''
参数 average: string, [None, 'binary'(默认, 仅二分类), 'micro', 'macro', 'samples', 'weighted']
将一个二分类matrics拓展到多分类或多标签问题时,我们可以将数据看成多个二分类问题的集合,每个类都是一个二分类。接着,我们可以通过跨多个分类计算每个二分类metrics得分的均值,这在一些情况下很有用。你可以使用average参数来指定。
macro:计算二分类metrics的均值,为每个类给出相同权重的分值。当小类很重要时会出问题,因为该macro-averaging方法是对性能的平均。另一方面,该方法假设所有分类都是一样重要的,因此macro-averaging方法会受小类性能的影响很大。
weighted:对于不均衡数量的类来说,计算二分类metrics的平均,通过在每个类的score上进行加权实现。
micro:给出了每个样本类以及它对整个metrics的贡献的pair(sample-weight),而非对整个类的metrics求和,它会每个类的metrics上的权重及因子进行求和,来计算整个份额。Micro-averaging方法在多标签(multilabel)问题中设置,包含多分类,此时,大类将被忽略。
samples:应用在multilabel问题上。它不会计算每个类,相反,它会在评估数据中,通过计算真实类和预测类的差异的metrics,来求平均(sample_weight-weighted)
average:average=None将返回一个数组,它包含了每个类的得分.
'''
底层
分类
import numpy as np
import pandas as pd
import random
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
# Hand-rolled random forest (classification): row subsampling plus random
# feature subsets, one decision tree per subset, majority vote at the end.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
Y = pd.DataFrame(data=iris.target)
data = pd.concat([X, Y], axis=1)
print(data)

M = []  # fitted trees
R = []  # feature-column indices used by each tree
n_trees = 100
n_features = data.shape[1] - 1  # last column is the target label
for i in range(n_trees):
    # Row subsample (70%, without replacement) for this tree.
    sample = data.sample(frac=0.7)
    # Pick 1..n_features random FEATURE columns. The original code drew from
    # range(sample.shape[1]) — all columns including the target — so the
    # label could leak into the features and inflate the score.
    k = np.random.randint(1, n_features + 1)
    r = np.random.choice(range(n_features), k, replace=False).tolist()
    X = sample.iloc[:, r]
    Y = sample.iloc[:, -1]
    model = DecisionTreeClassifier()
    model.fit(X, Y)
    M.append(model)
    R.append(r)
    print('第' + str(i) + '颗预测score=', model.score(X, Y))

# Each tree predicts on the full data set using its own feature subset;
# the forest prediction is the per-sample mode (majority vote).
result = pd.concat([pd.DataFrame([M[i].predict(data.iloc[:, R[i]])]) for i in range(n_trees)], ignore_index=True)
predict = result.mode(axis=0).values[0].astype(int)
print('预测值结果=', predict)
# Accuracy of the ensemble vote against the true labels.
score = sum(np.where(predict == iris.target, 1, 0)) / len(data)
print(score)
回归
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
# Hand-rolled random forest (regression): row subsampling plus random feature
# subsets, one regression tree per subset, predictions averaged (bagging).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — this section only runs on older sklearn versions.
boston = load_boston()
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)
Y = pd.DataFrame(data=boston.target)
data = pd.concat([X, Y], axis=1)

M = []  # fitted regression trees
R = []  # feature-column indices used by each tree
n_trees = 100
n_features = data.shape[1] - 1  # last column is the regression target
for i in range(n_trees):
    # Row subsample (70%, without replacement) for this tree.
    sample = data.sample(frac=0.7)
    # Pick 1..n_features random FEATURE columns. The original code drew from
    # range(sample.shape[1]) — all columns including the target — so the
    # target could leak into the features and inflate the per-tree score.
    k = np.random.randint(1, n_features + 1)
    r = np.random.choice(range(n_features), k, replace=False).tolist()
    X = sample.iloc[:, r]
    Y = sample.iloc[:, -1]
    model = DecisionTreeRegressor()
    model.fit(X, Y)
    M.append(model)
    R.append(r)
    print('第' + str(i) + '颗预测score=', model.score(X, Y))

# Average the per-tree predictions over the full data set and report R².
result = pd.concat([pd.DataFrame([M[i].predict(data.iloc[:, R[i]])]) for i in range(n_trees)], ignore_index=True)
predict = result.mean(axis=0).values
print('预测值结果=', predict)
print('R2=', r2_score(boston.target, predict))