sklearn实现lda模型_使用python+sklearn实现模型复杂性的影响

本文详细介绍了如何使用Python的sklearn库实现LDA(线性判别分析)模型,并探讨了模型复杂性对性能的影响。通过实例代码,展示了LDA模型的训练过程以及如何评估模型的复杂性和准确性。
摘要由CSDN通过智能技术生成
本示例演示了模型复杂性是如何影响预测精度和计算性能的。数据集是用于回归的波士顿住房(Boston Housing)数据集,以及用于分类的 20 Newsgroups 数据集。对于每一类模型,我们通过选择相关的模型参数来改变模型的复杂度,并测量其对计算性能(延迟)和预测能力(MSE 或 Hamming 损失)的影响。

输出:
Benchmarking SGDClassifier(alpha=0.001, l1_ratio=0.25, loss='modified_huber',
              penalty='elasticnet')
Complexity: 4466 | Hamming Loss (Misclassification Ratio): 0.2491 | Pred. Time: 0.020835s
Benchmarking SGDClassifier(alpha=0.001, l1_ratio=0.5, loss='modified_huber',
              penalty='elasticnet')
Complexity: 1663 | Hamming Loss (Misclassification Ratio): 0.2915 | Pred. Time: 0.015789s
Benchmarking SGDClassifier(alpha=0.001, l1_ratio=0.75, loss='modified_huber',
              penalty='elasticnet')
Complexity: 880 | Hamming Loss (Misclassification Ratio): 0.3180 | Pred. Time: 0.013469s
Benchmarking SGDClassifier(alpha=0.001, l1_ratio=0.9, loss='modified_huber',
              penalty='elasticnet')
Complexity: 639 | Hamming Loss (Misclassification Ratio): 0.3337 | Pred. Time: 0.011812s
Benchmarking NuSVR(C=1000.0, gamma=3.0517578125e-05, nu=0.1)
Complexity: 69 | MSE: 31.8139 | Pred. Time: 0.000301s
Benchmarking NuSVR(C=1000.0, gamma=3.0517578125e-05, nu=0.25)
Complexity: 136 | MSE: 25.6140 | Pred. Time: 0.000811s
Benchmarking NuSVR(C=1000.0, gamma=3.0517578125e-05)
Complexity: 244 | MSE: 22.3375 | Pred. Time: 0.000895s
Benchmarking NuSVR(C=1000.0, gamma=3.0517578125e-05, nu=0.75)
Complexity: 351 | MSE: 21.3688 | Pred. Time: 0.001237s
Benchmarking NuSVR(C=1000.0, gamma=3.0517578125e-05, nu=0.9)
Complexity: 404 | MSE: 21.1033 | Pred. Time: 0.001460s
Benchmarking GradientBoostingRegressor(n_estimators=10)
Complexity: 10 | MSE: 29.0148 | Pred. Time: 0.000120s
Benchmarking GradientBoostingRegressor(n_estimators=50)
Complexity: 50 | MSE: 8.6545 | Pred. Time: 0.000302s
Benchmarking GradientBoostingRegressor()
Complexity: 100 | MSE: 7.7179 | Pred. Time: 0.000264s
Benchmarking GradientBoostingRegressor(n_estimators=200)
Complexity: 200 | MSE: 6.7507 | Pred. Time: 0.000425s
Benchmarking GradientBoostingRegressor(n_estimators=500)
Complexity: 500 | MSE: 7.1471 | Pred. Time: 0.000922s
# Author: Eustache Diemert
# License: BSD 3 clause
print(__doc__)

import time

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.parasite_axes import host_subplot
from mpl_toolkits.axisartist.axislines import Axes
# NOTE: the original used `from scipy.sparse.csr import csr_matrix`;
# that private path was removed in scipy 1.8 — import from the public package.
from scipy.sparse import csr_matrix

from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.svm import NuSVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import hamming_loss

# #############################################################################
# Routines

# Initialize the random generator so runs are reproducible.
np.random.seed(0)
def generate_data(case, sparse=False):
    """Generate a shuffled 80/20 train/test split for a benchmark dataset.

    Parameters
    ----------
    case : str
        'regression' loads Boston Housing; 'classification' loads the
        vectorized 20 Newsgroups dataset.
        NOTE(review): `load_boston` was removed in scikit-learn 1.2 —
        this keeps the original example's behavior; confirm the installed
        sklearn version supports it.
    sparse : bool, default=False
        If True, keep the feature matrices as CSR sparse matrices
        (appropriate for the 20 Newsgroups bag-of-words features).

    Returns
    -------
    dict
        Keys 'X_train', 'X_test', 'y_train', 'y_test'.
    """
    if case == 'regression':
        X, y = datasets.load_boston(return_X_y=True)
    elif case == 'classification':
        X, y = datasets.fetch_20newsgroups_vectorized(subset='all',
                                                      return_X_y=True)
    X, y = shuffle(X, y)
    # 80% train / 20% test split.
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,
            'y_test': y_test}
    return data
def benchmark_influence(conf):
    """Benchmark the influence of `changing_param` on accuracy and latency.

    For each value in ``conf['changing_param_values']``, fit
    ``conf['estimator']`` with that value plugged into
    ``conf['tuned_params']``, then measure model complexity, prediction
    error, and average prediction time.

    Parameters
    ----------
    conf : dict
        Benchmark configuration; see the `configurations` list for the
        expected keys (estimator class, data split, hooks, metric, ...).

    Returns
    -------
    tuple of (list, list, list)
        prediction_powers (metric scores), prediction_times (seconds per
        prediction call), complexities (model-specific complexity measure).
    """
    prediction_times = []
    prediction_powers = []
    complexities = []
    for param_value in conf['changing_param_values']:
        conf['tuned_params'][conf['changing_param']] = param_value
        estimator = conf['estimator'](**conf['tuned_params'])
        print("Benchmarking %s" % estimator)
        estimator.fit(conf['data']['X_train'], conf['data']['y_train'])
        # e.g. sparsify() for linear models; identity for the others.
        conf['postfit_hook'](estimator)
        complexity = conf['complexity_computer'](estimator)
        complexities.append(complexity)
        # Average prediction latency over n_samples repeated calls.
        start_time = time.time()
        for _ in range(conf['n_samples']):
            y_pred = estimator.predict(conf['data']['X_test'])
        elapsed_time = (time.time() - start_time) / float(conf['n_samples'])
        prediction_times.append(elapsed_time)
        pred_score = conf['prediction_performance_computer'](
            conf['data']['y_test'], y_pred)
        prediction_powers.append(pred_score)
        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
            complexity, conf['prediction_performance_label'], pred_score,
            elapsed_time))
    return prediction_powers, prediction_times, complexities
def plot_influence(conf, mse_values, prediction_times, complexities):
    """Plot the influence of model complexity on accuracy and latency.

    Draws a dual-axis plot: prediction error (left axis, blue) and
    prediction latency (right axis, red) versus model complexity.

    Parameters
    ----------
    conf : dict
        Benchmark configuration (used for axis/title labels).
    mse_values : list
        Prediction-error metric values, one per complexity.
    prediction_times : list
        Average prediction latencies in seconds, one per complexity.
    complexities : list
        Model complexity values (x-axis).
    """
    plt.figure(figsize=(12, 6))
    host = host_subplot(111, axes_class=Axes)
    plt.subplots_adjust(right=0.75)
    # Second y-axis sharing the same x-axis, for latency.
    par1 = host.twinx()
    host.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
    y1_label = conf['prediction_performance_label']
    y2_label = "Time (s)"
    host.set_ylabel(y1_label)
    par1.set_ylabel(y2_label)
    p1, = host.plot(complexities, mse_values, 'b-', label="prediction error")
    p2, = par1.plot(complexities, prediction_times, 'r-',
                    label="latency")
    host.legend(loc='upper right')
    # Color each axis label to match its curve.
    host.axis["left"].label.set_color(p1.get_color())
    par1.axis["right"].label.set_color(p2.get_color())
    plt.title('Influence of Model Complexity - %s' % conf['estimator'].__name__)
    plt.show()
def _count_nonzero_coefficients(estimator):
    a = estimator.coef_.toarray()return np.count_nonzero(a)# ############################################################################## 主要代码
# Load both benchmark datasets once; the classification features stay sparse.
regression_data = generate_data('regression')
classification_data = generate_data('classification', sparse=True)

# One configuration per model family. Each entry names the estimator, the
# fixed hyper-parameters, the parameter that is varied (and its values),
# how to measure model complexity, and which error metric to report.
configurations = [
    {'estimator': SGDClassifier,
     'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001,
                      'loss': 'modified_huber', 'fit_intercept': True,
                      'tol': 1e-3},
     'changing_param': 'l1_ratio',
     'changing_param_values': [0.25, 0.5, 0.75, 0.9],
     'complexity_label': 'non_zero coefficients',
     'complexity_computer': _count_nonzero_coefficients,
     'prediction_performance_computer': hamming_loss,
     'prediction_performance_label': 'Hamming Loss (Misclassification Ratio)',
     'postfit_hook': lambda x: x.sparsify(),
     'data': classification_data,
     'n_samples': 30},
    {'estimator': NuSVR,
     'tuned_params': {'C': 1e3, 'gamma': 2 ** -15},
     'changing_param': 'nu',
     'changing_param_values': [0.1, 0.25, 0.5, 0.75, 0.9],
     'complexity_label': 'n_support_vectors',
     'complexity_computer': lambda x: len(x.support_vectors_),
     'data': regression_data,
     'postfit_hook': lambda x: x,
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE',
     'n_samples': 30},
    {'estimator': GradientBoostingRegressor,
     'tuned_params': {'loss': 'ls'},
     'changing_param': 'n_estimators',
     'changing_param_values': [10, 50, 100, 200, 500],
     'complexity_label': 'n_trees',
     'complexity_computer': lambda x: x.n_estimators,
     'data': regression_data,
     'postfit_hook': lambda x: x,
     'prediction_performance_computer': mean_squared_error,
     'prediction_performance_label': 'MSE',
     'n_samples': 30},
]

for conf in configurations:
    prediction_performances, prediction_times, complexities = \
        benchmark_influence(conf)
    plot_influence(conf, prediction_performances, prediction_times,
                   complexities)
脚本的总运行时间: ( 0 分 20.261 秒) 估计的内存使用量: 60 MB 下载python源代码:plot_model_complexity_influence.py 下载Jupyter notebook源代码:plot_model_complexity_influence.ipynb 由Sphinx-Gallery生成的画廊 ☆☆☆为方便大家查阅,小编已将scikit-learn学习路线专栏文章统一整理到公众号底部菜单栏,同步更新中,关注公众号,点击左下方“系列文章”,如图: 欢迎大家和我一起沿着scikit-learn文档这条路线,一起巩固机器学习算法基础。(添加微信:mthler,备注:sklearn学习,一起进【sklearn机器学习进步群】开启打怪升级的学习之旅。)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
LDA(线性判别分析)是一种经典的有监督降维方法,它可以将高维数据映射到一个低维空间中,以便于分类和可视化。下面是一个简单的 Python 实现: 1.首先,我们需要计算类内散度矩阵 Sw 和类间散度矩阵 Sb。 ```python import numpy as np def compute_scatter_matrices(X, y): # 计算均值向量 class_labels = np.unique(y) n_classes = len(class_labels) n_features = X.shape[1] mean_vectors = [] for cl in class_labels: mean_vectors.append(np.mean(X[y==cl], axis=0)) # 计算类内散度矩阵 Sw = np.zeros((n_features, n_features)) for cl,mv in zip(class_labels, mean_vectors): class_sc_mat = np.zeros((n_features, n_features)) # scatter matrix for every class for row in X[y == cl]: row, mv = row.reshape(n_features,1), mv.reshape(n_features,1) # make column vectors class_sc_mat += (row-mv).dot((row-mv).T) Sw += class_sc_mat # sum class scatter matrices # 计算类间散度矩阵 overall_mean = np.mean(X, axis=0) Sb = np.zeros((n_features, n_features)) for i,mean_vec in enumerate(mean_vectors): n = X[y==class_labels[i]].shape[0] mean_vec = mean_vec.reshape(n_features,1) # make column vector overall_mean = overall_mean.reshape(n_features,1) # make column vector Sb += n * (mean_vec - overall_mean).dot((mean_vec - overall_mean).T) return Sw, Sb ``` 2.然后,我们需要计算 Sw 的逆矩阵和 Sw 和 Sb 的乘积。 ```python def lda(X, y, n_components): Sw, Sb = compute_scatter_matrices(X, y) eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(Sw).dot(Sb)) eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))] eig_pairs = sorted(eig_pairs, key=lambda k: k[0], reverse=True) W = np.hstack([eig_pairs[i][1].reshape(len(X[0]),1) for i in range(n_components)]) return X.dot(W) ``` 这个函数将返回一个降维后的特征矩阵,其中 n_components 是我们想要的输出维度数。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值