example datasets in sklearn

0. 可用数据集

  • iris(三个类别)、digits(10个类别)

    from sklearn.datasets import load_iris
        # from sklearn.datasets import load_digits
    iris_data = load_iris()
    >> type(iris_data)
    sklearn.datasets.base.Bunch
    >> dir(iris_data)
    ['DESCR', 'data', 'feature_names', 'target', 'target_names']
    >> type(iris_data.data)
    numpy.ndarray
    >> type(iris_data.target)
    numpy.ndarray
  • california_housing(在线下载)(回归问题)

    from sklearn.datasets import fetch_california_housing
    >> housing_data = fetch_california_housing()
    >> type(housing_data)
    sklearn.datasets.base.Bunch
    >> dir(housing_data)
    ['DESCR', 'data', 'feature_names', 'target']
    >> type(housing_data.data)
    numpy.ndarray
    >> type(housing_data.target)
    numpy.ndarray

1. nonlinear example datasets

  • 1.1 half_moon

    产生非线性数据集,比如用以测试核机制的性能;
    核方法最终的使命是:unfold the half-moons(展开)

    from sklearn.datasets import make_moons
    X, y = make_moons(n_samples=200, shuffle=True, random_state=123)
    plt.scatter(X[y==0, 0], X[y==0, 1], color='r', marker='^', alpha=.4)
    plt.scatter(X[y==1, 0], X[y==1, 1], color='b', marker='o', alpha=.4)
    plt.show()



  • 1.2 concentric circles

    from sklearn.datasets import make_circles
    X, y = make_circles(n_samples=1000, noise=.1, factor=.2, random_state=123)
    plt.scatter(X[y==0, 0], X[y==0, 1], color='r', marker='^', alpha=.4)
    plt.scatter(X[y==1, 0], X[y==1, 1], color='b', marker='o', alpha=.4)
    plt.show()



2. datasets in sklearn

from sklearn import datasets
  • iris

    >>> iris = datasets.load_iris()
    >>> dir(iris)
    >>> iris.feature_names
    ['sepal length (cm)',
     'sepal width (cm)',
     'petal length (cm)',
     'petal width (cm)']
    
    >>> iris.target_names
    array(['setosa', 'versicolor', 'virginica'],
          dtype='<U10')
    
    >>> iris.data.shape
    (150, 4)                    # 训练样本 
    >>> iris.target.shape
    (150,)                      # 一维的标签向量(每个样本对应一个类别标签)
    
  • digits

    >> digits = datasets.load_digits()
    >> dir(digits)
    >> digits.target_names
    ...
  • make_blobs

    from sklearn.datasets import make_blobs
    
    X, y = make_blobs(n_samples=300, centers=4,
                      random_state=0, cluster_std=1.0)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow');

3. UCI 数据

  • Breast Cancer Wisconsin dataset

    which contains 569 samples of malignant(恶性的) and benign(良性的) tumor cells.

    The first two columns in the dataset store the unique ID numbers of the samples and the corresponding diagnosis (M=malignant, B=benign), respectively.

    Columns 3-32 contain 30 real-valued features that have been computed from digitized images of the cell nuclei, which can be used to build a model to predict whether a tumor is benign or malignant.

    import pandas as pd
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/'
                     'breast-cancer-wisconsin/wdbc.data', header=None)
    X, y = df.values[:, 2:], df.values[:, 1]
翻译这段程序并自行赋值调用:

import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model


def plot_decision_boundary(model, X, y):
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)


def sigmoid(x):
    s = 1/(1+np.exp(-x))
    return s


def load_planar_dataset():
    np.random.seed(1)
    m = 400  # number of examples
    N = int(m/2)  # number of points per class
    print(np.random.randn(N))
    D = 2  # dimensionality
    X = np.zeros((m, D))  # data matrix where each row is a single example
    Y = np.zeros((m, 1), dtype='uint8')  # labels vector (0 for red, 1 for blue)
    a = 4  # maximum ray of the flower

    for j in range(2):
        ix = range(N*j, N*(j+1))
        t = np.linspace(j*3.12, (j+1)*3.12, N) + np.random.randn(N)*0.2  # theta
        r = a*np.sin(4*t) + np.random.randn(N)*0.2  # radius
        X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
        Y[ix] = j

    X = X.T
    Y = Y.T

    return X, Y


def load_extra_datasets():
    N = 200
    noisy_circles = sklearn.datasets.make_circles(n_samples=N, factor=.5, noise=.3)
    noisy_moons = sklearn.datasets.make_moons(n_samples=N, noise=.2)
    blobs = sklearn.datasets.make_blobs(n_samples=N, random_state=5, n_features=2, centers=6)
    gaussian_quantiles = sklearn.datasets.make_gaussian_quantiles(mean=None, cov=0.5, n_samples=N, n_features=2, n_classes=2, shuffle=True, random_state=None)
    no_structure = np.random.rand(N, 2), np.random.rand(N, 2)

    return noisy_circles, noisy_moons, blobs, gaussian_quantiles, no_structure
05-24
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

五道口纳什

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值