Source code of sklearn's synthetic data set generator samples_generator.py

  1. """
  2. Generate samples of synthetic data sets.
  3. """
  4.  
  5. # Authors: B. Thirion, G. Varoquaux, A. Gramfort, V. Michel, O. Grisel,
  6. #          G. Louppe, J. Nothman
  7. # License: BSD 3 clause
  8.  
  9. import numbers
  10. import  array
  11. import numpy  as np
  12. from scipy  import linalg
  13. import scipy.sparse  as sp
  14.  
  15. from ..preprocessing  import MultiLabelBinarizer
  16. from ..utils  import check_array , check_random_state
  17. from ..utils  import shuffle  as util_shuffle
  18. from ..utils.fixes  import astype
  19. from ..utils. random  import sample_without_replacement
  20. from ..externals  import six
  21. map  = six.moves. map
  22. zip  = six.moves. zip
  23.  
  24.  
def _generate_hypercube(samples, dimensions, rng):
    """Returns distinct binary samples of length dimensions
    """
    if dimensions > 30:
        return np.hstack([_generate_hypercube(samples, dimensions - 30, rng),
                          _generate_hypercube(samples, 30, rng)])
    out = astype(sample_without_replacement(2 ** dimensions, samples,
                                            random_state=rng),
                 dtype='>u4', copy=False)
    out = np.unpackbits(out.view('>u1')).reshape((-1, 32))[:, -dimensions:]
    return out

def make_classification(n_samples=100, n_features=20, n_informative=2,
                        n_redundant=2, n_repeated=0, n_classes=2,
                        n_clusters_per_class=2, weights=None, flip_y=0.01,
                        class_sep=1.0, hypercube=True, shift=0.0, scale=1.0,
                        shuffle=True, random_state=None):
    """Generate a random n-class classification problem.

    This initially creates clusters of points normally distributed (std=1)
    about vertices of a `2 * class_sep`-sided hypercube, and assigns an equal
    number of clusters to each class. It introduces interdependence between
    these features and adds various types of further noise to the data.

    Prior to shuffling, `X` stacks a number of these primary "informative"
    features, "redundant" linear combinations of these, "repeated" duplicates
    of sampled features, and arbitrary noise for any remaining features.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=20)
        The total number of features. These comprise `n_informative`
        informative features, `n_redundant` redundant features, `n_repeated`
        duplicated features and `n_features-n_informative-n_redundant-
        n_repeated` useless features drawn at random.

    n_informative : int, optional (default=2)
        The number of informative features. Each class is composed of a number
        of gaussian clusters each located around the vertices of a hypercube
        in a subspace of dimension `n_informative`. For each cluster,
        informative features are drawn independently from N(0, 1) and then
        randomly linearly combined within each cluster in order to add
        covariance. The clusters are then placed on the vertices of the
        hypercube.

    n_redundant : int, optional (default=2)
        The number of redundant features. These features are generated as
        random linear combinations of the informative features.

    n_repeated : int, optional (default=0)
        The number of duplicated features, drawn randomly from the informative
        and the redundant features.

    n_classes : int, optional (default=2)
        The number of classes (or labels) of the classification problem.

    n_clusters_per_class : int, optional (default=2)
        The number of clusters per class.

    weights : list of floats or None (default=None)
        The proportions of samples assigned to each class. If None, then
        classes are balanced. Note that if `len(weights) == n_classes - 1`,
        then the last class weight is automatically inferred.
        More than `n_samples` samples may be returned if the sum of `weights`
        exceeds 1.

    flip_y : float, optional (default=0.01)
        The fraction of samples whose class are randomly exchanged.

    class_sep : float, optional (default=1.0)
        The factor multiplying the hypercube dimension.

    hypercube : boolean, optional (default=True)
        If True, the clusters are put on the vertices of a hypercube. If
        False, the clusters are put on the vertices of a random polytope.

    shift : float, array of shape [n_features] or None, optional (default=0.0)
        Shift features by the specified value. If None, then features
        are shifted by a random value drawn in [-class_sep, class_sep].

    scale : float, array of shape [n_features] or None, optional (default=1.0)
        Multiply features by the specified value. If None, then features
        are scaled by a random value drawn in [1, 100]. Note that scaling
        happens after shifting.

    shuffle : boolean, optional (default=True)
        Shuffle the samples and the features.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels for class membership of each sample.

    Notes
    -----
    The algorithm is adapted from Guyon [1] and was designed to generate
    the "Madelon" dataset.

    References
    ----------
    .. [1] I. Guyon, "Design of experiments for the NIPS 2003 variable
           selection benchmark", 2003.

    See also
    --------
    make_blobs: simplified variant
    make_multilabel_classification: unrelated generator for multilabel tasks
    """
    generator = check_random_state(random_state)

    # Count features, clusters and samples
    if n_informative + n_redundant + n_repeated > n_features:
        raise ValueError("Number of informative, redundant and repeated "
                         "features must sum to less than the number of total"
                         " features")
    if 2 ** n_informative < n_classes * n_clusters_per_class:
        raise ValueError("n_classes * n_clusters_per_class must"
                         " be smaller or equal 2 ** n_informative")
    if weights and len(weights) not in [n_classes, n_classes - 1]:
        raise ValueError("Weights specified but incompatible with number "
                         "of classes.")

    n_useless = n_features - n_informative - n_redundant - n_repeated
    n_clusters = n_classes * n_clusters_per_class

    if weights and len(weights) == (n_classes - 1):
        weights.append(1.0 - sum(weights))

    if weights is None:
        weights = [1.0 / n_classes] * n_classes
        weights[-1] = 1.0 - sum(weights[:-1])

    # Distribute samples among clusters by weight
    n_samples_per_cluster = []
    for k in range(n_clusters):
        n_samples_per_cluster.append(int(n_samples * weights[k % n_classes]
                                     / n_clusters_per_class))
    for i in range(n_samples - sum(n_samples_per_cluster)):
        n_samples_per_cluster[i % n_clusters] += 1
    # Initialize X and y
    X = np.zeros((n_samples, n_features))
    y = np.zeros(n_samples, dtype=np.int)

    # Build the polytope whose vertices become cluster centroids
    centroids = _generate_hypercube(n_clusters, n_informative,
                                    generator).astype(float)
    centroids *= 2 * class_sep
    centroids -= class_sep
    if not hypercube:
        centroids *= generator.rand(n_clusters, 1)
        centroids *= generator.rand(1, n_informative)

    # Initially draw informative features from the standard normal
    X[:, :n_informative] = generator.randn(n_samples, n_informative)

    # Create each cluster; a variant of make_blobs
    stop = 0
    for k, centroid in enumerate(centroids):
        start, stop = stop, stop + n_samples_per_cluster[k]
        y[start:stop] = k % n_classes  # assign labels
        X_k = X[start:stop, :n_informative]  # slice a view of the cluster

        A = 2 * generator.rand(n_informative, n_informative) - 1
        X_k[...] = np.dot(X_k, A)  # introduce random covariance

        X_k += centroid  # shift the cluster to a vertex

    # Create redundant features
    if n_redundant > 0:
        B = 2 * generator.rand(n_informative, n_redundant) - 1
        X[:, n_informative:n_informative + n_redundant] = \
            np.dot(X[:, :n_informative], B)

    # Repeat some features
    if n_repeated > 0:
        n = n_informative + n_redundant
        indices = ((n - 1) * generator.rand(n_repeated) + 0.5).astype(np.intp)
        X[:, n:n + n_repeated] = X[:, indices]

    # Fill useless features
    if n_useless > 0:
        X[:, -n_useless:] = generator.randn(n_samples, n_useless)

    # Randomly replace labels
    if flip_y >= 0.0:
        flip_mask = generator.rand(n_samples) < flip_y
        y[flip_mask] = generator.randint(n_classes, size=flip_mask.sum())

    # Randomly shift and scale
    if shift is None:
        shift = (2 * generator.rand(n_features) - 1) * class_sep
    X += shift

    if scale is None:
        scale = 1 + 100 * generator.rand(n_features)
    X *= scale

    if shuffle:
        # Randomly permute samples
        X, y = util_shuffle(X, y, random_state=generator)

        # Randomly permute features
        indices = np.arange(n_features)
        generator.shuffle(indices)
        X[:, :] = X[:, indices]

    return X, y

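A minimal usage sketch for this generator (my own illustration, not part of the source file; the parameter values are arbitrary):

from sklearn.datasets import make_classification

# 10 features: 4 informative + 2 redundant + 4 useless (drawn at random),
# spread over 3 classes with 2 Gaussian clusters per class by default
X, y = make_classification(n_samples=200, n_features=10, n_informative=4,
                           n_redundant=2, n_classes=3, random_state=0)
print(X.shape)         # (200, 10)
print(sorted(set(y)))  # [0, 1, 2]
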
def make_multilabel_classification(n_samples=100, n_features=20, n_classes=5,
                                   n_labels=2, length=50, allow_unlabeled=True,
                                   sparse=False, return_indicator='dense',
                                   return_distributions=False,
                                   random_state=None):
    """Generate a random multilabel classification problem.

    For each sample, the generative process is:
        - pick the number of labels: n ~ Poisson(n_labels)
        - n times, choose a class c: c ~ Multinomial(theta)
        - pick the document length: k ~ Poisson(length)
        - k times, choose a word: w ~ Multinomial(theta_c)

    In the above process, rejection sampling is used to make sure that
    n is never zero or more than `n_classes`, and that the document length
    is never zero. Likewise, we reject classes which have already been chosen.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=20)
        The total number of features.

    n_classes : int, optional (default=5)
        The number of classes of the classification problem.

    n_labels : int, optional (default=2)
        The average number of labels per instance. More precisely, the number
        of labels per sample is drawn from a Poisson distribution with
        ``n_labels`` as its expected value, but samples are bounded (using
        rejection sampling) by ``n_classes``, and must be nonzero if
        ``allow_unlabeled`` is False.

    length : int, optional (default=50)
        The sum of the features (number of words if documents) is drawn from
        a Poisson distribution with this expected value.

    allow_unlabeled : bool, optional (default=True)
        If ``True``, some instances might not belong to any class.

    sparse : bool, optional (default=False)
        If ``True``, return a sparse feature matrix

        .. versionadded:: 0.17
           parameter to allow *sparse* output.

    return_indicator : 'dense' (default) | 'sparse' | False
        If ``'dense'`` return ``Y`` in the dense binary indicator format. If
        ``'sparse'`` return ``Y`` in the sparse binary indicator format.
        ``False`` returns a list of lists of labels.

    return_distributions : bool, optional (default=False)
        If ``True``, return the prior class probability and conditional
        probabilities of features given classes, from which the data was
        drawn.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.

    Y : array or sparse CSR matrix of shape [n_samples, n_classes]
        The label sets.

    p_c : array, shape [n_classes]
        The probability of each class being drawn. Only returned if
        ``return_distributions=True``.

    p_w_c : array, shape [n_features, n_classes]
        The probability of each feature being drawn given each class.
        Only returned if ``return_distributions=True``.

    """
    generator = check_random_state(random_state)
    p_c = generator.rand(n_classes)
    p_c /= p_c.sum()
    cumulative_p_c = np.cumsum(p_c)
    p_w_c = generator.rand(n_features, n_classes)
    p_w_c /= np.sum(p_w_c, axis=0)

    def sample_example():
        _, n_classes = p_w_c.shape

        # pick a nonzero number of labels per document by rejection sampling
        y_size = n_classes + 1
        while (not allow_unlabeled and y_size == 0) or y_size > n_classes:
            y_size = generator.poisson(n_labels)

        # pick n classes
        y = set()
        while len(y) != y_size:
            # pick a class with probability P(c)
            c = np.searchsorted(cumulative_p_c,
                                generator.rand(y_size - len(y)))
            y.update(c)
        y = list(y)

        # pick a non-zero document length by rejection sampling
        n_words = 0
        while n_words == 0:
            n_words = generator.poisson(length)

        # generate a document of length n_words
        if len(y) == 0:
            # if sample does not belong to any class, generate noise word
            words = generator.randint(n_features, size=n_words)
            return words, y

        # sample words with replacement from selected classes
        cumulative_p_w_sample = p_w_c.take(y, axis=1).sum(axis=1).cumsum()
        cumulative_p_w_sample /= cumulative_p_w_sample[-1]
        words = np.searchsorted(cumulative_p_w_sample, generator.rand(n_words))
        return words, y

    X_indices = array.array('i')
    X_indptr = array.array('i', [0])
    Y = []
    for i in range(n_samples):
        words, y = sample_example()
        X_indices.extend(words)
        X_indptr.append(len(X_indices))
        Y.append(y)
    X_data = np.ones(len(X_indices), dtype=np.float64)
    X = sp.csr_matrix((X_data, X_indices, X_indptr),
                      shape=(n_samples, n_features))
    X.sum_duplicates()
    if not sparse:
        X = X.toarray()

    # return_indicator can be True due to backward compatibility
    if return_indicator in (True, 'sparse', 'dense'):
        lb = MultiLabelBinarizer(sparse_output=(return_indicator == 'sparse'))
        Y = lb.fit([range(n_classes)]).transform(Y)
    elif return_indicator is not False:
        raise ValueError("return_indicator must be either 'sparse', 'dense' "
                         "or False.")
    if return_distributions:
        return X, Y, p_c, p_w_c
    return X, Y

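A usage sketch (illustrative values; with the default return_indicator='dense', Y comes back as a binary indicator matrix):

from sklearn.datasets import make_multilabel_classification

X, Y = make_multilabel_classification(n_samples=50, n_features=20,
                                      n_classes=5, n_labels=2,
                                      random_state=0)
print(X.shape, Y.shape)  # (50, 20) (50, 5)
print(Y[0])              # e.g. a 0/1 row marking the labels of sample 0
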
def make_hastie_10_2(n_samples=12000, random_state=None):
    """Generates data for binary classification used in
    Hastie et al. 2009, Example 10.2.

    The ten features are standard independent Gaussian and
    the target ``y`` is defined by::

      y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=12000)
        The number of samples.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, 10]
        The input samples.

    y : array of shape [n_samples]
        The output values.

    References
    ----------
    .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
           Learning Ed. 2", Springer, 2009.

    See also
    --------
    make_gaussian_quantiles: a generalization of this dataset approach
    """
    rs = check_random_state(random_state)

    shape = (n_samples, 10)
    X = rs.normal(size=shape).reshape(shape)
    y = ((X ** 2.0).sum(axis=1) > 9.34).astype(np.float64)
    y[y == 0.0] = -1.0

    return X, y

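A usage sketch (illustrative), checking that the labels are the expected +1/-1:

import numpy as np
from sklearn.datasets import make_hastie_10_2

X, y = make_hastie_10_2(n_samples=1000, random_state=0)
print(X.shape)       # (1000, 10)
print(np.unique(y))  # [-1.  1.]
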
def make_regression(n_samples=100, n_features=100, n_informative=10,
                    n_targets=1, bias=0.0, effective_rank=None,
                    tail_strength=0.5, noise=0.0, shuffle=True, coef=False,
                    random_state=None):
    """Generate a random regression problem.

    The input set can either be well conditioned (by default) or have a low
    rank-fat tail singular profile. See :func:`make_low_rank_matrix` for
    more details.

    The output is generated by applying a (potentially biased) random linear
    regression model with `n_informative` nonzero regressors to the previously
    generated input and some gaussian centered noise with some adjustable
    scale.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=100)
        The number of features.

    n_informative : int, optional (default=10)
        The number of informative features, i.e., the number of features used
        to build the linear model used to generate the output.

    n_targets : int, optional (default=1)
        The number of regression targets, i.e., the dimension of the y output
        vector associated with a sample. By default, the output is a scalar.

    bias : float, optional (default=0.0)
        The bias term in the underlying linear model.

    effective_rank : int or None, optional (default=None)
        if not None:
            The approximate number of singular vectors required to explain most
            of the input data by linear combinations. Using this kind of
            singular spectrum in the input allows the generator to reproduce
            the correlations often observed in practice.
        if None:
            The input set is well conditioned, centered and gaussian with
            unit variance.

    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        The relative importance of the fat noisy tail of the singular values
        profile if `effective_rank` is not None.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the output.

    shuffle : boolean, optional (default=True)
        Shuffle the samples and the features.

    coef : boolean, optional (default=False)
        If True, the coefficients of the underlying linear model are returned.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The input samples.

    y : array of shape [n_samples] or [n_samples, n_targets]
        The output values.

    coef : array of shape [n_features] or [n_features, n_targets], optional
        The coefficient of the underlying linear model. It is returned only if
        coef is True.
    """
    n_informative = min(n_features, n_informative)
    generator = check_random_state(random_state)

    if effective_rank is None:
        # Randomly generate a well conditioned input set
        X = generator.randn(n_samples, n_features)

    else:
        # Randomly generate a low rank, fat tail input set
        X = make_low_rank_matrix(n_samples=n_samples,
                                 n_features=n_features,
                                 effective_rank=effective_rank,
                                 tail_strength=tail_strength,
                                 random_state=generator)

    # Generate a ground truth model with only n_informative features being non
    # zeros (the other features are not correlated to y and should be ignored
    # by sparsifying regularizers such as L1 or elastic net)
    ground_truth = np.zeros((n_features, n_targets))
    ground_truth[:n_informative, :] = 100 * generator.rand(n_informative,
                                                           n_targets)

    y = np.dot(X, ground_truth) + bias

    # Add noise
    if noise > 0.0:
        y += generator.normal(scale=noise, size=y.shape)

    # Randomly permute samples and features
    if shuffle:
        X, y = util_shuffle(X, y, random_state=generator)

        indices = np.arange(n_features)
        generator.shuffle(indices)
        X[:, :] = X[:, indices]
        ground_truth = ground_truth[indices]

    y = np.squeeze(y)

    if coef:
        return X, y, np.squeeze(ground_truth)

    else:
        return X, y

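A usage sketch (illustrative values), also retrieving the ground-truth coefficients:

from sklearn.datasets import make_regression

X, y, coef = make_regression(n_samples=200, n_features=20, n_informative=5,
                             noise=0.5, coef=True, random_state=0)
print(X.shape, y.shape)   # (200, 20) (200,)
print((coef != 0).sum())  # 5: only the informative features get a weight
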
def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None,
                 factor=.8):
    """Make a large circle containing a smaller circle in 2d.

    A simple toy dataset to visualize clustering and classification
    algorithms.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points generated.

    shuffle : bool, optional (default=True)
        Whether to shuffle the samples.

    noise : double or None (default=None)
        Standard deviation of Gaussian noise added to the data.

    factor : double < 1 (default=.8)
        Scale factor between inner and outer circle.

    Returns
    -------
    X : array of shape [n_samples, 2]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels (0 or 1) for class membership of each sample.
    """

    if factor > 1 or factor < 0:
        raise ValueError("'factor' has to be between 0 and 1.")

    generator = check_random_state(random_state)
    # so as not to have the first point = last point, we add one and then
    # remove it.
    linspace = np.linspace(0, 2 * np.pi, n_samples // 2 + 1)[:-1]
    outer_circ_x = np.cos(linspace)
    outer_circ_y = np.sin(linspace)
    inner_circ_x = outer_circ_x * factor
    inner_circ_y = outer_circ_y * factor

    X = np.vstack((np.append(outer_circ_x, inner_circ_x),
                   np.append(outer_circ_y, inner_circ_y))).T
    y = np.hstack([np.zeros(n_samples // 2, dtype=np.intp),
                   np.ones(n_samples // 2, dtype=np.intp)])
    if shuffle:
        X, y = util_shuffle(X, y, random_state=generator)

    if noise is not None:
        X += generator.normal(scale=noise, size=X.shape)

    return X, y

def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None):
    """Make two interleaving half circles

    A simple toy dataset to visualize clustering and classification
    algorithms. Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points generated.

    shuffle : bool, optional (default=True)
        Whether to shuffle the samples.

    noise : double or None (default=None)
        Standard deviation of Gaussian noise added to the data.

    Returns
    -------
    X : array of shape [n_samples, 2]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels (0 or 1) for class membership of each sample.
    """

    n_samples_out = n_samples // 2
    n_samples_in = n_samples - n_samples_out

    generator = check_random_state(random_state)

    outer_circ_x = np.cos(np.linspace(0, np.pi, n_samples_out))
    outer_circ_y = np.sin(np.linspace(0, np.pi, n_samples_out))
    inner_circ_x = 1 - np.cos(np.linspace(0, np.pi, n_samples_in))
    inner_circ_y = 1 - np.sin(np.linspace(0, np.pi, n_samples_in)) - .5

    X = np.vstack((np.append(outer_circ_x, inner_circ_x),
                   np.append(outer_circ_y, inner_circ_y))).T
    y = np.hstack([np.zeros(n_samples_in, dtype=np.intp),
                   np.ones(n_samples_out, dtype=np.intp)])

    if shuffle:
        X, y = util_shuffle(X, y, random_state=generator)

    if noise is not None:
        X += generator.normal(scale=noise, size=X.shape)

    return X, y

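A combined usage sketch for the two 2-d toy generators above (illustrative values):

from sklearn.datasets import make_circles, make_moons

X_c, y_c = make_circles(n_samples=100, noise=0.05, factor=0.5, random_state=0)
X_m, y_m = make_moons(n_samples=100, noise=0.1, random_state=0)
print(X_c.shape, X_m.shape)  # (100, 2) (100, 2)
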
def make_blobs(n_samples=100, n_features=2, centers=3, cluster_std=1.0,
               center_box=(-10.0, 10.0), shuffle=True, random_state=None):
    """Generate isotropic Gaussian blobs for clustering.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points equally divided among clusters.

    n_features : int, optional (default=2)
        The number of features for each sample.

    centers : int or array of shape [n_centers, n_features], optional
        (default=3)
        The number of centers to generate, or the fixed center locations.

    cluster_std : float or sequence of floats, optional (default=1.0)
        The standard deviation of the clusters.

    center_box : pair of floats (min, max), optional (default=(-10.0, 10.0))
        The bounding box for each cluster center when centers are
        generated at random.

    shuffle : boolean, optional (default=True)
        Shuffle the samples.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels for cluster membership of each sample.

    Examples
    --------
    >>> from sklearn.datasets.samples_generator import make_blobs
    >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,
    ...                   random_state=0)
    >>> print(X.shape)
    (10, 2)
    >>> y
    array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])

    See also
    --------
    make_classification: a more intricate variant
    """
    generator = check_random_state(random_state)

    if isinstance(centers, numbers.Integral):
        centers = generator.uniform(center_box[0], center_box[1],
                                    size=(centers, n_features))
    else:
        centers = check_array(centers)
        n_features = centers.shape[1]

    if isinstance(cluster_std, numbers.Real):
        cluster_std = np.ones(len(centers)) * cluster_std

    X = []
    y = []

    n_centers = centers.shape[0]
    n_samples_per_center = [int(n_samples // n_centers)] * n_centers

    for i in range(n_samples % n_centers):
        n_samples_per_center[i] += 1

    for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)):
        X.append(centers[i] + generator.normal(scale=std,
                                               size=(n, n_features)))
        y += [i] * n

    X = np.concatenate(X)
    y = np.array(y)

    if shuffle:
        indices = np.arange(n_samples)
        generator.shuffle(indices)
        X = X[indices]
        y = y[indices]

    return X, y

def make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None):
    """Generate the "Friedman \#1" regression problem

    This dataset is described in Friedman [1] and Breiman [2].

    Inputs `X` are independent features uniformly distributed on the interval
    [0, 1]. The output `y` is created according to the formula::

        y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 \
+ 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1).

    Out of the `n_features` features, only 5 are actually used to compute
    `y`. The remaining features are independent of `y`.

    The number of features has to be >= 5.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=10)
        The number of features. Should be at least 5.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the output.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The input samples.

    y : array of shape [n_samples]
        The output values.

    References
    ----------
    .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
           of Statistics 19 (1), pages 1-67, 1991.

    .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
           pages 123-140, 1996.
    """
    if n_features < 5:
        raise ValueError("n_features must be at least five.")

    generator = check_random_state(random_state)

    X = generator.rand(n_samples, n_features)
    y = 10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 \
        + 10 * X[:, 3] + 5 * X[:, 4] + noise * generator.randn(n_samples)

    return X, y

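A usage sketch (illustrative; only the first 5 of the 10 features influence y):

from sklearn.datasets import make_friedman1

X, y = make_friedman1(n_samples=100, n_features=10, noise=0.1, random_state=0)
print(X.shape, y.shape)  # (100, 10) (100,)
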
def make_friedman2(n_samples=100, noise=0.0, random_state=None):
    """Generate the "Friedman \#2" regression problem

    This dataset is described in Friedman [1] and Breiman [2].

    Inputs `X` are 4 independent features uniformly distributed on the
    intervals::

        0 <= X[:, 0] <= 100,
        40 * pi <= X[:, 1] <= 560 * pi,
        0 <= X[:, 2] <= 1,
        1 <= X[:, 3] <= 11.

    The output `y` is created according to the formula::

        y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2] \
- 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1).

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the output.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, 4]
        The input samples.

    y : array of shape [n_samples]
        The output values.

    References
    ----------
    .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
           of Statistics 19 (1), pages 1-67, 1991.

    .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
           pages 123-140, 1996.
    """
    generator = check_random_state(random_state)

    X = generator.rand(n_samples, 4)
    X[:, 0] *= 100
    X[:, 1] *= 520 * np.pi
    X[:, 1] += 40 * np.pi
    X[:, 3] *= 10
    X[:, 3] += 1

    y = (X[:, 0] ** 2
         + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 \
        + noise * generator.randn(n_samples)

    return X, y

def make_friedman3(n_samples=100, noise=0.0, random_state=None):
    """Generate the "Friedman \#3" regression problem

    This dataset is described in Friedman [1] and Breiman [2].

    Inputs `X` are 4 independent features uniformly distributed on the
    intervals::

        0 <= X[:, 0] <= 100,
        40 * pi <= X[:, 1] <= 560 * pi,
        0 <= X[:, 2] <= 1,
        1 <= X[:, 3] <= 11.

    The output `y` is created according to the formula::

        y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) \
/ X[:, 0]) + noise * N(0, 1).

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the output.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, 4]
        The input samples.

    y : array of shape [n_samples]
        The output values.

    References
    ----------
    .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
           of Statistics 19 (1), pages 1-67, 1991.

    .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
           pages 123-140, 1996.
    """
    generator = check_random_state(random_state)

    X = generator.rand(n_samples, 4)
    X[:, 0] *= 100
    X[:, 1] *= 520 * np.pi
    X[:, 1] += 40 * np.pi
    X[:, 3] *= 10
    X[:, 3] += 1

    y = np.arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]) \
        + noise * generator.randn(n_samples)

    return X, y

def make_low_rank_matrix(n_samples=100, n_features=100, effective_rank=10,
                         tail_strength=0.5, random_state=None):
    """Generate a mostly low rank matrix with bell-shaped singular values

    Most of the variance can be explained by a bell-shaped curve of width
    effective_rank: the low rank part of the singular values profile is::

        (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2)

    The remaining singular values' tail is fat, decreasing as::

        tail_strength * exp(-0.1 * i / effective_rank).

    The low rank part of the profile can be considered the structured
    signal part of the data while the tail can be considered the noisy
    part of the data that cannot be summarized by a low number of linear
    components (singular vectors).

    This kind of singular profile is often seen in practice, for instance:
     - gray level pictures of faces
     - TF-IDF vectors of text documents crawled from the web

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=100)
        The number of features.

    effective_rank : int, optional (default=10)
        The approximate number of singular vectors required to explain most of
        the data by linear combinations.

    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        The relative importance of the fat noisy tail of the singular values
        profile.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The matrix.
    """
    generator = check_random_state(random_state)
    n = min(n_samples, n_features)

    # Random (ortho normal) vectors
    u, _ = linalg.qr(generator.randn(n_samples, n), mode='economic')
    v, _ = linalg.qr(generator.randn(n_features, n), mode='economic')

    # Index of the singular values
    singular_ind = np.arange(n, dtype=np.float64)

    # Build the singular profile by assembling signal and noise components
    low_rank = ((1 - tail_strength) *
                np.exp(-1.0 * (singular_ind / effective_rank) ** 2))
    tail = tail_strength * np.exp(-0.1 * singular_ind / effective_rank)
    s = np.identity(n) * (low_rank + tail)

    return np.dot(np.dot(u, s), v.T)

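A usage sketch (illustrative), confirming the bell-shaped singular value decay:

import numpy as np
from sklearn.datasets import make_low_rank_matrix

X = make_low_rank_matrix(n_samples=100, n_features=100, effective_rank=5,
                         tail_strength=0.01, random_state=0)
s = np.linalg.svd(X, compute_uv=False)
print(np.round(s[:8], 3))  # the first ~5 singular values dominate
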
def make_sparse_coded_signal(n_samples, n_components, n_features,
                             n_nonzero_coefs, random_state=None):
    """Generate a signal as a sparse combination of dictionary elements.

    Returns a matrix Y = DX, such that D is (n_features, n_components),
    X is (n_components, n_samples) and each column of X has exactly
    n_nonzero_coefs non-zero elements.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int
        number of samples to generate

    n_components : int
        number of components in the dictionary

    n_features : int
        number of features of the dataset to generate

    n_nonzero_coefs : int
        number of active (non-zero) coefficients in each sample

    random_state : int or RandomState instance, optional (default=None)
        seed used by the pseudo random number generator

    Returns
    -------
    data : array of shape [n_features, n_samples]
        The encoded signal (Y).

    dictionary : array of shape [n_features, n_components]
        The dictionary with normalized components (D).

    code : array of shape [n_components, n_samples]
        The sparse code such that each column of this matrix has exactly
        n_nonzero_coefs non-zero items (X).

    """
    generator = check_random_state(random_state)

    # generate dictionary
    D = generator.randn(n_features, n_components)
    D /= np.sqrt(np.sum((D ** 2), axis=0))

    # generate code
    X = np.zeros((n_components, n_samples))
    for i in range(n_samples):
        idx = np.arange(n_components)
        generator.shuffle(idx)
        idx = idx[:n_nonzero_coefs]
        X[idx, i] = generator.randn(n_nonzero_coefs)

    # encode signal
    Y = np.dot(D, X)

    return map(np.squeeze, (Y, D, X))

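A usage sketch (illustrative), verifying the Y = DX factorization and the per-column sparsity:

import numpy as np
from sklearn.datasets import make_sparse_coded_signal

Y, D, X = make_sparse_coded_signal(n_samples=5, n_components=15,
                                   n_features=10, n_nonzero_coefs=3,
                                   random_state=0)
print(np.allclose(Y, np.dot(D, X)))  # True by construction
print((X != 0).sum(axis=0))          # 3 active coefficients per sample
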
def make_sparse_uncorrelated(n_samples=100, n_features=10, random_state=None):
    """Generate a random regression problem with sparse uncorrelated design

    This dataset is described in Celeux et al. [1] as::

        X ~ N(0, 1)
        y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]

    Only the first 4 features are informative. The remaining features are
    useless.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=10)
        The number of features.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The input samples.

    y : array of shape [n_samples]
        The output values.

    References
    ----------
    .. [1] G. Celeux, M. El Anbari, J.-M. Marin, C. P. Robert,
           "Regularization in regression: comparing Bayesian and frequentist
           methods in a poorly informative situation", 2009.
    """
    generator = check_random_state(random_state)

    X = generator.normal(loc=0, scale=1, size=(n_samples, n_features))
    y = generator.normal(loc=(X[:, 0] +
                              2 * X[:, 1] -
                              2 * X[:, 2] -
                              1.5 * X[:, 3]), scale=np.ones(n_samples))

    return X, y

def make_spd_matrix(n_dim, random_state=None):
    """Generate a random symmetric, positive-definite matrix.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_dim : int
        The matrix dimension.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_dim, n_dim]
        The random symmetric, positive-definite matrix.

    See also
    --------
    make_sparse_spd_matrix
    """
    generator = check_random_state(random_state)

    A = generator.rand(n_dim, n_dim)
    U, s, V = linalg.svd(np.dot(A.T, A))
    X = np.dot(np.dot(U, 1.0 + np.diag(generator.rand(n_dim))), V)

    return X

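A usage sketch (illustrative), checking symmetry and positive definiteness:

import numpy as np
from sklearn.datasets import make_spd_matrix

X = make_spd_matrix(4, random_state=0)
print(np.allclose(X, X.T))                # True: symmetric
print((np.linalg.eigvalsh(X) > 0).all())  # True: positive definite
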
def make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False,
                           smallest_coef=.1, largest_coef=.9,
                           random_state=None):
    """Generate a sparse symmetric definite positive matrix.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    dim : integer, optional (default=1)
        The size of the random matrix to generate.

    alpha : float between 0 and 1, optional (default=0.95)
        The probability that a coefficient is zero (see notes). Larger values
        enforce more sparsity.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    largest_coef : float between 0 and 1, optional (default=0.9)
        The value of the largest coefficient.

    smallest_coef : float between 0 and 1, optional (default=0.1)
        The value of the smallest coefficient.

    norm_diag : boolean, optional (default=False)
        Whether to normalize the output matrix to make the leading diagonal
        elements all 1

    Returns
    -------
    prec : sparse matrix of shape (dim, dim)
        The generated matrix.

    Notes
    -----
    The sparsity is actually imposed on the cholesky factor of the matrix.
    Thus alpha does not translate directly into the filling fraction of
    the matrix itself.

    See also
    --------
    make_spd_matrix
    """
    random_state = check_random_state(random_state)

    chol = -np.eye(dim)
    aux = random_state.rand(dim, dim)
    aux[aux < alpha] = 0
    aux[aux > alpha] = (smallest_coef
                        + (largest_coef - smallest_coef)
                        * random_state.rand(np.sum(aux > alpha)))
    aux = np.tril(aux, k=-1)

    # Permute the lines: we don't want to have asymmetries in the final
    # SPD matrix
    permutation = random_state.permutation(dim)
    aux = aux[permutation].T[permutation]
    chol += aux
    prec = np.dot(chol.T, chol)

    if norm_diag:
        # Form the diagonal vector into a row matrix
        d = np.diag(prec).reshape(1, prec.shape[0])
        d = 1. / np.sqrt(d)

        prec *= d
        prec *= d.T

    return prec

def make_swiss_roll(n_samples=100, noise=0.0, random_state=None):
    """Generate a swiss roll dataset.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of sample points on the S curve.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, 3]
        The points.

    t : array of shape [n_samples]
        The univariate position of the sample according to the main dimension
        of the points in the manifold.

    Notes
    -----
    The algorithm is from Marsland [1].

    References
    ----------
    .. [1] S. Marsland, "Machine Learning: An Algorithmic Perspective",
           Chapter 10, 2009.
           http://seat.massey.ac.nz/personal/s.r.marsland/Code/10/lle.py
    """
    generator = check_random_state(random_state)

    t = 1.5 * np.pi * (1 + 2 * generator.rand(1, n_samples))
    x = t * np.cos(t)
    y = 21 * generator.rand(1, n_samples)
    z = t * np.sin(t)

    X = np.concatenate((x, y, z))
    X += noise * generator.randn(3, n_samples)
    X = X.T
    t = np.squeeze(t)

    return X, t

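A usage sketch (illustrative); t is the position along the roll, often used to color manifold-learning plots:

from sklearn.datasets import make_swiss_roll

X, t = make_swiss_roll(n_samples=500, noise=0.05, random_state=0)
print(X.shape, t.shape)  # (500, 3) (500,)
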
def make_s_curve(n_samples=100, noise=0.0, random_state=None):
    """Generate an S curve dataset.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of sample points on the S curve.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, 3]
        The points.

    t : array of shape [n_samples]
        The univariate position of the sample according to the main dimension
        of the points in the manifold.
    """
    generator = check_random_state(random_state)

    t = 3 * np.pi * (generator.rand(1, n_samples) - 0.5)
    x = np.sin(t)
    y = 2.0 * generator.rand(1, n_samples)
    z = np.sign(t) * (np.cos(t) - 1)

    X = np.concatenate((x, y, z))
    X += noise * generator.randn(3, n_samples)
    X = X.T
    t = np.squeeze(t)

    return X, t

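Here t ranges over [-1.5π, 1.5π]; x = sin(t) and z = sign(t)(cos(t) - 1) trace the S shape, while y spreads the curve into a 2-D sheet. A minimal usage sketch (illustrative, not part of the original source):

    from sklearn.datasets import make_s_curve

    X, t = make_s_curve(n_samples=500, noise=0.1, random_state=0)
    print(X.shape)  # (500, 3)
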
def make_gaussian_quantiles(mean=None, cov=1., n_samples=100,
                            n_features=2, n_classes=3,
                            shuffle=True, random_state=None):
    """Generate isotropic Gaussian and label samples by quantile

    This classification dataset is constructed by taking a multi-dimensional
    standard normal distribution and defining classes separated by nested
    concentric multi-dimensional spheres such that roughly equal numbers of
    samples are in each class (quantiles of the :math:`\chi^2` distribution).

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    mean : array of shape [n_features], optional (default=None)
        The mean of the multi-dimensional normal distribution.
        If None then use the origin (0, 0, ...).

    cov : float, optional (default=1.)
        The covariance matrix will be this value times the unit matrix. This
        dataset only produces symmetric normal distributions.

    n_samples : int, optional (default=100)
        The total number of points equally divided among classes.

    n_features : int, optional (default=2)
        The number of features for each sample.

    n_classes : int, optional (default=3)
        The number of classes.

    shuffle : boolean, optional (default=True)
        Shuffle the samples.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels for quantile membership of each sample.

    Notes
    -----
    The dataset is from Zhu et al [1].

    References
    ----------
    .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.

    """
    if n_samples < n_classes:
        raise ValueError("n_samples must be at least n_classes")

    generator = check_random_state(random_state)

    if mean is None:
        mean = np.zeros(n_features)
    else:
        mean = np.array(mean)

    # Build multivariate normal distribution
    X = generator.multivariate_normal(mean, cov * np.identity(n_features),
                                      (n_samples,))

    # Sort by squared distance from the mean, so that contiguous slices of
    # the sorted samples correspond to nested spherical shells
    idx = np.argsort(np.sum((X - mean[np.newaxis, :]) ** 2, axis=1))
    X = X[idx, :]

    # Label by quantile: each of the n_classes shells gets `step` samples;
    # any remainder is assigned to the outermost class
    step = n_samples // n_classes

    y = np.hstack([np.repeat(np.arange(n_classes), step),
                   np.repeat(n_classes - 1, n_samples - step * n_classes)])

    if shuffle:
        X, y = util_shuffle(X, y, random_state=generator)

    return X, y

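A minimal usage sketch (illustrative, not part of the original source); class 0 is the innermost shell and class n_classes - 1 the outermost:

    from sklearn.datasets import make_gaussian_quantiles

    X, y = make_gaussian_quantiles(n_samples=300, n_features=2, n_classes=3,
                                   random_state=0)
    print(X.shape, y.shape)  # (300, 2) (300,)
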
def _shuffle(data, random_state=None):
    # Permute the rows and columns of ``data`` independently, returning the
    # shuffled array together with the row and column permutations used
    generator = check_random_state(random_state)
    n_rows, n_cols = data.shape
    row_idx = generator.permutation(n_rows)
    col_idx = generator.permutation(n_cols)
    result = data[row_idx][:, col_idx]
    return result, row_idx, col_idx

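For reference, a standalone sketch (illustrative only) mirroring what this private helper does, including how the returned permutations can be inverted to undo the shuffle:

    import numpy as np

    rng = np.random.RandomState(0)
    data = np.arange(12).reshape(3, 4)

    # Same logic as _shuffle: independent row and column permutations
    row_idx = rng.permutation(3)
    col_idx = rng.permutation(4)
    shuffled = data[row_idx][:, col_idx]

    # argsort of a permutation is its inverse, so the original layout
    # can be recovered from the returned indices
    restored = shuffled[np.argsort(row_idx)][:, np.argsort(col_idx)]
    assert (restored == data).all()
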
def make_biclusters(shape, n_clusters, noise=0.0, minval=10,
                    maxval=100, shuffle=True, random_state=None):
    """Generate an array with constant block diagonal structure for
    biclustering.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    shape : iterable (n_rows, n_cols)
        The shape of the result.

    n_clusters : integer
        The number of biclusters.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise.

    minval : int, optional (default=10)
        Minimum value of a bicluster.

    maxval : int, optional (default=100)
        Maximum value of a bicluster.

    shuffle : boolean, optional (default=True)
        Shuffle the samples.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape `shape`
        The generated array.

    rows : array of shape (n_clusters, X.shape[0],)
        The indicators for cluster membership of each row.

    cols : array of shape (n_clusters, X.shape[1],)
        The indicators for cluster membership of each column.

    References
    ----------

    .. [1] Dhillon, I. S. (2001, August). Co-clustering documents and
        words using bipartite spectral graph partitioning. In Proceedings
        of the seventh ACM SIGKDD international conference on Knowledge
        discovery and data mining (pp. 269-274). ACM.

    See also
    --------
    make_checkerboard
    """
    generator = check_random_state(random_state)
    n_rows, n_cols = shape
    consts = generator.uniform(minval, maxval, n_clusters)

    # row and column clusters of approximately equal sizes
    row_sizes = generator.multinomial(n_rows,
                                      np.repeat(1.0 / n_clusters,
                                                n_clusters))
    col_sizes = generator.multinomial(n_cols,
                                      np.repeat(1.0 / n_clusters,
                                                n_clusters))

    row_labels = np.hstack(list(np.repeat(val, rep) for val, rep in
                                zip(range(n_clusters), row_sizes)))
    col_labels = np.hstack(list(np.repeat(val, rep) for val, rep in
                                zip(range(n_clusters), col_sizes)))

    # Bicluster i is the outer product of row cluster i and column cluster i,
    # filled with a constant value drawn from [minval, maxval]
    result = np.zeros(shape, dtype=np.float64)
    for i in range(n_clusters):
        selector = np.outer(row_labels == i, col_labels == i)
        result[selector] += consts[i]

    if noise > 0:
        result += generator.normal(scale=noise, size=result.shape)

    if shuffle:
        result, row_idx, col_idx = _shuffle(result, random_state)
        row_labels = row_labels[row_idx]
        col_labels = col_labels[col_idx]

    # np.vstack expects a sequence of arrays, so materialize the indicator
    # rows as lists
    rows = np.vstack([row_labels == c for c in range(n_clusters)])
    cols = np.vstack([col_labels == c for c in range(n_clusters)])

    return result, rows, cols

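A minimal usage sketch (illustrative, not part of the original source):

    from sklearn.datasets import make_biclusters

    X, rows, cols = make_biclusters(shape=(300, 300), n_clusters=5, noise=5,
                                    shuffle=False, random_state=0)
    print(X.shape)     # (300, 300)
    print(rows.shape)  # (5, 300) -- boolean row-membership indicators
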
def make_checkerboard(shape, n_clusters, noise=0.0, minval=10,
                      maxval=100, shuffle=True, random_state=None):
    """Generate an array with block checkerboard structure for
    biclustering.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    shape : iterable (n_rows, n_cols)
        The shape of the result.

    n_clusters : integer or iterable (n_row_clusters, n_column_clusters)
        The number of row and column clusters.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise.

    minval : int, optional (default=10)
        Minimum value of a bicluster.

    maxval : int, optional (default=100)
        Maximum value of a bicluster.

    shuffle : boolean, optional (default=True)
        Shuffle the samples.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape `shape`
        The generated array.

    rows : array of shape (n_clusters, X.shape[0],)
        The indicators for cluster membership of each row.

    cols : array of shape (n_clusters, X.shape[1],)
        The indicators for cluster membership of each column.

    References
    ----------

    .. [1] Kluger, Y., Basri, R., Chang, J. T., & Gerstein, M. (2003).
        Spectral biclustering of microarray data: coclustering genes
        and conditions. Genome research, 13(4), 703-716.

    See also
    --------
    make_biclusters
    """
    generator = check_random_state(random_state)

    if hasattr(n_clusters, "__len__"):
        n_row_clusters, n_col_clusters = n_clusters
    else:
        n_row_clusters = n_col_clusters = n_clusters

    # row and column clusters of approximately equal sizes
    n_rows, n_cols = shape
    row_sizes = generator.multinomial(n_rows,
                                      np.repeat(1.0 / n_row_clusters,
                                                n_row_clusters))
    col_sizes = generator.multinomial(n_cols,
                                      np.repeat(1.0 / n_col_clusters,
                                                n_col_clusters))

    row_labels = np.hstack(list(np.repeat(val, rep) for val, rep in
                                zip(range(n_row_clusters), row_sizes)))
    col_labels = np.hstack(list(np.repeat(val, rep) for val, rep in
                                zip(range(n_col_clusters), col_sizes)))

    # Every (row cluster, column cluster) pair gets its own constant block,
    # yielding the checkerboard pattern
    result = np.zeros(shape, dtype=np.float64)
    for i in range(n_row_clusters):
        for j in range(n_col_clusters):
            selector = np.outer(row_labels == i, col_labels == j)
            result[selector] += generator.uniform(minval, maxval)

    if noise > 0:
        result += generator.normal(scale=noise, size=result.shape)

    if shuffle:
        result, row_idx, col_idx = _shuffle(result, random_state)
        row_labels = row_labels[row_idx]
        col_labels = col_labels[col_idx]

    # Materialize the generators as lists, since np.vstack expects a sequence
    # of arrays; one indicator per (row cluster, column cluster) pair
    rows = np.vstack([row_labels == label
                      for label in range(n_row_clusters)
                      for _ in range(n_col_clusters)])
    cols = np.vstack([col_labels == label
                      for _ in range(n_row_clusters)
                      for label in range(n_col_clusters)])

    return result, rows, cols

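A minimal usage sketch (illustrative, not part of the original source); with n_clusters=(4, 3) there are 4 * 3 = 12 blocks, so rows and cols each carry twelve indicator vectors:

    from sklearn.datasets import make_checkerboard

    X, rows, cols = make_checkerboard(shape=(300, 300), n_clusters=(4, 3),
                                      noise=10, shuffle=False, random_state=0)
    print(X.shape)     # (300, 300)
    print(rows.shape)  # (12, 300) -- one indicator per (row, column) cluster pair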