References:
1. http://spark.apache.org/docs/latest/ml-guide.html
2. https://github.com/apache/spark/tree/v2.2.0
Data Types - RDD-based API
Local vector
import numpy as np
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors

# Use a NumPy array as a dense vector.
dv1 = np.array([1.0, 0.0, 3.0])  # [ 1.  0.  3.]

# Use a Python list as a dense vector.
dv2 = [1.0, 0.0, 3.0]

# Create a SparseVector.
sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
# <class 'pyspark.mllib.linalg.SparseVector'>
# (3,[0,2],[1.0,3.0])

# Use a single-column SciPy csc_matrix as a sparse vector.
sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])),
                     shape=(3, 1))
# <class 'scipy.sparse.csc.csc_matrix'>
'''
(0, 0)  1.0
(2, 0)  3.0
'''
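Dense and sparse vectors describe the same data, only the storage differs. A minimal sketch, continuing from the snippet above, of moving between the two forms; toArray() and dot() are part of the pyspark.mllib.linalg vector API:

from pyspark.mllib.linalg import Vectors

sv = Vectors.sparse(3, [0, 2], [1.0, 3.0])
dv = Vectors.dense([1.0, 0.0, 3.0])

# A SparseVector expands into a NumPy array with toArray().
print(sv.toArray())  # [ 1.  0.  3.]
# Operations such as dot() work across the two representations.
print(sv.dot(dv))    # 10.0  (1*1 + 0*0 + 3*3)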
Labeled point
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])

s = SparseVector(3, [0, 2], [1.0, 3.0])
# Create a labeled point with a negative label and a sparse feature vector.
neg = LabeledPoint(0.0, s)

print(pos)
print(type(pos))
print(s)
print(type(s))
print(neg)
print(type(neg))
'''
(1.0,[1.0,0.0,3.0])
<class 'pyspark.mllib.regression.LabeledPoint'>
(3,[0,2],[1.0,3.0])
<class 'pyspark.mllib.linalg.SparseVector'>
(0.0,(3,[0,2],[1.0,3.0]))
<class 'pyspark.mllib.regression.LabeledPoint'>
'''
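A LabeledPoint is just a (label, features) pair. A quick sketch, reusing the `pos` object built above, of pulling the two parts back out; `label` and `features` are documented attributes of LabeledPoint:

# The label is a plain float; the features come back as an MLlib vector.
print(pos.label)     # 1.0
print(pos.features)  # [1.0,0.0,3.0]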
from pyspark.mllib.util import MLUtils
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Load labeled points stored in LibSVM format as an RDD of LabeledPoints.
examples = MLUtils.loadLibSVMFile(sc, "data.txt")
print(examples)
print(type(examples))
print(examples.collect())
'''
PythonRDD[4] at RDD at PythonRDD.scala:48
<class 'pyspark.rdd.PipelinedRDD'>
[LabeledPoint(20.0, (0,[],[]))]
'''
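Each line of a LibSVM file has the form `label index1:value1 index2:value2 ...`, with 1-based feature indices. MLUtils also provides the counterpart for writing; a round-trip sketch continuing from the snippet above (the output directory name "out_libsvm" is made up for illustration):

from pyspark.mllib.util import MLUtils

# Write the RDD of LabeledPoints back out in LibSVM format.
MLUtils.saveAsLibSVMFile(examples, "out_libsvm")  # hypothetical output dir
# Reloading yields an equivalent RDD of LabeledPoints.
reloaded = MLUtils.loadLibSVMFile(sc, "out_libsvm")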
Local matrix
from pyspark.mllib.linalg import Matrix, Matrices

# Create a dense matrix ((1.0, 4.0), (2.0, 5.0), (3.0, 6.0));
# the values are laid out in column-major order.
dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])

# Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

print(dm2, '\n', type(dm2))
print(sm, '\n', type(sm))
"""
DenseMatrix([[ 1.,  4.],
             [ 2.,  5.],
             [ 3.,  6.]])
<class 'pyspark.mllib.linalg.DenseMatrix'>
3 X 2 CSCMatrix
(0,0) 9.0
(2,1) 6.0
(1,1) 8.0
<class 'pyspark.mllib.linalg.SparseMatrix'>
"""
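Matrices.dense fills values column by column, which is why [1..6] prints as [[1, 4], [2, 5], [3, 6]], and Matrices.sparse takes the CSC arrays (colPtrs, rowIndices, values). A short sketch, reusing dm2 and sm from above, confirming both layouts via toArray(), which both matrix classes provide:

# Column-major: [1, 2, 3] fills column 0, [4, 5, 6] fills column 1.
print(dm2.toArray())
# [[ 1.  4.]
#  [ 2.  5.]
#  [ 3.  6.]]

# CSC: colPtrs=[0, 1, 3] says column 0 holds one value and column 1 two;
# rowIndices/values place 9 at (0,0), 6 at (2,1), and 8 at (1,1).
print(sm.toArray())
# [[ 9.  0.]
#  [ 0.  8.]
#  [ 0.  6.]]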
RowMatrix
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Create an RDD of vectors.
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

# Create a RowMatrix from an RDD of vectors.
mat = RowMatrix(rows)

# Get its size.
m = mat.numRows()  # 4
n = mat.numCols()  # 3

# Get the rows as an RDD of vectors again.
rowsRDD = mat.rows

print(mat, '\n', type(mat))
print(rowsRDD, '\n', type(rowsRDD))
print(rowsRDD.collect())
"""
<pyspark.mllib.linalg.distributed.RowMatrix object at 0x7f56eb00a940>
<class 'pyspark.mllib.linalg.distributed.RowMatrix'>
MapPartitionsRDD[3] at mapPartitions at PythonMLLibAPI.scala:1335
<class 'pyspark.rdd.RDD'>
[DenseVector([1.0, 2.0, 3.0]), DenseVector([4.0, 5.0, 6.0]), DenseVector([7.0, 8.0, 9.0]), DenseVector([10.0, 11.0, 12.0])]
"""
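Beyond its dimensions, a RowMatrix supports distributed column-level statistics. A minimal sketch, continuing with the `mat` object from above, using computeColumnSummaryStatistics(), which returns a MultivariateStatisticalSummary (the printed values are what the four rows above should yield):

summary = mat.computeColumnSummaryStatistics()
print(summary.mean())      # per-column means: [ 5.5  6.5  7.5]
print(summary.variance())  # per-column variances
print(summary.count())     # number of rows: 4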
IndexedRowMatrix
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Create an RDD of indexed rows.
#   - This can be done explicitly with the IndexedRow class:
indexedRows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
                              IndexedRow(1, [4, 5, 6]),
                              IndexedRow(2, [7, 8, 9]),
                              IndexedRow(3, [10, 11, 12])])
#   - or by using (long, vector) tuples:
indexedRows = sc.parallelize([(0, [1, 2, 3]), (1, [4, 5, 6]),
                              (2, [7, 8, 9]), (3, [10, 11, 12])])

# Create an IndexedRowMatrix from an RDD of IndexedRows.
mat = IndexedRowMatrix(indexedRows)

# Get its size.
m = mat.numRows()  # 4
n = mat.numCols()  # 3

# Get the rows as an RDD of IndexedRows.
rowsRDD = mat.rows

# Convert to a RowMatrix by dropping the row indices.
rowMat = mat.toRowMatrix()
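An IndexedRowMatrix also converts to the other distributed forms, not just RowMatrix. A short sketch of the remaining conversions (same `mat` as above); both methods are part of the IndexedRowMatrix API:

# Convert to a CoordinateMatrix of (i, j, value) entries.
coordMat = mat.toCoordinateMatrix()
# Convert to a BlockMatrix for block-wise operations.
blockMat = mat.toBlockMatrix()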
CoordinateMatrix
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Create an RDD of coordinate entries.
#   - This can be done explicitly with the MatrixEntry class:
entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1),
                          MatrixEntry(6, 1, 3.7)])
#   - or using (long, long, float) tuples:
entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)])

# Create a CoordinateMatrix from an RDD of MatrixEntries.
mat = CoordinateMatrix(entries)

# Get its size.
m = mat.numRows()  # 3
n = mat.numCols()  # 2

# Get the entries as an RDD of MatrixEntries.
entriesRDD = mat.entries

# Convert to a RowMatrix.
rowMat = mat.toRowMatrix()
# Convert to an IndexedRowMatrix.
indexedRowMat = mat.toIndexedRowMatrix()
# Convert to a BlockMatrix.
blockMat = mat.toBlockMatrix()
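Because a CoordinateMatrix stores bare (i, j, value) entries, transposition only needs to swap the two indices in each entry; transpose() is part of the CoordinateMatrix API. A quick sketch continuing with `mat` from above:

# Transpose by swapping the row and column index of every entry.
matT = mat.transpose()
print(matT.numRows())  # 2
print(matT.numCols())  # 3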
BlockMatrix
from pyspark.mllib.linalg import Matrices
from pyspark.mllib.linalg.distributed import BlockMatrix
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Create an RDD of sub-matrix blocks.
blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
                         ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])

# Create a BlockMatrix from an RDD of sub-matrix blocks,
# with 3 rows and 2 columns per block.
mat = BlockMatrix(blocks, 3, 2)

# Get its size.
m = mat.numRows()  # 6
n = mat.numCols()  # 2

# Get the blocks as an RDD of sub-matrix blocks.
blocksRDD = mat.blocks

# Convert to a LocalMatrix.
localMat = mat.toLocalMatrix()
# Convert to an IndexedRowMatrix.
indexedRowMat = mat.toIndexedRowMatrix()
# Convert to a CoordinateMatrix.
coordinateMat = mat.toCoordinateMatrix()
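BlockMatrix is the distributed matrix type that supports distributed arithmetic. A minimal sketch, reusing the 6 x 2 `mat` built above, of add() and multiply(); note that add() requires the same block partitioning on both sides, and multiply() requires this matrix's colsPerBlock to match the other's rowsPerBlock:

# Element-wise sum with itself: blocks are partitioned identically.
summed = mat.add(mat)  # still 6 x 2

# Matrix product: transpose the 6 x 2 matrix to get 2 x 6, then multiply.
product = mat.multiply(mat.transpose())
print(product.numRows(), product.numCols())  # 6 6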