References:
1. http://spark.apache.org/docs/latest/ml-guide.html
2. https://github.com/apache/spark/tree/v2.2.0
Data Types - RDD-based API
Local vector
import numpy as np
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors

# Use a NumPy array as a dense vector.
dv1 = np.array([1.0, 0.0, 3.0])  # [ 1.  0.  3.]

# Use a Python list as a dense vector.
dv2 = [1.0, 0.0, 3.0]

# Create a SparseVector.
sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
# <class 'pyspark.mllib.linalg.SparseVector'>
# (3,[0,2],[1.0,3.0])

# Use a single-column SciPy csc_matrix as a sparse vector.
sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])),
                     shape=(3, 1))
# <class 'scipy.sparse.csc.csc_matrix'>
'''
(0, 0)  1.0
(2, 0)  3.0
'''
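Dense and sparse vectors describe the same data, only the storage differs. A minimal sketch, continuing from the snippet above, of moving between the two forms; toArray() and dot() are part of the pyspark.mllib.linalg vector API:

from pyspark.mllib.linalg import Vectors

sv = Vectors.sparse(3, [0, 2], [1.0, 3.0])
dv = Vectors.dense([1.0, 0.0, 3.0])

# A SparseVector expands into a NumPy array with toArray().
print(sv.toArray())  # [ 1.  0.  3.]
# Operations such as dot() work across the two representations.
print(sv.dot(dv))    # 10.0  (1*1 + 0*0 + 3*3)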
Labeled point
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])

s = SparseVector(3, [0, 2], [1.0, 3.0])
# Create a labeled point with a negative label and a sparse feature vector.
neg = LabeledPoint(0.0, s)

print(pos)
print(type(pos))
print(s)
print(type(s))
print(neg)
print(type(neg))
'''
(1.0,[1.0,0.0,3.0])
<class 'pyspark.mllib.regression.LabeledPoint'>
(3,[0,2],[1.0,3.0])
<class 'pyspark.mllib.linalg.SparseVector'>
(0.0,(3,[0,2],[1.0,3.0]))
<class 'pyspark.mllib.regression.LabeledPoint'>
'''
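A LabeledPoint is just a (label, features) pair. A quick sketch, reusing the `pos` object built above, of pulling the two parts back out; `label` and `features` are documented attributes of LabeledPoint:

# The label is a plain float; the features come back as an MLlib vector.
print(pos.label)     # 1.0
print(pos.features)  # [1.0,0.0,3.0]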
from pyspark.mllib.util import MLUtils
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Load labeled points stored in LibSVM format as an RDD of LabeledPoints.
examples = MLUtils.loadLibSVMFile(sc, "data.txt")
print(examples)
print(type(examples))
print(examples.collect())
'''
PythonRDD[4] at RDD at PythonRDD.scala:48
<class 'pyspark.rdd.PipelinedRDD'>
[LabeledPoint(20.0, (0,[],[]))]
'''
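Each line of a LibSVM file has the form `label index1:value1 index2:value2 ...`, with 1-based feature indices. MLUtils also provides the counterpart for writing; a round-trip sketch continuing from the snippet above (the output directory name "out_libsvm" is made up for illustration):

from pyspark.mllib.util import MLUtils

# Write the RDD of LabeledPoints back out in LibSVM format.
MLUtils.saveAsLibSVMFile(examples, "out_libsvm")  # hypothetical output dir
# Reloading yields an equivalent RDD of LabeledPoints.
reloaded = MLUtils.loadLibSVMFile(sc, "out_libsvm")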
Local matrix
from pyspark.mllib.linalg import Matrix, Matrices

# Create a dense matrix ((1.0, 4.0), (2.0, 5.0), (3.0, 6.0));
# the values are laid out in column-major order.
dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])

# Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

print(dm2, '\n', type(dm2))
print(sm, '\n', type(sm))
"""
DenseMatrix([[ 1.,  4.],
             [ 2.,  5.],
             [ 3.,  6.]])
<class 'pyspark.mllib.linalg.DenseMatrix'>
3 X 2 CSCMatrix
(0,0) 9.0
(2,1) 6.0
(1,1) 8.0
<class 'pyspark.mllib.linalg.SparseMatrix'>
"""
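Matrices.dense fills values column by column, which is why [1..6] prints as [[1, 4], [2, 5], [3, 6]], and Matrices.sparse takes the CSC arrays (colPtrs, rowIndices, values). A short sketch, reusing dm2 and sm from above, confirming both layouts via toArray(), which both matrix classes provide:

# Column-major: [1, 2, 3] fills column 0, [4, 5, 6] fills column 1.
print(dm2.toArray())
# [[ 1.  4.]
#  [ 2.  5.]
#  [ 3.  6.]]

# CSC: colPtrs=[0, 1, 3] says column 0 holds one value and column 1 two;
# rowIndices/values place 9 at (0,0), 6 at (2,1), and 8 at (1,1).
print(sm.toArray())
# [[ 9.  0.]
#  [ 0.  8.]
#  [ 0.  6.]]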
RowMatrix
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Create an RDD of vectors.
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

# Create a RowMatrix from an RDD of vectors.
mat = RowMatrix(rows)

# Get its size.
m = mat.numRows()  # 4
n = mat.numCols()  # 3

# Get the rows as an RDD of vectors again.
rowsRDD = mat.rows

print(mat, '\n', type(mat))
print(rowsRDD, '\n', type(rowsRDD))
print(rowsRDD.collect())
"""
<pyspark.mllib.linalg.distributed.RowMatrix object at 0x7f56eb00a940>
<class 'pyspark.mllib.linalg.distributed.RowMatrix'>
MapPartitionsRDD[3] at mapPartitions at PythonMLLibAPI.scala:1335
<class 'pyspark.rdd.RDD'>
[DenseVector([1.0, 2.0, 3.0]), DenseVector([4.0, 5.0, 6.0]), DenseVector([7.0, 8.0, 9.0]), DenseVector([10.0, 11.0, 12.0])]
"""
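Beyond its dimensions, a RowMatrix supports distributed column-level statistics. A minimal sketch, continuing with the `mat` object from above, using computeColumnSummaryStatistics(), which returns a MultivariateStatisticalSummary (the printed values are what the four rows above should yield):

summary = mat.computeColumnSummaryStatistics()
print(summary.mean())      # per-column means: [ 5.5  6.5  7.5]
print(summary.variance())  # per-column variances
print(summary.count())     # number of rows: 4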
IndexedRowMatrix
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Create an RDD of indexed rows.
#   - This can be done explicitly with the IndexedRow class:
indexedRows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
                              IndexedRow(1, [4, 5, 6]),
                              IndexedRow(2, [7, 8, 9]),
                              IndexedRow(3, [10, 11, 12])])
#   - or by using (long, vector) tuples:
indexedRows = sc.parallelize([(0, [1, 2, 3]), (1, [4, 5, 6]),
                              (2, [7, 8, 9]), (3, [10, 11, 12])])

# Create an IndexedRowMatrix from an RDD of IndexedRows.
mat = IndexedRowMatrix(indexedRows)

# Get its size.
m = mat.numRows()  # 4
n = mat.numCols()  # 3

# Get the rows as an RDD of IndexedRows.
rowsRDD = mat.rows

# Convert to a RowMatrix by dropping the row indices.
rowMat = mat.toRowMatrix()
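An IndexedRowMatrix also converts to the other distributed forms, not just RowMatrix. A short sketch of the remaining conversions (same `mat` as above); both methods are part of the IndexedRowMatrix API:

# Convert to a CoordinateMatrix of (i, j, value) entries.
coordMat = mat.toCoordinateMatrix()
# Convert to a BlockMatrix for block-wise operations.
blockMat = mat.toBlockMatrix()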
CoordinateMatrix
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Create an RDD of coordinate entries.
#   - This can be done explicitly with the MatrixEntry class:
entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1),
                          MatrixEntry(6, 1, 3.7)])
#   - or using (long, long, float) tuples:
entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)])

# Create a CoordinateMatrix from an RDD of MatrixEntries.
mat = CoordinateMatrix(entries)

# Get its size.
m = mat.numRows()  # 3
n = mat.numCols()  # 2

# Get the entries as an RDD of MatrixEntries.
entriesRDD = mat.entries

# Convert to a RowMatrix.
rowMat = mat.toRowMatrix()
# Convert to an IndexedRowMatrix.
indexedRowMat = mat.toIndexedRowMatrix()
# Convert to a BlockMatrix.
blockMat = mat.toBlockMatrix()
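Because a CoordinateMatrix stores bare (i, j, value) entries, transposition only needs to swap the two indices in each entry; transpose() is part of the CoordinateMatrix API. A quick sketch continuing with `mat` from above:

# Transpose by swapping the row and column index of every entry.
matT = mat.transpose()
print(matT.numRows())  # 2
print(matT.numCols())  # 3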
BlockMatrix
from pyspark.mllib.linalg import Matrices
from pyspark.mllib.linalg.distributed import BlockMatrix
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Create an RDD of sub-matrix blocks.
blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
                         ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])

# Create a BlockMatrix from an RDD of sub-matrix blocks,
# with 3 rows and 2 columns per block.
mat = BlockMatrix(blocks, 3, 2)

# Get its size.
m = mat.numRows()  # 6
n = mat.numCols()  # 2

# Get the blocks as an RDD of sub-matrix blocks.
blocksRDD = mat.blocks

# Convert to a LocalMatrix.
localMat = mat.toLocalMatrix()
# Convert to an IndexedRowMatrix.
indexedRowMat = mat.toIndexedRowMatrix()
# Convert to a CoordinateMatrix.
coordinateMat = mat.toCoordinateMatrix()
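BlockMatrix is the distributed matrix type that supports distributed arithmetic. A minimal sketch, reusing the 6 x 2 `mat` built above, of add() and multiply(); note that add() requires the same block partitioning on both sides, and multiply() requires this matrix's colsPerBlock to match the other's rowsPerBlock:

# Element-wise sum with itself: blocks are partitioned identically.
summed = mat.add(mat)  # still 6 x 2

# Matrix product: transpose the 6 x 2 matrix to get 2 x 6, then multiply.
product = mat.multiply(mat.transpose())
print(product.numRows(), product.numCols())  # 6 6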