这几天在使用Python练习Spark官网上SparkMLlib编程指南中的IndexedRowMatrix示例代码时遇到了以下错误:
官网上的源码如下:
# NOTE: this excerpt uses an existing SparkContext named `sc` and the names
# IndexedRow / IndexedRowMatrix without defining or importing them here;
# they must already be in scope for the snippet to run.
# Create an RDD of indexed rows.
# - This can be done explicitly with the IndexedRow class:
indexedRows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
IndexedRow(1, [4, 5, 6]),
IndexedRow(2, [7, 8, 9]),
IndexedRow(3, [10, 11, 12])])
# - or by using (long, vector) tuples:
# indexedRows = sc.parallelize([(0, [1, 2, 3]), (1, [4, 5, 6]),
# (2, [7, 8, 9]), (3, [10, 11, 12])])
# Create an IndexedRowMatrix from an RDD of IndexedRows.
# NOTE(review): presumably this is the call that raises the AttributeError
# about `toDF` reported in this post when no SQLContext exists yet --
# confirm against the full traceback.
mat = IndexedRowMatrix(indexedRows)
# Get its size.
m = mat.numRows() # 4
n = mat.numCols() # 3
# Get the rows as an RDD of IndexedRows.
rowsRDD = mat.rows
# Convert to a RowMatrix by dropping the row indices.
rowMat = mat.toRowMatrix()
错误如下:
AttributeError: 'PipelinedRDD' object has no attribute 'toDF'
解决该问题后的完整代码如下:
# Stand-alone script: build an IndexedRowMatrix on a local Spark master,
# print its dimensions, and convert it to a RowMatrix.
from pyspark import SparkContext, SparkConf
# IndexedRow / IndexedRowMatrix are used below and must be imported
# explicitly; without this import the script fails with NameError.
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.sql import SQLContext

conf = SparkConf().setAppName('Data Types Index Row Matrix').setMaster('local[2]')
sc = SparkContext(conf=conf)

## Distributed matrices
# Row-indexed matrix (IndexedRowMatrix)
# Create an RDD of indexed rows.
# - This can be done explicitly with the IndexedRow class:
indexedRows = sc.parallelize([
    IndexedRow(0, [1, 2, 3]),
    IndexedRow(1, [4, 5, 6]),
    IndexedRow(2, [7, 8, 9]),
    IndexedRow(3, [10, 11, 12]),
])
# - or by using (long, vector) tuples:
# indexedRows = sc.parallelize([(0, [1, 2, 3]), (1, [4, 5, 6]),
#                               (2, [7, 8, 9]), (3, [10, 11, 12])])

# Fix for "AttributeError: 'PipelinedRDD' object has no attribute 'toDF'":
# IndexedRowMatrix converts the RDD to a DataFrame internally, and RDD.toDF
# is only attached to RDDs once a SQLContext (or SparkSession) has been
# created. The object itself is not used afterwards; creating it is enough.
sqlContext = SQLContext(sc)

# Create an IndexedRowMatrix from an RDD of IndexedRows.
mat = IndexedRowMatrix(indexedRows)

# Get its size.
m = mat.numRows()  # 4
n = mat.numCols()  # 3

# Get the rows as an RDD of IndexedRows.
rowsRDD = mat.rows

# Convert to a RowMatrix by dropping the row indices.
rowMat = mat.toRowMatrix()

print('行索引矩阵:')
print(m, n)
print(rowMat)

# Release the local Spark resources now that the demo is finished.
sc.stop()
其实也就是在原来的代码中加入以下代码片段:
from pyspark.sql import SQLContext
# Fix for "AttributeError: 'PipelinedRDD' object has no attribute 'toDF'":
# create a SQLContext from the existing SparkContext `sc` before the
# IndexedRowMatrix is constructed.
sqlContext = SQLContext(sc)
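最终问题就奇迹般地解决了。其实这并非奇迹:RDD 的 toDF 方法并不是 RDD 类本身自带的,而是在创建 SQLContext(或 SparkSession)时才被动态添加(monkey patch)到 RDD 上的;而 IndexedRowMatrix 在构造时内部会调用 toDF,因此必须先创建 SQLContext,否则就会出现上面的 AttributeError。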
参考资料
http://stackoverflow.com/questions/32788387/pipelinedrdd-object-has-no-attribute-todf-in-pyspark