我想统计二元组(bigram)出现的频率,并列出最常见的 5 个。我可以把结果打印出来,但当我尝试用 saveAsTextFile 把结果保存为 .out 文件时,会出现“属性错误”(AttributeError)。代码如下:
from __future__ import print_function
import sys
from operator import add
from pyspark import SparkContext
# Number of most-frequent bigrams that are printed and saved.
TOP_N = 100


def normalize_text(lines):
    """Join the lines of one partition into a single lower-cased string.

    Commas, periods and curly braces are replaced by spaces so that they
    do not stick to the neighbouring words when the text is tokenised.

    Args:
        lines: iterable of strings (the lines held by one RDD partition).

    Returns:
        The joined, punctuation-stripped, lower-cased text.
    """
    text = " ".join(lines)
    for ch in ",.{}":
        text = text.replace(ch, " ")
    return text.lower()


def bigram_pairs(words):
    """Return ``((w[i], w[i+1]), 1)`` for every adjacent pair in *words*.

    Args:
        words: list of tokens in reading order.

    Returns:
        A list of ``(bigram, 1)`` tuples; empty when fewer than two words
        are given.
    """
    return [((first, second), 1) for first, second in zip(words, words[1:])]


def main(argv):
    """Count bigrams in the text file named by ``argv[1]``.

    The ``TOP_N`` most frequent ``(count, bigram)`` rows are printed and
    written to the ``result.out`` output directory.

    Args:
        argv: command-line arguments; ``argv[1]`` is the input path.

    Raises:
        SystemExit: with status -1 when the argument count is wrong.
    """
    if len(argv) != 2:
        print("Usage: bigram <file>", file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is only
        # injected by the site module.
        sys.exit(-1)

    sc = SparkContext()
    try:
        lines = sc.textFile(argv[1], 1)

        # NOTE(review): normalize_text() already turns every "." into a
        # space, so the split below never cuts anything and bigrams may
        # span sentence boundaries. Kept as in the original; confirm intent.
        sentences = lines.glom() \
            .map(normalize_text) \
            .flatMap(lambda text: text.split("."))

        bigram_pair = sentences.map(lambda sentence: sentence.split()) \
            .flatMap(bigram_pairs)

        # Swap to (count, bigram) so sortByKey can order by frequency,
        # highest first.
        ranked = bigram_pair.reduceByKey(add) \
            .map(lambda pair: (pair[1], pair[0])) \
            .sortByKey(False)

        # take() is an action: it returns a plain Python list, not an RDD.
        frequency = ranked.take(TOP_N)
        print(frequency)

        # A list has no saveAsTextFile(); turn the rows back into an RDD
        # first. A single partition keeps the sorted order in one part
        # file. Spark creates "result.out" as a directory.
        sc.parallelize(frequency, 1).saveAsTextFile("result.out")
    finally:
        # Release the Spark context even when the job fails.
        sc.stop()


if __name__ == "__main__":
    main(sys.argv)