Code:
#coding:utf8
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setMaster("local[*]").setAppName("WordCountHelloWord")
    # Build the SparkContext from the SparkConf object
    sc = SparkContext(conf=conf)
    # Read the input file
    file_rdd = sc.textFile("hdfs://hadoop105:8020/study/spark/data/words.txt")
    # Split each line into words, yielding one RDD containing all the words
    words_rdd = file_rdd.flatMap(lambda line: line.split(" "))
    # Map each word to a tuple: the key is the word, the value is the count 1
    words_with_one_rdd = words_rdd.map(lambda x: (x, 1))
    # Group the tuples by key and aggregate (sum) all the values per key
    result_rdd = words_with_one_rdd.reduceByKey(lambda a, b: a + b)
    # Collect the RDD's data to the driver and print the result
    print(result_rdd.collect())
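For reference, if words.txt contained, say, the lines "hello spark" and "hello hadoop" (hypothetical contents, since the actual file on HDFS is not shown), collect() would bring the aggregated pairs back to the driver and the script would print something like:

    [('hello', 2), ('spark', 1), ('hadoop', 1)]

(the order of the tuples after reduceByKey is not guaranteed).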
The error is:
Traceback (most recent call last):
File "/tmp/pycharm_project_909/00_example/HelloWord.py", line 21, in <module>
result_rdd = words_with_one_rdd.reduceByKey(lambda a, b: a + b)
File "/opt/module/anaconda3/envs/pyspark/lib/python3.8/site-packages/pyspark/rdd.py", line 3552, in reduceByKey
return self.combineByKey(lambda x: x, func, func, numPartitions, partitionFunc)
File "/opt/module/anaconda3/envs/pyspark/lib/python3.8/site-packages/pyspark/rdd.py", line 3987, in combineByKey
shuffled = locally_combined.partitionBy(numPartitions, partitionFunc)
File "/opt/module/anaconda3/envs/pyspark/lib/python3.8/site-packages/pyspark/rdd.py", line 3897, in partitionBy
pairRDD = self.ctx._jvm.PairwiseRDD(keyed._jrdd.rdd()).asJavaPairRDD()
File "/opt/module/anaconda3/envs/pyspark/lib/python3.8/site-packages/pyspark/rdd.py", line 5470, in _jrdd
wrapped_func = _wrap_function(
File "/opt/module/anaconda3/envs/pyspark/lib/python3.8/site-packages/pyspark/rdd.py", line 5270, in _wrap_function
return sc._jvm.SimplePythonFunction(
TypeError: 'JavaPackage' object is not callable
TypeError: 'JavaPackage' object is not callable means the Python side could not find the expected class on the JVM side; here it is caused by the pip-installed pyspark library being too new relative to the local Spark installation.
Solution: reinstall pyspark at the version matching the Spark installation:
pip install pyspark==3.2.0
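As a quick sanity check after reinstalling, you can print the library version from Python (a minimal sketch; 3.2.0 is assumed here to be the version of the local Spark installation):

    import pyspark
    # Expect 3.2.0; a mismatch with the Spark installation can reproduce the error above
    print(pyspark.__version__)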