# Implementing reduceByKey with combineByKey
# NOTE(review): SparkConf/SparkContext were used without ever being imported.
from pyspark import SparkConf, SparkContext

# Local, single-threaded Spark context for this word-count example.
conf = SparkConf().setMaster("local").setAppName("WordCount")
# getOrCreate avoids "Cannot run multiple SparkContexts" errors — this file
# constructs a context in two places.
sc = SparkContext.getOrCreate(conf=conf)
def getSentences(nums):
    """Generate *nums* lorem-ipsum sentences to use as sample input text."""
    generator = LoremIpsum()
    return generator.get_sentences(nums)
def wordCountApp(data):
    """Build an RDD of (word, 1) pairs from an iterable of text lines.

    Blank lines are dropped, lines are split on single spaces, and periods
    and commas are stripped from each token.
    """
    lines = sc.parallelize(data)
    non_empty = lines.filter(lambda line: len(line.strip()) != 0)
    tokens = non_empty.flatMap(lambda line: line.strip().split(" "))
    cleaned = tokens.map(lambda w: w.replace(".", "").replace(",", ""))
    # One-step alternative:
    # cleaned.map(lambda k: (k, 1)).reduceByKey(lambda a, b: a + b).sortByKey()
    return cleaned.map(lambda w: (w, 1))
if __name__ == '__main__':
    nums = 1000
    data = list()
    data.append(getSentences(nums))
    result = wordCountApp(data)
    print(result.collect())

    # reduceByKey equivalent built from combineByKey:
    #   createCombiner : first value seen for a key starts the running count
    #   mergeValue     : fold another value (always 1 here) into the count
    #   mergeCombiners : add partial counts from different partitions
    # Fixed: the original createCombiner (`lambda word: 1`) discarded the
    # value, which only works because every value happens to be 1;
    # `lambda v: v` is the faithful reduceByKey equivalent. The trailing
    # identity `.map(lambda x: (x[0], x[1]))` was a no-op and was removed.
    resRDD = result.combineByKey(
        lambda v: v,
        lambda acc, v: acc + v,
        lambda acc1, acc2: acc1 + acc2,
    )
    print(resRDD.collect())
# Implementing groupByKey with combineByKey
# NOTE(review): SparkConf/SparkContext were used without ever being imported.
from pyspark import SparkConf, SparkContext

# Local Spark context for the groupByKey-via-combineByKey example.
conf = SparkConf().setMaster("local").setAppName("WordCount")
# This file already created a context above; a plain SparkContext(conf=conf)
# here would raise "Cannot run multiple SparkContexts" — reuse it instead.
sc = SparkContext.getOrCreate(conf=conf)
def getSentences(nums):
    """Return *nums* placeholder (lorem ipsum) sentences as sample text."""
    lorem = LoremIpsum()
    sentences = lorem.get_sentences(nums)
    return sentences
def wordCountApp(data):
    """Parallelize *data* and emit one (word, 1) pair per cleaned token.

    Empty lines are filtered out; tokens come from splitting on single
    spaces, with periods and commas removed.
    """
    rdd = sc.parallelize(data)
    words = (
        rdd.filter(lambda ln: len(ln.strip()) != 0)
           .flatMap(lambda ln: ln.strip().split(" "))
           .map(lambda w: w.replace(".", "").replace(",", ""))
    )
    # Equivalent single expression:
    # words.map(lambda k: (k, 1)).reduceByKey(lambda a, b: a + b).sortByKey()
    return words.map(lambda w: (w, 1))
if __name__ == '__main__':
    nums = 1000
    data = list()
    data.append(getSentences(nums))
    result = wordCountApp(data)
    print(result.collect())

    # groupByKey equivalent built from combineByKey.
    # Fixed: the original lambdas used `acc + [cnt]` / `acc1 + acc2`, which
    # copy the whole accumulator for every element (quadratic work per key).
    # In-place append/extend that return the list are the canonical
    # combineByKey combiners and do the same grouping in linear time.
    # (A bare `acc.append(cnt)` would be wrong: it returns None.)
    def to_list(value):
        # createCombiner: start a new group from the first value of a key.
        return [value]

    def append_value(acc, value):
        # mergeValue: add one more value to an existing group.
        acc.append(value)
        return acc

    def merge_groups(acc1, acc2):
        # mergeCombiners: combine partial groups from different partitions.
        acc1.extend(acc2)
        return acc1

    resRDD = result.combineByKey(to_list, append_value, merge_groups)
    print(resRDD.collect())