Spark Core篇(一)

combineByKey 实现 reduceByKey

# Configure Spark to run with the "local" master under the app name "WordCount",
# then create the context that the functions below use through the global `sc`.
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("WordCount")
sc = SparkContext(conf=conf)

def getSentences(nums):
    """Return whatever ``LoremIpsum().get_sentences(nums)`` produces.

    Args:
        nums: Amount forwarded unchanged to ``get_sentences``
            (presumably the number of sentences to generate).
    """
    generator = LoremIpsum()
    return generator.get_sentences(nums)

def wordCountApp(data):
    """Build an RDD of ``(word, 1)`` pairs from a collection of text lines.

    Blank lines are dropped, each remaining line is split on single spaces,
    and "." and "," are removed from every token. The pairs are returned
    un-aggregated so the caller can pick the aggregation to apply.

    Args:
        data: Local collection handed to ``sc.parallelize``; each element is
            treated as one line of text.

    Returns:
        An RDD of ``(word, 1)`` tuples.
    """
    def has_content(line):
        return len(line.strip()) != 0

    def split_on_spaces(line):
        return line.strip().split(" ")

    def drop_punctuation(word):
        return word.replace(".", "").replace(",", "")

    lines = sc.parallelize(data)
    words = lines.filter(has_content).flatMap(split_on_spaces).map(drop_punctuation)
    return words.map(lambda word: (word, 1))

if __name__ == '__main__':
    # Generate sample text, turn it into (word, 1) pairs and show them.
    nums = 1000
    data = [getSentences(nums)]
    result = wordCountApp(data)
    print(result.collect())

    # reduceByKey(lambda a, b: a + b) expressed through combineByKey.
    resRDD = result.combineByKey(
        # createCombiner: the first value seen for a key becomes the running
        # total as-is, so the result stays correct even if a value is not 1.
        lambda count: count,
        # mergeValue: fold another value of the same key into the total
        # within one partition.
        lambda acc, count: acc + count,
        # mergeCombiners: add up the per-partition totals of a key.
        lambda acc1, acc2: acc1 + acc2,
    )
    print(resRDD.collect())

combineByKey 实现 groupByKey

# Spark configuration: "local" master, application name "WordCount".
# The resulting context is read as the global `sc` by the functions below.
conf = SparkConf().setAppName("WordCount").setMaster("local")
sc = SparkContext(conf=conf)

def getSentences(nums):
    """Delegate to ``LoremIpsum().get_sentences`` and return its result.

    Args:
        nums: Passed through unchanged as the only argument of
            ``get_sentences`` (presumably a sentence count).
    """
    sentences = LoremIpsum().get_sentences(nums)
    return sentences

def wordCountApp(data):
    """Convert text lines into an RDD of ``(word, 1)`` pairs.

    Args:
        data: Local collection passed to ``sc.parallelize``; every element
            is handled as one line of text.

    Returns:
        An RDD holding one ``(word, 1)`` tuple per token. No aggregation is
        applied here.
    """
    lines = sc.parallelize(data)
    # Skip lines that are empty once surrounding whitespace is removed.
    non_blank = lines.filter(lambda line: len(line.strip()) != 0)
    # One token per single-space-separated piece of each line.
    tokens = non_blank.flatMap(lambda line: line.strip().split(" "))
    # Strip the "." and "," punctuation marks from each token.
    words = tokens.map(lambda word: word.replace(".", "").replace(",", ""))
    return words.map(lambda word: (word, 1))

if __name__ == '__main__':
    # Generate sample text, turn it into (word, 1) pairs and show them.
    nums = 1000
    data = [getSentences(nums)]
    result = wordCountApp(data)
    print(result.collect())

    def to_list(cnt):
        """createCombiner: start a key's group with its first value."""
        return [cnt]

    def append(acc, cnt):
        """mergeValue: add one more value to a key's group in a partition."""
        # list.append returns None, so the accumulator itself must be
        # returned; mutating in place avoids copying the list on every merge.
        acc.append(cnt)
        return acc

    def extend(acc1, acc2):
        """mergeCombiners: join a key's groups built on different partitions."""
        acc1.extend(acc2)
        return acc1

    # groupByKey expressed through combineByKey: collect every value of a key
    # into one list.
    resRDD = result.combineByKey(to_list, append, extend)
    print(resRDD.collect())
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值