【spark】jieba + wordcount

import sys
# Python 2-only workaround: reloading sys makes setdefaultencoding callable
# again, and the call then switches the interpreter-wide default string
# encoding to UTF-8.
# NOTE(review): presumably here so the Chinese text processed below can be
# converted implicitly between str and unicode -- confirm it is still needed.
reload(sys)
sys.setdefaultencoding('utf-8')

from os import path
import jieba
from pyspark import SparkContext
from pyspark.sql import SQLContext
#from operator import add

# Directory that holds this script.
thisDir = path.dirname(__file__)

# Local Spark context limited to one worker thread, named "wordCount";
# log output is restricted to ERROR level.
sc = SparkContext(master="local[1]", appName="wordCount")
sc.setLogLevel("ERROR")

# SQL entry point built on top of the context above.
sqc = SQLContext(sc)

def wordCut(strings):
    """Segment a piece of text with jieba and return the tokens as a list.

    Leading and trailing whitespace is stripped from ``strings`` before it
    is handed to ``jieba.cut``; every item produced by the segmenter is kept,
    in order.
    """
    trimmed = strings.strip()
    return list(jieba.cut(trimmed))

# Input file, resolved relative to the directory of this script.
fileName = 'words.txt'
# Cached because three separate actions below (count, sum, flatMap) consume
# this RDD; without caching each action would read the file again.
file_in = sc.textFile(path.join(thisDir, fileName)).cache()

# print(...) with a single pre-formatted string behaves the same on
# Python 2 and Python 3.
linesNum = file_in.count()
print('[INFO]number of lines in file %s : %d' % (fileName, linesNum))

# sum() rather than reduce(): reduce() raises ValueError on an empty RDD,
# while sum() yields 0 for an empty file.
charsNum = file_in.map(len).sum()
print('[INFO]number of chars in file %s : %d' % (fileName, charsNum))

# Cached because the segmented tokens feed both the length filter and the
# word count, so the jieba segmentation only has to run once.
words = file_in.flatMap(wordCut).cache()
termBigger3 = words.filter(lambda word: len(word) > 3)
print('[INFO]number of words bigger than 3 in file %s : %d' % (fileName, termBigger3.count()))

# Word count: pair every token with 1, then add the ones up per token.
wordCount = words.map(lambda w: (w, 1)).reduceByKey(lambda x, y: x + y)
# Display the 20 most frequent tokens, highest count first.
# NOTE(review): the column types are inferred from the data, so an input with
# no tokens at all probably still fails here -- confirm if that case matters.
sqc.createDataFrame(wordCount, ['word', 'count']).sort('count', ascending=False).show(20)

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值