pyspark+nltk处理文本数据

环境条件:Hadoop 2.6.0、Spark 1.6.0、Python 2.7;运行前请先下载示例代码和测试数据。

代码如下:

from pyspark import SparkContext
# Local-mode Spark context named "pyspark"; adjust the master URL for a cluster run.
sc=SparkContext('local','pyspark') 
# Each line of the input file is one "%#%"-delimited record; the last field is free text.
data=sc.textFile("hdfs:/user/hadoop/test.txt")
import nltk
from nltk.corpus import stopwords  # requires the 'stopwords' corpus to be downloaded via nltk.download()
from functools import reduce
def filter_content(content):
    """Tokenize, filter and stem one "%#%"-delimited record.

    The text payload is everything after the last "%#%" separator; the
    fields before it are preserved verbatim as a prefix. The payload is
    sentence- then word-tokenized with NLTK, lowercased, stripped of
    English stopwords and punctuation, reduced to nouns and verbs via
    POS tagging, and Porter-stemmed.

    Args:
        content: one input line, fields joined by "%#%".

    Returns:
        prefix + "%#%" + "stem1 stem2 ..." + "\\n" when any stems survive;
        prefix + "%#%" + "\\n" when the second field is non-empty but no
        stems survive; "" otherwise.
    """
    parts = content.split("%#%")
    prefix = "%#%".join(parts[:-1])  # safe for records with no separator (join of [] is "")
    text = parts[-1]

    # Sentence-split first so word_tokenize sees well-formed sentences.
    sentences = nltk.sent_tokenize(text)
    tokens = [word.lower()
              for sentence in sentences
              for word in nltk.word_tokenize(sentence)]

    # Hoist both filter sets out of the per-token test: the original called
    # stopwords.words('english') (a corpus read returning a list) for every
    # single token, making each membership test O(corpus size).
    stop_words = set(stopwords.words('english'))
    punctuation = {'/', '^', '-', '+', '<', '>', '{', '}', '*', '//', ',', '.',
                   ':', ';', '?', '(', ')', '[', ']', '&', '!', '@', '|', '#',
                   '$', '%', '"', "'", "''", '""', '`', '``'}
    tokens = [t for t in tokens if t not in stop_words and t not in punctuation]

    # Keep only nouns and verbs: Penn Treebank tags starting with 'N' or 'V'.
    tokens = [tok for tok, tag in nltk.pos_tag(tokens) if tag[0] in ('N', 'V')]

    # One stemmer instance for all tokens (the original built one per token).
    # Alternative normalizers (LancasterStemmer, WordNetLemmatizer) could be
    # merged in here if broader matching is wanted.
    stemmer = nltk.PorterStemmer()
    stems = [stemmer.stem(tok) for tok in tokens]

    if stems:
        return prefix + "%#%" + " ".join(stems) + '\n'
    # The original indexed parts[1] unconditionally, raising IndexError on
    # records with fewer than two fields; guard the lookup instead.
    if len(parts) > 1 and parts[1]:
        return prefix + "%#%" + '\n'
    return ''
#filter_content("%#%I am a good boy.")
# Preprocess every record. cache() keeps the mapped RDD in memory so the
# two actions below (saveAsTextFile and collect) do not each recompute
# the full tokenize/tag/stem pipeline from scratch.
data = data.map(filter_content).cache()
# Persist the result on HDFS (the target directory must not already exist).
data.saveAsTextFile("hdfs:/user/hadoop/test_result")
# Also mirror the collected result to a local file for quick inspection.
with open("/home/snow/zzwork/test_result.txt", "w") as fw:
    fw.writelines(str(var) for var in data.collect())




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值