声明:版权所有,转载请联系作者并注明出处 http://blog.csdn.net/u013719780?viewmode=contents
博主简介:风雪夜归子(Allen),机器学习算法攻城狮,喜爱钻研Machine Learning的黑科技,对Deep Learning和Artificial Intelligence充满兴趣,经常关注Kaggle数据挖掘竞赛平台,对数据、Machine Learning和Artificial Intelligence有兴趣的童鞋可以一起探讨哦,个人CSDN博客:http://blog.csdn.net/u013719780?viewmode=contents
本文我们将建立一个简单的单词统计应用
创建rdd
# Base data set: a small word list distributed over 4 partitions.
# NOTE(review): `sc` is presumably the SparkContext supplied by the
# notebook / pyspark shell -- it is not defined in this file.
wordsList = ['cat', 'elephant', 'rat', 'rat', 'cat']
wordsRDD = sc.parallelize(wordsList, 4)
# Print out the type of wordsRDD. A parenthesised single argument prints
# identically on Python 2 and Python 3.
print(type(wordsRDD))
将单词变成复数形式并且进行测试
# One way of completing the function
def makePlural(word):
    """Return the plural form of a word by appending an 's'.

    Args:
        word (str): A string.

    Returns:
        str: The input with 's' appended; no irregular plurals are handled.
    """
    return word + 's'
# Quick manual check of makePlural; this print form works on Python 2 and 3.
print(makePlural('cat'))
测试结果是否正确,如果不正确就返回'incorrect result: makePlural does not add an s'
# Make sure to rerun any cell you change before trying the test again.
# Test comes from the course-provided test_helper module; the last argument
# is the message reported when the comparison fails.
from test_helper import Test
Test.assertEquals(makePlural('rat'), 'rats', 'incorrect result: makePlural does not add an s')
应用makePlural()函数到rdd上
# Apply makePlural to every element of the base RDD.
pluralRDD = wordsRDD.map(makePlural)
print(pluralRDD.collect())
# TEST Apply makePlural to the base RDD(1c)
Test.assertEquals(pluralRDD.collect(), ['cats', 'elephants', 'rats', 'rats', 'cats'],
                  'incorrect values for pluralRDD')
使用lambda函数实现将单词变成复数形式的功能
# Same transformation as makePlural, expressed as an inline lambda.
pluralLambdaRDD = wordsRDD.map(lambda word: word + 's')
print(pluralLambdaRDD.collect())
# TEST Pass a lambda function to map (1d)
Test.assertEquals(pluralLambdaRDD.collect(), ['cats', 'elephants', 'rats', 'rats', 'cats'],
                  'incorrect values for pluralLambdaRDD (1d)')
计算每个单词的长度
# Length of each pluralised word; the builtin len is passed directly
# instead of wrapping it in a lambda.
pluralLengths = (pluralRDD
                 .map(len)
                 .collect())
print(pluralLengths)
# TEST Length of each word (1e)
Test.assertEquals(pluralLengths, [4, 9, 4, 4, 4],
                  'incorrect values for pluralLengths')
接下来创建pair RDD(键值对RDD)
# Turn every word into a (word, 1) pair so it can be aggregated by key.
wordPairs = wordsRDD.map(lambda word: (word, 1))
print(wordPairs.collect())
# TEST Pair RDDs (1f)
Test.assertEquals(wordPairs.collect(),
                  [('cat', 1), ('elephant', 1), ('rat', 1), ('rat', 1), ('cat', 1)],
                  'incorrect value for wordPairs')
下面我们将对每个单词统计其出现的次数,实现这个目标有很多方法。
# Approach 1: group all the 1s of each word together with groupByKey.
# Note that groupByKey requires no parameters.
wordsGrouped = wordPairs.groupByKey()
for key, value in wordsGrouped.collect():
    print('{0}: {1}'.format(key, list(value)))
# TEST groupByKey() approach (2a)
Test.assertEquals(sorted(wordsGrouped.mapValues(list).collect()),
                  [('cat', [1, 1]), ('elephant', [1]), ('rat', [1, 1])],
                  'incorrect value for wordsGrouped')
# Sum the grouped 1s per word. mapValues(sum) replaces the tuple-parameter
# lambda `lambda (k, v): ...`, which is Python-2-only syntax.
wordCountsGrouped = wordsGrouped.mapValues(sum)
print(wordCountsGrouped.collect())
# TEST Use groupByKey() to obtain the counts (2b)
Test.assertEquals(sorted(wordCountsGrouped.collect()),
                  [('cat', 2), ('elephant', 1), ('rat', 2)],
                  'incorrect value for wordCountsGrouped')
用reduceByKey()实现统计每个单词出现的次数的任务
# Approach 2: reduceByKey.
# Note that reduceByKey takes in a function that accepts two values and returns a single value.
wordCounts = wordPairs.reduceByKey(lambda x, y: x + y)
print(wordCounts.collect())
# TEST Counting using reduceByKey (2c)
Test.assertEquals(sorted(wordCounts.collect()), [('cat', 2), ('elephant', 1), ('rat', 2)],
                  'incorrect value for wordCounts')
# Chain both steps (pairing and reducing) into a single expression.
wordCountsCollected = (wordsRDD
                       .map(lambda word: (word, 1))
                       .reduceByKey(lambda x, y: x + y)
                       .collect())
print(wordCountsCollected)
# TEST All together
Test.assertEquals(sorted(wordCountsCollected), [('cat', 2), ('elephant', 1), ('rat', 2)],
                  'incorrect value for wordCountsCollected')
统计不同单词的个数
# wordCounts holds one (word, count) pair per distinct word, so its size is
# the number of unique words.
uniqueWords = wordCounts.count()
print(uniqueWords)
# TEST Unique words
Test.assertEquals(uniqueWords, 3, 'incorrect count of uniqueWords')
计算每个单词平均出现的次数
from operator import add
# Total number of words = sum of all per-word counts. values() drops the
# keys, replacing the Python-2-only tuple-parameter lambda `lambda (k, v): v`.
totalCount = (wordCounts
              .values()
              .reduce(add))
# float() forces true division on Python 2 as well.
average = totalCount / float(wordCounts.count())
print(totalCount)
print(round(average, 2))
# TEST Mean using reduce
Test.assertEquals(round(average, 2), 1.67, 'incorrect value of average')
接下来看一个完整的单词统计在文本上的应用
首先,定义一个单词统计的函数
# TODO: Replace <FILL IN> with appropriate code
def wordCount(wordListRDD):
    """Creates a pair RDD with word counts from an RDD of words.

    Args:
        wordListRDD (RDD of str): An RDD consisting of words.

    Returns:
        RDD of (str, int): An RDD consisting of (word, count) tuples.
            The result is still an RDD; nothing is collected here.
    """
    # Pair every word with 1, then add up the 1s that share the same word.
    wordPairsRDD = wordListRDD.map(lambda word: (word, 1))
    return wordPairsRDD.reduceByKey(lambda x, y: x + y)
# Run wordCount on the small sample RDD and check the result.
print(wordCount(wordsRDD).collect())
# TEST wordCount function (4a)
Test.assertEquals(sorted(wordCount(wordsRDD).collect()),
                  [('cat', 2), ('elephant', 1), ('rat', 2)],
                  'incorrect definition for wordCount function')
接下来将文本中的标点符号去掉并将字母都转换为小写
# TODO: Replace <FILL IN> with appropriate code
import re
import string
def removePunctuation(text):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only whitespace, letters, and numbers are retained. Every other
        character is eliminated (e.g. it's becomes its), including the
        underscore and non-ASCII punctuation such as curly quotes or dashes.
        Leading and trailing whitespace is removed after punctuation is removed.

    Args:
        text (str): A text string.

    Returns:
        str: The cleaned up string.
    """
    # `[^\w\s]` drops everything that is neither a letter/digit nor whitespace.
    # `\w` also matches '_', so the underscore is removed explicitly.
    # re.UNICODE keeps the behaviour the same for unicode text on Python 2.
    cleaned = re.sub(r'[^\w\s]|_', '', text, flags=re.UNICODE)
    return cleaned.lower().strip()
# Manual checks followed by the automated test for removePunctuation.
print(removePunctuation('Hi, you!'))
print(removePunctuation(' No under_score!'))
# TEST Capitalization and punctuation
Test.assertEquals(removePunctuation(" The Elephant's 4 cats. "),
                  'the elephants 4 cats',
                  'incorrect definition for removePunctuation function')
import os.path
# NOTE(review): this is the author's local download path; point it at your
# own copy of shakespeare.txt before running.
fileName = os.path.join('/Users/youwei.tan/Downloads', 'shakespeare.txt')
# Read the file with 8 partitions and clean every line.
shakespeareRDD = (sc
                  .textFile(fileName, 8)
                  .map(removePunctuation))
# Show the first 15 cleaned lines, prefixed with their line number. The
# pair is indexed because tuple-parameter lambdas are Python-2-only syntax.
print('\n'.join(shakespeareRDD
                .zipWithIndex()  # to (line, lineNum)
                .map(lambda pair: '{0}: {1}'.format(pair[1], pair[0]))  # to 'lineNum: line'
                .take(15)))
在使用wordCount()函数之前,需要完成两个任务: 1、以空格对字符串进行分割 2、过滤掉空行
# Split every line on whitespace and flatten the result into one RDD of words.
shakespeareWordsRDD = shakespeareRDD.flatMap(lambda line: line.split())
shakespeareWordCount = shakespeareWordsRDD.count()
print(shakespeareWordsRDD.top(5))
print(shakespeareWordCount)
# TEST Words from lines
# This test allows for leading spaces to be removed either before or after
# punctuation is removed.
Test.assertTrue(shakespeareWordCount == 903705 or shakespeareWordCount == 928908,
                'incorrect value for shakespeareWordCount')
Test.assertEquals(shakespeareWordsRDD.top(5),
                  [u'zwaggerd', u'zounds', u'zounds', u'zounds', u'zounds'],
                  'incorrect value for shakespeareWordsRDD')
# split() without arguments never yields empty strings, so blank lines
# contribute no elements and no extra filter step is needed.
shakeWordsRDD = shakespeareWordsRDD
shakeWordCount = shakeWordsRDD.count()
print(shakeWordCount)
# TEST Remove empty elements
Test.assertEquals(shakeWordCount, 903705, 'incorrect value for shakeWordCount')
实现空格分割和过滤掉空行的另一种方法
# Alternative: split each line into a word list, drop the empty lists
# (blank lines), then flatten the remaining lists into single words.
(shakespeareRDD
 .map(lambda line: line.split())
 .filter(lambda words: len(words) > 0)
 .flatMap(lambda words: words)
 .take(10))
接下来统计单词
# The 15 most frequent words: takeOrdered sorts ascending, so the count is
# negated to get descending order. The pair is indexed because
# tuple-parameter lambdas are Python-2-only syntax.
top15WordsAndCounts = wordCount(shakeWordsRDD).takeOrdered(15, key=lambda pair: -pair[1])
print('\n'.join('{0}: {1}'.format(word, count) for word, count in top15WordsAndCounts))