multiply
# Create a SparkContext and compute the product of an RDD with fold().
from pyspark import SparkConf, SparkContext

sc = SparkContext()
nums = sc.parallelize([1, 2, 3, 4, 5])

# fold() requires the identity element of the operator: 1 for multiplication.
mult = nums.fold(1, lambda a, b: a * b)
print(mult)
120
# Same RDD, summed with fold(); 0 is the additive identity.
accumulate = nums.fold(0, lambda a, b: a + b)
print(accumulate)
15
reduceByKey (word count)
!cat ./data.txt
crazy crazy fox jumped
crazy for jumped
fox is fast
fox is smart
dog is smart
# Read data.txt as an RDD of lines (one partition) and pull it to the driver.
lines = sc.textFile('data.txt', 1)
lines.collect()
['crazy crazy fox jumped',
'crazy for jumped',
'fox is fast ',
'fox is smart',
'dog is smart']
# Classic word count: tokenize each line, emit (word, 1), sum per key.
frequencies = (
    lines.flatMap(lambda line: line.split(' '))
         .map(lambda word: (word, 1))
         .reduceByKey(lambda a, b: a + b)
)
frequencies.collect()
[('crazy', 3),
('fox', 3),
('jumped', 2),
('for', 1),
('is', 3),
('fast', 1),
('', 1),
('smart', 2),
('dog', 1)]
# Number of distinct keys — note the empty-string token from a double space counts too.
frequencies. count( )
9
# Tokenize only: one element per word (split(' ') yields a '' token for a trailing space).
lines. flatMap( lambda x : x. split( ' ' ) ) . collect( )
['crazy',
'crazy',
'fox',
'jumped',
'crazy',
'for',
'jumped',
'fox',
'is',
'fast',
'',
'fox',
'is',
'smart',
'dog',
'is',
'smart']
# Intermediate view of the word count: each token paired with 1, before reduceByKey.
lines. flatMap( lambda x : x. split( ' ' ) ) . map ( lambda x : ( x, 1 ) ) . collect( )
[('crazy', 1),
('crazy', 1),
('fox', 1),
('jumped', 1),
('crazy', 1),
('for', 1),
('jumped', 1),
('fox', 1),
('is', 1),
('fast', 1),
('', 1),
('fox', 1),
('is', 1),
('smart', 1),
('dog', 1),
('is', 1),
('smart', 1)]
# Re-materialize the original lines RDD for comparison with the chained result below.
lines. collect( )
['crazy crazy fox jumped',
'crazy for jumped',
'fox is fast ',
'fox is smart',
'dog is smart']
# The same word count as `frequencies`, written as one chained expression.
lines.flatMap(lambda s: s.split(' ')).map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b).collect()
[('crazy', 3),
('fox', 3),
('jumped', 2),
('for', 1),
('is', 3),
('fast', 1),
('', 1),
('smart', 2),
('dog', 1)]
sum
# fold() once more: sum the integers 1..8.
nums = sc.parallelize(list(range(1, 9)))
total = nums.fold(0, lambda a, b: a + b)
print(total)
36
union
# union() simply concatenates two pair RDDs (duplicate keys are kept);
# reduceByKey afterwards merges the values that share a key.
r1 = sc.parallelize([('k1', 1), ('k2', 2), ('k3', 3)])
r2 = sc.parallelize([('k1', 3), ('k2', 4), ('k4', 8)])
r3 = r1.union(r2)
print('r3 :', r3.collect())
r4 = r3.reduceByKey(lambda a, b: a + b)
print('r4 :', r4.collect())
r3 : [('k1', 1), ('k2', 2), ('k3', 3), ('k1', 3), ('k2', 4), ('k4', 8)]
r4 : [('k1', 4), ('k3', 3), ('k4', 8), ('k2', 6)]
Word frequency
!cat './data.txt'
crazy crazy fox jumped over the fence
crazy fox jumped
the fence is high of fox
crazy fox is smart
fox jumped very high
# Reload data.txt — its contents have changed since the first read above.
lines2 = sc.textFile('./data.txt')
print(lines2.collect())
['crazy crazy fox jumped over the fence', 'crazy fox jumped', 'the fence is high of fox', 'crazy fox is smart', 'fox jumped very high']
# Tokenize each line of the current file into a list of words.
# BUG FIX: the original mapped over the stale `lines` RDD (the first read of
# data.txt) instead of the freshly reloaded `lines2`, so the updated file
# contents were silently ignored (the printed output showed old-file tokens).
lines2 = lines2.map(lambda line: line.split(' '))
print('lines2 is :')
print(lines2.collect())
lines2 is :
[['crazy', 'crazy', 'fox', 'jumped', 'over', 'the', 'fence'], ['crazy', 'fox', 'jumped'], ['the', 'fence', 'is', 'high', 'of', 'fox']]
# Flatten the per-line word lists into a single list ([] is the identity for
# list concatenation), then redistribute it as an RDD.
# BUG FIX: the original called sc.parallelize(bigarms_list) — a typo for
# `bigrams_list` that raises NameError.
bigrams_list = lines2.fold([], lambda acc, words: acc + words)
bigrams_list = sc.parallelize(bigrams_list)
print('bigrams list :')
print(bigrams_list.collect())
bigrams list :
['crazy', 'crazy', 'fox', 'jumped', 'over', 'the', 'fence', 'crazy', 'fox', 'jumped', 'the', 'fence', 'is', 'high', 'of', 'fox']
# Count occurrences of each word in the flattened token list.
word_counts = bigrams_list. map ( lambda x : ( x, 1 ) ) . reduceByKey( lambda x, y: x+ y)
# NOTE(review): count() here is the number of DISTINCT words, not the total
# number of tokens — so the "frequencies" below are relative to vocabulary
# size (values can exceed what a true relative frequency would be). Confirm
# whether the divisor was meant to be the total token count instead.
n_words = word_counts. count( )
word_frequency = word_counts. map ( lambda x : ( x[ 0 ] , float ( x[ 1 ] / n_words) ) )
print ( 'word frequency' )
print ( word_frequency. collect( ) )
word frequency
[('crazy', 0.3333333333333333), ('of', 0.1111111111111111), ('jumped', 0.2222222222222222), ('high', 0.1111111111111111), ('fence', 0.2222222222222222), ('fox', 0.3333333333333333), ('over', 0.1111111111111111), ('is', 0.1111111111111111), ('the', 0.2222222222222222)]