Creating RDDs from Parallelized Collections
parallelize and reduce
data = [1, 2, 3, 4, 5]
distData1 = sc.parallelize(data)
distData1.reduce(lambda a, b: a + b)
15
glom and collect
distData2 = sc.parallelize(data, 3)
distData2.glom().collect()
[[1], [2, 3], [4, 5]]
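As a quick check, the partition count set by parallelize can also be read directly (a minimal sketch reusing distData2 from above):
distData2.getNumPartitions()
3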
Creating RDDs from External Datasets
textFile and map
distData3 = sc.textFile('E:/spark-2.3.0-bin-hadoop2.7/data/streaming/AFINN-111.txt')
distData3.map(lambda s: len(s)).reduce(lambda a, b: a + b)
25616
distData4 = sc.textFile('E:/spark-2.3.0-bin-hadoop2.7/data/graphx/users.txt', 4)
distData4.glom().collect()
[['1,BarackObama,Barack Obama', '2,ladygaga,Goddess of Love'],
['3,jeresig,John Resig', '4,justinbieber,Justin Bieber'],
['6,matei_zaharia,Matei Zaharia'],
['7,odersky,Martin Odersky', '8,anonsys']]
wholeTextFiles
distData5 = sc.wholeTextFiles('E:/spark-2.3.0-bin-hadoop2.7/data/graphx/')
distData5.collect()
[('file:/E:/spark-2.3.0-bin-hadoop2.7/data/graphx/followers.txt',
'2 1\n4 1\n1 2\n6 3\n7 3\n7 6\n6 7\n3 7\n'),
('file:/E:/spark-2.3.0-bin-hadoop2.7/data/graphx/users.txt',
'1,BarackObama,Barack Obama\n2,ladygaga,Goddess of Love\n3,jeresig,John Resig\n4,justinbieber,Justin Bieber\n6,matei_zaharia,Matei Zaharia\n7,odersky,Martin Odersky\n8,anonsys\n')]
saveAsPickleFile
distData4.saveAsPickleFile('pickle')
pickleFile
sc.pickleFile('pickle').collect()
['1,BarackObama,Barack Obama',
'2,ladygaga,Goddess of Love',
'3,jeresig,John Resig',
'4,justinbieber,Justin Bieber',
'6,matei_zaharia,Matei Zaharia',
'7,odersky,Martin Odersky',
'8,anonsys']
saveAsSequenceFile
sequencedata = [('jack', 100), ('Tom', 99), ('Jerry', 90)]
distData6 = sc.parallelize(sequencedata)
distData6.saveAsSequenceFile('sequence')
sequenceFile
sc.sequenceFile('sequence').collect()
[('jack', 100), ('Tom', 99), ('Jerry', 90)]
RDD Operations
Transformations (Value type)
map and flatMap
rdd1 = sc.parallelize([2, 3, 4])
rdd2 = rdd1.map(lambda x: range(1, x))
rdd3 = rdd1.flatMap(lambda x: range(1, x))
print(rdd2.collect())
print(rdd3.collect())
[range(1, 2), range(1, 3), range(1, 4)]
[1, 1, 2, 1, 2, 3]
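The difference is easiest to see on text: map yields one result per element, while flatMap flattens each result into a single RDD. A minimal word-splitting sketch with made-up input:
lines = sc.parallelize(['hello world', 'hi spark'])
lines.flatMap(lambda line: line.split(' ')).collect()
['hello', 'world', 'hi', 'spark']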
mapPartitions
rdd = sc.parallelize([9, 3, 5, 7], 2)
def f(iterator):
    yield max(iterator)
rdd.mapPartitions(f).collect()
[9, 7]
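Another common use is producing one aggregate per partition; note the function must return an iterable, hence the single-element list in this sketch:
sc.parallelize([1, 2, 3, 4], 2) \
    .mapPartitions(lambda it: [sum(it)]) \
    .collect()
[3, 7]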
mapPartitionsWithIndex
rdd = sc.parallelize([10, 3, 8, 6], 2)
def f(index, iterator):
    if index != 0:
        yield list(iterator)
rdd.mapPartitionsWithIndex(f).collect()
[[8, 6]]
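The index argument is also useful for inspecting how data is laid out; a small sketch tagging each element of the same rdd with its partition index:
rdd.mapPartitionsWithIndex(lambda i, it: ((i, x) for x in it)).collect()
[(0, 10), (0, 3), (1, 8), (1, 6)]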
filter
sc.parallelize([1, 2, 3, 4, 5]) \
    .filter(lambda x: x % 2 == 0) \
    .collect()
[2, 4]
distinct
sc.parallelize([1, 2, 2, 3, 3, 4, 5]) \
    .distinct() \
    .collect()
[1, 2, 3, 4, 5]
union
rdd1 = sc.parallelize([1, 2, 3, 4])
rdd2 = sc.parallelize([5, 6, 7, 1])
rdd1.union(rdd2).collect()
[1, 2, 3, 4, 5, 6, 7, 1]
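Note that union keeps duplicates (1 appears twice above). Chain distinct() to deduplicate; sorted() is used here only because distinct() does not guarantee element order:
sorted(rdd1.union(rdd2).distinct().collect())
[1, 2, 3, 4, 5, 6, 7]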
intersection
rdd1 = sc.parallelize([1, 2, 3, 4])
rdd2 = sc.parallelize([3, 4, 5, 6])
rdd1.intersection(rdd2).collect()
[3, 4]
subtract
rdd1 = sc.parallelize([1, 2, 3, 4])
rdd2 = sc.parallelize([3, 4, 5, 6])
rdd1.subtract(rdd2).collect()
[1, 2]
sortBy
sc.parallelize([('a', 4), ('b', 2), ('c', 3)]) \
    .sortBy(lambda e: e[1]) \
    .collect()
[('b', 2), ('c', 3), ('a', 4)]
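sortBy also takes an ascending flag; the same pairs in descending order of value:
sc.parallelize([('a', 4), ('b', 2), ('c', 3)]) \
    .sortBy(lambda e: e[1], ascending=False) \
    .collect()
[('a', 4), ('c', 3), ('b', 2)]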
Transformations (Key-Value type)
mapValues
sc.parallelize([('a', 1), ('b', 2), ('c', 3)]) \
    .mapValues(lambda v: v ** 2) \
    .collect()
[('a', 1), ('b', 4), ('c', 9)]
flatMapValues
sc.parallelize([('a', [1, 5]), ('b', [2, 4]), ('c', [3, 3])]) \
    .flatMapValues(lambda v: v) \
    .collect()
[('a', 1), ('a', 5), ('b', 2), ('b', 4), ('c', 3), ('c', 3)]
reduceByKey
sc.parallelize([('a', 1), ('b', 2), ('a', 3), ('b', 4)]) \
    .reduceByKey(lambda v1, v2: v1 + v2) \
    .collect()
[('a', 4), ('b', 6)]
groupByKey
sc.parallelize([('a', 1), ('b', 2), ('a', 3), ('b', 4)]) \
    .groupByKey() \
    .mapValues(list) \
    .collect()
[('a', [1, 3]), ('b', [2, 4])]
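For plain aggregations, reduceByKey is generally preferred over groupByKey because it combines values within each partition before the shuffle. The same sums as in the reduceByKey example can be obtained (less efficiently) via groupByKey:
sc.parallelize([('a', 1), ('b', 2), ('a', 3), ('b', 4)]) \
    .groupByKey() \
    .mapValues(sum) \
    .collect()
[('a', 4), ('b', 6)]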
sortByKey
sc.parallelize([('1', 1), ('b', 2), ('a', 3), ('5', 4)]) \
    .sortByKey() \
    .collect()
[('1', 1), ('5', 4), ('a', 3), ('b', 2)]
keys
sc.parallelize([(1, 'a'), (2, 'b')]) \
    .keys() \
    .collect()
[1, 2]
values
sc.parallelize([(1, 'a'), (2, 'b')]) \
    .values() \
    .collect()
['a', 'b']
join
rdd1 = sc.parallelize([('a', 1), ('b', 2)])
rdd2 = sc.parallelize([('a', 3), ('a', 4)])
rdd1.join(rdd2).collect()
[('a', (1, 3)), ('a', (1, 4))]
leftOuterJoin
rdd1 = sc.parallelize([('a', 1), ('b', 2)])
rdd2 = sc.parallelize([('a', 3), ('a', 4)])
rdd1.leftOuterJoin(rdd2).collect()
[('b', (2, None)), ('a', (1, 3)), ('a', (1, 4))]
rightOuterJoin
rdd1 = sc.parallelize([('a', 1), ('b', 2)])
rdd2 = sc.parallelize([('a', 3), ('a', 4)])
rdd1.rightOuterJoin(rdd2).collect()
[('a', (1, 3)), ('a', (1, 4))]
Actions
count
sc.parallelize([1, 2, 3, 4, 5]).count()
5
take
sc.parallelize([1, 2, 3, 4, 5]).take(3)
[1, 2, 3]
takeOrdered
sc.parallelize([1, 5, 2, 7, 3]).takeOrdered(3)
[1, 2, 3]
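takeOrdered also accepts a key function; negating the value returns the largest elements, equivalent to top:
sc.parallelize([1, 5, 2, 7, 3]).takeOrdered(3, key=lambda x: -x)
[7, 5, 3]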
first
sc.parallelize([2, 3, 4]).first()
2
top
sc.parallelize([1, 2, 3, 11, 12]).top(3)
[12, 11, 3]
sc.parallelize([1, 2, 3, 11, 22]).top(4, key=str)
[3, 22, 2, 11]
foreach
# foreach runs for its side effects only; the prints happen on the
# executors (visible in the driver console when running in local mode)
sc.parallelize([1, 2, 3, 4, 5]) \
    .foreach(lambda x: print(x + 3))
foreachPartition
# prints each partition's sum, again on the executors
sc.parallelize([1, 2, 3, 4, 5], 3) \
    .foreachPartition(lambda part: print(sum(part)))
collectAsMap
sc.parallelize([('a', 1), ('b', 2), ('c', 3), ('a', 4)]) \
    .collectAsMap()
{'a': 4, 'b': 2, 'c': 3}
countByKey
sc.parallelize([('a', 1), ('b', 2), ('b', 3), ('a', 4)]) \
    .countByKey()
defaultdict(int, {'a': 2, 'b': 2})
Persistence
dataRDD = sc.textFile('E:/spark-2.3.0-bin-hadoop2.7/data/graphx/users.txt')
num = dataRDD.map(lambda s: len(s)).reduce(lambda a, b: a + b)
dataRDD.persist()
l = dataRDD.glom().collect()
print(num)
print(l)
print("StorageLevel:", dataRDD.getStorageLevel())
162
[['1,BarackObama,Barack Obama', '2,ladygaga,Goddess of Love', '3,jeresig,John Resig', '4,justinbieber,Justin Bieber'], ['6,matei_zaharia,Matei Zaharia', '7,odersky,Martin Odersky', '8,anonsys']]
StorageLevel: Memory Serialized 1x Replicated
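persist() with no arguments uses MEMORY_ONLY, which PySpark stores serialized, as the output above shows. To pick a different level, unpersist first, since an RDD's storage level cannot be changed once set. A minimal sketch continuing from dataRDD (the expected last line assumes PySpark's standard StorageLevel string form):
from pyspark import StorageLevel
dataRDD.unpersist()
dataRDD.persist(StorageLevel.MEMORY_AND_DISK)
print(dataRDD.getStorageLevel())
Disk Memory Serialized 1x Replicated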
Shared Variables
Broadcast Variables
b = sc.broadcast([1, 2, 3, 4, 5])
print(b.value)
l = sc.parallelize([0, 0]).map(lambda x: b.value).collect()
print(l)
b.unpersist()
[1, 2, 3, 4, 5]
[[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]
Accumulators
counter = sc.accumulator(0)
rdd = sc.parallelize(range(10))
def increment(x):
    global counter
    counter += x
rdd.foreach(increment)
print("Counter value:", counter.value)
Counter value: 45
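Tasks on the workers can only add to an accumulator; reading .value is reserved for the driver. Updates are applied exactly once only inside actions such as foreach; inside transformations they may be re-applied if a task is retried. A minimal sketch using the explicit add() method:
acc = sc.accumulator(0)
sc.parallelize([1, 2, 3]).foreach(lambda x: acc.add(x))
print(acc.value)
6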