map:
def map():
    """Demo of RDD.map: double every element of a small list and print it.

    NOTE(review): this function shadows the builtin ``map``; consider renaming.
    """
    sc = SparkContext("spark://node0:7077", "map")
    nums = [1, 2, 3, 4, 5]  # renamed from `list` to avoid shadowing the builtin
    nums_rdd = sc.parallelize(nums)
    doubled = nums_rdd.map(lambda x: x * 2)
    print(doubled.collect())
    sc.stop()
filter:
def filter():
    """Demo of RDD.filter: keep only the even numbers and print them.

    NOTE(review): this function shadows the builtin ``filter``; consider renaming.
    """
    sc = SparkContext("spark://node0:7077", "filter")
    nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # renamed from `list` (builtin shadow)
    nums_rdd = sc.parallelize(nums)
    evens = nums_rdd.filter(lambda x: x % 2 == 0)
    print(evens.collect())
    sc.stop()
flatMap:
def flatMap():
    """Demo of RDD.flatMap: split each line into words and flatten the result."""
    sc = SparkContext("spark://node0:7077", "flatMap")
    lines = ["hello you", "hello me", "hello world"]
    line_rdd = sc.parallelize(lines)
    words = line_rdd.flatMap(lambda s: s.split(" "))
    print(words.collect())
    sc.stop()
groupbykey:
def groupbyKey():
    """Demo of RDD.groupByKey: collect all scores per class key and print them."""
    sc = SparkContext("spark://node0:7077", "groupbyKey")
    scores = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    scores_rdd = sc.parallelize(scores)
    grouped = scores_rdd.groupByKey()
    # groupByKey yields iterable values; materialize them so they print readably.
    print(grouped.map(lambda kv: (kv[0], list(kv[1]))).collect())
    sc.stop()  # BUG FIX: original `sc.stop` (no parentheses) never stopped the context
reducebykey:
def reduceByKey():
    """Demo of RDD.reduceByKey: sum the scores for each class key and print them."""
    sc = SparkContext("spark://node0:7077", "reduceByKey")
    scores = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    scores_rdd = sc.parallelize(scores)
    totals = scores_rdd.reduceByKey(lambda x, y: x + y)
    print(totals.collect())
    sc.stop()  # BUG FIX: original `sc.stop` (no parentheses) never stopped the context
sortbykey:
def sortByKey():
    """Demo of RDD.sortByKey: sort (score, name) pairs by score ascending."""
    sc = SparkContext("spark://node0:7077", "sortByKey")
    pairs = [(65, "ieo"), (50, "tom"), (100, "marry"), (80, "jack")]
    pairs_rdd = sc.parallelize(pairs)
    sorted_rdd = pairs_rdd.sortByKey()
    print(sorted_rdd.collect())
    sc.stop()  # BUG FIX: original `sc.stop` (no parentheses) never stopped the context
join:
def join():
    """Demo of RDD.join: inner-join student names with scores on student id."""
    sc = SparkContext("spark://node0:7077", "join")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    joined = students.join(scores)
    print(joined.collect())
    sc.stop()  # BUG FIX: original `sc.stop` (no parentheses) never stopped the context
cogroup:
def cogroup():
    """Demo of RDD.cogroup: group each student's name and ALL their scores by id."""
    sc = SparkContext("spark://node0:7077", "cogroup")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60), (1, 70), (2, 80), (3, 50)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    cogrouped = students.cogroup(scores)
    # cogroup yields (key, (names_iterable, scores_iterable)); materialize both.
    results = cogrouped.map(lambda kv: (kv[0], list(kv[1][0]), list(kv[1][1])))
    print(results.collect())
    sc.stop()  # BUG FIX: original `sc.stop` (no parentheses) never stopped the context
Full file:
import os
import sys
# Point pyspark at the local Spark installation BEFORE importing it,
# so the `from pyspark import ...` lines below can resolve the package.
os.environ['SPARK_HOME'] = '/opt/spark'
sys.path.append("/opt/spark/python")
from pyspark import SparkContext
from pyspark import SparkConf
def map():
    """Demo of RDD.map: double every element of a small list and print it.

    NOTE(review): this function shadows the builtin ``map``; consider renaming.
    """
    sc = SparkContext("spark://node0:7077", "map")
    nums = [1, 2, 3, 4, 5]  # renamed from `list` to avoid shadowing the builtin
    nums_rdd = sc.parallelize(nums)
    doubled = nums_rdd.map(lambda x: x * 2)
    print(doubled.collect())
    sc.stop()
def filter():
    """Demo of RDD.filter: keep only the even numbers and print them.

    NOTE(review): this function shadows the builtin ``filter``; consider renaming.
    """
    sc = SparkContext("spark://node0:7077", "filter")
    nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # renamed from `list` (builtin shadow)
    nums_rdd = sc.parallelize(nums)
    evens = nums_rdd.filter(lambda x: x % 2 == 0)
    print(evens.collect())
    sc.stop()
def flatMap():
    """Demo of RDD.flatMap: split each line into words and flatten the result."""
    sc = SparkContext("spark://node0:7077", "flatMap")
    lines = ["hello you", "hello me", "hello world"]
    line_rdd = sc.parallelize(lines)
    words = line_rdd.flatMap(lambda s: s.split(" "))
    print(words.collect())
    sc.stop()
def groupbyKey():
    """Demo of RDD.groupByKey: collect all scores per class key and print them."""
    sc = SparkContext("spark://node0:7077", "groupbyKey")
    scores = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    scores_rdd = sc.parallelize(scores)
    grouped = scores_rdd.groupByKey()
    # groupByKey yields iterable values; materialize them so they print readably.
    print(grouped.map(lambda kv: (kv[0], list(kv[1]))).collect())
    sc.stop()  # BUG FIX: original `sc.stop` (no parentheses) never stopped the context
def reduceByKey():
    """Demo of RDD.reduceByKey: sum the scores for each class key and print them."""
    sc = SparkContext("spark://node0:7077", "reduceByKey")
    scores = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    scores_rdd = sc.parallelize(scores)
    totals = scores_rdd.reduceByKey(lambda x, y: x + y)
    print(totals.collect())
    sc.stop()  # BUG FIX: original `sc.stop` (no parentheses) never stopped the context
def sortByKey():
    """Demo of RDD.sortByKey: sort (score, name) pairs by score ascending."""
    sc = SparkContext("spark://node0:7077", "sortByKey")
    pairs = [(65, "ieo"), (50, "tom"), (100, "marry"), (80, "jack")]
    pairs_rdd = sc.parallelize(pairs)
    sorted_rdd = pairs_rdd.sortByKey()
    print(sorted_rdd.collect())
    sc.stop()  # BUG FIX: original `sc.stop` (no parentheses) never stopped the context
def join():
    """Demo of RDD.join: inner-join student names with scores on student id."""
    sc = SparkContext("spark://node0:7077", "join")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    joined = students.join(scores)
    print(joined.collect())
    sc.stop()  # BUG FIX: original `sc.stop` (no parentheses) never stopped the context
def cogroup():
    """Demo of RDD.cogroup: group each student's name and ALL their scores by id."""
    sc = SparkContext("spark://node0:7077", "cogroup")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60), (1, 70), (2, 80), (3, 50)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    cogrouped = students.cogroup(scores)
    # cogroup yields (key, (names_iterable, scores_iterable)); materialize both.
    results = cogrouped.map(lambda kv: (kv[0], list(kv[1][0]), list(kv[1][1])))
    print(results.collect())
    sc.stop()  # BUG FIX: original `sc.stop` (no parentheses) never stopped the context
if __name__ == '__main__':
    # Run one demo at a time; each creates and stops its own SparkContext.
    # map()
    # filter()
    # flatMap()
    # groupbyKey()
    # reduceByKey()
    # sortByKey()
    # join()
    cogroup()