Spark WordCount Experiment
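First create a sample input file in the Downloads directory: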
cd Downloads
vim data.txt
Apache Spark is a unified analytics engine for large-scale data processing. Spark powers a stack of libraries including SQL and DataFrames, MLlib for machine learning, GraphX, and Spark Streaming. You can combine these libraries seamlessly in the same application.
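Save the following script as wordcount01.py (it can also be entered line by line in the pyspark shell):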
# Import the SparkContext class
from pyspark import SparkContext
# Configure a SparkContext: local mode, application name "WordCount"
sc = SparkContext('local', 'WordCount')
# Create a new RDD by loading a local file
lines = sc.textFile("file:///home/ubuntu/Downloads/data.txt")
# flatMap(func): like map(), but each input element can be mapped to 0 or more output elements
# map(func): passes each element through the function func and returns the results as a new dataset
# reduceByKey(func): on a dataset of (K, V) pairs, returns a new (K, V) dataset in which the values for each key are aggregated with func
result = lines.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
print(result.collect())
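Running it prints a list of (word, count) pairs. The second version, saved as wordcount02.py, uses a SparkSession and takes the input file as a command-line argument: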
from __future__ import print_function

import sys
from operator import add

from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Exit unless exactly one argument (the input file) is given
    if len(sys.argv) != 2:
        print("Usage: wordcount <file>", file=sys.stderr)
        sys.exit(-1)

    spark = SparkSession.builder.appName("PythonWordCount").getOrCreate()

    # Read the file named by the command-line argument; map each Row to its string value
    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
    # Split each line on spaces and count the words
    counts = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(add)
    # Collect the resulting dataset into the variable output
    output = counts.collect()
    # Loop over the pairs and print each word with its count
    for (word, count) in output:
        print("%s: %i" % (word, count))

    spark.stop()
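Run both versions with spark-submit: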
spark-submit wordcount01.py
spark-submit wordcount02.py data.txt
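Next, download a larger text file and start the interactive PySpark shell to experiment with RDD operations: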
wget http://10.90.3.2/HUP/spark/book.txt
pyspark
# Define a list
data = [1, 2, 6, 4, 7, 3]
# Distribute the list to form an RDD
rdd = sc.parallelize(data)
rdd.collect()    # [1, 2, 6, 4, 7, 3]
# Load the contents of a file into an RDD
distFile = sc.textFile("/home/ubuntu/Desktop/book.txt")
type(distFile)   # <class 'pyspark.rdd.RDD'>
3. RDD Transformations and Actions
count(): returns the number of elements in the RDD
sc.parallelize([2, 3, 4]).count()   # 3
countByKey(): returns the number of elements for each key, as a dictionary
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
sorted(rdd.countByKey().items())   # [('a', 2), ('b', 1)]
countByValue(): returns the number of occurrences of each unique value in the RDD, as a dictionary
sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items())   # [(1, 2), (2, 3)]
distinct(): removes duplicate elements from the RDD
sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())   # [1, 2, 3]
filter(): keeps the RDD elements that satisfy a user-defined condition
rdd = sc.parallelize([1, 2, 3, 4, 5])
rdd.filter(lambda x: x % 2 == 0).collect()   # [2, 4]
first(): returns the first element of the RDD; raises an exception when the RDD is empty
sc.parallelize([2, 3, 4]).first()   # 2
sc.parallelize([]).first()          # raises ValueError: RDD is empty
flatMap(): maps each element to an iterator of results and flattens them into a single RDD
rdd = sc.parallelize([2, 3, 4])
sorted(rdd.flatMap(lambda x: range(1, x)).collect())        # [1, 1, 1, 2, 2, 3]
sorted(rdd.flatMap(lambda x: [(x, x), (x, x)]).collect())   # [(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)]
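foreach(): applies a function to every element of the RDD; useful for side effects such as printing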
def f(x): print(x)
sc.parallelize([1, 2, 3, 4, 5]).foreach(f)   # prints each element (order not guaranteed)
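groupBy(): groups the elements of the RDD according to the result of a function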
rdd = sc.parallelize([1, 1, 2, 3, 5, 8])
result = rdd.groupBy(lambda x: x % 2).collect()
sorted([(x, sorted(y)) for (x, y) in result])   # [(0, [2, 8]), (1, [1, 1, 3, 5])]
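isEmpty(): returns True if and only if the RDD contains no elements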
sc.parallelize([]).isEmpty()   # True
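map(): applies a function to each element and returns the results as a new RDD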
rdd=sc.parallelize(["b","a","c"])
sorted(rdd.map(lambda x: (x,1)).collect())
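reduceByKey(): merges the values for each key using the given function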
from operator import add
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
sorted(rdd.reduceByKey(add).collect())   # [('a', 2), ('b', 1)]