# Count total vs. distinct occurrences per (fields[0], fields[1]) key from a
# CSV file, full-outer-join the two counts, and print the sorted result.
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("My Spark Application").setMaster("local")
sc = SparkContext(conf=conf)
try:
    # Each CSV line -> list of comma-separated fields.
    text = sc.textFile('/root/common_command/url_data.csv')
    url_info = text.map(lambda line: line.split(","))
    # Key on (fields[0], fields[1]); value is fields[3] (presumably the URL
    # column — confirm against the data file's schema).
    id_info = url_info.map(lambda fields: ((fields[0], fields[1]), fields[3]))
    # Total records per key vs. distinct (key, value) pairs per key.
    # countByKey() returns a local dict on the driver, so we re-parallelize
    # the items below to join the two count tables as RDDs.
    url_cnt = id_info.countByKey().items()
    url_num = id_info.distinct().countByKey().items()
    x = sc.parallelize(url_cnt)
    y = sc.parallelize(url_num)
    # Full outer join keeps keys present in either count table; sort the
    # collected pairs for deterministic output.
    result = sorted(x.fullOuterJoin(y).collect())
    print(result)
    print("executed successfully!")
finally:
    # Always release the Spark context, even if a stage fails.
    sc.stop()
# Reposted from: https://my.oschina.net/kyo4321/blog/1036721