#encoding:utf-8
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from com.bjsxt.spark.wc import show, lines
# from babel.util import distinct
def getCurrSiteTop2Location(one):
    """Return the two most frequent locations recorded for a single site.

    Args:
        one: An indexable ``(site, locations)`` pair, where ``locations`` is
            an iterable of hashable location values (duplicates expected).
            Presumably one element of a ``groupByKey()`` result -- confirm
            against the caller.

    Returns:
        A ``(site, top_locations)`` tuple. ``top_locations`` is a list of at
        most two ``(location, count)`` tuples ordered by count, highest
        first. It contains fewer than two entries when the site has fewer
        than two distinct locations, and is empty when ``locations`` is
        empty.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    # The pair must be indexed, not called: one[0] is the key, one[1] the
    # grouped values.
    site = one[0]
    locations = one[1]

    # most_common(2) returns every entry when fewer than two distinct
    # locations exist, so a site with a single location is not dropped.
    return site, Counter(locations).most_common(2)
def getTop2Location(lines):
site_locations=lines.map(lambda line:(line.split("\t")[4],line.split("\t")[3])).groupByKey()
result=site_locations.map(lambda one:getCurrSite
机器学习部分:分区取topN(类的调用方法)
最新推荐文章于 2022-04-08 11:26:35 发布