spark教程python案例_【机器学习之二】python开发spark案例

最新推荐文章于 2023-10-08 06:00:00 发布

weixin_39672680

最新推荐文章于 2023-10-08 06:00:00 发布

阅读量128

点赞数

文章标签： spark教程python案例

#-*- coding:utf-8 -*-

'''Created on 2019年5月16日

@author: Administrator'''

# import sys

from builtins import sorted
from collections import Counter

from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# print(sys.getdefaultencoding())
# reload(sys)
# sys.setdefaultencoding('utf-8')
# print(sys.getdefaultencoding())

def showresult(em):
    """Print one result record to standard output.

    Args:
        em: The value to print; the callers in this file pass
            ``(site, count)`` tuples.
    """
    print(em)


# Sample input record (the jobs below split each line on "\t"; the fields are
# shown here separated by spaces):
# 7.213.213.208 吉林 2018-03-29 1522294977303 1920936170939152672 www.dangdang.com Login

def pv(lines):
    """Print the page-view (PV) count of every site, highest count first.

    Args:
        lines: RDD of tab-separated log lines; field 5 is used as the site.
    """
    # One (site, 1) pair per log line.
    site_pairs = lines.map(lambda line: (line.split("\t")[5], 1))
    # Sum the ones per site.
    site_totals = site_pairs.reduceByKey(lambda v1, v2: v1 + v2)
    # Sort by the count, descending.
    ranked = site_totals.sortBy(lambda one: one[1], ascending=False)
    ranked.foreach(lambda em: showresult(em))
    # Sample output:
    # ('www.baidu.com', 18791)
    # ('www.dangdang.com', 18751)
    # ('www.suning.com', 18699)
    # ('www.mi.com', 18678)
    # ('www.taobao.com', 18613)
    # ('www.jd.com', 18519)
    # ('www.gome.com.cn', 18493)

def uv(lines):
    """Print the unique-visitor (UV) count of every site, highest count first.

    Repeated visits from the same IP (field 0) to the same site (field 5)
    are counted once.

    Args:
        lines: RDD of tab-separated log lines.
    """
    # Deduplicate on an (ip, site) tuple. A tuple key avoids joining the two
    # fields with "_" and splitting them again, which would cut a site name
    # short if it ever contained an underscore.
    ip_site_pairs = (
        lines.map(lambda line: line.split("\t"))
        .map(lambda fields: (fields[0], fields[5]))
        .distinct()
    )
    result = (
        ip_site_pairs.map(lambda pair: (pair[1], 1))
        .reduceByKey(lambda v1, v2: v1 + v2)
        .sortBy(lambda one: one[1], ascending=False)
    )
    result.foreach(lambda one: showresult(one))
    # Sample output:
    # ('www.baidu.com', 15830)
    # ('www.suning.com', 15764)
    # ('www.mi.com', 15740)
    # ('www.jd.com', 15682)
    # ('www.dangdang.com', 15641)
    # ('www.taobao.com', 15593)
    # ('www.gome.com.cn', 15590)

def uvExceptBJ(lines):
    """Print the unique-visitor count per site, skipping location "北京".

    Lines whose location (field 1) equals "北京" (Beijing) are dropped; the
    remaining visits are deduplicated per IP (field 0) and site (field 5),
    counted per site and printed highest count first.

    Args:
        lines: RDD of tab-separated log lines.
    """
    # Split each line once, filter on the location, then deduplicate on an
    # (ip, site) tuple instead of a "_"-joined string that has to be re-split.
    ip_site_pairs = (
        lines.map(lambda line: line.split("\t"))
        .filter(lambda fields: fields[1] != "北京")
        .map(lambda fields: (fields[0], fields[5]))
        .distinct()
    )
    site_totals = ip_site_pairs.map(lambda pair: (pair[1], 1)).reduceByKey(
        lambda v1, v2: v1 + v2
    )
    ranked = site_totals.sortBy(lambda one: one[1], ascending=False)
    ranked.foreach(lambda em: showresult(em))
    # Sample output:
    # ('www.baidu.com', 15399)
    # ('www.mi.com', 15341)
    # ('www.suning.com', 15294)
    # ('www.jd.com', 15255)
    # ('www.dangdang.com', 15181)
    # ('www.gome.com.cn', 15154)
    # ('www.taobao.com', 15131)

def getTop2Location(lines):
    """Print, for every site, its two most frequent visitor locations.

    Args:
        lines: RDD of tab-separated log lines; field 5 is the site and
            field 1 the location.
    """
    # Group the locations by site.
    site_locations = (
        lines.map(lambda line: line.split("\t"))
        .map(lambda fields: (fields[5], fields[1]))
        .groupByKey()
    )
    result = site_locations.map(lambda one: getCurrSiteTop2Location(one)).collect()
    for em in result:
        print(em)
    # Sample output:
    # ('www.suning.com', [('山西', 1102), ('广西', 606)])
    # ('www.jd.com', [('山西', 1069), ('湖北', 614)])
    # ('www.taobao.com', [('山西', 1065), ('安徽', 601)])
    # ('www.gome.com.cn', [('山西', 1029), ('内蒙', 590)])
    # ('www.dangdang.com', [('山西', 1083), ('香港', 591)])
    # ('www.mi.com', [('山西', 1085), ('广东', 617)])
    # ('www.baidu.com', [('山西', 1028), ('台湾', 641)])

def getCurrSiteTop2Location(one):
    """Return the two most frequent locations of one site.

    Args:
        one: ``(site, locations)`` pair where ``locations`` is an iterable of
            location names, one entry per visit.

    Returns:
        ``(site, top)`` where ``top`` is a list of at most two
        ``(location, count)`` tuples ordered by descending count. Locations
        with equal counts keep the order in which they were first seen.
    """
    site = one[0]
    locations = one[1]
    # Counter.most_common(2) counts the locations and yields the same result
    # as a stable descending sort of the counts followed by taking two items.
    return site, Counter(locations).most_common(2)


def getTopOperation(lines):
    """Print, for every site, its single most frequent operation.

    Args:
        lines: RDD of tab-separated log lines; field 5 is the site and
            field 6 the operation.
    """
    site_operations = (
        lines.map(lambda line: line.split("\t"))
        .map(lambda fields: (fields[5], fields[6]))
        .groupByKey()
    )
    result = site_operations.map(lambda one: getCurrSiteTopOperation(one)).collect()
    for em in result:
        print(em)
    # Sample output:
    # ('www.suning.com', [('View', 3168)])
    # ('www.jd.com', [('Login', 3132)])
    # ('www.taobao.com', [('Regist', 3196)])
    # ('www.gome.com.cn', [('Click', 3170)])
    # ('www.dangdang.com', [('Buy', 3179)])
    # ('www.mi.com', [('Buy', 3231)])
    # ('www.baidu.com', [('Comment', 3207)])

def getCurrSiteTopOperation(one):
    """Return the most frequent operation of one site.

    Args:
        one: ``(site, operations)`` pair where ``operations`` is an iterable
            of operation names, one entry per log line.

    Returns:
        ``(site, top)`` where ``top`` is a list holding the single
        ``(operation, count)`` tuple with the highest count, or an empty list
        when there are no operations. On a tie the operation seen first wins.
    """
    site = one[0]
    operations = one[1]
    return site, Counter(operations).most_common(1)


def getTop3User(lines):
    """Print, for every site, the three users with the most visits.

    Args:
        lines: RDD of tab-separated log lines; field 3 is used as the user
            key and field 5 as the site.
    """
    # Group by user first and count that user's visits per site.
    # NOTE(review): in the sample record field 3 looks like an
    # epoch-millisecond timestamp and field 4 like the user id - confirm
    # which column is intended before relying on these numbers.
    site_uid_count = (
        lines.map(lambda line: line.split("\t"))
        .map(lambda fields: (fields[3], fields[5]))
        .groupByKey()
        .flatMap(lambda one: getSiteInfo(one))
    )
    # Then regroup by site and keep the top three users of each site.
    result = (
        site_uid_count.groupByKey()
        .map(lambda one: getCurSiteTop3User(one))
        .collect()
    )
    for em in result:
        print(em)
    # Sample output:
    # ('www.suning.com', [('1522294989941', 5), ('1522294980028', 5), ('1522294986337', 5)])
    # ('www.jd.com', [('1522295002636', 5), ('1522294988631', 5), ('1522294990824', 4)])
    # ('www.taobao.com', [('1522294992394', 5), ('1522294982477', 5), ('1522294999369', 5)])
    # ('www.gome.com.cn', [('1522294994219', 5), ('1522294988497', 5), ('1522294991142', 5)])
    # ('www.dangdang.com', [('1522294994360', 5), ('1522294988712', 5), ('1522294992239', 4)])
    # ('www.mi.com', [('1522294987189', 5), ('1522294989540', 5), ('1522294980962', 5)])
    # ('www.baidu.com', [('1522294991559', 6), ('1522294989188', 5), ('1522294996021', 5)])

def getSiteInfo(one):
    """Count one user's visits per site and re-key the counts by site.

    Args:
        one: ``(uid, sites)`` pair where ``sites`` is an iterable of site
            names, one entry per visit by that user.

    Returns:
        A list of ``(site, (uid, count))`` tuples, one per distinct site, in
        the order the sites were first seen.
    """
    uid = one[0]
    sites = one[1]
    return [(site, (uid, count)) for site, count in Counter(sites).items()]


def getCurSiteTop3User(one):
    """Return the three users with the highest visit count for one site.

    Args:
        one: ``(site, uid_counts)`` pair where ``uid_counts`` is an iterable
            of ``(uid, count)`` tuples.

    Returns:
        ``(site, top3)`` where ``top3`` always has exactly three entries:
        the ``(uid, count)`` tuples ordered by descending count (ties keep
        their input order), padded with ``""`` when fewer than three users
        exist.
    """
    site = one[0]
    uid_counts = one[1]
    # sorted() is stable, so with reverse=True equal counts stay in arrival
    # order - the same ranking the manual insertion loop produced.
    top3List = sorted(uid_counts, key=lambda uid_count: uid_count[1], reverse=True)[:3]
    # Keep the fixed three-slot shape with "" placeholders for missing users.
    top3List.extend([""] * (3 - len(top3List)))
    return site, top3List


if __name__ == '__main__':
    conf = SparkConf().setMaster("local").setAppName("pvuv")
    sc = SparkContext(conf=conf)
    try:
        sc.setLogLevel("WARN")
        lines = sc.textFile('../../data/pvuvdata')
        # 1) PV and UV per site.
        pv(lines)
        uv(lines)
        # 2) UV per site, excluding the Beijing location.
        uvExceptBJ(lines)
        # 3) Top-2 most active locations per site.
        getTop2Location(lines)
        # 4) Most frequent operation per site.
        getTopOperation(lines)
        # 5) Top-3 most active users per site.
        getTop3User(lines)
    finally:
        # Stop the context even when one of the jobs raises.
        sc.stop()

weixin_39672680

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
spark教程python案例_【机器学习之二】python开发spark案例

#-*- coding:utf-8 -*-'''Created on 2019年5月16日@author: Administrator'''#import sysfrom pyspark.conf importSparkConffrom pyspark.context importSparkContextfrom builtins importsorted#print(sys.getdefault...
复制链接

扫一扫