# Task: given a class roster and a score table, find the classes whose
# average score is above 75.
# The roster holds (class, name) pairs; the score table holds (name, score) pairs.
classes = [
    ("class1", "LiLei"),
    ("class1", "HanMeiMei"),
    ("class2", "DaChui"),
    ("class2", "RuHua"),
]
scores = [
    ("LiLei", 76),
    ("HanMeiMei", 80),
    ("DaChui", 70),
    ("RuHua", 60),
]

# Swap each roster pair to (name, class) so both RDDs are keyed by student name.
# NOTE(review): `sc` is presumably an existing SparkContext created elsewhere - confirm.
rdd_classes = sc.parallelize(classes).map(lambda pair: (pair[1], pair[0]))
rdd_scores = sc.parallelize(scores)
# Join scores with the roster on student name, giving (name, (score, class)),
# then keep only the (class, score) pair for each student.
rdd_join = rdd_scores.join(rdd_classes).map(lambda t: (t[1][1], t[1][0]))


def average(iterator):
    """Return the arithmetic mean of the values in *iterator* as a float.

    Args:
        iterator: iterable of numbers; it is consumed once and materialized
            so that its length can be taken.

    Returns:
        float: sum of the values divided by their count.

    Raises:
        ZeroDivisionError: if *iterator* yields no values.
    """
    data = list(iterator)
    # A 0.0 start value keeps the result a float even for all-int input.
    return sum(data, 0.0) / len(data)
# Group the scores by class, average each group, and keep only the classes
# whose average is strictly greater than 75.
rdd_result = (
    rdd_join.groupByKey()
    .map(lambda t: (t[0], average(t[1])))
    .filter(lambda t: t[1] > 75)
)
print(rdd_result.collect())
# Output: [('class1', 78.0)]
# Group-wise mode
# Task: given a table of student records holding (class, age), find the mode
# of the students' ages within each class.
students = [
    ("class1", 15),
    ("class1", 15),
    ("class2", 16),
    ("class2", 16),
    ("class1", 17),
    ("class2", 19),
]
def mode(arr):
    """Return the most frequent value in *arr* as a float.

    When several values tie for the highest count, the mean of the tied
    values is returned instead of picking one of them.

    Args:
        arr: iterable of hashable numeric values.

    Returns:
        float: the mode, or the mean of all tied modes.

    Raises:
        ValueError: if *arr* is empty (there is no maximum count to take).
    """
    # Imported locally so the function carries its own dependency.
    from collections import Counter

    counts = Counter(arr)
    max_cnt = max(counts.values())
    most_values = [value for value, cnt in counts.items() if cnt == max_cnt]
    # A 0.0 start value keeps the result a float even for all-int input.
    return sum(most_values, 0.0) / len(most_values)
rdd_students = sc.parallelize(students)

# Gather every age of a class into one list. Per PySpark's aggregateByKey,
# the first function folds a value into a partial list and the second merges
# two partial lists; both build new lists rather than mutating their inputs.
rdd_classes = rdd_students.aggregateByKey(
    [],
    lambda arr, x: arr + [x],
    lambda arr1, arr2: arr1 + arr2,
)

# Reduce each class's list of ages to its mode and print the collected pairs.
rdd_mode = rdd_classes.map(lambda t: (t[0], mode(t[1])))
print(rdd_mode.collect())