一.计算最受欢迎的老师
1.项目需求:现有某网络上的访问日志,现需要计算某一学科下被访问次数最多的老师。
2.网站的url格式如下:http://bigdata.xiaoniu.com/laozhao
bigdata表示学科,laozhao表示教师。
3.代码如下:
import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/*
1.分析最受欢迎的老师
*/
object PopularTeacher{
  /**
   * Computes, for every subject, the three most-visited teachers.
   * Input lines look like http://SUBJECT.xiaoniu.com/TEACHER .
   */
  def main(args: Array[String]): Unit = {
    // In-memory sample of access-log URLs (stand-in for a real log file).
    val logLines = Array(
      "http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://java.xiaoniu.com/laozhang",
      "http://java.xiaoniu.com/laozhang",
      "http://python.xiaoniu.com/laoqian",
      "http://java.xiaoniu.com/laoli",
      "http://python.xiaoniu.com/laoli",
      "http://python.xiaoniu.com/laoli")
    val conf = new SparkConf().setAppName("Popular").setMaster("local")
    val sc = new SparkContext(conf)
    // Reading from a file would be: sc.textFile(args(0))
    val lines = sc.parallelize(logLines)
    // Parse each URL into a (subject, teacher) pair.
    // Subject = first label of the host, teacher = path without leading '/'.
    val subjectAndTeacher: RDD[(String, String)] = lines.map { line =>
      val url = new URL(line)
      println("url = " + url)
      val host = new URL(line).getHost
      println("host = " + host)
      val subject = host.substring(0, host.indexOf("."))
      val teacher = url.getPath.substring(1)
      (subject, teacher)
    }
    // Count every (subject, teacher) pair.
    val pairCounts = subjectAndTeacher.map((_, 1)).reduceByKey(_ + _)
    // NOTE: foreach on an RDD prints in partition order, not source order.
    pairCounts.foreach(println)
    val collectedCounts: Array[((String, String), Int)] = pairCounts.collect()
    // Group the counts by subject name for the per-subject ranking.
    val bySubject: RDD[(String, Iterable[((String, String), Int)])] =
      pairCounts.groupBy(_._1._1)
    // Per subject: sort ascending by count, reverse to descending, keep top 3.
    val topThree: RDD[(String, List[((String, String), Int)])] =
      bySubject.mapValues(_.toList.sortBy(_._2).reverse.take(3))
    val collectedTop = topThree.collect()
    topThree.foreach(println)
  }
}
import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
/**
*1.自定义分区器
*2.继承自Partitioner
*3.subjects是一个字符串数组
*
* @param subjects
*/
/**
 * Custom partitioner that assigns one partition per subject.
 *
 * Bug fixed: the previous hard-coded Map("bigdata" -> 1, "java" -> 2,
 * "python" -> 3) could return partition id 3 while numPartitions was
 * subjects.length (3), i.e. valid ids were only 0..2 — Spark would fail at
 * shuffle time. It also ignored the `subjects` constructor parameter.
 * The rules are now derived from `subjects`, so every id is in
 * [0, numPartitions).
 */
class SelfPartition (subjects :Array[String]) extends Partitioner{
  // subject -> partition id, in the order the subjects were supplied.
  val rules: Map[String, Int] = subjects.zipWithIndex.toMap
  // numPartitions is a method override, not a plain field.
  // Guard against an empty subjects array (a Partitioner needs >= 1 partition).
  override def numPartitions: Int = math.max(subjects.length, 1)
  // Route a key to its subject's partition; unknown subjects fall back to 0
  // (shared with subjects(0) — acceptable for this sample data).
  override def getPartition(key: Any): Int ={
    val k = key.toString
    rules.getOrElse(k,0)
  }
}
/**
* 1.访问记录存储是一个URL,暂时用一个records = Array[String]来存储
* 2.将records转换成text(一个rdd)
* 3.对text进行操作,如:mapPartitions,map
* 4.将操作后的结果收集并写出到控制台
*/
object FavoriteTeacher{
  /**
   * For every course, prints the two most-visited teachers.
   * Pipeline: parse URLs -> count (course, teacher) pairs -> group by course
   * -> sort each group by count and keep the top 2.
   */
  def main (args:Array[String]): Unit ={
    val conf = new SparkConf().setAppName("FavoriteTeacher").setMaster("local")
    val sc = new SparkContext(conf)
    // Sample access log; course is the first host label, teacher is the path.
    val accessLog: Array[String] = Array(
      "http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://java.xiaoniu.com/laozhang",
      "http://java.xiaoniu.com/laozhang",
      "http://python.xiaoniu.com/laoqian",
      "http://java.xiaoniu.com/laoli",
      "http://python.xiaoniu.com/laoli",
      "http://python.xiaoniu.com/laoli")
    val logRdd: RDD[String] = sc.parallelize(accessLog)
    print("First disposition:")
    logRdd.collect().foreach(println)
    // Parse one record into a (courseName, teacherName) tuple.
    def parseRecord(record: String): (String, String) = {
      val url = new URL(record)
      val hostName = url.getHost
      val path = url.getPath
      val courseName = hostName.substring(0, hostName.indexOf("."))
      val teacherName = path.substring(1)
      (courseName, teacherName)
    }
    val coursePairs: RDD[(String, String)] = logRdd.map(parseRecord)
    print("Second disposition:")
    coursePairs.foreach(print)
    // Attach a count of 1 to every pair, then sum the counts per pair.
    val counted: RDD[((String, String), Int)] =
      coursePairs.map((_, 1)).reduceByKey(_ + _)
    print("Third disposition:")
    counted.foreach(print)
    // Group the counted pairs by course name.
    val byCourse: RDD[(String, Iterable[((String, String), Int)])] =
      counted.groupBy(_._1._1)
    byCourse.foreach(println)
    // Per course: sort ascending by count, reverse to descending, keep top 2.
    val topTwo = byCourse.mapValues(_.toList.sortBy(_._2).reverse.take(2))
    topTwo.foreach(print)
  }
}
import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable
/**
*1.自定义分区器
*2.继承自Partitioner
*3.subjects是一个字符串数组
*
* @param subjects
*/
/**
 * Custom partitioner: one partition per subject plus a catch-all partition.
 *
 * Bug fixed: numPartitions is subjects.length + 1, which reserves an extra
 * partition for keys with no rule — but the rule ids previously started at 0
 * and getPartition's fallback was also 0, so unknown keys collided with
 * subjects(0) and the extra partition (id subjects.length) was always empty.
 * Ids now start at 1; partition 0 is reserved for unmatched subjects.
 */
class SelfPartition (subjects :Array[String]) extends Partitioner{
  // Build the subject -> partition-id rules from the subjects seen in the data.
  val rules = new mutable.HashMap[String ,Int]()
  var i = 1 // start at 1: partition 0 is the catch-all for unknown subjects
  for (sub <- subjects){
    rules += (sub -> i)
    i+=1
  }
  // One partition per subject, plus partition 0 for everything unmatched.
  override def numPartitions: Int = {
    subjects.length+ 1
  }
  // Route a key to its subject's partition; unknown subjects go to partition 0.
  override def getPartition(key: Any): Int ={
    val k = key.toString
    rules.getOrElse(k,0)
  }
}
/**
* 1.访问记录存储是一个URL,暂时用一个records = Array[String]来存储
* 2.将records转换成text(一个rdd)
* 3.对text进行操作,如:mapPartitions,map
* 4.将操作后的结果收集并写出到控制台
* 5.让每个学科分到各自的分区
*/
object FavoriteTeacher{
  /**
   * For every course, finds the two most-visited teachers, using a custom
   * partitioner so each course's data lands in its own partition and the
   * top-2 selection can run per partition without a groupBy shuffle.
   *
   * Bug fixed: `print("subjects = " + subjects)` printed the array's default
   * toString (e.g. "[Ljava.lang.String;@1a2b3c"), not its contents; it now
   * uses mkString so the subject names are actually visible.
   */
  def main (args:Array[String]): Unit ={
    val conf = new SparkConf().setAppName("FavoriteTeacher").setMaster("local")
    val sc = new SparkContext(conf)
    // Sample access log; course is the first host label, teacher is the path.
    val records: Array[String] = Array("http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://bigdata.xiaoniu.com/laozhao",
      "http://java.xiaoniu.com/laozhang",
      "http://java.xiaoniu.com/laozhang",
      "http://python.xiaoniu.com/laoqian",
      "http://java.xiaoniu.com/laoli",
      "http://python.xiaoniu.com/laoli",
      "http://python.xiaoniu.com/laoli")
    val text: RDD[String] = sc.parallelize(records) // turn the array into an RDD
    print("First disposition:")
    text.collect().foreach(println)
    /*
    Parse one record and return a (courseName, teacherName) tuple.
    */
    def fun1(lines :String ): (String, String) = {
      val url = new URL(lines)          // parse the line as a URL
      val hostName = url.getHost        // e.g. "bigdata.xiaoniu.com"
      val path = url.getPath            // e.g. "/laozhao"
      val courseName = hostName.substring(0,hostName.indexOf("."))
      val teacherName = path.substring(1) // drop the leading '/'
      (courseName,teacherName)
    }
    val res1: RDD[(String, String)] = text.map(fun1)
    print("Second disposition:")
    res1.foreach(print)
    // Attach a count of 1 to every (course, teacher) pair.
    val res2: RDD[((String, String), Int)] = res1.map(x => (x,1))
    // Collect the distinct course names to size/configure the partitioner.
    val subjects: Array[String] = res2.map(_._1._1).distinct().collect()
    // mkString: print the array's elements, not its default toString.
    print("subjects = "+subjects.mkString(","))
    val res3: RDD[((String, String), Int)] = res2.reduceByKey(_+_) // sum per pair
    print("Third disposition:")
    res3.foreach(print)
    val selfPartition = new SelfPartition(subjects)
    // Re-key by course and shuffle with the custom partitioner, so every
    // partition holds at most one course.
    val res4: RDD[(String, (String, Int))] = res3.map(t => (t._1._1, (t._1._2,t._2))).partitionBy(selfPartition)
    /*
     * Each partition is an Iterator; materialize it with toList to sort,
     * then convert back to an Iterator as mapPartitions requires.
     */
    val result: RDD[(String, (String, Int))] = res4.mapPartitions(_.toList.sortBy(_._2._2).reverse.take(2).iterator)
    result.foreach(print)
  }
}