Data source: person.log
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laoduan
http://bigdata.edu360.cn/laoduan
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://php.edu360.cn/laoli
http://php.edu360.cn/laoliu
http://php.edu360.cn/laoli
http://php.edu360.cn/laoli
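Each line is a visit URL: the first label of the host name (e.g. bigdata) is the subject, and the path segment after the last "/" (e.g. laozhao) is the user. All four demos below parse each line into a ((subject, user), 1) pair before aggregating.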
FilterDemo: filter practice
package com.zpark.stu.Transformation_Action

import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object FilterDemo {
  private val subjects = Array("bigdata", "php", "javaee")

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("FilterDemo").setMaster("local")
    val sc = new SparkContext(conf)
    val textFile = sc.textFile("E:\\hadoop\\spark\\WordCount\\InPut\\person.log")
    // Parse each URL into ((subject, person), 1)
    val subJectOne: RDD[((String, String), Int)] = textFile.map(x => {
      val i = x.lastIndexOf("/")
      val person = x.substring(i + 1)
      val urlString = x.substring(0, i)
      val host = new URL(urlString).getHost
      val subject = host.split("\\.")(0)
      ((subject, person), 1)
    })
    // Aggregate counts per (subject, person)
    val reducer: RDD[((String, String), Int)] = subJectOne.reduceByKey(_ + _)
    // For each subject, keep only its records and take the top 3 by count
    for (sb <- subjects) {
      val filtered: RDD[((String, String), Int)] = reducer.filter(_._1._1 == sb)
      val tuples: Array[((String, String), Int)] = filtered.sortBy(_._2, false).take(3)
      println(tuples.toBuffer)
    }
    sc.stop()
  }
}
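For reference, if person.log contains exactly the 16 sample lines above (the counts quoted in GroupDemo's comments come from a larger version of the file), the loop prints one buffer per subject, in the order of the subjects array:

ArrayBuffer(((bigdata,laozhao),5), ((bigdata,laoduan),2))
ArrayBuffer(((php,laoli),3), ((php,laoliu),1))
ArrayBuffer(((javaee,laoyang),3), ((javaee,xiaoxu),2))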
GroupDemo: grouping practice
package com.zpark.stu.Transformation_Action

import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GroupDemo {
  def main(args: Array[String]): Unit = {
    /**
     * Data source: http://bigdata.edu360.cn/laozhang
     * Goal: extract the subject and user name from http://bigdata.edu360.cn/laozhang as (bigdata, laozhang), then group and aggregate.
     * 1. Get a SparkContext
     * 2. Read the data source
     * 3. Parse each line with map
     * 4. Aggregate with reduceByKey
     * 5. Group the aggregated records
     * 6. Print the result
     */
    // 1. SparkContext
    val conf: SparkConf = new SparkConf()
    conf.setAppName("GroupDemo")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    // 2. Read the data
    val lines: RDD[String] = sc.textFile("E:\\hadoop\\spark\\WordCount\\InPut\\person.log")
    // 3. Parse the data
    val jobPersonOne: RDD[((String, String), Int)] = lines.map(line => {
      val i: Int = line.lastIndexOf("/")
      // Extract the user name: laozhang
      val person: String = line.substring(i + 1)
      // Take the substring from the start up to i: http://bigdata.edu360.cn
      val userString: String = line.substring(0, i)
      // Extract bigdata.edu360.cn
      val url = new URL(userString)
      val host: String = url.getHost
      // Split bigdata.edu360.cn on "."
      val strings: Array[String] = host.split("\\.")
      // Take the first element of the split
      val job: String = strings(0)
      // Emit ((bigdata, laozhang), 1) for the next stage
      ((job, person), 1)
    })
    // 4. Aggregate with reduceByKey
    // Aggregated data: ((javaee,xiaoxu),6)((php,laoliu),1)((bigdata,laozhang),2)((bigdata,laozhao),15)((javaee,laoyang),9)((php,laoli),3)((bigdata,laoduan),6)
    val reduceValue: RDD[((String, String), Int)] = jobPersonOne.reduceByKey(_ + _)
    // 5. Group the aggregated records by subject
    // Grouped result: (javaee,CompactBuffer(((javaee,xiaoxu),6), ((javaee,laoyang),9)))(php,CompactBuffer(((php,laoliu),1), ((php,laoli),3)))(bigdata,CompactBuffer(((bigdata,laozhang),2), ((bigdata,laozhao),15), ((bigdata,laoduan),6)))
    val grouped: RDD[(String, Iterable[((String, String), Int)])] = reduceValue.groupBy(_._1._1)
    val mapValue: RDD[(String, List[((String, String), Int)])] = grouped.mapValues(x => {
      // Convert to a List so it can be sorted
      val list: List[((String, String), Int)] = x.toList
      // sortBy(_._2) sorts by the count; sortBy(_.swap) would sort by (count, (job, person)) instead
      val tuples: List[((String, String), Int)] = list.sortBy(_._2).reverse
      tuples
    })
    mapValue.foreach(y => println("Grouped and aggregated: " + y))
    sc.stop()
  }
}
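If only the top 3 per subject are needed, the sort-and-take can happen inside mapValues instead of materializing the full sorted list; a minimal sketch reusing the grouped RDD from above:

// Keep only the 3 highest counts within each subject group
val top3: RDD[(String, List[((String, String), Int)])] =
  grouped.mapValues(_.toList.sortBy(-_._2).take(3))
top3.foreach(println)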
MypartitionerDemo: custom partitioner
package com.zpark.stu.Transformation_Action

import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

object MypartitionerDemo {
  /**
   * Goal: partition the data by subject.
   * @param args
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MypartitionerDemo").setMaster("local")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("E:\\hadoop\\spark\\WordCount\\InPut\\person.log")
    // Parse each URL into ((subject, person), 1)
    val mapOne: RDD[((String, String), Int)] = lines.map(x => {
      val i = x.lastIndexOf("/")
      val person = x.substring(i + 1)
      val urlString = x.substring(0, i)
      val host = new URL(urlString).getHost
      val strings = host.split("\\.")
      val job = strings(0)
      ((job, person), 1)
    })
    // mapOne.foreach(y => println(y))
    // Aggregate
    val reducer: RDD[((String, String), Int)] = mapOne.reduceByKey((x, y) => x + y)
    // reducer.foreach(y => println(y))
    // Sort by count, descending
    val sortBy: RDD[((String, String), Int)] = reducer.sortBy(_._2, false)
    // sortBy.foreach(y => println(y))
    // Reshape to (String, (String, Int)) so the subject becomes the key
    val mapTwo: RDD[(String, (String, Int))] = sortBy.map(line => {
      val job: String = line._1._1
      val person: String = line._1._2
      val count: Int = line._2
      (job, (person, count))
    })
    // mapTwo.foreach(f => println(f))
    // Repartition by subject with the custom partitioner
    val myPartition: RDD[(String, (String, Int))] = mapTwo.partitionBy(new MyPartitioner(3))
    myPartition.foreach(y => println(y))
    sc.stop()
  }
}
class MyPartitioner(num: Int) extends Partitioner {
  override def numPartitions: Int = num

  override def getPartition(key: Any): Int = {
    // Match bigdata / php / javaee to fixed partitions
    // if (key.toString.equals("bigdata")) {
    //   0
    // } else if (key.toString.equals("php")) {
    //   1
    // } else {
    //   2
    // }
    // println("key.toString: " + key.toString)
    // key.toString.length % num
    // Pattern matching
    matchTest(key.toString)
  }

  def matchTest(x: String): Int = x match {
    case "bigdata" => 0
    case "php"     => 1
    case _         => 2
  }
}
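To confirm that each subject really landed in its intended partition, one option (an illustrative addition, not part of the original demo) is mapPartitionsWithIndex, which tags every record with its partition index; appended to main before sc.stop():

// Prints (partitionIndex, (job, (person, count))) for every record
val tagged = myPartition.mapPartitionsWithIndex((idx, it) => it.map(record => (idx, record)))
tagged.foreach(println)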
Mypartitioner2Demo: custom partitioner built from the data
package com.zpark.stu.Transformation_Action

import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

import scala.collection.mutable

object Mypartitioner2Demo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Mypartitioner2Demo").setMaster("local")
    val sc = new SparkContext(conf)
    val textrdd: RDD[String] = sc.textFile("E:\\hadoop\\spark\\WordCount\\InPut\\person.log")
    // Parse each URL into ((subject, person), 1)
    val maprdd: RDD[((String, String), Int)] = textrdd.map(x => {
      val i = x.lastIndexOf("/")
      val person = x.substring(i + 1)
      val http = x.substring(0, i)
      val job = new URL(http).getHost.split("\\.")(0)
      ((job, person), 1)
    })
    // Collect the distinct subjects so the partitioner can be built from the data
    val maprddtwo: Array[String] = maprdd.map(_._1._1).distinct().collect()
    val partitioner = new MyPartitioner1(maprddtwo)
    // Aggregate and partition in a single shuffle
    val reducerdd: RDD[((String, String), Int)] = maprdd.reduceByKey(partitioner, _ + _)
    // Sort each partition by count, descending
    val iteratorrdd: RDD[((String, String), Int)] = reducerdd.mapPartitions(it => {
      it.toList.sortBy(_._2).reverse.iterator
    })
    iteratorrdd.foreach(y => println(y))
    sc.stop()
  }
}
class MyPartitioner1(jobs: Array[String]) extends Partitioner {
  // Map each subject to its own partition index
  val rules = new mutable.HashMap[String, Int]()
  var i = 0
  for (job <- jobs) {
    rules.put(job, i)
    i += 1
  }

  override def numPartitions: Int = jobs.length

  override def getPartition(key: Any): Int = {
    // The key here is the (job, person) tuple; partition by job
    val job = key.asInstanceOf[(String, String)]._1
    rules(job)
  }
}
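The rules map built with a var counter works, but the same job-to-index mapping can be built immutably with zipWithIndex; an equivalent sketch (the name MyPartitioner2 is illustrative):

class MyPartitioner2(jobs: Array[String]) extends Partitioner {
  // e.g. Array("bigdata", "php", "javaee") => Map(bigdata -> 0, php -> 1, javaee -> 2)
  private val rules: Map[String, Int] = jobs.zipWithIndex.toMap
  override def numPartitions: Int = jobs.length
  override def getPartition(key: Any): Int = rules(key.asInstanceOf[(String, String)]._1)
}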