Data source
http://bigdata.edu360.cn/laozhang
http://bigdata.edu360.cn/laozhang
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laoduan
http://bigdata.edu360.cn/laoduan
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laoduan
http://bigdata.edu360.cn/laoduan
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laoduan
http://bigdata.edu360.cn/laoduan
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://php.edu360.cn/laoli
http://php.edu360.cn/laoliu
http://php.edu360.cn/laoli
http://php.edu360.cn/laoli
Test 1: Extract the name from a URL
package com.grace.scalawc

/**
 * Extract the name from a URL
 */
object wcTest {
  def main(args: Array[String]): Unit = {
    val sr: String = "http://bigdata.edu360.cn/laozhang"
    val x: Int = sr.lastIndexOf("/")
    val m: String = sr.substring(x + 1)
    print(m)
    // Result: laozhang
  }
}
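The same name can also be pulled out with java.net.URL instead of manual index arithmetic. A minimal sketch (the object name wcTestUrl is only for illustration; the URL literal is the same sample line as above):

package com.grace.scalawc

import java.net.URL

object wcTestUrl {
  def main(args: Array[String]): Unit = {
    val url = new URL("http://bigdata.edu360.cn/laozhang")
    // getPath returns "/laozhang"; strip the leading slash to get the name
    val name = url.getPath.stripPrefix("/")
    print(name)
    // Result: laozhang
  }
}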
Test 2: Group by name and count
package com.grace.scalawc

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Group by name and count occurrences
 */
object wcTest2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wcTest2").setMaster("local")
    val sc = new SparkContext(conf)
    val lines = sc.textFile(args(0))
    val wordAndOne: RDD[(String, Int)] = lines.map(line => {
      val i = line.lastIndexOf("/")
      val person = line.substring(i + 1)
      (person, 1)
    })
    val reduced: RDD[(String, Int)] = wordAndOne.reduceByKey((x, y) => x + y)
    val sorted: RDD[(String, Int)] = reduced.sortBy(_._2, false)
    val tuples = sorted.collect()
    print("........................." + tuples.toBuffer)
    sc.stop()
    // Result: .........................ArrayBuffer((laozhao,15), (laoyang,9), (laoduan,6), (xiaoxu,6), (laoli,3), (laozhang,2), (laoliu,1))
  }
}
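If only the top few names are needed on the driver, the full sortBy plus collect can be replaced with RDD.top and an ordering on the count. A small sketch that assumes it continues wcTest2 and reuses the `reduced` RDD from above:

// Fetch the 3 largest counts directly to the driver (a sketch)
val top3: Array[(String, Int)] = reduced.top(3)(Ordering.by[(String, Int), Int](_._2))
println(top3.toBuffer)
// Expected to contain (laozhao,15), (laoyang,9) and one of the names with count 6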
Test 3: Top names by count within each job
package com.grace.scalawc

import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Top names by count within each job
 * Desired record format: ((javaee,xiaoxu),6)
 */
object wcTest3 {
  def main(args: Array[String]): Unit = {
    /* Sample of the data source:
    http://bigdata.edu360.cn/laozhang
    http://bigdata.edu360.cn/laozhao
    http://javaee.edu360.cn/xiaoxu
    */
    // Spark configuration
    val conf: SparkConf = new SparkConf()
    conf.setAppName("wcTest3")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    // Specify the input file path
    // val lines: RDD[String] = sc.textFile(args(0)) // Option 1
    val lines: RDD[String] = sc.textFile("D:\\Study\\实训\\Java基础\\hadoop\\spark\\person.log") // Option 2
    // map(): first processing step; key = (job, person), value = 1
    val personJobAndOne: RDD[((String, String), Int)] = lines.map(line => {
      val i: Int = line.lastIndexOf("/")
      val person: String = line.substring(i + 1)
      val urlString: String = line.substring(0, i)
      val url: URL = new URL(urlString)
      val hostname: String = url.getHost
      val str: Array[String] = hostname.split("\\.")
      val job: String = str(0)
      ((job, person), 1)
//ArrayBuffer(((bigdata,laozhang),1), ((bigdata,laozhang),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laoduan),1), ((bigdata,laoduan),1), ((javaee,xiaoxu),1), ((javaee,xiaoxu),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laoduan),1), ((bigdata,laoduan),1), ((javaee,xiaoxu),1), ((javaee,xiaoxu),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laoduan),1), ((bigdata,laoduan),1), ((javaee,xiaoxu),1), ((javaee,xiaoxu),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((php,laoli),1), ((php,laoliu),1), ((php,laoli),1), ((php,laoli),1))
    })
    // Aggregate: sum the counts for identical (job, person) keys, e.g. (bigdata,laozhang)
    val keySame: RDD[((String, String), Int)] = personJobAndOne.reduceByKey(_ + _) // Option 1
    // personJobAndOne.reduceByKey((x, y) => x + y) // Option 2
//ArrayBuffer(((javaee,xiaoxu),6), ((php,laoliu),1), ((bigdata,laozhang),2), ((bigdata,laozhao),15), ((javaee,laoyang),9), ((php,laoli),3), ((bigdata,laoduan),6))
    // Group by job: in ((String, String), Int), _1 is the (job, person) tuple and _1._1 is the job String
    val jobGroup: RDD[(String, Iterable[((String, String), Int)])] = keySame.groupBy(_._1._1)
//ArrayBuffer((javaee,CompactBuffer(((javaee,xiaoxu),6), ((javaee,laoyang),9))), (php,CompactBuffer(((php,laoliu),1), ((php,laoli),3))), (bigdata,CompactBuffer(((bigdata,laozhang),2), ((bigdata,laozhao),15), ((bigdata,laoduan),6))))
    // Keys stay unchanged; process each Iterable[((String, String), Int)] value
    val valueList: RDD[(String, List[((String, String), Int)])] = jobGroup.mapValues(x => {
      val valueList: List[((String, String), Int)] = x.toList
      // Sort by the count: _ is ((String, String), Int), so _._2 is the Int count
      val valueGroup: List[((String, String), Int)] = valueList.sortBy(_._2)
      // Reverse to get descending order
      val valueReverse: List[((String, String), Int)] = valueGroup.reverse
//ArrayBuffer((javaee,List(((javaee,laoyang),9), ((javaee,xiaoxu),6))), (php,List(((php,laoli),3), ((php,laoliu),1))), (bigdata,List(((bigdata,laozhao),15), ((bigdata,laoduan),6), ((bigdata,laozhang),2))))
      // Take the top 2 per job
      val three: List[((String, String), Int)] = valueReverse.take(2)
      // Important: `three` must be the last expression so mapValues returns it
      three
//ArrayBuffer((javaee,List(((javaee,laoyang),9), ((javaee,xiaoxu),6))), (php,List(((php,laoli),3), ((php,laoliu),1))), (bigdata,List(((bigdata,laozhao),15), ((bigdata,laoduan),6))))
    })
    val tuples = valueList.collect()
    System.out.println(tuples.toBuffer)
    sc.stop()
  }
}
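For reference, the reduce, group, sort, and take steps above can be collapsed into one chain. This is only a sketch and assumes the `personJobAndOne` RDD defined in wcTest3; it should produce the same per-job top 2 on the same input:

// Condensed equivalent of reduceByKey + groupBy + mapValues (a sketch)
val top2PerJob: RDD[(String, List[((String, String), Int)])] = personJobAndOne
  .reduceByKey(_ + _)
  .groupBy(_._1._1)
  .mapValues(_.toList.sortBy(-_._2).take(2))
println(top2PerJob.collect().toBuffer)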
Test 4: Filter: top three for each job
package com.grace.scalawc

import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Filter: top three for each job
 */
object PersonFilter {
  private val jobs: Array[String] = Array("javaee", "php", "bigdata")

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("PersonFilter")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile(args(0))
    val personAndJob: RDD[((String, String), Int)] = lines.map(line => {
      val i: Int = line.lastIndexOf("/")
      val person: String = line.substring(i + 1)
      val urlStr: String = line.substring(0, i)
      val urlString: URL = new URL(urlStr)
      val urlhost: String = urlString.getHost
      val jobString: Array[String] = urlhost.split("\\.")
      val job: String = jobString(0)
      ((job, person), 1)
    })
    val reduced: RDD[((String, String), Int)] = personAndJob.reduceByKey(_ + _)
    // ArrayBuffer(((javaee,xiaoxu),6), ((php,laoliu),1), ((bigdata,laozhang),2), ((bigdata,laozhao),15), ((javaee,laoyang),9), ((php,laoli),3), ((bigdata,laoduan),6))
    for (job <- jobs) {
      val filter: RDD[((String, String), Int)] = reduced.filter(_._1._1 == job)
      // Sort descending by count, then take the top 3 for this job
      val top3: Array[((String, String), Int)] = filter.sortBy(_._2, false).take(3)
      println(top3.toBuffer)
      // ArrayBuffer(((javaee,laoyang),9), ((javaee,xiaoxu),6))
      // ArrayBuffer(((php,laoli),3), ((php,laoliu),1))
      // ArrayBuffer(((bigdata,laozhao),15), ((bigdata,laoduan),6), ((bigdata,laozhang),2))
    }
    // val tuples = reduced.collect()
    // System.out.println(tuples.toBuffer)
    sc.stop()
  }
}
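The `jobs` array above is hard-coded. It can also be derived from the data itself, which is exactly what Test 5 does below; a one-line sketch reusing the `reduced` RDD from PersonFilter:

// Collect the distinct jobs from the data instead of hard-coding them (a sketch)
val jobsFromData: Array[String] = reduced.map(_._1._1).distinct().collect()
// e.g. Array(javaee, php, bigdata)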
Test 5: Custom partitioner (1)
package com.grace.scalawc

import java.io.File
import java.net.URL

import org.apache.commons.io.FileUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

import scala.collection.mutable

/**
 * Custom partitioning by job
 */
object PersonPartitioner {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("PersonPartitioner")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("D:\\Study\\实训\\Java基础\\hadoop\\spark\\person.log")
    val personAndJob: RDD[((String, String), Int)] = lines.map(line => {
      val i: Int = line.lastIndexOf("/")
      val person: String = line.substring(i + 1)
      val urlStr: String = line.substring(0, i)
      val urlString: URL = new URL(urlStr)
      val urlhost: String = urlString.getHost
      val jobString: Array[String] = urlhost.split("\\.")
      val job: String = jobString(0)
      ((job, person), 1)
    })
    // jobs: (javaee, php, bigdata)
    val job: Array[String] = personAndJob.map(_._1._1).distinct().collect()
    val ownPartitioner: MyPartitioner2 = new MyPartitioner2(job)
    // reduceByKey with the custom partitioner, so each job ends up in its own partition
    val reduced: RDD[((String, String), Int)] = personAndJob.reduceByKey(ownPartitioner, _ + _)
//.........reduced: ArrayBuffer(((javaee,xiaoxu),6), ((php,laoliu),1), ((bigdata,laozhang),2), ((bigdata,laozhao),15), ((javaee,laoyang),9), ((php,laoli),3), ((bigdata,laoduan),6))
    // Sort the records of each partition by count in descending order (within partitions only)
    val sorted: RDD[((String, String), Int)] = reduced.mapPartitions(it => {
      it.toList.sortBy(_._2).reverse.iterator
    })
//.........sorted: ArrayBuffer(((bigdata,laozhao),15), ((javaee,laoyang),9), ((bigdata,laoduan),6), ((javaee,xiaoxu),6), ((php,laoli),3), ((bigdata,laozhang),2), ((php,laoliu),1))
    val tuplesArr: Array[((String, String), Int)] = sorted.collect()
//.........tuplesArr: ArrayBuffer(((bigdata,laozhao),15), ((javaee,laoyang),9), ((bigdata,laoduan),6), ((javaee,xiaoxu),6), ((php,laoli),3), ((bigdata,laozhang),2), ((php,laoliu),1))
    // Write the results to files; delete the output directory first if it already exists
    val file = new File("D:\\personAndJob.log")
    if (file.exists()) {
      FileUtils.deleteDirectory(file)
    }
    sorted.saveAsTextFile("D:\\personAndJob.log")
    // val tuples = sorted.collect()
    // System.out.println(".........tuples: " + tuplesArr.toBuffer)
    sc.stop()
  }
}
class MyPartitioner2(jobs: Array[String]) extends Partitioner {
  // print("...jobs: " + jobs.toBuffer)
  // ...jobs: ArrayBuffer(javaee, php, bigdata)
  // Build the job -> partition-index rules, e.g. Map(javaee -> 0, php -> 1, bigdata -> 2)
  private val rules: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
  var i = 0
  for (job <- jobs) {
    rules.put(job, i)
    i += 1
    // print("...rules: " + rules)
    // ...rules: Map(javaee -> 0)...rules: Map(javaee -> 0, php -> 1)...rules: Map(javaee -> 0, php -> 1, bigdata -> 2)
  }

  override def numPartitions: Int = jobs.length

  override def getPartition(key: Any): Int = {
    // key is a key of `reduced`, i.e. a (job, person) tuple
    // print("...key: " + key.toString)
//...key: (bigdata,laozhang)...key: (bigdata,laozhang)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laoduan)...key: (bigdata,laoduan)...key: (javaee,xiaoxu)...key: (javaee,xiaoxu)...key: (javaee,laoyang)...key: (javaee,laoyang)...key: (javaee,laoyang)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laoduan)...key: (bigdata,laoduan)...key: (javaee,xiaoxu)...key: (javaee,xiaoxu)...key: (javaee,laoyang)...key: (javaee,laoyang)...key: (javaee,laoyang)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laozhao)...key: (bigdata,laoduan)...key: (bigdata,laoduan)...key: (javaee,xiaoxu)...key: (javaee,xiaoxu)...key: (javaee,laoyang)...key: (javaee,laoyang)...key: (javaee,laoyang)...key: (php,laoli)...key: (php,laoliu)...key: (php,laoli)...key: (php,laoli)
    val job: String = key.asInstanceOf[(String, String)]._1
    // print("...rulesJob: " + job)
    // ...rulesJob: printed once per record, in the same order as the keys above:
    // 9 x bigdata, 5 x javaee, 7 x bigdata, 5 x javaee, 7 x bigdata, 5 x javaee, 4 x php
    // (interleaved Spark log: 19/11/19 16:13:03 INFO Executor: Finished task 0.0 in stage 2.0 (TID 2). 1027 bytes result sent to driver)
    rules(job)
  }
}
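A quick sanity check of how MyPartitioner2 routes keys, assuming the jobs array observed in the debug output above (javaee, php, bigdata):

val p = new MyPartitioner2(Array("javaee", "php", "bigdata"))
println(p.numPartitions)                        // 3
println(p.getPartition(("javaee", "xiaoxu")))   // 0
println(p.getPartition(("php", "laoli")))       // 1
println(p.getPartition(("bigdata", "laozhao"))) // 2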
The partition output files appear under the D:\personAndJob.log directory. Partitioning succeeded!
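Since each job maps to its own partition, saveAsTextFile should write one part file per partition plus a _SUCCESS marker; a sketch of the expected layout (not captured output):

// D:\personAndJob.log\
//   part-00000   javaee records
//   part-00001   php records
//   part-00002   bigdata records
//   _SUCCESS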
Test 5: Custom partitioner (2)
package com.grace.scalawc

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

object FavPerson3Partitioner {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("FavPerson3Partitioner").setMaster("local"))
    val data = Array(("aaa", 2), ("aaa", 3), ("aaa", 1), ("aaa", 0), ("aaa", 4),
      ("aa", 2), ("aa", 3), ("aa", 1), ("aa", 0), ("aa", 4),
      ("a", 2), ("a", 3), ("a", 1), ("a", 0), ("a", 4))
    val dataRdd: RDD[(String, Int)] = sc.parallelize(data)
    // Repartition by key using the custom partitioner (3 partitions)
    val res: RDD[(String, Int)] = dataRdd.partitionBy(new MyPartitioner(3))
    // Delete the output directory if it already exists, then save
    val file = new File("D:\\log.log")
    if (file.exists()) {
      FileUtils.deleteDirectory(file)
    }
    res.saveAsTextFile("D:\\log.log")
    sc.stop()
  }
}

class MyPartitioner(num: Int) extends Partitioner {
  override def numPartitions: Int = num

  // Route each key to a partition based on the length of its string form
  override def getPartition(key: Any): Int = {
    key.toString.length % num
  }
}
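How MyPartitioner(3) routes the sample keys: the partition index is the key's string length modulo 3. A quick check (a sketch; the part-file names follow Spark's usual saveAsTextFile naming):

val p = new MyPartitioner(3)
println(p.getPartition("aaa")) // 3 % 3 = 0 -> part-00000
println(p.getPartition("a"))   // 1 % 3 = 1 -> part-00001
println(p.getPartition("aa"))  // 2 % 3 = 2 -> part-00002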
The partition output files appear in the D:\log.log folder. Partitioning succeeded!