WordCount examples in Spark

Data source

http://bigdata.edu360.cn/laozhang
http://bigdata.edu360.cn/laozhang
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laoduan
http://bigdata.edu360.cn/laoduan
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laoduan
http://bigdata.edu360.cn/laoduan
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laoduan
http://bigdata.edu360.cn/laoduan
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://javaee.edu360.cn/laoyang
http://php.edu360.cn/laoli
http://php.edu360.cn/laoliu
http://php.edu360.cn/laoli
http://php.edu360.cn/laoli

Test 1: extract the name

package com.grace.scalawc

/**
 * Extract the name from a URL
 */
object wcTest {
  def main(args: Array[String]): Unit = {
    val sr:String = "http://bigdata.edu360.cn/laozhang"

    val x:Int = sr.lastIndexOf("/")
    val m:String = sr.substring(x+1)
    print(m)
    //Result: laozhang
  }
}
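
As a quick sanity check of the parsing step, here is a minimal local sketch (no Spark needed) that does the same extraction with split instead of lastIndexOf + substring; the object name wcTestSplit is made up for illustration.

package com.grace.scalawc

/**
 * Minimal sketch: the same name extraction written with split,
 * applied to a few of the sample lines above
 */
object wcTestSplit {
  def main(args: Array[String]): Unit = {
    val lines = Seq(
      "http://bigdata.edu360.cn/laozhang",
      "http://javaee.edu360.cn/xiaoxu")
    // split on "/" and keep the last segment, equivalent to lastIndexOf + substring
    lines.map(_.split("/").last).foreach(println)
    // prints: laozhang, xiaoxu
  }
}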

Test 2: group by name

package com.grace.scalawc

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Group by name
 */
object wcTest2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wcTest2").setMaster("local")
    val sc = new SparkContext(conf)
    val lines = sc.textFile(args(0))
    val wordAndOne: RDD[(String, Int)] = lines.map(line => {

      val i = line.lastIndexOf("/")
      val person = line.substring(i+1)
      (person,1)
    })
    val reduced: RDD[(String, Int)] = wordAndOne.reduceByKey((x,y) => x+y)
    val sorted: RDD[(String, Int)] = reduced.sortBy(_._2,false)
    val tuples = sorted.collect()

    print("........................."+tuples.toBuffer)
    sc.stop()
    //Result: .........................ArrayBuffer((laozhao,15), (laoyang,9), (laoduan,6), (xiaoxu,6), (laoli,3), (laozhang,2), (laoliu,1))
  }
}
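
Since wcTest2 reads its input path from args(0), a small hedged tweak makes it runnable straight from the IDE as well as with spark-submit: fall back to the local person.log path used later in this post when no argument is supplied. A sketch of the replacement for the sc.textFile line:

    // sketch: use args(0) when provided, otherwise fall back to a local file path
    val inputPath = if (args.nonEmpty) args(0) else "D:\\Study\\实训\\Java基础\\hadoop\\spark\\person.log"
    val lines = sc.textFile(inputPath)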

Test 3: top names within the same job

package com.grace.scalawc

import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Top names within the same job
 * Desired record shape: ((javaee,xiaoxu),6)
 */
object wcTest3 {
  def main(args: Array[String]): Unit = {
    /*  partial data source
      http://bigdata.edu360.cn/laozhang
      http://bigdata.edu360.cn/laozhao
      http://javaee.edu360.cn/xiaoxu
        */
  //    Spark configuration
    val conf: SparkConf = new SparkConf()
    conf.setAppName("wcTest3")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
//  specify the input file path
//    val lines: RDD[String] = sc.textFile(args(0)) //Option 1
    val lines: RDD[String] = sc.textFile("D:\\Study\\实训\\Java基础\\hadoop\\spark\\person.log")//Option 2
    //map(): first transformation; (job, person) becomes the key and 1 the value (counts such as 6 come from the later reduceByKey)
    val personJobAndOne: RDD[((String, String), Int)] = lines.map(line => {
      val i: Int = line.lastIndexOf("/")
      val person: String = line.substring(i + 1)
      val urlString: String = line.substring(0, i)
      val url: URL = new URL(urlString)
      val hostname: String = url.getHost
      val str: Array[String] = hostname.split("\\.")
      val job: String = str(0)
      ((job, person), 1)
//ArrayBuffer(((bigdata,laozhang),1), ((bigdata,laozhang),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laoduan),1), ((bigdata,laoduan),1), ((javaee,xiaoxu),1), ((javaee,xiaoxu),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laoduan),1), ((bigdata,laoduan),1), ((javaee,xiaoxu),1), ((javaee,xiaoxu),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laozhao),1), ((bigdata,laoduan),1), ((bigdata,laoduan),1), ((javaee,xiaoxu),1), ((javaee,xiaoxu),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((javaee,laoyang),1), ((php,laoli),1), ((php,laoliu),1), ((php,laoli),1), ((php,laoli),1))
    })
    //aggregate: merge values that share the same key, e.g. (bigdata,laozhang)
    val keySame: RDD[((String, String), Int)] = personJobAndOne.reduceByKey(_+_)//Option 1
//    personJobAndOne.reduceByKey((x,y) =>  x+y)//Option 2
    //ArrayBuffer(((javaee,xiaoxu),6), ((php,laoliu),1), ((bigdata,laozhang),2), ((bigdata,laozhao),15), ((javaee,laoyang),9), ((php,laoli),3), ((bigdata,laoduan),6))
    //group by the job (e.g. javaee): _ is ((String, String), Int), _._1 is the (String, String) pair and _._1._1 is its first String (the job)
    val jobGroup: RDD[(String, Iterable[((String, String), Int)])] = keySame.groupBy(_._1._1)
    //ArrayBuffer((javaee,CompactBuffer(((javaee,xiaoxu),6), ((javaee,laoyang),9))), (php,CompactBuffer(((php,laoliu),1), ((php,laoli),3))), (bigdata,CompactBuffer(((bigdata,laozhang),2), ((bigdata,laozhao),15), ((bigdata,laoduan),6))))
        //keep the key; process each Iterable[((String, String), Int)]
    val valueList: RDD[(String, List[((String, String), Int)])] = jobGroup.mapValues(x => {
          val valueList: List[((String, String), Int)] = x.toList
          //sort by value: _ is ((String, String), Int) and _._2 is the Int count
          val valueGroup: List[((String, String), Int)] = valueList.sortBy(_._2)
          //reverse to descending order
          val valueReverse: List[((String, String), Int)] = valueGroup.reverse
          //ArrayBuffer((javaee,List(((javaee,laoyang),9), ((javaee,xiaoxu),6))), (php,List(((php,laoli),3), ((php,laoliu),1))), (bigdata,List(((bigdata,laozhao),15), ((bigdata,laoduan),6), ((bigdata,laozhang),2))))
          //take the top 2 (this example keeps two per job, even though the heading says three)
          val three: List[((String, String), Int)] = valueReverse.take(2)
          //important: return three here; as the last expression it is the value passed downstream
          three
    //ArrayBuffer((javaee,List(((javaee,laoyang),9), ((javaee,xiaoxu),6))), (php,List(((php,laoli),3), ((php,laoliu),1))), (bigdata,List(((bigdata,laozhao),15), ((bigdata,laoduan),6))))          
    })
    val tuples = valueList.collect()
    System.out.println(tuples.toBuffer)
    sc.stop()
  }
}
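
The result of wcTest3 is one record per job holding a List of its top entries. If a flat RDD of ((job, person), count) records is more convenient downstream, a hedged sketch of how the grouped result could be flattened with flatMapValues, added before sc.stop():

    // sketch: flatten (job, List[((job, person), count)]) back into individual ((job, person), count) records
    val flattened: RDD[((String, String), Int)] = valueList.flatMapValues(list => list).map(_._2)
    println(flattened.collect().toBuffer)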

Test 4: filter: top three per job

package com.grace.scalawc

import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}


/**
 * Filter: top three per job
 */
object PersonFilter {
  private val jobs: Array[String] = Array("javaee","php","bigdata")
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("PersonFilter")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile(args(0))
    val personAndJob: RDD[((String, String), Int)] = lines.map(line => {
      val i: Int = line.lastIndexOf("/")
      val person: String = line.substring(i + 1)
      val urlStr: String = line.substring(0, i)
      val urlString: URL = new URL(urlStr)
      val urlhost: String = urlString.getHost
      val jobString: Array[String] = urlhost.split("\\.")
      val job: String = jobString(0)
      ((job, person), 1)
    })
    val reduced: RDD[((String, String), Int)] = personAndJob.reduceByKey(_+_)
    //ArrayBuffer(((javaee,xiaoxu),6), ((php,laoliu),1), ((bigdata,laozhang),2), ((bigdata,laozhao),15), ((javaee,laoyang),9), ((php,laoli),3), ((bigdata,laoduan),6))
    for (job <- jobs){
      val filter: RDD[((String, String), Int)] = reduced.filter(_._1._1 == job)
      //sort by count in descending order and keep the top 3
      val top3: Array[((String, String), Int)] = filter.sortBy(_._2, false).take(3)
      println(top3.toBuffer)
      //ArrayBuffer(((javaee,laoyang),9), ((javaee,xiaoxu),6))
      //ArrayBuffer(((php,laoli),3), ((php,laoliu),1))
      //ArrayBuffer(((bigdata,laozhao),15), ((bigdata,laoduan),6), ((bigdata,laozhang),2))
    }

//    val tuples = reduced.collect()
//    System.out.println(tuples.toBuffer)
    sc.stop()
  }
}
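
One caveat of PersonFilter is that the job names are hardcoded in the jobs array. A hedged sketch of deriving them from the data instead, computed inside main once reduced is available (the same approach the partitioner example below uses):

    // sketch: collect the distinct job names from the data instead of hardcoding them
    val jobs: Array[String] = reduced.map(_._1._1).distinct().collect()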

Test 5: custom partitioner (part 1)

package com.grace.scalawc

import java.io.File
import java.net.URL

import org.apache.commons.io.FileUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

import scala.collection.mutable

/**
 * Custom partitioner
 */
object PersonPartitioner {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("PersonPartitioner")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("D:\\Study\\实训\\Java基础\\hadoop\\spark\\person.log")
    val personAndJob: RDD[((String, String), Int)] = lines.map(line => {
      val i: Int = line.lastIndexOf("/")
      val person: String = line.substring(i + 1)
      val urlStr: String = line.substring(0, i)
      val urlString: URL = new URL(urlStr)
      val urlhost: String = urlString.getHost
      val jobString: Array[String] = urlhost.split("\\.")
      val job: String = jobString(0)
      ((job, person), 1)
    })


//  jobs:(javaee,php,bigdata)
    val job: Array[String] = personAndJob.map(_._1._1).distinct().collect()
    val ownPartitioner: MyPartitioner2 = new MyPartitioner2(job)

    val reduced: RDD[((String, String), Int)] = personAndJob.reduceByKey(ownPartitioner,_+_)
    //.........reduced:    ArrayBuffer(((javaee,xiaoxu),6), ((php,laoliu),1), ((bigdata,laozhang),2), ((bigdata,laozhao),15), ((javaee,laoyang),9), ((php,laoli),3), ((bigdata,laoduan),6))

    val sorted: RDD[((String, String), Int)] = reduced.mapPartitions(it => {
      it.toList.sortBy(_._2).reverse.iterator
    })
    //.........sorted:    ArrayBuffer(((bigdata,laozhao),15), ((javaee,laoyang),9), ((bigdata,laoduan),6), ((javaee,xiaoxu),6), ((php,laoli),3), ((bigdata,laozhang),2), ((php,laoliu),1))
    val tuplesArr: Array[((String, String), Int)] = sorted.collect()
//.........tuplesArr:    ArrayBuffer(((bigdata,laozhao),15), ((javaee,laoyang),9), ((bigdata,laoduan),6), ((javaee,xiaoxu),6), ((php,laoli),3), ((bigdata,laozhang),2), ((php,laoliu),1))
//    write the result to an output directory
    val file = new File("D:\\personAndJob.log")
    if(file.exists()) {
      FileUtils.deleteDirectory(file)
    }
    sorted.saveAsTextFile("D:\\personAndJob.log")



//        val tuples = sorted.collect()
//        System.out.println(".........tuples:    "+tuplesArr.toBuffer)
    sc.stop()
  }
}

class MyPartitioner2(jobs:Array[String]) extends Partitioner{
//  print("...jobs:    "+jobs.toBuffer)
  //...jobs:    ArrayBuffer(javaee, php, bigdata)
  private val rules: mutable.HashMap[String, Int] = new mutable.HashMap[String,Int]()

  var i = 0
  for (job <- jobs){
    rules.put(job,i)
    i += 1
//    print("...rules:    "+rules)
    //...rules:    Map(javaee -> 0)...rules:    Map(javaee -> 0, php -> 1)...rules:    Map(javaee -> 0, php -> 1, bigdata -> 2)
  }

  override def numPartitions: Int = jobs.length

  override def getPartition(key: Any): Int = {
//    key is the key of reduced: RDD[((String, String), Int)]
//    print("...key:  "+key.toString)
//...key:  (bigdata,laozhang)...key:  (bigdata,laozhang)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laoduan)...key:  (bigdata,laoduan)...key:  (javaee,xiaoxu)...key:  (javaee,xiaoxu)...key:  (javaee,laoyang)...key:  (javaee,laoyang)...key:  (javaee,laoyang)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laoduan)...key:  (bigdata,laoduan)...key:  (javaee,xiaoxu)...key:  (javaee,xiaoxu)...key:  (javaee,laoyang)...key:  (javaee,laoyang)...key:  (javaee,laoyang)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laozhao)...key:  (bigdata,laoduan)...key:  (bigdata,laoduan)...key:  (javaee,xiaoxu)...key:  (javaee,xiaoxu)...key:  (javaee,laoyang)...key:  (javaee,laoyang)...key:  (javaee,laoyang)...key:  (php,laoli)...key:  (php,laoliu)...key:  (php,laoli)...key:  (php,laoli)
    val job: String = key.asInstanceOf[(String,String)]._1
    //    print("...rulesJob:    "+job)
    //...rulesJob:    bigdata...rulesJob:    javaee...rulesJob:    php  (repeated once per input record)
    rules(job)
  }
}
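
To verify which partition each ((job, person), count) record ended up in, a hedged sketch that could be added to the main method of PersonPartitioner, using mapPartitionsWithIndex:

    // sketch: tag every record with its partition index and collect for inspection
    val withPartition = reduced.mapPartitionsWithIndex((idx, it) => it.map(rec => (idx, rec)))
    println(withPartition.collect().toBuffer)
    // expected: index 0 holds javaee, 1 holds php, 2 holds bigdata (the order the jobs were collected in)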

The output directory D:\personAndJob.log appears on disk.

Partitioning succeeded!

Test 5: custom partitioner (part 2)

package com.grace.scalawc

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

/**
 * Partition records by the length of their key
 */
object FavPerson3Partitioner {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("FavPerson3Partitioner").setMaster("local"))

    val data = Array(("aaa", 2), ("aaa", 3), ("aaa", 1), ("aaa", 0), ("aaa", 4),
      ("aa", 2), ("aa", 3), ("aa", 1), ("aa", 0), ("aa", 4),
      ("a", 2), ("a", 3), ("a", 1), ("a", 0), ("a", 4))

    val dataRdd: RDD[(String, Int)] = sc.parallelize(data)

    val res: RDD[(String, Int)] = dataRdd.partitionBy(new MyPartitioner(3))
    val file = new File("D:\\log.log")
    if(file.exists()) {
      FileUtils.deleteDirectory(file)
    }
    res.saveAsTextFile("D:\\log.log")
    sc.stop()
  }
}

class MyPartitioner(num: Int) extends Partitioner {
  override def numPartitions: Int = num

  // route each key to a partition based on the length of its string form
  override def getPartition(key: Any): Int = {
    key.toString.length % num
  }
}
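
As a quick sanity check of getPartition (a sketch, not part of the job above): a key of length 3 goes to partition 0, length 2 to partition 2, and length 1 to partition 1.

    val p = new MyPartitioner(3)
    println(Seq("aaa", "aa", "a").map(k => k -> p.getPartition(k)))
    // prints: List((aaa,0), (aa,2), (a,1))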

The output folder D:\log.log appears on disk.

Partitioning succeeded!
