Spark学习笔记(7)——分区实现

1 测试1

package webcount
import java.net.URL
import org.apache.spark.{SparkConf, SparkContext}

object UrlCountPartition {
  /**
   * Test 1: counts hits per URL from a tab-separated access log, then
   * re-keys every (url, count) pair by the URL's host and prints the result.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AdvURLCount").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Split each log line on tab; field 1 is the URL. Emit (url, 1) pairs.
    val urlOnes = sc.textFile("d://urlcount.log").map { line =>
      val fields = line.split("\t")
      (fields(1), 1)
    }

    // Sum hit counts per URL.
    val urlCounts = urlOnes.reduceByKey(_ + _)

    // Re-key each entry by its host: (host, (url, count)).
    val byHost = urlCounts.map { case (url, cnt) =>
      val host = new URL(url).getHost
      (host, (url, cnt))
    }

    println(byHost.collect().toBuffer)

  }
}

ArrayBuffer((php.itcast.cn,(http://php.itcast.cn/php/course.shtml,459)), (java.itcast.cn,(http://java.itcast.cn/java/course/base.shtml,543)), 
(java.itcast.cn,(http://java.itcast.cn/java/video.shtml,496)), (java.itcast.cn,(http://java.itcast.cn/java/course/android.shtml,501)),
 (net.itcast.cn,(http://net.itcast.cn/net/video.shtml,521)), (java.itcast.cn,(http://java.itcast.cn/java/course/hadoop.shtml,506)),
  (net.itcast.cn,(http://net.itcast.cn/net/course.shtml,521)), (java.itcast.cn,(http://java.itcast.cn/java/course/cloud.shtml,1028)),
   (php.itcast.cn,(http://php.itcast.cn/php/video.shtml,490)), 
   (java.itcast.cn,(http://java.itcast.cn/java/teacher.shtml,482)),
    (php.itcast.cn,(http://php.itcast.cn/php/teacher.shtml,464)), 
    (net.itcast.cn,(http://net.itcast.cn/net/teacher.shtml,512)),
     (java.itcast.cn,(http://java.itcast.cn/java/course/javaee.shtml,1000)),
      (java.itcast.cn,(http://java.itcast.cn/java/course/javaeeadvanced.shtml,477)))

2 测试2

package webcount

import java.net.URL

import org.apache.spark.{Partitioner, SparkConf, SparkContext}

object UrlCountPartition {
  /**
   * Test 2: counts hits per URL, keys results by host, then prints the
   * distinct set of hosts seen in the log.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AdvURLCount").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Split each log line on tab; field 1 is the URL. Emit (url, 1) pairs.
    val urlOnes = sc.textFile("d://urlcount.log").map { line =>
      val fields = line.split("\t")
      (fields(1), 1)
    }

    // Sum hit counts per URL.
    val urlCounts = urlOnes.reduceByKey(_ + _)

    // Re-key each entry by its host: (host, (url, count)).
    val byHost = urlCounts.map { case (url, cnt) =>
      val host = new URL(url).getHost
      (host, (url, cnt))
    }

    // Distinct hosts found in the log.
    val hosts = byHost.map(_._1).distinct()

    //byHost.repartition(3).saveAsTextFile("d://out1")

    println(hosts.collect().toBuffer)

  }
}

//class HostPartitioer extends Partitioner{
//  override def numPartitions: Int =
//
//  override def getPartition(key: Any): Int =
//}


ArrayBuffer(net.itcast.cn, java.itcast.cn, php.itcast.cn)

3 测试3

package webcount

import java.net.URL

import org.apache.spark.{Partitioner, SparkConf, SparkContext}

import scala.collection.mutable

object UrlCountPartition {
  /**
   * Test 3: counts hits per URL, keys results by host, then repartitions
   * with a custom [[HostPartitioer]] (one partition per host) and writes
   * each partition to its own output file.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AdvURLCount").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Split each log line on tab; field 1 is the URL. Emit (url, 1) pairs.
    val urlOnes = sc.textFile("d://urlcount.log").map { line =>
      val fields = line.split("\t")
      (fields(1), 1)
    }

    // Sum hit counts per URL.
    val urlCounts = urlOnes.reduceByKey(_ + _)

    // Re-key each entry by its host: (host, (url, count)).
    val byHost = urlCounts.map { case (url, cnt) =>
      val host = new URL(url).getHost
      (host, (url, cnt))
    }

    // Collect the distinct hosts to the driver to size the custom partitioner.
    val hosts = byHost.map(_._1).distinct().collect()

    val hostPartitioner = new HostPartitioer(hosts)

    // One output file per host partition.
    byHost.partitionBy(hostPartitioner).saveAsTextFile("d://out2")

  }
}

/**
 * Custom partitioner that assigns one partition per host.
 *
 * @param ins distinct host names; each host is mapped to the partition
 *            whose index is the host's position in this array.
 *
 * NOTE(review): if `ins` is empty, `numPartitions` is 0, which Spark
 * rejects — callers should pass at least one host.
 */
class HostPartitioer(ins: Array[String]) extends Partitioner {

  // host -> partition index, in the order hosts appear in `ins`.
  // zipWithIndex replaces the original mutable HashMap + external counter var.
  val partMap: Map[String, Int] = ins.zipWithIndex.toMap

  override def numPartitions: Int = ins.length

  // Unknown keys fall back to partition 0 rather than failing the job.
  override def getPartition(key: Any): Int = {
    partMap.getOrElse(key.toString, 0)
  }
}


（原文此处为运行结果截图，抓取时图片丢失）

4 测试4

package webcount
import java.net.URL
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable

object UrlCountPartition {
  /**
   * Test 4: counts hits per URL, partitions by host with a custom
   * [[HostPartitioer]], keeps only the 3 most-visited URLs inside each
   * host partition, and writes the result out.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AdvURLCount").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Split each log line on tab; field 1 is the URL. Emit (url, 1) pairs.
    val urlOnes = sc.textFile("d://urlcount.log").map { line =>
      val fields = line.split("\t")
      (fields(1), 1)
    }

    // Sum hit counts per URL.
    val urlCounts = urlOnes.reduceByKey(_ + _)

    // Re-key each entry by its host: (host, (url, count)).
    val byHost = urlCounts.map { case (url, cnt) =>
      val host = new URL(url).getHost
      (host, (url, cnt))
    }

    // Collect the distinct hosts to the driver to size the custom partitioner.
    val hosts = byHost.map(_._1).distinct().collect()

    val hostPartitioner = new HostPartitioer(hosts)

    // Within each host partition, keep the 3 URLs with the highest counts.
    // (Materializing the iterator is fine here: one partition = one host.)
    val topPerHost = byHost.partitionBy(hostPartitioner).mapPartitions { it =>
      it.toList.sortBy(_._2._2).reverse.take(3).iterator
    }

    topPerHost.saveAsTextFile("d://out3")

  }
}

/**
 * Custom partitioner that assigns one partition per host.
 *
 * @param ins distinct host names; each host is mapped to the partition
 *            whose index is the host's position in this array.
 *
 * NOTE(review): if `ins` is empty, `numPartitions` is 0, which Spark
 * rejects — callers should pass at least one host.
 */
class HostPartitioer(ins: Array[String]) extends Partitioner {

  // host -> partition index, in the order hosts appear in `ins`.
  // zipWithIndex replaces the original mutable HashMap + external counter var.
  val partMap: Map[String, Int] = ins.zipWithIndex.toMap

  override def numPartitions: Int = ins.length

  // Unknown keys fall back to partition 0 rather than failing the job.
  override def getPartition(key: Any): Int = {
    partMap.getOrElse(key.toString, 0)
  }
}


（原文此处为三张运行结果截图，抓取时图片丢失）

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值