1. LogQuery:以MapReduce方式处理Apache日志
//
package llf
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.SparkContext._
/**
* Created by sendoh on 2015/5/2.
*/
/**
 * LogQuery: processes Apache access logs in a MapReduce fashion.
 *
 * Each log line is keyed by (ip, user, query) and reduced to a per-key
 * Stats aggregate (request count + total bytes), which is then printed.
 */
object LogQuery {
  // Two sample Apache log lines, used when no input path is supplied.
  val exampleApacheLogs = List(
    """10.10.10.10 - "FRED" [18/Jan/2013:17:56:07 +1100] "GET http://images.com/2013/Generic.jpg
      | HTTP/1.1" 304 315 "http://referall.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;
      | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR
      | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR
      | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.350 "-" - "" 265 923 934 ""
      | 62.24.11.25 images.com 1358492167 - Whatup""".stripMargin.lines.mkString,
    """10.10.10.10 - "FRED" [18/Jan/2013:18:02:37 +1100] "GET http://images.com/2013/Generic.jpg
      | HTTP/1.1" 304 306 "http:/referall.com" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;
      | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR
      | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR
      | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.352 "-" - "" 256 977 988 ""
      | 0 73.23.2.15 images.com 1358492557 - Whatup""".stripMargin.lines.mkString
  )

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LogQuery")
    val sc = new SparkContext(conf)

    // Read from the file given as args(0), or fall back to the built-in samples.
    def dataSet =
      if (args.length == 1) sc.textFile(args(0)) else sc.parallelize(exampleApacheLogs)

    // Capture groups: ip, client-id, user, date/time, request, status, bytes, referer, user-agent.
    val apacheLogRegex =
      """^([\d.]+) (\S+) (\S+) \[([\w\d:/]+\s[+\-]\d{4})\] "(.+?)" (\d{3}) ([\d\-]+) "([^"]+)" "([^"]+)".*""".r

    /**
     * Per-key aggregate: number of requests and total bytes served.
     * Serializable so Spark can ship instances between driver and executors.
     */
    class Stats(val count: Int, val numBytes: Int) extends Serializable {
      // FIX: was `numBytes + other.count`, which added the other side's
      // request count into the byte total instead of its byte count.
      def merge(other: Stats): Stats = new Stats(count + other.count, numBytes + other.numBytes)
      // FIX: was "\\t" — a literal backslash-t in a plain string; use a real tab.
      override def toString: String = "bytes=%s\tn=%s".format(numBytes, count)
    }

    /**
     * Extracts the grouping key (ip, user, query).
     * Returns (null, null, null) for unparseable lines or anonymous users,
     * so all of those collapse into one key.
     */
    def extractKey(line: String): (String, String, String) = {
      apacheLogRegex.findFirstIn(line) match {
        case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) =>
          if (user != "\"-\"") (ip, user, query) else (null, null, null)
        case _ => (null, null, null)
      }
    }

    /** Builds the per-line Stats: one request plus its byte count (0 when unparseable). */
    def extractStats(line: String): Stats = {
      apacheLogRegex.findFirstIn(line) match {
        case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) =>
          // The regex admits "-" for the bytes field (no body sent); treat it as 0
          // instead of letting toInt throw NumberFormatException.
          new Stats(1, if (bytes == "-") 0 else bytes.toInt)
        case _ => new Stats(1, 0)
      }
    }

    dataSet
      .map(line => (extractKey(line), extractStats(line)))
      .reduceByKey((a, b) => a.merge(b))
      .collect()
      .foreach { case (key, stats) => println("%s\t%s".format(key, stats)) }
    sc.stop()
  }
}
2. MultiBroadcastTest:测试多个Broadcast共享变量
//
package llf
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkConf}
/**
* Created by sendoh on 2015/5/2.
*/
/**
 * MultiBroadcastTest: exercises several Broadcast shared variables at once.
 * Broadcasts two identical arrays and has every task report the sizes it sees.
 */
object MultiBroadcastTest {
  def main(args: Array[String]): Unit = {
    // FIX: app name was misspelled ("MulitBroadcastText").
    val conf = new SparkConf().setAppName("MultiBroadcastTest")
    val sc = new SparkContext(conf)
    val slices = if (args.length > 0) args(0).toInt else 2   // number of partitions
    val num = if (args.length > 1) args(1).toInt else 1000000 // broadcast array length

    // arr(i) == i for i in [0, num); Array.tabulate replaces the original index loops.
    val arr1 = Array.tabulate(num)(identity)
    val arr2 = Array.tabulate(num)(identity)

    // Broadcast both arrays so every executor gets one read-only copy each.
    val barr1 = sc.broadcast(arr1)
    val barr2 = sc.broadcast(arr2)

    // Each of the 10 elements forces a task to dereference both broadcast values.
    val observedSizes: RDD[(Int, Int)] =
      sc.parallelize(1 to 10, slices).map { _ => (barr1.value.size, barr2.value.size) }
    observedSizes.collect().foreach(i => println(i))
    sc.stop()
  }
}
//
3.一些随便写的方法
package llf
import java.util
/**
* Created by sendoh on 2015/4/28.
*/
/** Text: assorted small Scala examples (Java interop, tuples, raw strings). */
object Text {
  def main(args: Array[String]): Unit = {
  }

  /** Demonstrates using a Java collection from Scala; scala.Int boxes to java.lang.Integer. */
  def playWithInt(): Unit = {
    val capacity: Int = 10
    val list = new util.ArrayList[String]
    list.ensureCapacity(capacity)
  }

  /** Returns a tuple of person info; pretend primaryKey is used for the lookup. */
  def getPersonInfo(primaryKey: Int) = {
    ("Jim", "Bob", "Lilei")
  }

  // FIX: the destructured names must start with a lowercase letter. In a pattern,
  // capitalized identifiers (Firstman, ...) are stable-identifier patterns — references
  // to existing values — so the original failed with "not found: value Firstman".
  val (firstman, secondman, lastman) = getPersonInfo(1)
  println(firstman) // Jim

  /** Multi-line raw string example: triple quotes keep the text as-is; stripMargin removes the | margin. */
  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }
}
4. 在类里面定义字段、方法、构造函数
/**
 * Demonstrates fields, methods, and an auxiliary constructor.
 * NOTE(review): class names are conventionally UpperCamelCase ("Person");
 * kept lowercase so existing call sites (`new person(...)`) keep compiling.
 */
class person(val firstname: String, val lastname: String){
  // Mutable so the auxiliary constructor can fill it in after the primary one runs.
  private var position: String = _
  // FIX: was "Creating" + toString — missing the separating space shown in the
  // expected-output comments ("Creating john ...").
  println("Creating " + toString)
  /** Auxiliary constructor: also records the position held. */
  def this(firstname: String, lastname: String, positionHeld: String){
    this (firstname, lastname)
    position = positionHeld
  }
  override def toString() : String = {
    firstname + " " + lastname + " holds " + position + " position "
  }
}
// Demo: exercise both constructors (top-level statements — valid in a script/REPL only).
val john = new person("john", "Bob", "Jak")
// FIX: was println(join) — `join` is undefined; the val above is named `john`.
println(john)
val bill = new person("Bill", "Lon")
println(bill)
// Expected output:
// Creating john Bob holds null position   (primary constructor runs before position is set)
// john Bob holds Jak position
// Creating Bill Lon holds null position
// Bill Lon holds null position
5. 类继承
/** A simple base class exposing an id and a model year. */
class Vehicle(val id: Int, val year: Int){
  override def toString: String = s"ID: ${id}YEAR: $year"
}
/** A Vehicle that additionally tracks its fuel level; appends it to the base description. */
class Car(override val id: Int, override val year: Int, var fuelLevel: Int) extends Vehicle(id, year){
  override def toString: String = s"${super.toString}Fuel Level:$fuelLevel"
}
// Build a Car with named arguments and print its description
// (expected: "ID: 1YEAR: 2015Fuel Level:100").
val car = new Car(id = 1, year = 2015, fuelLevel = 100)
println(car)
6. 容器和类型推断
// Containers and type inference demo: sum the elements of a java.util.ArrayList.
import java.util.ArrayList
// FIX: the original `val list1: List[Int] = new ArrayList[Int]` cannot compile —
// scala.List is not a java.util.ArrayList; the explicit type must be the Java interface.
val list1: java.util.List[Int] = new ArrayList[Int]
val list2 = new ArrayList[Int]
list2.add(1)
list2.add(2)
var total = 0
// FIX: `for (val index <- ...)` is illegal syntax — generators take no `val`.
for (index <- 0 until list2.size()) {
  // FIX: was `list.get(index)` — `list` is undefined; the populated list is `list2`.
  total += list2.get(index)
}
println(total)
// 3