We ran several tests against Spark: a word-count job on Spark itself, a feasibility test of Spark Streaming, and a Kafka message-production test.
6.1 Spark word-count test
We used a common Spark example program to verify that the cluster can run Spark computations:
import org.apache.spark.{SparkConf, SparkContext}

object ScalaPi {
  def main(args: Array[String]): Unit = {
    // Create the SparkConf and set the application name
    val conf = new SparkConf()
      .setAppName("ScalaPi.scala")
      .setMaster("yarn-client")
    // The SparkContext is the entry point for submitting a Spark app
    val sc = new SparkContext(conf)
    val res = sc.textFile(args(0))
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _, 4)
      .sortBy(_._2, ascending = false)
    res.collect().foreach(println)
    res.saveAsTextFile(args(1)) // args(1) is set to a local path on the Hue job-submission platform
    // Stop the SparkContext to end the job
    sc.stop()
  }
}
6.2 Kafka data-generation test
We used a Kafka message producer to test both message production and message consumption:
import java.io.PrintWriter
import java.text.DecimalFormat
import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

import scala.util.Random

object make_data {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "47.103.10.241:9093")
    props.put("acks", "all")
    props.put("retries", "0")
    props.put("batch.size", "2048")
    props.put("linger.ms", "1")
    props.put("buffer.memory", "4096")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    println("see this is ok")
    //val write_logs = new PrintWriter(args(0))
    val users = Array("jack", "leo", "andy", "lucy", "jim", "smith", "iverson", "andrew")
    val pages = Array("iphone4.html", "huawei.html", "mi.html", "mac.html", "note.html", "book.html", "fanlegefan.com")
    val df = new DecimalFormat("#.00")
    val random = new Random()
    val num = 10
    println("see this is ok,too")
    // Sends num + 1 messages (i = 0..10) of the form: user,page,amount,timestamp
    for (i <- 0 to num) {
      val message = users(random.nextInt(users.length)) + "," + pages(random.nextInt(pages.length)) +
        "," + df.format(random.nextDouble() * 1000) + "," + System.currentTimeMillis()
      producer.send(new ProducerRecord[String, String]("test", Integer.toString(i), message))
      println(message)
      //write_logs.println(message + "\n")
    }
    println("woo!see this is ok,too")
    producer.close()
  }
}
In the Kafka producer test, the code itself was fine and the data was generated correctly; the problem was that when sending the data to the broker's port, the client could not establish a connection to the broker node.
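One common cause of this symptom, offered here only as an assumption about this cluster, is that the broker's advertised listener does not match the public address the producer dials (47.103.10.241:9093): the client reaches the port, but the broker hands back an internal hostname it cannot resolve. A sketch of the relevant broker settings in server.properties, with hypothetical values:

```
# server.properties (Kafka broker) — hypothetical values for this cluster
# Bind on all interfaces on port 9093
listeners=PLAINTEXT://0.0.0.0:9093
# Address the broker advertises back to clients; must be reachable from the producer host
advertised.listeners=PLAINTEXT://47.103.10.241:9093
```

If the advertised address is left at an internal IP or hostname, producers on an external network will connect and then immediately fail to reach the advertised node.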
6.3 Spark Streaming test demo
import java.io.PrintWriter
import java.text.SimpleDateFormat
import java.util.Date

import kafka.serializer.StringDecoder

import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object streaming_count {
  def main(args: Array[String]): Unit = {
    val df = new SimpleDateFormat("yyyyMMdd")
    val group = "test"
    val topics = "test"
    val sparkConf = new SparkConf().setAppName("streaming_count").setMaster("yarn-client")
    val sc = new SparkContext(sparkConf)
    val ssc = new StreamingContext(sc, Seconds(10))
    val topicSets = topics.split(",").toSet
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "47.103.10.241:9093",
      "group.id" -> group
    )
    val kafkastream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicSets)
    val write_log = new PrintWriter(args(0))
    // Parse each message of the form: user,page,amount,timestamp
    val events = kafkastream.map(tuple => {
      val line = tuple._2.split(",")
      val user = line(0)
      val page = line(1)
      val money = line(2).toDouble + 2 // parse first; "+ 2" on the raw string would only append the character '2'
      val day = df.format(new Date(line(3).toLong))
      user + "," + page + "," + money + "," + day
    })
    // A DStream needs at least one output operation to start; collect to the driver
    // so the PrintWriter (not serializable) is only used driver-side
    events.foreachRDD { rdd =>
      rdd.collect().foreach { record =>
        println(record)
        write_log.println(record)
      }
      write_log.flush()
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
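The per-record transformation in the map step above can be exercised without a cluster or a broker. A minimal sketch, where ParseCheck and its parse helper are hypothetical names that mirror the map logic (user,page,amount,timestamp in, user,page,amount+2,yyyyMMdd out):

```scala
import java.text.SimpleDateFormat
import java.util.Date

object ParseCheck {
  // Mirrors the streaming map step on a single message string
  def parse(msg: String): String = {
    val df = new SimpleDateFormat("yyyyMMdd")
    val line = msg.split(",")
    val money = line(2).toDouble + 2 // numeric add, not string concatenation
    val day = df.format(new Date(line(3).toLong))
    line(0) + "," + line(1) + "," + money + "," + day
  }

  def main(args: Array[String]): Unit = {
    // A message shaped like the ones make_data produces
    println(parse("jack,mi.html,519.04,1577808000000"))
  }
}
```

Checking the parser this way, against a few messages in the same format make_data emits, separates parsing bugs from connectivity problems like the one seen in 6.2.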