In production, developers sometimes write test data into the same logs as real traffic. Two common ways to deal with it are dual-writing the logs or filtering against a blacklist (or, conversely, capturing only the blacklisted records); the blacklist approach is implemented with the DStream transform operator below.

Dual-write logs ==> log a : normal records
                    a : test records

Blacklist filtering: join two datasets
    log   : the click log
    black : the blacklist of test users
======================================================================================================================================================
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer

object FilterApp {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf()
      .setAppName("FilterApp")
      .setMaster("local[2]")
    val sc = new SparkContext(sparkConf)

    // Blacklist: (userId, true)
    val blackTuple = new ListBuffer[(String, Boolean)]
    blackTuple.append(("sm", true))
    val blacksRDD = sc.parallelize(blackTuple)

    // Test data: (userId, logLine)
    val input = new ListBuffer[(String, String)]
    input.append(("su", "20180808,su,M,20"))
    input.append(("kk", "20180808,kk,M,20"))
    input.append(("sm", "20180808,sm,M,20"))
    val inputRDD = sc.parallelize(input)

    // Filter out blacklisted users: leftOuterJoin keeps every log record and
    // attaches Some(true) only when the user appears in the blacklist
    val joinRDD = inputRDD.leftOuterJoin(blacksRDD)
    joinRDD.filter(x => {
      x._2._2.getOrElse(false) != true
    }).map(_._2._1).foreach(println)

    sc.stop()
  }
}
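Running FilterApp locally should print only the non-blacklisted log lines (output order may vary because foreach runs on the executors in parallel):

20180808,su,M,20
20180808,kk,M,20

The sm record is dropped because its leftOuterJoin value is Some(true).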
import org.apache.spark.SparkConf
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object FlumePullApp {
  def main(args: Array[String]) {
    val Array(hostname, port) = args

    val sparkConf = new SparkConf()
      .setAppName("FlumePullApp")
      .setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    // Pull mode: Spark polls events from Flume's SparkSink
    val lines = FlumeUtils.createPollingStream(ssc, hostname, port.toInt)

    lines.map(x => new String(x.event.getBody.array()).trim)
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object FlumePushApp {
  def main(args: Array[String]) {
    val Array(hostname, port) = args

    val sparkConf = new SparkConf()
      .setAppName("FlumePushApp")
      .setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    // Push mode: Flume's avro sink pushes events to the receiver started here
    val lines = FlumeUtils.createStream(ssc, hostname, port.toInt)

    lines.map(x => new String(x.event.getBody.array()).trim)
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingFilterApp {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf()
      .setAppName("StreamingFilterApp")
      .setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    // Blacklist as an RDD of (userId, true)
    val blacks = List("sm", "su")
    val blacksRDD = ssc.sparkContext.parallelize(blacks).map(x => (x, true))

    val lines = ssc.socketTextStream("hadoop000", 9997)

    // Key each log line by user id, e.g. (su, "log line of su"),
    // then use transform to join every batch RDD against the blacklist
    val clickLogDstream = lines.map(x => (x.split(",")(1), x)).transform(rdd => {
      rdd.leftOuterJoin(blacksRDD)
        .filter(x => {
          // x._2 is (logLine, Option[Boolean]); the Option is Some(true) for blacklisted users
          x._2._2.getOrElse(false) != true
        }).map(_._2._1)
    })
    clickLogDstream.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
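To exercise StreamingFilterApp, send comma-separated click-log lines over the socket (the second field is the user id). Assuming nc is available on hadoop000:

nc -lk 9997
20180808,su,M,20
20180808,kk,M,20
20180808,sm,M,20

Only the kk line should be printed for that batch, since su and sm are in the blacklist.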
Flume pipeline: netcat source ==> memory channel ==> sink ==> Spark Streaming

Push mode (FlumePushApp, FlumeUtils.createStream): the avro sink pushes events to the Spark receiver
a1.sinks = k1
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = localhost
a1.sinks.k1.port = 41414

Pull mode (FlumePullApp, FlumeUtils.createPollingStream): Spark polls events from the SparkSink
a1.sinks = spark
a1.sinks.spark.type = org.apache.spark.streaming.flume.sink.SparkSink
a1.sinks.spark.hostname = localhost
a1.sinks.spark.port = 41414
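Either sink still needs a source and a channel. A minimal sketch of the rest of the agent config (the agent name a1, source port 44444, and component names are assumptions):

a1.sources = r1
a1.channels = c1

# netcat source: reads newline-terminated lines from a local TCP port
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

# memory channel buffers events between the source and the sink
a1.channels.c1.type = memory

# wire the source and the chosen sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# (for pull mode, bind a1.sinks.spark.channel = c1 instead)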
./spark-submit --master local[2] \
--class com.ruozedata.streaming.FlumePullApp \
--name FlumePullApp \
/home/hadoop/lib/train-scala-1.0-jar-with-dependencies.jar \
localhost 41414
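The matching Flume agent can then be started with flume-ng; the conf-file path below is an assumption, the agent name matches the config above:

flume-ng agent \
--name a1 \
--conf $FLUME_HOME/conf \
--conf-file $FLUME_HOME/conf/streaming-flume.conf \
-Dflume.root.logger=INFO,console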