package com.dt.streaming;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Properties;
import java.util.Random;

import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;

/**
 * Lecture 98: data generator that continuously simulates user behavior at the logical level.
 * The generated forum data is sent to Kafka by this Producer; the Spark Streaming program
 * then pulls the online user-behavior records for the forum/website from Kafka and performs
 * multi-dimensional online analysis.
 * Record format (tab-separated):
 *   date:      date, formatted as yyyy-MM-dd
 *   timestamp: epoch timestamp in milliseconds
 *   userID:    user ID (null for unregistered visitors)
 *   pageID:    page ID
 *   channelID: forum channel ID
 *   action:    "View" or "Register"
 */
public class SparkStreamingDataManuallyProducerForKafka extends Thread {

    // The forum channels
    static String[] channelNames = new String[]{
        "Spark", "Scala", "Kafka", "Flink", "Hadoop", "Storm",
        "Hive", "Impala", "HBase", "ML"
    };

    // The two possible user actions
    static String[] actionNames = new String[]{"View", "Register"};

    private String topic; // the Kafka topic the generated data is sent to
    private Producer<Integer, String> producerForKafka;

    private static String dateToday;
    private static Random random;

    // The constructor wires up the Kafka connection; the core of this class is generating
    // data. It extends Thread because it must keep sending data to Kafka continuously.
    public SparkStreamingDataManuallyProducerForKafka(String topic) {
        dateToday = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
        this.topic = topic;
        random = new Random();
        Properties conf = new Properties(); // properties for connecting to Kafka
        conf.put("metadata.broker.list", "Master:9092,Worker1:9092,Worker2:9092");
        conf.put("serializer.class", "kafka.serializer.StringEncoder");
        producerForKafka = new Producer<Integer, String>(new ProducerConfig(conf)); // used at runtime to write data
    }

    @Override
    public void run() { // override Thread.run
        int counter = 0;
        while (true) { // infinite loop, keeps sending messages to Kafka
            counter++;
            String userLog = userlogs(); // generate one record
            System.out.println("product:" + userLog);
            producerForKafka.send(new KeyedMessage<Integer, String>(topic, userLog));
            if (0 == counter % 500) {
                counter = 0;
                try {
                    Thread.sleep(1000); // pause after every 500 records; sending nonstop would overwhelm the machine
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public static void main(String[] args) {
        // Create an instance with the topic name and start it
        new SparkStreamingDataManuallyProducerForKafka("UserLogs").start();
    }

    private static String userlogs() {
        StringBuilder userLogBuffer = new StringBuilder();
        int[] unregisteredUsers = new int[]{1, 2, 3, 4, 5, 6, 7, 8};
        long timestamp = new Date().getTime();
        Long userID = 0L;
        long pageID = 0L;

        // Randomly generated user ID; with probability 1/8 the visitor is unregistered (null ID)
        if (unregisteredUsers[random.nextInt(8)] == 1) {
            userID = null;
        } else {
            userID = (long) random.nextInt(2000);
        }

        // Randomly generated page ID
        pageID = random.nextInt(2000);

        // Randomly chosen channel
        String channel = channelNames[random.nextInt(10)];

        // Randomly chosen action
        String action = actionNames[random.nextInt(2)];

        userLogBuffer.append(dateToday)
                     .append("\t")
                     .append(timestamp)
                     .append("\t")
                     .append(userID)
                     .append("\t")
                     .append(pageID)
                     .append("\t")
                     .append(channel)
                     .append("\t")
                     .append(action);
        return userLogBuffer.toString();
    }
}
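Note: the producer above uses the old Scala client API (kafka.javaapi.producer.Producer) that shipped with Kafka 0.8 and was later deprecated. For comparison, here is a minimal sketch of the same send against the newer org.apache.kafka.clients.producer API (available since Kafka 0.8.2); the broker list matches the one assumed above, and the sample record is a made-up illustration of the tab-separated format the generator emits:

import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class NewApiProducerSketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        // same brokers as the generator above (assumed cluster layout)
        props.put("bootstrap.servers", "Master:9092,Worker1:9092,Worker2:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        KafkaProducer<String, String> producer = new KafkaProducer<String, String>(props);
        // one record in the same tab-separated layout: date, timestamp, userID, pageID, channel, action
        String userLog = "2016-05-01\t1462060800000\t42\t7\tSpark\tView";
        producer.send(new ProducerRecord<String, String>("UserLogs", userLog));
        producer.close();
    }
}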
// The class above produces the data; the class below consumes it.
package com.dt.streaming;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import kafka.serializer.StringDecoder;
import scala.Tuple2;

public class OnlineBBSUserLogs {

    public static void main(String[] args) {
        /*
         * Lecture 98: the consumer. It consumes the records generated at the logical level by
         * SparkStreamingDataManuallyProducerForKafka, here computing page views (PV).
         */
        /* SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("WordCountOnline"); */
        SparkConf conf = new SparkConf().setMaster("spark://Master:7077")
                .setAppName("OnlineBBSUserLogs");

        JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));

        /*
         * Connect to Kafka
         */
        Map<String, String> kafkaParameters = new HashMap<String, String>();
        kafkaParameters.put("metadata.broker.list", "Master:9092,Worker1:9092,Worker2:9092");

        Set<String> topics = new HashSet<String>();
        topics.add("UserLogs");

        JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(jsc,
                String.class, String.class,
                StringDecoder.class, StringDecoder.class,
                kafkaParameters,
                topics);

        // We want PV, i.e. "View" actions only, so filter the data accordingly.
        JavaPairDStream<String, String> logsDStream = lines.filter(new Function<Tuple2<String, String>, Boolean>() {
            @Override
            public Boolean call(Tuple2<String, String> v1) throws Exception {
                String[] logs = v1._2.split("\t"); // the key is assigned by Kafka and not needed; take the value
                String action = logs[5]; // action is the 6th tab-separated field; keep "View", drop "Register"
                return "View".equals(action);
            }
        });

        /*
         * Step 4: apply Transformation-level processing (map, filter, and other higher-order
         * functions) to the initial DStream to perform the actual computation.
         * Step 4.2: count each page view as 1, i.e. pageID => (pageID, 1),
         * analogous to word => (word, 1) in word count.
         */
        JavaPairDStream<Long, Long> pairs = logsDStream.mapToPair(new PairFunction<Tuple2<String, String>, Long, Long>() {
            @Override
            public Tuple2<Long, Long> call(Tuple2<String, String> t) throws Exception {
                String[] logs = t._2.split("\t"); // again, only the value is needed
                Long pageId = Long.valueOf(logs[3]); // pageID is the 4th field (index 3), parsed as a Long
                return new Tuple2<Long, Long>(pageId, 1L);
            }
        });

        /*
         * Step 4.3: on top of the (pageID, 1) pairs, sum the total count per pageID.
         */
        JavaPairDStream<Long, Long> wordsCount = pairs.reduceByKey(new Function2<Long, Long, Long>() {
            // Accumulate the values of identical keys (reduced both locally and at the Reducer level)
            @Override
            public Long call(Long v1, Long v2) throws Exception {
                return v1 + v2;
            }
        });
        // The result of the operations above is (pageID, click count) per batch.

        /*
         * In a production environment the computed results would normally be written to
         * Redis or a database, with trends rendered via J2EE or similar technology.
         */
        wordsCount.print();

        /*
         * Start the Spark Streaming execution engine, i.e. the Driver. The Driver runs on a
         * new thread and contains an internal message loop that receives messages from the
         * application itself or from the Executors.
         */
        jsc.start();
        jsc.awaitTermination();
        jsc.close();
    }
}
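As the comment in the consumer notes, in production the per-batch (pageID, count) results would usually be written to Redis or a database rather than printed. Below is a minimal sketch of that idea using the Jedis client, replacing wordsCount.print(). The Redis host Master:6379, the hash key "pageViewCounts", and the Jedis dependency are all assumptions for illustration, not part of the lecture code; the foreachRDD(VoidFunction) overload assumed here is the Spark 1.6-era Java API.

// extra imports needed: java.util.Iterator, org.apache.spark.api.java.JavaPairRDD,
// org.apache.spark.api.java.function.VoidFunction, redis.clients.jedis.Jedis
wordsCount.foreachRDD(new VoidFunction<JavaPairRDD<Long, Long>>() {
    @Override
    public void call(JavaPairRDD<Long, Long> rdd) throws Exception {
        rdd.foreachPartition(new VoidFunction<Iterator<Tuple2<Long, Long>>>() {
            @Override
            public void call(Iterator<Tuple2<Long, Long>> records) throws Exception {
                Jedis jedis = new Jedis("Master", 6379); // one connection per partition (assumed Redis host)
                while (records.hasNext()) {
                    Tuple2<Long, Long> record = records.next();
                    // accumulate per-page click counts in a Redis hash (hypothetical key name)
                    jedis.hincrBy("pageViewCounts", String.valueOf(record._1), record._2);
                }
                jedis.close();
            }
        });
    }
});

Opening the connection inside foreachPartition, rather than outside, avoids trying to serialize the connection from the Driver to the Executors and keeps connection churn to one per partition per batch.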