由于公司需要对用户的访问行为进行实时计算,从而推荐出用户喜欢的影片,所以采用当下最流行的工具 Spark Streaming 对日志数据进行实时分析,并把分析结果交给算法部门用于生成推荐数据。同时本人对 Spark Streaming 很好奇,看了一些相关书籍,并参考网上通过 Spark Streaming 对 Kafka 写入和读取数据的代码,在 IntelliJ IDEA 中进行了测试。代码比较简单,但由于是第一次接触 Scala 语言,也是第一次接触 IDEA 开发环境,刚开始遇到了不少麻烦;不过只要努力,一定能战胜困难。
- 生产者(Producer)
package com.baofeng.dataparse

import kafka.producer.KeyedMessage
import kafka.producer.ProducerConfig
// Aliased so the Kafka client class is not confused with this file's own `object Producer`.
import kafka.producer.{Producer => KafkaProducer}
import java.util.Properties
import scala.util.Random
import scala.util.parsing.json.JSONObject

/** Test data generator: publishes a small JSON "user access" event to Kafka every 200 ms.
  *
  * Each message is a JSON object with three fields:
  *  - `userid`: always the string "wang"
  *  - `time`:   the current epoch time in milliseconds, as a string
  *  - `access`: a random integer in [0, 10)
  *
  * The topic name and broker address are hard-coded below.
  */
object Producer {

  /** Entry point. Loops forever sending messages; the producer is closed if the loop is
    * left through an exception (for example a failed send or an interrupted sleep).
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    println("my name is producer")

    val topic = "user_msg"
    val brokers = "192.168.201.117:9092"

    // Configuration for the old (Scala) Kafka producer API.
    val prop = new Properties()
    prop.setProperty("metadata.broker.list", brokers)
    prop.setProperty("serializer.class", "kafka.serializer.StringEncoder")

    val kafkaConfig = new ProducerConfig(prop)
    val producer = new KafkaProducer[String, String](kafkaConfig)

    try {
      while (true) {
        val json = JSONObject(Map(
          "userid" -> "wang",
          "time" -> System.currentTimeMillis.toString,
          "access" -> Random.nextInt(10)
        ))
        // No key is supplied, only topic and message payload.
        producer.send(new KeyedMessage[String, String](topic, json.toString()))
        // Throttle to roughly five messages per second.
        Thread.sleep(200)
      }
    } finally {
      // Release the broker connections even when send/sleep throws.
      producer.close()
    }
  }
}
- 消费者(Consumer)
package com.baofeng.dataparse

import org.apache.spark.SparkConf
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import kafka.serializer.StringDecoder
//import scala.util.parsing.json.JSON
import org.apache.spark.streaming.kafka.KafkaUtils
import spray.json._
import scala.util.control.NonFatal

/** Spark Streaming job that reads JSON events from the `user_msg` Kafka topic with the
  * direct (receiver-less) stream API and prints the `userid` and `access` fields of each
  * record.
  *
  * The master (`local[2]`), batch interval (5 seconds), topic and broker address are
  * hard-coded below.
  */
object Comsumer {

  /** Entry point. Builds the streaming context, wires up the Kafka stream and blocks
    * until the streaming context terminates.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    println("Comsumer")

    val conf = new SparkConf().setMaster("local[2]").setAppName("ReadAndSave")
    val ssc = new StreamingContext(conf, Seconds(5))

    val topics = Set("user_msg")
    val brokers = "192.168.201.117:9092"
    // Only the broker list is passed here. The previous "serializer.class" entry is a
    // producer setting; decoding is selected by the StringDecoder type parameters below.
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)

    val kafkaStream =
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)

    kafkaStream.foreachRDD { rdd =>
      rdd.foreachPartition { records =>
        // Each record is a (key, message) pair; only the message payload is used.
        records.foreach { case (_, payload) => printAccess(payload) }
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }

  /** Parses one message payload as JSON and prints "<userid> <access>".
    *
    * Records that are not valid JSON objects, or that lack a string `userid` and a
    * numeric `access`, are reported on stderr and skipped so that a single bad message
    * does not fail the whole partition.
    *
    * @param payload raw message text read from Kafka
    */
  private def printAccess(payload: String): Unit =
    try {
      JsonParser(payload).asJsObject().getFields("userid", "access") match {
        // getFields returns the values of the requested fields; match on their JSON types
        // to print the bare values instead of the wrapping sequence.
        case Seq(JsString(userId), JsNumber(access)) =>
          println(s"$userId $access")
        case _ =>
          System.err.println(s"Skipping record without userid/access fields: $payload")
      }
    } catch {
      case NonFatal(e) =>
        System.err.println(s"Skipping malformed record: $payload (${e.getMessage})")
    }
}
在解析 JSON 方面,Scala 标准库自带的 JSONObject(scala.util.parsing.json)很难使用,因此消费者端改用 spray-json 类库来解析。
-
项目使用的 pom.xml 文件如下:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build for the Kafka producer / Spark Streaming consumer test project.
  All Spark modules share one version (spark.version) and one Scala binary version (2.10);
  mixing Spark releases on a single classpath is not binary compatible.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.baofeng.test</groupId>
  <artifactId>Project003</artifactId>
  <version>1.0-SNAPSHOT</version>

  <properties>
    <scala.version>2.10.7</scala.version>
    <spark.version>2.2.2</spark.version>
  </properties>

  <dependencies>
    <!-- Scala runtime. In Scala 2.10 scala.xml is part of scala-library, so the separate
         scala-xml module (a 2.11 milestone build) is not declared here. -->
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
      <scope>compile</scope>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-actors</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <!-- Kafka client used by the producer (old kafka.producer API); version matches the
         one the Spark Kafka 0.8 connector is built against. -->
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_2.10</artifactId>
      <version>0.8.2.1</version>
    </dependency>

    <!-- Spark modules: every module uses the same release. -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-mllib_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <!-- Kafka 0.8 connector for Spark 2.x; it still provides
         org.apache.spark.streaming.kafka.KafkaUtils, replacing the Spark 1.x artifact
         spark-streaming-kafka_2.10. -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-8_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>

    <!-- JSON parsing in the consumer. -->
    <dependency>
      <groupId>io.spray</groupId>
      <artifactId>spray-json_2.10</artifactId>
      <version>1.3.2</version>
    </dependency>
  </dependencies>
</project>