一 、数据准备
通过kafka-rest 写入kafka-avro数据
public class Test {
public static void main(String args[]){
String url = "http://node9:8082/topics/ztwo";
int x=1;
while (true){
Random random =new Random();
int i = random.nextInt();
String json;
if (i%2==0) {
json="{\"value_schema\": \"{\\\"type\\\": \\\"record\\\", \\\"name\\\": \\\"news_doc\\\", \\\"fields\\\": [{\\\"name\\\": \\\"name\\\", \\\"type\\\": \\\"string\\\"},{\\\"name\\\": \\\"time\\\", \\\"type\\\": \\\"long\\\"}]}\", \"records\": [{\"value\": {\"name\": \"one\",\"time\":1553069910680}}]}";
}else {
json="{\"value_schema\": \"{\\\"type\\\": \\\"record\\\", \\\"name\\\": \\\"news_doc\\\", \\\"fields\\\": [{\\\"name\\\": \\\"name\\\", \\\"type\\\": \\\"string\\\"},{\\\"name\\\": \\\"time\\\", \\\"type\\\": \\\"long\\\"}]}\", \"records\": [{\"value\": {\"name\": \"two\",\"time\":1553069910680}}]}";
}
x++;
System.out.println(x);
HttpRequest.sendPost(url,json);
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
public static String sendPost(String url ,String data) {
HttpClient httpClient = new DefaultHttpClient();
//JSONObject json = new JSONObject(data);
HttpPost post = new HttpPost(url);
post.setHeader("Content-type", "application/vnd.kafka.avro.v1+json");
StringEntity entity = new StringEntity(data, Charset.forName("UTF-8"));
entity.setContentEncoding("UTF-8");
// 发送Json格式的数据请求
entity.setContentType("application/json");
post.setEntity(entity);
HttpResponse response = null;
try {
response = httpClient.execute(post);
String enStr = EntityUtils.toString(response.getEntity(), "utf-8");
return enStr;
} catch (IOException e) {
e.printStackTrace();
return e.getMessage();
}
}
2,spark读取解析数据
package cn
import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
import io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.avro._
/**
* Hello world!
*
*/
object App {
private var schemaRegistryClient: SchemaRegistryClient = _
private var kafkaAvroDeserializer: AvroDeserializer = _
def getTopicSchema(topic: String) = {
schemaRegistryClient.getLatestSchemaMetadata(topic + "-value").getSchema
}
def avroSchemaToSparkSchema(avroSchema: String) = {
SchemaConverters.toSqlType(new Schema.Parser().parse(avroSchema))
}
def main(args: Array[String]): Unit = {
val conf= new SparkConf()
.setAppName("kafka-structured").set("spark.testing.memory","2147480000")
.setMaster("local[*]");
val spark = SparkSession.builder()
.config(conf)
.getOrCreate()
val bootstrapServers ="node9:9092"
val topic = "ztwo"
val schemaRegistryUrl = "http://node9:8081"
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.WARN)
Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.WARN)
consumeAvro(spark, bootstrapServers, topic, schemaRegistryUrl)
spark.stop()
}
private def consumeAvro(spark: SparkSession, bootstrapServers: String, topic: String, schemaRegistryUrl: String): Unit = {
import spark.implicits._
schemaRegistryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 128)
kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)
spark.udf.register("deserialize", (bytes: Array[Byte]) =>
kafkaAvroDeserializer.deserialize(bytes)
)
val rawDf = spark.readStream
.format("kafka")
.option("kafka.bootstrap.servers", bootstrapServers)
.option("subscribe", topic)
.option("startingOffsets", "earliest")
.option("group.id","1")
.load()
import org.apache.spark.sql.functions._
val jsonDf = rawDf.select(callUDF("deserialize", 'value).as("value"))
val dfValueSchema = {
val rawSchema = getTopicSchema(topic)
avroSchemaToSparkSchema(rawSchema)
}
val parsedDf = jsonDf.select(from_json('value, dfValueSchema.dataType).alias("value")
).select($"value.*")
parsedDf.createTempView(topic)
val output=spark.sql("select count(*) from "+topic+" group by name")
output.writeStream
.format("console")
.outputMode("complete")
//.outputMode("append ")
.start()
.awaitTermination()
}
class AvroDeserializer extends AbstractKafkaAvroDeserializer {
def this(client: SchemaRegistryClient) {
this()
this.schemaRegistry = client
}
override def deserialize(bytes: Array[Byte]): String = {
val value = super.deserialize(bytes)
value match {
case str: String =>
str
case _ =>
val genericRecord = value.asInstanceOf[GenericRecord]
if (genericRecord==null) {
// 返回空字符串
null
}else{
genericRecord.toString
}
}
}
}
}
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cn.golaxy</groupId>
<artifactId>kafka-spark-connector</artifactId>
<version>1.0-SNAPSHOT</version>
<inceptionYear>2008</inceptionYear>
<properties>
<scala.version>2.12.7</scala.version>
</properties>
<repositories>
<repository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</repository>
<repository>
<id>confluent</id>
<url>http://packages.confluent.io/maven/</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</pluginRepository>
</pluginRepositories>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
<version>2.4.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.12</artifactId>
<version>2.4.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.12</artifactId>
<version>2.4.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.12</artifactId>
<version>2.1.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
<version>2.4.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_2.12</artifactId>
<version>2.4.0</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.38</version>
</dependency>
<dependency>
<groupId>com.thoughtworks.paranamer</groupId>
<artifactId>paranamer</artifactId>
<version>2.8</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-avro_2.12</artifactId>
<version>2.4.0</version>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-avro_2.11</artifactId>
<version>4.0.0</version>
</dependency>
<dependency>
<groupId>io.confluent</groupId>
<artifactId>kafka-avro-serializer</artifactId>
<version>3.2.0</version>
<exclusions>
<exclusion>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.specs</groupId>
<artifactId>specs</artifactId>
<version>1.2.5</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-target:jvm-1.8</arg>
</args>
</configuration>
</plugin>
</plugins>
</build>
</project>