Original code:
package com.lcc.spark.structed.streaming;

import com.lcc.spark.structed.streaming.conf.MyConfig;
import com.lcc.spark.structed.streaming.entity.MyLog;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.util.LongAccumulator;

/**
 * @Author: chuanchuan.lcc
 * @CreateDate: 2019/2/13 PM9:50
 * @Version: 1.0
 * @Description: Java class description:
 */
public class StructKafkaDemo {
    public static void main(String[] args) throws Exception {
        Long startTime = System.currentTimeMillis();

        // Initialize metrics here
        MyConfig conf = new MyConfig();
        AppContext context = new AppContext(conf);
        context.initialize();

        // Suppress unnecessary logs so only the relevant output shows in the terminal
        org.apache.log4j.Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.OFF);
        org.apache.log4j.Logger.getLogger("org.eclipse.jetty.server").setLevel(org.apache.log4j.Level.OFF);
        org.apache.log4j.Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(org.apache.log4j.Level.OFF);

        String bootstrapServers = "localhost:9092,localhost:9093,localhost:9094";
        String subscribeType = "subscribe";
        String topics = "topip-lcc";

        SparkSession spark = SparkSession
                .builder()
                .master("local[6]")
                .appName("JavaStructuredKafkaWordCount")
                .config("spark.metrics.conf.driver.source.jvm.class", "org.apache.spark.metrics.source.JvmSource")
                .config("spark.sql.streaming.metricsEnabled", "true")
                .getOrCreate();

        // Create a Dataset representing the stream of input lines from Kafka
        Dataset<String> lines = spark
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", bootstrapServers)
                .option(subscribeType, topics)
                .load()
                .selectExpr("CAST(value AS STRING)")
                .as(Encoders.STRING());
        lines.printSchema();

        // BUG: toJavaRDD() forces batch query planning on a streaming source,
        // which throws the AnalysisException shown below
        JavaRDD<String> lineRdd = lines.toJavaRDD();
        JavaRDD<MyLog> rowRDD = lineRdd.map(new Function<String, MyLog>() {
            @Override
            public MyLog call(String value) throws Exception {
                if (value != null && !value.trim().equals("")) {
                    String[] line = value.split("\\|");
                    String page = line[0];
                    String counts = line[1];
                    MyLog mylog = new MyLog();
                    mylog.setCount(counts);
                    mylog.setPage(page);
                    return mylog;
                }
                return null;
            }
        });

        Dataset<Row> df = spark.createDataFrame(rowRDD, MyLog.class);
        StreamingQuery query = df.writeStream()
                .outputMode("append")
                .format("console")
                .start();

        // milliseconds * seconds * minutes (note: this variable is never used)
        long timeInterval = 1000 * 60 * 2;
        query.awaitTermination();
    }
}
The error is as follows:
Exception in thread "main" org.apache.spark.sql.AnalysisException: Queries with streaming sources must be executed with writeStream.start();;
kafka
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.org$apache$spark$sql$catalyst$analysis$UnsupportedOperationChecker$$throwError(UnsupportedOperationChecker.scala:297)
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$$anonfun$checkForBatch$1.apply(UnsupportedOperationChecker.scala:36)
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$$anonfun$checkForBatch$1.apply(UnsupportedOperationChecker.scala:34)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.checkForBatch(UnsupportedOperationChecker.scala:34)
at org.apache.spark.sql.execution.QueryExecution.assertSupported(QueryExecution.scala:63)
at org.apache.spark.sql.execution.QueryExecution.withCachedData$lzycompute(QueryExecution.scala:74)
at org.apache.spark.sql.execution.QueryExecution.withCachedData(QueryExecution.scala:72)
at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:78)
at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:78)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:84)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:89)
at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:89)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
at org.apache.spark.sql.Dataset.rdd$lzycompute(Dataset.scala:2581)
at org.apache.spark.sql.Dataset.rdd(Dataset.scala:2578)
at org.apache.spark.sql.Dataset.toJavaRDD(Dataset.scala:2591)
at com.lcc.spark.structed.streaming.StructKafkaDemo.main(StructKafkaDemo.java:63)
The likely cause: this is a streaming query, and a streaming source cannot be converted into a batch (offline) operation. Calling lines.toJavaRDD() (StructKafkaDemo.java:63 in the stack trace) forces batch query planning, which Spark's UnsupportedOperationChecker rejects for any plan containing a streaming source, hence "Queries with streaming sources must be executed with writeStream.start()". The fix is to keep the per-record transformation inside the streaming API, using Dataset.map with an encoder, as in the rewritten version:
package com.lcc.spark.structed.streaming;

import com.lcc.spark.structed.streaming.entity.KafkaMessage;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

/**
 * @author : shirukai
 * @date : 2019-02-14 09:18
 */
public class KafkaDemo {
    public static void main(String[] args) throws StreamingQueryException {
        // Suppress unnecessary logs so only the relevant output shows in the terminal
        org.apache.log4j.Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.OFF);
        org.apache.log4j.Logger.getLogger("org.eclipse.jetty.server").setLevel(org.apache.log4j.Level.OFF);
        org.apache.log4j.Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(org.apache.log4j.Level.OFF);

        SparkSession spark = SparkSession
                .builder()
                .appName("JavaStructuredKafka")
                .master("local[2]")
                .getOrCreate();

        String bootstrapServers = "localhost:9092,localhost:9093,localhost:9094";
        String subscribeType = "subscribe";
        String topics = "topip-lcc";

        // Create a DataFrame representing the stream of input lines from Kafka
        Dataset<Row> lines = spark
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", bootstrapServers)
                .option(subscribeType, topics)
                .load();

        Dataset<String> words = lines.select("value").as(Encoders.STRING());

        // Transform each record inside the streaming API instead of via toJavaRDD()
        Dataset<KafkaMessage> a = words.map(
                (MapFunction<String, KafkaMessage>) KafkaDemo::handleMessage,
                ExpressionEncoder.javaBean(KafkaMessage.class));

        // Start running the query that prints the results to the console
        StreamingQuery query = a.writeStream()
                .outputMode("update")
                .format("console")
                .start();
        query.awaitTermination();
    }

    public static KafkaMessage handleMessage(String value) {
        String[] values = value.split("\\|");
        return new KafkaMessage(values[0], values[1]);
    }
}
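Incidentally, the first version computed a two-minute timeInterval that was never used. If the intent was to control how often micro-batches fire, a processing-time trigger on the query expresses that directly. A minimal sketch of the same writeStream call with the trigger added (Trigger is org.apache.spark.sql.streaming.Trigger):

import org.apache.spark.sql.streaming.Trigger;

// Same console sink as above, but fire a micro-batch at most once every 2 minutes
StreamingQuery query = a.writeStream()
        .outputMode("update")
        .format("console")
        .trigger(Trigger.ProcessingTime("2 minutes"))
        .start();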
The entity class:
package com.lcc.spark.structed.streaming.entity;

import lombok.Data;

/**
 * @Author: chuanchuan.lcc
 * @CreateDate: 2019/2/14 AM10:06
 * @Version: 1.0
 * @Description: Java class description:
 */
@Data
public class KafkaMessage {
    private String id;
    private String value;

    public KafkaMessage(String id, String value) {
        this.id = id;
        this.value = value;
    }
}
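For reference, if Lombok's @Data is not available, the equivalent bean written by hand looks like the sketch below. The explicit getters and setters are what the bean encoder reflects on; the no-arg constructor is added defensively, since bean encoders generally expect one when they have to construct instances from rows:

package com.lcc.spark.structed.streaming.entity;

public class KafkaMessage implements java.io.Serializable {
    private String id;
    private String value;

    public KafkaMessage() {
        // no-arg constructor: bean encoders use it when deserializing rows back into objects
    }

    public KafkaMessage(String id, String value) {
        this.id = id;
        this.value = value;
    }

    public String getId() { return id; }

    public void setId(String id) { this.id = id; }

    public String getValue() { return value; }

    public void setValue(String value) { this.value = value; }
}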
It now runs correctly:
19/02/14 10:13:43 INFO AppInfoParser: Kafka commitId : a7a17cdec9eaa6c5
+---+-----+
| id|value|
+---+-----+
|  1|    2|
+---+-----+
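As a closing note, the same per-record split can be done without a bean class or encoder at all, using only built-in SQL expressions on the streaming DataFrame. A sketch under the same assumptions (string payloads delimited by '|'); the variable name parsed is illustrative:

// Parse "id|value" records with the built-in split() function instead of a custom bean
Dataset<Row> parsed = lines
        .selectExpr("CAST(value AS STRING) AS raw")
        .selectExpr(
                "split(raw, '\\\\|')[0] AS id",
                "split(raw, '\\\\|')[1] AS value");

StreamingQuery query = parsed.writeStream()
        .outputMode("append")
        .format("console")
        .start();
query.awaitTermination();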