As the title says: a code memo. Spark Streaming (1.3) consumes Kafka through the direct API, seeds the starting offsets by hand on first run, and writes the consumed offsets back through KafkaCluster after every batch.
1.Constant.java
package com.sparktest.util;

public class Constant {
    public static String master = "yarn-client";
    public static String topic = "pj";
    public static String appName = "sparktest";
    public static long duration = 10000;
    // Note: zookeeper and partitions are never referenced by App.java below;
    // KafkaCluster talks to the brokers directly.
    public static String zookeeper = "10.67.2.20:2181,10.67.2.21:2181";
    public static String brokerlist = "10.67.2.20:9092,10.67.2.21:9092";
    public static String groupId = "com.sparktest";
    public static int partitions = 10;
}
2.App.java
package com.sparktest.app;

import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;

import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.DefaultDecoder;
import kafka.serializer.StringDecoder;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaCluster;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;

import scala.Predef;
import scala.Tuple2;
import scala.collection.JavaConversions;

import com.sparktest.util.Constant;

public class App implements Serializable {
    private KafkaCluster kafkaCluster = null;
    private Map<String, String> kafkaParams = new HashMap<String, String>();
    private Set<String> topics = new HashSet<String>();
    private Duration duration = new Duration(Constant.duration);
    private Map<TopicAndPartition, Long> fromOffsets = new HashMap<TopicAndPartition, Long>();
    // Note: declared but never read below.
    private static final AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<OffsetRange[]>();

    public App() {
        kafkaParams.put("metadata.broker.list", Constant.brokerlist);
        kafkaParams.put("group.id", Constant.groupId);
        // KafkaCluster wants a scala.collection.immutable.Map, so convert the
        // Java map via a mutable view plus toMap with an identity <:< witness.
        // The same conversion repeats in foreachRDD below; see the helper
        // sketch after this file.
        scala.collection.mutable.Map<String, String> mutableKafkaParam = JavaConversions
                .mapAsScalaMap(kafkaParams);
        scala.collection.immutable.Map<String, String> immutableKafkaParam = mutableKafkaParam
                .toMap(new Predef.$less$colon$less<Tuple2<String, String>, Tuple2<String, String>>() {
                    public Tuple2<String, String> apply(Tuple2<String, String> v1) {
                        return v1;
                    }
                });
        this.kafkaCluster = new KafkaCluster(immutableKafkaParam);
        this.topics.add(Constant.topic);
    }
    public void startApp() {
        JavaSparkContext ctx = new JavaSparkContext(Constant.master, Constant.appName);
        JavaStreamingContext jsctx = new JavaStreamingContext(ctx, duration);
        scala.collection.mutable.Set<String> mutableTopics = JavaConversions
                .asScalaSet(this.topics);
        scala.collection.immutable.Set<String> immutableTopics = mutableTopics.toSet();
        scala.collection.immutable.Set<TopicAndPartition> scalaTopicAndPartitionSet = kafkaCluster
                .getPartitions(immutableTopics).right().get();
        // First run for this group: no stored offsets, so seed every partition at 0.
        if (kafkaCluster.getConsumerOffsets(kafkaParams.get("group.id"),
                scalaTopicAndPartitionSet).isLeft()) {
            Set<TopicAndPartition> javaTopicAndPartitionSet = JavaConversions
                    .setAsJavaSet(scalaTopicAndPartitionSet);
            for (TopicAndPartition topicAndPartition : javaTopicAndPartitionSet) {
                this.fromOffsets.put(topicAndPartition, 0L);
            }
        } else {
            // Offsets already stored: resume each partition from its saved position.
            scala.collection.immutable.Map<TopicAndPartition, Object> consumerOffsetsTemp = kafkaCluster
                    .getConsumerOffsets(kafkaParams.get("group.id"),
                            scalaTopicAndPartitionSet).right().get();
            Map<TopicAndPartition, Object> consumerOffsets = JavaConversions
                    .mapAsJavaMap(consumerOffsetsTemp);
            Set<TopicAndPartition> javaTopicAndPartitionSet = JavaConversions
                    .setAsJavaSet(scalaTopicAndPartitionSet);
            for (TopicAndPartition topicAndPartition : javaTopicAndPartitionSet) {
                Long offset = (Long) consumerOffsets.get(topicAndPartition);
                this.fromOffsets.put(topicAndPartition, offset);
            }
        }
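        // Note (not in the original memo): seeding unknown partitions at 0
        // fails with OffsetOutOfRangeException once retention has deleted the
        // oldest messages. A safer seed would query the leaders via
        // kafkaCluster.getEarliestLeaderOffsets(scalaTopicAndPartitionSet)
        // and use each LeaderOffset's offset() in place of the 0L above.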
        // Direct stream: one RDD partition per Kafka partition, starting from
        // fromOffsets; the message handler keeps only the value bytes.
        JavaInputDStream<byte[]> stream = KafkaUtils.createDirectStream(jsctx,
                String.class, byte[].class, StringDecoder.class,
                DefaultDecoder.class, byte[].class, kafkaParams,
                this.fromOffsets,
                new Function<MessageAndMetadata<String, byte[]>, byte[]>() {
                    public byte[] call(MessageAndMetadata<String, byte[]> v1)
                            throws Exception {
                        return v1.message();
                    }
                });
        stream.foreachRDD(new Function<JavaRDD<byte[]>, Void>() {
            public Void call(JavaRDD<byte[]> arg0) throws Exception {
                OffsetRange[] offsets = ((HasOffsetRanges) arg0.rdd()).offsetRanges();
                for (OffsetRange o : offsets) {
                    // Map this topic/partition to the offset the batch reads up to,
                    // as a Java Map.
                    TopicAndPartition topicAndPartition = new TopicAndPartition(o.topic(), o.partition());
                    Map<TopicAndPartition, Object> topicAndPartitionObjectMap = new HashMap<TopicAndPartition, Object>();
                    topicAndPartitionObjectMap.put(topicAndPartition, o.untilOffset());
                    // Convert the java.util.Map to a scala.collection.immutable.Map.
                    scala.collection.mutable.Map<TopicAndPartition, Object> map =
                            JavaConversions.mapAsScalaMap(topicAndPartitionObjectMap);
                    scala.collection.immutable.Map<TopicAndPartition, Object> scalatopicAndPartitionObjectMap =
                            map.toMap(new Predef.$less$colon$less<Tuple2<TopicAndPartition, Object>, Tuple2<TopicAndPartition, Object>>() {
                                public Tuple2<TopicAndPartition, Object> apply(Tuple2<TopicAndPartition, Object> v1) {
                                    return v1;
                                }
                            });
                    // Write the updated offsets back through kafkaCluster.
                    kafkaCluster.setConsumerOffsets(Constant.groupId, scalatopicAndPartitionObjectMap);
                }
                System.out.println("==========================" + arg0.count()
                        + "==================================");
                return null;
            }
        });
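        // Caveat (not in the original memo): setConsumerOffsets runs before
        // arg0.count() triggers the actual read, so a crash in between skips
        // those messages on restart. Committing after the processing action
        // would give at-least-once semantics instead.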
        jsctx.start();
        jsctx.awaitTermination();
    }

    public static void main(String[] args) {
        App app = new App();
        app.startApp();
    }
}
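The Java-map-to-Scala-immutable-map conversion appears twice above (in the constructor and in foreachRDD). A minimal sketch of a generic helper that would factor it out — the name toScalaImmutableMap is illustrative, not part of the memo:

    // Convert a java.util.Map into the scala.collection.immutable.Map that
    // KafkaCluster's API expects: mapAsScalaMap yields a mutable view, and
    // toMap with an identity <:< witness makes it immutable.
    public static <K, V> scala.collection.immutable.Map<K, V> toScalaImmutableMap(java.util.Map<K, V> javaMap) {
        return JavaConversions.mapAsScalaMap(javaMap).toMap(
                new Predef.$less$colon$less<Tuple2<K, V>, Tuple2<K, V>>() {
                    public Tuple2<K, V> apply(Tuple2<K, V> v1) {
                        return v1;
                    }
                });
    }

With it, the constructor body reduces to new KafkaCluster(toScalaImmutableMap(kafkaParams)), and the commit in foreachRDD to kafkaCluster.setConsumerOffsets(Constant.groupId, toScalaImmutableMap(topicAndPartitionObjectMap)).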
3.pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>sparktest</groupId>
    <artifactId>sparktest</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>sparktest</name>
    <url>http://maven.apache.org</url>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.sparktest.app.App</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.7</version>
            <scope>system</scope>
            <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.10</artifactId>
            <version>1.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.10</artifactId>
            <version>1.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-yarn_2.10</artifactId>
            <version>1.3.0</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
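Build and submit, as a sketch (the assembly plugin above has no execution bound to the package phase, so the assembly goal is invoked explicitly; paths assume a default Maven layout):

    mvn clean package assembly:single
    spark-submit --class com.sparktest.app.App target/sparktest-0.0.1-SNAPSHOT-jar-with-dependencies.jar

Since Constant.master hard-codes yarn-client, no --master flag is needed. If the cluster already ships Spark on its classpath, marking the three Spark dependencies as provided would keep them out of the assembly jar; the memo bundles them in.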