来自原文:http://blog.csdn.net/xfg0218/article/details/53014206
如题。代码备忘:Spark Streaming 直连 Kafka、手动管理 offset 的示例代码。
1.Constant.java
- package com.sparktest.util;
- public class Constant {
- public static String master = "yarn-client";
- public static String topic = "pj";
- public static String appName = "sparktest";
- public static long duration = 10000;
- public static String zookeeper = "10.67.2.20:2181,10.67.2.21:2181";
- public static String brokerlist = "10.67.2.20:9092,10.67.2.21:9092";
- public static String groupId = "com.sparktest";
- public static int partitions = 10;
- }
2.App.java
- package com.sparktest.app;
- import java.io.Serializable;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.Map;
- import java.util.Set;
- import java.util.concurrent.atomic.AtomicReference;
- import kafka.common.TopicAndPartition;
- import kafka.message.MessageAndMetadata;
- import kafka.serializer.DefaultDecoder;
- import kafka.serializer.StringDecoder;
- import org.apache.spark.api.java.JavaRDD;
- import org.apache.spark.api.java.JavaSparkContext;
- import org.apache.spark.api.java.function.Function;
- import org.apache.spark.streaming.Duration;
- import org.apache.spark.streaming.api.java.JavaInputDStream;
- import org.apache.spark.streaming.api.java.JavaStreamingContext;
- import org.apache.spark.streaming.kafka.HasOffsetRanges;
- import org.apache.spark.streaming.kafka.KafkaCluster;
- import org.apache.spark.streaming.kafka.KafkaUtils;
- import org.apache.spark.streaming.kafka.OffsetRange;
- import scala.Predef;
- import scala.Tuple2;
- import scala.collection.JavaConversions;
- import com.sparktest.util.Constant;
- public class App implements Serializable{
- private KafkaCluster kafkaCluster = null;
- private Map<String, String> kafkaParams = new HashMap<String, String>();
- private Set<String> topics = new HashSet<String>();
- private Duration duration = new Duration(Constant.duration);
- private java.util.Map<kafka.common.TopicAndPartition, Long> fromOffsets = new java.util.HashMap<kafka.common.TopicAndPartition, Long>();
- private static final AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<OffsetRange[]>();
- public App() {
- kafkaParams.put("metadata.broker.list", Constant.brokerlist);
- kafkaParams.put("group.id", Constant.groupId);
- scala.collection.mutable.Map<String, String> mutableKafkaParam = JavaConversions
- .mapAsScalaMap(kafkaParams);
- scala.collection.immutable.Map<String, String> immutableKafkaParam = mutableKafkaParam
- .toMap(new Predef.$less$colon$less<Tuple2<String, String>, Tuple2<String, String>>() {
- public Tuple2<String, String> apply(
- Tuple2<String, String> v1) {
- return v1;
- }
- });
- this.kafkaCluster = new KafkaCluster(immutableKafkaParam);
- this.topics.add(Constant.topic);
- }
- public void startApp() {
- JavaSparkContext ctx = new JavaSparkContext(Constant.master,
- Constant.appName);
- JavaStreamingContext jsctx = new JavaStreamingContext(ctx, duration);
- scala.collection.mutable.Set<String> mutableTopics = JavaConversions
- .asScalaSet(this.topics);
- scala.collection.immutable.Set<String> immutableTopics = mutableTopics
- .toSet();
- scala.collection.immutable.Set<TopicAndPartition> scalaTopicAndPartitionSet = kafkaCluster
- .getPartitions(immutableTopics).right().get();
- // 首次消费,默认设置为0
- if (kafkaCluster.getConsumerOffsets(kafkaParams.get("group.id"),
- scalaTopicAndPartitionSet).isLeft()) {
- Set<TopicAndPartition> javaTopicAndPartitionSet = JavaConversions
- .setAsJavaSet(scalaTopicAndPartitionSet);
- for (TopicAndPartition topicAndPartition : javaTopicAndPartitionSet) {
- this.fromOffsets.put(topicAndPartition, 0L);
- }
- } else {
- scala.collection.immutable.Map<TopicAndPartition, Object> consumerOffsetsTemp = kafkaCluster
- .getConsumerOffsets(kafkaParams.get("group.id"),
- scalaTopicAndPartitionSet).right().get();
- Map<TopicAndPartition, Object> consumerOffsets = JavaConversions
- .mapAsJavaMap(consumerOffsetsTemp);
- Set<TopicAndPartition> javaTopicAndPartitionSet = JavaConversions
- .setAsJavaSet(scalaTopicAndPartitionSet);
- for (TopicAndPartition topicAndPartition : javaTopicAndPartitionSet) {
- Long offset = (Long) consumerOffsets.get(topicAndPartition);
- this.fromOffsets.put(topicAndPartition, offset);
- }
- }
- JavaInputDStream<byte[]> stream = KafkaUtils.createDirectStream(jsctx,
- String.class, byte[].class, StringDecoder.class,
- DefaultDecoder.class, byte[].class, kafkaParams,
- this.fromOffsets,
- new Function<MessageAndMetadata<String, byte[]>, byte[]>() {
- public byte[] call(MessageAndMetadata<String, byte[]> v1)
- throws Exception {
- return v1.message();
- }
- });
- stream.foreachRDD(new Function<JavaRDD<byte[]>, Void>() {
- public Void call(JavaRDD<byte[]> arg0) throws Exception {
- OffsetRange[] offsets = ((HasOffsetRanges) arg0.rdd()).offsetRanges();
- for(OffsetRange o: offsets){
- // 封装topic.partition 与 offset对应关系 java Map
- TopicAndPartition topicAndPartition = new TopicAndPartition(o.topic(), o.partition());
- Map<TopicAndPartition, Object> topicAndPartitionObjectMap = new HashMap<TopicAndPartition, Object>();
- topicAndPartitionObjectMap.put(topicAndPartition, o.untilOffset());
- // 转换java map to scala immutable.map
- scala.collection.mutable.Map<TopicAndPartition, Object> map =
- JavaConversions.mapAsScalaMap(topicAndPartitionObjectMap);
- scala.collection.immutable.Map<TopicAndPartition, Object> scalatopicAndPartitionObjectMap =
- map.toMap(new Predef.$less$colon$less<Tuple2<TopicAndPartition, Object>, Tuple2<TopicAndPartition, Object>>() {
- public Tuple2<TopicAndPartition, Object> apply(Tuple2<TopicAndPartition, Object> v1) {
- return v1;
- }
- });
- // 更新offset到kafkaCluster
- kafkaCluster.setConsumerOffsets(Constant.groupId, scalatopicAndPartitionObjectMap);
- }
- System.out.println("==========================" + arg0.count()
- + "==================================");
- return null;
- }
- });
- jsctx.start();
- jsctx.awaitTermination();
- }
- public static void main(String[] args) {
- App app = new App();
- app.startApp();
- }
- }
3.pom.xml
- <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <groupId>sparktest</groupId>
- <artifactId>sparktest</artifactId>
- <version>0.0.1-SNAPSHOT</version>
- <packaging>jar</packaging>
- <name>sparktest</name>
- <url>http://maven.apache.org</url>
- <properties>
- <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- </properties>
- <build>
- <plugins>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- <configuration>
- <archive>
- <manifest>
- <mainClass>com.allen.capturewebdata.Main</mainClass>
- </manifest>
- </archive>
- <descriptorRefs>
- <descriptorRef>jar-with-dependencies</descriptorRef>
- </descriptorRefs>
- </configuration>
- </plugin>
- </plugins>
- </build>
- <dependencies>
- <dependency>
- <groupId>jdk.tools</groupId>
- <artifactId>jdk.tools</artifactId>
- <version>1.7</version>
- <scope>system</scope>
- <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
- </dependency>
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-streaming_2.10</artifactId>
- <version>1.3.0</version>
- </dependency>
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-streaming-kafka_2.10</artifactId>
- <version>1.3.0</version>
- </dependency>
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-yarn_2.10</artifactId>
- <version>1.3.0</version>
- </dependency>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>3.8.1</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
- </project>