maven
<dependency> <groupId>org.apache.kafka</groupId> <artifactId>kafka_2.10</artifactId> <version>0.9.0-kafka-2.0.2</version> </dependency>
/** Kafka helper used to resolve start offsets; assigned in the constructor. */
private SparkKafka kafka;

/** Name of the Kafka topic this application subscribes to. */
private static final String TOPIC_SOURCE = "TP_LABEL";

/**
 * Populates the Kafka configuration (ZooKeeper quorum, consumer group id and
 * broker list) and builds the {@link SparkKafka} helper subscribed to
 * {@link #TOPIC_SOURCE}.
 *
 * @param args command-line arguments; not read by this constructor
 */
public SparkStoredKuduApp(String[] args) {
    kafka_conf = KafkaPool.getInstance().getConfig();

    // Connection settings are stored under application-specific property keys
    // and translated into Kafka parameter names by kafkaParams().
    kafka_conf.setProperty("zookeeper_connect", "personas1:2181,personas2:2181,personas4:2181");
    kafka_conf.setProperty("groupid_tdx", "tpsc01");
    kafka_conf.setProperty("bootstrap.servers", "personas1:9092,personas2:9092,personas4:9092");

    kafka = new SparkKafka(kafkaParams());
    kafka.setTopics(new HashSet<>(Arrays.asList(TOPIC_SOURCE)));
}

/**
 * Builds the Kafka parameter map (group id, bootstrap servers, ZooKeeper
 * connect string) from the values currently held in {@code kafka_conf}.
 *
 * @return a new mutable map of Kafka parameters
 */
private Map<String, String> kafkaParams() {
    final String groupId = kafka_conf.getProperty("groupid_tdx");
    final String brokers = kafka_conf.getProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG);
    final String zookeeper = kafka_conf.getProperty("zookeeper_connect");

    Map<String, String> params = new HashMap<>();
    params.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
    params.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
    params.put("zookeeper.connect", zookeeper);
    return params;
}
// 获取kafka开始读取偏移量 Map<TopicAndPartition, Long> fromOffsets = kafka.getOffset();
public class SparkKafka implements Serializable { private static final long serialVersionUID = -7633373735487600970L; private Map<String, String> kafkaParams = null; private Set<String> topics = null; private KafkaCluster kafkaCluster = null; public SparkKafka(Map<String, String> kafkaParams) { this.kafkaParams = kafkaParams; init(); } private void init() { scala.collection.mutable.Map<String, String> mutableKafkaParam = JavaConversions.mapAsScalaMap(kafkaParams); scala.collection.immutable.Map<String, String> immutableKafkaParam = mutableKafkaParam .toMap(new Predef.$less$colon$less<Tuple2<String, String>, Tuple2<String, String>>() { @Override public Tuple2<String, String> apply(Tuple2<String, String> v1) { return v1; } }); kafkaCluster = new KafkaCluster(immutableKafkaParam); } /** * 获取kafka offset * * @return */ public Map<TopicAndPartition, Long> getOffset() { Map<TopicAndPartition, Long> fromOffsets = new HashMap<TopicAndPartition, Long>(); scala.collection.mutable.Set<String> mutableTopics = JavaConversions.asScalaSet(this.topics); scala.collection.immutable.Set<String> immutableTopics = mutableTopics.toSet(); scala.collection.immutable.Set<TopicAndPartition> scalaTopicAndPartitionSet = kafkaCluster .getPartitions(immutableTopics).right().get(); // 首次消费 if (kafkaCluster.getConsumerOffsets(kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG), scalaTopicAndPartitionSet) .isLeft()) { scala.collection.immutable.Map<TopicAndPartition, LeaderOffset> earliestOffsetsTemp = kafkaCluster .getEarliestLeaderOffsets(scalaTopicAndPartitionSet).right().get(); Set<TopicAndPartition> javaTopicAndPartitionSet = JavaConversions.setAsJavaSet(scalaTopicAndPartitionSet); Map<TopicAndPartition, LeaderOffset> earliestOffsets = JavaConversions.mapAsJavaMap(earliestOffsetsTemp); for (TopicAndPartition topicAndPartition : javaTopicAndPartitionSet) { LeaderOffset latestOffset = earliestOffsets.get(topicAndPartition); fromOffsets.put(topicAndPartition, latestOffset.offset()); } } else { 
scala.collection.immutable.Map<TopicAndPartition, LeaderOffset> earliestOffsetsTemp = kafkaCluster .getEarliestLeaderOffsets(scalaTopicAndPartitionSet).right().get(); scala.collection.immutable.Map<TopicAndPartition, Object> consumerOffsetsTemp = kafkaCluster .getConsumerOffsets(kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG), scalaTopicAndPartitionSet) .right().get(); Map<TopicAndPartition, LeaderOffset> earliestOffsets = JavaConversions.mapAsJavaMap(earliestOffsetsTemp); Map<TopicAndPartition, Object> consumerOffsets = JavaConversions.mapAsJavaMap(consumerOffsetsTemp); Set<TopicAndPartition> javaTopicAndPartitionSet = JavaConversions.setAsJavaSet(scalaTopicAndPartitionSet); for (TopicAndPartition topicAndPartition : javaTopicAndPartitionSet) { LeaderOffset earliestOffset = earliestOffsets.get(topicAndPartition); Long offset = (Long) consumerOffsets.get(topicAndPartition); // 如果消费的offset小于leader的earlistOffset,有可能是kafka定时清理已删除该offset文件 // 这时将过期的offset更新为leader的earlistOffset开始消费,避免offsetOutOfRang异常 if (offset < earliestOffset.offset()) { offset = earliestOffset.offset(); } fromOffsets.put(topicAndPartition, offset); } } return fromOffsets; } /** * 设置kafka offset * * @param range */ public void setOffset(HasOffsetRanges range) { OffsetRange[] offsets = range.offsetRanges(); for (OffsetRange o : offsets) { // 封装topic.partition 与 offset对应关系 java Map TopicAndPartition topicAndPartition = new TopicAndPartition(o.topic(), o.partition()); Map<TopicAndPartition, Object> topicAndPartitionObjectMap = new HashMap<TopicAndPartition, Object>(); topicAndPartitionObjectMap.put(topicAndPartition, o.untilOffset()); // 转换java map to scala immutable.map scala.collection.mutable.Map<TopicAndPartition, Object> map = JavaConversions .mapAsScalaMap(topicAndPartitionObjectMap); scala.collection.immutable.Map<TopicAndPartition, Object> scalatopicAndPartitionObjectMap = map.toMap( new Predef.$less$colon$less<Tuple2<TopicAndPartition, Object>, Tuple2<TopicAndPartition, Object>>() { private 
static final long serialVersionUID = 1L; public Tuple2<TopicAndPartition, Object> apply(Tuple2<TopicAndPartition, Object> v1) { return v1; } }); // 更新offset到kafkaCluster kafkaCluster.setConsumerOffsets(kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG), scalatopicAndPartitionObjectMap); } } @SuppressWarnings("unchecked") public static Class<MessageAndMetadata<String, byte[]>> getMsgClass() { return (Class<MessageAndMetadata<String, byte[]>>) (Class<?>) MessageAndMetadata.class; } public Map<String, String> getKafkaParams() { return kafkaParams; } public void setKafkaParams(Map<String, String> kafkaParams) { this.kafkaParams = kafkaParams; } public Set<String> getTopics() { return topics; } public void setTopics(Set<String> topics) { this.topics = topics; } public KafkaCluster getKafkaCluster() { return kafkaCluster; } public void setKafkaCluster(KafkaCluster kafkaCluster) { this.kafkaCluster = kafkaCluster; } }