import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import kafka.consumer.Consumer;
import kafka.consumer.ConsumerConfig;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import kafka.serializer.StringDecoder;
import kafka.utils.VerifiableProperties;

public class KafkaConsumer implements Runnable {
    private static final Logger LOGGER = LoggerFactory.getLogger(KafkaConsumer.class);
    /**
     * Kafka consumer connector
     */
    private ConsumerConnector consumer;
    /**
     * Kafka topic
     */
    private String topic;
    /**
     * Number of threads, usually equal to the topic's partition count
     */
    private int numThreads;
    /**
     * Thread pool
     */
    private ExecutorService executorPool;

    /**
     * Constructor
     * @param topic      Kafka topic to consume
     * @param numThreads number of worker threads; usually the topic's partition count
     * @param zookeeper  ZooKeeper connection string for the Kafka cluster
     * @param groupId    consumer group ID this consumer belongs to
     */
    public KafkaConsumer(String topic, int numThreads, String zookeeper, String groupId) {
        // 1. Create the Kafka consumer connector
        this.consumer = Consumer.createJavaConsumerConnector(createConsumerConfig(zookeeper, groupId));
        // 2. Assign the fields
        this.topic = topic;
        this.numThreads = numThreads;
    }
    public void run() {
        // 1. Specify the topic and how many streams (threads) to open for it
        Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
        topicCountMap.put(this.topic, this.numThreads);
        // 2. Specify the key/value decoders
        StringDecoder keyDecoder = new StringDecoder(new VerifiableProperties());
        StringDecoder valueDecoder = new StringDecoder(new VerifiableProperties());
        // 3. Get the message stream iterators from the connector
        /**
         * Key:   topic name
         * Value: list of streams for that topic; its size is the count given in topicCountMap
         */
        Map<String, List<KafkaStream<String, String>>> consumerMap = this.consumer.createMessageStreams(topicCountMap, keyDecoder, valueDecoder);
        // 4. Pull out the streams for our topic
        List<KafkaStream<String, String>> streams = consumerMap.get(this.topic);
        // 5. Create the thread pool
        this.executorPool = Executors.newFixedThreadPool(this.numThreads);
        // 6. Submit one stream processor per stream
        int threadNumber = 0;
        for (final KafkaStream<String, String> stream : streams) {
            this.executorPool.submit(new ConsumerKafkaStreamProcesser(stream, threadNumber));
            threadNumber++;
        }
    }
    public void shutDown() {
        // 1. Close the Kafka connection; this makes stream.hasNext() return false
        if (this.consumer != null) {
            this.consumer.shutdown();
        }
        // 2. Shut down the thread pool and wait for the workers to finish
        if (this.executorPool != null) {
            // 2.1 Stop accepting new tasks
            this.executorPool.shutdown();
            // 2.2 Wait up to five seconds for termination
            try {
                if (!this.executorPool.awaitTermination(5, TimeUnit.SECONDS)) {
                    System.out.println("Timed out waiting for consumer threads to shut down, exiting uncleanly!!");
                }
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers can see the interruption
                Thread.currentThread().interrupt();
                System.out.println("Interrupted during shutdown, exiting uncleanly!!");
            }
        }
    }
    /**
     * Builds the ConsumerConfig from the given ZooKeeper connection string and group ID.
     * @param zookeeper ZooKeeper connection string, e.g.<br/> hadoop-senior01.ibeifeng.com:2181,hadoop-senior02.ibeifeng.com:2181/kafka
     * @param groupId   consumer group ID; consumers sharing the same value balance the partitions among themselves
     * @return the Kafka consumer configuration
     */
    private ConsumerConfig createConsumerConfig(String zookeeper, String groupId) {
        // 1. Build the properties object
        Properties prop = new Properties();
        // 2. Set the relevant properties
        prop.put("group.id", groupId); // consumer group ID
        prop.put("zookeeper.connect", zookeeper); // ZooKeeper connection URL
        prop.put("zookeeper.session.timeout.ms", "400"); // ZooKeeper session timeout
        prop.put("zookeeper.sync.time.ms", "200");
        prop.put("auto.commit.interval.ms", "1000");
        /* Alternative: read the values from a properties file via a factory class:
        prop.put("group.id", ConsumerPropertiesFactory.groupId); // consumer group ID
        prop.put("zookeeper.connect", ConsumerPropertiesFactory.zookeeperConnect); // ZooKeeper connection URL
        prop.put("metadata.broker.list", ConsumerPropertiesFactory.metadataBrokerList);
        prop.put("auto.offset.reset", ConsumerPropertiesFactory.autoOffsetReset);
        prop.put("zookeeper.session.timeout.ms", "400");
        prop.put("zookeeper.sync.time.ms", "200");
        prop.put("auto.commit.interval.ms", "1000"); */
        // 3. Build the ConsumerConfig
        return new ConsumerConfig(prop);
    }
}
The code above pulls data from Kafka with one worker thread per stream. The key point is that the thread count should normally match the topic's partition count: within a consumer group, each partition is consumed by at most one thread at a time, so any threads beyond the partition count simply sit idle. On the producer side, the client batches messages and switches to another partition once a batch reaches a certain size or a time limit expires, which is what spreads data across the partitions (and hence across the consumer threads).
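To make that producer-side behavior concrete, here is a minimal sketch, assuming the newer org.apache.kafka.clients producer rather than the old Scala client used elsewhere in this post; batch.size and linger.ms are exactly the "size or timeout" knobs mentioned above, and the broker address and topic name are placeholders:

import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class BatchingProducerSketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.18.128:9092"); // placeholder broker address
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("batch.size", "16384"); // a batch is sent once it reaches this many bytes...
        props.put("linger.ms", "5");      // ...or once it has waited this long
        KafkaProducer<String, String> producer = new KafkaProducer<String, String>(props);
        for (int i = 0; i < 100; i++) {
            // With no record key, the client picks the partition itself
            producer.send(new ProducerRecord<String, String>("testTopic4", "message-" + i));
        }
        producer.close();
    }
}

Depending on the client version, keyless records are spread across partitions either record by record or batch by batch, but in both cases every partition (and therefore every consumer thread) receives data.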
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.message.MessageAndMetadata;

public class ConsumerKafkaStreamProcesser implements Runnable {
    private static final Logger LOGGER = LoggerFactory.getLogger(ConsumerKafkaStreamProcesser.class);
    // Kafka message stream
    private KafkaStream<String, String> stream;
    // Worker thread number
    private int threadNumber;

    /**
     * Constructor
     */
    public ConsumerKafkaStreamProcesser(KafkaStream<String, String> stream, int threadNumber) {
        this.stream = stream;
        this.threadNumber = threadNumber;
    }
    public void run() {
        // 1. Get the message iterator
        ConsumerIterator<String, String> iter = this.stream.iterator();
        // 2. Iterate over the messages and print them
        while (iter.hasNext()) {
            // 2.1 Fetch the next message and its metadata
            MessageAndMetadata<String, String> value = iter.next();
            // 2.2 Print thread number, offset, key, message and partition
            LOGGER.info(this.threadNumber + ":" + value.offset() + ":" + value.key() + ":" + value.message());
            System.out.println(this.threadNumber + ":" + value.offset() + ":" + value.key() + ":" + value.message() + " partition..." + value.partition());
        }
        // 3. The stream is exhausted, i.e. the connector was shut down
        LOGGER.info("Shutdown Thread:" + this.threadNumber);
        System.out.println("Shutdown Thread:" + this.threadNumber);
    }
}
This class handles the output side for the topic; each instance runs on its own worker thread, and the printed log line (thread number, offset, key, message, partition) documents exactly what was consumed.
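The processor above relies on auto.commit.interval.ms=1000, so offsets are committed in the background. As a hedged variant (not in the original post, and assuming auto.commit.enable were set to "false" and the ConsumerConnector were handed to the worker), offsets can instead be committed manually once each message is fully processed:

import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import kafka.message.MessageAndMetadata;

public class ManualCommitStreamProcesser implements Runnable {
    private final KafkaStream<String, String> stream;
    private final ConsumerConnector connector; // assumption: the connector is passed in

    public ManualCommitStreamProcesser(KafkaStream<String, String> stream, ConsumerConnector connector) {
        this.stream = stream;
        this.connector = connector;
    }

    public void run() {
        ConsumerIterator<String, String> iter = this.stream.iterator();
        while (iter.hasNext()) {
            MessageAndMetadata<String, String> value = iter.next();
            System.out.println(value.partition() + "/" + value.offset() + ": " + value.message());
            // commitOffsets() commits the current position of every stream owned by
            // this connector, so only call it once the message is fully handled
            this.connector.commitOffsets();
        }
    }
}

Committing after every single message is safe but slow; committing every N messages is the usual compromise.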
public class App {
    public static void main(String[] args) {
        String zookeeper = "192.168.18.128:2181";
        String groupId = "group1";
        String topic1 = "testTopic4";
        String topic2 = "testTopic3";
        int threads = 10;
        KafkaConsumer instance = new KafkaConsumer(topic1, threads, zookeeper, groupId);
        KafkaConsumer instance1 = new KafkaConsumer(topic2, threads, zookeeper, groupId);
        new Thread(instance).start();
        new Thread(instance1).start();
        // int sleepMillis = 30000;
        /* try {
            Thread.sleep(sleepMillis);
        } catch (InterruptedException e) {
            e.printStackTrace();
        } */
    }
}
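One gap worth pointing out: App never calls shutDown(), so the connectors and thread pools only disappear when the process is killed. A minimal sketch of a shutdown hook that could be appended at the end of main, assuming instance and instance1 are declared final (this is my addition, not part of the original test class):

        // Register a JVM shutdown hook so both consumers close their ZooKeeper
        // sessions and worker pools cleanly on Ctrl+C / SIGTERM:
        Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
            public void run() {
                instance.shutDown();
                instance1.shutDown();
            }
        }));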
Finally, a brief note on the test class. In real production code the Kafka connection settings are not hard-coded like this; they are kept in a properties file under resources, and the same goes for the database connection settings used when the consumed data is persisted. Loading them once into static fields is fast (see the sketch below).
Also, if you do not actually need concurrency, skip the multi-threaded approach altogether: thread context switches have their own cost.
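As one way to implement that pattern, here is a sketch of the ConsumerPropertiesFactory referenced in the commented-out block earlier; the field names come from that block, while the kafka-consumer.properties file name is a hypothetical choice:

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public final class ConsumerPropertiesFactory {
    public static final String zookeeperConnect;
    public static final String groupId;
    public static final String metadataBrokerList;
    public static final String autoOffsetReset;

    // Static initializer: runs once when the class is loaded, so every consumer
    // shares the same values without re-reading the file
    static {
        Properties p = new Properties();
        try (InputStream in = ConsumerPropertiesFactory.class
                .getResourceAsStream("/kafka-consumer.properties")) { // hypothetical file name
            if (in == null) {
                throw new IOException("kafka-consumer.properties not found on classpath");
            }
            p.load(in);
        } catch (IOException e) {
            throw new ExceptionInInitializerError(e);
        }
        zookeeperConnect = p.getProperty("zookeeper.connect");
        groupId = p.getProperty("group.id");
        metadataBrokerList = p.getProperty("metadata.broker.list");
        autoOffsetReset = p.getProperty("auto.offset.reset", "largest");
    }

    private ConsumerPropertiesFactory() {
    }
}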