先来看看一段简单的KafkaProducer应用的代码:
import org.apache.kafka.clients.producer.*
;
import org.junit.Test ;
import java.text.SimpleDateFormat ;
import java.util.Date ;
import java.util.Properties ;
public class SimpleProducer {
    /**
     * Sends five timestamped messages to topic "test1234" and prints the
     * offset of each successfully acknowledged record via an async callback.
     */
    @Test
    public void testProduce() {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node87:9092");
        props.put("acks", "all");            // wait for the full ISR to acknowledge
        props.put("retries", 0);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432);
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put(ProducerConfig.METADATA_FETCH_TIMEOUT_CONFIG, "3000");
        KafkaProducer<String, String> producer = new KafkaProducer<String, String>(props);
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        for (int i = 0; i < 5; i++) {
            ProducerRecord<String, String> record = new ProducerRecord<String, String>(
                    "test1234", i + "", simpleDateFormat.format(new Date()) + "---" + i);
            producer.send(record, new Callback() {
                public void onCompletion(RecordMetadata metadata, Exception e) {
                    // BUGFIX: when the send fails the callback receives a null
                    // metadata, so only read the offset on success — the
                    // original unconditionally called metadata.offset() and
                    // would NPE on any send error.
                    if (e != null) {
                        e.printStackTrace();
                    } else {
                        System.out.println("The offset of the record we just sent is: " + metadata.offset());
                    }
                }
            });
        }
        producer.close(); // release network/buffer resources held by the producer
    }
}
import org.junit.Test ;
import java.text.SimpleDateFormat ;
import java.util.Date ;
import java.util.Properties ;
public class SimpleProducer {
    /**
     * Sends five timestamped messages to topic "test1234" and prints the
     * offset of each successfully acknowledged record via an async callback.
     */
    @Test
    public void testProduce() {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node87:9092");
        props.put("acks", "all");            // wait for the full ISR to acknowledge
        props.put("retries", 0);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432);
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put(ProducerConfig.METADATA_FETCH_TIMEOUT_CONFIG, "3000");
        KafkaProducer<String, String> producer = new KafkaProducer<String, String>(props);
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        for (int i = 0; i < 5; i++) {
            ProducerRecord<String, String> record = new ProducerRecord<String, String>(
                    "test1234", i + "", simpleDateFormat.format(new Date()) + "---" + i);
            producer.send(record, new Callback() {
                public void onCompletion(RecordMetadata metadata, Exception e) {
                    // BUGFIX: when the send fails the callback receives a null
                    // metadata, so only read the offset on success — the
                    // original unconditionally called metadata.offset() and
                    // would NPE on any send error.
                    if (e != null) {
                        e.printStackTrace();
                    } else {
                        System.out.println("The offset of the record we just sent is: " + metadata.offset());
                    }
                }
            });
        }
        producer.close(); // release network/buffer resources held by the producer
    }
}
代码很简单。我们先来看看KafkaProducer的继承关系,
Closeable接口表征了KafkaProducer的使用是会占用系统资源的,在停止使用的时候不要忘记调用close()方法回收资源。Producer接口则定义了一个生产者的主要方法,包括
其中send()的两个方法是异步发送消息;partitionsFor()则用来获取指定topic的partition信息,它为每一个partition创建一个PartitionInfo对象一起放在List中返回。metrics是监控相关的方法,本文不做涉及。
KafkaProducer包含了以下全局属性,可以通过它们看出KafkaProducer的一些基础特性
public class KafkaProducer<
K
,
V>
implements Producer<
K
,
V> {
private final Partitioner partitioner ; // partition-assignment strategy
private final int maxRequestSize ; // set by max.request.size: maximum size in bytes of a single request (Record)
private final long metadataFetchTimeoutMs ; // metadata.fetch.timeout.ms: max time to fetch metadata when first sending to a partition
private final long totalMemorySize ; // buffer.memory: total size of the record buffer
private final Metadata metadata ; // key metadata about topics/partitions
private final RecordAccumulator accumulator ; // record accumulator queue: each send appends the Record into a MemoryRecords via append(); when sends outpace draining and totalMemorySize is exceeded, append() blocks (configurable via block.on.buffer.full)
private final Sender sender ; // Sender implements Runnable and handles send requests on a background thread
private final Metrics metrics ; // metrics/monitoring
private final Thread ioThread ; // I/O thread that runs the sender
private final CompressionType compressionType ; // compression type: GZIP, SNAPPY, LZ4, or none
private final Sensor errors ; // metrics sensor that records errors
private final Time time ; // clock abstraction
private final Serializer< K> keySerializer ; // key serializer
private final Serializer< V> valueSerializer ; // value serializer
private final ProducerConfig producerConfig ; // producer configuration
private static final AtomicInteger producerAutoId = new AtomicInteger( 1) ; // auto-incrementing producer id
//...
}
// NOTE(review): the lines below are a verbatim duplicate of the field list above (copy-paste artifact of the article).
private final Partitioner partitioner ; // partition-assignment strategy
private final int maxRequestSize ; // set by max.request.size: maximum size in bytes of a single request (Record)
private final long metadataFetchTimeoutMs ; // metadata.fetch.timeout.ms: max time to fetch metadata when first sending to a partition
private final long totalMemorySize ; // buffer.memory: total size of the record buffer
private final Metadata metadata ; // key metadata about topics/partitions
private final RecordAccumulator accumulator ; // record accumulator queue: each send appends the Record into a MemoryRecords via append(); when sends outpace draining and totalMemorySize is exceeded, append() blocks (configurable via block.on.buffer.full)
private final Sender sender ; // Sender implements Runnable and handles send requests on a background thread
private final Metrics metrics ; // metrics/monitoring
private final Thread ioThread ; // I/O thread that runs the sender
private final CompressionType compressionType ; // compression type: GZIP, SNAPPY, LZ4, or none
private final Sensor errors ; // metrics sensor that records errors
private final Time time ; // clock abstraction
private final Serializer< K> keySerializer ; // key serializer
private final Serializer< V> valueSerializer ; // value serializer
private final ProducerConfig producerConfig ; // producer configuration
private static final AtomicInteger producerAutoId = new AtomicInteger( 1) ; // auto-incrementing producer id
//...
}
一共五个构造器支持通过Map和 Properties的方式配置Producer的属性,并且可以指定使用自己实现的序列化方法(这一块我暂时还没有想到有什么特别好的应用场景,通常的做法都是直接发送字符串或者ProtoBuf协议封装的字符串,还没有体会到自定义序列化函数的好处在哪里,有经验的朋友可以留言交流下)
在实例代码中调用的
KafkaProducer<String , String> producer = new KafkaProducer<String, String>(props);
构造器底层做的事情基本上就是实例化上面的全局参数,我们不赘述细节,这里只提两点。
一个是
// Parse and validate the bootstrap.servers list into socket addresses.
List<InetSocketAddress> addresses = ClientUtils.
parseAndValidateAddresses(config.getList(ProducerConfig.
BOOTSTRAP_SERVERS_CONFIG))
;
// Seed the metadata cache with the bootstrap cluster.
this. metadata.update(Cluster. bootstrap(addresses) , time.milliseconds()) ;
// The NetworkClient (backed by a Selector) owns the actual TCP communication.
NetworkClient client = new NetworkClient( new Selector( this. metrics , time , "producer" , metricTags) ,
this. metadata ,
clientId ,
config.getInt(ProducerConfig. MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION) ,
config.getLong(ProducerConfig. RECONNECT_BACKOFF_MS_CONFIG) ,
config.getInt(ProducerConfig. SEND_BUFFER_CONFIG) ,
config.getInt(ProducerConfig. RECEIVE_BUFFER_CONFIG)) ;
// The sender wraps the client and drives send requests on the I/O thread.
this. sender = new Sender(client ,
this. metadata ,
this. accumulator ,
config.getInt(ProducerConfig. MAX_REQUEST_SIZE_CONFIG) ,
( short) parseAcks(config.getString(ProducerConfig. ACKS_CONFIG)) ,
config.getInt(ProducerConfig. RETRIES_CONFIG) ,
config.getInt(ProducerConfig. TIMEOUT_CONFIG) ,
this. metrics ,
new SystemTime() ,
clientId) ;
// NOTE(review): the lines below are a verbatim duplicate of the snippet above (copy-paste artifact of the article).
this. metadata.update(Cluster. bootstrap(addresses) , time.milliseconds()) ;
// The NetworkClient (backed by a Selector) owns the actual TCP communication.
NetworkClient client = new NetworkClient( new Selector( this. metrics , time , "producer" , metricTags) ,
this. metadata ,
clientId ,
config.getInt(ProducerConfig. MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION) ,
config.getLong(ProducerConfig. RECONNECT_BACKOFF_MS_CONFIG) ,
config.getInt(ProducerConfig. SEND_BUFFER_CONFIG) ,
config.getInt(ProducerConfig. RECEIVE_BUFFER_CONFIG)) ;
// The sender wraps the client and drives send requests on the I/O thread.
this. sender = new Sender(client ,
this. metadata ,
this. accumulator ,
config.getInt(ProducerConfig. MAX_REQUEST_SIZE_CONFIG) ,
( short) parseAcks(config.getString(ProducerConfig. ACKS_CONFIG)) ,
config.getInt(ProducerConfig. RETRIES_CONFIG) ,
config.getInt(ProducerConfig. TIMEOUT_CONFIG) ,
this. metrics ,
new SystemTime() ,
clientId) ;
代码通过解析 bootstrap.servers 配置获取IP/端口后,实例化了一个NetworkClient对象,这个对象的底层是通过维持一个Socket来进行TCP通信。并把这个NetworkClient对象交给sender管理具体的发送任务,想要了解Kafka底层通信机制的朋友可以继续深挖,这里不做展开。
另一个是
// Build the RecordAccumulator: buffers records per partition until a batch
// fills (batch.size) or linger.ms elapses, within the buffer.memory budget.
this.
accumulator =
new RecordAccumulator(config.getInt(ProducerConfig.
BATCH_SIZE_CONFIG)
,
this. totalMemorySize ,
config.getLong(ProducerConfig. LINGER_MS_CONFIG) ,
retryBackoffMs ,
config.getBoolean(ProducerConfig. BLOCK_ON_BUFFER_FULL_CONFIG) ,
metrics ,
time ,
metricTags) ;
// NOTE(review): the lines below are a duplicated tail of the constructor call above (copy-paste artifact of the article).
this. totalMemorySize ,
config.getLong(ProducerConfig. LINGER_MS_CONFIG) ,
retryBackoffMs ,
config.getBoolean(ProducerConfig. BLOCK_ON_BUFFER_FULL_CONFIG) ,
metrics ,
time ,
metricTags) ;
这里实例化了一个RecordAccumulator对象,它处理的是Kafka的发送队列。下面具体看看发送任务是怎么执行的。
/**
 * Asynchronously sends a record: waits for topic metadata, serializes the
 * key and value, picks a partition, appends to the accumulator, and wakes
 * the sender when a batch is ready. Returns a future for the record's
 * metadata; API exceptions are also delivered via the callback/future.
 */
@Override
public Future<RecordMetadata> send(ProducerRecord<K ,V> record , Callback callback) {
try {
// Block until metadata for the topic is available (bounded by metadataFetchTimeoutMs).
waitOnMetadata(record.topic() , this.metadataFetchTimeoutMs) ;
/* Serialize the key and value; a ClassCastException means the configured serializer does not match the record's declared types. */
byte[] serializedKey ;
try {
serializedKey = keySerializer.serialize(record.topic() , record.key()) ;
} catch (ClassCastException cce) {
throw new SerializationException( "Can't convert key of class " + record.key().getClass().getName() +
" to class " + producerConfig.getClass(ProducerConfig. KEY_SERIALIZER_CLASS_CONFIG).getName() +
" specified in key.serializer") ;
}
byte[] serializedValue ;
try {
serializedValue = valueSerializer.serialize(record.topic() , record.value()) ;
} catch (ClassCastException cce) {
throw new SerializationException( "Can't convert value of class " + record.value().getClass().getName() +
" to class " + producerConfig.getClass(ProducerConfig. VALUE_SERIALIZER_CLASS_CONFIG).getName() +
" specified in value.serializer") ;
}
ProducerRecord< byte[] , byte[]> serializedRecord = new ProducerRecord< byte[] , byte[]>(record.topic() , record.partition() , serializedKey , serializedValue) ;
// Determine the target partition: use the one set on the record if any; otherwise derive it from the key, or fall back to the partitioner's internal counter to rotate through partitions.
int partition = partitioner.partition(serializedRecord , metadata.fetch()) ;
int serializedSize = Records.LOG_OVERHEAD + Record.recordSize(serializedKey , serializedValue) ;
ensureValidRecordSize(serializedSize) ;
TopicPartition tp = new TopicPartition(record.topic() , partition) ;
log.trace( "Sending record {} with callback {} to topic {} partition {}" , record , callback , record.topic() , partition) ;
// Buffer the record in the accumulator to await batched sending.
RecordAccumulator.RecordAppendResult result = accumulator.append(tp , serializedKey , serializedValue , compressionType , callback) ;
if (result.batchIsFull || result.newBatchCreated) {
log.trace( "Waking up the sender since topic {} partition {} is either full or getting a new batch" , record.topic() , partition) ;
// Wake the sender so the network client performs the actual send.
this.sender.wakeup() ;
}
return result.future ;
// Handling exceptions and record the errors;
// For API exceptions return them in the future,
// for other exceptions throw directly
} catch (ApiException e) {
log.debug( "Exception occurred during message send:" , e) ;
if (callback != null)
callback.onCompletion( null, e) ;
this.errors.record() ;
return new FutureFailure(e) ;
} catch (InterruptedException e) {
this.errors.record() ;
throw new KafkaException(e) ;
} catch (KafkaException e) {
this.errors.record() ;
throw e ;
}
}
// NOTE(review): the method below is a verbatim duplicate of send() above (copy-paste artifact of the article).
public Future<RecordMetadata> send(ProducerRecord<K ,V> record , Callback callback) {
try {
// Block until metadata for the topic is available (bounded by metadataFetchTimeoutMs).
waitOnMetadata(record.topic() , this.metadataFetchTimeoutMs) ;
/* Serialize the key and value; a ClassCastException means the configured serializer does not match the record's declared types. */
byte[] serializedKey ;
try {
serializedKey = keySerializer.serialize(record.topic() , record.key()) ;
} catch (ClassCastException cce) {
throw new SerializationException( "Can't convert key of class " + record.key().getClass().getName() +
" to class " + producerConfig.getClass(ProducerConfig. KEY_SERIALIZER_CLASS_CONFIG).getName() +
" specified in key.serializer") ;
}
byte[] serializedValue ;
try {
serializedValue = valueSerializer.serialize(record.topic() , record.value()) ;
} catch (ClassCastException cce) {
throw new SerializationException( "Can't convert value of class " + record.value().getClass().getName() +
" to class " + producerConfig.getClass(ProducerConfig. VALUE_SERIALIZER_CLASS_CONFIG).getName() +
" specified in value.serializer") ;
}
ProducerRecord< byte[] , byte[]> serializedRecord = new ProducerRecord< byte[] , byte[]>(record.topic() , record.partition() , serializedKey , serializedValue) ;
// Determine the target partition: use the one set on the record if any; otherwise derive it from the key, or fall back to the partitioner's internal counter to rotate through partitions.
int partition = partitioner.partition(serializedRecord , metadata.fetch()) ;
int serializedSize = Records.LOG_OVERHEAD + Record.recordSize(serializedKey , serializedValue) ;
ensureValidRecordSize(serializedSize) ;
TopicPartition tp = new TopicPartition(record.topic() , partition) ;
log.trace( "Sending record {} with callback {} to topic {} partition {}" , record , callback , record.topic() , partition) ;
// Buffer the record in the accumulator to await batched sending.
RecordAccumulator.RecordAppendResult result = accumulator.append(tp , serializedKey , serializedValue , compressionType , callback) ;
if (result.batchIsFull || result.newBatchCreated) {
log.trace( "Waking up the sender since topic {} partition {} is either full or getting a new batch" , record.topic() , partition) ;
// Wake the sender so the network client performs the actual send.
this.sender.wakeup() ;
}
return result.future ;
// Handling exceptions and record the errors;
// For API exceptions return them in the future,
// for other exceptions throw directly
} catch (ApiException e) {
log.debug( "Exception occurred during message send:" , e) ;
if (callback != null)
callback.onCompletion( null, e) ;
this.errors.record() ;
return new FutureFailure(e) ;
} catch (InterruptedException e) {
this.errors.record() ;
throw new KafkaException(e) ;
} catch (KafkaException e) {
this.errors.record() ;
throw e ;
}
}
以上。