转载自http://my.oschina.net/cloudcoder/blog/299215?fromerr=1Luc9l6g
介绍
http://kafka.apache.org
kafka是一种高吞吐量的分布式发布订阅消息系统
kafka是linkedin用于日志处理的分布式消息队列,linkedin的日志数据容量大,但对可靠性要求不高,其日志数据主要包括用户行为(登录、浏览、点击、分享、喜欢)以及系统运行日志(CPU、内存、磁盘、网络、系统及进程状态)
当前很多的消息队列服务提供可靠交付保证,并默认是即时消费(不适合离线)。
高可靠交付对linkedin的日志不是必须的,故可通过降低可靠性来提高性能,同时通过构建分布式的集群,允许消息在系统中累积,使得kafka同时支持离线和在线日志处理
测试环境
kafka_2.10-0.8.1.1 3个节点做的集群
zookeeper-3.4.5 一个实例节点
代码示例
消息生产者代码示例
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
import
java.util.Collections;
import
java.util.Date;
import
java.util.Properties;
import
java.util.Random;
import
kafka.javaapi.producer.Producer;
import
kafka.producer.KeyedMessage;
import
kafka.producer.ProducerConfig;
/**
 * Minimal Kafka 0.8 producer demo: sends a fixed number of keyed string
 * messages to the {@code page_visits} topic and prints the elapsed time.
 *
 * For details see:
 * https://cwiki.apache.org/confluence/display/KAFKA/0.8.0+Producer+Example
 *
 * @author Fung
 */
public class ProducerDemo {

    /**
     * Builds the producer configuration, sends the messages and closes the
     * producer.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        int events = 100;

        // Producer configuration properties.
        Properties props = new Properties();
        props.put("metadata.broker.list",
                "172.168.63.221:9092,172.168.63.233:9092,172.168.63.234:9092");
        props.put("serializer.class", "kafka.serializer.StringEncoder");
        // key.serializer.class defaults to the value of serializer.class.
        props.put("key.serializer.class", "kafka.serializer.StringEncoder");
        // Optional: when not configured, the default partitioner is used.
        // NOTE(review): the value must be the fully-qualified name of the
        // partitioner class actually on the classpath - confirm the package.
        props.put("partitioner.class", "com.catt.kafka.demo.PartitionerDemo");
        // Enables the acknowledgement mechanism; without it sends are
        // fire-and-forget and data may be lost. Valid values are 0, 1 and -1,
        // see http://kafka.apache.org/08/configuration.html
        props.put("request.required.acks", "1");
        ProducerConfig config = new ProducerConfig(props);

        // Create the producer.
        Producer<String, String> producer = new Producer<String, String>(config);
        try {
            // Generate and send the messages.
            long start = System.currentTimeMillis();
            for (long i = 0; i < events; i++) {
                long runtime = System.currentTimeMillis();
                String ip = "192.168.2." + i;
                String msg = runtime + ",www.example.com," + ip;
                // Per the original author's note: if the topic does not exist it
                // is created automatically with replication-factor 1.
                // NOTE(review): the original note claimed "0 partitions" for the
                // auto-created topic; that looks wrong - confirm against the
                // broker's num.partitions setting.
                KeyedMessage<String, String> data =
                        new KeyedMessage<String, String>("page_visits", ip, msg);
                producer.send(data);
            }
            System.out.println("耗时:" + (System.currentTimeMillis() - start));
        } finally {
            // Always release the producer, even when a send fails.
            producer.close();
        }
    }
}
|
消息消费者代码示例
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
import
java.util.HashMap;
import
java.util.List;
import
java.util.Map;
import
java.util.Properties;
import
java.util.concurrent.ExecutorService;
import
java.util.concurrent.Executors;
import
kafka.consumer.Consumer;
import
kafka.consumer.ConsumerConfig;
import
kafka.consumer.KafkaStream;
import
kafka.javaapi.consumer.ConsumerConnector;
/**
 * High-level (consumer group) Kafka 0.8 consumer demo: opens a number of
 * streams for one topic and hands each stream to a {@code ConsumerMsgTask}
 * running on a fixed-size thread pool.
 *
 * For details see:
 * https://cwiki.apache.org/confluence/display/KAFKA/Consumer+Group+Example
 *
 * @author Fung
 */
public class ConsumerDemo {
    private final ConsumerConnector consumer;
    private final String topic;
    // Created lazily by run(); stays null until then.
    private ExecutorService executor;

    /**
     * Creates the consumer connector for the given ZooKeeper address and group.
     *
     * @param a_zookeeper value for the {@code zookeeper.connect} property
     * @param a_groupId   value for the {@code group.id} property
     * @param a_topic     topic whose streams will be consumed by {@link #run(int)}
     */
    public ConsumerDemo(String a_zookeeper, String a_groupId, String a_topic) {
        consumer = Consumer.createJavaConsumerConnector(
                createConsumerConfig(a_zookeeper, a_groupId));
        this.topic = a_topic;
    }

    /**
     * Shuts down the consumer connector and, if {@link #run(int)} was called,
     * the thread pool.
     */
    public void shutdown() {
        if (consumer != null) {
            consumer.shutdown();
        }
        if (executor != null) {
            executor.shutdown();
        }
    }

    /**
     * Requests {@code numThreads} streams for the topic and submits one
     * {@code ConsumerMsgTask} per returned stream to a pool of the same size.
     *
     * @param numThreads number of streams to request and size of the thread pool
     */
    public void run(int numThreads) {
        Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
        topicCountMap.put(topic, Integer.valueOf(numThreads));
        Map<String, List<KafkaStream<byte[], byte[]>>> consumerMap =
                consumer.createMessageStreams(topicCountMap);
        List<KafkaStream<byte[], byte[]>> streams = consumerMap.get(topic);

        // Launch the worker pool.
        executor = Executors.newFixedThreadPool(numThreads);

        // One task per stream; threadNumber is only used to label the output.
        int threadNumber = 0;
        for (final KafkaStream<byte[], byte[]> stream : streams) {
            executor.submit(new ConsumerMsgTask(stream, threadNumber));
            threadNumber++;
        }
    }

    /**
     * Builds the consumer configuration with fixed ZooKeeper timing and
     * auto-commit settings.
     *
     * @param a_zookeeper value for {@code zookeeper.connect}
     * @param a_groupId   value for {@code group.id}
     * @return the resulting consumer configuration
     */
    private static ConsumerConfig createConsumerConfig(String a_zookeeper,
            String a_groupId) {
        Properties props = new Properties();
        props.put("zookeeper.connect", a_zookeeper);
        props.put("group.id", a_groupId);
        props.put("zookeeper.session.timeout.ms", "400");
        props.put("zookeeper.sync.time.ms", "200");
        props.put("auto.commit.interval.ms", "1000");
        return new ConsumerConfig(props);
    }

    /**
     * Runs the demo against a hard-coded ZooKeeper address, group, topic and
     * thread count for ten seconds, then shuts down.
     *
     * @param arg command-line arguments (ignored; the settings below are used)
     */
    public static void main(String[] arg) {
        String[] args = {"172.168.63.221:2188", "group-1", "page_visits", "12"};
        String zooKeeper = args[0];
        String groupId = args[1];
        String topic = args[2];
        int threads = Integer.parseInt(args[3]);

        ConsumerDemo demo = new ConsumerDemo(zooKeeper, groupId, topic);
        demo.run(threads);
        try {
            Thread.sleep(10000);
        } catch (InterruptedException ie) {
            // Keep the interrupt visible to the caller and fall through to shutdown.
            Thread.currentThread().interrupt();
        }
        demo.shutdown();
    }
}
|
消息处理类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
import
kafka.consumer.ConsumerIterator;
import
kafka.consumer.KafkaStream;
/**
 * Worker that drains one Kafka stream and prints every message to standard
 * output, prefixed with the number of the thread it was assigned.
 */
public class ConsumerMsgTask implements Runnable {
    // Decode explicitly as UTF-8 instead of relying on the platform default
    // charset, matching the UTF-8 decoding used by ConsumerSimpleExample.
    private static final java.nio.charset.Charset UTF_8 =
            java.nio.charset.Charset.forName("UTF-8");

    private final KafkaStream<byte[], byte[]> m_stream;
    private final int m_threadNumber;

    /**
     * @param stream       the stream to consume
     * @param threadNumber label printed in front of every message
     */
    public ConsumerMsgTask(KafkaStream<byte[], byte[]> stream, int threadNumber) {
        m_threadNumber = threadNumber;
        m_stream = stream;
    }

    /**
     * Prints messages for as long as the stream's iterator reports more, then
     * prints a shutdown line.
     */
    @Override
    public void run() {
        ConsumerIterator<byte[], byte[]> it = m_stream.iterator();
        while (it.hasNext()) {
            System.out.println("Thread " + m_threadNumber + ": "
                    + new String(it.next().message(), UTF_8));
        }
        System.out.println("Shutting down Thread: " + m_threadNumber);
    }
}
|
Partitioner类示例
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
import
kafka.producer.Partitioner;
import
kafka.utils.VerifiableProperties;
/**
 * Partitioner that routes string keys by the number following their last
 * '.' (for example the last octet of an IP address) and any other key by the
 * length of its string form.
 */
public class PartitionerDemo implements Partitioner {

    /**
     * @param props producer properties (not used by this partitioner)
     */
    public PartitionerDemo(VerifiableProperties props) {
    }

    /**
     * Chooses the partition for a message key.
     *
     * @param obj           the message key
     * @param numPartitions number of partitions to choose from
     * @return a partition index in {@code [0, numPartitions)}; 0 for a null
     *         key or a string key with no '.' after its first character
     */
    @Override
    public int partition(Object obj, int numPartitions) {
        if (obj == null) {
            return 0;
        }
        if (!(obj instanceof String)) {
            return obj.toString().length() % numPartitions;
        }

        String key = (String) obj;
        int offset = key.lastIndexOf('.');
        if (offset <= 0) {
            return 0;
        }
        try {
            int suffix = Integer.parseInt(key.substring(offset + 1));
            return nonNegativeMod(suffix, numPartitions);
        } catch (NumberFormatException e) {
            // The text after the last '.' is not a number: fall back to the key's
            // hash instead of failing the send.
            return nonNegativeMod(key.hashCode(), numPartitions);
        }
    }

    /** Returns {@code value mod divisor} mapped into {@code [0, divisor)}. */
    private static int nonNegativeMod(int value, int divisor) {
        return ((value % divisor) + divisor) % divisor;
    }
}
|
pom.xml文件
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
<!-- Maven build for the Kafka demo classes shown above. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.xxx</groupId>
  <artifactId>kafka-demo</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>kafka-demo</name>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <!-- Kafka 0.8.1.1 built for Scala 2.10, matching the test cluster. -->
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_2.10</artifactId>
      <version>0.8.1.1</version>
      <exclusions>
        <exclusion>
          <artifactId>jmxtools</artifactId>
          <groupId>com.sun.jdmk</groupId>
        </exclusion>
        <exclusion>
          <artifactId>jmxri</artifactId>
          <groupId>com.sun.jmx</groupId>
        </exclusion>
        <exclusion>
          <artifactId>jms</artifactId>
          <groupId>javax.jms</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.15</version>
      <exclusions>
        <exclusion>
          <artifactId>jmxtools</artifactId>
          <groupId>com.sun.jdmk</groupId>
        </exclusion>
        <exclusion>
          <artifactId>jmxri</artifactId>
          <groupId>com.sun.jmx</groupId>
        </exclusion>
        <exclusion>
          <artifactId>jms</artifactId>
          <groupId>javax.jms</groupId>
        </exclusion>
        <exclusion>
          <artifactId>mail</artifactId>
          <groupId>javax.mail</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>
|
参考
https://cwiki.apache.org/confluence/display/KAFKA/Index
什么时间使用高级应用?
- 针对一个消息读取多次
- 在一个process中,仅仅处理一个topic中的一组partitions
- 使用事务,确保每个消息只被处理一次
使用高级应用(调用较底层函数)的缺点?
SimpleConsumer需要做很多额外的工作(在以groups方式进行消息处理时不需要)
- 在应用程序中跟踪上次消息处理的offset
- 确定一个topic partition的lead broker
- 手工处理broker leader的改变
使用底层函数(SimpleConsumer)开发的步骤
- 通过active broker,确定topic partition的lead broker
- 确定topic partition的replica brokers
- 根据需要,创建数据请求
- 抓取数据
- 识别lead broker改变并进行恢复
代码示例
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
|
import
java.nio.ByteBuffer;
import
java.util.ArrayList;
import
java.util.Collections;
import
java.util.HashMap;
import
java.util.List;
import
java.util.Map;
import
kafka.api.FetchRequest;
import
kafka.api.FetchRequestBuilder;
import
kafka.api.PartitionOffsetRequestInfo;
import
kafka.cluster.Broker;
import
kafka.common.ErrorMapping;
import
kafka.common.TopicAndPartition;
import
kafka.javaapi.FetchResponse;
import
kafka.javaapi.OffsetResponse;
import
kafka.javaapi.PartitionMetadata;
import
kafka.javaapi.TopicMetadata;
import
kafka.javaapi.TopicMetadataRequest;
import
kafka.javaapi.TopicMetadataResponse;
import
kafka.javaapi.consumer.SimpleConsumer;
import
kafka.message.MessageAndOffset;
/**
 * Low-level Kafka 0.8 consumer demo built on {@code SimpleConsumer}: looks up
 * the leader of one topic partition, fetches messages from it starting at the
 * latest offset, prints them, and looks for a new leader when a fetch fails.
 *
 * https://cwiki.apache.org/confluence/display/KAFKA/0.8.0+SimpleConsumer+Example
 *
 * @author Fung
 */
public class ConsumerSimpleExample {

    /**
     * Runs the demo with hard-coded settings: max reads, topic, partition,
     * seed broker host and broker port.
     *
     * @param arg command-line arguments (ignored; the settings below are used)
     */
    public static void main(String arg[]) {
        String[] args = {"20", "page_visits", "2", "172.168.63.233", "9092"};
        ConsumerSimpleExample example = new ConsumerSimpleExample();
        long maxReads = Long.parseLong(args[0]);
        String topic = args[1];
        int partition = Integer.parseInt(args[2]);
        List<String> seeds = new ArrayList<String>();
        seeds.add(args[3]);
        int port = Integer.parseInt(args[4]);
        try {
            example.run(maxReads, topic, partition, seeds, port);
        } catch (Exception e) {
            System.out.println("Oops:" + e);
            e.printStackTrace();
        }
    }

    // Hosts of the replicas reported by the most recent successful findLeader
    // call; findNewLeader uses them as the brokers to query.
    private final List<String> m_replicaBrokers = new ArrayList<String>();

    public ConsumerSimpleExample() {
    }

    /**
     * Reads and prints up to {@code a_maxReads} messages from one partition.
     *
     * @param a_maxReads    maximum number of messages to print
     * @param a_topic       topic to read from
     * @param a_partition   partition id within the topic
     * @param a_seedBrokers broker hosts to ask for the partition metadata
     * @param a_port        port used for every broker connection
     * @throws Exception if no new leader can be found after a fetch error, or
     *                   if a broker call fails
     */
    public void run(long a_maxReads, String a_topic, int a_partition,
            List<String> a_seedBrokers, int a_port) throws Exception {
        // Find the metadata about the topic and partition we are interested in.
        PartitionMetadata metadata = findLeader(a_seedBrokers, a_port, a_topic,
                a_partition);
        if (metadata == null) {
            System.out.println("Can't find metadata for Topic and Partition. Exiting");
            return;
        }
        if (metadata.leader() == null) {
            System.out.println("Can't find Leader for Topic and Partition. Exiting");
            return;
        }
        String leadBroker = metadata.leader().host();
        String clientName = "Client_" + a_topic + "_" + a_partition;

        SimpleConsumer consumer = new SimpleConsumer(leadBroker, a_port, 100000,
                64 * 1024, clientName);
        try {
            long readOffset = getLastOffset(consumer, a_topic, a_partition,
                    kafka.api.OffsetRequest.LatestTime(), clientName);
            int numErrors = 0;
            while (a_maxReads > 0) {
                if (consumer == null) {
                    // The previous connection was dropped after a fetch error.
                    consumer = new SimpleConsumer(leadBroker, a_port, 100000,
                            64 * 1024, clientName);
                }
                // Note: this fetchSize of 100000 might need to be increased if
                // large batches are written to Kafka
                FetchRequest req = new FetchRequestBuilder().clientId(clientName)
                        .addFetch(a_topic, a_partition, readOffset, 100000).build();
                FetchResponse fetchResponse = consumer.fetch(req);

                if (fetchResponse.hasError()) {
                    numErrors++;
                    // Something went wrong!
                    short code = fetchResponse.errorCode(a_topic, a_partition);
                    System.out.println("Error fetching data from the Broker:"
                            + leadBroker + " Reason: " + code);
                    if (numErrors > 5) {
                        break;
                    }
                    if (code == ErrorMapping.OffsetOutOfRangeCode()) {
                        // We asked for an invalid offset. For simple case ask for
                        // the last element to reset
                        readOffset = getLastOffset(consumer, a_topic, a_partition,
                                kafka.api.OffsetRequest.LatestTime(), clientName);
                        continue;
                    }
                    consumer.close();
                    consumer = null;
                    leadBroker = findNewLeader(leadBroker, a_topic, a_partition,
                            a_port);
                    continue;
                }
                numErrors = 0;

                long numRead = 0;
                for (MessageAndOffset messageAndOffset : fetchResponse.messageSet(
                        a_topic, a_partition)) {
                    if (a_maxReads <= 0) {
                        // Do not print more than the requested number of messages,
                        // even when the fetched batch holds more.
                        break;
                    }
                    long currentOffset = messageAndOffset.offset();
                    if (currentOffset < readOffset) {
                        System.out.println("Found an old offset: " + currentOffset
                                + " Expecting: " + readOffset);
                        continue;
                    }
                    readOffset = messageAndOffset.nextOffset();
                    ByteBuffer payload = messageAndOffset.message().payload();

                    // Size by remaining() so the copy is correct whatever the
                    // buffer's current position is.
                    byte[] bytes = new byte[payload.remaining()];
                    payload.get(bytes);
                    System.out.println(String.valueOf(messageAndOffset.offset())
                            + ": " + new String(bytes, "UTF-8"));
                    numRead++;
                    a_maxReads--;
                }

                if (numRead == 0) {
                    // Nothing new yet: wait before polling again.
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException ie) {
                        // Stop reading and keep the interrupt visible to the caller.
                        Thread.currentThread().interrupt();
                        break;
                    }
                }
            }
        } finally {
            // Release the connection on every exit path, including exceptions.
            if (consumer != null) {
                consumer.close();
            }
        }
    }

    /**
     * Asks the broker for a single offset of the partition relative to
     * {@code whichTime}.
     *
     * @param consumer   connection to the broker to query
     * @param topic      topic name
     * @param partition  partition id
     * @param whichTime  time selector passed to the offset request (the demo
     *                   passes {@code kafka.api.OffsetRequest.LatestTime()})
     * @param clientName client id sent with the request
     * @return the first offset in the response, or 0 when the response reports
     *         an error
     */
    public static long getLastOffset(SimpleConsumer consumer, String topic,
            int partition, long whichTime, String clientName) {
        TopicAndPartition topicAndPartition = new TopicAndPartition(topic,
                partition);
        Map<TopicAndPartition, PartitionOffsetRequestInfo> requestInfo =
                new HashMap<TopicAndPartition, PartitionOffsetRequestInfo>();
        requestInfo.put(topicAndPartition,
                new PartitionOffsetRequestInfo(whichTime, 1));
        kafka.javaapi.OffsetRequest request = new kafka.javaapi.OffsetRequest(
                requestInfo, kafka.api.OffsetRequest.CurrentVersion(), clientName);
        OffsetResponse response = consumer.getOffsetsBefore(request);

        if (response.hasError()) {
            System.out.println("Error fetching data Offset Data the Broker. Reason: "
                    + response.errorCode(topic, partition));
            return 0;
        }
        long[] offsets = response.offsets(topic, partition);
        return offsets[0];
    }

    /**
     * Queries the known replica brokers, up to three times with a one second
     * pause between attempts, for the partition's current leader.
     *
     * @param a_oldLeader host of the leader that just failed
     * @param a_topic     topic name
     * @param a_partition partition id
     * @param a_port      port used for every broker connection
     * @return host of the leader to use next
     * @throws Exception if no usable leader is found
     */
    private String findNewLeader(String a_oldLeader, String a_topic,
            int a_partition, int a_port) throws Exception {
        for (int i = 0; i < 3; i++) {
            PartitionMetadata metadata = findLeader(m_replicaBrokers, a_port,
                    a_topic, a_partition);
            if (metadata != null && metadata.leader() != null) {
                String host = metadata.leader().host();
                // first time through if the leader hasn't changed give
                // ZooKeeper a second to recover
                // second time, assume the broker did recover before failover,
                // or it was a non-Broker issue
                boolean unchangedOnFirstTry =
                        a_oldLeader.equalsIgnoreCase(host) && i == 0;
                if (!unchangedOnFirstTry) {
                    return host;
                }
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException ie) {
                // Give up retrying and keep the interrupt visible to the caller.
                Thread.currentThread().interrupt();
                break;
            }
        }
        System.out.println("Unable to find new leader after Broker failure. Exiting");
        throw new Exception("Unable to find new leader after Broker failure. Exiting");
    }

    /**
     * Asks each seed broker in turn for the topic's metadata until one of them
     * returns an entry for the requested partition. On success the replica
     * host list kept by this instance is replaced with that entry's replicas.
     *
     * @param a_seedBrokers broker hosts to query
     * @param a_port        port used for every broker connection
     * @param a_topic       topic name
     * @param a_partition   partition id
     * @return the partition's metadata, or {@code null} if no broker returned it
     */
    private PartitionMetadata findLeader(List<String> a_seedBrokers,
            int a_port, String a_topic, int a_partition) {
        PartitionMetadata returnMetaData = null;
        loop:
        for (String seed : a_seedBrokers) {
            SimpleConsumer consumer = null;
            try {
                consumer = new SimpleConsumer(seed, a_port, 100000, 64 * 1024,
                        "leaderLookup");
                List<String> topics = Collections.singletonList(a_topic);
                TopicMetadataRequest req = new TopicMetadataRequest(topics);
                TopicMetadataResponse resp = consumer.send(req);

                List<TopicMetadata> metaData = resp.topicsMetadata();
                for (TopicMetadata item : metaData) {
                    for (PartitionMetadata part : item.partitionsMetadata()) {
                        if (part.partitionId() == a_partition) {
                            returnMetaData = part;
                            // Found it: stop querying the remaining seed brokers.
                            break loop;
                        }
                    }
                }
            } catch (Exception e) {
                // Best effort: report the failure and try the next seed broker.
                System.out.println("Error communicating with Broker [" + seed
                        + "] to find Leader for [" + a_topic + ", "
                        + a_partition + "] Reason: " + e);
            } finally {
                if (consumer != null) {
                    consumer.close();
                }
            }
        }
        if (returnMetaData != null) {
            m_replicaBrokers.clear();
            for (Broker replica : returnMetaData.replicas()) {
                m_replicaBrokers.add(replica.host());
            }
        }
        return returnMetaData;
    }
}
|
参考
https://cwiki.apache.org/confluence/display/KAFKA/0.8.0+SimpleConsumer+Example