1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
package
org.apache.hadoop.ha;
/**
 * Excerpt of the elector class: the znode file names it uses and the callback
 * contract through which it talks to the application.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class ActiveStandbyElector implements StatCallback, StringCallback {

  /** File name of the lock znode. */
  @VisibleForTesting
  protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock";

  /** File name of the breadcrumb znode. */
  @VisibleForTesting
  protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb";

  /**
   * Callbacks the elector invokes on the application.
   * (Not explained in detail yet; listed here for reference.)
   */
  public interface ActiveStandbyElectorCallback {

    /**
     * Asks the application to become active.
     *
     * @throws ServiceFailedException if the application fails to do so
     */
    void becomeActive() throws ServiceFailedException;

    /** Asks the application to become standby. */
    void becomeStandby();

    /** Asks the application to enter neutral mode. */
    void enterNeutralMode();

    /**
     * Reports a fatal error to the application.
     *
     * @param errorMessage description of the error
     */
    void notifyFatalError(String errorMessage);

    /**
     * Asks the application to fence the old active.
     *
     * @param oldActiveData data identifying the old active
     *        (presumably the bytes it stored in ZooKeeper; confirm)
     */
    void fenceOldActive(byte[] oldActiveData);
  }

  // Remaining code omitted.
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
/**
 * Writes the "ActiveBreadCrumb" znode, marking the local node as the most
 * recent active, i.e. the node that has to be fenced on failover.
 *
 * @param oldBreadcrumbStat stat of an already existing breadcrumb znode, or
 *        {@code null} if there was no previous active
 * @throws KeeperException propagated from the create / setData helpers
 * @throws InterruptedException propagated from the create / setData helpers
 */
private void writeBreadCrumbNode(Stat oldBreadcrumbStat)
    throws KeeperException, InterruptedException {
  Preconditions.checkState(appData != null, "no appdata");

  LOG.info("Writing znode " + zkBreadCrumbPath
      + " to indicate that the local node is the most recent active...");

  if (oldBreadcrumbStat != null) {
    // A previous active left a breadcrumb: overwrite it, passing the version
    // that was observed in its stat.
    setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion());
    return;
  }

  // No previous active: create the breadcrumb as a persistent znode.
  createWithRetries(zkBreadCrumbPath, appData, zkAcl, CreateMode.PERSISTENT);
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
package
org.apache.hadoop.ha;
/**
 * Excerpt of the ZKFC base class: its logger, the configuration keys with
 * their defaults, and the command-line usage string.
 */
@InterfaceAudience.LimitedPrivate("HDFS")
public abstract class ZKFailoverController {

  static final Log LOG = LogFactory.getLog(ZKFailoverController.class);

  // ---- Configuration keys ----
  public static final String ZK_QUORUM_KEY = "ha.zookeeper.quorum";
  public static final String ZK_ACL_KEY = "ha.zookeeper.acl";
  public static final String ZK_AUTH_KEY = "ha.zookeeper.auth";
  private static final String ZK_SESSION_TIMEOUT_KEY = "ha.zookeeper.session-timeout.ms";
  private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode";

  // ---- Defaults ----
  /** Default session timeout; the matching key is expressed in milliseconds. */
  private static final int ZK_SESSION_TIMEOUT_DEFAULT = 5 * 1000;
  private static final String ZK_ACL_DEFAULT = "world:anyone:rwcda";
  static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha";

  /** Usage text for the zkfc command line. */
  protected static final String USAGE =
      "Usage: java zkfc [ -formatZK [-force] [-nonInteractive] ]";

  // Remaining code omitted.
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
package
org.apache.hadoop.hdfs.tools;
/**
 * Excerpt of the HDFS-specific ZKFC: how a service target is serialized into
 * the byte payload built from an {@code ActiveNodeInfo} message.
 */
@InterfaceAudience.Private
public class DFSZKFailoverController extends ZKFailoverController {

  /**
   * Serializes the given target into an {@code ActiveNodeInfo} byte array.
   * Host, port and ZKFC port come from {@code target}; the nameservice id and
   * namenode id are taken from {@code localNNTarget}.
   *
   * @param target the service target to describe
   * @return the serialized {@code ActiveNodeInfo}
   */
  @Override
  protected byte[] targetToData(HAServiceTarget target) {
    InetSocketAddress serviceAddress = target.getAddress();
    String hostName = serviceAddress.getHostName();
    int servicePort = serviceAddress.getPort();
    int zkfcPort = target.getZKFCAddress().getPort();

    return ActiveNodeInfo.newBuilder()
        .setHostname(hostName)
        .setPort(servicePort)
        .setZkfcPort(zkfcPort)
        .setNameserviceId(localNNTarget.getNameServiceId())
        .setNamenodeId(localNNTarget.getNameNodeId())
        .build()
        .toByteArray();
  }

  // Remaining code omitted.
}
|
1
2
3
4
5
6
7
8
9
10
|
package
org.apache.hadoop.ha;
/**
 * Excerpt of the HA service target base class showing only its substitution
 * key constants.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public abstract class HAServiceTarget {

  // Substitution keys (presumably the names under which the target's
  // host / port / address are exposed to fencing methods; confirm).
  private static final String HOST_SUBST_KEY = "host";
  private static final String PORT_SUBST_KEY = "port";
  private static final String ADDRESS_SUBST_KEY = "address";

  // Remaining code omitted.
}
|
1
2
3
4
5
6
7
8
9
|
package
org.apache.hadoop.hdfs.tools;
/**
 * Excerpt of the NameNode HA service target showing only the extra keys it
 * contributes to the fencing script environment.
 */
@InterfaceAudience.Private
public class NNHAServiceTarget extends HAServiceTarget {

  /** Fencing script environment key for the nameservice id. */
  private static final String NAMESERVICE_ID_KEY = "nameserviceid";

  /** Fencing script environment key for the namenode id. */
  private static final String NAMENODE_ID_KEY = "namenodeid";

  // Remaining code omitted.
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
package
org.apache.hadoop.ha;
/**
 * Excerpt of the elector class showing the two asynchronous ZooKeeper calls
 * it issues on the lock znode.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class ActiveStandbyElector implements StatCallback, StringCallback {

  // Remaining code omitted.

  /**
   * Asynchronously creates the lock znode as an EPHEMERAL node holding
   * {@code appData}. This elector is the result callback and the current
   * client is passed as the callback context.
   */
  private void createLockNodeAsync() {
    StringCallback createCallback = this;
    zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL,
        createCallback, zkClient);
  }

  /**
   * Asynchronously checks whether the lock znode exists, registering
   * {@code watcher} on it. This elector is the result callback and the
   * current client is passed as the callback context.
   */
  private void monitorLockNodeAsync() {
    StatCallback statCallback = this;
    zkClient.exists(zkLockFilePath, watcher, statCallback, zkClient);
  }
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
|
/**
 * Result callback for the asynchronous creation of the lock znode.
 *
 * <ul>
 *   <li>Success: try to become active; on success start monitoring, otherwise
 *       rejoin the election.</li>
 *   <li>Node already exists: become standby if this was the first attempt,
 *       then monitor the lock in either case.</li>
 *   <li>Retryable error: re-issue the create until {@code maxRetryNum}
 *       retries have been used.</li>
 *   <li>Session expired: log a warning and return.</li>
 *   <li>Anything else (or retries exhausted): report a fatal error.</li>
 * </ul>
 *
 * @param rc   ZooKeeper result code
 * @param path path the create was issued for
 * @param ctx  callback context, used to detect results from a stale client
 * @param name name reported by ZooKeeper for the created node (unused here)
 */
@Override
public synchronized void processResult(int rc, String path, Object ctx,
    String name) {
  if (isStaleClient(ctx)) {
    return;
  }

  // Build the message only when debug logging is enabled: it stringifies the
  // whole elector on every callback while the elector lock is held.
  if (LOG.isDebugEnabled()) {
    LOG.debug("CreateNode result: " + rc + " for path: " + path
        + " connectionState: " + zkConnectionState + " for " + this);
  }

  Code code = Code.get(rc);
  if (isSuccess(code)) {
    // we successfully created the znode. we are the leader. start monitoring
    if (becomeActive()) {
      monitorActiveStatus();
    } else {
      reJoinElectionAfterFailureToBecomeActive();
    }
    return;
  }

  if (isNodeExists(code)) {
    if (createRetryCount == 0) {
      // znode exists and we did not retry the operation. so a different
      // instance has created it. become standby and monitor lock.
      becomeStandby();
    }
    // if we had retried then the znode could have been created by our first
    // attempt to the server (that we lost) and this node exists response is
    // for the second attempt. verify this case via ephemeral node owner. this
    // will happen on the callback for monitoring the lock.
    monitorActiveStatus();
    return;
  }

  String errorMessage = "Received create error from Zookeeper. code:"
      + code.toString() + " for path " + path;
  LOG.debug(errorMessage);

  if (shouldRetry(code)) {
    if (createRetryCount < maxRetryNum) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
      }
      ++createRetryCount;
      createLockNodeAsync();
      return;
    }
    errorMessage = errorMessage
        + ". Not retrying further znode create connection errors.";
  } else if (isSessionExpired(code)) {
    // This isn't fatal - the client Watcher will re-join the election
    LOG.warn("Lock acquisition failed because session was lost");
    return;
  }

  fatalError(errorMessage);
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
|
/**
 * Result callback for the asynchronous exists() check on the lock znode.
 *
 * <ul>
 *   <li>Success: if the znode's ephemeral owner is this client's session,
 *       try to become active (rejoining the election on failure); otherwise
 *       become standby.</li>
 *   <li>Node does not exist: enter neutral mode and join the election.</li>
 *   <li>Retryable error: re-issue the check until {@code maxRetryNum}
 *       retries have been used.</li>
 *   <li>Session expired: log a warning and return.</li>
 *   <li>Anything else (or retries exhausted): report a fatal error.</li>
 * </ul>
 *
 * @param rc   ZooKeeper result code
 * @param path path the check was issued for
 * @param ctx  callback context, used to detect results from a stale client
 * @param stat stat of the lock znode; read only on success
 */
@Override
public synchronized void processResult(int rc, String path, Object ctx,
    Stat stat) {
  if (isStaleClient(ctx)) {
    return;
  }

  assert wantToBeInElection :
      "Got a StatNode result after quitting election";

  // Build the message only when debug logging is enabled: it stringifies the
  // whole elector on every callback while the elector lock is held.
  if (LOG.isDebugEnabled()) {
    LOG.debug("StatNode result: " + rc + " for path: " + path
        + " connectionState: " + zkConnectionState + " for " + this);
  }

  Code code = Code.get(rc);
  if (isSuccess(code)) {
    // the following owner check completes verification in case the lock znode
    // creation was retried
    if (stat.getEphemeralOwner() == zkClient.getSessionId()) {
      // we own the lock znode. so we are the leader
      if (!becomeActive()) {
        reJoinElectionAfterFailureToBecomeActive();
      }
    } else {
      // we dont own the lock znode. so we are a standby.
      becomeStandby();
    }
    // the watch set by us will notify about changes
    return;
  }

  if (isNodeDoesNotExist(code)) {
    // the lock znode disappeared before we started monitoring it
    enterNeutralMode();
    joinElectionInternal();
    return;
  }

  String errorMessage = "Received stat error from Zookeeper. code:"
      + code.toString();
  LOG.debug(errorMessage);

  if (shouldRetry(code)) {
    if (statRetryCount < maxRetryNum) {
      ++statRetryCount;
      monitorLockNodeAsync();
      return;
    }
    errorMessage = errorMessage
        + ". Not retrying further znode monitoring connection errors.";
  } else if (isSessionExpired(code)) {
    // This isn't fatal - the client Watcher will re-join the election
    LOG.warn("Lock monitoring failed because session was lost");
    return;
  }

  fatalError(errorMessage);
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
|
/**
 * Handles a watch event delivered for the given ZooKeeper client.
 *
 * <p>Events of type {@code None} describe a connection state change and are
 * dispatched on {@code event.getState()}. All other events are expected to
 * carry a path and are dispatched on the event type; an event without a path
 * is reported as a fatal error. Events from a stale client are ignored.
 *
 * @param zk    the client the event was delivered for
 * @param event the watch event
 */
synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
  Event.EventType eventType = event.getType();
  if (isStaleClient(zk)) {
    return;
  }

  if (eventType == Event.EventType.None) {
    // The connection state has changed.
    switch (event.getState()) {
      case SyncConnected: {
        LOG.info("Session connected.");
        // If the listener was asked to move to a safe state, undo that now.
        ConnectionState previousState = zkConnectionState;
        zkConnectionState = ConnectionState.CONNECTED;
        if (previousState == ConnectionState.DISCONNECTED && wantToBeInElection) {
          monitorActiveStatus();
        }
        break;
      }
      case Disconnected:
        LOG.info("Session disconnected. Entering neutral mode...");
        // Ask the app to move to a safe state: the ZooKeeper connection is
        // not active and we do not know our own state.
        zkConnectionState = ConnectionState.DISCONNECTED;
        enterNeutralMode();
        break;
      case Expired:
        // The connection was terminated because of a session timeout;
        // go neutral and rejoin the election.
        LOG.info("Session expired. Entering neutral mode and rejoining...");
        enterNeutralMode();
        reJoinElection(0);
        break;
      case SaslAuthenticated:
        LOG.info("Successfully authenticated to ZooKeeper using SASL.");
        break;
      default:
        fatalError("Unexpected Zookeeper watch event state: " + event.getState());
        break;
    }
    return;
  }

  // Node change events: the elector reacts to these as well.
  String path = event.getPath();
  if (path == null) {
    // Some unexpected error has occurred.
    fatalError("Unexpected watch error from Zookeeper");
    return;
  }

  switch (eventType) {
    case NodeDeleted:
      if (state == State.ACTIVE) {
        enterNeutralMode();
      }
      joinElectionInternal();
      break;
    case NodeDataChanged:
      monitorActiveStatus();
      break;
    default:
      LOG.debug("Unexpected node event: " + eventType + " for path: " + path);
      monitorActiveStatus();
  }
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
/**
 * Joins the election: makes sure a ZooKeeper client exists, resets the create
 * retry counter, records that we want to be in the election, and issues the
 * asynchronous create of the lock znode.
 *
 * <p>Requires {@code appData} to be set. If there is no client and the
 * session cannot be re-established, a fatal error is reported and nothing
 * else happens.
 */
private void joinElectionInternal() {
  Preconditions.checkState(appData != null,
      "trying to join election without any app data");

  // Short-circuit: the session is only re-established when no client exists.
  if (zkClient == null && !reEstablishSession()) {
    fatalError("Failed to reEstablish connection with ZooKeeper");
    return;
  }

  createRetryCount = 0;
  wantToBeInElection = true;
  createLockNodeAsync();
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
/**
 * Moves this elector to the ACTIVE state if it is not there already.
 *
 * <p>The steps are: fence the old active, write the breadcrumb znode, then
 * call the application's {@code becomeActive()}. The state is switched to
 * ACTIVE only after all of them succeed.
 *
 * @return {@code true} if already active or the transition succeeded;
 *         {@code false} if any step threw (the caller handles quitting and
 *         rejoining the election)
 */
private boolean becomeActive() {
  assert wantToBeInElection;

  boolean alreadyActive = (state == State.ACTIVE);
  if (!alreadyActive) {
    try {
      writeBreadCrumbNode(fenceOldActive());

      LOG.debug("Becoming active for " + this);
      appClient.becomeActive();
      state = State.ACTIVE;
    } catch (Exception e) {
      LOG.warn("Exception handling the winning of election", e);
      // Caller will handle quitting and rejoining the election.
      return false;
    }
  }
  return true;
}
|
1
2
3
4
5
6
7
8
9
10
|
/**
 * Leaves the election.
 *
 * @param needFence if {@code false} and this elector is currently ACTIVE, its
 *        own breadcrumb znode is deleted first (best effort) so that nobody
 *        fences a node that stepped down gracefully
 */
public synchronized void quitElection(boolean needFence) {
  LOG.info("Yielding from election");

  boolean gracefulStepDown = (state == State.ACTIVE) && !needFence;
  if (gracefulStepDown) {
    // The active is going back to standby voluntarily: remove our permanent
    // znode so no one fences us.
    tryDeleteOwnBreadCrumbNode();
  }

  reset();
  wantToBeInElection = false;
}
|
1
2
3
|
/** Callback interface of the elector (members not shown in this excerpt). */
public interface ActiveStandbyElectorCallback {
// Remaining code omitted.
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
/**
 * Elector callbacks, implemented by delegating to the enclosing
 * {@code ZKFailoverController}. Entering neutral mode is a no-op here.
 */
class ElectorCallbacks implements ActiveStandbyElectorCallback {

  /** Delegates to the controller's {@code becomeActive()}. */
  @Override
  public void becomeActive() throws ServiceFailedException {
    ZKFailoverController.this.becomeActive();
  }

  /** Delegates to the controller's {@code becomeStandby()}. */
  @Override
  public void becomeStandby() {
    ZKFailoverController.this.becomeStandby();
  }

  /** Nothing to do when the elector enters neutral mode. */
  @Override
  public void enterNeutralMode() {
  }

  /** Forwards the fatal error message to the controller. */
  @Override
  public void notifyFatalError(String errorMessage) {
    ZKFailoverController.this.fatalError(errorMessage);
  }

  /** Delegates to the controller's {@code fenceOldActive(byte[])}. */
  @Override
  public void fenceOldActive(byte[] data) {
    ZKFailoverController.this.fenceOldActive(data);
  }

  /** Describes these callbacks, reading the local target under the controller's lock. */
  @Override
  public String toString() {
    synchronized (ZKFailoverController.this) {
      return "Elector callbacks for " + localTarget;
    }
  }
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
|
/**
 * Asks the local target to transition to active over RPC and, on success,
 * records the ACTIVE service state together with a successful attempt record.
 *
 * @throws ServiceFailedException declared by the signature; the failure
 *         handling itself is elided in this excerpt
 */
private synchronized void becomeActive() throws ServiceFailedException {
  LOG.info("Trying to make " + localTarget + " active...");
  try {
    int rpcTimeout = FailoverController.getRpcTimeoutToNewActive(conf);
    HAServiceProtocolHelper.transitionToActive(
        localTarget.getProxy(conf, rpcTimeout), createReqInfo());

    String successMessage =
        "Successfully transitioned " + localTarget + " to active state";
    LOG.info(successMessage);
    serviceState = HAServiceState.ACTIVE;
    recordActiveAttempt(new ActiveAttemptRecord(true, successMessage));
  } catch (Throwable t) {
    // Omitted in this excerpt.
  }
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
/**
 * Asks the local target to transition to standby over RPC.
 *
 * <p>A failure of the RPC is only logged; the recorded service state is set
 * to STANDBY whether or not the transition succeeded.
 */
private synchronized void becomeStandby() {
  LOG.info("ZK Election indicated that " + localTarget
      + " should become standby");
  try {
    int fenceTimeout = FailoverController.getGracefulFenceTimeout(conf);
    localTarget.getProxy(conf, fenceTimeout)
        .transitionToStandby(createReqInfo());
    LOG.info("Successfully transitioned " + localTarget
        + " to standby state");
  } catch (Exception ex) {
    LOG.error("Couldn't transition " + localTarget + " to standby state", ex);
    // TODO handle this. It's a likely case since we probably got fenced
    // at the same time.
  }
  serviceState = HAServiceState.STANDBY;
}
|
1
2
3
4
5
6
7
8
9
|
# Run the zkfc command of the hdfs script with the -formatZK option.
bin/hdfs zkfc -formatZK
# 在 hdfs 这个 shell 脚本中:
COMMAND=$1
# 中间略
elif
[
"$COMMAND"
=
"zkfc"
] ;
then
CLASS=
'org.apache.hadoop.hdfs.tools.DFSZKFailoverController'
HADOOP_OPTS=
"$HADOOP_OPTS $HADOOP_ZKFC_OPTS"
elif
# 略
|
1
2
3
4
5
6
7
8
9
10
11
|
/**
 * Command-line entry point of the HDFS ZKFC.
 *
 * <p>Exits with status 0 if a help argument was given. Otherwise parses the
 * generic options on top of a fresh {@code HdfsConfiguration}, creates the
 * controller from the resulting configuration, runs it with the remaining
 * arguments, and exits with the status it returns.
 *
 * @param args command-line arguments
 * @throws Exception declared for whatever the calls below throw
 */
public static void main(String args[]) throws Exception {
  boolean helpRequested =
      DFSUtil.parseHelpArgument(args, ZKFailoverController.USAGE, System.out, true);
  if (helpRequested) {
    System.exit(0);
  }

  GenericOptionsParser parser =
      new GenericOptionsParser(new HdfsConfiguration(), args);
  DFSZKFailoverController zkfc =
      DFSZKFailoverController.create(parser.getConfiguration());

  int exitCode = zkfc.run(parser.getRemainingArgs());
  System.exit(exitCode);
}
# HdfsConfiguration extends Configuration,并通过静态方法初始化了配置参数
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
/**
 * Runs the ZKFC with the given (already generic-option-stripped) arguments.
 *
 * <p>After initializing ZooKeeper, a leading {@code -formatZK} argument
 * (optionally followed by {@code -force} and/or {@code -nonInteractive})
 * triggers a format and returns its result; any other argument is reported
 * through {@code badArg}. Without arguments the method checks the parent
 * znode and the fencing configuration, starts RPC and health monitoring,
 * and enters the main loop. On leaving the loop it stops the RPC server,
 * quits the election and shuts the health monitor down.
 *
 * @param args remaining command-line arguments
 * @return {@code ERR_CODE_NO_ZK}, {@code ERR_CODE_NO_PARENT_ZNODE} or
 *         {@code ERR_CODE_NO_FENCER} for the respective failure, the result
 *         of {@code formatZK} when formatting, or 0 after the main loop ends
 */
private int doRun(String[] args) {
  try {
    initZK();
  } catch (KeeperException ke) {
    // ... (details elided in this excerpt)
    return ERR_CODE_NO_ZK;
  }

  if (args.length > 0) {
    String command = args[0];
    if (!"-formatZK".equals(command)) {
      // "hdfs zkfc" was given an argument other than -formatZK.
      badArg(command);
    } else {
      // "hdfs zkfc -formatZK ...": parse the format options.
      boolean force = false;
      boolean interactive = true;
      for (int idx = 1; idx < args.length; idx++) {
        String option = args[idx];
        if ("-force".equals(option)) {
          // -formatZK -force
          force = true;
        } else if ("-nonInteractive".equals(option)) {
          // -formatZK -nonInteractive
          interactive = false;
        } else {
          // Unknown option.
          badArg(option);
        }
      }
      // Perform the format.
      return formatZK(force, interactive);
    }
  }

  // No arguments: normal daemon start-up.
  if (!elector.parentZNodeExists()) {
    // ... (details elided in this excerpt)
    return ERR_CODE_NO_PARENT_ZNODE;
  }

  try {
    localTarget.checkFencingConfigured();
  } catch (BadFencingConfigurationException e) {
    // ... (details elided in this excerpt)
    return ERR_CODE_NO_FENCER;
  }

  initRPC();
  initHM();
  startRPC();
  try {
    mainLoop();
  } finally {
    rpcServer.stopAndJoin();
    elector.quitElection(true);
    healthMonitor.shutdown();
    healthMonitor.join();
  }
  return 0;
}
|
1
|
# Start zkfc through the hadoop-daemon.sh script.
sbin/hadoop-daemon.sh start zkfc
|