Skip to content

Commit 867b137

Browse files
author
eddy.cao
committed
Fix the SNN repeatedly checkpointing after an fsimage transfer failure on one of multiple NNs
1 parent 1abdf72 commit 867b137

File tree

3 files changed

+51
-2
lines changed

3 files changed

+51
-2
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7174,6 +7174,11 @@ public synchronized void verifyToken(DelegationTokenIdentifier identifier,
71747174
public EditLogTailer getEditLogTailer() {
71757175
return editLogTailer;
71767176
}
7177+
7178+
@VisibleForTesting
7179+
public long getStandbyLastCheckpointTime() {
7180+
return standbyCheckpointer.getLastCheckpointTime();
7181+
}
71777182

71787183
@VisibleForTesting
71797184
public void setEditLogTailerForTests(EditLogTailer tailer) {

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ public TransferFsImage.TransferResult call()
342342
throw ie;
343343
}
344344

345-
if (!ioes.isEmpty()) {
345+
if (ioes.size() > activeNNAddresses.size() / 2) {
346346
throw MultipleIOException.createIOException(ioes);
347347
}
348348
}
@@ -375,6 +375,11 @@ static int getCanceledCount() {
375375
return canceledCount;
376376
}
377377

378+
@VisibleForTesting
379+
public long getLastCheckpointTime() {
380+
return lastCheckpointTime;
381+
}
382+
378383
private long countUncheckpointedTxns() {
379384
FSImage img = namesystem.getFSImage();
380385
return img.getCorrectLastAppliedOrWrittenTxId() -

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -663,7 +663,46 @@ private void doCreate() throws IOException {
663663
out.write(42);
664664
out.close();
665665
}
666-
666+
667+
@Test(timeout = 300000)
668+
public void testPutFsimagePartFailed() throws Exception {
669+
for (int i = 1; i < NUM_NNS; i++) {
670+
cluster.shutdownNameNode(i);
671+
672+
// Set the checkpoint period to 3 and the txn threshold to 1000 so that checkpoints are driven by DFS_NAMENODE_CHECKPOINT_PERIOD_KEY
673+
cluster.getConfiguration(i).setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 3);
674+
cluster.getConfiguration(i).setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1000);
675+
}
676+
doEdits(0, 10);
677+
cluster.transitionToStandby(0);
678+
679+
for (int i = 1; i < NUM_NNS; i++) {
680+
cluster.restartNameNode(i, false);
681+
}
682+
cluster.waitClusterUp();
683+
setNNs();
684+
685+
for (int i = 0; i < NUM_NNS; i++) {
686+
// Once the standby catches up, it should do a checkpoint
687+
// and save to local directories.
688+
HATestUtil.waitForCheckpoint(cluster, i, ImmutableList.of(12));
689+
}
690+
691+
long snnCheckpointTime1 = nns[1].getNamesystem().getStandbyLastCheckpointTime();
692+
cluster.transitionToActive(0);
693+
cluster.transitionToObserver(2);
694+
cluster.shutdownNameNode(2);
695+
696+
doEdits(11, 20);
697+
nns[0].getRpcServer().rollEditLog();
698+
HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(23));
699+
700+
long snnCheckpointTime2 = nns[1].getNamesystem().getStandbyLastCheckpointTime();
701+
702+
// Make sure that the standby namenode's checkpoint succeeds and updates lastCheckpointTime
703+
// even though sending the fsimage to nn2 failed because nn2 is shut down.
704+
assertTrue(snnCheckpointTime2 > snnCheckpointTime1);
705+
}
667706

668707
/**
669708
* A codec which just slows down the saving of the image significantly

0 commit comments

Comments
 (0)