Skip to content

Commit 3b3ec32

Browse files
authored
HBASE-26075: Replication is stuck due to zero length wal file in oldWALs dir (apache#3467)
Signed-off-by: Geoffrey Jacoby <[email protected]> Signed-off-by: Bharath Vissapragada <[email protected]>
1 parent b7fbfdd commit 3b3ec32

File tree

3 files changed

+59
-5
lines changed

3 files changed

+59
-5
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReaderThread.java

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -308,10 +308,15 @@ private boolean handleEofException(Exception e, WALEntryStream entryStream,
308308
// add current log to recovered source queue so it is safe to remove.
309309
if (e.getCause() instanceof EOFException && (isRecoveredSource || queue.size() > 1)
310310
&& conf.getBoolean("replication.source.eof.autorecovery", false)) {
311+
Path path = queue.peek();
311312
try {
312-
if (fs.getFileStatus(queue.peek()).getLen() == 0) {
313-
LOG.warn("Forcing removal of 0 length log in queue: " + queue.peek());
314-
lastReadPath = queue.peek();
313+
if (!fs.exists(path)) {
314+
// There is a chance that the wal has moved to the oldWALs directory, so look there also.
315+
path = entryStream.getArchivedLog(path);
316+
}
317+
if (fs.getFileStatus(path).getLen() == 0) {
318+
LOG.warn("Forcing removal of 0 length log in queue: " + path);
319+
lastReadPath = path;
315320
logQueue.remove(walGroupId);
316321
lastReadPosition = 0;
317322

@@ -325,7 +330,7 @@ private boolean handleEofException(Exception e, WALEntryStream entryStream,
325330
return true;
326331
}
327332
} catch (IOException ioe) {
328-
LOG.warn("Couldn't get file length information about log " + queue.peek());
333+
LOG.warn("Couldn't get file length information about log " + path, ioe);
329334
}
330335
}
331336

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/WALEntryStream.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ private boolean openNextLog() throws IOException {
312312
return false;
313313
}
314314

315-
private Path getArchivedLog(Path path) throws IOException {
315+
Path getArchivedLog(Path path) throws IOException {
316316
Path rootDir = FSUtils.getRootDir(conf);
317317
Path oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
318318
Path archivedLogLocation = new Path(oldLogDir, path.getName());

hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestWALEntryStream.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
import org.apache.hadoop.hbase.Waiter;
6666
import org.apache.hadoop.hbase.protobuf.generated.WALProtos;
6767
import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl;
68+
import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
6869
import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
6970
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
7071
import org.apache.hadoop.hbase.replication.ChainWALEntryFilter;
@@ -917,4 +918,52 @@ public void testCleanClosedWALs() throws Exception {
917918
assertEquals(0, logQueue.getMetrics().getUncleanlyClosedWALs());
918919
}
919920
}
921+
922+
/**
923+
* Tests that we handle EOFException properly if the wal has moved to oldWALs directory.
924+
* @throws Exception exception
925+
*/
926+
@Test
927+
public void testEOFExceptionInOldWALsDirectory() throws Exception {
928+
assertEquals(1, logQueue.getQueueSize(fakeWalGroupId));
929+
FSHLog fsLog = (FSHLog)log;
930+
Path emptyLogFile = fsLog.getCurrentFileName();
931+
log.rollWriter(true);
932+
// There will be 2 logs in the queue.
933+
assertEquals(2, logQueue.getQueueSize(fakeWalGroupId));
934+
935+
Configuration localConf = new Configuration(conf);
936+
localConf.setInt("replication.source.maxretriesmultiplier", 1);
937+
localConf.setBoolean("replication.source.eof.autorecovery", true);
938+
939+
try (WALEntryStream entryStream =
940+
new WALEntryStream(logQueue, fs, localConf, logQueue.getMetrics(), fakeWalGroupId)) {
941+
// Get the archived dir path for the first wal.
942+
Path archivePath = entryStream.getArchivedLog(emptyLogFile);
943+
// Make sure that the wal path is not the same as archived Dir path.
944+
assertNotEquals(emptyLogFile.toString(), archivePath.toString());
945+
assertTrue(fs.exists(archivePath));
946+
fs.truncate(archivePath, 0);
947+
// make sure the size of the wal file is 0.
948+
assertEquals(0, fs.getFileStatus(archivePath).getLen());
949+
}
950+
951+
ReplicationSourceManager mockSourceManager = Mockito.mock(ReplicationSourceManager.class);
952+
ReplicationSource source = Mockito.mock(ReplicationSource.class);
953+
when(source.isPeerEnabled()).thenReturn(true);
954+
when(mockSourceManager.getTotalBufferUsed()).thenReturn(new AtomicLong(0));
955+
956+
// Start the reader thread.
957+
ReplicationSourceWALReaderThread readerThread =
958+
new ReplicationSourceWALReaderThread(mockSourceManager, getQueueInfo(), logQueue, 0,
959+
fs, localConf, getDummyFilter(), logQueue.getMetrics(), source, fakeWalGroupId);
960+
readerThread.start();
961+
// Wait for the replication queue size to be 1. This means that we have handled
962+
// the 0 length wal from the oldWALs directory.
963+
Waiter.waitFor(conf, 10000, new Waiter.Predicate<Exception>() {
964+
@Override public boolean evaluate() {
965+
return logQueue.getQueueSize(fakeWalGroupId) == 1;
966+
}
967+
});
968+
}
920969
}

0 commit comments

Comments
 (0)