From d023cff9ce127a4a7f29696c84c17f735520558f Mon Sep 17 00:00:00 2001 From: "Haiyang.Hu" Date: Thu, 18 Jan 2024 16:13:51 +0800 Subject: [PATCH] HDFS-17342. Fix DataNode may invalidates normal block causing missing block --- .../datanode/DataNodeFaultInjector.java | 5 + .../fsdataset/impl/FsDatasetImpl.java | 19 +++- .../fsdataset/impl/TestFsDatasetImpl.java | 95 ++++++++++++++++++- 3 files changed, 114 insertions(+), 5 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java index e9cdb2cc92d5d..372271b4fb28a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java @@ -167,4 +167,9 @@ public void delayDeleteReplica() {} * Just delay run diff record a while. */ public void delayDiffRecord() {} + + /** + * Just delay getMetaDataInputStream a while. 
+ */ + public void delayGetMetaDataInputStream() {} } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java index 27fcbb12faba2..b1526c9860e94 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java @@ -63,6 +63,7 @@ import org.apache.hadoop.hdfs.server.common.AutoCloseDataSetLock; import org.apache.hadoop.hdfs.server.common.DataNodeLockManager; import org.apache.hadoop.hdfs.server.common.DataNodeLockManager.LockLevel; +import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector; import org.apache.hadoop.hdfs.server.datanode.DataSetLockManager; import org.apache.hadoop.hdfs.server.datanode.FileIoProvider; import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica; @@ -247,6 +248,7 @@ public LengthInputStream getMetaDataInputStream(ExtendedBlock b) if (info == null || !info.metadataExists()) { return null; } + DataNodeFaultInjector.get().delayGetMetaDataInputStream(); return info.getMetadataInputStream(0); } @@ -2403,8 +2405,9 @@ public void invalidate(String bpid, ReplicaInfo block) { * * @param bpid the block pool ID. * @param block The block to be invalidated. + * @param checkFiles Whether to check data and meta files. */ - public void invalidateMissingBlock(String bpid, Block block) { + public void invalidateMissingBlock(String bpid, Block block, boolean checkFiles) { // The replica seems is on its volume map but not on disk. // We can't confirm here is block file lost or disk failed. @@ -2416,11 +2419,21 @@ public void invalidateMissingBlock(String bpid, Block block) { // So remove if from volume map notify namenode is ok. 
try (AutoCloseableLock lock = lockManager.writeLock(LockLevel.BLOCK_POOl, bpid)) { - ReplicaInfo replica = volumeMap.remove(bpid, block); - invalidate(bpid, replica); + // Check if this block is on the volume map. + ReplicaInfo replica = volumeMap.get(bpid, block); + // Double-check block or meta file existence when checkFiles is true. + if (replica != null && (!checkFiles || + (!replica.blockDataExists() || !replica.metadataExists()))) { + volumeMap.remove(bpid, block); + invalidate(bpid, replica); + } } } + public void invalidateMissingBlock(String bpid, Block block) { + invalidateMissingBlock(bpid, block, true); + } + /** * Remove Replica from ReplicaMap. * diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java index 2f068a6a69c69..5468473d9de0b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java @@ -1962,7 +1962,7 @@ public void delayDeleteReplica() { * 4. block would be recovered when disk back to normal. */ @Test - public void tesInvalidateMissingBlock() throws Exception { + public void testInvalidateMissingBlock() throws Exception { long blockSize = 1024; int heartbeatInterval = 1; HdfsConfiguration c = new HdfsConfiguration(); @@ -1988,7 +1988,7 @@ public void tesInvalidateMissingBlock() throws Exception { File metaFile = new File(metaPath); // Mock local block file not found when disk with some exception. - fsdataset.invalidateMissingBlock(bpid, replicaInfo); + fsdataset.invalidateMissingBlock(bpid, replicaInfo, false); // Assert local block file wouldn't be deleted from disk. 
assertTrue(blockFile.exists()); @@ -2011,4 +2011,95 @@ public void tesInvalidateMissingBlock() throws Exception { cluster.shutdown(); } } + + @Test + public void testCheckFilesWhenInvalidateMissingBlock() throws Exception { + long blockSize = 1024; + int heartbeatInterval = 1; + HdfsConfiguration c = new HdfsConfiguration(); + c.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, heartbeatInterval); + c.setLong(DFS_BLOCK_SIZE_KEY, blockSize); + MiniDFSCluster cluster = new MiniDFSCluster.Builder(c). + numDataNodes(1).build(); + DataNodeFaultInjector oldDnInjector = DataNodeFaultInjector.get(); + try { + cluster.waitActive(); + GenericTestUtils.LogCapturer logCapturer = GenericTestUtils.LogCapturer. + captureLogs(DataNode.LOG); + BlockReaderTestUtil util = new BlockReaderTestUtil(cluster, new + HdfsConfiguration(conf)); + Path path = new Path("/testFile"); + util.writeFile(path, 1); + String bpid = cluster.getNameNode().getNamesystem().getBlockPoolId(); + DataNode dn = cluster.getDataNodes().get(0); + FsDatasetImpl dnFSDataset = (FsDatasetImpl) dn.getFSDataset(); + List<ReplicaInfo> replicaInfos = dnFSDataset.getFinalizedBlocks(bpid); + assertEquals(1, replicaInfos.size()); + DFSTestUtil.readFile(cluster.getFileSystem(), path); + LocatedBlock blk = util.getFileBlocks(path, 512).get(0); + ExtendedBlock block = blk.getBlock(); + + // Append a new block with an incremented generation stamp. + long newGS = block.getGenerationStamp() + 1; + dnFSDataset.append(block, newGS, 1024); + block.setGenerationStamp(newGS); + ReplicaInfo tmpReplicaInfo = dnFSDataset.getReplicaInfo(blk.getBlock()); + + DataNodeFaultInjector injector = new DataNodeFaultInjector() { + @Override + public void delayGetMetaDataInputStream() { + try { + Thread.sleep(8000); + } catch (InterruptedException e) { + // Ignore exception. + } + } + }; + // Delay getMetaDataInputStream a while. 
+ DataNodeFaultInjector.set(injector); + + ExecutorService executorService = Executors.newFixedThreadPool(2); + try { + Future<?> blockReaderFuture = executorService.submit(() -> { + try { + // Submit tasks for reading block. + BlockReader blockReader = BlockReaderTestUtil.getBlockReader( + cluster.getFileSystem(), blk, 0, 512); + blockReader.close(); + } catch (IOException e) { + // Ignore exception. + } + }); + + Future<?> finalizeBlockFuture = executorService.submit(() -> { + try { + // Submit tasks for finalizing block. + Thread.sleep(1000); + dnFSDataset.finalizeBlock(block, false); + } catch (Exception e) { + // Ignore exception + } + }); + + // Wait for both tasks to complete. + blockReaderFuture.get(); + finalizeBlockFuture.get(); + } finally { + executorService.shutdown(); + } + + // Validate that the replica exists. + assertNotNull(dnFSDataset.getReplicaInfo(blk.getBlock())); + + // Check DN log for FileNotFoundException. + String expectedMsg = String.format("opReadBlock %s received exception " + + "java.io.FileNotFoundException: %s (No such file or directory)", + blk.getBlock(), tmpReplicaInfo.getMetadataURI().getPath()); + assertTrue("Expected log message not found in DN log.", + logCapturer.getOutput().contains(expectedMsg)); + } finally { + cluster.shutdown(); + DataNodeFaultInjector.set(oldDnInjector); + } + } }