-
Notifications
You must be signed in to change notification settings - Fork 9.2k
HDFS-17342. Fix DataNode may invalidates normal block causing missing block #6464
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1962,7 +1962,7 @@ public void delayDeleteReplica() { | |
| * 4. block would be recovered when disk back to normal. | ||
| */ | ||
| @Test | ||
| public void tesInvalidateMissingBlock() throws Exception { | ||
| public void testInvalidateMissingBlock() throws Exception { | ||
| long blockSize = 1024; | ||
| int heartbeatInterval = 1; | ||
| HdfsConfiguration c = new HdfsConfiguration(); | ||
|
|
@@ -1988,7 +1988,7 @@ public void tesInvalidateMissingBlock() throws Exception { | |
| File metaFile = new File(metaPath); | ||
|
|
||
| // Mock local block file not found when disk with some exception. | ||
| fsdataset.invalidateMissingBlock(bpid, replicaInfo); | ||
| fsdataset.invalidateMissingBlock(bpid, replicaInfo, false); | ||
|
|
||
| // Assert local block file wouldn't be deleted from disk. | ||
| assertTrue(blockFile.exists()); | ||
|
|
@@ -2011,4 +2011,95 @@ public void tesInvalidateMissingBlock() throws Exception { | |
| cluster.shutdown(); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| public void testCheckFilesWhenInvalidateMissingBlock() throws Exception { | ||
| long blockSize = 1024; | ||
| int heartbeatInterval = 1; | ||
| HdfsConfiguration c = new HdfsConfiguration(); | ||
| c.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, heartbeatInterval); | ||
| c.setLong(DFS_BLOCK_SIZE_KEY, blockSize); | ||
| MiniDFSCluster cluster = new MiniDFSCluster.Builder(c). | ||
| numDataNodes(1).build(); | ||
| DataNodeFaultInjector oldDnInjector = DataNodeFaultInjector.get(); | ||
| try { | ||
| cluster.waitActive(); | ||
| GenericTestUtils.LogCapturer logCapturer = GenericTestUtils.LogCapturer. | ||
| captureLogs(DataNode.LOG); | ||
| BlockReaderTestUtil util = new BlockReaderTestUtil(cluster, new | ||
| HdfsConfiguration(conf)); | ||
| Path path = new Path("/testFile"); | ||
| util.writeFile(path, 1); | ||
| String bpid = cluster.getNameNode().getNamesystem().getBlockPoolId(); | ||
| DataNode dn = cluster.getDataNodes().get(0); | ||
| FsDatasetImpl dnFSDataset = (FsDatasetImpl) dn.getFSDataset(); | ||
| List<ReplicaInfo> replicaInfos = dnFSDataset.getFinalizedBlocks(bpid); | ||
| assertEquals(1, replicaInfos.size()); | ||
| DFSTestUtil.readFile(cluster.getFileSystem(), path); | ||
| LocatedBlock blk = util.getFileBlocks(path, 512).get(0); | ||
| ExtendedBlock block = blk.getBlock(); | ||
|
|
||
| // Append a new block with an incremented generation stamp. | ||
| long newGS = block.getGenerationStamp() + 1; | ||
| dnFSDataset.append(block, newGS, 1024); | ||
| block.setGenerationStamp(newGS); | ||
| ReplicaInfo tmpReplicaInfo = dnFSDataset.getReplicaInfo(blk.getBlock()); | ||
|
|
||
| DataNodeFaultInjector injector = new DataNodeFaultInjector() { | ||
| @Override | ||
| public void delayGetMetaDataInputStream() { | ||
| try { | ||
| Thread.sleep(8000); | ||
| } catch (InterruptedException e) { | ||
| // Ignore exception. | ||
| } | ||
| } | ||
| }; | ||
| // Delay to getMetaDataInputStream. | ||
| DataNodeFaultInjector.set(injector); | ||
|
|
||
| ExecutorService executorService = Executors.newFixedThreadPool(2); | ||
| try { | ||
| Future<?> blockReaderFuture = executorService.submit(() -> { | ||
| try { | ||
| // Submit tasks for reading block. | ||
| BlockReader blockReader = BlockReaderTestUtil.getBlockReader( | ||
| cluster.getFileSystem(), blk, 0, 512); | ||
| blockReader.close(); | ||
| } catch (IOException e) { | ||
| // Ignore exception. | ||
| } | ||
| }); | ||
|
|
||
| Future<?> finalizeBlockFuture = executorService.submit(() -> { | ||
| try { | ||
| // Submit tasks for finalizing block. | ||
| Thread.sleep(1000); | ||
| dnFSDataset.finalizeBlock(block, false); | ||
| } catch (Exception e) { | ||
| // Ignore exception | ||
| } | ||
| }); | ||
|
|
||
| // Wait for both tasks to complete. | ||
| blockReaderFuture.get(); | ||
| finalizeBlockFuture.get(); | ||
| } finally { | ||
| executorService.shutdown(); | ||
| } | ||
|
|
||
| // Validate that the replica exists. | ||
| assertNotNull(dnFSDataset.getReplicaInfo(blk.getBlock())); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess we need one more case to check the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TestFsDatasetImpl#tesInvalidateMissingBlock line[1991-1997]
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, it tests the case where the block file is not found for any reason. But I am not sure whether the situation you describe would lead to a FileNotFoundException (FNE), so I think that case should be constructed and tested.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the goal is to verify whether the UT can reproduce the FNE, we can add the following code for verification. How about it?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch. |
||
|
|
||
| // Check DN log for FileNotFoundException. | ||
| String expectedMsg = String.format("opReadBlock %s received exception " + | ||
| "java.io.FileNotFoundException: %s (No such file or directory)", | ||
| blk.getBlock(), tmpReplicaInfo.getMetadataURI().getPath()); | ||
| assertTrue("Expected log message not found in DN log.", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, there were some deviations in my understanding in my previous reply above. In short, I think we should verify two cases:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @smarthanwang for your comment.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @ZanderXu @zhangshuyan0 @smarthanwang do you have any further comments on this PR?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
OK, can we reproduce it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hi @smarthanwang, if I understand correctly, the UT here has already reproduced it. |
||
| logCapturer.getOutput().contains(expectedMsg)); | ||
| } finally { | ||
| cluster.shutdown(); | ||
| DataNodeFaultInjector.set(oldDnInjector); | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If `replica == null`,
`invalidate(bpid, replica);` would not execute. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If `replica == null`, there should be no need to execute `invalidate(bpid, replica)`, which avoids causing an NPE. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, got it.