hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

@@ -117,6 +117,7 @@
 
 import org.apache.hadoop.metrics2.util.MBeans;
 import org.apache.hadoop.net.Node;
+import org.apache.hadoop.net.NodeBase;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.util.Daemon;
@@ -1858,17 +1859,26 @@
         expectedRedundancies;
     boolean corruptedDuringWrite = minReplicationSatisfied &&
         b.isCorruptedDuringWrite();
+
+    int countNumOfAvailableNodes = getDatanodeManager()
+        .getNetworkTopology().countNumOfAvailableNodes(NodeBase.ROOT, new HashSet<>());
+    boolean noEnoughNodes = minReplicationSatisfied &&
+        (numberOfReplicas.liveReplicas()
+        + numberOfReplicas.corruptReplicas()) == countNumOfAvailableNodes;
     // case 1: have enough number of usable replicas
     // case 2: corrupted replicas + usable replicas > Replication factor
     // case 3: Block is marked corrupt due to failure while writing. In this
     // case genstamp will be different than that of valid block.
+    // case 4: Block is marked corrupt not due to a failure while writing,
+    // and the number of replicas equals the number of available nodes, so
+    // no node is left to reconstruct onto; the corrupt replica must be deleted.
     // In all these cases we can delete the replica.
     // In case 3, rbw block will be deleted and valid block can be replicated.
     // Note NN only becomes aware of corrupt blocks when the block report is sent,
     // this means that by default it can take up to 6 hours for a corrupt block to
     // be invalidated, after which the valid block can be replicated.
     if (hasEnoughLiveReplicas || hasMoreCorruptReplicas
-        || corruptedDuringWrite) {
+        || corruptedDuringWrite || noEnoughNodes) {
       if (b.getStored().isStriped()) {
         // If the block is an EC block, the whole block group is marked
         // corrupted, so if this block is getting deleted, remove the block
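
Taken together, the invalidation predicate above now covers four deletion cases. The following is a minimal standalone sketch of that decision, with simplified parameters standing in for the BlockManager fields used in the hunk; it is illustrative only, not the actual HDFS API:

// Illustrative sketch of the extended corrupt-replica deletion decision.
// All names are simplified stand-ins, not real BlockManager fields.
public class CorruptReplicaDecisionSketch {
  static boolean shouldDelete(int live, int corrupt, int expectedRedundancy,
      int minReplication, boolean corruptedDuringWrite, int availableNodes) {
    boolean minReplicationSatisfied = live >= minReplication;
    // case 1: enough usable replicas already exist
    boolean hasEnoughLiveReplicas = live >= expectedRedundancy;
    // case 2: corrupt plus usable replicas exceed the replication factor
    boolean hasMoreCorruptReplicas = minReplicationSatisfied
        && live + corrupt > expectedRedundancy;
    // case 4 (new in this change): every available node already holds a
    // replica, so reconstruction has no target and the corrupt copy must go
    boolean noEnoughNodes = minReplicationSatisfied
        && live + corrupt == availableNodes;
    // case 3 (corruptedDuringWrite) comes from the genstamp mismatch
    return hasEnoughLiveReplicas || hasMoreCorruptReplicas
        || corruptedDuringWrite || noEnoughNodes;
  }

  public static void main(String[] args) {
    // 3 datanodes, replication factor 3: two live replicas plus one corrupt
    // replica occupy every node, so case 4 fires and deletion is chosen.
    System.out.println(shouldDelete(2, 1, 3, 1, false, 3)); // prints true
  }
}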
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java

@@ -21,6 +21,7 @@
 import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList;
 import org.apache.hadoop.thirdparty.com.google.common.collect.LinkedListMultimap;
 import org.apache.hadoop.thirdparty.com.google.common.collect.Lists;
+
 import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys;
 import org.apache.hadoop.hdfs.protocol.SystemErasureCodingPolicies;
 import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
@@ -94,8 +95,11 @@
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.PrintWriter;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.EnumSet;
 import java.util.Iterator;
 import java.util.LinkedList;
@@ -144,6 +148,7 @@
  */
 private static final int NUM_TEST_ITERS = 30;
 private static final int BLOCK_SIZE = 64*1024;
+private static final int DN_DIRECTORYSCAN_INTERVAL = 10;
 private static final org.slf4j.Logger LOG =
     LoggerFactory.getLogger(TestBlockManager.class);
 
@@ -452,6 +457,71 @@
     assertFalse(bm.isNeededReconstruction(block,
         bm.countNodes(block, fsn.isInStartupSafeMode())));
   }
+
+  @Test(timeout = 60000)
+  public void testMiniClusterCannotReconstructionWhileReplicaAnomaly()
+      throws IOException, InterruptedException, TimeoutException {
+    Configuration conf = new HdfsConfiguration();
+    conf.setInt("dfs.datanode.directoryscan.interval", DN_DIRECTORYSCAN_INTERVAL);
+    conf.setInt("dfs.namenode.replication.interval", 1);
+    conf.setInt("dfs.heartbeat.interval", 1);
+    String src = "/test-reconstruction";
+    Path file = new Path(src);
+    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    try {
+      cluster.waitActive();
+      FSNamesystem fsn = cluster.getNamesystem();
+      BlockManager bm = fsn.getBlockManager();
+      FSDataOutputStream out = null;
+      FileSystem fs = cluster.getFileSystem();
+      try {
+        out = fs.create(file);
+        for (int i = 0; i < 1024 * 1; i++) {
+          out.write(i);
+        }
+        out.hflush();
+      } finally {
+        IOUtils.closeStream(out);
+      }
+      FSDataInputStream in = null;
+      ExtendedBlock oldBlock = null;
+      try {
+        in = fs.open(file);
+        oldBlock = DFSTestUtil.getAllBlocks(in).get(0).getBlock();
+      } finally {
+        IOUtils.closeStream(in);
+      }
+      DataNode dn = cluster.getDataNodes().get(0);
+      String blockPath = dn.getFSDataset().getBlockLocalPathInfo(oldBlock).getBlockPath();
+      String metaBlockPath = dn.getFSDataset().getBlockLocalPathInfo(oldBlock).getMetaPath();
+      Files.write(Paths.get(blockPath), Collections.emptyList());
+      Files.write(Paths.get(metaBlockPath), Collections.emptyList());
+      cluster.restartDataNode(0, true);
+      cluster.waitDatanodeConnectedToActive(dn, 60000);
+      while (!dn.isDatanodeFullyStarted()) {
+        Thread.sleep(1000);
+      }
+      Thread.sleep(DN_DIRECTORYSCAN_INTERVAL * 1000);
+      cluster.triggerBlockReports();
+      BlockInfo bi = bm.getStoredBlock(oldBlock.getLocalBlock());
+      boolean isNeededReconstruction = bm.isNeededReconstruction(bi,
+          bm.countNodes(bi, cluster.getNamesystem().isInStartupSafeMode()));
+      if (isNeededReconstruction) {
+        BlockReconstructionWork reconstructionWork = null;
+        fsn.readLock();
+        try {
+          reconstructionWork = bm.scheduleReconstruction(bi, 3);
+        } finally {
+          fsn.readUnlock();
+        }
+        assertNull(reconstructionWork);
+      }
+    } finally {
+      if (cluster != null) {
+        cluster.shutdown();
+      }
+    }
+  }
 
 @Test(timeout = 60000)
 public void testNeededReconstructionWhileAppending() throws IOException {
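
The test truncates the replica and its metadata on disk, restarts the datanode, and then waits a fixed Thread.sleep of one directory-scan interval before triggering block reports, which can be flaky on a slow executor. A polling wait is a more robust alternative; this is a sketch only, assuming org.apache.hadoop.test.GenericTestUtils (used elsewhere in the HDFS test tree), and would replace the Thread.sleep/triggerBlockReports pair in the method above:

// Hypothetical replacement for the fixed sleep: poll until the NameNode
// has actually seen the truncated replica reported as corrupt.
cluster.triggerBlockReports();
final ExtendedBlock reported = oldBlock; // effectively-final copy for the lambda
GenericTestUtils.waitFor(() -> {
  BlockInfo stored = bm.getStoredBlock(reported.getLocalBlock());
  return stored != null
      && bm.countNodes(stored, fsn.isInStartupSafeMode()).corruptReplicas() > 0;
}, 1000, 60000); // check every second, give up after 60 seconds

The test method already declares TimeoutException and InterruptedException, which is what GenericTestUtils.waitFor throws on failure, so the signature would not need to change.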