From 6f15fffe383cfa8b934a596729f13ce8b2a7376e Mon Sep 17 00:00:00 2001 From: Jeff Bezanson Date: Mon, 18 Jun 2018 17:38:37 -0400 Subject: [PATCH 1/2] bump LLVM BB version Adds patches for: - #27055 - #27080 - #27032 furthermore this adds the ability to use assertion builds on CI. Which is used on OSX and Windows. --- .travis.yml | 2 +- Make.inc | 2 + contrib/windows/appveyor_build.sh | 1 + deps/Versions.make | 2 +- deps/llvm.mk | 13 +- deps/patches/llvm-rL327898.patch | 6131 +++++++++++++++++++++++++++++ deps/patches/llvm-rL332302.patch | 84 + deps/patches/llvm-rL332680.patch | 68 + deps/patches/llvm-rL332682.patch | 137 + deps/patches/llvm-rL332694.patch | 38 + 10 files changed, 6475 insertions(+), 3 deletions(-) create mode 100644 deps/patches/llvm-rL327898.patch create mode 100644 deps/patches/llvm-rL332302.patch create mode 100644 deps/patches/llvm-rL332680.patch create mode 100644 deps/patches/llvm-rL332682.patch create mode 100644 deps/patches/llvm-rL332694.patch diff --git a/.travis.yml b/.travis.yml index 2963901d0bf9d..50b9c6700b80e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -85,7 +85,7 @@ before_install: brew tap staticfloat/julia > /dev/null; brew rm --force $(brew deps --HEAD julia); brew install -v gcc gmp mpfr pcre2 staticfloat/julia/openblas-julia staticfloat/julia/suite-sparse-julia staticfloat/juliadeps/libgfortran; - BUILDOPTS="-j3 USECLANG=1 USECCACHE=1 BINARYBUILDER_TRIPLET=x86_64-apple-darwin14"; + BUILDOPTS="-j3 USECLANG=1 USECCACHE=1 BINARYBUILDER_TRIPLET=x86_64-apple-darwin14 BINARYBUILDER_LLVM_ASSERTS=1"; BUILDOPTS="$BUILDOPTS USE_BINARYBUILDER_LLVM=1 LLVM_CONFIG=$TRAVIS_BUILD_DIR/usr/tools/llvm-config LLVM_SIZE=$TRAVIS_BUILD_DIR/usr/tools/llvm-size"; BUILDOPTS="$BUILDOPTS VERBOSE=1 USE_BLAS64=0 SUITESPARSE_INC=-I$(brew --prefix suite-sparse-julia)/include FORCE_ASSERTIONS=1"; BUILDOPTS="$BUILDOPTS LIBBLAS=-lopenblas LIBBLASNAME=libopenblas LIBLAPACK=-lopenblas LIBLAPACKNAME=libopenblas"; diff --git a/Make.inc b/Make.inc index 730c63550c754..33efa4c7de423 100644 --- a/Make.inc +++ b/Make.inc @@ -222,6 +222,8 @@ INSTALL_M := $(JULIAHOME)/contrib/install.sh 755 # BinaryBuilder options # TODO: Autodiscover triplet USE_BINARYBUILDER_LLVM := 0 +# Use the Assertions build +BINARYBUILDER_LLVM_ASSERTS := 0 BINARYBUILDER_TRIPLET := # LLVM Options diff --git a/contrib/windows/appveyor_build.sh b/contrib/windows/appveyor_build.sh index 4b450611244f8..482e8c88349b7 100755 --- a/contrib/windows/appveyor_build.sh +++ b/contrib/windows/appveyor_build.sh @@ -204,6 +204,7 @@ if [ -n "$USEMSVC" ]; then else # Use BinaryBuilder echo 'USE_BINARYBUILDER_LLVM = 1' >> Make.user + echo 'BINARYBUILDER_LLVM_ASSERTS = 1' >> Make.user echo 'override DEP_LIBS += llvm openlibm' >> Make.user export CCACHE_DIR=/cygdrive/c/ccache echo 'USECCACHE=1' >> Make.user diff --git a/deps/Versions.make b/deps/Versions.make index 478c4fcca300a..99980a1d4e273 100644 --- a/deps/Versions.make +++ b/deps/Versions.make @@ -1,5 +1,5 @@ LLVM_VER = 6.0.0 -LLVM_BB_REL = 2 +LLVM_BB_REL = 3 PCRE_VER = 10.30 DSFMT_VER = 2.2.3 LAPACK_VER = 3.5.0 diff --git a/deps/llvm.mk b/deps/llvm.mk index b4c658c683251..16f93c56c14d3 100644 --- a/deps/llvm.mk +++ b/deps/llvm.mk @@ -488,6 +488,11 @@ $(eval $(call LLVM_PATCH,llvm-D45008)) # remove for 7.0 $(eval $(call LLVM_PATCH,llvm-D45070)) # remove for 7.0 $(eval $(call LLVM_PATCH,llvm-6.0.0-ifconv-D45819)) # remove for 7.0 $(eval $(call LLVM_PATCH,llvm-D46460)) +$(eval $(call LLVM_PATCH,llvm-rL332680)) # remove for 7.0 +$(eval $(call LLVM_PATCH,llvm-rL332682)) # remove for 7.0 +$(eval $(call LLVM_PATCH,llvm-rL332302)) # remove for 7.0 +$(eval $(call LLVM_PATCH,llvm-rL332694)) # remove for 7.0 +$(eval $(call LLVM_PATCH,llvm-rL327898)) # remove for 7.0 endif # LLVM_VER # Remove hardcoded OS X requirements in compilter-rt cmake build @@ -574,7 +579,13 @@ endif endif else # USE_BINARYBUILDER_LLVM LLVM_BB_URL_BASE := https://github.com/staticfloat/LLVMBuilder/releases/download -LLVM_BB_URL := $(LLVM_BB_URL_BASE)/v$(LLVM_VER)-$(LLVM_BB_REL)/LLVM.$(BINARYBUILDER_TRIPLET).tar.gz +ifneq ($(BINARYBUILDER_LLVM_ASSERTS), 1) +LLVM_BB_NAME := LLVM +else +LLVM_BB_NAME := LLVM.asserts +endif +LLVM_BB_URL := $(LLVM_BB_URL_BASE)/v$(LLVM_VER)-$(LLVM_BB_REL)/$(LLVM_BB_NAME).$(BINARYBUILDER_TRIPLET).tar.gz + $(BUILDDIR)/llvm-$(LLVM_VER)-$(LLVM_BB_REL): mkdir -p $@ diff --git a/deps/patches/llvm-rL327898.patch b/deps/patches/llvm-rL327898.patch new file mode 100644 index 0000000000000..f4d9a43099686 --- /dev/null +++ b/deps/patches/llvm-rL327898.patch @@ -0,0 +1,6131 @@ +commit 64c3384f94a1eb3e3510d6f66c3bccdfc9d9050b +Author: Nirav Dave +Date: Thu Feb 1 16:11:59 2018 +0000 + + r327898/dependencies roll up + + This is a squash of 13 commits required in the lead up to r327898, + which fixes https://github.com/JuliaLang/julia/issues/27603. The squashed + commits are: + + 332d15e981e86b9e058087174bb288ba18a15807 + b659d3fca5d24c25ee73f979edb382f7f24e05e2 + c01d1363ea080170fc5143d72f26eecd9270f03b + eab8a177a4caef9e42ef1d2aeb4ba15dc788d3f2 + bedb1391781b009ace95f5586e7fae5f03fe0689 + 11d041a905f82ac78e7ccf2394773e80b93d147c + e1ec36c55a0127988f42a3329ca835617b30de09 + b8d2903300c13d8fd151c8e5dc71017269617539 + 00884fea345f47ab05174a8f314ecd60d1676d02 + 28ab04cec0d9888af9d29946b3a048b8340abe0f + 3dd52e62ea3087efcca63c3772183d9471abc742 + bd3649ff6d6b4d18b3c6de253179d987a120518a + aea03035b9c633e6d745b6d3fc5b6378699f576c + + Their commit messages follow below: + + [SelectionDAG] Fix UpdateChains handling of TokenFactors + + Summary: + In Instruction Selection UpdateChains replaces all matched Nodes' + chain references including interior token factors and deletes them. + This may allow nodes which depend on these interior nodes but are not + part of the set of matched nodes to be left with a dangling dependence. + Avoid this by doing the replacement for matched non-TokenFactor nodes. + + Fixes PR36164. + + Reviewers: jonpa, RKSimon, bogner + + Subscribers: llvm-commits, hiraditya + + Differential Revision: https://reviews.llvm.org/D42754 + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323977 91177308-0d34-0410-b5e6-96231b3b80d8 + + Regenerate test result for vastart-defs-eflags.ll. NFC. + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323596 91177308-0d34-0410-b5e6-96231b3b80d8 + + Regenerate test result for testb-je-fusion.ll. NFC. + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323595 91177308-0d34-0410-b5e6-96231b3b80d8 + + [X86] Avoid using high register trick for test instruction + + Summary: + It seems it's main effect is to create addition copies when values are inr register that do not support this trick, which increase register pressure and makes the code bigger. + + Reviewers: craig.topper, niravd, spatel, hfinkel + + Subscribers: llvm-commits + + Differential Revision: https://reviews.llvm.org/D42646 + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323888 91177308-0d34-0410-b5e6-96231b3b80d8 + + Add a regression test for problems caused by D42646 . NFC + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323868 91177308-0d34-0410-b5e6-96231b3b80d8 + + Add test case for truncated and promotion to test. NFC + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323663 91177308-0d34-0410-b5e6-96231b3b80d8 + + [X86] Add test case to ensure testw is generated when optimizing for size. NFC + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323687 91177308-0d34-0410-b5e6-96231b3b80d8 + + [X86] Generate testl instruction through truncates. + + Summary: + This was introduced in D42646 but ended up being reverted because the original implementation was buggy. + + Depends on D42646 + + Reviewers: craig.topper, niravd, spatel, hfinkel + + Subscribers: llvm-commits + + Differential Revision: https://reviews.llvm.org/D42741 + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323899 91177308-0d34-0410-b5e6-96231b3b80d8 + + [X86] Don't look for TEST instruction shrinking opportunities when the root node is a X86ISD::SUB. + + I don't believe we ever create an X86ISD::SUB with a 0 constant which is what the TEST handling needs. The ternary operator at the end of this code shows up as only going one way in the llvm-cov report from the bots. + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@324865 91177308-0d34-0410-b5e6-96231b3b80d8 + + [X86] Teach LowerBUILD_VECTOR to recognize pair-wise splats of 32-bit elements and use a 64-bit broadcast + + If we are splatting pairs of 32-bit elements, we can use a 64-bit broadcast to get the job done. + + We could probably could probably do this with other sizes too, for example four 16-bit elements. Or we could broadcast pairs of 16-bit elements using a 32-bit element broadcast. But I've left that as a future improvement. + + I've also restricted this to AVX2 only because we can only broadcast loads under AVX. + + Differential Revision: https://reviews.llvm.org/D42086 + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@322730 91177308-0d34-0410-b5e6-96231b3b80d8 + + [DAG, X86] Revert r327197 "Revert r327170, r327171, r327172" + + Reland ISel cycle checking improvements after simplifying node id + invariant traversal and correcting typo. + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@327898 91177308-0d34-0410-b5e6-96231b3b80d8 + + [ Modified for cherry-pick: Dropped Hexagon and SystemZ changes" + + [DAG, X86] Fix ISel-time node insertion ids + + As in SystemZ backend, correctly propagate node ids when inserting new + unselected nodes into the DAG during instruction Seleciton for X86 + target. + + Fixes PR36865. + + Reviewers: jyknight, craig.topper + + Subscribers: hiraditya, llvm-commits + + Differential Revision: https://reviews.llvm.org/D44797 + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@328233 91177308-0d34-0410-b5e6-96231b3b80d8 + + [DAG] Fix node id invalidation in Instruction Selection. + + Invalidation should be bit negation. Add missing negation. + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@328287 91177308-0d34-0410-b5e6-96231b3b80d8 + + Remove failing tests + + This removes tests that are failing due to codegen differences, + after the latest set of backports. Fixing thse for the backport + branch does not seem worth it. + +diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h +index de6849a1eae..e56eafc437c 100644 +--- a/include/llvm/CodeGen/SelectionDAGISel.h ++++ b/include/llvm/CodeGen/SelectionDAGISel.h +@@ -110,6 +110,11 @@ public: + CodeGenOpt::Level OptLevel, + bool IgnoreChains = false); + ++ static void InvalidateNodeId(SDNode *N); ++ static int getUninvalidatedNodeId(SDNode *N); ++ ++ static void EnforceNodeIdInvariant(SDNode *N); ++ + // Opcodes used by the DAG state machine: + enum BuiltinOpcodes { + OPC_Scope, +@@ -199,23 +204,28 @@ protected: + /// of the new node T. + void ReplaceUses(SDValue F, SDValue T) { + CurDAG->ReplaceAllUsesOfValueWith(F, T); ++ EnforceNodeIdInvariant(T.getNode()); + } + + /// ReplaceUses - replace all uses of the old nodes F with the use + /// of the new nodes T. + void ReplaceUses(const SDValue *F, const SDValue *T, unsigned Num) { + CurDAG->ReplaceAllUsesOfValuesWith(F, T, Num); ++ for (unsigned i = 0; i < Num; ++i) ++ EnforceNodeIdInvariant(T[i].getNode()); + } + + /// ReplaceUses - replace all uses of the old node F with the use + /// of the new node T. + void ReplaceUses(SDNode *F, SDNode *T) { + CurDAG->ReplaceAllUsesWith(F, T); ++ EnforceNodeIdInvariant(T); + } + + /// Replace all uses of \c F with \c T, then remove \c F from the DAG. + void ReplaceNode(SDNode *F, SDNode *T) { + CurDAG->ReplaceAllUsesWith(F, T); ++ EnforceNodeIdInvariant(T); + CurDAG->RemoveDeadNode(F); + } + +diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h +index 522c2f1b2cb..2d974234abf 100644 +--- a/include/llvm/CodeGen/SelectionDAGNodes.h ++++ b/include/llvm/CodeGen/SelectionDAGNodes.h +@@ -796,16 +796,44 @@ public: + /// searches to be performed in parallel, caching of results across + /// queries and incremental addition to Worklist. Stops early if N is + /// found but will resume. Remember to clear Visited and Worklists +- /// if DAG changes. ++ /// if DAG changes. MaxSteps gives a maximum number of nodes to visit before ++ /// giving up. The TopologicalPrune flag signals that positive NodeIds are ++ /// topologically ordered (Operands have strictly smaller node id) and search ++ /// can be pruned leveraging this. + static bool hasPredecessorHelper(const SDNode *N, + SmallPtrSetImpl &Visited, + SmallVectorImpl &Worklist, +- unsigned int MaxSteps = 0) { ++ unsigned int MaxSteps = 0, ++ bool TopologicalPrune = false) { ++ SmallVector DeferredNodes; + if (Visited.count(N)) + return true; ++ ++ // Node Id's are assigned in three places: As a topological ++ // ordering (> 0), during legalization (results in values set to ++ // 0), new nodes (set to -1). If N has a topolgical id then we ++ // know that all nodes with ids smaller than it cannot be ++ // successors and we need not check them. Filter out all node ++ // that can't be matches. We add them to the worklist before exit ++ // in case of multiple calls. Note that during selection the topological id ++ // may be violated if a node's predecessor is selected before it. We mark ++ // this at selection negating the id of unselected successors and ++ // restricting topological pruning to positive ids. ++ ++ int NId = N->getNodeId(); ++ // If we Invalidated the Id, reconstruct original NId. ++ if (NId < -1) ++ NId = -(NId + 1); ++ ++ bool Found = false; + while (!Worklist.empty()) { + const SDNode *M = Worklist.pop_back_val(); +- bool Found = false; ++ int MId = M->getNodeId(); ++ if (TopologicalPrune && M->getOpcode() != ISD::TokenFactor && (NId > 0) && ++ (MId > 0) && (MId < NId)) { ++ DeferredNodes.push_back(M); ++ continue; ++ } + for (const SDValue &OpV : M->op_values()) { + SDNode *Op = OpV.getNode(); + if (Visited.insert(Op).second) +@@ -814,11 +842,16 @@ public: + Found = true; + } + if (Found) +- return true; ++ break; + if (MaxSteps != 0 && Visited.size() >= MaxSteps) +- return false; ++ break; + } +- return false; ++ // Push deferred nodes back on worklist. ++ Worklist.append(DeferredNodes.begin(), DeferredNodes.end()); ++ // If we bailed early, conservatively return found. ++ if (MaxSteps != 0 && Visited.size() >= MaxSteps) ++ return true; ++ return Found; + } + + /// Return true if all the users of N are contained in Nodes. +diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +index bd9fcfb5c1e..17e42240133 100644 +--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp ++++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +@@ -937,6 +937,58 @@ public: + + } // end anonymous namespace + ++// This function is used to enforce the topological node id property ++// property leveraged during Instruction selection. Before selection all ++// nodes are given a non-negative id such that all nodes have a larger id than ++// their operands. As this holds transitively we can prune checks that a node N ++// is a predecessor of M another by not recursively checking through M's ++// operands if N's ID is larger than M's ID. This is significantly improves ++// performance of for various legality checks (e.g. IsLegalToFold / ++// UpdateChains). ++ ++// However, when we fuse multiple nodes into a single node ++// during selection we may induce a predecessor relationship between inputs and ++// outputs of distinct nodes being merged violating the topological property. ++// Should a fused node have a successor which has yet to be selected, our ++// legality checks would be incorrect. To avoid this we mark all unselected ++// sucessor nodes, i.e. id != -1 as invalid for pruning by bit-negating (x => ++// (-(x+1))) the ids and modify our pruning check to ignore negative Ids of M. ++// We use bit-negation to more clearly enforce that node id -1 can only be ++// achieved by selected nodes). As the conversion is reversable the original Id, ++// topological pruning can still be leveraged when looking for unselected nodes. ++// This method is call internally in all ISel replacement calls. ++void SelectionDAGISel::EnforceNodeIdInvariant(SDNode *Node) { ++ SmallVector Nodes; ++ Nodes.push_back(Node); ++ ++ while (!Nodes.empty()) { ++ SDNode *N = Nodes.pop_back_val(); ++ for (auto *U : N->uses()) { ++ auto UId = U->getNodeId(); ++ if (UId > 0) { ++ InvalidateNodeId(U); ++ Nodes.push_back(U); ++ } ++ } ++ } ++} ++ ++// InvalidateNodeId - As discusses in EnforceNodeIdInvariant, mark a ++// NodeId with the equivalent node id which is invalid for topological ++// pruning. ++void SelectionDAGISel::InvalidateNodeId(SDNode *N) { ++ int InvalidId = -(N->getNodeId() + 1); ++ N->setNodeId(InvalidId); ++} ++ ++// getUninvalidatedNodeId - get original uninvalidated node id. ++int SelectionDAGISel::getUninvalidatedNodeId(SDNode *N) { ++ int Id = N->getNodeId(); ++ if (Id < -1) ++ return -(Id + 1); ++ return Id; ++} ++ + void SelectionDAGISel::DoInstructionSelection() { + DEBUG(dbgs() << "===== Instruction selection begins: " + << printMBBReference(*FuncInfo->MBB) << " '" +@@ -972,6 +1024,33 @@ void SelectionDAGISel::DoInstructionSelection() { + if (Node->use_empty()) + continue; + ++#ifndef NDEBUG ++ SmallVector Nodes; ++ Nodes.push_back(Node); ++ ++ while (!Nodes.empty()) { ++ auto N = Nodes.pop_back_val(); ++ if (N->getOpcode() == ISD::TokenFactor || N->getNodeId() < 0) ++ continue; ++ for (const SDValue &Op : N->op_values()) { ++ if (Op->getOpcode() == ISD::TokenFactor) ++ Nodes.push_back(Op.getNode()); ++ else { ++ // We rely on topological ordering of node ids for checking for ++ // cycles when fusing nodes during selection. All unselected nodes ++ // successors of an already selected node should have a negative id. ++ // This assertion will catch such cases. If this assertion triggers ++ // it is likely you using DAG-level Value/Node replacement functions ++ // (versus equivalent ISEL replacement) in backend-specific ++ // selections. See comment in EnforceNodeIdInvariant for more ++ // details. ++ assert(Op->getNodeId() != -1 && ++ "Node has already selected predecessor node"); ++ } ++ } ++ } ++#endif ++ + // When we are using non-default rounding modes or FP exception behavior + // FP operations are represented by StrictFP pseudo-operations. They + // need to be simplified here so that the target-specific instruction +@@ -2134,52 +2213,44 @@ static SDNode *findGlueUse(SDNode *N) { + return nullptr; + } + +-/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def". +-/// This function iteratively traverses up the operand chain, ignoring +-/// certain nodes. +-static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse, +- SDNode *Root, SmallPtrSetImpl &Visited, ++/// findNonImmUse - Return true if "Def" is a predecessor of "Root" via a path ++/// beyond "ImmedUse". We may ignore chains as they are checked separately. ++static bool findNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse, + bool IgnoreChains) { +- // The NodeID's are given uniques ID's where a node ID is guaranteed to be +- // greater than all of its (recursive) operands. If we scan to a point where +- // 'use' is smaller than the node we're scanning for, then we know we will +- // never find it. +- // +- // The Use may be -1 (unassigned) if it is a newly allocated node. This can +- // happen because we scan down to newly selected nodes in the case of glue +- // uses. +- std::vector WorkList; +- WorkList.push_back(Use); +- +- while (!WorkList.empty()) { +- Use = WorkList.back(); +- WorkList.pop_back(); +- if (Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1) +- continue; ++ SmallPtrSet Visited; ++ SmallVector WorkList; ++ // Only check if we have non-immediate uses of Def. ++ if (ImmedUse->isOnlyUserOf(Def)) ++ return false; + +- // Don't revisit nodes if we already scanned it and didn't fail, we know we +- // won't fail if we scan it again. +- if (!Visited.insert(Use).second) ++ // We don't care about paths to Def that go through ImmedUse so mark it ++ // visited and mark non-def operands as used. ++ Visited.insert(ImmedUse); ++ for (const SDValue &Op : ImmedUse->op_values()) { ++ SDNode *N = Op.getNode(); ++ // Ignore chain deps (they are validated by ++ // HandleMergeInputChains) and immediate uses ++ if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def) + continue; ++ if (!Visited.insert(N).second) ++ continue; ++ WorkList.push_back(N); ++ } + +- for (const SDValue &Op : Use->op_values()) { +- // Ignore chain uses, they are validated by HandleMergeInputChains. +- if (Op.getValueType() == MVT::Other && IgnoreChains) +- continue; +- ++ // Initialize worklist to operands of Root. ++ if (Root != ImmedUse) { ++ for (const SDValue &Op : Root->op_values()) { + SDNode *N = Op.getNode(); +- if (N == Def) { +- if (Use == ImmedUse || Use == Root) +- continue; // We are not looking for immediate use. +- assert(N != Root); +- return true; +- } +- +- // Traverse up the operand chain. ++ // Ignore chains (they are validated by HandleMergeInputChains) ++ if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def) ++ continue; ++ if (!Visited.insert(N).second) ++ continue; + WorkList.push_back(N); + } + } +- return false; ++ ++ return SDNode::hasPredecessorHelper(Def, Visited, WorkList, 0, true); + } + + /// IsProfitableToFold - Returns true if it's profitable to fold the specific +@@ -2251,13 +2322,12 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root, + + // If our query node has a glue result with a use, we've walked up it. If + // the user (which has already been selected) has a chain or indirectly uses +- // the chain, our WalkChainUsers predicate will not consider it. Because of ++ // the chain, HandleMergeInputChains will not consider it. Because of + // this, we cannot ignore chains in this predicate. + IgnoreChains = false; + } + +- SmallPtrSet Visited; +- return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains); ++ return !findNonImmUse(Root, N.getNode(), U, IgnoreChains); + } + + void SelectionDAGISel::Select_INLINEASM(SDNode *N) { +@@ -2360,7 +2430,8 @@ void SelectionDAGISel::UpdateChains( + std::replace(ChainNodesMatched.begin(), ChainNodesMatched.end(), N, + static_cast(nullptr)); + }); +- CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain); ++ if (ChainNode->getOpcode() != ISD::TokenFactor) ++ ReplaceUses(ChainVal, InputChain); + + // If the node became dead and we haven't already seen it, delete it. + if (ChainNode != NodeToMatch && ChainNode->use_empty() && +@@ -2375,143 +2446,6 @@ void SelectionDAGISel::UpdateChains( + DEBUG(dbgs() << "ISEL: Match complete!\n"); + } + +-enum ChainResult { +- CR_Simple, +- CR_InducesCycle, +- CR_LeadsToInteriorNode +-}; +- +-/// WalkChainUsers - Walk down the users of the specified chained node that is +-/// part of the pattern we're matching, looking at all of the users we find. +-/// This determines whether something is an interior node, whether we have a +-/// non-pattern node in between two pattern nodes (which prevent folding because +-/// it would induce a cycle) and whether we have a TokenFactor node sandwiched +-/// between pattern nodes (in which case the TF becomes part of the pattern). +-/// +-/// The walk we do here is guaranteed to be small because we quickly get down to +-/// already selected nodes "below" us. +-static ChainResult +-WalkChainUsers(const SDNode *ChainedNode, +- SmallVectorImpl &ChainedNodesInPattern, +- DenseMap &TokenFactorResult, +- SmallVectorImpl &InteriorChainedNodes) { +- ChainResult Result = CR_Simple; +- +- for (SDNode::use_iterator UI = ChainedNode->use_begin(), +- E = ChainedNode->use_end(); UI != E; ++UI) { +- // Make sure the use is of the chain, not some other value we produce. +- if (UI.getUse().getValueType() != MVT::Other) continue; +- +- SDNode *User = *UI; +- +- if (User->getOpcode() == ISD::HANDLENODE) // Root of the graph. +- continue; +- +- // If we see an already-selected machine node, then we've gone beyond the +- // pattern that we're selecting down into the already selected chunk of the +- // DAG. +- unsigned UserOpcode = User->getOpcode(); +- if (User->isMachineOpcode() || +- UserOpcode == ISD::CopyToReg || +- UserOpcode == ISD::CopyFromReg || +- UserOpcode == ISD::INLINEASM || +- UserOpcode == ISD::EH_LABEL || +- UserOpcode == ISD::LIFETIME_START || +- UserOpcode == ISD::LIFETIME_END) { +- // If their node ID got reset to -1 then they've already been selected. +- // Treat them like a MachineOpcode. +- if (User->getNodeId() == -1) +- continue; +- } +- +- // If we have a TokenFactor, we handle it specially. +- if (User->getOpcode() != ISD::TokenFactor) { +- // If the node isn't a token factor and isn't part of our pattern, then it +- // must be a random chained node in between two nodes we're selecting. +- // This happens when we have something like: +- // x = load ptr +- // call +- // y = x+4 +- // store y -> ptr +- // Because we structurally match the load/store as a read/modify/write, +- // but the call is chained between them. We cannot fold in this case +- // because it would induce a cycle in the graph. +- if (!std::count(ChainedNodesInPattern.begin(), +- ChainedNodesInPattern.end(), User)) +- return CR_InducesCycle; +- +- // Otherwise we found a node that is part of our pattern. For example in: +- // x = load ptr +- // y = x+4 +- // store y -> ptr +- // This would happen when we're scanning down from the load and see the +- // store as a user. Record that there is a use of ChainedNode that is +- // part of the pattern and keep scanning uses. +- Result = CR_LeadsToInteriorNode; +- InteriorChainedNodes.push_back(User); +- continue; +- } +- +- // If we found a TokenFactor, there are two cases to consider: first if the +- // TokenFactor is just hanging "below" the pattern we're matching (i.e. no +- // uses of the TF are in our pattern) we just want to ignore it. Second, +- // the TokenFactor can be sandwiched in between two chained nodes, like so: +- // [Load chain] +- // ^ +- // | +- // [Load] +- // ^ ^ +- // | \ DAG's like cheese +- // / \ do you? +- // / | +- // [TokenFactor] [Op] +- // ^ ^ +- // | | +- // \ / +- // \ / +- // [Store] +- // +- // In this case, the TokenFactor becomes part of our match and we rewrite it +- // as a new TokenFactor. +- // +- // To distinguish these two cases, do a recursive walk down the uses. +- auto MemoizeResult = TokenFactorResult.find(User); +- bool Visited = MemoizeResult != TokenFactorResult.end(); +- // Recursively walk chain users only if the result is not memoized. +- if (!Visited) { +- auto Res = WalkChainUsers(User, ChainedNodesInPattern, TokenFactorResult, +- InteriorChainedNodes); +- MemoizeResult = TokenFactorResult.insert(std::make_pair(User, Res)).first; +- } +- switch (MemoizeResult->second) { +- case CR_Simple: +- // If the uses of the TokenFactor are just already-selected nodes, ignore +- // it, it is "below" our pattern. +- continue; +- case CR_InducesCycle: +- // If the uses of the TokenFactor lead to nodes that are not part of our +- // pattern that are not selected, folding would turn this into a cycle, +- // bail out now. +- return CR_InducesCycle; +- case CR_LeadsToInteriorNode: +- break; // Otherwise, keep processing. +- } +- +- // Okay, we know we're in the interesting interior case. The TokenFactor +- // is now going to be considered part of the pattern so that we rewrite its +- // uses (it may have uses that are not part of the pattern) with the +- // ultimate chain result of the generated code. We will also add its chain +- // inputs as inputs to the ultimate TokenFactor we create. +- Result = CR_LeadsToInteriorNode; +- if (!Visited) { +- ChainedNodesInPattern.push_back(User); +- InteriorChainedNodes.push_back(User); +- } +- } +- +- return Result; +-} +- + /// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains + /// operation for when the pattern matched at least one node with a chains. The + /// input vector contains a list of all of the chained nodes that we match. We +@@ -2521,47 +2455,56 @@ WalkChainUsers(const SDNode *ChainedNode, + static SDValue + HandleMergeInputChains(SmallVectorImpl &ChainNodesMatched, + SelectionDAG *CurDAG) { +- // Used for memoization. Without it WalkChainUsers could take exponential +- // time to run. +- DenseMap TokenFactorResult; +- // Walk all of the chained nodes we've matched, recursively scanning down the +- // users of the chain result. This adds any TokenFactor nodes that are caught +- // in between chained nodes to the chained and interior nodes list. +- SmallVector InteriorChainedNodes; +- for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) { +- if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched, +- TokenFactorResult, +- InteriorChainedNodes) == CR_InducesCycle) +- return SDValue(); // Would induce a cycle. +- } + +- // Okay, we have walked all the matched nodes and collected TokenFactor nodes +- // that we are interested in. Form our input TokenFactor node. ++ SmallPtrSet Visited; ++ SmallVector Worklist; + SmallVector InputChains; +- for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) { +- // Add the input chain of this node to the InputChains list (which will be +- // the operands of the generated TokenFactor) if it's not an interior node. +- SDNode *N = ChainNodesMatched[i]; +- if (N->getOpcode() != ISD::TokenFactor) { +- if (std::count(InteriorChainedNodes.begin(),InteriorChainedNodes.end(),N)) +- continue; ++ unsigned int Max = 8192; + +- // Otherwise, add the input chain. +- SDValue InChain = ChainNodesMatched[i]->getOperand(0); +- assert(InChain.getValueType() == MVT::Other && "Not a chain"); +- InputChains.push_back(InChain); +- continue; +- } ++ // Quick exit on trivial merge. ++ if (ChainNodesMatched.size() == 1) ++ return ChainNodesMatched[0]->getOperand(0); + +- // If we have a token factor, we want to add all inputs of the token factor +- // that are not part of the pattern we're matching. +- for (const SDValue &Op : N->op_values()) { +- if (!std::count(ChainNodesMatched.begin(), ChainNodesMatched.end(), +- Op.getNode())) +- InputChains.push_back(Op); +- } ++ // Add chains that aren't already added (internal). Peek through ++ // token factors. ++ std::function AddChains = [&](const SDValue V) { ++ if (V.getValueType() != MVT::Other) ++ return; ++ if (V->getOpcode() == ISD::EntryToken) ++ return; ++ if (!Visited.insert(V.getNode()).second) ++ return; ++ if (V->getOpcode() == ISD::TokenFactor) { ++ for (const SDValue &Op : V->op_values()) ++ AddChains(Op); ++ } else ++ InputChains.push_back(V); ++ }; ++ ++ for (auto *N : ChainNodesMatched) { ++ Worklist.push_back(N); ++ Visited.insert(N); + } + ++ while (!Worklist.empty()) ++ AddChains(Worklist.pop_back_val()->getOperand(0)); ++ ++ // Skip the search if there are no chain dependencies. ++ if (InputChains.size() == 0) ++ return CurDAG->getEntryNode(); ++ ++ // If one of these chains is a successor of input, we must have a ++ // node that is both the predecessor and successor of the ++ // to-be-merged nodes. Fail. ++ Visited.clear(); ++ for (SDValue V : InputChains) ++ Worklist.push_back(V.getNode()); ++ ++ for (auto *N : ChainNodesMatched) ++ if (SDNode::hasPredecessorHelper(N, Visited, Worklist, Max, true)) ++ return SDValue(); ++ ++ // Return merged chain. + if (InputChains.size() == 1) + return InputChains[0]; + return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]), +@@ -2606,8 +2549,8 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, + // Move the glue if needed. + if ((EmitNodeInfo & OPFL_GlueOutput) && OldGlueResultNo != -1 && + (unsigned)OldGlueResultNo != ResNumResults-1) +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldGlueResultNo), +- SDValue(Res, ResNumResults-1)); ++ ReplaceUses(SDValue(Node, OldGlueResultNo), ++ SDValue(Res, ResNumResults - 1)); + + if ((EmitNodeInfo & OPFL_GlueOutput) != 0) + --ResNumResults; +@@ -2615,14 +2558,15 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, + // Move the chain reference if needed. + if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 && + (unsigned)OldChainResultNo != ResNumResults-1) +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldChainResultNo), +- SDValue(Res, ResNumResults-1)); ++ ReplaceUses(SDValue(Node, OldChainResultNo), ++ SDValue(Res, ResNumResults - 1)); + + // Otherwise, no replacement happened because the node already exists. Replace + // Uses of the old node with the new one. + if (Res != Node) { +- CurDAG->ReplaceAllUsesWith(Node, Res); +- CurDAG->RemoveDeadNode(Node); ++ ReplaceNode(Node, Res); ++ } else { ++ EnforceNodeIdInvariant(Res); + } + + return Res; +@@ -2939,8 +2883,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, + return; + case ISD::AssertSext: + case ISD::AssertZext: +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0), +- NodeToMatch->getOperand(0)); ++ ReplaceUses(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0)); + CurDAG->RemoveDeadNode(NodeToMatch); + return; + case ISD::INLINEASM: +@@ -3702,7 +3645,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, + NodeToMatch->getValueType(i).getSizeInBits() == + Res.getValueSizeInBits()) && + "invalid replacement"); +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, i), Res); ++ ReplaceUses(SDValue(NodeToMatch, i), Res); + } + + // Update chain uses. +@@ -3715,8 +3658,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, + if (NodeToMatch->getValueType(NodeToMatch->getNumValues() - 1) == + MVT::Glue && + InputGlue.getNode()) +- CurDAG->ReplaceAllUsesOfValueWith( +- SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1), InputGlue); ++ ReplaceUses(SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1), ++ InputGlue); + + assert(NodeToMatch->use_empty() && + "Didn't replace all uses of the node?"); +diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +index f4776adb069..be5345e422d 100644 +--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp ++++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +@@ -759,12 +759,11 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { + + if (ProduceCarry) { + // Replace the carry-use +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1)); ++ ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1)); + } + + // Replace the remaining uses. +- CurDAG->ReplaceAllUsesWith(N, RegSequence); +- CurDAG->RemoveDeadNode(N); ++ ReplaceNode(N, RegSequence); + } + + void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { +diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp +index 8d32510e200..0f504718f28 100644 +--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp ++++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp +@@ -498,7 +498,7 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, + + void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) { + CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode()); +- CurDAG->ReplaceAllUsesWith(N, M); ++ ReplaceUses(N, M); + } + + bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, +diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +index a6ac4e3df74..3721856ff45 100644 +--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp ++++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +@@ -777,7 +777,7 @@ void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) { + return; + } + +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N,0), N->getOperand(0)); ++ ReplaceUses(SDValue(N, 0), N->getOperand(0)); + CurDAG->RemoveDeadNode(N); + } + +@@ -2182,4 +2182,3 @@ void HexagonDAGToDAGISel::rebalanceAddressTrees() { + RootHeights.clear(); + RootWeights.clear(); + } +- +diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +index f08c5054065..0608f06ef7e 100644 +--- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp ++++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +@@ -1914,7 +1914,6 @@ void HvxSelector::selectShuffle(SDNode *N) { + // If the mask is all -1's, generate "undef". + if (!UseLeft && !UseRight) { + ISel.ReplaceNode(N, ISel.selectUndef(SDLoc(SN), ResTy).getNode()); +- DAG.RemoveDeadNode(N); + return; + } + +@@ -1970,7 +1969,6 @@ void HvxSelector::selectRor(SDNode *N) { + NewN = DAG.getMachineNode(Hexagon::V6_vror, dl, Ty, {VecV, RotV}); + + ISel.ReplaceNode(N, NewN); +- DAG.RemoveDeadNode(N); + } + + void HexagonDAGToDAGISel::SelectHvxShuffle(SDNode *N) { +@@ -2017,8 +2015,7 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) { + MemOp[0] = cast(N)->getMemOperand(); + cast(Result)->setMemRefs(MemOp, MemOp + 1); + +- ReplaceUses(N, Result); +- CurDAG->RemoveDeadNode(N); ++ ReplaceNode(N, Result); + } + + void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) { +@@ -2056,8 +2053,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) { + MemOp[0] = cast(N)->getMemOperand(); + cast(Result)->setMemRefs(MemOp, MemOp + 1); + +- ReplaceUses(N, Result); +- CurDAG->RemoveDeadNode(N); ++ ReplaceNode(N, Result); + } + + void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) { +@@ -2100,5 +2096,3 @@ void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) { + ReplaceUses(SDValue(N, 1), SDValue(Result, 1)); + CurDAG->RemoveDeadNode(N); + } +- +- +diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +index ce6f3d37f5c..fe59d820c88 100644 +--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp ++++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +@@ -589,10 +589,16 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr, + // The selection DAG must no longer depend on their uniqueness when this + // function is used. + static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) { +- if (N.getNode()->getNodeId() == -1 || +- N.getNode()->getNodeId() > Pos->getNodeId()) { ++ if (N->getNodeId() == -1 || ++ (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) > ++ SelectionDAGISel::getUninvalidatedNodeId(Pos))) { + DAG->RepositionNode(Pos->getIterator(), N.getNode()); +- N.getNode()->setNodeId(Pos->getNodeId()); ++ // Mark Node as invalid for pruning as after this it may be a successor to a ++ // selected node but otherwise be in the same position of Pos. ++ // Conservatively mark it with the same -abs(Id) to assure node id ++ // invariant is preserved. ++ N->setNodeId(Pos->getNodeId()); ++ SelectionDAGISel::InvalidateNodeId(N.getNode()); + } + } + +@@ -1022,8 +1028,7 @@ bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) { + }; + SDValue New = convertTo( + DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, OpcodeVT, Ops), 0)); +- ReplaceUses(N, New.getNode()); +- CurDAG->RemoveDeadNode(N); ++ ReplaceNode(N, New.getNode()); + return true; + } + +@@ -1114,8 +1119,7 @@ void SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node, + SDValue Lower = CurDAG->getConstant(LowerVal, DL, VT); + SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower); + +- ReplaceUses(Node, Or.getNode()); +- CurDAG->RemoveDeadNode(Node); ++ ReplaceNode(Node, Or.getNode()); + + SelectCode(Or.getNode()); + } +diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp +index d79fd0ca4da..ee2d221e31c 100644 +--- a/lib/Target/X86/X86ISelDAGToDAG.cpp ++++ b/lib/Target/X86/X86ISelDAGToDAG.cpp +@@ -988,10 +988,16 @@ bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM, + // IDs! The selection DAG must no longer depend on their uniqueness when this + // is used. + static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { +- if (N.getNode()->getNodeId() == -1 || +- N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) { +- DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode()); +- N.getNode()->setNodeId(Pos.getNode()->getNodeId()); ++ if (N->getNodeId() == -1 || ++ (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) > ++ SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) { ++ DAG.RepositionNode(Pos->getIterator(), N.getNode()); ++ // Mark Node as invalid for pruning as after this it may be a successor to a ++ // selected node but otherwise be in the same position of Pos. ++ // Conservatively mark it with the same -abs(Id) to assure node id ++ // invariant is preserved. ++ N->setNodeId(Pos->getNodeId()); ++ SelectionDAGISel::InvalidateNodeId(N.getNode()); + } + } + +@@ -2092,50 +2098,84 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, + LoadNode->getOffset() != StoreNode->getOffset()) + return false; + +- // Check if the chain is produced by the load or is a TokenFactor with +- // the load output chain as an operand. Return InputChain by reference. ++ bool FoundLoad = false; ++ SmallVector ChainOps; ++ SmallVector LoopWorklist; ++ SmallPtrSet Visited; ++ const unsigned int Max = 1024; ++ ++ // Visualization of Load-Op-Store fusion: ++ // ------------------------- ++ // Legend: ++ // *-lines = Chain operand dependencies. ++ // |-lines = Normal operand dependencies. ++ // Dependencies flow down and right. n-suffix references multiple nodes. ++ // ++ // C Xn C ++ // * * * ++ // * * * ++ // Xn A-LD Yn TF Yn ++ // * * \ | * | ++ // * * \ | * | ++ // * * \ | => A--LD_OP_ST ++ // * * \| \ ++ // TF OP \ ++ // * | \ Zn ++ // * | \ ++ // A-ST Zn ++ // ++ ++ // This merge induced dependences from: #1: Xn -> LD, OP, Zn ++ // #2: Yn -> LD ++ // #3: ST -> Zn ++ ++ // Ensure the transform is safe by checking for the dual ++ // dependencies to make sure we do not induce a loop. ++ ++ // As LD is a predecessor to both OP and ST we can do this by checking: ++ // a). if LD is a predecessor to a member of Xn or Yn. ++ // b). if a Zn is a predecessor to ST. ++ ++ // However, (b) can only occur through being a chain predecessor to ++ // ST, which is the same as Zn being a member or predecessor of Xn, ++ // which is a subset of LD being a predecessor of Xn. So it's ++ // subsumed by check (a). ++ + SDValue Chain = StoreNode->getChain(); + +- bool ChainCheck = false; ++ // Gather X elements in ChainOps. + if (Chain == Load.getValue(1)) { +- ChainCheck = true; +- InputChain = LoadNode->getChain(); ++ FoundLoad = true; ++ ChainOps.push_back(Load.getOperand(0)); + } else if (Chain.getOpcode() == ISD::TokenFactor) { +- SmallVector ChainOps; + for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { + SDValue Op = Chain.getOperand(i); + if (Op == Load.getValue(1)) { +- ChainCheck = true; ++ FoundLoad = true; + // Drop Load, but keep its chain. No cycle check necessary. + ChainOps.push_back(Load.getOperand(0)); + continue; + } +- +- // Make sure using Op as part of the chain would not cause a cycle here. +- // In theory, we could check whether the chain node is a predecessor of +- // the load. But that can be very expensive. Instead visit the uses and +- // make sure they all have smaller node id than the load. +- int LoadId = LoadNode->getNodeId(); +- for (SDNode::use_iterator UI = Op.getNode()->use_begin(), +- UE = UI->use_end(); UI != UE; ++UI) { +- if (UI.getUse().getResNo() != 0) +- continue; +- if (UI->getNodeId() > LoadId) +- return false; +- } +- ++ LoopWorklist.push_back(Op.getNode()); + ChainOps.push_back(Op); + } +- +- if (ChainCheck) +- // Make a new TokenFactor with all the other input chains except +- // for the load. +- InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), +- MVT::Other, ChainOps); + } +- if (!ChainCheck) ++ ++ if (!FoundLoad) ++ return false; ++ ++ // Worklist is currently Xn. Add Yn to worklist. ++ for (SDValue Op : StoredVal->ops()) ++ if (Op.getNode() != LoadNode) ++ LoopWorklist.push_back(Op.getNode()); ++ ++ // Check (a) if Load is a predecessor to Xn + Yn ++ if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, ++ true)) + return false; + ++ InputChain = ++ CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); + return true; + } + +@@ -2335,6 +2375,8 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { + MemOp[1] = LoadNode->getMemOperand(); + Result->setMemRefs(MemOp, MemOp + 2); + ++ // Update Load Chain uses as well. ++ ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); + ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); + ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); + CurDAG->RemoveDeadNode(Node); +@@ -2946,12 +2988,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { + return; + } + +- case X86ISD::CMP: +- case X86ISD::SUB: { +- // Sometimes a SUB is used to perform comparison. +- if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0)) +- // This node is not a CMP. +- break; ++ case X86ISD::CMP: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + +@@ -2971,95 +3008,52 @@ void X86DAGToDAGISel::Select(SDNode *Node) { + if (!C) break; + uint64_t Mask = C->getZExtValue(); + +- // For example, convert "testl %eax, $8" to "testb %al, $8" ++ MVT VT; ++ int SubRegOp; ++ unsigned Op; ++ + if (isUInt<8>(Mask) && + (!(Mask & 0x80) || hasNoSignedComparisonUses(Node))) { +- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i8); +- SDValue Reg = N0.getOperand(0); +- +- // Extract the l-register. +- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, +- MVT::i8, Reg); +- +- // Emit a testb. +- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, +- Subreg, Imm); +- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has +- // one, do not call ReplaceAllUsesWith. +- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), +- SDValue(NewNode, 0)); +- CurDAG->RemoveDeadNode(Node); +- return; ++ // For example, convert "testl %eax, $8" to "testb %al, $8" ++ VT = MVT::i8; ++ SubRegOp = X86::sub_8bit; ++ Op = X86::TEST8ri; ++ } else if (OptForMinSize && isUInt<16>(Mask) && ++ (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) { ++ // For example, "testl %eax, $32776" to "testw %ax, $32776". ++ // NOTE: We only want to form TESTW instructions if optimizing for ++ // min size. Otherwise we only save one byte and possibly get a length ++ // changing prefix penalty in the decoders. ++ VT = MVT::i16; ++ SubRegOp = X86::sub_16bit; ++ Op = X86::TEST16ri; ++ } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 && ++ (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) { ++ // For example, "testq %rax, $268468232" to "testl %eax, $268468232". ++ // NOTE: We only want to run that transform if N0 is 32 or 64 bits. ++ // Otherwize, we find ourselves in a position where we have to do ++ // promotion. If previous passes did not promote the and, we assume ++ // they had a good reason not to and do not promote here. ++ VT = MVT::i32; ++ SubRegOp = X86::sub_32bit; ++ Op = X86::TEST32ri; ++ } else { ++ // No eligible transformation was found. ++ break; + } + +- // For example, "testl %eax, $2048" to "testb %ah, $8". +- if (isShiftedUInt<8, 8>(Mask) && +- (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) { +- // Shift the immediate right by 8 bits. +- SDValue ShiftedImm = CurDAG->getTargetConstant(Mask >> 8, dl, MVT::i8); +- SDValue Reg = N0.getOperand(0); +- +- // Extract the h-register. +- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, +- MVT::i8, Reg); +- +- // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only +- // target GR8_NOREX registers, so make sure the register class is +- // forced. +- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, +- MVT::i32, Subreg, ShiftedImm); +- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has +- // one, do not call ReplaceAllUsesWith. +- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), +- SDValue(NewNode, 0)); +- CurDAG->RemoveDeadNode(Node); +- return; +- } ++ SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT); ++ SDValue Reg = N0.getOperand(0); + +- // For example, "testl %eax, $32776" to "testw %ax, $32776". +- // NOTE: We only want to form TESTW instructions if optimizing for +- // min size. Otherwise we only save one byte and possibly get a length +- // changing prefix penalty in the decoders. +- if (OptForMinSize && isUInt<16>(Mask) && N0.getValueType() != MVT::i16 && +- (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) { +- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i16); +- SDValue Reg = N0.getOperand(0); +- +- // Extract the 16-bit subregister. +- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, +- MVT::i16, Reg); +- +- // Emit a testw. +- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, +- Subreg, Imm); +- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has +- // one, do not call ReplaceAllUsesWith. +- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), +- SDValue(NewNode, 0)); +- CurDAG->RemoveDeadNode(Node); +- return; +- } ++ // Extract the subregister if necessary. ++ if (N0.getValueType() != VT) ++ Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg); + +- // For example, "testq %rax, $268468232" to "testl %eax, $268468232". +- if (isUInt<32>(Mask) && N0.getValueType() == MVT::i64 && +- (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) { +- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i32); +- SDValue Reg = N0.getOperand(0); +- +- // Extract the 32-bit subregister. +- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl, +- MVT::i32, Reg); +- +- // Emit a testl. +- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, +- Subreg, Imm); +- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has +- // one, do not call ReplaceAllUsesWith. +- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), +- SDValue(NewNode, 0)); +- CurDAG->RemoveDeadNode(Node); +- return; +- } ++ // Emit a testl or testw. ++ SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm); ++ // Replace CMP with TEST. ++ ReplaceNode(Node, NewNode); ++ return; + } + break; + } +diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp +index c1ddb771e2f..86e71cba87b 100644 +--- a/lib/Target/X86/X86ISelLowering.cpp ++++ b/lib/Target/X86/X86ISelLowering.cpp +@@ -8131,6 +8131,32 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { + return LD; + } + ++ // If this is a splat of pairs of 32-bit elements, we can use a narrower ++ // build_vector and broadcast it. ++ // TODO: We could probably generalize this more. ++ if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) { ++ SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), ++ DAG.getUNDEF(ExtVT), DAG.getUNDEF(ExtVT) }; ++ auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef Ops) { ++ // Make sure all the even/odd operands match. ++ for (unsigned i = 2; i != NumElems; ++i) ++ if (Ops[i % 2] != Op.getOperand(i)) ++ return false; ++ return true; ++ }; ++ if (CanSplat(Op, NumElems, Ops)) { ++ MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64; ++ MVT NarrowVT = MVT::getVectorVT(ExtVT, 4); ++ // Create a new build vector and cast to v2i64/v2f64. ++ SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2), ++ DAG.getBuildVector(NarrowVT, dl, Ops)); ++ // Broadcast from v2i64/v2f64 and cast to final VT. ++ MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2); ++ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, ++ NewBV)); ++ } ++ } ++ + // For AVX-length vectors, build the individual 128-bit pieces and use + // shuffles to put them in place. + if (VT.is256BitVector() || VT.is512BitVector()) { +diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td +index 98cc8fb7439..3d5de637da2 100644 +--- a/lib/Target/X86/X86InstrArithmetic.td ++++ b/lib/Target/X86/X86InstrArithmetic.td +@@ -1257,14 +1257,6 @@ let isCompare = 1 in { + def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>; + let Predicates = [In64BitMode] in + def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>; +- +- // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the +- // register class is constrained to GR8_NOREX. This pseudo is explicitly +- // marked side-effect free, since it doesn't have an isel pattern like +- // other test instructions. +- let isPseudo = 1, hasSideEffects = 0 in +- def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), +- "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; + } // Defs = [EFLAGS] + + def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, +diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp +index 11ada51a870..84a9200a0ef 100644 +--- a/lib/Target/X86/X86InstrInfo.cpp ++++ b/lib/Target/X86/X86InstrInfo.cpp +@@ -7854,9 +7854,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + case X86::VMOVUPSZ256mr_NOVLX: + return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr), + get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); +- case X86::TEST8ri_NOREX: +- MI.setDesc(get(X86::TEST8ri)); +- return true; + case X86::MOV32ri64: + MI.setDesc(get(X86::MOV32ri)); + return true; +diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp +index 67d95c2233d..4e11397dec4 100644 +--- a/lib/Target/X86/X86MacroFusion.cpp ++++ b/lib/Target/X86/X86MacroFusion.cpp +@@ -86,7 +86,6 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + case X86::TEST16mr: + case X86::TEST32mr: + case X86::TEST64mr: +- case X86::TEST8ri_NOREX: + case X86::AND16i16: + case X86::AND16ri: + case X86::AND16ri8: +diff --git a/test/CodeGen/SystemZ/pr36164.ll b/test/CodeGen/SystemZ/pr36164.ll +new file mode 100644 +index 00000000000..0c850091d31 +--- /dev/null ++++ b/test/CodeGen/SystemZ/pr36164.ll +@@ -0,0 +1,113 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc %s -o - -mtriple=s390x-linux-gnu -mcpu=z13 -disable-basicaa | FileCheck %s ++ ++; This test checks that we do not a reference to a deleted node. ++ ++%0 = type { i32 } ++ ++@g_11 = external dso_local unnamed_addr global i1, align 4 ++@g_69 = external dso_local global i32, align 4 ++@g_73 = external dso_local unnamed_addr global i32, align 4 ++@g_832 = external dso_local constant %0, align 4 ++@g_938 = external dso_local unnamed_addr global i64, align 8 ++ ++; Function Attrs: nounwind ++define void @main() local_unnamed_addr #0 { ++; CHECK-LABEL: main: ++; CHECK: # %bb.0: ++; CHECK-NEXT: stmg %r12, %r15, 96(%r15) ++; CHECK-NEXT: .cfi_offset %r12, -64 ++; CHECK-NEXT: .cfi_offset %r13, -56 ++; CHECK-NEXT: .cfi_offset %r14, -48 ++; CHECK-NEXT: .cfi_offset %r15, -40 ++; CHECK-NEXT: lhi %r0, 1 ++; CHECK-NEXT: larl %r1, g_938 ++; CHECK-NEXT: lhi %r2, 2 ++; CHECK-NEXT: lhi %r3, 3 ++; CHECK-NEXT: lhi %r4, 0 ++; CHECK-NEXT: lhi %r5, 4 ++; CHECK-NEXT: larl %r14, g_11 ++; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: strl %r0, g_73 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: strl %r0, g_69 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lghi %r13, 24 ++; CHECK-NEXT: strl %r2, g_69 ++; CHECK-NEXT: ag %r13, 0(%r1) ++; CHECK-NEXT: lrl %r12, g_832 ++; CHECK-NEXT: strl %r3, g_69 ++; CHECK-NEXT: lrl %r12, g_832 ++; CHECK-NEXT: strl %r4, g_69 ++; CHECK-NEXT: lrl %r12, g_832 ++; CHECK-NEXT: strl %r0, g_69 ++; CHECK-NEXT: lrl %r12, g_832 ++; CHECK-NEXT: strl %r2, g_69 ++; CHECK-NEXT: lrl %r12, g_832 ++; CHECK-NEXT: strl %r3, g_69 ++; CHECK-NEXT: stgrl %r13, g_938 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: strl %r5, g_69 ++; CHECK-NEXT: mvi 0(%r14), 1 ++; CHECK-NEXT: j .LBB0_1 ++ br label %1 ++ ++;