| //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===// | 
 | // | 
 | //                     The LLVM Compiler Infrastructure | 
 | // | 
 | // This file is distributed under the University of Illinois Open Source | 
 | // License. See LICENSE.TXT for details. | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 | // | 
 | // This file defines the interfaces that NVPTX uses to lower LLVM code into a | 
 | // selection DAG. | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | #include "NVPTXISelLowering.h" | 
 | #include "MCTargetDesc/NVPTXBaseInfo.h" | 
 | #include "NVPTX.h" | 
 | #include "NVPTXSubtarget.h" | 
 | #include "NVPTXTargetMachine.h" | 
 | #include "NVPTXTargetObjectFile.h" | 
 | #include "NVPTXUtilities.h" | 
 | #include "llvm/ADT/APInt.h" | 
 | #include "llvm/ADT/SmallVector.h" | 
 | #include "llvm/ADT/StringRef.h" | 
 | #include "llvm/CodeGen/Analysis.h" | 
 | #include "llvm/CodeGen/MachineFunction.h" | 
 | #include "llvm/CodeGen/MachineMemOperand.h" | 
 | #include "llvm/CodeGen/SelectionDAG.h" | 
 | #include "llvm/CodeGen/SelectionDAGNodes.h" | 
 | #include "llvm/CodeGen/TargetCallingConv.h" | 
 | #include "llvm/CodeGen/TargetLowering.h" | 
 | #include "llvm/CodeGen/ValueTypes.h" | 
 | #include "llvm/IR/Argument.h" | 
 | #include "llvm/IR/Attributes.h" | 
 | #include "llvm/IR/CallSite.h" | 
 | #include "llvm/IR/Constants.h" | 
 | #include "llvm/IR/DataLayout.h" | 
 | #include "llvm/IR/DerivedTypes.h" | 
 | #include "llvm/IR/Function.h" | 
 | #include "llvm/IR/GlobalValue.h" | 
 | #include "llvm/IR/Instruction.h" | 
 | #include "llvm/IR/Instructions.h" | 
 | #include "llvm/IR/Module.h" | 
 | #include "llvm/IR/Type.h" | 
 | #include "llvm/IR/Value.h" | 
 | #include "llvm/Support/Casting.h" | 
 | #include "llvm/Support/CodeGen.h" | 
 | #include "llvm/Support/CommandLine.h" | 
 | #include "llvm/Support/ErrorHandling.h" | 
 | #include "llvm/Support/MachineValueType.h" | 
 | #include "llvm/Support/MathExtras.h" | 
 | #include "llvm/Support/raw_ostream.h" | 
 | #include "llvm/Target/TargetMachine.h" | 
 | #include "llvm/Target/TargetOptions.h" | 
 | #include <algorithm> | 
 | #include <cassert> | 
 | #include <cstdint> | 
 | #include <iterator> | 
 | #include <sstream> | 
 | #include <string> | 
 | #include <utility> | 
 | #include <vector> | 
 |  | 
 | #define DEBUG_TYPE "nvptx-lower" | 
 |  | 
 | using namespace llvm; | 
 |  | 
 | static unsigned int uniqueCallSite = 0; | 
 |  | 
 | static cl::opt<bool> sched4reg( | 
 |     "nvptx-sched4reg", | 
 |     cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false)); | 
 |  | 
 | static cl::opt<unsigned> | 
 | FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, | 
 |                     cl::desc("NVPTX Specific: FMA contraction (0: don't do it" | 
 |                              " 1: do it  2: do it aggressively"), | 
 |                     cl::init(2)); | 
 |  | 
 | static cl::opt<int> UsePrecDivF32( | 
 |     "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden, | 
 |     cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" | 
 |              " IEEE Compliant F32 div.rnd if available."), | 
 |     cl::init(2)); | 
 |  | 
 | static cl::opt<bool> UsePrecSqrtF32( | 
 |     "nvptx-prec-sqrtf32", cl::Hidden, | 
 |     cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), | 
 |     cl::init(true)); | 
 |  | 
 | static cl::opt<bool> FtzEnabled( | 
 |     "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden, | 
 |     cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."), | 
 |     cl::init(false)); | 
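
// Illustrative only: these cl::opt flags are normally passed to llc on the
// command line. The exact invocation below (triple, sm version, file names)
// is an assumed example, not a required configuration:
//   llc -march=nvptx64 -mcpu=sm_60 -nvptx-fma-level=1 -nvptx-prec-divf32=0 \
//       -nvptx-prec-sqrtf32=false -nvptx-f32ftz=true input.ll -o output.ptx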
 |  | 
 | int NVPTXTargetLowering::getDivF32Level() const { | 
 |   if (UsePrecDivF32.getNumOccurrences() > 0) { | 
    // If nvptx-prec-divf32=N is used on the command-line, always honor it
 |     return UsePrecDivF32; | 
 |   } else { | 
 |     // Otherwise, use div.approx if fast math is enabled | 
 |     if (getTargetMachine().Options.UnsafeFPMath) | 
 |       return 0; | 
 |     else | 
 |       return 2; | 
 |   } | 
 | } | 
 |  | 
 | bool NVPTXTargetLowering::usePrecSqrtF32() const { | 
 |   if (UsePrecSqrtF32.getNumOccurrences() > 0) { | 
 |     // If nvptx-prec-sqrtf32 is used on the command-line, always honor it | 
 |     return UsePrecSqrtF32; | 
 |   } else { | 
 |     // Otherwise, use sqrt.approx if fast math is enabled | 
 |     return !getTargetMachine().Options.UnsafeFPMath; | 
 |   } | 
 | } | 
 |  | 
 | bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const { | 
 |   // TODO: Get rid of this flag; there can be only one way to do this. | 
 |   if (FtzEnabled.getNumOccurrences() > 0) { | 
 |     // If nvptx-f32ftz is used on the command-line, always honor it | 
 |     return FtzEnabled; | 
 |   } else { | 
 |     const Function &F = MF.getFunction(); | 
 |     // Otherwise, check for an nvptx-f32ftz attribute on the function | 
 |     if (F.hasFnAttribute("nvptx-f32ftz")) | 
 |       return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true"; | 
 |     else | 
 |       return false; | 
 |   } | 
 | } | 
 |  | 
 | static bool IsPTXVectorType(MVT VT) { | 
 |   switch (VT.SimpleTy) { | 
 |   default: | 
 |     return false; | 
 |   case MVT::v2i1: | 
 |   case MVT::v4i1: | 
 |   case MVT::v2i8: | 
 |   case MVT::v4i8: | 
 |   case MVT::v2i16: | 
 |   case MVT::v4i16: | 
 |   case MVT::v2i32: | 
 |   case MVT::v4i32: | 
 |   case MVT::v2i64: | 
 |   case MVT::v2f16: | 
 |   case MVT::v4f16: | 
 |   case MVT::v8f16: // <4 x f16x2> | 
 |   case MVT::v2f32: | 
 |   case MVT::v4f32: | 
 |   case MVT::v2f64: | 
 |     return true; | 
 |   } | 
 | } | 
 |  | 
 | /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive | 
 | /// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors | 
 | /// into their primitive components. | 
 | /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the | 
 | /// same number of types as the Ins/Outs arrays in LowerFormalArguments, | 
 | /// LowerCall, and LowerReturn. | 
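///
/// For instance, following the logic below: a <4 x float> parameter is
/// flattened to {f32, f32, f32, f32} at byte offsets {0, 4, 8, 12}; a
/// <4 x half> becomes {v2f16, v2f16} at offsets {0, 4}, since even-sized f16
/// vectors are kept as v2f16 pairs; and an i128 is decomposed into
/// {i64, i64} at offsets {0, 8}.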
 | static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, | 
 |                                Type *Ty, SmallVectorImpl<EVT> &ValueVTs, | 
 |                                SmallVectorImpl<uint64_t> *Offsets = nullptr, | 
 |                                uint64_t StartingOffset = 0) { | 
 |   SmallVector<EVT, 16> TempVTs; | 
 |   SmallVector<uint64_t, 16> TempOffsets; | 
 |  | 
 |   // Special case for i128 - decompose to (i64, i64) | 
 |   if (Ty->isIntegerTy(128)) { | 
 |     ValueVTs.push_back(EVT(MVT::i64)); | 
 |     ValueVTs.push_back(EVT(MVT::i64)); | 
 |  | 
 |     if (Offsets) { | 
 |       Offsets->push_back(StartingOffset + 0); | 
 |       Offsets->push_back(StartingOffset + 8); | 
 |     } | 
 |  | 
 |     return; | 
 |   } | 
 |  | 
 |   ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); | 
 |   for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { | 
 |     EVT VT = TempVTs[i]; | 
 |     uint64_t Off = TempOffsets[i]; | 
 |     // Split vectors into individual elements, except for v2f16, which | 
 |     // we will pass as a single scalar. | 
 |     if (VT.isVector()) { | 
 |       unsigned NumElts = VT.getVectorNumElements(); | 
 |       EVT EltVT = VT.getVectorElementType(); | 
 |       // Vectors with an even number of f16 elements will be passed to | 
 |       // us as an array of v2f16 elements. We must match this so we | 
 |       // stay in sync with Ins/Outs. | 
 |       if (EltVT == MVT::f16 && NumElts % 2 == 0) { | 
 |         EltVT = MVT::v2f16; | 
 |         NumElts /= 2; | 
 |       } | 
 |       for (unsigned j = 0; j != NumElts; ++j) { | 
 |         ValueVTs.push_back(EltVT); | 
 |         if (Offsets) | 
 |           Offsets->push_back(Off + j * EltVT.getStoreSize()); | 
 |       } | 
 |     } else { | 
 |       ValueVTs.push_back(VT); | 
 |       if (Offsets) | 
 |         Offsets->push_back(Off); | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | // Check whether we can merge loads/stores of some of the pieces of a | 
 | // flattened function parameter or return value into a single vector | 
 | // load/store. | 
 | // | 
 | // The flattened parameter is represented as a list of EVTs and | 
 | // offsets, and the whole structure is aligned to ParamAlignment. This | 
 | // function determines whether we can load/store pieces of the | 
 | // parameter starting at index Idx using a single vectorized op of | 
 | // size AccessSize. If so, it returns the number of param pieces | 
 | // covered by the vector op. Otherwise, it returns 1. | 
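//
// Worked example (derived from the checks below): with
// ValueVTs = {f32, f32, f32, f32}, Offsets = {0, 4, 8, 12} and
// ParamAlignment = 16, a query at Idx = 0 with AccessSize = 16 returns 4
// (one 16-byte access covers all four pieces). With ParamAlignment = 8 the
// same query returns 1, because the parameter is not aligned enough for a
// 16-byte access.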
 | static unsigned CanMergeParamLoadStoresStartingAt( | 
 |     unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs, | 
 |     const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) { | 
 |   assert(isPowerOf2_32(AccessSize) && "must be a power of 2!"); | 
 |  | 
 |   // Can't vectorize if param alignment is not sufficient. | 
 |   if (AccessSize > ParamAlignment) | 
 |     return 1; | 
 |   // Can't vectorize if offset is not aligned. | 
 |   if (Offsets[Idx] & (AccessSize - 1)) | 
 |     return 1; | 
 |  | 
 |   EVT EltVT = ValueVTs[Idx]; | 
 |   unsigned EltSize = EltVT.getStoreSize(); | 
 |  | 
 |   // Element is too large to vectorize. | 
 |   if (EltSize >= AccessSize) | 
 |     return 1; | 
 |  | 
 |   unsigned NumElts = AccessSize / EltSize; | 
  // Can't vectorize if AccessSize is not a multiple of EltSize.
 |   if (AccessSize != EltSize * NumElts) | 
 |     return 1; | 
 |  | 
 |   // We don't have enough elements to vectorize. | 
 |   if (Idx + NumElts > ValueVTs.size()) | 
 |     return 1; | 
 |  | 
 |   // PTX ISA can only deal with 2- and 4-element vector ops. | 
 |   if (NumElts != 4 && NumElts != 2) | 
 |     return 1; | 
 |  | 
 |   for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) { | 
 |     // Types do not match. | 
 |     if (ValueVTs[j] != EltVT) | 
 |       return 1; | 
 |  | 
 |     // Elements are not contiguous. | 
 |     if (Offsets[j] - Offsets[j - 1] != EltSize) | 
 |       return 1; | 
 |   } | 
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
 |   return NumElts; | 
 | } | 
 |  | 
 | // Flags for tracking per-element vectorization state of loads/stores | 
 | // of a flattened function parameter or return value. | 
 | enum ParamVectorizationFlags { | 
 |   PVF_INNER = 0x0, // Middle elements of a vector. | 
 |   PVF_FIRST = 0x1, // First element of the vector. | 
 |   PVF_LAST = 0x2,  // Last element of the vector. | 
 |   // Scalar is effectively a 1-element vector. | 
 |   PVF_SCALAR = PVF_FIRST | PVF_LAST | 
 | }; | 
 |  | 
 | // Computes whether and how we can vectorize the loads/stores of a | 
 | // flattened function parameter or return value. | 
 | // | 
 | // The flattened parameter is represented as the list of ValueVTs and | 
 | // Offsets, and is aligned to ParamAlignment bytes. We return a vector | 
 | // of the same size as ValueVTs indicating how each piece should be | 
 | // loaded/stored (i.e. as a scalar, or as part of a vector | 
 | // load/store). | 
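//
// For example, for ValueVTs = {f32, f32, f32, f32, f32} at
// Offsets = {0, 4, 8, 12, 16} with ParamAlignment = 16, the first four
// pieces are marked {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST} (one
// 128-bit access) and the trailing piece stays PVF_SCALAR.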
 | static SmallVector<ParamVectorizationFlags, 16> | 
 | VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs, | 
 |                      const SmallVectorImpl<uint64_t> &Offsets, | 
 |                      unsigned ParamAlignment) { | 
 |   // Set vector size to match ValueVTs and mark all elements as | 
 |   // scalars by default. | 
 |   SmallVector<ParamVectorizationFlags, 16> VectorInfo; | 
 |   VectorInfo.assign(ValueVTs.size(), PVF_SCALAR); | 
 |  | 
  // Check what we can vectorize using 128/64/32/16-bit accesses.
 |   for (int I = 0, E = ValueVTs.size(); I != E; ++I) { | 
 |     // Skip elements we've already processed. | 
 |     assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state."); | 
 |     for (unsigned AccessSize : {16, 8, 4, 2}) { | 
 |       unsigned NumElts = CanMergeParamLoadStoresStartingAt( | 
 |           I, AccessSize, ValueVTs, Offsets, ParamAlignment); | 
 |       // Mark vectorized elements. | 
 |       switch (NumElts) { | 
 |       default: | 
 |         llvm_unreachable("Unexpected return value"); | 
 |       case 1: | 
 |         // Can't vectorize using this size, try next smaller size. | 
 |         continue; | 
 |       case 2: | 
 |         assert(I + 1 < E && "Not enough elements."); | 
 |         VectorInfo[I] = PVF_FIRST; | 
 |         VectorInfo[I + 1] = PVF_LAST; | 
 |         I += 1; | 
 |         break; | 
 |       case 4: | 
 |         assert(I + 3 < E && "Not enough elements."); | 
 |         VectorInfo[I] = PVF_FIRST; | 
 |         VectorInfo[I + 1] = PVF_INNER; | 
 |         VectorInfo[I + 2] = PVF_INNER; | 
 |         VectorInfo[I + 3] = PVF_LAST; | 
 |         I += 3; | 
 |         break; | 
 |       } | 
      // Break out of the inner loop because we've already succeeded
      // using the largest possible AccessSize.
 |       break; | 
 |     } | 
 |   } | 
 |   return VectorInfo; | 
 | } | 
 |  | 
 | // NVPTXTargetLowering Constructor. | 
 | NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, | 
 |                                          const NVPTXSubtarget &STI) | 
 |     : TargetLowering(TM), nvTM(&TM), STI(STI) { | 
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
 |   MaxStoresPerMemset = (unsigned) 0xFFFFFFFF; | 
 |   MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF; | 
 |   MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF; | 
 |  | 
 |   setBooleanContents(ZeroOrNegativeOneBooleanContent); | 
 |   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); | 
 |  | 
 |   // Jump is Expensive. Don't create extra control flow for 'and', 'or' | 
 |   // condition branches. | 
 |   setJumpIsExpensive(true); | 
 |  | 
 |   // Wide divides are _very_ slow. Try to reduce the width of the divide if | 
 |   // possible. | 
 |   addBypassSlowDiv(64, 32); | 
 |  | 
  // By default, use Source scheduling.
 |   if (sched4reg) | 
 |     setSchedulingPreference(Sched::RegPressure); | 
 |   else | 
 |     setSchedulingPreference(Sched::Source); | 
 |  | 
 |   auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, | 
 |                                     LegalizeAction NoF16Action) { | 
 |     setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action); | 
 |   }; | 
 |  | 
 |   addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); | 
 |   addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); | 
 |   addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); | 
 |   addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass); | 
 |   addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); | 
 |   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); | 
 |   addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass); | 
 |   addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass); | 
 |  | 
 |   // Conversion to/from FP16/FP16x2 is always legal. | 
 |   setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal); | 
 |   setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal); | 
 |   setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); | 
 |   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); | 
 |   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand); | 
 |   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand); | 
 |  | 
 |   setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote); | 
 |   setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand); | 
 |  | 
 |   // Operations not directly supported by NVPTX. | 
 |   for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8, | 
 |                  MVT::i16, MVT::i32, MVT::i64}) { | 
 |     setOperationAction(ISD::SELECT_CC, VT, Expand); | 
 |     setOperationAction(ISD::BR_CC, VT, Expand); | 
 |   } | 
 |  | 
 |   // Some SIGN_EXTEND_INREG can be done using cvt instruction. | 
 |   // For others we will expand to a SHL/SRA pair. | 
 |   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); | 
 |   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); | 
 |   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); | 
 |   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); | 
 |   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); | 
 |  | 
 |   setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom); | 
 |   setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom); | 
 |   setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom); | 
 |   setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom); | 
 |   setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom); | 
 |   setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom); | 
 |  | 
 |   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); | 
 |   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); | 
 |  | 
 |   // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs | 
 |   // that don't have h/w rotation we lower them to multi-instruction assembly. | 
  // See ROT*_sw in NVPTXInstrInfo.td.
 |   setOperationAction(ISD::ROTL, MVT::i64, Legal); | 
 |   setOperationAction(ISD::ROTR, MVT::i64, Legal); | 
 |   setOperationAction(ISD::ROTL, MVT::i32, Legal); | 
 |   setOperationAction(ISD::ROTR, MVT::i32, Legal); | 
 |  | 
 |   setOperationAction(ISD::ROTL, MVT::i16, Expand); | 
 |   setOperationAction(ISD::ROTR, MVT::i16, Expand); | 
 |   setOperationAction(ISD::ROTL, MVT::i8, Expand); | 
 |   setOperationAction(ISD::ROTR, MVT::i8, Expand); | 
 |   setOperationAction(ISD::BSWAP, MVT::i16, Expand); | 
 |   setOperationAction(ISD::BSWAP, MVT::i32, Expand); | 
 |   setOperationAction(ISD::BSWAP, MVT::i64, Expand); | 
 |  | 
 |   // Indirect branch is not supported. | 
 |   // This also disables Jump Table creation. | 
 |   setOperationAction(ISD::BR_JT, MVT::Other, Expand); | 
 |   setOperationAction(ISD::BRIND, MVT::Other, Expand); | 
 |  | 
 |   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); | 
 |   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); | 
 |  | 
  // We want to legalize constant-related memmove and memcpy intrinsics.
 |   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); | 
 |  | 
 |   // Turn FP extload into load/fpextend | 
 |   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); | 
 |   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); | 
 |   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); | 
 |   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); | 
 |   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); | 
 |   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); | 
 |   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); | 
 |   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); | 
 |   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); | 
 |   // Turn FP truncstore into trunc + store. | 
 |   // FIXME: vector types should also be expanded | 
 |   setTruncStoreAction(MVT::f32, MVT::f16, Expand); | 
 |   setTruncStoreAction(MVT::f64, MVT::f16, Expand); | 
 |   setTruncStoreAction(MVT::f64, MVT::f32, Expand); | 
 |  | 
 |   // PTX does not support load / store predicate registers | 
 |   setOperationAction(ISD::LOAD, MVT::i1, Custom); | 
 |   setOperationAction(ISD::STORE, MVT::i1, Custom); | 
 |  | 
 |   for (MVT VT : MVT::integer_valuetypes()) { | 
 |     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); | 
 |     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); | 
 |     setTruncStoreAction(VT, MVT::i1, Expand); | 
 |   } | 
 |  | 
 |   // This is legal in NVPTX | 
 |   setOperationAction(ISD::ConstantFP, MVT::f64, Legal); | 
 |   setOperationAction(ISD::ConstantFP, MVT::f32, Legal); | 
 |   setOperationAction(ISD::ConstantFP, MVT::f16, Legal); | 
 |  | 
 |   // TRAP can be lowered to PTX trap | 
 |   setOperationAction(ISD::TRAP, MVT::Other, Legal); | 
 |  | 
 |   // Register custom handling for vector loads/stores | 
 |   for (MVT VT : MVT::vector_valuetypes()) { | 
 |     if (IsPTXVectorType(VT)) { | 
 |       setOperationAction(ISD::LOAD, VT, Custom); | 
 |       setOperationAction(ISD::STORE, VT, Custom); | 
 |       setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); | 
 |     } | 
 |   } | 
 |  | 
 |   // Custom handling for i8 intrinsics | 
 |   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); | 
 |  | 
 |   for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) { | 
 |     setOperationAction(ISD::ABS,  Ty, Legal); | 
 |     setOperationAction(ISD::SMIN, Ty, Legal); | 
 |     setOperationAction(ISD::SMAX, Ty, Legal); | 
 |     setOperationAction(ISD::UMIN, Ty, Legal); | 
 |     setOperationAction(ISD::UMAX, Ty, Legal); | 
 |  | 
 |     setOperationAction(ISD::CTPOP, Ty, Legal); | 
 |     setOperationAction(ISD::CTLZ, Ty, Legal); | 
 |   } | 
 |  | 
 |   setOperationAction(ISD::CTTZ, MVT::i16, Expand); | 
 |   setOperationAction(ISD::CTTZ, MVT::i32, Expand); | 
 |   setOperationAction(ISD::CTTZ, MVT::i64, Expand); | 
 |  | 
 |   // PTX does not directly support SELP of i1, so promote to i32 first | 
 |   setOperationAction(ISD::SELECT, MVT::i1, Custom); | 
 |  | 
 |   // PTX cannot multiply two i64s in a single instruction. | 
 |   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); | 
 |   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); | 
 |  | 
 |   // We have some custom DAG combine patterns for these nodes | 
 |   setTargetDAGCombine(ISD::ADD); | 
 |   setTargetDAGCombine(ISD::AND); | 
 |   setTargetDAGCombine(ISD::FADD); | 
 |   setTargetDAGCombine(ISD::MUL); | 
 |   setTargetDAGCombine(ISD::SHL); | 
 |   setTargetDAGCombine(ISD::SREM); | 
 |   setTargetDAGCombine(ISD::UREM); | 
 |  | 
 |   // setcc for f16x2 needs special handling to prevent legalizer's | 
 |   // attempt to scalarize it due to v2i1 not being legal. | 
 |   if (STI.allowFP16Math()) | 
 |     setTargetDAGCombine(ISD::SETCC); | 
 |  | 
 |   // Promote fp16 arithmetic if fp16 hardware isn't available or the | 
 |   // user passed --nvptx-no-fp16-math. The flag is useful because, | 
 |   // although sm_53+ GPUs have some sort of FP16 support in | 
 |   // hardware, only sm_53 and sm_60 have full implementation. Others | 
 |   // only have token amount of hardware and are likely to run faster | 
 |   // by using fp32 units instead. | 
 |   for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) { | 
 |     setFP16OperationAction(Op, MVT::f16, Legal, Promote); | 
 |     setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); | 
 |   } | 
 |  | 
 |   // There's no neg.f16 instruction. Expand to (0-x). | 
 |   setOperationAction(ISD::FNEG, MVT::f16, Expand); | 
 |   setOperationAction(ISD::FNEG, MVT::v2f16, Expand); | 
 |  | 
 |   // (would be) Library functions. | 
 |  | 
 |   // These map to conversion instructions for scalar FP types. | 
 |   for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, | 
 |                          ISD::FROUND, ISD::FTRUNC}) { | 
 |     setOperationAction(Op, MVT::f16, Legal); | 
 |     setOperationAction(Op, MVT::f32, Legal); | 
 |     setOperationAction(Op, MVT::f64, Legal); | 
 |     setOperationAction(Op, MVT::v2f16, Expand); | 
 |   } | 
 |  | 
 |   // 'Expand' implements FCOPYSIGN without calling an external library. | 
 |   setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); | 
 |   setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); | 
 |   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); | 
 |   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); | 
 |  | 
 |   // These map to corresponding instructions for f32/f64. f16 must be | 
 |   // promoted to f32. v2f16 is expanded to f16, which is then promoted | 
 |   // to f32. | 
 |   for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, | 
 |                          ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) { | 
 |     setOperationAction(Op, MVT::f16, Promote); | 
 |     setOperationAction(Op, MVT::f32, Legal); | 
 |     setOperationAction(Op, MVT::f64, Legal); | 
 |     setOperationAction(Op, MVT::v2f16, Expand); | 
 |   } | 
 |   setOperationAction(ISD::FMINNUM, MVT::f16, Promote); | 
 |   setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); | 
 |   setOperationAction(ISD::FMINNAN, MVT::f16, Promote); | 
 |   setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); | 
 |  | 
 |   // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate. | 
 |   // No FPOW or FREM in PTX. | 
 |  | 
  // Now deduce the information based on the above-mentioned actions.
 |   computeRegisterProperties(STI.getRegisterInfo()); | 
 | } | 
 |  | 
 | const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { | 
 |   switch ((NVPTXISD::NodeType)Opcode) { | 
 |   case NVPTXISD::FIRST_NUMBER: | 
 |     break; | 
 |   case NVPTXISD::CALL: | 
 |     return "NVPTXISD::CALL"; | 
 |   case NVPTXISD::RET_FLAG: | 
 |     return "NVPTXISD::RET_FLAG"; | 
 |   case NVPTXISD::LOAD_PARAM: | 
 |     return "NVPTXISD::LOAD_PARAM"; | 
 |   case NVPTXISD::Wrapper: | 
 |     return "NVPTXISD::Wrapper"; | 
 |   case NVPTXISD::DeclareParam: | 
 |     return "NVPTXISD::DeclareParam"; | 
 |   case NVPTXISD::DeclareScalarParam: | 
 |     return "NVPTXISD::DeclareScalarParam"; | 
 |   case NVPTXISD::DeclareRet: | 
 |     return "NVPTXISD::DeclareRet"; | 
 |   case NVPTXISD::DeclareScalarRet: | 
 |     return "NVPTXISD::DeclareScalarRet"; | 
 |   case NVPTXISD::DeclareRetParam: | 
 |     return "NVPTXISD::DeclareRetParam"; | 
 |   case NVPTXISD::PrintCall: | 
 |     return "NVPTXISD::PrintCall"; | 
 |   case NVPTXISD::PrintConvergentCall: | 
 |     return "NVPTXISD::PrintConvergentCall"; | 
 |   case NVPTXISD::PrintCallUni: | 
 |     return "NVPTXISD::PrintCallUni"; | 
 |   case NVPTXISD::PrintConvergentCallUni: | 
 |     return "NVPTXISD::PrintConvergentCallUni"; | 
 |   case NVPTXISD::LoadParam: | 
 |     return "NVPTXISD::LoadParam"; | 
 |   case NVPTXISD::LoadParamV2: | 
 |     return "NVPTXISD::LoadParamV2"; | 
 |   case NVPTXISD::LoadParamV4: | 
 |     return "NVPTXISD::LoadParamV4"; | 
 |   case NVPTXISD::StoreParam: | 
 |     return "NVPTXISD::StoreParam"; | 
 |   case NVPTXISD::StoreParamV2: | 
 |     return "NVPTXISD::StoreParamV2"; | 
 |   case NVPTXISD::StoreParamV4: | 
 |     return "NVPTXISD::StoreParamV4"; | 
 |   case NVPTXISD::StoreParamS32: | 
 |     return "NVPTXISD::StoreParamS32"; | 
 |   case NVPTXISD::StoreParamU32: | 
 |     return "NVPTXISD::StoreParamU32"; | 
 |   case NVPTXISD::CallArgBegin: | 
 |     return "NVPTXISD::CallArgBegin"; | 
 |   case NVPTXISD::CallArg: | 
 |     return "NVPTXISD::CallArg"; | 
 |   case NVPTXISD::LastCallArg: | 
 |     return "NVPTXISD::LastCallArg"; | 
 |   case NVPTXISD::CallArgEnd: | 
 |     return "NVPTXISD::CallArgEnd"; | 
 |   case NVPTXISD::CallVoid: | 
 |     return "NVPTXISD::CallVoid"; | 
 |   case NVPTXISD::CallVal: | 
 |     return "NVPTXISD::CallVal"; | 
 |   case NVPTXISD::CallSymbol: | 
 |     return "NVPTXISD::CallSymbol"; | 
 |   case NVPTXISD::Prototype: | 
 |     return "NVPTXISD::Prototype"; | 
 |   case NVPTXISD::MoveParam: | 
 |     return "NVPTXISD::MoveParam"; | 
 |   case NVPTXISD::StoreRetval: | 
 |     return "NVPTXISD::StoreRetval"; | 
 |   case NVPTXISD::StoreRetvalV2: | 
 |     return "NVPTXISD::StoreRetvalV2"; | 
 |   case NVPTXISD::StoreRetvalV4: | 
 |     return "NVPTXISD::StoreRetvalV4"; | 
 |   case NVPTXISD::PseudoUseParam: | 
 |     return "NVPTXISD::PseudoUseParam"; | 
 |   case NVPTXISD::RETURN: | 
 |     return "NVPTXISD::RETURN"; | 
 |   case NVPTXISD::CallSeqBegin: | 
 |     return "NVPTXISD::CallSeqBegin"; | 
 |   case NVPTXISD::CallSeqEnd: | 
 |     return "NVPTXISD::CallSeqEnd"; | 
 |   case NVPTXISD::CallPrototype: | 
 |     return "NVPTXISD::CallPrototype"; | 
 |   case NVPTXISD::LoadV2: | 
 |     return "NVPTXISD::LoadV2"; | 
 |   case NVPTXISD::LoadV4: | 
 |     return "NVPTXISD::LoadV4"; | 
 |   case NVPTXISD::LDGV2: | 
 |     return "NVPTXISD::LDGV2"; | 
 |   case NVPTXISD::LDGV4: | 
 |     return "NVPTXISD::LDGV4"; | 
 |   case NVPTXISD::LDUV2: | 
 |     return "NVPTXISD::LDUV2"; | 
 |   case NVPTXISD::LDUV4: | 
 |     return "NVPTXISD::LDUV4"; | 
 |   case NVPTXISD::StoreV2: | 
 |     return "NVPTXISD::StoreV2"; | 
 |   case NVPTXISD::StoreV4: | 
 |     return "NVPTXISD::StoreV4"; | 
 |   case NVPTXISD::FUN_SHFL_CLAMP: | 
 |     return "NVPTXISD::FUN_SHFL_CLAMP"; | 
 |   case NVPTXISD::FUN_SHFR_CLAMP: | 
 |     return "NVPTXISD::FUN_SHFR_CLAMP"; | 
 |   case NVPTXISD::IMAD: | 
 |     return "NVPTXISD::IMAD"; | 
 |   case NVPTXISD::SETP_F16X2: | 
 |     return "NVPTXISD::SETP_F16X2"; | 
 |   case NVPTXISD::Dummy: | 
 |     return "NVPTXISD::Dummy"; | 
 |   case NVPTXISD::MUL_WIDE_SIGNED: | 
 |     return "NVPTXISD::MUL_WIDE_SIGNED"; | 
 |   case NVPTXISD::MUL_WIDE_UNSIGNED: | 
 |     return "NVPTXISD::MUL_WIDE_UNSIGNED"; | 
 |   case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32"; | 
 |   case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat"; | 
 |   case NVPTXISD::Tex1DFloatFloatLevel: | 
 |     return "NVPTXISD::Tex1DFloatFloatLevel"; | 
 |   case NVPTXISD::Tex1DFloatFloatGrad: | 
 |     return "NVPTXISD::Tex1DFloatFloatGrad"; | 
 |   case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32"; | 
 |   case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float"; | 
 |   case NVPTXISD::Tex1DS32FloatLevel: | 
 |     return "NVPTXISD::Tex1DS32FloatLevel"; | 
 |   case NVPTXISD::Tex1DS32FloatGrad: | 
 |     return "NVPTXISD::Tex1DS32FloatGrad"; | 
 |   case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32"; | 
 |   case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float"; | 
 |   case NVPTXISD::Tex1DU32FloatLevel: | 
 |     return "NVPTXISD::Tex1DU32FloatLevel"; | 
 |   case NVPTXISD::Tex1DU32FloatGrad: | 
 |     return "NVPTXISD::Tex1DU32FloatGrad"; | 
 |   case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32"; | 
 |   case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; | 
 |   case NVPTXISD::Tex1DArrayFloatFloatLevel: | 
 |     return "NVPTXISD::Tex1DArrayFloatFloatLevel"; | 
 |   case NVPTXISD::Tex1DArrayFloatFloatGrad: | 
 |     return "NVPTXISD::Tex1DArrayFloatFloatGrad"; | 
 |   case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32"; | 
 |   case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float"; | 
 |   case NVPTXISD::Tex1DArrayS32FloatLevel: | 
 |     return "NVPTXISD::Tex1DArrayS32FloatLevel"; | 
 |   case NVPTXISD::Tex1DArrayS32FloatGrad: | 
 |     return "NVPTXISD::Tex1DArrayS32FloatGrad"; | 
 |   case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32"; | 
 |   case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float"; | 
 |   case NVPTXISD::Tex1DArrayU32FloatLevel: | 
 |     return "NVPTXISD::Tex1DArrayU32FloatLevel"; | 
 |   case NVPTXISD::Tex1DArrayU32FloatGrad: | 
 |     return "NVPTXISD::Tex1DArrayU32FloatGrad"; | 
 |   case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32"; | 
 |   case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat"; | 
 |   case NVPTXISD::Tex2DFloatFloatLevel: | 
 |     return "NVPTXISD::Tex2DFloatFloatLevel"; | 
 |   case NVPTXISD::Tex2DFloatFloatGrad: | 
 |     return "NVPTXISD::Tex2DFloatFloatGrad"; | 
 |   case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32"; | 
 |   case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float"; | 
 |   case NVPTXISD::Tex2DS32FloatLevel: | 
 |     return "NVPTXISD::Tex2DS32FloatLevel"; | 
 |   case NVPTXISD::Tex2DS32FloatGrad: | 
 |     return "NVPTXISD::Tex2DS32FloatGrad"; | 
 |   case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32"; | 
 |   case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float"; | 
 |   case NVPTXISD::Tex2DU32FloatLevel: | 
 |     return "NVPTXISD::Tex2DU32FloatLevel"; | 
 |   case NVPTXISD::Tex2DU32FloatGrad: | 
 |     return "NVPTXISD::Tex2DU32FloatGrad"; | 
 |   case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32"; | 
 |   case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; | 
 |   case NVPTXISD::Tex2DArrayFloatFloatLevel: | 
 |     return "NVPTXISD::Tex2DArrayFloatFloatLevel"; | 
 |   case NVPTXISD::Tex2DArrayFloatFloatGrad: | 
 |     return "NVPTXISD::Tex2DArrayFloatFloatGrad"; | 
 |   case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32"; | 
 |   case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float"; | 
 |   case NVPTXISD::Tex2DArrayS32FloatLevel: | 
 |     return "NVPTXISD::Tex2DArrayS32FloatLevel"; | 
 |   case NVPTXISD::Tex2DArrayS32FloatGrad: | 
 |     return "NVPTXISD::Tex2DArrayS32FloatGrad"; | 
 |   case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32"; | 
 |   case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float"; | 
 |   case NVPTXISD::Tex2DArrayU32FloatLevel: | 
 |     return "NVPTXISD::Tex2DArrayU32FloatLevel"; | 
 |   case NVPTXISD::Tex2DArrayU32FloatGrad: | 
 |     return "NVPTXISD::Tex2DArrayU32FloatGrad"; | 
 |   case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32"; | 
 |   case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat"; | 
 |   case NVPTXISD::Tex3DFloatFloatLevel: | 
 |     return "NVPTXISD::Tex3DFloatFloatLevel"; | 
 |   case NVPTXISD::Tex3DFloatFloatGrad: | 
 |     return "NVPTXISD::Tex3DFloatFloatGrad"; | 
 |   case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32"; | 
 |   case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float"; | 
 |   case NVPTXISD::Tex3DS32FloatLevel: | 
 |     return "NVPTXISD::Tex3DS32FloatLevel"; | 
 |   case NVPTXISD::Tex3DS32FloatGrad: | 
 |     return "NVPTXISD::Tex3DS32FloatGrad"; | 
 |   case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32"; | 
 |   case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float"; | 
 |   case NVPTXISD::Tex3DU32FloatLevel: | 
 |     return "NVPTXISD::Tex3DU32FloatLevel"; | 
 |   case NVPTXISD::Tex3DU32FloatGrad: | 
 |     return "NVPTXISD::Tex3DU32FloatGrad"; | 
 |   case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat"; | 
 |   case NVPTXISD::TexCubeFloatFloatLevel: | 
 |     return "NVPTXISD::TexCubeFloatFloatLevel"; | 
 |   case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float"; | 
 |   case NVPTXISD::TexCubeS32FloatLevel: | 
 |     return "NVPTXISD::TexCubeS32FloatLevel"; | 
 |   case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float"; | 
 |   case NVPTXISD::TexCubeU32FloatLevel: | 
 |     return "NVPTXISD::TexCubeU32FloatLevel"; | 
 |   case NVPTXISD::TexCubeArrayFloatFloat: | 
 |     return "NVPTXISD::TexCubeArrayFloatFloat"; | 
 |   case NVPTXISD::TexCubeArrayFloatFloatLevel: | 
 |     return "NVPTXISD::TexCubeArrayFloatFloatLevel"; | 
 |   case NVPTXISD::TexCubeArrayS32Float: | 
 |     return "NVPTXISD::TexCubeArrayS32Float"; | 
 |   case NVPTXISD::TexCubeArrayS32FloatLevel: | 
 |     return "NVPTXISD::TexCubeArrayS32FloatLevel"; | 
 |   case NVPTXISD::TexCubeArrayU32Float: | 
 |     return "NVPTXISD::TexCubeArrayU32Float"; | 
 |   case NVPTXISD::TexCubeArrayU32FloatLevel: | 
 |     return "NVPTXISD::TexCubeArrayU32FloatLevel"; | 
 |   case NVPTXISD::Tld4R2DFloatFloat: | 
 |     return "NVPTXISD::Tld4R2DFloatFloat"; | 
 |   case NVPTXISD::Tld4G2DFloatFloat: | 
 |     return "NVPTXISD::Tld4G2DFloatFloat"; | 
 |   case NVPTXISD::Tld4B2DFloatFloat: | 
 |     return "NVPTXISD::Tld4B2DFloatFloat"; | 
 |   case NVPTXISD::Tld4A2DFloatFloat: | 
 |     return "NVPTXISD::Tld4A2DFloatFloat"; | 
 |   case NVPTXISD::Tld4R2DS64Float: | 
 |     return "NVPTXISD::Tld4R2DS64Float"; | 
 |   case NVPTXISD::Tld4G2DS64Float: | 
 |     return "NVPTXISD::Tld4G2DS64Float"; | 
 |   case NVPTXISD::Tld4B2DS64Float: | 
 |     return "NVPTXISD::Tld4B2DS64Float"; | 
 |   case NVPTXISD::Tld4A2DS64Float: | 
 |     return "NVPTXISD::Tld4A2DS64Float"; | 
 |   case NVPTXISD::Tld4R2DU64Float: | 
 |     return "NVPTXISD::Tld4R2DU64Float"; | 
 |   case NVPTXISD::Tld4G2DU64Float: | 
 |     return "NVPTXISD::Tld4G2DU64Float"; | 
 |   case NVPTXISD::Tld4B2DU64Float: | 
 |     return "NVPTXISD::Tld4B2DU64Float"; | 
 |   case NVPTXISD::Tld4A2DU64Float: | 
 |     return "NVPTXISD::Tld4A2DU64Float"; | 
 |  | 
 |   case NVPTXISD::TexUnified1DFloatS32: | 
 |     return "NVPTXISD::TexUnified1DFloatS32"; | 
 |   case NVPTXISD::TexUnified1DFloatFloat: | 
 |     return "NVPTXISD::TexUnified1DFloatFloat"; | 
 |   case NVPTXISD::TexUnified1DFloatFloatLevel: | 
 |     return "NVPTXISD::TexUnified1DFloatFloatLevel"; | 
 |   case NVPTXISD::TexUnified1DFloatFloatGrad: | 
 |     return "NVPTXISD::TexUnified1DFloatFloatGrad"; | 
 |   case NVPTXISD::TexUnified1DS32S32: | 
 |     return "NVPTXISD::TexUnified1DS32S32"; | 
 |   case NVPTXISD::TexUnified1DS32Float: | 
 |     return "NVPTXISD::TexUnified1DS32Float"; | 
 |   case NVPTXISD::TexUnified1DS32FloatLevel: | 
 |     return "NVPTXISD::TexUnified1DS32FloatLevel"; | 
 |   case NVPTXISD::TexUnified1DS32FloatGrad: | 
 |     return "NVPTXISD::TexUnified1DS32FloatGrad"; | 
 |   case NVPTXISD::TexUnified1DU32S32: | 
 |     return "NVPTXISD::TexUnified1DU32S32"; | 
 |   case NVPTXISD::TexUnified1DU32Float: | 
 |     return "NVPTXISD::TexUnified1DU32Float"; | 
 |   case NVPTXISD::TexUnified1DU32FloatLevel: | 
 |     return "NVPTXISD::TexUnified1DU32FloatLevel"; | 
 |   case NVPTXISD::TexUnified1DU32FloatGrad: | 
 |     return "NVPTXISD::TexUnified1DU32FloatGrad"; | 
 |   case NVPTXISD::TexUnified1DArrayFloatS32: | 
 |     return "NVPTXISD::TexUnified1DArrayFloatS32"; | 
 |   case NVPTXISD::TexUnified1DArrayFloatFloat: | 
 |     return "NVPTXISD::TexUnified1DArrayFloatFloat"; | 
 |   case NVPTXISD::TexUnified1DArrayFloatFloatLevel: | 
 |     return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; | 
 |   case NVPTXISD::TexUnified1DArrayFloatFloatGrad: | 
 |     return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; | 
 |   case NVPTXISD::TexUnified1DArrayS32S32: | 
 |     return "NVPTXISD::TexUnified1DArrayS32S32"; | 
 |   case NVPTXISD::TexUnified1DArrayS32Float: | 
 |     return "NVPTXISD::TexUnified1DArrayS32Float"; | 
 |   case NVPTXISD::TexUnified1DArrayS32FloatLevel: | 
 |     return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; | 
 |   case NVPTXISD::TexUnified1DArrayS32FloatGrad: | 
 |     return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; | 
 |   case NVPTXISD::TexUnified1DArrayU32S32: | 
 |     return "NVPTXISD::TexUnified1DArrayU32S32"; | 
 |   case NVPTXISD::TexUnified1DArrayU32Float: | 
 |     return "NVPTXISD::TexUnified1DArrayU32Float"; | 
 |   case NVPTXISD::TexUnified1DArrayU32FloatLevel: | 
 |     return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; | 
 |   case NVPTXISD::TexUnified1DArrayU32FloatGrad: | 
 |     return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; | 
 |   case NVPTXISD::TexUnified2DFloatS32: | 
 |     return "NVPTXISD::TexUnified2DFloatS32"; | 
 |   case NVPTXISD::TexUnified2DFloatFloat: | 
 |     return "NVPTXISD::TexUnified2DFloatFloat"; | 
 |   case NVPTXISD::TexUnified2DFloatFloatLevel: | 
 |     return "NVPTXISD::TexUnified2DFloatFloatLevel"; | 
 |   case NVPTXISD::TexUnified2DFloatFloatGrad: | 
 |     return "NVPTXISD::TexUnified2DFloatFloatGrad"; | 
 |   case NVPTXISD::TexUnified2DS32S32: | 
 |     return "NVPTXISD::TexUnified2DS32S32"; | 
 |   case NVPTXISD::TexUnified2DS32Float: | 
 |     return "NVPTXISD::TexUnified2DS32Float"; | 
 |   case NVPTXISD::TexUnified2DS32FloatLevel: | 
 |     return "NVPTXISD::TexUnified2DS32FloatLevel"; | 
 |   case NVPTXISD::TexUnified2DS32FloatGrad: | 
 |     return "NVPTXISD::TexUnified2DS32FloatGrad"; | 
 |   case NVPTXISD::TexUnified2DU32S32: | 
 |     return "NVPTXISD::TexUnified2DU32S32"; | 
 |   case NVPTXISD::TexUnified2DU32Float: | 
 |     return "NVPTXISD::TexUnified2DU32Float"; | 
 |   case NVPTXISD::TexUnified2DU32FloatLevel: | 
 |     return "NVPTXISD::TexUnified2DU32FloatLevel"; | 
 |   case NVPTXISD::TexUnified2DU32FloatGrad: | 
 |     return "NVPTXISD::TexUnified2DU32FloatGrad"; | 
 |   case NVPTXISD::TexUnified2DArrayFloatS32: | 
 |     return "NVPTXISD::TexUnified2DArrayFloatS32"; | 
 |   case NVPTXISD::TexUnified2DArrayFloatFloat: | 
 |     return "NVPTXISD::TexUnified2DArrayFloatFloat"; | 
 |   case NVPTXISD::TexUnified2DArrayFloatFloatLevel: | 
 |     return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; | 
 |   case NVPTXISD::TexUnified2DArrayFloatFloatGrad: | 
 |     return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; | 
 |   case NVPTXISD::TexUnified2DArrayS32S32: | 
 |     return "NVPTXISD::TexUnified2DArrayS32S32"; | 
 |   case NVPTXISD::TexUnified2DArrayS32Float: | 
 |     return "NVPTXISD::TexUnified2DArrayS32Float"; | 
 |   case NVPTXISD::TexUnified2DArrayS32FloatLevel: | 
 |     return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; | 
 |   case NVPTXISD::TexUnified2DArrayS32FloatGrad: | 
 |     return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; | 
 |   case NVPTXISD::TexUnified2DArrayU32S32: | 
 |     return "NVPTXISD::TexUnified2DArrayU32S32"; | 
 |   case NVPTXISD::TexUnified2DArrayU32Float: | 
 |     return "NVPTXISD::TexUnified2DArrayU32Float"; | 
 |   case NVPTXISD::TexUnified2DArrayU32FloatLevel: | 
 |     return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; | 
 |   case NVPTXISD::TexUnified2DArrayU32FloatGrad: | 
 |     return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; | 
 |   case NVPTXISD::TexUnified3DFloatS32: | 
 |     return "NVPTXISD::TexUnified3DFloatS32"; | 
 |   case NVPTXISD::TexUnified3DFloatFloat: | 
 |     return "NVPTXISD::TexUnified3DFloatFloat"; | 
 |   case NVPTXISD::TexUnified3DFloatFloatLevel: | 
 |     return "NVPTXISD::TexUnified3DFloatFloatLevel"; | 
 |   case NVPTXISD::TexUnified3DFloatFloatGrad: | 
 |     return "NVPTXISD::TexUnified3DFloatFloatGrad"; | 
 |   case NVPTXISD::TexUnified3DS32S32: | 
 |     return "NVPTXISD::TexUnified3DS32S32"; | 
 |   case NVPTXISD::TexUnified3DS32Float: | 
 |     return "NVPTXISD::TexUnified3DS32Float"; | 
 |   case NVPTXISD::TexUnified3DS32FloatLevel: | 
 |     return "NVPTXISD::TexUnified3DS32FloatLevel"; | 
 |   case NVPTXISD::TexUnified3DS32FloatGrad: | 
 |     return "NVPTXISD::TexUnified3DS32FloatGrad"; | 
 |   case NVPTXISD::TexUnified3DU32S32: | 
 |     return "NVPTXISD::TexUnified3DU32S32"; | 
 |   case NVPTXISD::TexUnified3DU32Float: | 
 |     return "NVPTXISD::TexUnified3DU32Float"; | 
 |   case NVPTXISD::TexUnified3DU32FloatLevel: | 
 |     return "NVPTXISD::TexUnified3DU32FloatLevel"; | 
 |   case NVPTXISD::TexUnified3DU32FloatGrad: | 
 |     return "NVPTXISD::TexUnified3DU32FloatGrad"; | 
 |   case NVPTXISD::TexUnifiedCubeFloatFloat: | 
 |     return "NVPTXISD::TexUnifiedCubeFloatFloat"; | 
 |   case NVPTXISD::TexUnifiedCubeFloatFloatLevel: | 
 |     return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; | 
 |   case NVPTXISD::TexUnifiedCubeS32Float: | 
 |     return "NVPTXISD::TexUnifiedCubeS32Float"; | 
 |   case NVPTXISD::TexUnifiedCubeS32FloatLevel: | 
 |     return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; | 
 |   case NVPTXISD::TexUnifiedCubeU32Float: | 
 |     return "NVPTXISD::TexUnifiedCubeU32Float"; | 
 |   case NVPTXISD::TexUnifiedCubeU32FloatLevel: | 
 |     return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; | 
 |   case NVPTXISD::TexUnifiedCubeArrayFloatFloat: | 
 |     return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; | 
 |   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: | 
 |     return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; | 
 |   case NVPTXISD::TexUnifiedCubeArrayS32Float: | 
 |     return "NVPTXISD::TexUnifiedCubeArrayS32Float"; | 
 |   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: | 
 |     return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; | 
 |   case NVPTXISD::TexUnifiedCubeArrayU32Float: | 
 |     return "NVPTXISD::TexUnifiedCubeArrayU32Float"; | 
 |   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: | 
 |     return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; | 
 |   case NVPTXISD::Tld4UnifiedR2DFloatFloat: | 
 |     return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; | 
 |   case NVPTXISD::Tld4UnifiedG2DFloatFloat: | 
 |     return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; | 
 |   case NVPTXISD::Tld4UnifiedB2DFloatFloat: | 
 |     return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; | 
 |   case NVPTXISD::Tld4UnifiedA2DFloatFloat: | 
 |     return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; | 
 |   case NVPTXISD::Tld4UnifiedR2DS64Float: | 
 |     return "NVPTXISD::Tld4UnifiedR2DS64Float"; | 
 |   case NVPTXISD::Tld4UnifiedG2DS64Float: | 
 |     return "NVPTXISD::Tld4UnifiedG2DS64Float"; | 
 |   case NVPTXISD::Tld4UnifiedB2DS64Float: | 
 |     return "NVPTXISD::Tld4UnifiedB2DS64Float"; | 
 |   case NVPTXISD::Tld4UnifiedA2DS64Float: | 
 |     return "NVPTXISD::Tld4UnifiedA2DS64Float"; | 
 |   case NVPTXISD::Tld4UnifiedR2DU64Float: | 
 |     return "NVPTXISD::Tld4UnifiedR2DU64Float"; | 
 |   case NVPTXISD::Tld4UnifiedG2DU64Float: | 
 |     return "NVPTXISD::Tld4UnifiedG2DU64Float"; | 
 |   case NVPTXISD::Tld4UnifiedB2DU64Float: | 
 |     return "NVPTXISD::Tld4UnifiedB2DU64Float"; | 
 |   case NVPTXISD::Tld4UnifiedA2DU64Float: | 
 |     return "NVPTXISD::Tld4UnifiedA2DU64Float"; | 
 |  | 
 |   case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp"; | 
 |   case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp"; | 
 |   case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp"; | 
 |   case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp"; | 
 |   case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp"; | 
 |   case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp"; | 
 |   case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp"; | 
 |   case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp"; | 
 |   case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp"; | 
 |   case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp"; | 
 |   case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp"; | 
 |  | 
 |   case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp"; | 
 |   case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp"; | 
 |   case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp"; | 
 |   case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp"; | 
 |   case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; | 
 |   case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; | 
 |   case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; | 
 |   case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; | 
 |   case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; | 
 |   case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; | 
 |   case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; | 
 |  | 
 |   case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp"; | 
 |   case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp"; | 
 |   case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp"; | 
 |   case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp"; | 
 |   case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp"; | 
 |   case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp"; | 
 |   case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp"; | 
 |   case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp"; | 
 |   case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp"; | 
 |   case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp"; | 
 |   case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp"; | 
 |  | 
 |   case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp"; | 
 |   case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp"; | 
 |   case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp"; | 
 |   case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp"; | 
 |   case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; | 
 |   case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; | 
 |   case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; | 
 |   case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; | 
 |   case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; | 
 |   case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; | 
 |   case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; | 
 |  | 
 |   case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp"; | 
 |   case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp"; | 
 |   case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp"; | 
 |   case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp"; | 
 |   case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp"; | 
 |   case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp"; | 
 |   case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp"; | 
 |   case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp"; | 
 |   case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp"; | 
 |   case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp"; | 
 |   case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp"; | 
 |  | 
 |   case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap"; | 
 |   case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap"; | 
 |   case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap"; | 
 |   case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap"; | 
 |   case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap"; | 
 |   case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap"; | 
 |   case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap"; | 
 |   case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap"; | 
 |   case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap"; | 
 |   case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap"; | 
 |   case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap"; | 
 |  | 
 |   case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap"; | 
 |   case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap"; | 
 |   case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap"; | 
 |   case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap"; | 
 |   case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap"; | 
 |   case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap"; | 
 |   case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap"; | 
 |   case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap"; | 
 |   case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap"; | 
 |   case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap"; | 
 |   case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap"; | 
 |  | 
 |   case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap"; | 
 |   case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap"; | 
 |   case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap"; | 
 |   case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap"; | 
 |   case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap"; | 
 |   case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap"; | 
 |   case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap"; | 
 |   case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap"; | 
 |   case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap"; | 
 |   case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap"; | 
 |   case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap"; | 
 |  | 
 |   case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap"; | 
 |   case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap"; | 
 |   case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap"; | 
 |   case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap"; | 
 |   case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap"; | 
 |   case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap"; | 
 |   case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap"; | 
 |   case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap"; | 
 |   case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap"; | 
 |   case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap"; | 
 |   case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap"; | 
 |  | 
 |   case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap"; | 
 |   case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap"; | 
 |   case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap"; | 
 |   case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap"; | 
 |   case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap"; | 
 |   case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap"; | 
 |   case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap"; | 
 |   case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap"; | 
 |   case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap"; | 
 |   case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap"; | 
 |   case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap"; | 
 |  | 
 |   case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero"; | 
 |   case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero"; | 
 |   case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero"; | 
 |   case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero"; | 
 |   case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero"; | 
 |   case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero"; | 
 |   case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero"; | 
 |   case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero"; | 
 |   case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero"; | 
 |   case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero"; | 
 |   case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero"; | 
 |  | 
 |   case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero"; | 
 |   case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero"; | 
 |   case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero"; | 
 |   case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero"; | 
 |   case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero"; | 
 |   case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero"; | 
 |   case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero"; | 
 |   case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero"; | 
 |   case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero"; | 
 |   case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero"; | 
 |   case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero"; | 
 |  | 
 |   case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero"; | 
 |   case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero"; | 
 |   case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero"; | 
 |   case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero"; | 
 |   case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero"; | 
 |   case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero"; | 
 |   case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero"; | 
 |   case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero"; | 
 |   case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero"; | 
 |   case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero"; | 
 |   case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero"; | 
 |  | 
 |   case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero"; | 
 |   case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero"; | 
 |   case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero"; | 
 |   case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero"; | 
 |   case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero"; | 
 |   case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero"; | 
 |   case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero"; | 
 |   case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero"; | 
 |   case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero"; | 
 |   case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero"; | 
 |   case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero"; | 
 |  | 
 |   case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero"; | 
 |   case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero"; | 
 |   case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero"; | 
 |   case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero"; | 
 |   case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero"; | 
 |   case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero"; | 
 |   case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero"; | 
 |   case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero"; | 
 |   case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero"; | 
 |   case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero"; | 
 |   case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero"; | 
 |   } | 
 |   return nullptr; | 
 | } | 
 |  | 
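// Vectors of i1 are split so that each element can be handled as a scalar,
// while v2f16 is kept legal because a pair of f16 values fits in a single
// 32-bit register (see the f16x2 handling below). Everything else defers to
// the default policy.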
 | TargetLoweringBase::LegalizeTypeAction | 
 | NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const { | 
 |   if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) | 
 |     return TypeSplitVector; | 
 |   if (VT == MVT::v2f16) | 
 |     return TypeLegal; | 
 |   return TargetLoweringBase::getPreferredVectorAction(VT); | 
 | } | 
 |  | 
 | SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, | 
 |                                              int Enabled, int &ExtraSteps, | 
 |                                              bool &UseOneConst, | 
 |                                              bool Reciprocal) const { | 
 |   if (!(Enabled == ReciprocalEstimate::Enabled || | 
 |         (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) | 
 |     return SDValue(); | 
 |  | 
 |   if (ExtraSteps == ReciprocalEstimate::Unspecified) | 
 |     ExtraSteps = 0; | 
 |  | 
 |   SDLoc DL(Operand); | 
 |   EVT VT = Operand.getValueType(); | 
 |   bool Ftz = useF32FTZ(DAG.getMachineFunction()); | 
 |  | 
 |   auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { | 
 |     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, | 
 |                        DAG.getConstant(IID, DL, MVT::i32), Operand); | 
 |   }; | 
 |  | 
 |   // The sqrt and rsqrt refinement processes assume we always start out with an | 
 |   // approximation of the rsqrt.  Therefore, if we're going to do any refinement | 
 |   // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing | 
 |   // any refinement, we must return a regular sqrt. | 
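  // For example (illustrative): a plain f32 sqrt with ExtraSteps == 0 maps to
  // nvvm_sqrt_approx_f (or its ftz variant), while a reciprocal sqrt, or any
  // request that will be refined further, maps to nvvm_rsqrt_approx_f instead.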
 |   if (Reciprocal || ExtraSteps > 0) { | 
 |     if (VT == MVT::f32) | 
 |       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f | 
 |                                    : Intrinsic::nvvm_rsqrt_approx_f); | 
 |     else if (VT == MVT::f64) | 
 |       return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); | 
 |     else | 
 |       return SDValue(); | 
 |   } else { | 
 |     if (VT == MVT::f32) | 
 |       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f | 
 |                                    : Intrinsic::nvvm_sqrt_approx_f); | 
 |     else { | 
 |       // There's no sqrt.approx.f64 instruction, so we emit | 
 |       // reciprocal(rsqrt(x)).  This is faster than | 
 |       // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain | 
 |       // x * rsqrt(x).) | 
 |       return DAG.getNode( | 
 |           ISD::INTRINSIC_WO_CHAIN, DL, VT, | 
 |           DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), | 
 |           MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | SDValue | 
 | NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { | 
 |   SDLoc dl(Op); | 
 |   const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op); | 
 |   auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); | 
 |   Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); | 
 |   return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); | 
 | } | 
 |  | 
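// getPrototype - Build the ".callprototype" string describing the callee's
// signature, used for indirect calls. For example (illustrative only), a
// callee returning i32 and taking an i32 plus a 16-byte aggregate aligned to
// 4 bytes could produce:
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .align 4 .b8 _[16]);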
 | std::string NVPTXTargetLowering::getPrototype( | 
 |     const DataLayout &DL, Type *retTy, const ArgListTy &Args, | 
 |     const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment, | 
 |     ImmutableCallSite CS) const { | 
 |   auto PtrVT = getPointerTy(DL); | 
 |  | 
 |   bool isABI = (STI.getSmVersion() >= 20); | 
 |   assert(isABI && "Non-ABI compilation is not supported"); | 
 |   if (!isABI) | 
 |     return ""; | 
 |  | 
 |   std::stringstream O; | 
 |   O << "prototype_" << uniqueCallSite << " : .callprototype "; | 
 |  | 
 |   if (retTy->getTypeID() == Type::VoidTyID) { | 
 |     O << "()"; | 
 |   } else { | 
 |     O << "("; | 
 |     if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) { | 
 |       unsigned size = 0; | 
 |       if (auto *ITy = dyn_cast<IntegerType>(retTy)) { | 
 |         size = ITy->getBitWidth(); | 
 |       } else { | 
 |         assert(retTy->isFloatingPointTy() && | 
 |                "Floating point type expected here"); | 
 |         size = retTy->getPrimitiveSizeInBits(); | 
 |       } | 
 |       // PTX ABI requires all scalar return values to be at least 32 | 
 |       // bits in size.  fp16 normally uses .b16 as its storage type in | 
 |       // PTX, so its size must be adjusted here, too. | 
 |       if (size < 32) | 
 |         size = 32; | 
 |  | 
 |       O << ".param .b" << size << " _"; | 
 |     } else if (isa<PointerType>(retTy)) { | 
 |       O << ".param .b" << PtrVT.getSizeInBits() << " _"; | 
 |     } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) { | 
 |       auto &DL = CS.getCalledFunction()->getParent()->getDataLayout(); | 
 |       O << ".param .align " << retAlignment << " .b8 _[" | 
 |         << DL.getTypeAllocSize(retTy) << "]"; | 
 |     } else { | 
 |       llvm_unreachable("Unknown return type"); | 
 |     } | 
 |     O << ") "; | 
 |   } | 
 |   O << "_ ("; | 
 |  | 
 |   bool first = true; | 
 |  | 
 |   unsigned OIdx = 0; | 
 |   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { | 
 |     Type *Ty = Args[i].Ty; | 
 |     if (!first) { | 
 |       O << ", "; | 
 |     } | 
 |     first = false; | 
 |  | 
 |     if (!Outs[OIdx].Flags.isByVal()) { | 
 |       if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { | 
 |         unsigned align = 0; | 
 |         const CallInst *CallI = cast<CallInst>(CS.getInstruction()); | 
 |         // +1 because index 0 is reserved for return type alignment | 
 |         if (!getAlign(*CallI, i + 1, align)) | 
 |           align = DL.getABITypeAlignment(Ty); | 
 |         unsigned sz = DL.getTypeAllocSize(Ty); | 
 |         O << ".param .align " << align << " .b8 "; | 
 |         O << "_"; | 
 |         O << "[" << sz << "]"; | 
 |         // update the index for Outs | 
 |         SmallVector<EVT, 16> vtparts; | 
 |         ComputeValueVTs(*this, DL, Ty, vtparts); | 
 |         if (unsigned len = vtparts.size()) | 
 |           OIdx += len - 1; | 
 |         continue; | 
 |       } | 
 |       // i8 types in IR will be i16 types in SDAG | 
 |       assert((getValueType(DL, Ty) == Outs[OIdx].VT || | 
 |               (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && | 
 |              "type mismatch between callee prototype and arguments"); | 
 |       // scalar type | 
 |       unsigned sz = 0; | 
 |       if (isa<IntegerType>(Ty)) { | 
 |         sz = cast<IntegerType>(Ty)->getBitWidth(); | 
 |         if (sz < 32) | 
 |           sz = 32; | 
 |       } else if (isa<PointerType>(Ty)) { | 
 |         sz = PtrVT.getSizeInBits(); | 
 |       } else if (Ty->isHalfTy()) | 
 |         // PTX ABI requires all scalar parameters to be at least 32 | 
 |         // bits in size.  fp16 normally uses .b16 as its storage type | 
 |         // in PTX, so its size must be adjusted here, too. | 
 |         sz = 32; | 
 |       else | 
 |         sz = Ty->getPrimitiveSizeInBits(); | 
 |       O << ".param .b" << sz << " "; | 
 |       O << "_"; | 
 |       continue; | 
 |     } | 
 |     auto *PTy = dyn_cast<PointerType>(Ty); | 
 |     assert(PTy && "Param with byval attribute should be a pointer type"); | 
 |     Type *ETy = PTy->getElementType(); | 
 |  | 
 |     unsigned align = Outs[OIdx].Flags.getByValAlign(); | 
 |     unsigned sz = DL.getTypeAllocSize(ETy); | 
 |     O << ".param .align " << align << " .b8 "; | 
 |     O << "_"; | 
 |     O << "[" << sz << "]"; | 
 |   } | 
 |   O << ");"; | 
 |   return O.str(); | 
 | } | 
 |  | 
 | unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, | 
 |                                                    ImmutableCallSite CS, | 
 |                                                    Type *Ty, unsigned Idx, | 
 |                                                    const DataLayout &DL) const { | 
 |   if (!CS) { | 
    // CallSite is zero, fall back to the ABI type alignment.
 |     return DL.getABITypeAlignment(Ty); | 
 |   } | 
 |  | 
 |   unsigned Align = 0; | 
 |   const Value *DirectCallee = CS.getCalledFunction(); | 
 |  | 
 |   if (!DirectCallee) { | 
 |     // We don't have a direct function symbol, but that may be because of | 
 |     // constant cast instructions in the call. | 
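    // For example, a call such as
    //   %r = call i32 bitcast (i32 (i64)* @f to i32 (i32)*)(i32 %x)
    // has a ConstantExpr cast, not a Function, as its called value.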
 |     const Instruction *CalleeI = CS.getInstruction(); | 
 |     assert(CalleeI && "Call target is not a function or derived value?"); | 
 |  | 
 |     // With bitcast'd call targets, the instruction will be the call | 
 |     if (isa<CallInst>(CalleeI)) { | 
 |       // Check if we have call alignment metadata | 
 |       if (getAlign(*cast<CallInst>(CalleeI), Idx, Align)) | 
 |         return Align; | 
 |  | 
 |       const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue(); | 
 |       // Ignore any bitcast instructions | 
 |       while (isa<ConstantExpr>(CalleeV)) { | 
 |         const ConstantExpr *CE = cast<ConstantExpr>(CalleeV); | 
 |         if (!CE->isCast()) | 
 |           break; | 
 |         // Look through the bitcast | 
 |         CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0); | 
 |       } | 
 |  | 
 |       // We have now looked past all of the bitcasts.  Do we finally have a | 
 |       // Function? | 
 |       if (isa<Function>(CalleeV)) | 
 |         DirectCallee = CalleeV; | 
 |     } | 
 |   } | 
 |  | 
 |   // Check for function alignment information if we found that the | 
 |   // ultimate target is a Function | 
 |   if (DirectCallee) | 
 |     if (getAlign(*cast<Function>(DirectCallee), Idx, Align)) | 
 |       return Align; | 
 |  | 
 |   // Call is indirect or alignment information is not available, fall back to | 
 |   // the ABI type alignment | 
 |   return DL.getABITypeAlignment(Ty); | 
 | } | 
 |  | 
 | SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, | 
 |                                        SmallVectorImpl<SDValue> &InVals) const { | 
 |   SelectionDAG &DAG = CLI.DAG; | 
 |   SDLoc dl = CLI.DL; | 
 |   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; | 
 |   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; | 
 |   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; | 
 |   SDValue Chain = CLI.Chain; | 
 |   SDValue Callee = CLI.Callee; | 
 |   bool &isTailCall = CLI.IsTailCall; | 
 |   ArgListTy &Args = CLI.getArgs(); | 
 |   Type *RetTy = CLI.RetTy; | 
 |   ImmutableCallSite CS = CLI.CS; | 
 |   const DataLayout &DL = DAG.getDataLayout(); | 
 |  | 
 |   bool isABI = (STI.getSmVersion() >= 20); | 
 |   assert(isABI && "Non-ABI compilation is not supported"); | 
 |   if (!isABI) | 
 |     return Chain; | 
 |  | 
 |   SDValue tempChain = Chain; | 
 |   Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl); | 
 |   SDValue InFlag = Chain.getValue(1); | 
 |  | 
 |   unsigned paramCount = 0; | 
 |   // Args.size() and Outs.size() need not match. | 
 |   // Outs.size() will be larger | 
 |   //   * if there is an aggregate argument with multiple fields (each field | 
 |   //     showing up separately in Outs) | 
 |   //   * if there is a vector argument with more than typical vector-length | 
 |   //     elements (generally if more than 4) where each vector element is | 
 |   //     individually present in Outs. | 
 |   // So a different index should be used for indexing into Outs/OutVals. | 
 |   // See similar issue in LowerFormalArguments. | 
 |   unsigned OIdx = 0; | 
  // Declare the .params or .reg needed to pass values
  // to the function.
 |   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { | 
 |     EVT VT = Outs[OIdx].VT; | 
 |     Type *Ty = Args[i].Ty; | 
 |  | 
 |     if (!Outs[OIdx].Flags.isByVal()) { | 
 |       SmallVector<EVT, 16> VTs; | 
 |       SmallVector<uint64_t, 16> Offsets; | 
 |       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets); | 
 |       unsigned ArgAlign = | 
 |           getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); | 
 |       unsigned AllocSize = DL.getTypeAllocSize(Ty); | 
 |       SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |       bool NeedAlign; // Does argument declaration specify alignment? | 
 |       if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { | 
 |         // declare .param .align <align> .b8 .param<n>[<size>]; | 
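        // e.g. with ArgAlign = 8 and AllocSize = 16 this declares a 16-byte,
        // 8-byte-aligned byte array parameter (illustrative values).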
 |         SDValue DeclareParamOps[] = { | 
 |             Chain, DAG.getConstant(ArgAlign, dl, MVT::i32), | 
 |             DAG.getConstant(paramCount, dl, MVT::i32), | 
 |             DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; | 
 |         Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, | 
 |                             DeclareParamOps); | 
 |         NeedAlign = true; | 
 |       } else { | 
 |         // declare .param .b<size> .param<n>; | 
 |         if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) { | 
 |           // PTX ABI requires integral types to be at least 32 bits in | 
 |           // size. FP16 is loaded/stored using i16, so it's handled | 
 |           // here as well. | 
 |           AllocSize = 4; | 
 |         } | 
 |         SDValue DeclareScalarParamOps[] = { | 
 |             Chain, DAG.getConstant(paramCount, dl, MVT::i32), | 
 |             DAG.getConstant(AllocSize * 8, dl, MVT::i32), | 
 |             DAG.getConstant(0, dl, MVT::i32), InFlag}; | 
 |         Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, | 
 |                             DeclareScalarParamOps); | 
 |         NeedAlign = false; | 
 |       } | 
 |       InFlag = Chain.getValue(1); | 
 |  | 
 |       // PTX Interoperability Guide 3.3(A): [Integer] Values shorter | 
 |       // than 32-bits are sign extended or zero extended, depending on | 
 |       // whether they are signed or unsigned types. This case applies | 
 |       // only to scalar parameters and not to aggregate values. | 
 |       bool ExtendIntegerParam = | 
 |           Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; | 
 |  | 
 |       auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); | 
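      // VectorInfo marks each part with PVF_FIRST/PVF_LAST to delimit groups
      // that can be written by a single vector store. For example
      // (illustrative), a sufficiently aligned <4 x float> argument becomes
      // one StoreParamV4 node, while a poorly aligned one falls back to
      // individual StoreParam nodes.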
 |       SmallVector<SDValue, 6> StoreOperands; | 
 |       for (unsigned j = 0, je = VTs.size(); j != je; ++j) { | 
 |         // New store. | 
 |         if (VectorInfo[j] & PVF_FIRST) { | 
          assert(StoreOperands.empty() && "Unfinished preceding store.");
 |           StoreOperands.push_back(Chain); | 
 |           StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); | 
 |           StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32)); | 
 |         } | 
 |  | 
 |         EVT EltVT = VTs[j]; | 
 |         SDValue StVal = OutVals[OIdx]; | 
 |         if (ExtendIntegerParam) { | 
 |           assert(VTs.size() == 1 && "Scalar can't have multiple parts."); | 
 |           // zext/sext to i32 | 
 |           StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND | 
 |                                                         : ISD::ZERO_EXTEND, | 
 |                               dl, MVT::i32, StVal); | 
 |         } else if (EltVT.getSizeInBits() < 16) { | 
 |           // Use 16-bit registers for small stores as it's the | 
 |           // smallest general purpose register size supported by NVPTX. | 
 |           StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); | 
 |         } | 
 |  | 
 |         // Record the value to store. | 
 |         StoreOperands.push_back(StVal); | 
 |  | 
 |         if (VectorInfo[j] & PVF_LAST) { | 
 |           unsigned NumElts = StoreOperands.size() - 3; | 
 |           NVPTXISD::NodeType Op; | 
 |           switch (NumElts) { | 
 |           case 1: | 
 |             Op = NVPTXISD::StoreParam; | 
 |             break; | 
 |           case 2: | 
 |             Op = NVPTXISD::StoreParamV2; | 
 |             break; | 
 |           case 4: | 
 |             Op = NVPTXISD::StoreParamV4; | 
 |             break; | 
 |           default: | 
 |             llvm_unreachable("Invalid vector info."); | 
 |           } | 
 |  | 
 |           StoreOperands.push_back(InFlag); | 
 |  | 
 |           // Adjust type of the store op if we've extended the scalar | 
          // parameter value.
 |           EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j]; | 
 |           unsigned EltAlign = | 
 |               NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0; | 
 |  | 
 |           Chain = DAG.getMemIntrinsicNode( | 
 |               Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, | 
 |               TheStoreType, MachinePointerInfo(), EltAlign, | 
 |               MachineMemOperand::MOStore); | 
 |           InFlag = Chain.getValue(1); | 
 |  | 
 |           // Cleanup. | 
 |           StoreOperands.clear(); | 
 |         } | 
 |         ++OIdx; | 
 |       } | 
 |       assert(StoreOperands.empty() && "Unfinished parameter store."); | 
 |       if (VTs.size() > 0) | 
 |         --OIdx; | 
 |       ++paramCount; | 
 |       continue; | 
 |     } | 
 |  | 
 |     // ByVal arguments | 
 |     SmallVector<EVT, 16> VTs; | 
 |     SmallVector<uint64_t, 16> Offsets; | 
 |     auto *PTy = dyn_cast<PointerType>(Args[i].Ty); | 
 |     assert(PTy && "Type of a byval parameter should be pointer"); | 
 |     ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0); | 
 |  | 
 |     // declare .param .align <align> .b8 .param<n>[<size>]; | 
 |     unsigned sz = Outs[OIdx].Flags.getByValSize(); | 
 |     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |     unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign(); | 
    // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
 |     // so we don't need to worry about natural alignment or not. | 
 |     // See TargetLowering::LowerCallTo(). | 
 |  | 
    // Enforce minimum alignment of 4 to work around ptxas miscompile
 |     // for sm_50+. See corresponding alignment adjustment in | 
 |     // emitFunctionParamList() for details. | 
 |     if (ArgAlign < 4) | 
 |       ArgAlign = 4; | 
 |     SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32), | 
 |                                  DAG.getConstant(paramCount, dl, MVT::i32), | 
 |                                  DAG.getConstant(sz, dl, MVT::i32), InFlag}; | 
 |     Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, | 
 |                         DeclareParamOps); | 
 |     InFlag = Chain.getValue(1); | 
 |     for (unsigned j = 0, je = VTs.size(); j != je; ++j) { | 
 |       EVT elemtype = VTs[j]; | 
 |       int curOffset = Offsets[j]; | 
 |       unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); | 
 |       auto PtrVT = getPointerTy(DL); | 
 |       SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], | 
 |                                     DAG.getConstant(curOffset, dl, PtrVT)); | 
 |       SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, | 
 |                                    MachinePointerInfo(), PartAlign); | 
 |       if (elemtype.getSizeInBits() < 16) { | 
 |         theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); | 
 |       } | 
 |       SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |       SDValue CopyParamOps[] = { Chain, | 
 |                                  DAG.getConstant(paramCount, dl, MVT::i32), | 
 |                                  DAG.getConstant(curOffset, dl, MVT::i32), | 
 |                                  theVal, InFlag }; | 
 |       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, | 
 |                                       CopyParamOps, elemtype, | 
 |                                       MachinePointerInfo(), /* Align */ 0, | 
 |                                       MachineMemOperand::MOStore); | 
 |  | 
 |       InFlag = Chain.getValue(1); | 
 |     } | 
 |     ++paramCount; | 
 |   } | 
 |  | 
 |   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); | 
 |   unsigned retAlignment = 0; | 
 |  | 
 |   // Handle Result | 
 |   if (Ins.size() > 0) { | 
 |     SmallVector<EVT, 16> resvtparts; | 
 |     ComputeValueVTs(*this, DL, RetTy, resvtparts); | 
 |  | 
 |     // Declare | 
 |     //  .param .align 16 .b8 retval0[<size-in-bytes>], or | 
 |     //  .param .b<size-in-bits> retval0 | 
 |     unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); | 
 |     // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for | 
 |     // these three types to match the logic in | 
 |     // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype. | 
 |     // Plus, this behavior is consistent with nvcc's. | 
 |     if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() || | 
 |         (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) { | 
      // Scalars need to be at least 32 bits wide
 |       if (resultsz < 32) | 
 |         resultsz = 32; | 
 |       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |       SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), | 
 |                                   DAG.getConstant(resultsz, dl, MVT::i32), | 
 |                                   DAG.getConstant(0, dl, MVT::i32), InFlag }; | 
 |       Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, | 
 |                           DeclareRetOps); | 
 |       InFlag = Chain.getValue(1); | 
 |     } else { | 
 |       retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL); | 
 |       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |       SDValue DeclareRetOps[] = { Chain, | 
 |                                   DAG.getConstant(retAlignment, dl, MVT::i32), | 
 |                                   DAG.getConstant(resultsz / 8, dl, MVT::i32), | 
 |                                   DAG.getConstant(0, dl, MVT::i32), InFlag }; | 
 |       Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, | 
 |                           DeclareRetOps); | 
 |       InFlag = Chain.getValue(1); | 
 |     } | 
 |   } | 
 |  | 
 |   if (!Func) { | 
    // This is the indirect function call case: PTX requires a prototype of
    // the form
    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of the call
    // instruction.
 |     // The prototype is embedded in a string and put as the operand for a | 
 |     // CallPrototype SDNode which will print out to the value of the string. | 
 |     SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |     std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS); | 
 |     const char *ProtoStr = | 
 |       nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str(); | 
 |     SDValue ProtoOps[] = { | 
 |       Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag, | 
 |     }; | 
 |     Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); | 
 |     InFlag = Chain.getValue(1); | 
 |   } | 
 |   // Op to just print "call" | 
 |   SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |   SDValue PrintCallOps[] = { | 
 |     Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag | 
 |   }; | 
 |   // We model convergent calls as separate opcodes. | 
 |   unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall; | 
 |   if (CLI.IsConvergent) | 
 |     Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni | 
 |                                               : NVPTXISD::PrintConvergentCall; | 
 |   Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); | 
 |   InFlag = Chain.getValue(1); | 
 |  | 
 |   // Ops to print out the function name | 
 |   SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |   SDValue CallVoidOps[] = { Chain, Callee, InFlag }; | 
 |   Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); | 
 |   InFlag = Chain.getValue(1); | 
 |  | 
 |   // Ops to print out the param list | 
 |   SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |   SDValue CallArgBeginOps[] = { Chain, InFlag }; | 
 |   Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, | 
 |                       CallArgBeginOps); | 
 |   InFlag = Chain.getValue(1); | 
 |  | 
 |   for (unsigned i = 0, e = paramCount; i != e; ++i) { | 
 |     unsigned opcode; | 
 |     if (i == (e - 1)) | 
 |       opcode = NVPTXISD::LastCallArg; | 
 |     else | 
 |       opcode = NVPTXISD::CallArg; | 
 |     SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |     SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), | 
 |                              DAG.getConstant(i, dl, MVT::i32), InFlag }; | 
 |     Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); | 
 |     InFlag = Chain.getValue(1); | 
 |   } | 
 |   SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |   SDValue CallArgEndOps[] = { Chain, | 
 |                               DAG.getConstant(Func ? 1 : 0, dl, MVT::i32), | 
 |                               InFlag }; | 
 |   Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); | 
 |   InFlag = Chain.getValue(1); | 
 |  | 
 |   if (!Func) { | 
 |     SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); | 
 |     SDValue PrototypeOps[] = { Chain, | 
 |                                DAG.getConstant(uniqueCallSite, dl, MVT::i32), | 
 |                                InFlag }; | 
 |     Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); | 
 |     InFlag = Chain.getValue(1); | 
 |   } | 
 |  | 
 |   // Generate loads from param memory/moves from registers for result | 
 |   if (Ins.size() > 0) { | 
 |     SmallVector<EVT, 16> VTs; | 
 |     SmallVector<uint64_t, 16> Offsets; | 
 |     ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); | 
 |     assert(VTs.size() == Ins.size() && "Bad value decomposition"); | 
 |  | 
 |     unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL); | 
 |     auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); | 
 |  | 
 |     SmallVector<EVT, 6> LoadVTs; | 
 |     int VecIdx = -1; // Index of the first element of the vector. | 
 |  | 
 |     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than | 
 |     // 32-bits are sign extended or zero extended, depending on whether | 
 |     // they are signed or unsigned types. | 
 |     bool ExtendIntegerRetVal = | 
 |         RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; | 
 |  | 
 |     for (unsigned i = 0, e = VTs.size(); i != e; ++i) { | 
 |       bool needTruncate = false; | 
 |       EVT TheLoadType = VTs[i]; | 
 |       EVT EltType = Ins[i].VT; | 
 |       unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]); | 
 |       if (ExtendIntegerRetVal) { | 
 |         TheLoadType = MVT::i32; | 
 |         EltType = MVT::i32; | 
 |         needTruncate = true; | 
 |       } else if (TheLoadType.getSizeInBits() < 16) { | 
 |         if (VTs[i].isInteger()) | 
 |           needTruncate = true; | 
 |         EltType = MVT::i16; | 
 |       } | 
 |  | 
 |       // Record index of the very first element of the vector. | 
 |       if (VectorInfo[i] & PVF_FIRST) { | 
 |         assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); | 
 |         VecIdx = i; | 
 |       } | 
 |  | 
 |       LoadVTs.push_back(EltType); | 
 |  | 
 |       if (VectorInfo[i] & PVF_LAST) { | 
 |         unsigned NumElts = LoadVTs.size(); | 
 |         LoadVTs.push_back(MVT::Other); | 
 |         LoadVTs.push_back(MVT::Glue); | 
 |         NVPTXISD::NodeType Op; | 
 |         switch (NumElts) { | 
 |         case 1: | 
 |           Op = NVPTXISD::LoadParam; | 
 |           break; | 
 |         case 2: | 
 |           Op = NVPTXISD::LoadParamV2; | 
 |           break; | 
 |         case 4: | 
 |           Op = NVPTXISD::LoadParamV4; | 
 |           break; | 
 |         default: | 
 |           llvm_unreachable("Invalid vector info."); | 
 |         } | 
 |  | 
 |         SDValue LoadOperands[] = { | 
 |             Chain, DAG.getConstant(1, dl, MVT::i32), | 
 |             DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; | 
 |         SDValue RetVal = DAG.getMemIntrinsicNode( | 
 |             Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, | 
 |             MachinePointerInfo(), EltAlign, | 
 |             MachineMemOperand::MOLoad); | 
 |  | 
 |         for (unsigned j = 0; j < NumElts; ++j) { | 
 |           SDValue Ret = RetVal.getValue(j); | 
 |           if (needTruncate) | 
 |             Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret); | 
 |           InVals.push_back(Ret); | 
 |         } | 
 |         Chain = RetVal.getValue(NumElts); | 
 |         InFlag = RetVal.getValue(NumElts + 1); | 
 |  | 
 |         // Cleanup | 
 |         VecIdx = -1; | 
 |         LoadVTs.clear(); | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |   Chain = DAG.getCALLSEQ_END(Chain, | 
 |                              DAG.getIntPtrConstant(uniqueCallSite, dl, true), | 
 |                              DAG.getIntPtrConstant(uniqueCallSite + 1, dl, | 
 |                                                    true), | 
 |                              InFlag, dl); | 
 |   uniqueCallSite++; | 
 |  | 
 |   // set isTailCall to false for now, until we figure out how to express | 
 |   // tail call optimization in PTX | 
 |   isTailCall = false; | 
 |   return Chain; | 
 | } | 
 |  | 
 | // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() | 
 | // (see LegalizeDAG.cpp). This is slow and uses local memory. | 
 | // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 | 
 | SDValue | 
 | NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { | 
 |   SDNode *Node = Op.getNode(); | 
 |   SDLoc dl(Node); | 
 |   SmallVector<SDValue, 8> Ops; | 
 |   unsigned NumOperands = Node->getNumOperands(); | 
 |   for (unsigned i = 0; i < NumOperands; ++i) { | 
 |     SDValue SubOp = Node->getOperand(i); | 
 |     EVT VVT = SubOp.getNode()->getValueType(0); | 
 |     EVT EltVT = VVT.getVectorElementType(); | 
 |     unsigned NumSubElem = VVT.getVectorNumElements(); | 
 |     for (unsigned j = 0; j < NumSubElem; ++j) { | 
 |       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, | 
 |                                 DAG.getIntPtrConstant(j, dl))); | 
 |     } | 
 |   } | 
 |   return DAG.getBuildVector(Node->getValueType(0), dl, Ops); | 
 | } | 
 |  | 
 | // We can init constant f16x2 with a single .b32 move.  Normally it | 
 | // would get lowered as two constant loads and vector-packing move. | 
 | //        mov.b16         %h1, 0x4000; | 
 | //        mov.b16         %h2, 0x3C00; | 
 | //        mov.b32         %hh2, {%h2, %h1}; | 
 | // Instead we want just a constant move: | 
 | //        mov.b32         %hh2, 0x40003C00 | 
 | // | 
 | // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 | 
 | // generates good SASS in both cases. | 
 | SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, | 
 |                                                SelectionDAG &DAG) const { | 
 |   if (!(Op->getValueType(0) == MVT::v2f16 && | 
 |         isa<ConstantFPSDNode>(Op->getOperand(0)) && | 
 |         isa<ConstantFPSDNode>(Op->getOperand(1)))) | 
 |     return Op; | 
 |  | 
 |   APInt E0 = | 
 |       cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt(); | 
 |   APInt E1 = | 
 |       cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt(); | 
 |   SDValue Const = | 
 |       DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); | 
 |   return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const); | 
 | } | 
 |  | 
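// Lower EXTRACT_VECTOR_ELT with a non-constant index on v2f16 by extracting
// both elements and selecting between them based on the index; only
// constant-index extracts are matched directly by the tablegen patterns.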
 | SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, | 
 |                                                      SelectionDAG &DAG) const { | 
 |   SDValue Index = Op->getOperand(1); | 
 |   // Constant index will be matched by tablegen. | 
 |   if (isa<ConstantSDNode>(Index.getNode())) | 
 |     return Op; | 
 |  | 
 |   // Extract individual elements and select one of them. | 
 |   SDValue Vector = Op->getOperand(0); | 
 |   EVT VectorVT = Vector.getValueType(); | 
 |   assert(VectorVT == MVT::v2f16 && "Unexpected vector type."); | 
 |   EVT EltVT = VectorVT.getVectorElementType(); | 
 |  | 
 |   SDLoc dl(Op.getNode()); | 
 |   SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, | 
 |                            DAG.getIntPtrConstant(0, dl)); | 
 |   SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, | 
 |                            DAG.getIntPtrConstant(1, dl)); | 
 |   return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, | 
 |                          ISD::CondCode::SETEQ); | 
 | } | 
 |  | 
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
 | SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, | 
 |                                                   SelectionDAG &DAG) const { | 
 |   assert(Op.getNumOperands() == 3 && "Not a double-shift!"); | 
 |   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); | 
 |  | 
 |   EVT VT = Op.getValueType(); | 
 |   unsigned VTBits = VT.getSizeInBits(); | 
 |   SDLoc dl(Op); | 
 |   SDValue ShOpLo = Op.getOperand(0); | 
 |   SDValue ShOpHi = Op.getOperand(1); | 
 |   SDValue ShAmt  = Op.getOperand(2); | 
 |   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; | 
 |  | 
 |   if (VTBits == 32 && STI.getSmVersion() >= 35) { | 
    // For 32-bit shifts on sm_35+, we can use the funnel shift 'shf' instruction.
 |     // {dHi, dLo} = {aHi, aLo} >> Amt | 
 |     //   dHi = aHi >> Amt | 
 |     //   dLo = shf.r.clamp aLo, aHi, Amt | 
 |  | 
 |     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); | 
 |     SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, | 
 |                              ShAmt); | 
 |  | 
 |     SDValue Ops[2] = { Lo, Hi }; | 
 |     return DAG.getMergeValues(Ops, dl); | 
 |   } | 
 |   else { | 
 |     // {dHi, dLo} = {aHi, aLo} >> Amt | 
 |     // - if (Amt>=size) then | 
 |     //      dLo = aHi >> (Amt-size) | 
 |     //      dHi = aHi >> Amt (this is either all 0 or all 1) | 
 |     //   else | 
 |     //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) | 
 |     //      dHi = aHi >> Amt | 
 |  | 
 |     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, | 
 |                                    DAG.getConstant(VTBits, dl, MVT::i32), | 
 |                                    ShAmt); | 
 |     SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); | 
 |     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, | 
 |                                      DAG.getConstant(VTBits, dl, MVT::i32)); | 
 |     SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); | 
 |     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); | 
 |     SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); | 
 |  | 
 |     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, | 
 |                                DAG.getConstant(VTBits, dl, MVT::i32), | 
 |                                ISD::SETGE); | 
 |     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); | 
 |     SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); | 
 |  | 
 |     SDValue Ops[2] = { Lo, Hi }; | 
 |     return DAG.getMergeValues(Ops, dl); | 
 |   } | 
 | } | 
 |  | 
/// LowerShiftLeftParts - Lower SHL_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
 | SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, | 
 |                                                  SelectionDAG &DAG) const { | 
 |   assert(Op.getNumOperands() == 3 && "Not a double-shift!"); | 
 |   assert(Op.getOpcode() == ISD::SHL_PARTS); | 
 |  | 
 |   EVT VT = Op.getValueType(); | 
 |   unsigned VTBits = VT.getSizeInBits(); | 
 |   SDLoc dl(Op); | 
 |   SDValue ShOpLo = Op.getOperand(0); | 
 |   SDValue ShOpHi = Op.getOperand(1); | 
 |   SDValue ShAmt  = Op.getOperand(2); | 
 |  | 
 |   if (VTBits == 32 && STI.getSmVersion() >= 35) { | 
    // For 32-bit shifts on sm_35+, we can use the funnel shift 'shf' instruction.
 |     // {dHi, dLo} = {aHi, aLo} << Amt | 
 |     //   dHi = shf.l.clamp aLo, aHi, Amt | 
 |     //   dLo = aLo << Amt | 
 |  | 
 |     SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, | 
 |                              ShAmt); | 
 |     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); | 
 |  | 
 |     SDValue Ops[2] = { Lo, Hi }; | 
 |     return DAG.getMergeValues(Ops, dl); | 
 |   } | 
 |   else { | 
 |     // {dHi, dLo} = {aHi, aLo} << Amt | 
 |     // - if (Amt>=size) then | 
 |     //      dLo = aLo << Amt (all 0) | 
    //      dHi = aLo << (Amt-size)
 |     //   else | 
 |     //      dLo = aLo << Amt | 
 |     //      dHi = (aHi << Amt) | (aLo >> (size-Amt)) | 
 |  | 
 |     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, | 
 |                                    DAG.getConstant(VTBits, dl, MVT::i32), | 
 |                                    ShAmt); | 
 |     SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); | 
 |     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, | 
 |                                      DAG.getConstant(VTBits, dl, MVT::i32)); | 
 |     SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); | 
 |     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); | 
 |     SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); | 
 |  | 
 |     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, | 
 |                                DAG.getConstant(VTBits, dl, MVT::i32), | 
 |                                ISD::SETGE); | 
 |     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); | 
 |     SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); | 
 |  | 
 |     SDValue Ops[2] = { Lo, Hi }; | 
 |     return DAG.getMergeValues(Ops, dl); | 
 |   } | 
 | } | 
 |  | 
 | SDValue | 
 | NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { | 
 |   switch (Op.getOpcode()) { | 
 |   case ISD::RETURNADDR: | 
 |     return SDValue(); | 
 |   case ISD::FRAMEADDR: | 
 |     return SDValue(); | 
 |   case ISD::GlobalAddress: | 
 |     return LowerGlobalAddress(Op, DAG); | 
 |   case ISD::INTRINSIC_W_CHAIN: | 
 |     return Op; | 
 |   case ISD::BUILD_VECTOR: | 
 |     return LowerBUILD_VECTOR(Op, DAG); | 
 |   case ISD::EXTRACT_SUBVECTOR: | 
 |     return Op; | 
 |   case ISD::EXTRACT_VECTOR_ELT: | 
 |     return LowerEXTRACT_VECTOR_ELT(Op, DAG); | 
 |   case ISD::CONCAT_VECTORS: | 
 |     return LowerCONCAT_VECTORS(Op, DAG); | 
 |   case ISD::STORE: | 
 |     return LowerSTORE(Op, DAG); | 
 |   case ISD::LOAD: | 
 |     return LowerLOAD(Op, DAG); | 
 |   case ISD::SHL_PARTS: | 
 |     return LowerShiftLeftParts(Op, DAG); | 
 |   case ISD::SRA_PARTS: | 
 |   case ISD::SRL_PARTS: | 
 |     return LowerShiftRightParts(Op, DAG); | 
 |   case ISD::SELECT: | 
 |     return LowerSelect(Op, DAG); | 
 |   default: | 
 |     llvm_unreachable("Custom lowering not defined for operation"); | 
 |   } | 
 | } | 
 |  | 
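// LowerSelect - i1 SELECT is lowered by widening the true/false operands to
// i32, doing the select on i32, and truncating the result back to i1.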
 | SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { | 
 |   SDValue Op0 = Op->getOperand(0); | 
 |   SDValue Op1 = Op->getOperand(1); | 
 |   SDValue Op2 = Op->getOperand(2); | 
 |   SDLoc DL(Op.getNode()); | 
 |  | 
 |   assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); | 
 |  | 
 |   Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); | 
 |   Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); | 
 |   SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); | 
 |   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); | 
 |  | 
 |   return Trunc; | 
 | } | 
 |  | 
 | SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { | 
 |   if (Op.getValueType() == MVT::i1) | 
 |     return LowerLOADi1(Op, DAG); | 
 |  | 
 |   // v2f16 is legal, so we can't rely on legalizer to handle unaligned | 
 |   // loads and have to handle it here. | 
 |   if (Op.getValueType() == MVT::v2f16) { | 
 |     LoadSDNode *Load = cast<LoadSDNode>(Op); | 
 |     EVT MemVT = Load->getMemoryVT(); | 
 |     if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, | 
 |                             Load->getAddressSpace(), Load->getAlignment())) { | 
 |       SDValue Ops[2]; | 
 |       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); | 
 |       return DAG.getMergeValues(Ops, SDLoc(Op)); | 
 |     } | 
 |   } | 
 |  | 
 |   return SDValue(); | 
 | } | 
 |  | 
 | // v = ld i1* addr | 
 | //   => | 
 | // v1 = ld i8* addr (-> i16) | 
 | // v = trunc i16 to i1 | 
 | SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { | 
 |   SDNode *Node = Op.getNode(); | 
 |   LoadSDNode *LD = cast<LoadSDNode>(Node); | 
 |   SDLoc dl(Node); | 
 |   assert(LD->getExtensionType() == ISD::NON_EXTLOAD); | 
 |   assert(Node->getValueType(0) == MVT::i1 && | 
 |          "Custom lowering for i1 load only"); | 
 |   SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), | 
 |                               LD->getPointerInfo(), LD->getAlignment(), | 
 |                               LD->getMemOperand()->getFlags()); | 
 |   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); | 
 |   // The legalizer (the caller) is expecting two values from the legalized | 
 |   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() | 
 |   // in LegalizeDAG.cpp which also uses MergeValues. | 
 |   SDValue Ops[] = { result, LD->getChain() }; | 
 |   return DAG.getMergeValues(Ops, dl); | 
 | } | 
 |  | 
 | SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { | 
 |   StoreSDNode *Store = cast<StoreSDNode>(Op); | 
 |   EVT VT = Store->getMemoryVT(); | 
 |  | 
 |   if (VT == MVT::i1) | 
 |     return LowerSTOREi1(Op, DAG); | 
 |  | 
 |   // v2f16 is legal, so we can't rely on legalizer to handle unaligned | 
 |   // stores and have to handle it here. | 
 |   if (VT == MVT::v2f16 && | 
 |       !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, | 
 |                           Store->getAddressSpace(), Store->getAlignment())) | 
 |     return expandUnalignedStore(Store, DAG); | 
 |  | 
 |   if (VT.isVector()) | 
 |     return LowerSTOREVector(Op, DAG); | 
 |  | 
 |   return SDValue(); | 
 | } | 
 |  | 
 | SDValue | 
 | NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { | 
 |   SDNode *N = Op.getNode(); | 
 |   SDValue Val = N->getOperand(1); | 
 |   SDLoc DL(N); | 
 |   EVT ValVT = Val.getValueType(); | 
 |  | 
 |   if (ValVT.isVector()) { | 
 |     // We only handle "native" vector sizes for now, e.g. <4 x double> is not | 
 |     // legal.  We can (and should) split that into 2 stores of <2 x double> here | 
 |     // but I'm leaving that as a TODO for now. | 
 |     if (!ValVT.isSimple()) | 
 |       return SDValue(); | 
 |     switch (ValVT.getSimpleVT().SimpleTy) { | 
 |     default: | 
 |       return SDValue(); | 
 |     case MVT::v2i8: | 
 |     case MVT::v2i16: | 
 |     case MVT::v2i32: | 
 |     case MVT::v2i64: | 
 |     case MVT::v2f16: | 
 |     case MVT::v2f32: | 
 |     case MVT::v2f64: | 
 |     case MVT::v4i8: | 
 |     case MVT::v4i16: | 
 |     case MVT::v4i32: | 
 |     case MVT::v4f16: | 
 |     case MVT::v4f32: | 
 |     case MVT::v8f16: // <4 x f16x2> | 
 |       // This is a "native" vector type | 
 |       break; | 
 |     } | 
 |  | 
 |     MemSDNode *MemSD = cast<MemSDNode>(N); | 
 |     const DataLayout &TD = DAG.getDataLayout(); | 
 |  | 
 |     unsigned Align = MemSD->getAlignment(); | 
 |     unsigned PrefAlign = | 
 |         TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext())); | 
 |     if (Align < PrefAlign) { | 
 |       // This store is not sufficiently aligned, so bail out and let this vector | 
 |       // store be scalarized.  Note that we may still be able to emit smaller | 
 |       // vector stores.  For example, if we are storing a <4 x float> with an | 
 |       // alignment of 8, this check will fail but the legalizer will try again | 
 |       // with 2 x <2 x float>, which will succeed with an alignment of 8. | 
 |       return SDValue(); | 
 |     } | 
 |  | 
 |     unsigned Opcode = 0; | 
 |     EVT EltVT = ValVT.getVectorElementType(); | 
 |     unsigned NumElts = ValVT.getVectorNumElements(); | 
 |  | 
 |     // Since StoreV2 is a target node, we cannot rely on DAG type legalization. | 
 |     // Therefore, we must ensure the type is legal.  For i1 and i8, we set the | 
 |     // stored type to i16 and propagate the "real" type as the memory type. | 
 |     bool NeedExt = false; | 
 |     if (EltVT.getSizeInBits() < 16) | 
 |       NeedExt = true; | 
 |  | 
 |     bool StoreF16x2 = false; | 
 |     switch (NumElts) { | 
 |     default: | 
 |       return SDValue(); | 
 |     case 2: | 
 |       Opcode = NVPTXISD::StoreV2; | 
 |       break; | 
 |     case 4: | 
 |       Opcode = NVPTXISD::StoreV4; | 
 |       break; | 
 |     case 8: | 
 |       // v8f16 is a special case. PTX doesn't have st.v8.f16 | 
 |       // instruction. Instead, we split the vector into v2f16 chunks and | 
 |       // store them with st.v4.b32. | 
 |       assert(EltVT == MVT::f16 && "Wrong type for the vector."); | 
 |       Opcode = NVPTXISD::StoreV4; | 
 |       StoreF16x2 = true; | 
 |       break; | 
 |     } | 
 |  | 
 |     SmallVector<SDValue, 8> Ops; | 
 |  | 
 |     // First is the chain | 
 |     Ops.push_back(N->getOperand(0)); | 
 |  | 
 |     if (StoreF16x2) { | 
 |       // Combine f16,f16 -> v2f16 | 
 |       NumElts /= 2; | 
 |       for (unsigned i = 0; i < NumElts; ++i) { | 
 |         SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, | 
 |                                  DAG.getIntPtrConstant(i * 2, DL)); | 
 |         SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, | 
 |                                  DAG.getIntPtrConstant(i * 2 + 1, DL)); | 
 |         SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1); | 
 |         Ops.push_back(V2); | 
 |       } | 
 |     } else { | 
 |       // Then the split values | 
 |       for (unsigned i = 0; i < NumElts; ++i) { | 
 |         SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, | 
 |                                      DAG.getIntPtrConstant(i, DL)); | 
 |         if (NeedExt) | 
 |           ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); | 
 |         Ops.push_back(ExtVal); | 
 |       } | 
 |     } | 
 |  | 
 |     // Then any remaining arguments | 
 |     Ops.append(N->op_begin() + 2, N->op_end()); | 
 |  | 
 |     SDValue NewSt = | 
 |         DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, | 
 |                                 MemSD->getMemoryVT(), MemSD->getMemOperand()); | 
 |  | 
 |     return NewSt; | 
 |   } | 
 |  | 
 |   return SDValue(); | 
 | } | 
 |  | 
 | // st i1 v, addr | 
 | //    => | 
 | // v1 = zxt v to i16 | 
 | // st.u8 i16, addr | 
 | SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { | 
 |   SDNode *Node = Op.getNode(); | 
 |   SDLoc dl(Node); | 
 |   StoreSDNode *ST = cast<StoreSDNode>(Node); | 
 |   SDValue Tmp1 = ST->getChain(); | 
 |   SDValue Tmp2 = ST->getBasePtr(); | 
 |   SDValue Tmp3 = ST->getValue(); | 
 |   assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); | 
 |   Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); | 
 |   SDValue Result = | 
 |       DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, | 
 |                         ST->getAlignment(), ST->getMemOperand()->getFlags()); | 
 |   return Result; | 
 | } | 
 |  | 
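// getParamSymbol - Return the external symbol used to refer to a formal
// parameter in param space, e.g. "foo_param_1" for parameter index 1 of a
// function named "foo".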
 | SDValue | 
 | NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { | 
 |   std::string ParamSym; | 
 |   raw_string_ostream ParamStr(ParamSym); | 
 |  | 
 |   ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx; | 
 |   ParamStr.flush(); | 
 |  | 
 |   std::string *SavedStr = | 
 |     nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str()); | 
 |   return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); | 
 | } | 
 |  | 
 | // Check to see if the kernel argument is image*_t or sampler_t | 
 |  | 
 | static bool isImageOrSamplerVal(const Value *arg, const Module *context) { | 
 |   static const char *const specialTypes[] = { "struct._image2d_t", | 
 |                                               "struct._image3d_t", | 
 |                                               "struct._sampler_t" }; | 
 |  | 
 |   Type *Ty = arg->getType(); | 
 |   auto *PTy = dyn_cast<PointerType>(Ty); | 
 |  | 
 |   if (!PTy) | 
 |     return false; | 
 |  | 
 |   if (!context) | 
 |     return false; | 
 |  | 
 |   auto *STy = dyn_cast<StructType>(PTy->getElementType()); | 
 |   if (!STy || STy->isLiteral()) | 
 |     return false; | 
 |  | 
 |   return std::find(std::begin(specialTypes), std::end(specialTypes), | 
 |                    STy->getName()) != std::end(specialTypes); | 
 | } | 
 |  | 
 | SDValue NVPTXTargetLowering::LowerFormalArguments( | 
 |     SDValue Chain, CallingConv::ID CallConv, bool isVarArg, | 
 |     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, | 
 |     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { | 
 |   MachineFunction &MF = DAG.getMachineFunction(); | 
 |   const DataLayout &DL = DAG.getDataLayout(); | 
 |   auto PtrVT = getPointerTy(DAG.getDataLayout()); | 
 |  | 
 |   const Function *F = &MF.getFunction(); | 
 |   const AttributeList &PAL = F->getAttributes(); | 
 |   const TargetLowering *TLI = STI.getTargetLowering(); | 
 |  | 
 |   SDValue Root = DAG.getRoot(); | 
 |   std::vector<SDValue> OutChains; | 
 |  | 
 |   bool isABI = (STI.getSmVersion() >= 20); | 
 |   assert(isABI && "Non-ABI compilation is not supported"); | 
 |   if (!isABI) | 
 |     return Chain; | 
 |  | 
 |   std::vector<Type *> argTypes; | 
 |   std::vector<const Argument *> theArgs; | 
 |   for (const Argument &I : F->args()) { | 
 |     theArgs.push_back(&I); | 
 |     argTypes.push_back(I.getType()); | 
 |   } | 
 |   // argTypes.size() (or theArgs.size()) and Ins.size() need not match. | 
 |   // Ins.size() will be larger | 
 |   //   * if there is an aggregate argument with multiple fields (each field | 
 |   //     showing up separately in Ins) | 
 |   //   * if there is a vector argument with more than typical vector-length | 
 |   //     elements (generally if more than 4) where each vector element is | 
 |   //     individually present in Ins. | 
 |   // So a different index should be used for indexing into Ins. | 
 |   // See similar issue in LowerCall. | 
 |   unsigned InsIdx = 0; | 
 |  | 
 |   int idx = 0; | 
 |   for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { | 
 |     Type *Ty = argTypes[i]; | 
 |  | 
    // If the kernel argument is image*_t or sampler_t, convert it to
    // an i32 constant holding the parameter position. This can later be
    // matched in the AsmPrinter to output the correct mangled name.
 |     if (isImageOrSamplerVal( | 
 |             theArgs[i], | 
 |             (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent() | 
 |                                      : nullptr))) { | 
 |       assert(isKernelFunction(*F) && | 
 |              "Only kernels can have image/sampler params"); | 
 |       InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); | 
 |       continue; | 
 |     } | 
 |  | 
 |     if (theArgs[i]->use_empty()) { | 
 |       // argument is dead | 
 |       if (Ty->isAggregateType() || Ty->isIntegerTy(128)) { | 
 |         SmallVector<EVT, 16> vtparts; | 
 |  | 
 |         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); | 
 |         assert(vtparts.size() > 0 && "empty aggregate type not expected"); | 
 |         for (unsigned parti = 0, parte = vtparts.size(); parti != parte; | 
 |              ++parti) { | 
 |           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); | 
 |           ++InsIdx; | 
 |         } | 
 |         if (vtparts.size() > 0) | 
 |           --InsIdx; | 
 |         continue; | 
 |       } | 
 |       if (Ty->isVectorTy()) { | 
 |         EVT ObjectVT = getValueType(DL, Ty); | 
 |         unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); | 
 |         for (unsigned parti = 0; parti < NumRegs; ++parti) { | 
 |           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); | 
 |           ++InsIdx; | 
 |         } | 
 |         if (NumRegs > 0) | 
 |           --InsIdx; | 
 |         continue; | 
 |       } | 
 |       InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); | 
 |       continue; | 
 |     } | 
 |  | 
 |     // In the following cases, assign a node order of "idx+1" | 
 |     // to newly created nodes. The SDNodes for params have to | 
 |     // appear in the same order as their order of appearance | 
 |     // in the original function. "idx+1" holds that order. | 
 |     if (!PAL.hasParamAttribute(i, Attribute::ByVal)) { | 
 |       bool aggregateIsPacked = false; | 
 |       if (StructType *STy = dyn_cast<StructType>(Ty)) | 
 |         aggregateIsPacked = STy->isPacked(); | 
 |  | 
 |       SmallVector<EVT, 16> VTs; | 
 |       SmallVector<uint64_t, 16> Offsets; | 
 |       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); | 
 |       assert(VTs.size() > 0 && "Unexpected empty type."); | 
 |       auto VectorInfo = | 
 |           VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty)); | 
 |  | 
 |       SDValue Arg = getParamSymbol(DAG, idx, PtrVT); | 
 |       int VecIdx = -1; // Index of the first element of the current vector. | 
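      // VectorInfo tags each part with PVF_FIRST/PVF_LAST so that a run of
      // consecutive parts is emitted as one vectorized load below (e.g., with
      // suitable alignment, four consecutive f32 parts may become a single
      // v4f32 load).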
 |       for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { | 
 |         if (VectorInfo[parti] & PVF_FIRST) { | 
 |           assert(VecIdx == -1 && "Orphaned vector."); | 
 |           VecIdx = parti; | 
 |         } | 
 |  | 
        // That's the last element of this load op.
 |         if (VectorInfo[parti] & PVF_LAST) { | 
 |           unsigned NumElts = parti - VecIdx + 1; | 
 |           EVT EltVT = VTs[parti]; | 
 |           // i1 is loaded/stored as i8. | 
 |           EVT LoadVT = EltVT; | 
 |           if (EltVT == MVT::i1) | 
 |             LoadVT = MVT::i8; | 
 |           else if (EltVT == MVT::v2f16) | 
 |             // getLoad needs a vector type, but it can't handle | 
 |             // vectors which contain v2f16 elements. So we must load | 
 |             // using i32 here and then bitcast back. | 
 |             LoadVT = MVT::i32; | 
 |  | 
 |           EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); | 
 |           SDValue VecAddr = | 
 |               DAG.getNode(ISD::ADD, dl, PtrVT, Arg, | 
 |                           DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); | 
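          // Use a null pointer in the param address space as a stand-in IR
          // value, so the MachinePointerInfo on this load records the correct
          // (param) address space.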
 |           Value *srcValue = Constant::getNullValue(PointerType::get( | 
 |               EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); | 
 |           SDValue P = | 
 |               DAG.getLoad(VecVT, dl, Root, VecAddr, | 
 |                           MachinePointerInfo(srcValue), aggregateIsPacked, | 
 |                           MachineMemOperand::MODereferenceable | | 
 |                               MachineMemOperand::MOInvariant); | 
 |           if (P.getNode()) | 
 |             P.getNode()->setIROrder(idx + 1); | 
 |           for (unsigned j = 0; j < NumElts; ++j) { | 
 |             SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, | 
 |                                       DAG.getIntPtrConstant(j, dl)); | 
 |             // We've loaded i1 as an i8 and now must truncate it back to i1 | 
 |             if (EltVT == MVT::i1) | 
 |               Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); | 
 |             // v2f16 was loaded as an i32. Now we must bitcast it back. | 
 |             else if (EltVT == MVT::v2f16) | 
 |               Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); | 
 |             // Extend the element if necessary (e.g. an i8 is loaded | 
 |             // into an i16 register) | 
 |             if (Ins[InsIdx].VT.isInteger() && | 
 |                 Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) { | 
 |               unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND | 
 |                                                            : ISD::ZERO_EXTEND; | 
 |               Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); | 
 |             } | 
 |             InVals.push_back(Elt); | 
 |           } | 
 |  | 
 |           // Reset vector tracking state. | 
 |           VecIdx = -1; | 
 |         } | 
 |         ++InsIdx; | 
 |       } | 
 |       if (VTs.size() > 0) | 
 |         --InsIdx; | 
 |       continue; | 
 |     } | 
 |  | 
    // Param has the ByVal attribute.
    // Return MoveParam(param symbol).
    // Ideally, the param symbol could be returned directly, but when the
    // SDNode builder decides to use it in a CopyToReg(), the machine
    // instruction fails because the TargetExternalSymbol (not lowered) is
    // target dependent, and CopyToReg assumes the source is lowered.
 |     EVT ObjectVT = getValueType(DL, Ty); | 
 |     assert(ObjectVT == Ins[InsIdx].VT && | 
 |            "Ins type did not match function type"); | 
 |     SDValue Arg = getParamSymbol(DAG, idx, PtrVT); | 
 |     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); | 
 |     if (p.getNode()) | 
 |       p.getNode()->setIROrder(idx + 1); | 
 |     InVals.push_back(p); | 
 |   } | 
 |  | 
  // Clang will check for an explicit vararg and issue an error if one is
  // present. However, Clang will let code with an implicit vararg, like f(),
  // pass. See bug 617733. We treat this case as if the arg list is empty.
  // if (F.isVarArg()) {
  //   assert(0 && "VarArg not supported yet!");
  // }
 |  | 
 |   if (!OutChains.empty()) | 
 |     DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); | 
 |  | 
 |   return Chain; | 
 | } | 
 |  | 
 | SDValue | 
 | NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, | 
 |                                  bool isVarArg, | 
 |                                  const SmallVectorImpl<ISD::OutputArg> &Outs, | 
 |                                  const SmallVectorImpl<SDValue> &OutVals, | 
 |                                  const SDLoc &dl, SelectionDAG &DAG) const { | 
 |   MachineFunction &MF = DAG.getMachineFunction(); | 
 |   Type *RetTy = MF.getFunction().getReturnType(); | 
 |  | 
 |   bool isABI = (STI.getSmVersion() >= 20); | 
 |   assert(isABI && "Non-ABI compilation is not supported"); | 
 |   if (!isABI) | 
 |     return Chain; | 
 |  | 
  const DataLayout &DL = DAG.getDataLayout();
 |   SmallVector<EVT, 16> VTs; | 
 |   SmallVector<uint64_t, 16> Offsets; | 
 |   ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); | 
 |   assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); | 
 |  | 
 |   auto VectorInfo = VectorizePTXValueVTs( | 
 |       VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1); | 
 |  | 
  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
  // 32 bits are sign extended or zero extended, depending on whether
  // they are signed or unsigned types.
 |   bool ExtendIntegerRetVal = | 
 |       RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; | 
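  // For example, an i8 or i16 return value is widened to i32 here and stored
  // with a 32-bit StoreRetval below (TheStoreType becomes MVT::i32).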
 |  | 
 |   SmallVector<SDValue, 6> StoreOperands; | 
 |   for (unsigned i = 0, e = VTs.size(); i != e; ++i) { | 
    // New store. Record the chain and offset operands.
 |     if (VectorInfo[i] & PVF_FIRST) { | 
 |       assert(StoreOperands.empty() && "Orphaned operand list."); | 
 |       StoreOperands.push_back(Chain); | 
 |       StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); | 
 |     } | 
 |  | 
 |     SDValue RetVal = OutVals[i]; | 
 |     if (ExtendIntegerRetVal) { | 
 |       RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND | 
 |                                                   : ISD::ZERO_EXTEND, | 
 |                            dl, MVT::i32, RetVal); | 
 |     } else if (RetVal.getValueSizeInBits() < 16) { | 
      // Use 16-bit registers for small return values, as that is the
      // smallest general-purpose register size supported by NVPTX.
 |       RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); | 
 |     } | 
 |  | 
 |     // Record the value to return. | 
 |     StoreOperands.push_back(RetVal); | 
 |  | 
 |     // That's the last element of this store op. | 
 |     if (VectorInfo[i] & PVF_LAST) { | 
 |       NVPTXISD::NodeType Op; | 
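      // StoreOperands is {Chain, Offset, Val0, Val1, ...}, so the number of
      // values being stored excludes the first two entries.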
 |       unsigned NumElts = StoreOperands.size() - 2; | 
 |       switch (NumElts) { | 
 |       case 1: | 
 |         Op = NVPTXISD::StoreRetval; | 
 |         break; | 
 |       case 2: | 
 |         Op = NVPTXISD::StoreRetvalV2; | 
 |         break; | 
 |       case 4: | 
 |         Op = NVPTXISD::StoreRetvalV4; | 
 |         break; | 
 |       default: | 
 |         llvm_unreachable("Invalid vector info."); | 
 |       } | 
 |  | 
      // Adjust the type of the store op if we've extended the scalar
      // return value.
 |       EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; | 
 |       Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other), | 
 |                                       StoreOperands, TheStoreType, | 
 |                                       MachinePointerInfo(), /* Align */ 1, | 
 |                                       MachineMemOperand::MOStore); | 
 |       // Cleanup vector state. | 
 |       StoreOperands.clear(); | 
 |     } | 
 |   } | 
 |  | 
 |   return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); | 
 | } | 
 |  | 
 | void NVPTXTargetLowering::LowerAsmOperandForConstraint( | 
 |     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, | 
 |     SelectionDAG &DAG) const { | 
  if (Constraint.length() > 1)
    return;
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 | } | 
 |  | 
 | static unsigned getOpcForTextureInstr(unsigned Intrinsic) { | 
 |   switch (Intrinsic) { | 
 |   default: | 
 |     return 0; | 
 |  | 
 |   case Intrinsic::nvvm_tex_1d_v4f32_s32: | 
 |     return NVPTXISD::Tex1DFloatS32; | 
 |   case Intrinsic::nvvm_tex_1d_v4f32_f32: | 
 |     return NVPTXISD::Tex1DFloatFloat; | 
 |   case Intrinsic::nvvm_tex_1d_level_v4f32_f32: | 
 |     return NVPTXISD::Tex1DFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: | 
 |     return NVPTXISD::Tex1DFloatFloatGrad; | 
 |   case Intrinsic::nvvm_tex_1d_v4s32_s32: | 
 |     return NVPTXISD::Tex1DS32S32; | 
 |   case Intrinsic::nvvm_tex_1d_v4s32_f32: | 
 |     return NVPTXISD::Tex1DS32Float; | 
 |   case Intrinsic::nvvm_tex_1d_level_v4s32_f32: | 
 |     return NVPTXISD::Tex1DS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: | 
 |     return NVPTXISD::Tex1DS32FloatGrad; | 
 |   case Intrinsic::nvvm_tex_1d_v4u32_s32: | 
 |     return NVPTXISD::Tex1DU32S32; | 
 |   case Intrinsic::nvvm_tex_1d_v4u32_f32: | 
 |     return NVPTXISD::Tex1DU32Float; | 
 |   case Intrinsic::nvvm_tex_1d_level_v4u32_f32: | 
 |     return NVPTXISD::Tex1DU32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: | 
 |     return NVPTXISD::Tex1DU32FloatGrad; | 
 |  | 
 |   case Intrinsic::nvvm_tex_1d_array_v4f32_s32: | 
 |     return NVPTXISD::Tex1DArrayFloatS32; | 
 |   case Intrinsic::nvvm_tex_1d_array_v4f32_f32: | 
 |     return NVPTXISD::Tex1DArrayFloatFloat; | 
 |   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: | 
 |     return NVPTXISD::Tex1DArrayFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: | 
 |     return NVPTXISD::Tex1DArrayFloatFloatGrad; | 
 |   case Intrinsic::nvvm_tex_1d_array_v4s32_s32: | 
 |     return NVPTXISD::Tex1DArrayS32S32; | 
 |   case Intrinsic::nvvm_tex_1d_array_v4s32_f32: | 
 |     return NVPTXISD::Tex1DArrayS32Float; | 
 |   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: | 
 |     return NVPTXISD::Tex1DArrayS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: | 
 |     return NVPTXISD::Tex1DArrayS32FloatGrad; | 
 |   case Intrinsic::nvvm_tex_1d_array_v4u32_s32: | 
 |     return NVPTXISD::Tex1DArrayU32S32; | 
 |   case Intrinsic::nvvm_tex_1d_array_v4u32_f32: | 
 |     return NVPTXISD::Tex1DArrayU32Float; | 
 |   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: | 
 |     return NVPTXISD::Tex1DArrayU32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: | 
 |     return NVPTXISD::Tex1DArrayU32FloatGrad; | 
 |  | 
 |   case Intrinsic::nvvm_tex_2d_v4f32_s32: | 
 |     return NVPTXISD::Tex2DFloatS32; | 
 |   case Intrinsic::nvvm_tex_2d_v4f32_f32: | 
 |     return NVPTXISD::Tex2DFloatFloat; | 
 |   case Intrinsic::nvvm_tex_2d_level_v4f32_f32: | 
 |     return NVPTXISD::Tex2DFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: | 
 |     return NVPTXISD::Tex2DFloatFloatGrad; | 
 |   case Intrinsic::nvvm_tex_2d_v4s32_s32: | 
 |     return NVPTXISD::Tex2DS32S32; | 
 |   case Intrinsic::nvvm_tex_2d_v4s32_f32: | 
 |     return NVPTXISD::Tex2DS32Float; | 
 |   case Intrinsic::nvvm_tex_2d_level_v4s32_f32: | 
 |     return NVPTXISD::Tex2DS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: | 
 |     return NVPTXISD::Tex2DS32FloatGrad; | 
 |   case Intrinsic::nvvm_tex_2d_v4u32_s32: | 
 |     return NVPTXISD::Tex2DU32S32; | 
 |   case Intrinsic::nvvm_tex_2d_v4u32_f32: | 
 |     return NVPTXISD::Tex2DU32Float; | 
 |   case Intrinsic::nvvm_tex_2d_level_v4u32_f32: | 
 |     return NVPTXISD::Tex2DU32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: | 
 |     return NVPTXISD::Tex2DU32FloatGrad; | 
 |  | 
 |   case Intrinsic::nvvm_tex_2d_array_v4f32_s32: | 
 |     return NVPTXISD::Tex2DArrayFloatS32; | 
 |   case Intrinsic::nvvm_tex_2d_array_v4f32_f32: | 
 |     return NVPTXISD::Tex2DArrayFloatFloat; | 
 |   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: | 
 |     return NVPTXISD::Tex2DArrayFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: | 
 |     return NVPTXISD::Tex2DArrayFloatFloatGrad; | 
 |   case Intrinsic::nvvm_tex_2d_array_v4s32_s32: | 
 |     return NVPTXISD::Tex2DArrayS32S32; | 
 |   case Intrinsic::nvvm_tex_2d_array_v4s32_f32: | 
 |     return NVPTXISD::Tex2DArrayS32Float; | 
 |   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: | 
 |     return NVPTXISD::Tex2DArrayS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: | 
 |     return NVPTXISD::Tex2DArrayS32FloatGrad; | 
 |   case Intrinsic::nvvm_tex_2d_array_v4u32_s32: | 
 |     return NVPTXISD::Tex2DArrayU32S32; | 
 |   case Intrinsic::nvvm_tex_2d_array_v4u32_f32: | 
 |     return NVPTXISD::Tex2DArrayU32Float; | 
 |   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: | 
 |     return NVPTXISD::Tex2DArrayU32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: | 
 |     return NVPTXISD::Tex2DArrayU32FloatGrad; | 
 |  | 
 |   case Intrinsic::nvvm_tex_3d_v4f32_s32: | 
 |     return NVPTXISD::Tex3DFloatS32; | 
 |   case Intrinsic::nvvm_tex_3d_v4f32_f32: | 
 |     return NVPTXISD::Tex3DFloatFloat; | 
 |   case Intrinsic::nvvm_tex_3d_level_v4f32_f32: | 
 |     return NVPTXISD::Tex3DFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: | 
 |     return NVPTXISD::Tex3DFloatFloatGrad; | 
 |   case Intrinsic::nvvm_tex_3d_v4s32_s32: | 
 |     return NVPTXISD::Tex3DS32S32; | 
 |   case Intrinsic::nvvm_tex_3d_v4s32_f32: | 
 |     return NVPTXISD::Tex3DS32Float; | 
 |   case Intrinsic::nvvm_tex_3d_level_v4s32_f32: | 
 |     return NVPTXISD::Tex3DS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: | 
 |     return NVPTXISD::Tex3DS32FloatGrad; | 
 |   case Intrinsic::nvvm_tex_3d_v4u32_s32: | 
 |     return NVPTXISD::Tex3DU32S32; | 
 |   case Intrinsic::nvvm_tex_3d_v4u32_f32: | 
 |     return NVPTXISD::Tex3DU32Float; | 
 |   case Intrinsic::nvvm_tex_3d_level_v4u32_f32: | 
 |     return NVPTXISD::Tex3DU32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: | 
 |     return NVPTXISD::Tex3DU32FloatGrad; | 
 |  | 
 |   case Intrinsic::nvvm_tex_cube_v4f32_f32: | 
 |     return NVPTXISD::TexCubeFloatFloat; | 
 |   case Intrinsic::nvvm_tex_cube_level_v4f32_f32: | 
 |     return NVPTXISD::TexCubeFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_cube_v4s32_f32: | 
 |     return NVPTXISD::TexCubeS32Float; | 
 |   case Intrinsic::nvvm_tex_cube_level_v4s32_f32: | 
 |     return NVPTXISD::TexCubeS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_cube_v4u32_f32: | 
 |     return NVPTXISD::TexCubeU32Float; | 
 |   case Intrinsic::nvvm_tex_cube_level_v4u32_f32: | 
 |     return NVPTXISD::TexCubeU32FloatLevel; | 
 |  | 
 |   case Intrinsic::nvvm_tex_cube_array_v4f32_f32: | 
 |     return NVPTXISD::TexCubeArrayFloatFloat; | 
 |   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: | 
 |     return NVPTXISD::TexCubeArrayFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_cube_array_v4s32_f32: | 
 |     return NVPTXISD::TexCubeArrayS32Float; | 
 |   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: | 
 |     return NVPTXISD::TexCubeArrayS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_cube_array_v4u32_f32: | 
 |     return NVPTXISD::TexCubeArrayU32Float; | 
 |   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: | 
 |     return NVPTXISD::TexCubeArrayU32FloatLevel; | 
 |  | 
 |   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: | 
 |     return NVPTXISD::Tld4R2DFloatFloat; | 
 |   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: | 
 |     return NVPTXISD::Tld4G2DFloatFloat; | 
 |   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: | 
 |     return NVPTXISD::Tld4B2DFloatFloat; | 
 |   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: | 
 |     return NVPTXISD::Tld4A2DFloatFloat; | 
 |   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: | 
 |     return NVPTXISD::Tld4R2DS64Float; | 
 |   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: | 
 |     return NVPTXISD::Tld4G2DS64Float; | 
 |   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: | 
 |     return NVPTXISD::Tld4B2DS64Float; | 
 |   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: | 
 |     return NVPTXISD::Tld4A2DS64Float; | 
 |   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: | 
 |     return NVPTXISD::Tld4R2DU64Float; | 
 |   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: | 
 |     return NVPTXISD::Tld4G2DU64Float; | 
 |   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: | 
 |     return NVPTXISD::Tld4B2DU64Float; | 
 |   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: | 
 |     return NVPTXISD::Tld4A2DU64Float; | 
 |  | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: | 
 |     return NVPTXISD::TexUnified1DFloatS32; | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: | 
 |     return NVPTXISD::TexUnified1DFloatFloat; | 
 |   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: | 
 |     return NVPTXISD::TexUnified1DFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: | 
 |     return NVPTXISD::TexUnified1DFloatFloatGrad; | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: | 
 |     return NVPTXISD::TexUnified1DS32S32; | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: | 
 |     return NVPTXISD::TexUnified1DS32Float; | 
 |   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: | 
 |     return NVPTXISD::TexUnified1DS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: | 
 |     return NVPTXISD::TexUnified1DS32FloatGrad; | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: | 
 |     return NVPTXISD::TexUnified1DU32S32; | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: | 
 |     return NVPTXISD::TexUnified1DU32Float; | 
 |   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: | 
 |     return NVPTXISD::TexUnified1DU32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: | 
 |     return NVPTXISD::TexUnified1DU32FloatGrad; | 
 |  | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: | 
 |     return NVPTXISD::TexUnified1DArrayFloatS32; | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: | 
 |     return NVPTXISD::TexUnified1DArrayFloatFloat; | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: | 
 |     return NVPTXISD::TexUnified1DArrayFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: | 
 |     return NVPTXISD::TexUnified1DArrayFloatFloatGrad; | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: | 
 |     return NVPTXISD::TexUnified1DArrayS32S32; | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: | 
 |     return NVPTXISD::TexUnified1DArrayS32Float; | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: | 
 |     return NVPTXISD::TexUnified1DArrayS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: | 
 |     return NVPTXISD::TexUnified1DArrayS32FloatGrad; | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: | 
 |     return NVPTXISD::TexUnified1DArrayU32S32; | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: | 
 |     return NVPTXISD::TexUnified1DArrayU32Float; | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: | 
 |     return NVPTXISD::TexUnified1DArrayU32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: | 
 |     return NVPTXISD::TexUnified1DArrayU32FloatGrad; | 
 |  | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: | 
 |     return NVPTXISD::TexUnified2DFloatS32; | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: | 
 |     return NVPTXISD::TexUnified2DFloatFloat; | 
 |   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: | 
 |     return NVPTXISD::TexUnified2DFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: | 
 |     return NVPTXISD::TexUnified2DFloatFloatGrad; | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: | 
 |     return NVPTXISD::TexUnified2DS32S32; | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: | 
 |     return NVPTXISD::TexUnified2DS32Float; | 
 |   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: | 
 |     return NVPTXISD::TexUnified2DS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: | 
 |     return NVPTXISD::TexUnified2DS32FloatGrad; | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: | 
 |     return NVPTXISD::TexUnified2DU32S32; | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: | 
 |     return NVPTXISD::TexUnified2DU32Float; | 
 |   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: | 
 |     return NVPTXISD::TexUnified2DU32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: | 
 |     return NVPTXISD::TexUnified2DU32FloatGrad; | 
 |  | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: | 
 |     return NVPTXISD::TexUnified2DArrayFloatS32; | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: | 
 |     return NVPTXISD::TexUnified2DArrayFloatFloat; | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: | 
 |     return NVPTXISD::TexUnified2DArrayFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: | 
 |     return NVPTXISD::TexUnified2DArrayFloatFloatGrad; | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: | 
 |     return NVPTXISD::TexUnified2DArrayS32S32; | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: | 
 |     return NVPTXISD::TexUnified2DArrayS32Float; | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: | 
 |     return NVPTXISD::TexUnified2DArrayS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: | 
 |     return NVPTXISD::TexUnified2DArrayS32FloatGrad; | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: | 
 |     return NVPTXISD::TexUnified2DArrayU32S32; | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: | 
 |     return NVPTXISD::TexUnified2DArrayU32Float; | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: | 
 |     return NVPTXISD::TexUnified2DArrayU32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: | 
 |     return NVPTXISD::TexUnified2DArrayU32FloatGrad; | 
 |  | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: | 
 |     return NVPTXISD::TexUnified3DFloatS32; | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: | 
 |     return NVPTXISD::TexUnified3DFloatFloat; | 
 |   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: | 
 |     return NVPTXISD::TexUnified3DFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: | 
 |     return NVPTXISD::TexUnified3DFloatFloatGrad; | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: | 
 |     return NVPTXISD::TexUnified3DS32S32; | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: | 
 |     return NVPTXISD::TexUnified3DS32Float; | 
 |   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: | 
 |     return NVPTXISD::TexUnified3DS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: | 
 |     return NVPTXISD::TexUnified3DS32FloatGrad; | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: | 
 |     return NVPTXISD::TexUnified3DU32S32; | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: | 
 |     return NVPTXISD::TexUnified3DU32Float; | 
 |   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: | 
 |     return NVPTXISD::TexUnified3DU32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: | 
 |     return NVPTXISD::TexUnified3DU32FloatGrad; | 
 |  | 
 |   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeFloatFloat; | 
 |   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeS32Float; | 
 |   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeU32Float; | 
 |   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeU32FloatLevel; | 
 |  | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeArrayFloatFloat; | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeArrayS32Float; | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeArrayU32Float; | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: | 
 |     return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; | 
 |  | 
 |   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: | 
 |     return NVPTXISD::Tld4UnifiedR2DFloatFloat; | 
 |   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: | 
 |     return NVPTXISD::Tld4UnifiedG2DFloatFloat; | 
 |   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: | 
 |     return NVPTXISD::Tld4UnifiedB2DFloatFloat; | 
 |   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: | 
 |     return NVPTXISD::Tld4UnifiedA2DFloatFloat; | 
 |   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: | 
 |     return NVPTXISD::Tld4UnifiedR2DS64Float; | 
 |   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: | 
 |     return NVPTXISD::Tld4UnifiedG2DS64Float; | 
 |   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: | 
 |     return NVPTXISD::Tld4UnifiedB2DS64Float; | 
 |   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: | 
 |     return NVPTXISD::Tld4UnifiedA2DS64Float; | 
 |   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: | 
 |     return NVPTXISD::Tld4UnifiedR2DU64Float; | 
 |   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: | 
 |     return NVPTXISD::Tld4UnifiedG2DU64Float; | 
 |   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: | 
 |     return NVPTXISD::Tld4UnifiedB2DU64Float; | 
 |   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: | 
 |     return NVPTXISD::Tld4UnifiedA2DU64Float; | 
 |   } | 
 | } | 
 |  | 
 | static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { | 
 |   switch (Intrinsic) { | 
 |   default: | 
 |     return 0; | 
 |   case Intrinsic::nvvm_suld_1d_i8_clamp: | 
 |     return NVPTXISD::Suld1DI8Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_i16_clamp: | 
 |     return NVPTXISD::Suld1DI16Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_i32_clamp: | 
 |     return NVPTXISD::Suld1DI32Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_i64_clamp: | 
 |     return NVPTXISD::Suld1DI64Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_v2i8_clamp: | 
 |     return NVPTXISD::Suld1DV2I8Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_v2i16_clamp: | 
 |     return NVPTXISD::Suld1DV2I16Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_v2i32_clamp: | 
 |     return NVPTXISD::Suld1DV2I32Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_v2i64_clamp: | 
 |     return NVPTXISD::Suld1DV2I64Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_v4i8_clamp: | 
 |     return NVPTXISD::Suld1DV4I8Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_v4i16_clamp: | 
 |     return NVPTXISD::Suld1DV4I16Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_v4i32_clamp: | 
 |     return NVPTXISD::Suld1DV4I32Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_array_i8_clamp: | 
 |     return NVPTXISD::Suld1DArrayI8Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_array_i16_clamp: | 
 |     return NVPTXISD::Suld1DArrayI16Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_array_i32_clamp: | 
 |     return NVPTXISD::Suld1DArrayI32Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_array_i64_clamp: | 
 |     return NVPTXISD::Suld1DArrayI64Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: | 
 |     return NVPTXISD::Suld1DArrayV2I8Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: | 
 |     return NVPTXISD::Suld1DArrayV2I16Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: | 
 |     return NVPTXISD::Suld1DArrayV2I32Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: | 
 |     return NVPTXISD::Suld1DArrayV2I64Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: | 
 |     return NVPTXISD::Suld1DArrayV4I8Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: | 
 |     return NVPTXISD::Suld1DArrayV4I16Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: | 
 |     return NVPTXISD::Suld1DArrayV4I32Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_i8_clamp: | 
 |     return NVPTXISD::Suld2DI8Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_i16_clamp: | 
 |     return NVPTXISD::Suld2DI16Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_i32_clamp: | 
 |     return NVPTXISD::Suld2DI32Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_i64_clamp: | 
 |     return NVPTXISD::Suld2DI64Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_v2i8_clamp: | 
 |     return NVPTXISD::Suld2DV2I8Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_v2i16_clamp: | 
 |     return NVPTXISD::Suld2DV2I16Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_v2i32_clamp: | 
 |     return NVPTXISD::Suld2DV2I32Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_v2i64_clamp: | 
 |     return NVPTXISD::Suld2DV2I64Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_v4i8_clamp: | 
 |     return NVPTXISD::Suld2DV4I8Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_v4i16_clamp: | 
 |     return NVPTXISD::Suld2DV4I16Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_v4i32_clamp: | 
 |     return NVPTXISD::Suld2DV4I32Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_array_i8_clamp: | 
 |     return NVPTXISD::Suld2DArrayI8Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_array_i16_clamp: | 
 |     return NVPTXISD::Suld2DArrayI16Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_array_i32_clamp: | 
 |     return NVPTXISD::Suld2DArrayI32Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_array_i64_clamp: | 
 |     return NVPTXISD::Suld2DArrayI64Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: | 
 |     return NVPTXISD::Suld2DArrayV2I8Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: | 
 |     return NVPTXISD::Suld2DArrayV2I16Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: | 
 |     return NVPTXISD::Suld2DArrayV2I32Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: | 
 |     return NVPTXISD::Suld2DArrayV2I64Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: | 
 |     return NVPTXISD::Suld2DArrayV4I8Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: | 
 |     return NVPTXISD::Suld2DArrayV4I16Clamp; | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: | 
 |     return NVPTXISD::Suld2DArrayV4I32Clamp; | 
 |   case Intrinsic::nvvm_suld_3d_i8_clamp: | 
 |     return NVPTXISD::Suld3DI8Clamp; | 
 |   case Intrinsic::nvvm_suld_3d_i16_clamp: | 
 |     return NVPTXISD::Suld3DI16Clamp; | 
 |   case Intrinsic::nvvm_suld_3d_i32_clamp: | 
 |     return NVPTXISD::Suld3DI32Clamp; | 
 |   case Intrinsic::nvvm_suld_3d_i64_clamp: | 
 |     return NVPTXISD::Suld3DI64Clamp; | 
 |   case Intrinsic::nvvm_suld_3d_v2i8_clamp: | 
 |     return NVPTXISD::Suld3DV2I8Clamp; | 
 |   case Intrinsic::nvvm_suld_3d_v2i16_clamp: | 
 |     return NVPTXISD::Suld3DV2I16Clamp; | 
 |   case Intrinsic::nvvm_suld_3d_v2i32_clamp: | 
 |     return NVPTXISD::Suld3DV2I32Clamp; | 
 |   case Intrinsic::nvvm_suld_3d_v2i64_clamp: | 
 |     return NVPTXISD::Suld3DV2I64Clamp; | 
 |   case Intrinsic::nvvm_suld_3d_v4i8_clamp: | 
 |     return NVPTXISD::Suld3DV4I8Clamp; | 
 |   case Intrinsic::nvvm_suld_3d_v4i16_clamp: | 
 |     return NVPTXISD::Suld3DV4I16Clamp; | 
 |   case Intrinsic::nvvm_suld_3d_v4i32_clamp: | 
 |     return NVPTXISD::Suld3DV4I32Clamp; | 
 |   case Intrinsic::nvvm_suld_1d_i8_trap: | 
 |     return NVPTXISD::Suld1DI8Trap; | 
 |   case Intrinsic::nvvm_suld_1d_i16_trap: | 
 |     return NVPTXISD::Suld1DI16Trap; | 
 |   case Intrinsic::nvvm_suld_1d_i32_trap: | 
 |     return NVPTXISD::Suld1DI32Trap; | 
 |   case Intrinsic::nvvm_suld_1d_i64_trap: | 
 |     return NVPTXISD::Suld1DI64Trap; | 
 |   case Intrinsic::nvvm_suld_1d_v2i8_trap: | 
 |     return NVPTXISD::Suld1DV2I8Trap; | 
 |   case Intrinsic::nvvm_suld_1d_v2i16_trap: | 
 |     return NVPTXISD::Suld1DV2I16Trap; | 
 |   case Intrinsic::nvvm_suld_1d_v2i32_trap: | 
 |     return NVPTXISD::Suld1DV2I32Trap; | 
 |   case Intrinsic::nvvm_suld_1d_v2i64_trap: | 
 |     return NVPTXISD::Suld1DV2I64Trap; | 
 |   case Intrinsic::nvvm_suld_1d_v4i8_trap: | 
 |     return NVPTXISD::Suld1DV4I8Trap; | 
 |   case Intrinsic::nvvm_suld_1d_v4i16_trap: | 
 |     return NVPTXISD::Suld1DV4I16Trap; | 
 |   case Intrinsic::nvvm_suld_1d_v4i32_trap: | 
 |     return NVPTXISD::Suld1DV4I32Trap; | 
 |   case Intrinsic::nvvm_suld_1d_array_i8_trap: | 
 |     return NVPTXISD::Suld1DArrayI8Trap; | 
 |   case Intrinsic::nvvm_suld_1d_array_i16_trap: | 
 |     return NVPTXISD::Suld1DArrayI16Trap; | 
 |   case Intrinsic::nvvm_suld_1d_array_i32_trap: | 
 |     return NVPTXISD::Suld1DArrayI32Trap; | 
 |   case Intrinsic::nvvm_suld_1d_array_i64_trap: | 
 |     return NVPTXISD::Suld1DArrayI64Trap; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i8_trap: | 
 |     return NVPTXISD::Suld1DArrayV2I8Trap; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i16_trap: | 
 |     return NVPTXISD::Suld1DArrayV2I16Trap; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i32_trap: | 
 |     return NVPTXISD::Suld1DArrayV2I32Trap; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i64_trap: | 
 |     return NVPTXISD::Suld1DArrayV2I64Trap; | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i8_trap: | 
 |     return NVPTXISD::Suld1DArrayV4I8Trap; | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i16_trap: | 
 |     return NVPTXISD::Suld1DArrayV4I16Trap; | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i32_trap: | 
 |     return NVPTXISD::Suld1DArrayV4I32Trap; | 
 |   case Intrinsic::nvvm_suld_2d_i8_trap: | 
 |     return NVPTXISD::Suld2DI8Trap; | 
 |   case Intrinsic::nvvm_suld_2d_i16_trap: | 
 |     return NVPTXISD::Suld2DI16Trap; | 
 |   case Intrinsic::nvvm_suld_2d_i32_trap: | 
 |     return NVPTXISD::Suld2DI32Trap; | 
 |   case Intrinsic::nvvm_suld_2d_i64_trap: | 
 |     return NVPTXISD::Suld2DI64Trap; | 
 |   case Intrinsic::nvvm_suld_2d_v2i8_trap: | 
 |     return NVPTXISD::Suld2DV2I8Trap; | 
 |   case Intrinsic::nvvm_suld_2d_v2i16_trap: | 
 |     return NVPTXISD::Suld2DV2I16Trap; | 
 |   case Intrinsic::nvvm_suld_2d_v2i32_trap: | 
 |     return NVPTXISD::Suld2DV2I32Trap; | 
 |   case Intrinsic::nvvm_suld_2d_v2i64_trap: | 
 |     return NVPTXISD::Suld2DV2I64Trap; | 
 |   case Intrinsic::nvvm_suld_2d_v4i8_trap: | 
 |     return NVPTXISD::Suld2DV4I8Trap; | 
 |   case Intrinsic::nvvm_suld_2d_v4i16_trap: | 
 |     return NVPTXISD::Suld2DV4I16Trap; | 
 |   case Intrinsic::nvvm_suld_2d_v4i32_trap: | 
 |     return NVPTXISD::Suld2DV4I32Trap; | 
 |   case Intrinsic::nvvm_suld_2d_array_i8_trap: | 
 |     return NVPTXISD::Suld2DArrayI8Trap; | 
 |   case Intrinsic::nvvm_suld_2d_array_i16_trap: | 
 |     return NVPTXISD::Suld2DArrayI16Trap; | 
 |   case Intrinsic::nvvm_suld_2d_array_i32_trap: | 
 |     return NVPTXISD::Suld2DArrayI32Trap; | 
 |   case Intrinsic::nvvm_suld_2d_array_i64_trap: | 
 |     return NVPTXISD::Suld2DArrayI64Trap; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i8_trap: | 
 |     return NVPTXISD::Suld2DArrayV2I8Trap; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i16_trap: | 
 |     return NVPTXISD::Suld2DArrayV2I16Trap; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i32_trap: | 
 |     return NVPTXISD::Suld2DArrayV2I32Trap; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i64_trap: | 
 |     return NVPTXISD::Suld2DArrayV2I64Trap; | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i8_trap: | 
 |     return NVPTXISD::Suld2DArrayV4I8Trap; | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i16_trap: | 
 |     return NVPTXISD::Suld2DArrayV4I16Trap; | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i32_trap: | 
 |     return NVPTXISD::Suld2DArrayV4I32Trap; | 
 |   case Intrinsic::nvvm_suld_3d_i8_trap: | 
 |     return NVPTXISD::Suld3DI8Trap; | 
 |   case Intrinsic::nvvm_suld_3d_i16_trap: | 
 |     return NVPTXISD::Suld3DI16Trap; | 
 |   case Intrinsic::nvvm_suld_3d_i32_trap: | 
 |     return NVPTXISD::Suld3DI32Trap; | 
 |   case Intrinsic::nvvm_suld_3d_i64_trap: | 
 |     return NVPTXISD::Suld3DI64Trap; | 
 |   case Intrinsic::nvvm_suld_3d_v2i8_trap: | 
 |     return NVPTXISD::Suld3DV2I8Trap; | 
 |   case Intrinsic::nvvm_suld_3d_v2i16_trap: | 
 |     return NVPTXISD::Suld3DV2I16Trap; | 
 |   case Intrinsic::nvvm_suld_3d_v2i32_trap: | 
 |     return NVPTXISD::Suld3DV2I32Trap; | 
 |   case Intrinsic::nvvm_suld_3d_v2i64_trap: | 
 |     return NVPTXISD::Suld3DV2I64Trap; | 
 |   case Intrinsic::nvvm_suld_3d_v4i8_trap: | 
 |     return NVPTXISD::Suld3DV4I8Trap; | 
 |   case Intrinsic::nvvm_suld_3d_v4i16_trap: | 
 |     return NVPTXISD::Suld3DV4I16Trap; | 
 |   case Intrinsic::nvvm_suld_3d_v4i32_trap: | 
 |     return NVPTXISD::Suld3DV4I32Trap; | 
 |   case Intrinsic::nvvm_suld_1d_i8_zero: | 
 |     return NVPTXISD::Suld1DI8Zero; | 
 |   case Intrinsic::nvvm_suld_1d_i16_zero: | 
 |     return NVPTXISD::Suld1DI16Zero; | 
 |   case Intrinsic::nvvm_suld_1d_i32_zero: | 
 |     return NVPTXISD::Suld1DI32Zero; | 
 |   case Intrinsic::nvvm_suld_1d_i64_zero: | 
 |     return NVPTXISD::Suld1DI64Zero; | 
 |   case Intrinsic::nvvm_suld_1d_v2i8_zero: | 
 |     return NVPTXISD::Suld1DV2I8Zero; | 
 |   case Intrinsic::nvvm_suld_1d_v2i16_zero: | 
 |     return NVPTXISD::Suld1DV2I16Zero; | 
 |   case Intrinsic::nvvm_suld_1d_v2i32_zero: | 
 |     return NVPTXISD::Suld1DV2I32Zero; | 
 |   case Intrinsic::nvvm_suld_1d_v2i64_zero: | 
 |     return NVPTXISD::Suld1DV2I64Zero; | 
 |   case Intrinsic::nvvm_suld_1d_v4i8_zero: | 
 |     return NVPTXISD::Suld1DV4I8Zero; | 
 |   case Intrinsic::nvvm_suld_1d_v4i16_zero: | 
 |     return NVPTXISD::Suld1DV4I16Zero; | 
 |   case Intrinsic::nvvm_suld_1d_v4i32_zero: | 
 |     return NVPTXISD::Suld1DV4I32Zero; | 
 |   case Intrinsic::nvvm_suld_1d_array_i8_zero: | 
 |     return NVPTXISD::Suld1DArrayI8Zero; | 
 |   case Intrinsic::nvvm_suld_1d_array_i16_zero: | 
 |     return NVPTXISD::Suld1DArrayI16Zero; | 
 |   case Intrinsic::nvvm_suld_1d_array_i32_zero: | 
 |     return NVPTXISD::Suld1DArrayI32Zero; | 
 |   case Intrinsic::nvvm_suld_1d_array_i64_zero: | 
 |     return NVPTXISD::Suld1DArrayI64Zero; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i8_zero: | 
 |     return NVPTXISD::Suld1DArrayV2I8Zero; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i16_zero: | 
 |     return NVPTXISD::Suld1DArrayV2I16Zero; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i32_zero: | 
 |     return NVPTXISD::Suld1DArrayV2I32Zero; | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i64_zero: | 
 |     return NVPTXISD::Suld1DArrayV2I64Zero; | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i8_zero: | 
 |     return NVPTXISD::Suld1DArrayV4I8Zero; | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i16_zero: | 
 |     return NVPTXISD::Suld1DArrayV4I16Zero; | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i32_zero: | 
 |     return NVPTXISD::Suld1DArrayV4I32Zero; | 
 |   case Intrinsic::nvvm_suld_2d_i8_zero: | 
 |     return NVPTXISD::Suld2DI8Zero; | 
 |   case Intrinsic::nvvm_suld_2d_i16_zero: | 
 |     return NVPTXISD::Suld2DI16Zero; | 
 |   case Intrinsic::nvvm_suld_2d_i32_zero: | 
 |     return NVPTXISD::Suld2DI32Zero; | 
 |   case Intrinsic::nvvm_suld_2d_i64_zero: | 
 |     return NVPTXISD::Suld2DI64Zero; | 
 |   case Intrinsic::nvvm_suld_2d_v2i8_zero: | 
 |     return NVPTXISD::Suld2DV2I8Zero; | 
 |   case Intrinsic::nvvm_suld_2d_v2i16_zero: | 
 |     return NVPTXISD::Suld2DV2I16Zero; | 
 |   case Intrinsic::nvvm_suld_2d_v2i32_zero: | 
 |     return NVPTXISD::Suld2DV2I32Zero; | 
 |   case Intrinsic::nvvm_suld_2d_v2i64_zero: | 
 |     return NVPTXISD::Suld2DV2I64Zero; | 
 |   case Intrinsic::nvvm_suld_2d_v4i8_zero: | 
 |     return NVPTXISD::Suld2DV4I8Zero; | 
 |   case Intrinsic::nvvm_suld_2d_v4i16_zero: | 
 |     return NVPTXISD::Suld2DV4I16Zero; | 
 |   case Intrinsic::nvvm_suld_2d_v4i32_zero: | 
 |     return NVPTXISD::Suld2DV4I32Zero; | 
 |   case Intrinsic::nvvm_suld_2d_array_i8_zero: | 
 |     return NVPTXISD::Suld2DArrayI8Zero; | 
 |   case Intrinsic::nvvm_suld_2d_array_i16_zero: | 
 |     return NVPTXISD::Suld2DArrayI16Zero; | 
 |   case Intrinsic::nvvm_suld_2d_array_i32_zero: | 
 |     return NVPTXISD::Suld2DArrayI32Zero; | 
 |   case Intrinsic::nvvm_suld_2d_array_i64_zero: | 
 |     return NVPTXISD::Suld2DArrayI64Zero; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i8_zero: | 
 |     return NVPTXISD::Suld2DArrayV2I8Zero; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i16_zero: | 
 |     return NVPTXISD::Suld2DArrayV2I16Zero; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i32_zero: | 
 |     return NVPTXISD::Suld2DArrayV2I32Zero; | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i64_zero: | 
 |     return NVPTXISD::Suld2DArrayV2I64Zero; | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i8_zero: | 
 |     return NVPTXISD::Suld2DArrayV4I8Zero; | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i16_zero: | 
 |     return NVPTXISD::Suld2DArrayV4I16Zero; | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i32_zero: | 
 |     return NVPTXISD::Suld2DArrayV4I32Zero; | 
 |   case Intrinsic::nvvm_suld_3d_i8_zero: | 
 |     return NVPTXISD::Suld3DI8Zero; | 
 |   case Intrinsic::nvvm_suld_3d_i16_zero: | 
 |     return NVPTXISD::Suld3DI16Zero; | 
 |   case Intrinsic::nvvm_suld_3d_i32_zero: | 
 |     return NVPTXISD::Suld3DI32Zero; | 
 |   case Intrinsic::nvvm_suld_3d_i64_zero: | 
 |     return NVPTXISD::Suld3DI64Zero; | 
 |   case Intrinsic::nvvm_suld_3d_v2i8_zero: | 
 |     return NVPTXISD::Suld3DV2I8Zero; | 
 |   case Intrinsic::nvvm_suld_3d_v2i16_zero: | 
 |     return NVPTXISD::Suld3DV2I16Zero; | 
 |   case Intrinsic::nvvm_suld_3d_v2i32_zero: | 
 |     return NVPTXISD::Suld3DV2I32Zero; | 
 |   case Intrinsic::nvvm_suld_3d_v2i64_zero: | 
 |     return NVPTXISD::Suld3DV2I64Zero; | 
 |   case Intrinsic::nvvm_suld_3d_v4i8_zero: | 
 |     return NVPTXISD::Suld3DV4I8Zero; | 
 |   case Intrinsic::nvvm_suld_3d_v4i16_zero: | 
 |     return NVPTXISD::Suld3DV4I16Zero; | 
 |   case Intrinsic::nvvm_suld_3d_v4i32_zero: | 
 |     return NVPTXISD::Suld3DV4I32Zero; | 
 |   } | 
 | } | 
 |  | 
// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need the information that is only available in
// the "Value" type of the destination pointer. In particular, the address
// space information.
 | bool NVPTXTargetLowering::getTgtMemIntrinsic( | 
 |     IntrinsicInfo &Info, const CallInst &I, | 
 |     MachineFunction &MF, unsigned Intrinsic) const { | 
 |   switch (Intrinsic) { | 
 |   default: | 
 |     return false; | 
 |   case Intrinsic::nvvm_match_all_sync_i32p: | 
 |   case Intrinsic::nvvm_match_all_sync_i64p: | 
 |     Info.opc = ISD::INTRINSIC_W_CHAIN; | 
    // memVT is bogus. These intrinsics have the IntrInaccessibleMemOnly
    // attribute in order to model data exchange with other threads, but
    // perform no real memory accesses.
 |     Info.memVT = MVT::i1; | 
 |  | 
    // Our result depends on both our own and other threads' arguments.
 |     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; | 
 |     return true; | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { | 
 |     Info.opc = ISD::INTRINSIC_W_CHAIN; | 
 |     Info.memVT = MVT::v8f16; | 
 |     Info.ptrVal = I.getArgOperand(0); | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad; | 
 |     Info.align = 16; | 
 |     return true; | 
 |   } | 
 |  | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { | 
 |     Info.opc = ISD::INTRINSIC_W_CHAIN; | 
 |     Info.memVT = MVT::v4f16; | 
 |     Info.ptrVal = I.getArgOperand(0); | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad; | 
 |     Info.align = 16; | 
 |     return true; | 
 |   } | 
 |  | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: { | 
 |     Info.opc = ISD::INTRINSIC_W_CHAIN; | 
 |     Info.memVT = MVT::v8f32; | 
 |     Info.ptrVal = I.getArgOperand(0); | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad; | 
 |     Info.align = 16; | 
 |     return true; | 
 |   } | 
 |  | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { | 
 |     Info.opc = ISD::INTRINSIC_VOID; | 
 |     Info.memVT = MVT::v4f16; | 
 |     Info.ptrVal = I.getArgOperand(0); | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOStore; | 
 |     Info.align = 16; | 
 |     return true; | 
 |   } | 
 |  | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: | 
 |   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: { | 
 |     Info.opc = ISD::INTRINSIC_VOID; | 
 |     Info.memVT = MVT::v8f32; | 
 |     Info.ptrVal = I.getArgOperand(0); | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOStore; | 
 |     Info.align = 16; | 
 |     return true; | 
 |   } | 
 |  | 
 |   case Intrinsic::nvvm_atomic_load_add_f32: | 
 |   case Intrinsic::nvvm_atomic_load_add_f64: | 
 |   case Intrinsic::nvvm_atomic_load_inc_32: | 
 |   case Intrinsic::nvvm_atomic_load_dec_32: | 
 |  | 
 |   case Intrinsic::nvvm_atomic_add_gen_f_cta: | 
 |   case Intrinsic::nvvm_atomic_add_gen_f_sys: | 
 |   case Intrinsic::nvvm_atomic_add_gen_i_cta: | 
 |   case Intrinsic::nvvm_atomic_add_gen_i_sys: | 
 |   case Intrinsic::nvvm_atomic_and_gen_i_cta: | 
 |   case Intrinsic::nvvm_atomic_and_gen_i_sys: | 
 |   case Intrinsic::nvvm_atomic_cas_gen_i_cta: | 
 |   case Intrinsic::nvvm_atomic_cas_gen_i_sys: | 
 |   case Intrinsic::nvvm_atomic_dec_gen_i_cta: | 
 |   case Intrinsic::nvvm_atomic_dec_gen_i_sys: | 
 |   case Intrinsic::nvvm_atomic_inc_gen_i_cta: | 
 |   case Intrinsic::nvvm_atomic_inc_gen_i_sys: | 
 |   case Intrinsic::nvvm_atomic_max_gen_i_cta: | 
 |   case Intrinsic::nvvm_atomic_max_gen_i_sys: | 
 |   case Intrinsic::nvvm_atomic_min_gen_i_cta: | 
 |   case Intrinsic::nvvm_atomic_min_gen_i_sys: | 
 |   case Intrinsic::nvvm_atomic_or_gen_i_cta: | 
 |   case Intrinsic::nvvm_atomic_or_gen_i_sys: | 
 |   case Intrinsic::nvvm_atomic_exch_gen_i_cta: | 
 |   case Intrinsic::nvvm_atomic_exch_gen_i_sys: | 
 |   case Intrinsic::nvvm_atomic_xor_gen_i_cta: | 
 |   case Intrinsic::nvvm_atomic_xor_gen_i_sys: { | 
 |     auto &DL = I.getModule()->getDataLayout(); | 
 |     Info.opc = ISD::INTRINSIC_W_CHAIN; | 
 |     Info.memVT = getValueType(DL, I.getType()); | 
 |     Info.ptrVal = I.getArgOperand(0); | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; | 
 |     Info.align = 0; | 
 |     return true; | 
 |   } | 
 |  | 
 |   case Intrinsic::nvvm_ldu_global_i: | 
 |   case Intrinsic::nvvm_ldu_global_f: | 
 |   case Intrinsic::nvvm_ldu_global_p: { | 
 |     auto &DL = I.getModule()->getDataLayout(); | 
 |     Info.opc = ISD::INTRINSIC_W_CHAIN; | 
 |     if (Intrinsic == Intrinsic::nvvm_ldu_global_i) | 
 |       Info.memVT = getValueType(DL, I.getType()); | 
    else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
 |       Info.memVT = getPointerTy(DL); | 
 |     else | 
 |       Info.memVT = getValueType(DL, I.getType()); | 
 |     Info.ptrVal = I.getArgOperand(0); | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad; | 
 |     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); | 
 |  | 
 |     return true; | 
 |   } | 
 |   case Intrinsic::nvvm_ldg_global_i: | 
 |   case Intrinsic::nvvm_ldg_global_f: | 
 |   case Intrinsic::nvvm_ldg_global_p: { | 
 |     auto &DL = I.getModule()->getDataLayout(); | 
 |  | 
 |     Info.opc = ISD::INTRINSIC_W_CHAIN; | 
 |     if (Intrinsic == Intrinsic::nvvm_ldg_global_i) | 
 |       Info.memVT = getValueType(DL, I.getType()); | 
    else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
 |       Info.memVT = getPointerTy(DL); | 
 |     else | 
 |       Info.memVT = getValueType(DL, I.getType()); | 
 |     Info.ptrVal = I.getArgOperand(0); | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad; | 
 |     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); | 
 |  | 
 |     return true; | 
 |   } | 
 |  | 
 |   case Intrinsic::nvvm_tex_1d_v4f32_s32: | 
 |   case Intrinsic::nvvm_tex_1d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_array_v4f32_s32: | 
 |   case Intrinsic::nvvm_tex_1d_array_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_v4f32_s32: | 
 |   case Intrinsic::nvvm_tex_2d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_array_v4f32_s32: | 
 |   case Intrinsic::nvvm_tex_2d_array_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_3d_v4f32_s32: | 
 |   case Intrinsic::nvvm_tex_3d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_3d_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_array_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: | 
 |     Info.opc = getOpcForTextureInstr(Intrinsic); | 
 |     Info.memVT = MVT::v4f32; | 
 |     Info.ptrVal = nullptr; | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad; | 
 |     Info.align = 16; | 
 |     return true; | 
 |  | 
 |   case Intrinsic::nvvm_tex_1d_v4s32_s32: | 
 |   case Intrinsic::nvvm_tex_1d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_array_v4s32_s32: | 
 |   case Intrinsic::nvvm_tex_1d_array_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_v4s32_s32: | 
 |   case Intrinsic::nvvm_tex_2d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_array_v4s32_s32: | 
 |   case Intrinsic::nvvm_tex_2d_array_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_3d_v4s32_s32: | 
 |   case Intrinsic::nvvm_tex_3d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_3d_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_array_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_array_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_v4u32_s32: | 
 |   case Intrinsic::nvvm_tex_1d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_array_v4u32_s32: | 
 |   case Intrinsic::nvvm_tex_1d_array_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_v4u32_s32: | 
 |   case Intrinsic::nvvm_tex_2d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_array_v4u32_s32: | 
 |   case Intrinsic::nvvm_tex_2d_array_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_3d_v4u32_s32: | 
 |   case Intrinsic::nvvm_tex_3d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_3d_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: | 
 |   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: | 
 |   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: | 
 |   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: | 
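    // The signed and unsigned 32-bit texture and tld4 reads above all return
    // four 32-bit values; model them as a v4i32 load.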
 |     Info.opc = getOpcForTextureInstr(Intrinsic); | 
 |     Info.memVT = MVT::v4i32; | 
 |     Info.ptrVal = nullptr; | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad; | 
 |     Info.align = 16; | 
 |     return true; | 
 |  | 
 |   case Intrinsic::nvvm_suld_1d_i8_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_v2i8_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_v4i8_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_array_i8_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_i8_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_v2i8_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_v4i8_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_array_i8_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: | 
 |   case Intrinsic::nvvm_suld_3d_i8_clamp: | 
 |   case Intrinsic::nvvm_suld_3d_v2i8_clamp: | 
 |   case Intrinsic::nvvm_suld_3d_v4i8_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_i8_trap: | 
 |   case Intrinsic::nvvm_suld_1d_v2i8_trap: | 
 |   case Intrinsic::nvvm_suld_1d_v4i8_trap: | 
 |   case Intrinsic::nvvm_suld_1d_array_i8_trap: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i8_trap: | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i8_trap: | 
 |   case Intrinsic::nvvm_suld_2d_i8_trap: | 
 |   case Intrinsic::nvvm_suld_2d_v2i8_trap: | 
 |   case Intrinsic::nvvm_suld_2d_v4i8_trap: | 
 |   case Intrinsic::nvvm_suld_2d_array_i8_trap: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i8_trap: | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i8_trap: | 
 |   case Intrinsic::nvvm_suld_3d_i8_trap: | 
 |   case Intrinsic::nvvm_suld_3d_v2i8_trap: | 
 |   case Intrinsic::nvvm_suld_3d_v4i8_trap: | 
 |   case Intrinsic::nvvm_suld_1d_i8_zero: | 
 |   case Intrinsic::nvvm_suld_1d_v2i8_zero: | 
 |   case Intrinsic::nvvm_suld_1d_v4i8_zero: | 
 |   case Intrinsic::nvvm_suld_1d_array_i8_zero: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i8_zero: | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i8_zero: | 
 |   case Intrinsic::nvvm_suld_2d_i8_zero: | 
 |   case Intrinsic::nvvm_suld_2d_v2i8_zero: | 
 |   case Intrinsic::nvvm_suld_2d_v4i8_zero: | 
 |   case Intrinsic::nvvm_suld_2d_array_i8_zero: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i8_zero: | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i8_zero: | 
 |   case Intrinsic::nvvm_suld_3d_i8_zero: | 
 |   case Intrinsic::nvvm_suld_3d_v2i8_zero: | 
 |   case Intrinsic::nvvm_suld_3d_v4i8_zero: | 
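    // Surface loads (suld.*) are modeled as plain loads; this group and the
    // i16, i32 and i64 groups below differ only in the element width recorded
    // as the memory type.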
 |     Info.opc = getOpcForSurfaceInstr(Intrinsic); | 
 |     Info.memVT = MVT::i8; | 
 |     Info.ptrVal = nullptr; | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad; | 
 |     Info.align = 16; | 
 |     return true; | 
 |  | 
 |   case Intrinsic::nvvm_suld_1d_i16_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_v2i16_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_v4i16_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_array_i16_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_i16_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_v2i16_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_v4i16_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_array_i16_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: | 
 |   case Intrinsic::nvvm_suld_3d_i16_clamp: | 
 |   case Intrinsic::nvvm_suld_3d_v2i16_clamp: | 
 |   case Intrinsic::nvvm_suld_3d_v4i16_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_i16_trap: | 
 |   case Intrinsic::nvvm_suld_1d_v2i16_trap: | 
 |   case Intrinsic::nvvm_suld_1d_v4i16_trap: | 
 |   case Intrinsic::nvvm_suld_1d_array_i16_trap: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i16_trap: | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i16_trap: | 
 |   case Intrinsic::nvvm_suld_2d_i16_trap: | 
 |   case Intrinsic::nvvm_suld_2d_v2i16_trap: | 
 |   case Intrinsic::nvvm_suld_2d_v4i16_trap: | 
 |   case Intrinsic::nvvm_suld_2d_array_i16_trap: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i16_trap: | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i16_trap: | 
 |   case Intrinsic::nvvm_suld_3d_i16_trap: | 
 |   case Intrinsic::nvvm_suld_3d_v2i16_trap: | 
 |   case Intrinsic::nvvm_suld_3d_v4i16_trap: | 
 |   case Intrinsic::nvvm_suld_1d_i16_zero: | 
 |   case Intrinsic::nvvm_suld_1d_v2i16_zero: | 
 |   case Intrinsic::nvvm_suld_1d_v4i16_zero: | 
 |   case Intrinsic::nvvm_suld_1d_array_i16_zero: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i16_zero: | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i16_zero: | 
 |   case Intrinsic::nvvm_suld_2d_i16_zero: | 
 |   case Intrinsic::nvvm_suld_2d_v2i16_zero: | 
 |   case Intrinsic::nvvm_suld_2d_v4i16_zero: | 
 |   case Intrinsic::nvvm_suld_2d_array_i16_zero: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i16_zero: | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i16_zero: | 
 |   case Intrinsic::nvvm_suld_3d_i16_zero: | 
 |   case Intrinsic::nvvm_suld_3d_v2i16_zero: | 
 |   case Intrinsic::nvvm_suld_3d_v4i16_zero: | 
 |     Info.opc = getOpcForSurfaceInstr(Intrinsic); | 
 |     Info.memVT = MVT::i16; | 
 |     Info.ptrVal = nullptr; | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad; | 
 |     Info.align = 16; | 
 |     return true; | 
 |  | 
 |   case Intrinsic::nvvm_suld_1d_i32_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_v2i32_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_v4i32_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_array_i32_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_i32_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_v2i32_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_v4i32_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_array_i32_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: | 
 |   case Intrinsic::nvvm_suld_3d_i32_clamp: | 
 |   case Intrinsic::nvvm_suld_3d_v2i32_clamp: | 
 |   case Intrinsic::nvvm_suld_3d_v4i32_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_i32_trap: | 
 |   case Intrinsic::nvvm_suld_1d_v2i32_trap: | 
 |   case Intrinsic::nvvm_suld_1d_v4i32_trap: | 
 |   case Intrinsic::nvvm_suld_1d_array_i32_trap: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i32_trap: | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i32_trap: | 
 |   case Intrinsic::nvvm_suld_2d_i32_trap: | 
 |   case Intrinsic::nvvm_suld_2d_v2i32_trap: | 
 |   case Intrinsic::nvvm_suld_2d_v4i32_trap: | 
 |   case Intrinsic::nvvm_suld_2d_array_i32_trap: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i32_trap: | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i32_trap: | 
 |   case Intrinsic::nvvm_suld_3d_i32_trap: | 
 |   case Intrinsic::nvvm_suld_3d_v2i32_trap: | 
 |   case Intrinsic::nvvm_suld_3d_v4i32_trap: | 
 |   case Intrinsic::nvvm_suld_1d_i32_zero: | 
 |   case Intrinsic::nvvm_suld_1d_v2i32_zero: | 
 |   case Intrinsic::nvvm_suld_1d_v4i32_zero: | 
 |   case Intrinsic::nvvm_suld_1d_array_i32_zero: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i32_zero: | 
 |   case Intrinsic::nvvm_suld_1d_array_v4i32_zero: | 
 |   case Intrinsic::nvvm_suld_2d_i32_zero: | 
 |   case Intrinsic::nvvm_suld_2d_v2i32_zero: | 
 |   case Intrinsic::nvvm_suld_2d_v4i32_zero: | 
 |   case Intrinsic::nvvm_suld_2d_array_i32_zero: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i32_zero: | 
 |   case Intrinsic::nvvm_suld_2d_array_v4i32_zero: | 
 |   case Intrinsic::nvvm_suld_3d_i32_zero: | 
 |   case Intrinsic::nvvm_suld_3d_v2i32_zero: | 
 |   case Intrinsic::nvvm_suld_3d_v4i32_zero: | 
 |     Info.opc = getOpcForSurfaceInstr(Intrinsic); | 
 |     Info.memVT = MVT::i32; | 
 |     Info.ptrVal = nullptr; | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad; | 
 |     Info.align = 16; | 
 |     return true; | 
 |  | 
 |   case Intrinsic::nvvm_suld_1d_i64_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_v2i64_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_array_i64_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_i64_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_v2i64_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_array_i64_clamp: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: | 
 |   case Intrinsic::nvvm_suld_3d_i64_clamp: | 
 |   case Intrinsic::nvvm_suld_3d_v2i64_clamp: | 
 |   case Intrinsic::nvvm_suld_1d_i64_trap: | 
 |   case Intrinsic::nvvm_suld_1d_v2i64_trap: | 
 |   case Intrinsic::nvvm_suld_1d_array_i64_trap: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i64_trap: | 
 |   case Intrinsic::nvvm_suld_2d_i64_trap: | 
 |   case Intrinsic::nvvm_suld_2d_v2i64_trap: | 
 |   case Intrinsic::nvvm_suld_2d_array_i64_trap: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i64_trap: | 
 |   case Intrinsic::nvvm_suld_3d_i64_trap: | 
 |   case Intrinsic::nvvm_suld_3d_v2i64_trap: | 
 |   case Intrinsic::nvvm_suld_1d_i64_zero: | 
 |   case Intrinsic::nvvm_suld_1d_v2i64_zero: | 
 |   case Intrinsic::nvvm_suld_1d_array_i64_zero: | 
 |   case Intrinsic::nvvm_suld_1d_array_v2i64_zero: | 
 |   case Intrinsic::nvvm_suld_2d_i64_zero: | 
 |   case Intrinsic::nvvm_suld_2d_v2i64_zero: | 
 |   case Intrinsic::nvvm_suld_2d_array_i64_zero: | 
 |   case Intrinsic::nvvm_suld_2d_array_v2i64_zero: | 
 |   case Intrinsic::nvvm_suld_3d_i64_zero: | 
 |   case Intrinsic::nvvm_suld_3d_v2i64_zero: | 
 |     Info.opc = getOpcForSurfaceInstr(Intrinsic); | 
 |     Info.memVT = MVT::i64; | 
 |     Info.ptrVal = nullptr; | 
 |     Info.offset = 0; | 
 |     Info.flags = MachineMemOperand::MOLoad; | 
 |     Info.align = 16; | 
 |     return true; | 
 |   } | 
 |   return false; | 
 | } | 
 |  | 
 | /// isLegalAddressingMode - Return true if the addressing mode represented | 
 | /// by AM is legal for this target, for a load/store of the specified type. | 
 | /// Used to guide target specific optimizations, like loop strength reduction | 
 | /// (LoopStrengthReduce.cpp) and memory optimization for address mode | 
 | /// (CodeGenPrepare.cpp) | 
 | bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, | 
 |                                                 const AddrMode &AM, Type *Ty, | 
 |                                                 unsigned AS, Instruction *I) const { | 
 |   // AddrMode - This represents an addressing mode of: | 
 |   //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg | 
 |   // | 
 |   // The legal address modes are | 
 |   // - [avar] | 
 |   // - [areg] | 
 |   // - [areg+immoff] | 
 |   // - [immAddr] | 
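  //
  // For example, [%r1] and [%r1+8] are representable, but a two-register or
  // scaled form such as [%r1+%r2] is not.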
 |  | 
 |   if (AM.BaseGV) { | 
 |     return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; | 
 |   } | 
 |  | 
 |   switch (AM.Scale) { | 
 |   case 0: // "r", "r+i" or "i" is allowed | 
 |     break; | 
 |   case 1: | 
 |     if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. | 
 |       return false; | 
 |     // Otherwise we have r+i. | 
 |     break; | 
 |   default: | 
 |     // No scale > 1 is allowed | 
 |     return false; | 
 |   } | 
 |   return true; | 
 | } | 
 |  | 
 | //===----------------------------------------------------------------------===// | 
 | //                         NVPTX Inline Assembly Support | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | /// getConstraintType - Given a constraint letter, return the type of | 
 | /// constraint it is for this target. | 
 | NVPTXTargetLowering::ConstraintType | 
 | NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { | 
 |   if (Constraint.size() == 1) { | 
 |     switch (Constraint[0]) { | 
 |     default: | 
 |       break; | 
 |     case 'b': | 
 |     case 'r': | 
 |     case 'h': | 
 |     case 'c': | 
 |     case 'l': | 
 |     case 'f': | 
 |     case 'd': | 
 |     case '0': | 
 |     case 'N': | 
 |       return C_RegisterClass; | 
 |     } | 
 |   } | 
 |   return TargetLowering::getConstraintType(Constraint); | 
 | } | 
 |  | 
 | std::pair<unsigned, const TargetRegisterClass *> | 
 | NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, | 
 |                                                   StringRef Constraint, | 
 |                                                   MVT VT) const { | 
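  // Single-letter constraints map directly onto NVPTX register classes; e.g.
  // "r" selects a 32-bit Int32Regs register and "l" a 64-bit Int64Regs one.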
 |   if (Constraint.size() == 1) { | 
 |     switch (Constraint[0]) { | 
 |     case 'b': | 
 |       return std::make_pair(0U, &NVPTX::Int1RegsRegClass); | 
 |     case 'c': | 
 |       return std::make_pair(0U, &NVPTX::Int16RegsRegClass); | 
 |     case 'h': | 
 |       return std::make_pair(0U, &NVPTX::Int16RegsRegClass); | 
 |     case 'r': | 
 |       return std::make_pair(0U, &NVPTX::Int32RegsRegClass); | 
 |     case 'l': | 
 |     case 'N': | 
 |       return std::make_pair(0U, &NVPTX::Int64RegsRegClass); | 
 |     case 'f': | 
 |       return std::make_pair(0U, &NVPTX::Float32RegsRegClass); | 
 |     case 'd': | 
 |       return std::make_pair(0U, &NVPTX::Float64RegsRegClass); | 
 |     } | 
 |   } | 
 |   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); | 
 | } | 
 |  | 
 | //===----------------------------------------------------------------------===// | 
 | //                         NVPTX DAG Combining | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, | 
 |                                    CodeGenOpt::Level OptLevel) const { | 
 |   // Always honor command-line argument | 
 |   if (FMAContractLevelOpt.getNumOccurrences() > 0) | 
 |     return FMAContractLevelOpt > 0; | 
 |  | 
 |   // Do not contract if we're not optimizing the code. | 
 |   if (OptLevel == 0) | 
 |     return false; | 
 |  | 
 |   // Honor TargetOptions flags that explicitly say fusion is okay. | 
 |   if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) | 
 |     return true; | 
 |  | 
 |   return allowUnsafeFPMath(MF); | 
 | } | 
 |  | 
 | bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { | 
 |   // Honor TargetOptions flags that explicitly say unsafe math is okay. | 
 |   if (MF.getTarget().Options.UnsafeFPMath) | 
 |     return true; | 
 |  | 
 |   // Allow unsafe math if unsafe-fp-math attribute explicitly says so. | 
 |   const Function &F = MF.getFunction(); | 
 |   if (F.hasFnAttribute("unsafe-fp-math")) { | 
 |     Attribute Attr = F.getFnAttribute("unsafe-fp-math"); | 
 |     StringRef Val = Attr.getValueAsString(); | 
 |     if (Val == "true") | 
 |       return true; | 
 |   } | 
 |  | 
 |   return false; | 
 | } | 
 |  | 
 | /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with | 
 | /// operands N0 and N1.  This is a helper for PerformADDCombine that is | 
 | /// called with the default operands, and if that fails, with commuted | 
 | /// operands. | 
 | static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, | 
 |                                            TargetLowering::DAGCombinerInfo &DCI, | 
 |                                              const NVPTXSubtarget &Subtarget, | 
 |                                              CodeGenOpt::Level OptLevel) { | 
  SelectionDAG &DAG = DCI.DAG;
  // Skip the vector case; only scalar integer and FP adds are handled here.
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();
 |  | 
 |   // fold (add (mul a, b), c) -> (mad a, b, c) | 
 |   // | 
 |   if (N0.getOpcode() == ISD::MUL) { | 
    assert(VT.isInteger());
 |     // For integer: | 
 |     // Since integer multiply-add costs the same as integer multiply | 
 |     // but is more costly than integer add, do the fusion only when | 
 |     // the mul is only used in the add. | 
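    // For example, (add (mul i32 %a, %b), %c) where the mul has a single use
    // becomes NVPTXISD::IMAD, which selects to PTX's integer mad instruction.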
    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
 |         !N0.getNode()->hasOneUse()) | 
 |       return SDValue(); | 
 |  | 
 |     // Do the folding | 
 |     return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, | 
 |                        N0.getOperand(0), N0.getOperand(1), N1); | 
 |   } | 
 |   else if (N0.getOpcode() == ISD::FMUL) { | 
 |     if (VT == MVT::f32 || VT == MVT::f64) { | 
 |       const auto *TLI = static_cast<const NVPTXTargetLowering *>( | 
 |           &DAG.getTargetLoweringInfo()); | 
 |       if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel)) | 
 |         return SDValue(); | 
 |  | 
      // For floating point:
      // Do the fusion only when the mul has fewer than 5 uses and all of
      // them are adds.
      // The heuristic is that if a use is not an add, then that use cannot
      // be fused into an fma, so the mul is still needed anyway.
      // If there are more than 4 uses, even if they are all adds, fusing
      // them will increase register pressure.
 |       // | 
      int numUses = 0;
      int nonAddCount = 0;
      for (const SDNode *User : N0.getNode()->uses()) {
        numUses++;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
 |       if (numUses >= 5) | 
 |         return SDValue(); | 
 |       if (nonAddCount) { | 
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for estimating potential register pressure: the
        // difference in IR order measures the distance between def and use,
        // and a longer distance is more likely to cause register pressure.
        if (orderNo - orderNo2 < 500)
          return SDValue();
 |  | 
        // Now, check if at least one of the FMUL's operands is live beyond
        // node N, which guarantees that the FMA will not increase register
        // pressure at node N.
 |         bool opIsLive = false; | 
 |         const SDNode *left = N0.getOperand(0).getNode(); | 
 |         const SDNode *right = N0.getOperand(1).getNode(); | 
 |  | 
 |         if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right)) | 
 |           opIsLive = true; | 
 |  | 
        if (!opIsLive)
          for (const SDNode *User : left->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }
 |  | 
        if (!opIsLive)
          for (const SDNode *User : right->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }
 |  | 
 |         if (!opIsLive) | 
 |           return SDValue(); | 
 |       } | 
 |  | 
 |       return DAG.getNode(ISD::FMA, SDLoc(N), VT, | 
 |                          N0.getOperand(0), N0.getOperand(1), N1); | 
 |     } | 
 |   } | 
 |  | 
 |   return SDValue(); | 
 | } | 
 |  | 
 | /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. | 
 | /// | 
 | static SDValue PerformADDCombine(SDNode *N, | 
 |                                  TargetLowering::DAGCombinerInfo &DCI, | 
 |                                  const NVPTXSubtarget &Subtarget, | 
 |                                  CodeGenOpt::Level OptLevel) { | 
 |   SDValue N0 = N->getOperand(0); | 
 |   SDValue N1 = N->getOperand(1); | 
 |  | 
 |   // First try with the default operand order. | 
 |   if (SDValue Result = | 
 |           PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) | 
 |     return Result; | 
 |  | 
 |   // If that didn't work, try again with the operands commuted. | 
 |   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); | 
 | } | 
 |  | 
 | static SDValue PerformANDCombine(SDNode *N, | 
 |                                  TargetLowering::DAGCombinerInfo &DCI) { | 
 |   // The type legalizer turns a vector load of i8 values into a zextload to i16 | 
 |   // registers, optionally ANY_EXTENDs it (if target type is integer), | 
 |   // and ANDs off the high 8 bits. Since we turn this load into a | 
 |   // target-specific DAG node, the DAG combiner fails to eliminate these AND | 
 |   // nodes. Do that here. | 
 |   SDValue Val = N->getOperand(0); | 
 |   SDValue Mask = N->getOperand(1); | 
 |  | 
 |   if (isa<ConstantSDNode>(Val)) { | 
 |     std::swap(Val, Mask); | 
 |   } | 
 |  | 
 |   SDValue AExt; | 
 |   // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and | 
 |   if (Val.getOpcode() == ISD::ANY_EXTEND) { | 
 |     AExt = Val; | 
 |     Val = Val->getOperand(0); | 
 |   } | 
 |  | 
 |   if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { | 
 |     Val = Val->getOperand(0); | 
 |   } | 
 |  | 
 |   if (Val->getOpcode() == NVPTXISD::LoadV2 || | 
 |       Val->getOpcode() == NVPTXISD::LoadV4) { | 
 |     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); | 
 |     if (!MaskCnst) { | 
 |       // Not an AND with a constant | 
 |       return SDValue(); | 
 |     } | 
 |  | 
 |     uint64_t MaskVal = MaskCnst->getZExtValue(); | 
 |     if (MaskVal != 0xff) { | 
 |       // Not an AND that chops off top 8 bits | 
 |       return SDValue(); | 
 |     } | 
 |  | 
 |     MemSDNode *Mem = dyn_cast<MemSDNode>(Val); | 
 |     if (!Mem) { | 
 |       // Not a MemSDNode?!? | 
 |       return SDValue(); | 
 |     } | 
 |  | 
 |     EVT MemVT = Mem->getMemoryVT(); | 
 |     if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { | 
 |       // We only handle the i8 case | 
 |       return SDValue(); | 
 |     } | 
 |  | 
 |     unsigned ExtType = | 
 |       cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> | 
 |         getZExtValue(); | 
 |     if (ExtType == ISD::SEXTLOAD) { | 
 |       // If for some reason the load is a sextload, the and is needed to zero | 
 |       // out the high 8 bits | 
 |       return SDValue(); | 
 |     } | 
 |  | 
 |     bool AddTo = false; | 
 |     if (AExt.getNode() != nullptr) { | 
 |       // Re-insert the ext as a zext. | 
 |       Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), | 
 |                             AExt.getValueType(), Val); | 
 |       AddTo = true; | 
 |     } | 
 |  | 
    // If we get here, the AND is unnecessary. Just replace it with the load.
 |     DCI.CombineTo(N, Val, AddTo); | 
 |   } | 
 |  | 
 |   return SDValue(); | 
 | } | 
 |  | 
 | static SDValue PerformREMCombine(SDNode *N, | 
 |                                  TargetLowering::DAGCombinerInfo &DCI, | 
 |                                  CodeGenOpt::Level OptLevel) { | 
 |   assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); | 
 |  | 
 |   // Don't do anything at less than -O2. | 
 |   if (OptLevel < CodeGenOpt::Default) | 
 |     return SDValue(); | 
 |  | 
 |   SelectionDAG &DAG = DCI.DAG; | 
 |   SDLoc DL(N); | 
 |   EVT VT = N->getValueType(0); | 
 |   bool IsSigned = N->getOpcode() == ISD::SREM; | 
 |   unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; | 
 |  | 
 |   const SDValue &Num = N->getOperand(0); | 
 |   const SDValue &Den = N->getOperand(1); | 
 |  | 
 |   for (const SDNode *U : Num->uses()) { | 
 |     if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && | 
 |         U->getOperand(1) == Den) { | 
 |       // Num % Den -> Num - (Num / Den) * Den | 
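      // The matching division already exists among Num's uses, so the div
      // node created here is CSE'd to it rather than emitting a second divide.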
 |       return DAG.getNode(ISD::SUB, DL, VT, Num, | 
 |                          DAG.getNode(ISD::MUL, DL, VT, | 
 |                                      DAG.getNode(DivOpc, DL, VT, Num, Den), | 
 |                                      Den)); | 
 |     } | 
 |   } | 
 |   return SDValue(); | 
 | } | 
 |  | 
 | enum OperandSignedness { | 
 |   Signed = 0, | 
 |   Unsigned, | 
 |   Unknown | 
 | }; | 
 |  | 
 | /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand | 
 | /// that can be demoted to \p OptSize bits without loss of information. The | 
 | /// signedness of the operand, if determinable, is placed in \p S. | 
 | static bool IsMulWideOperandDemotable(SDValue Op, | 
 |                                       unsigned OptSize, | 
 |                                       OperandSignedness &S) { | 
 |   S = Unknown; | 
 |  | 
 |   if (Op.getOpcode() == ISD::SIGN_EXTEND || | 
 |       Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { | 
 |     EVT OrigVT = Op.getOperand(0).getValueType(); | 
 |     if (OrigVT.getSizeInBits() <= OptSize) { | 
 |       S = Signed; | 
 |       return true; | 
 |     } | 
 |   } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { | 
 |     EVT OrigVT = Op.getOperand(0).getValueType(); | 
 |     if (OrigVT.getSizeInBits() <= OptSize) { | 
 |       S = Unsigned; | 
 |       return true; | 
 |     } | 
 |   } | 
 |  | 
 |   return false; | 
 | } | 
 |  | 
 | /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can | 
 | /// be demoted to \p OptSize bits without loss of information. If the operands | 
 | /// contain a constant, it should appear as the RHS operand. The signedness of | 
 | /// the operands is placed in \p IsSigned. | 
 | static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, | 
 |                                         unsigned OptSize, | 
 |                                         bool &IsSigned) { | 
 |   OperandSignedness LHSSign; | 
 |  | 
 |   // The LHS operand must be a demotable op | 
 |   if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) | 
 |     return false; | 
 |  | 
 |   // We should have been able to determine the signedness from the LHS | 
 |   if (LHSSign == Unknown) | 
 |     return false; | 
 |  | 
 |   IsSigned = (LHSSign == Signed); | 
 |  | 
 |   // The RHS can be a demotable op or a constant | 
 |   if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { | 
 |     const APInt &Val = CI->getAPIntValue(); | 
 |     if (LHSSign == Unsigned) { | 
 |       return Val.isIntN(OptSize); | 
 |     } else { | 
 |       return Val.isSignedIntN(OptSize); | 
 |     } | 
 |   } else { | 
 |     OperandSignedness RHSSign; | 
 |     if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) | 
 |       return false; | 
 |  | 
 |     return LHSSign == RHSSign; | 
 |   } | 
 | } | 
 |  | 
 | /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply | 
 | /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform | 
 | /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift | 
 | /// amount. | 
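/// For example, (mul (sext i16 %a to i32), (sext i16 %b to i32)) becomes
/// MUL_WIDE_SIGNED on the truncated i16 operands, i.e. PTX's mul.wide.s16.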
 | static SDValue TryMULWIDECombine(SDNode *N, | 
 |                                  TargetLowering::DAGCombinerInfo &DCI) { | 
 |   EVT MulType = N->getValueType(0); | 
 |   if (MulType != MVT::i32 && MulType != MVT::i64) { | 
 |     return SDValue(); | 
 |   } | 
 |  | 
 |   SDLoc DL(N); | 
 |   unsigned OptSize = MulType.getSizeInBits() >> 1; | 
 |   SDValue LHS = N->getOperand(0); | 
 |   SDValue RHS = N->getOperand(1); | 
 |  | 
 |   // Canonicalize the multiply so the constant (if any) is on the right | 
 |   if (N->getOpcode() == ISD::MUL) { | 
 |     if (isa<ConstantSDNode>(LHS)) { | 
 |       std::swap(LHS, RHS); | 
 |     } | 
 |   } | 
 |  | 
 |   // If we have a SHL, determine the actual multiply amount | 
 |   if (N->getOpcode() == ISD::SHL) { | 
 |     ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); | 
 |     if (!ShlRHS) { | 
 |       return SDValue(); | 
 |     } | 
 |  | 
 |     APInt ShiftAmt = ShlRHS->getAPIntValue(); | 
 |     unsigned BitWidth = MulType.getSizeInBits(); | 
 |     if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { | 
 |       APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; | 
 |       RHS = DCI.DAG.getConstant(MulVal, DL, MulType); | 
 |     } else { | 
 |       return SDValue(); | 
 |     } | 
 |   } | 
 |  | 
 |   bool Signed; | 
 |   // Verify that our operands are demotable | 
 |   if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { | 
 |     return SDValue(); | 
 |   } | 
 |  | 
 |   EVT DemotedVT; | 
 |   if (MulType == MVT::i32) { | 
 |     DemotedVT = MVT::i16; | 
 |   } else { | 
 |     DemotedVT = MVT::i32; | 
 |   } | 
 |  | 
 |   // Truncate the operands to the correct size. Note that these are just for | 
 |   // type consistency and will (likely) be eliminated in later phases. | 
 |   SDValue TruncLHS = | 
 |     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); | 
 |   SDValue TruncRHS = | 
 |     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); | 
 |  | 
 |   unsigned Opc; | 
 |   if (Signed) { | 
 |     Opc = NVPTXISD::MUL_WIDE_SIGNED; | 
 |   } else { | 
 |     Opc = NVPTXISD::MUL_WIDE_UNSIGNED; | 
 |   } | 
 |  | 
 |   return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); | 
 | } | 
 |  | 
 | /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. | 
 | static SDValue PerformMULCombine(SDNode *N, | 
 |                                  TargetLowering::DAGCombinerInfo &DCI, | 
 |                                  CodeGenOpt::Level OptLevel) { | 
 |   if (OptLevel > 0) { | 
 |     // Try mul.wide combining at OptLevel > 0 | 
 |     if (SDValue Ret = TryMULWIDECombine(N, DCI)) | 
 |       return Ret; | 
 |   } | 
 |  | 
 |   return SDValue(); | 
 | } | 
 |  | 
 | /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. | 
 | static SDValue PerformSHLCombine(SDNode *N, | 
 |                                  TargetLowering::DAGCombinerInfo &DCI, | 
 |                                  CodeGenOpt::Level OptLevel) { | 
 |   if (OptLevel > 0) { | 
 |     // Try mul.wide combining at OptLevel > 0 | 
 |     if (SDValue Ret = TryMULWIDECombine(N, DCI)) | 
 |       return Ret; | 
 |   } | 
 |  | 
 |   return SDValue(); | 
 | } | 
 |  | 
 | static SDValue PerformSETCCCombine(SDNode *N, | 
 |                                    TargetLowering::DAGCombinerInfo &DCI) { | 
 |   EVT CCType = N->getValueType(0); | 
 |   SDValue A = N->getOperand(0); | 
 |   SDValue B = N->getOperand(1); | 
 |  | 
 |   if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16) | 
 |     return SDValue(); | 
 |  | 
 |   SDLoc DL(N); | 
 |   // setp.f16x2 returns two scalar predicates, which we need to | 
 |   // convert back to v2i1. The returned result will be scalarized by | 
 |   // the legalizer, but the comparison will remain a single vector | 
 |   // instruction. | 
 |   SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL, | 
 |                                    DCI.DAG.getVTList(MVT::i1, MVT::i1), | 
 |                                    {A, B, N->getOperand(2)}); | 
 |   return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0), | 
 |                          CCNode.getValue(1)); | 
 | } | 
 |  | 
 | SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, | 
 |                                                DAGCombinerInfo &DCI) const { | 
 |   CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel(); | 
 |   switch (N->getOpcode()) { | 
 |     default: break; | 
 |     case ISD::ADD: | 
 |     case ISD::FADD: | 
 |       return PerformADDCombine(N, DCI, STI, OptLevel); | 
 |     case ISD::MUL: | 
 |       return PerformMULCombine(N, DCI, OptLevel); | 
 |     case ISD::SHL: | 
 |       return PerformSHLCombine(N, DCI, OptLevel); | 
 |     case ISD::AND: | 
 |       return PerformANDCombine(N, DCI); | 
 |     case ISD::UREM: | 
 |     case ISD::SREM: | 
 |       return PerformREMCombine(N, DCI, OptLevel); | 
 |     case ISD::SETCC: | 
 |       return PerformSETCCCombine(N, DCI); | 
 |   } | 
 |   return SDValue(); | 
 | } | 
 |  | 
/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
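/// For example, a <2 x float> load becomes an NVPTXISD::LoadV2 node yielding
/// two f32 values plus a chain, which are then recombined into the original
/// vector type with a BUILD_VECTOR.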
 | static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, | 
 |                               SmallVectorImpl<SDValue> &Results) { | 
 |   EVT ResVT = N->getValueType(0); | 
 |   SDLoc DL(N); | 
 |  | 
 |   assert(ResVT.isVector() && "Vector load must have vector type"); | 
 |  | 
 |   // We only handle "native" vector sizes for now, e.g. <4 x double> is not | 
 |   // legal.  We can (and should) split that into 2 loads of <2 x double> here | 
 |   // but I'm leaving that as a TODO for now. | 
 |   assert(ResVT.isSimple() && "Can only handle simple types"); | 
 |   switch (ResVT.getSimpleVT().SimpleTy) { | 
 |   default: | 
 |     return; | 
 |   case MVT::v2i8: | 
 |   case MVT::v2i16: | 
 |   case MVT::v2i32: | 
 |   case MVT::v2i64: | 
 |   case MVT::v2f16: | 
 |   case MVT::v2f32: | 
 |   case MVT::v2f64: | 
 |   case MVT::v4i8: | 
 |   case MVT::v4i16: | 
 |   case MVT::v4i32: | 
 |   case MVT::v4f16: | 
 |   case MVT::v4f32: | 
 |   case MVT::v8f16: // <4 x f16x2> | 
 |     // This is a "native" vector type | 
 |     break; | 
 |   } | 
 |  | 
 |   LoadSDNode *LD = cast<LoadSDNode>(N); | 
 |  | 
 |   unsigned Align = LD->getAlignment(); | 
 |   auto &TD = DAG.getDataLayout(); | 
 |   unsigned PrefAlign = | 
 |       TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext())); | 
 |   if (Align < PrefAlign) { | 
 |     // This load is not sufficiently aligned, so bail out and let this vector | 
 |     // load be scalarized.  Note that we may still be able to emit smaller | 
 |     // vector loads.  For example, if we are loading a <4 x float> with an | 
 |     // alignment of 8, this check will fail but the legalizer will try again | 
 |     // with 2 x <2 x float>, which will succeed with an alignment of 8. | 
 |     return; | 
 |   } | 
 |  | 
 |   EVT EltVT = ResVT.getVectorElementType(); | 
 |   unsigned NumElts = ResVT.getVectorNumElements(); | 
 |  | 
 |   // Since LoadV2 is a target node, we cannot rely on DAG type legalization. | 
 |   // Therefore, we must ensure the type is legal.  For i1 and i8, we set the | 
 |   // loaded type to i16 and propagate the "real" type as the memory type. | 
 |   bool NeedTrunc = false; | 
 |   if (EltVT.getSizeInBits() < 16) { | 
 |     EltVT = MVT::i16; | 
 |     NeedTrunc = true; | 
 |   } | 
 |  | 
 |   unsigned Opcode = 0; | 
 |   SDVTList LdResVTs; | 
 |   bool LoadF16x2 = false; | 
 |  | 
 |   switch (NumElts) { | 
 |   default: | 
 |     return; | 
 |   case 2: | 
 |     Opcode = NVPTXISD::LoadV2; | 
 |     LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); | 
 |     break; | 
 |   case 4: { | 
 |     Opcode = NVPTXISD::LoadV4; | 
 |     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; | 
 |     LdResVTs = DAG.getVTList(ListVTs); | 
 |     break; | 
 |   } | 
 |   case 8: { | 
 |     // v8f16 is a special case. PTX doesn't have ld.v8.f16 | 
 |     // instruction. Instead, we split the vector into v2f16 chunks and | 
 |     // load them with ld.v4.b32. | 
 |     assert(EltVT == MVT::f16 && "Unsupported v8 vector type."); | 
 |     LoadF16x2 = true; | 
 |     Opcode = NVPTXISD::LoadV4; | 
 |     EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16, | 
 |                      MVT::Other}; | 
 |     LdResVTs = DAG.getVTList(ListVTs); | 
 |     break; | 
 |   } | 
 |   } | 
 |  | 
 |   // Copy regular operands | 
 |   SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end()); | 
 |  | 
 |   // The select routine does not have access to the LoadSDNode instance, so | 
 |   // pass along the extension information | 
 |   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); | 
 |  | 
 |   SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, | 
 |                                           LD->getMemoryVT(), | 
 |                                           LD->getMemOperand()); | 
 |  | 
 |   SmallVector<SDValue, 8> ScalarRes; | 
 |   if (LoadF16x2) { | 
 |     // Split v2f16 subvectors back into individual elements. | 
 |     NumElts /= 2; | 
 |     for (unsigned i = 0; i < NumElts; ++i) { | 
 |       SDValue SubVector = NewLD.getValue(i); | 
 |       SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, | 
 |                                DAG.getIntPtrConstant(0, DL)); | 
 |       SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, | 
 |                                DAG.getIntPtrConstant(1, DL)); | 
 |       ScalarRes.push_back(E0); | 
 |       ScalarRes.push_back(E1); | 
 |     } | 
 |   } else { | 
 |     for (unsigned i = 0; i < NumElts; ++i) { | 
 |       SDValue Res = NewLD.getValue(i); | 
 |       if (NeedTrunc) | 
 |         Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); | 
 |       ScalarRes.push_back(Res); | 
 |     } | 
 |   } | 
 |  | 
 |   SDValue LoadChain = NewLD.getValue(NumElts); | 
 |  | 
 |   SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes); | 
 |  | 
 |   Results.push_back(BuildVec); | 
 |   Results.push_back(LoadChain); | 
 | } | 
 |  | 
 | static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, | 
 |                                      SmallVectorImpl<SDValue> &Results) { | 
 |   SDValue Chain = N->getOperand(0); | 
 |   SDValue Intrin = N->getOperand(1); | 
 |   SDLoc DL(N); | 
 |  | 
 |   // Get the intrinsic ID | 
 |   unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue(); | 
 |   switch (IntrinNo) { | 
 |   default: | 
 |     return; | 
 |   case Intrinsic::nvvm_ldg_global_i: | 
 |   case Intrinsic::nvvm_ldg_global_f: | 
 |   case Intrinsic::nvvm_ldg_global_p: | 
 |   case Intrinsic::nvvm_ldu_global_i: | 
 |   case Intrinsic::nvvm_ldu_global_f: | 
 |   case Intrinsic::nvvm_ldu_global_p: { | 
 |     EVT ResVT = N->getValueType(0); | 
 |  | 
 |     if (ResVT.isVector()) { | 
 |       // Vector LDG/LDU | 
 |  | 
 |       unsigned NumElts = ResVT.getVectorNumElements(); | 
 |       EVT EltVT = ResVT.getVectorElementType(); | 
 |  | 
      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization. Therefore, we must ensure the type is legal. For i1 and
      // i8, we set the loaded type to i16 and propagate the "real" type as the
      // memory type.
 |       bool NeedTrunc = false; | 
 |       if (EltVT.getSizeInBits() < 16) { | 
 |         EltVT = MVT::i16; | 
 |         NeedTrunc = true; | 
 |       } | 
 |  | 
 |       unsigned Opcode = 0; | 
 |       SDVTList LdResVTs; | 
 |  | 
 |       switch (NumElts) { | 
 |       default: | 
 |         return; | 
 |       case 2: | 
 |         switch (IntrinNo) { | 
 |         default: | 
 |           return; | 
 |         case Intrinsic::nvvm_ldg_global_i: | 
 |         case Intrinsic::nvvm_ldg_global_f: | 
 |         case Intrinsic::nvvm_ldg_global_p: | 
 |           Opcode = NVPTXISD::LDGV2; | 
 |           break; | 
 |         case Intrinsic::nvvm_ldu_global_i: | 
 |         case Intrinsic::nvvm_ldu_global_f: | 
 |         case Intrinsic::nvvm_ldu_global_p: | 
 |           Opcode = NVPTXISD::LDUV2; | 
 |           break; | 
 |         } | 
 |         LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); | 
 |         break; | 
 |       case 4: { | 
 |         switch (IntrinNo) { | 
 |         default: | 
 |           return; | 
 |         case Intrinsic::nvvm_ldg_global_i: | 
 |         case Intrinsic::nvvm_ldg_global_f: | 
 |         case Intrinsic::nvvm_ldg_global_p: | 
 |           Opcode = NVPTXISD::LDGV4; | 
 |           break; | 
 |         case Intrinsic::nvvm_ldu_global_i: | 
 |         case Intrinsic::nvvm_ldu_global_f: | 
 |         case Intrinsic::nvvm_ldu_global_p: | 
 |           Opcode = NVPTXISD::LDUV4; | 
 |           break; | 
 |         } | 
 |         EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; | 
 |         LdResVTs = DAG.getVTList(ListVTs); | 
 |         break; | 
 |       } | 
 |       } | 
 |  | 
      // Copy the chain and the regular operands, skipping operand 1 (the
      // intrinsic ID).
      SmallVector<SDValue, 8> OtherOps;
      OtherOps.push_back(Chain);
      OtherOps.append(N->op_begin() + 2, N->op_end());
 |  | 
 |       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); | 
 |  | 
 |       SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, | 
 |                                               MemSD->getMemoryVT(), | 
 |                                               MemSD->getMemOperand()); | 
 |  | 
 |       SmallVector<SDValue, 4> ScalarRes; | 
 |  | 
 |       for (unsigned i = 0; i < NumElts; ++i) { | 
 |         SDValue Res = NewLD.getValue(i); | 
 |         if (NeedTrunc) | 
 |           Res = | 
 |               DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); | 
 |         ScalarRes.push_back(Res); | 
 |       } | 
 |  | 
 |       SDValue LoadChain = NewLD.getValue(NumElts); | 
 |  | 
 |       SDValue BuildVec = | 
 |           DAG.getBuildVector(ResVT, DL, ScalarRes); | 
 |  | 
 |       Results.push_back(BuildVec); | 
 |       Results.push_back(LoadChain); | 
 |     } else { | 
 |       // i8 LDG/LDU | 
 |       assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && | 
 |              "Custom handling of non-i8 ldu/ldg?"); | 
 |  | 
 |       // Just copy all operands as-is | 
 |       SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end()); | 
 |  | 
 |       // Force output to i16 | 
 |       SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); | 
 |  | 
 |       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); | 
 |  | 
 |       // We make sure the memory type is i8, which will be used during isel | 
 |       // to select the proper instruction. | 
 |       SDValue NewLD = | 
 |           DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, | 
 |                                   MVT::i8, MemSD->getMemOperand()); | 
 |  | 
 |       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, | 
 |                                     NewLD.getValue(0))); | 
 |       Results.push_back(NewLD.getValue(1)); | 
 |     } | 
 |   } | 
 |   } | 
 | } | 
 |  | 
 | void NVPTXTargetLowering::ReplaceNodeResults( | 
 |     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { | 
 |   switch (N->getOpcode()) { | 
 |   default: | 
 |     report_fatal_error("Unhandled custom legalization"); | 
 |   case ISD::LOAD: | 
 |     ReplaceLoadVector(N, DAG, Results); | 
 |     return; | 
 |   case ISD::INTRINSIC_W_CHAIN: | 
 |     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); | 
 |     return; | 
 |   } | 
 | } | 
 |  | 
 | // Pin NVPTXTargetObjectFile's vtables to this file. | 
 | NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {} | 
 |  | 
 | MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( | 
 |     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { | 
 |   return getDataSection(); | 
 | } |