| // Copyright 2017 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef V8_WASM_BASELINE_ARM_LIFTOFF_ASSEMBLER_ARM_H_ |
| #define V8_WASM_BASELINE_ARM_LIFTOFF_ASSEMBLER_ARM_H_ |
| |
| #include "src/heap/memory-chunk.h" |
| #include "src/wasm/baseline/liftoff-assembler.h" |
| #include "src/wasm/baseline/liftoff-register.h" |
| |
| namespace v8 { |
| namespace internal { |
| namespace wasm { |
| |
| namespace liftoff { |
| |
| // half |
| // slot Frame |
| // -----+--------------------+--------------------------- |
| // n+3 | parameter n | |
| // ... | ... | |
| // 4 | parameter 1 | or parameter 2 |
| // 3 | parameter 0 | or parameter 1 |
| // 2 | (result address) | or parameter 0 |
| // -----+--------------------+--------------------------- |
| // 1 | return addr (lr) | |
| // 0 | previous frame (fp)| |
| // -----+--------------------+ <-- frame ptr (fp) |
| // -1 | 0xa: WASM | |
| // -2 | instance | |
| // -----+--------------------+--------------------------- |
| // -3 | slot 0 (high) | ^ |
| // -4 | slot 0 (low) | | |
| // -5 | slot 1 (high) | Frame slots |
| // -6 | slot 1 (low) | | |
| // | | v |
| // -----+--------------------+ <-- stack ptr (sp) |
| // |
| static_assert(2 * kSystemPointerSize == LiftoffAssembler::kStackSlotSize, |
| "Slot size should be twice the size of the 32 bit pointer."); |
| constexpr int kInstanceOffset = 2 * kSystemPointerSize; |
// kPatchInstructionsRequired is an upper bound on the number of instructions
// that PatchPrepareStackFrame will use to grow the stack. Up to three
// instructions are needed to subtract a large constant: movw + movt + sub.
| constexpr int32_t kPatchInstructionsRequired = 3; |
| constexpr int kHalfStackSlotSize = LiftoffAssembler::kStackSlotSize >> 1; |
| |
| inline MemOperand GetStackSlot(int offset) { return MemOperand(fp, -offset); } |
| |
| inline MemOperand GetHalfStackSlot(int offset, RegPairHalf half) { |
| int32_t half_offset = |
| half == kLowWord ? 0 : LiftoffAssembler::kStackSlotSize / 2; |
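  // Illustrative: for {offset} == 16, kLowWord maps to [fp - 16] and
  // kHighWord to [fp - 12], i.e. the low word sits at the lower address,
  // matching the little-endian slot layout in the diagram above.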
| return MemOperand(offset > 0 ? fp : sp, -offset + half_offset); |
| } |
| |
| inline MemOperand GetInstanceOperand() { return GetStackSlot(kInstanceOffset); } |
| |
| inline MemOperand GetMemOp(LiftoffAssembler* assm, |
| UseScratchRegisterScope* temps, Register addr, |
| Register offset, int32_t offset_imm) { |
| if (offset != no_reg) { |
| if (offset_imm == 0) return MemOperand(addr, offset); |
| Register tmp = temps->Acquire(); |
| assm->add(tmp, offset, Operand(offset_imm)); |
| return MemOperand(addr, tmp); |
| } |
| return MemOperand(addr, offset_imm); |
| } |
| |
| inline Register CalculateActualAddress(LiftoffAssembler* assm, |
| UseScratchRegisterScope* temps, |
| Register addr_reg, Register offset_reg, |
| int32_t offset_imm, |
| Register result_reg = no_reg) { |
| if (offset_reg == no_reg && offset_imm == 0) { |
| if (result_reg == no_reg) { |
| return addr_reg; |
| } else { |
| assm->mov(result_reg, addr_reg); |
| return result_reg; |
| } |
| } |
| Register actual_addr_reg = |
| result_reg != no_reg ? result_reg : temps->Acquire(); |
| if (offset_reg == no_reg) { |
| assm->add(actual_addr_reg, addr_reg, Operand(offset_imm)); |
| } else { |
| assm->add(actual_addr_reg, addr_reg, Operand(offset_reg)); |
| if (offset_imm != 0) { |
| assm->add(actual_addr_reg, actual_addr_reg, Operand(offset_imm)); |
| } |
| } |
| return actual_addr_reg; |
| } |
| |
| inline Condition MakeUnsigned(Condition cond) { |
| switch (cond) { |
| case kSignedLessThan: |
| return kUnsignedLessThan; |
| case kSignedLessEqual: |
| return kUnsignedLessEqual; |
| case kSignedGreaterThan: |
| return kUnsignedGreaterThan; |
| case kSignedGreaterEqual: |
| return kUnsignedGreaterEqual; |
| case kEqual: |
| case kUnequal: |
| case kUnsignedLessThan: |
| case kUnsignedLessEqual: |
| case kUnsignedGreaterThan: |
| case kUnsignedGreaterEqual: |
| return cond; |
| default: |
| UNREACHABLE(); |
| } |
| } |
| |
| template <void (Assembler::*op)(Register, Register, Register, SBit, Condition), |
| void (Assembler::*op_with_carry)(Register, Register, const Operand&, |
| SBit, Condition)> |
| inline void I64Binop(LiftoffAssembler* assm, LiftoffRegister dst, |
| LiftoffRegister lhs, LiftoffRegister rhs) { |
| Register dst_low = dst.low_gp(); |
| if (dst_low == lhs.high_gp() || dst_low == rhs.high_gp()) { |
| dst_low = assm->GetUnusedRegister( |
| kGpReg, LiftoffRegList::ForRegs(lhs, rhs, dst.high_gp())) |
| .gp(); |
| } |
| (assm->*op)(dst_low, lhs.low_gp(), rhs.low_gp(), SetCC, al); |
| (assm->*op_with_carry)(dst.high_gp(), lhs.high_gp(), Operand(rhs.high_gp()), |
| LeaveCC, al); |
| if (dst_low != dst.low_gp()) assm->mov(dst.low_gp(), dst_low); |
| } |
| |
| template <void (Assembler::*op)(Register, Register, const Operand&, SBit, |
| Condition), |
| void (Assembler::*op_with_carry)(Register, Register, const Operand&, |
| SBit, Condition)> |
| inline void I64BinopI(LiftoffAssembler* assm, LiftoffRegister dst, |
| LiftoffRegister lhs, int32_t imm) { |
| // The compiler allocated registers such that either {dst == lhs} or there is |
| // no overlap between the two. |
| DCHECK_NE(dst.low_gp(), lhs.high_gp()); |
| (assm->*op)(dst.low_gp(), lhs.low_gp(), Operand(imm), SetCC, al); |
  // The top half of the immediate is its sign extension: either 0 or -1.
| int32_t sign_extend = imm < 0 ? -1 : 0; |
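  // Illustrative: for {imm} == -5 the low-word op uses 0xFFFFFFFB and the
  // high-word op uses the sign extension -1, so e.g. an i64 add computes
  // lhs - 5 as intended.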
| (assm->*op_with_carry)(dst.high_gp(), lhs.high_gp(), Operand(sign_extend), |
| LeaveCC, al); |
| } |
| |
| template <void (TurboAssembler::*op)(Register, Register, Register, Register, |
| Register), |
| bool is_left_shift> |
| inline void I64Shiftop(LiftoffAssembler* assm, LiftoffRegister dst, |
| LiftoffRegister src, Register amount) { |
| Register src_low = src.low_gp(); |
| Register src_high = src.high_gp(); |
| Register dst_low = dst.low_gp(); |
| Register dst_high = dst.high_gp(); |
| // Left shift writes {dst_high} then {dst_low}, right shifts write {dst_low} |
| // then {dst_high}. |
| Register clobbered_dst_reg = is_left_shift ? dst_high : dst_low; |
| LiftoffRegList pinned = LiftoffRegList::ForRegs(clobbered_dst_reg, src); |
| Register amount_capped = |
| pinned.set(assm->GetUnusedRegister(kGpReg, pinned)).gp(); |
| assm->and_(amount_capped, amount, Operand(0x3F)); |
| |
| // Ensure that writing the first half of {dst} does not overwrite the still |
| // needed half of {src}. |
| Register* later_src_reg = is_left_shift ? &src_low : &src_high; |
| if (*later_src_reg == clobbered_dst_reg) { |
| *later_src_reg = assm->GetUnusedRegister(kGpReg, pinned).gp(); |
| assm->TurboAssembler::Move(*later_src_reg, clobbered_dst_reg); |
| } |
| |
| (assm->*op)(dst_low, dst_high, src_low, src_high, amount_capped); |
| } |
| |
| inline FloatRegister GetFloatRegister(DoubleRegister reg) { |
| DCHECK_LT(reg.code(), kDoubleCode_d16); |
| return LowDwVfpRegister::from_code(reg.code()).low(); |
| } |
| |
| inline Simd128Register GetSimd128Register(DoubleRegister reg) { |
| return QwNeonRegister::from_code(reg.code() / 2); |
| } |
| |
| inline Simd128Register GetSimd128Register(LiftoffRegister reg) { |
| return liftoff::GetSimd128Register(reg.low_fp()); |
| } |
| |
| enum class MinOrMax : uint8_t { kMin, kMax }; |
| template <typename RegisterType> |
| inline void EmitFloatMinOrMax(LiftoffAssembler* assm, RegisterType dst, |
| RegisterType lhs, RegisterType rhs, |
| MinOrMax min_or_max) { |
| DCHECK(RegisterType::kSizeInBytes == 4 || RegisterType::kSizeInBytes == 8); |
| if (lhs == rhs) { |
| assm->TurboAssembler::Move(dst, lhs); |
| return; |
| } |
| Label done, is_nan; |
| if (min_or_max == MinOrMax::kMin) { |
| assm->TurboAssembler::FloatMin(dst, lhs, rhs, &is_nan); |
| } else { |
| assm->TurboAssembler::FloatMax(dst, lhs, rhs, &is_nan); |
| } |
| assm->b(&done); |
| assm->bind(&is_nan); |
| // Create a NaN output. |
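  // When either input is NaN, {vadd} produces a quiet NaN, which is the
  // result wasm f32/f64 min/max require for NaN inputs.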
| assm->vadd(dst, lhs, rhs); |
| assm->bind(&done); |
| } |
| |
| inline Register EnsureNoAlias(Assembler* assm, Register reg, |
| Register must_not_alias, |
| UseScratchRegisterScope* temps) { |
| if (reg != must_not_alias) return reg; |
| Register tmp = temps->Acquire(); |
| DCHECK_NE(reg, tmp); |
| assm->mov(tmp, reg); |
| return tmp; |
| } |
| |
| inline void S128NarrowOp(LiftoffAssembler* assm, NeonDataType dt, |
| NeonDataType sdt, LiftoffRegister dst, |
| LiftoffRegister lhs, LiftoffRegister rhs) { |
| if (dst == lhs) { |
| assm->vqmovn(dt, sdt, dst.low_fp(), liftoff::GetSimd128Register(lhs)); |
| assm->vqmovn(dt, sdt, dst.high_fp(), liftoff::GetSimd128Register(rhs)); |
| } else { |
| assm->vqmovn(dt, sdt, dst.high_fp(), liftoff::GetSimd128Register(rhs)); |
| assm->vqmovn(dt, sdt, dst.low_fp(), liftoff::GetSimd128Register(lhs)); |
| } |
| } |
| |
| inline void F64x2Compare(LiftoffAssembler* assm, LiftoffRegister dst, |
| LiftoffRegister lhs, LiftoffRegister rhs, |
| Condition cond) { |
| DCHECK(cond == eq || cond == ne || cond == lt || cond == le); |
| |
| QwNeonRegister dest = liftoff::GetSimd128Register(dst); |
| QwNeonRegister left = liftoff::GetSimd128Register(lhs); |
| QwNeonRegister right = liftoff::GetSimd128Register(rhs); |
| UseScratchRegisterScope temps(assm); |
| Register scratch = temps.Acquire(); |
| |
| assm->mov(scratch, Operand(0)); |
| assm->VFPCompareAndSetFlags(left.low(), right.low()); |
| assm->mov(scratch, Operand(-1), LeaveCC, cond); |
| if (cond == lt || cond == le) { |
| // Check for NaN. |
| assm->mov(scratch, Operand(0), LeaveCC, vs); |
| } |
| assm->vmov(dest.low(), scratch, scratch); |
| |
| assm->mov(scratch, Operand(0)); |
| assm->VFPCompareAndSetFlags(left.high(), right.high()); |
| assm->mov(scratch, Operand(-1), LeaveCC, cond); |
| if (cond == lt || cond == le) { |
| // Check for NaN. |
| assm->mov(scratch, Operand(0), LeaveCC, vs); |
| } |
| assm->vmov(dest.high(), scratch, scratch); |
| } |
| |
| inline void Store(LiftoffAssembler* assm, LiftoffRegister src, MemOperand dst, |
| ValueType type) { |
| #ifdef DEBUG |
| // The {str} instruction needs a temp register when the immediate in the |
| // provided MemOperand does not fit into 12 bits. This happens for large stack |
| // frames. This DCHECK checks that the temp register is available when needed. |
| DCHECK(UseScratchRegisterScope{assm}.CanAcquire()); |
| #endif |
| switch (type.kind()) { |
| case ValueType::kI32: |
| case ValueType::kOptRef: |
| case ValueType::kRef: |
| assm->str(src.gp(), dst); |
| break; |
| case ValueType::kI64: |
| // Positive offsets should be lowered to kI32. |
| assm->str(src.low_gp(), MemOperand(dst.rn(), dst.offset())); |
| assm->str( |
| src.high_gp(), |
| MemOperand(dst.rn(), dst.offset() + liftoff::kHalfStackSlotSize)); |
| break; |
| case ValueType::kF32: |
| assm->vstr(liftoff::GetFloatRegister(src.fp()), dst); |
| break; |
| case ValueType::kF64: |
| assm->vstr(src.fp(), dst); |
| break; |
| case ValueType::kS128: { |
| UseScratchRegisterScope temps(assm); |
| Register addr = liftoff::CalculateActualAddress(assm, &temps, dst.rn(), |
| no_reg, dst.offset()); |
| assm->vst1(Neon8, NeonListOperand(src.low_fp(), 2), NeonMemOperand(addr)); |
| break; |
| } |
| default: |
| UNREACHABLE(); |
| } |
| } |
| |
| inline void Load(LiftoffAssembler* assm, LiftoffRegister dst, MemOperand src, |
| ValueType type) { |
| switch (type.kind()) { |
| case ValueType::kI32: |
| case ValueType::kOptRef: |
| case ValueType::kRef: |
| assm->ldr(dst.gp(), src); |
| break; |
| case ValueType::kI64: |
| assm->ldr(dst.low_gp(), MemOperand(src.rn(), src.offset())); |
| assm->ldr( |
| dst.high_gp(), |
| MemOperand(src.rn(), src.offset() + liftoff::kHalfStackSlotSize)); |
| break; |
| case ValueType::kF32: |
| assm->vldr(liftoff::GetFloatRegister(dst.fp()), src); |
| break; |
| case ValueType::kF64: |
| assm->vldr(dst.fp(), src); |
| break; |
| case ValueType::kS128: { |
| // Get memory address of slot to fill from. |
| UseScratchRegisterScope temps(assm); |
| Register addr = liftoff::CalculateActualAddress(assm, &temps, src.rn(), |
| no_reg, src.offset()); |
| assm->vld1(Neon8, NeonListOperand(dst.low_fp(), 2), NeonMemOperand(addr)); |
| break; |
| } |
| default: |
| UNREACHABLE(); |
| } |
| } |
| |
| constexpr int MaskFromNeonDataType(NeonDataType dt) { |
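  // The mask implements wasm's modular shift semantics: shift counts are
  // taken modulo the lane width, so e.g. an i32x4 shift by 33 behaves like a
  // shift by 1.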
| switch (dt) { |
| case NeonS8: |
| case NeonU8: |
| return 7; |
| case NeonS16: |
| case NeonU16: |
| return 15; |
| case NeonS32: |
| case NeonU32: |
| return 31; |
| case NeonS64: |
| case NeonU64: |
| return 63; |
| } |
| } |
| |
| enum ShiftDirection { kLeft, kRight }; |
| |
| template <ShiftDirection dir = kLeft, NeonDataType dt, NeonSize sz> |
| inline void EmitSimdShift(LiftoffAssembler* assm, LiftoffRegister dst, |
| LiftoffRegister lhs, LiftoffRegister rhs) { |
| constexpr int mask = MaskFromNeonDataType(dt); |
| UseScratchRegisterScope temps(assm); |
| QwNeonRegister tmp = temps.AcquireQ(); |
| Register shift = temps.Acquire(); |
| assm->and_(shift, rhs.gp(), Operand(mask)); |
| assm->vdup(sz, tmp, shift); |
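  // NEON has no separate variable right-shift: {vshl} shifts right when the
  // per-lane shift amount is negative, so right shifts negate the duplicated
  // amount.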
| if (dir == kRight) { |
| assm->vneg(sz, tmp, tmp); |
| } |
| assm->vshl(dt, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), tmp); |
| } |
| |
| template <ShiftDirection dir, NeonDataType dt> |
| inline void EmitSimdShiftImmediate(LiftoffAssembler* assm, LiftoffRegister dst, |
| LiftoffRegister lhs, int32_t rhs) { |
| // vshr by 0 is not allowed, so check for it, and only move if dst != lhs. |
| int32_t shift = rhs & MaskFromNeonDataType(dt); |
| if (shift) { |
| if (dir == kLeft) { |
| assm->vshl(dt, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), shift); |
| } else { |
| assm->vshr(dt, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), shift); |
| } |
| } else if (dst != lhs) { |
| assm->vmov(liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs)); |
| } |
| } |
| |
| inline void EmitAnyTrue(LiftoffAssembler* assm, LiftoffRegister dst, |
| LiftoffRegister src) { |
| UseScratchRegisterScope temps(assm); |
| DwVfpRegister scratch = temps.AcquireD(); |
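  // Two pairwise maxima fold the four unsigned 32-bit lanes into one value
  // that is non-zero iff any lane of {src} is non-zero; the scalar compare
  // below then normalizes it to 0 or 1.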
| assm->vpmax(NeonU32, scratch, src.low_fp(), src.high_fp()); |
| assm->vpmax(NeonU32, scratch, scratch, scratch); |
| assm->ExtractLane(dst.gp(), scratch, NeonS32, 0); |
| assm->cmp(dst.gp(), Operand(0)); |
| assm->mov(dst.gp(), Operand(1), LeaveCC, ne); |
| } |
| |
| } // namespace liftoff |
| |
| int LiftoffAssembler::PrepareStackFrame() { |
| if (!CpuFeatures::IsSupported(ARMv7)) { |
| bailout(kUnsupportedArchitecture, "Armv6 not supported"); |
| return 0; |
| } |
| uint32_t offset = static_cast<uint32_t>(pc_offset()); |
  // PatchPrepareStackFrame will patch this in order to increase the stack
  // appropriately. Additional nops are reserved because encoding the
  // frame-size operand might require extra move instructions.
| for (int i = 0; i < liftoff::kPatchInstructionsRequired; i++) { |
| nop(); |
| } |
| DCHECK_EQ(offset + liftoff::kPatchInstructionsRequired * kInstrSize, |
| pc_offset()); |
| return offset; |
| } |
| |
| void LiftoffAssembler::PrepareTailCall(int num_callee_stack_params, |
| int stack_param_delta) { |
| UseScratchRegisterScope temps(this); |
| Register scratch = temps.Acquire(); |
| |
| // Push the return address and frame pointer to complete the stack frame. |
| sub(sp, sp, Operand(8)); |
| ldr(scratch, MemOperand(fp, 4)); |
| str(scratch, MemOperand(sp, 4)); |
| ldr(scratch, MemOperand(fp, 0)); |
| str(scratch, MemOperand(sp, 0)); |
| |
| // Shift the whole frame upwards. |
| int slot_count = num_callee_stack_params + 2; |
| for (int i = slot_count - 1; i >= 0; --i) { |
| ldr(scratch, MemOperand(sp, i * 4)); |
| str(scratch, MemOperand(fp, (i - stack_param_delta) * 4)); |
| } |
| |
| // Set the new stack and frame pointer. |
| sub(sp, fp, Operand(stack_param_delta * 4)); |
| Pop(lr, fp); |
| } |
| |
| void LiftoffAssembler::PatchPrepareStackFrame(int offset, int frame_size) { |
| #ifdef USE_SIMULATOR |
  // When running on the simulator, work around Liftoff allocating the stack
  // before checking it by limiting the frame size.
  // TODO(arm): Remove this when the stack check mechanism is updated.
| if (frame_size > KB / 2) { |
| bailout(kOtherReason, |
| "Stack limited to 512 bytes to avoid a bug in StackCheck"); |
| return; |
| } |
| #endif |
| PatchingAssembler patching_assembler(AssemblerOptions{}, |
| buffer_start_ + offset, |
| liftoff::kPatchInstructionsRequired); |
| #if V8_OS_WIN |
| if (frame_size > kStackPageSize) { |
| // Generate OOL code (at the end of the function, where the current |
| // assembler is pointing) to do the explicit stack limit check (see |
| // https://docs.microsoft.com/en-us/previous-versions/visualstudio/ |
| // visual-studio-6.0/aa227153(v=vs.60)). |
| // At the function start, emit a jump to that OOL code (from {offset} to |
| // {pc_offset()}). |
| int ool_offset = pc_offset() - offset; |
| patching_assembler.b(ool_offset - Instruction::kPcLoadDelta); |
| patching_assembler.PadWithNops(); |
| |
| // Now generate the OOL code. |
| AllocateStackSpace(frame_size); |
| // Jump back to the start of the function (from {pc_offset()} to {offset + |
| // liftoff::kPatchInstructionsRequired * kInstrSize}). |
| int func_start_offset = |
| offset + liftoff::kPatchInstructionsRequired * kInstrSize - pc_offset(); |
| b(func_start_offset - Instruction::kPcLoadDelta); |
| return; |
| } |
| #endif |
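  // Patch the reserved nops with the stack allocation. For a large
  // {frame_size} the {sub} below can expand to the full three-instruction
  // movw/movt/sub sequence; {PadWithNops} fills whatever patch space remains.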
| patching_assembler.sub(sp, sp, Operand(frame_size)); |
| patching_assembler.PadWithNops(); |
| } |
| |
| void LiftoffAssembler::FinishCode() { CheckConstPool(true, false); } |
| |
| void LiftoffAssembler::AbortCompilation() { AbortedCodeGeneration(); } |
| |
| // static |
| constexpr int LiftoffAssembler::StaticStackFrameSize() { |
| return liftoff::kInstanceOffset; |
| } |
| |
| int LiftoffAssembler::SlotSizeForType(ValueType type) { |
| switch (type.kind()) { |
| case ValueType::kS128: |
| return type.element_size_bytes(); |
| default: |
| return kStackSlotSize; |
| } |
| } |
| |
| bool LiftoffAssembler::NeedsAlignment(ValueType type) { |
| return (type.kind() == ValueType::kS128 || type.is_reference_type()); |
| } |
| |
| void LiftoffAssembler::LoadConstant(LiftoffRegister reg, WasmValue value, |
| RelocInfo::Mode rmode) { |
| switch (value.type().kind()) { |
| case ValueType::kI32: |
| TurboAssembler::Move(reg.gp(), Operand(value.to_i32(), rmode)); |
| break; |
| case ValueType::kI64: { |
| DCHECK(RelocInfo::IsNone(rmode)); |
| int32_t low_word = value.to_i64(); |
| int32_t high_word = value.to_i64() >> 32; |
| TurboAssembler::Move(reg.low_gp(), Operand(low_word)); |
| TurboAssembler::Move(reg.high_gp(), Operand(high_word)); |
| break; |
| } |
| case ValueType::kF32: |
| vmov(liftoff::GetFloatRegister(reg.fp()), value.to_f32_boxed()); |
| break; |
| case ValueType::kF64: { |
| Register extra_scratch = GetUnusedRegister(kGpReg, {}).gp(); |
| vmov(reg.fp(), Double(value.to_f64_boxed().get_bits()), extra_scratch); |
| break; |
| } |
| default: |
| UNREACHABLE(); |
| } |
| } |
| |
| void LiftoffAssembler::LoadFromInstance(Register dst, int offset, int size) { |
| DCHECK_LE(0, offset); |
| DCHECK_EQ(4, size); |
| ldr(dst, liftoff::GetInstanceOperand()); |
| ldr(dst, MemOperand(dst, offset)); |
| } |
| |
| void LiftoffAssembler::LoadTaggedPointerFromInstance(Register dst, int offset) { |
| LoadFromInstance(dst, offset, kTaggedSize); |
| } |
| |
| void LiftoffAssembler::SpillInstance(Register instance) { |
| str(instance, liftoff::GetInstanceOperand()); |
| } |
| |
| void LiftoffAssembler::FillInstanceInto(Register dst) { |
| ldr(dst, liftoff::GetInstanceOperand()); |
| } |
| |
| namespace liftoff { |
| #define __ lasm-> |
| inline void LoadInternal(LiftoffAssembler* lasm, LiftoffRegister dst, |
| Register src_addr, Register offset_reg, |
| int32_t offset_imm, LoadType type, |
| LiftoffRegList pinned, |
| uint32_t* protected_load_pc = nullptr, |
| bool is_load_mem = false) { |
| DCHECK_IMPLIES(type.value_type() == kWasmI64, dst.is_gp_pair()); |
| UseScratchRegisterScope temps(lasm); |
| if (type.value() == LoadType::kF64Load || |
| type.value() == LoadType::kF32Load || |
| type.value() == LoadType::kS128Load) { |
| Register actual_src_addr = liftoff::CalculateActualAddress( |
| lasm, &temps, src_addr, offset_reg, offset_imm); |
| if (type.value() == LoadType::kF64Load) { |
| // Armv6 is not supported so Neon can be used to avoid alignment issues. |
| CpuFeatureScope scope(lasm, NEON); |
| __ vld1(Neon64, NeonListOperand(dst.fp()), |
| NeonMemOperand(actual_src_addr)); |
| } else if (type.value() == LoadType::kF32Load) { |
| // TODO(arm): Use vld1 for f32 when implemented in simulator as used for |
| // f64. It supports unaligned access. |
| Register scratch = |
| (actual_src_addr == src_addr) ? temps.Acquire() : actual_src_addr; |
| __ ldr(scratch, MemOperand(actual_src_addr)); |
| __ vmov(liftoff::GetFloatRegister(dst.fp()), scratch); |
| } else { |
| // Armv6 is not supported so Neon can be used to avoid alignment issues. |
| CpuFeatureScope scope(lasm, NEON); |
| __ vld1(Neon8, NeonListOperand(dst.low_fp(), 2), |
| NeonMemOperand(actual_src_addr)); |
| } |
| } else { |
| MemOperand src_op = |
| liftoff::GetMemOp(lasm, &temps, src_addr, offset_reg, offset_imm); |
| if (protected_load_pc) *protected_load_pc = __ pc_offset(); |
| switch (type.value()) { |
| case LoadType::kI32Load8U: |
| __ ldrb(dst.gp(), src_op); |
| break; |
| case LoadType::kI64Load8U: |
| __ ldrb(dst.low_gp(), src_op); |
| __ mov(dst.high_gp(), Operand(0)); |
| break; |
| case LoadType::kI32Load8S: |
| __ ldrsb(dst.gp(), src_op); |
| break; |
| case LoadType::kI64Load8S: |
| __ ldrsb(dst.low_gp(), src_op); |
| __ asr(dst.high_gp(), dst.low_gp(), Operand(31)); |
| break; |
| case LoadType::kI32Load16U: |
| __ ldrh(dst.gp(), src_op); |
| break; |
| case LoadType::kI64Load16U: |
| __ ldrh(dst.low_gp(), src_op); |
| __ mov(dst.high_gp(), Operand(0)); |
| break; |
| case LoadType::kI32Load16S: |
| __ ldrsh(dst.gp(), src_op); |
| break; |
| case LoadType::kI32Load: |
| __ ldr(dst.gp(), src_op); |
| break; |
| case LoadType::kI64Load16S: |
| __ ldrsh(dst.low_gp(), src_op); |
| __ asr(dst.high_gp(), dst.low_gp(), Operand(31)); |
| break; |
| case LoadType::kI64Load32U: |
| __ ldr(dst.low_gp(), src_op); |
| __ mov(dst.high_gp(), Operand(0)); |
| break; |
| case LoadType::kI64Load32S: |
| __ ldr(dst.low_gp(), src_op); |
| __ asr(dst.high_gp(), dst.low_gp(), Operand(31)); |
| break; |
| case LoadType::kI64Load: |
| __ ldr(dst.low_gp(), src_op); |
        // GetMemOp may use a scratch register as the offset register, in
        // which case calling GetMemOp again would fail because the assembler
        // has run out of scratch registers.
| if (temps.CanAcquire()) { |
| src_op = liftoff::GetMemOp(lasm, &temps, src_addr, offset_reg, |
| offset_imm + kSystemPointerSize); |
| } else { |
| __ add(src_op.rm(), src_op.rm(), Operand(kSystemPointerSize)); |
| } |
| __ ldr(dst.high_gp(), src_op); |
| break; |
| default: |
| UNREACHABLE(); |
| } |
| } |
| } |
| #undef __ |
| } // namespace liftoff |
| |
| void LiftoffAssembler::LoadTaggedPointer(Register dst, Register src_addr, |
| Register offset_reg, |
| int32_t offset_imm, |
| LiftoffRegList pinned) { |
| STATIC_ASSERT(kTaggedSize == kInt32Size); |
| liftoff::LoadInternal(this, LiftoffRegister(dst), src_addr, offset_reg, |
| offset_imm, LoadType::kI32Load, pinned); |
| } |
| |
| void LiftoffAssembler::StoreTaggedPointer(Register dst_addr, |
| int32_t offset_imm, |
| LiftoffRegister src, |
| LiftoffRegList pinned) { |
| STATIC_ASSERT(kTaggedSize == kInt32Size); |
| // Store the value. |
| MemOperand dst_op(dst_addr, offset_imm); |
| str(src.gp(), dst_op); |
| // The write barrier. |
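  // The stub call is skipped when the destination page does not track
  // outgoing pointers, when {src} is a Smi, or when the page holding {src}
  // does not track incoming pointers; only interesting heap-pointer stores
  // reach CallRecordWriteStub.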
| Label write_barrier; |
| Label exit; |
| CheckPageFlag(dst_addr, MemoryChunk::kPointersFromHereAreInterestingMask, ne, |
| &write_barrier); |
| b(&exit); |
| bind(&write_barrier); |
| JumpIfSmi(src.gp(), &exit); |
| CheckPageFlag(src.gp(), MemoryChunk::kPointersToHereAreInterestingMask, eq, |
| &exit); |
| CallRecordWriteStub(dst_addr, Operand(offset_imm), EMIT_REMEMBERED_SET, |
| kSaveFPRegs, wasm::WasmCode::kRecordWrite); |
| bind(&exit); |
| } |
| |
| void LiftoffAssembler::Load(LiftoffRegister dst, Register src_addr, |
| Register offset_reg, uint32_t offset_imm, |
| LoadType type, LiftoffRegList pinned, |
| uint32_t* protected_load_pc, bool is_load_mem) { |
  // If {offset_imm} cannot safely be converted to int32, abort; a separate
  // bounds check should guarantee that this code is never executed.
  // TODO(7881): Support when >2GB is required.
| if (!is_uint31(offset_imm)) { |
| TurboAssembler::Abort(AbortReason::kOffsetOutOfRange); |
| return; |
| } |
| liftoff::LoadInternal(this, dst, src_addr, offset_reg, |
| static_cast<int32_t>(offset_imm), type, pinned, |
| protected_load_pc, is_load_mem); |
| } |
| |
| void LiftoffAssembler::Store(Register dst_addr, Register offset_reg, |
| uint32_t offset_imm, LiftoffRegister src, |
| StoreType type, LiftoffRegList pinned, |
| uint32_t* protected_store_pc, bool is_store_mem) { |
  // If {offset_imm} cannot safely be converted to int32, abort; a separate
  // bounds check should guarantee that this code is never executed.
  // TODO(7881): Support when >2GB is required.
| if (!is_uint31(offset_imm)) { |
| TurboAssembler::Abort(AbortReason::kOffsetOutOfRange); |
| return; |
| } |
| UseScratchRegisterScope temps(this); |
| if (type.value() == StoreType::kF64Store) { |
| Register actual_dst_addr = liftoff::CalculateActualAddress( |
| this, &temps, dst_addr, offset_reg, offset_imm); |
| // Armv6 is not supported so Neon can be used to avoid alignment issues. |
| CpuFeatureScope scope(this, NEON); |
| vst1(Neon64, NeonListOperand(src.fp()), NeonMemOperand(actual_dst_addr)); |
| } else if (type.value() == StoreType::kS128Store) { |
| Register actual_dst_addr = liftoff::CalculateActualAddress( |
| this, &temps, dst_addr, offset_reg, offset_imm); |
| // Armv6 is not supported so Neon can be used to avoid alignment issues. |
| CpuFeatureScope scope(this, NEON); |
| vst1(Neon8, NeonListOperand(src.low_fp(), 2), |
| NeonMemOperand(actual_dst_addr)); |
| } else if (type.value() == StoreType::kF32Store) { |
| // TODO(arm): Use vst1 for f32 when implemented in simulator as used for |
| // f64. It supports unaligned access. |
    // CalculateActualAddress only leaves the assembler scratch register
    // untouched if {offset_reg} is {no_reg} and {offset_imm} is zero; in all
    // other cases it consumes the scratch, so a Liftoff cache register must
    // be used for the value instead.
| Register scratch = (offset_reg == no_reg && offset_imm == 0) |
| ? temps.Acquire() |
| : GetUnusedRegister(kGpReg, pinned).gp(); |
| Register actual_dst_addr = liftoff::CalculateActualAddress( |
| this, &temps, dst_addr, offset_reg, offset_imm); |
| vmov(scratch, liftoff::GetFloatRegister(src.fp())); |
| str(scratch, MemOperand(actual_dst_addr)); |
| } else { |
| MemOperand dst_op = |
| liftoff::GetMemOp(this, &temps, dst_addr, offset_reg, offset_imm); |
| if (protected_store_pc) *protected_store_pc = pc_offset(); |
| switch (type.value()) { |
| case StoreType::kI64Store8: |
| src = src.low(); |
| V8_FALLTHROUGH; |
| case StoreType::kI32Store8: |
| strb(src.gp(), dst_op); |
| break; |
| case StoreType::kI64Store16: |
| src = src.low(); |
| V8_FALLTHROUGH; |
| case StoreType::kI32Store16: |
| strh(src.gp(), dst_op); |
| break; |
| case StoreType::kI64Store32: |
| src = src.low(); |
| V8_FALLTHROUGH; |
| case StoreType::kI32Store: |
| str(src.gp(), dst_op); |
| break; |
| case StoreType::kI64Store: |
| str(src.low_gp(), dst_op); |
        // GetMemOp may use a scratch register as the offset register, in
        // which case calling GetMemOp again would fail because the assembler
        // has run out of scratch registers.
| if (temps.CanAcquire()) { |
| dst_op = liftoff::GetMemOp(this, &temps, dst_addr, offset_reg, |
| offset_imm + kSystemPointerSize); |
| } else { |
| add(dst_op.rm(), dst_op.rm(), Operand(kSystemPointerSize)); |
| } |
| str(src.high_gp(), dst_op); |
| break; |
| default: |
| UNREACHABLE(); |
| } |
| } |
| } |
| |
| namespace liftoff { |
| #define __ lasm-> |
| |
| inline void AtomicOp32( |
| LiftoffAssembler* lasm, Register dst_addr, Register offset_reg, |
| uint32_t offset_imm, LiftoffRegister value, LiftoffRegister result, |
| LiftoffRegList pinned, |
| void (Assembler::*load)(Register, Register, Condition), |
| void (Assembler::*store)(Register, Register, Register, Condition), |
| void (*op)(LiftoffAssembler*, Register, Register, Register)) { |
| Register store_result = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp(); |
| |
| // Allocate an additional {temp} register to hold the result that should be |
| // stored to memory. Note that {temp} and {store_result} are not allowed to be |
| // the same register. |
| Register temp = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp(); |
| |
| // Make sure that {result} is unique. |
| Register result_reg = result.gp(); |
| if (result_reg == value.gp() || result_reg == dst_addr || |
| result_reg == offset_reg) { |
| result_reg = __ GetUnusedRegister(kGpReg, pinned).gp(); |
| } |
| |
| UseScratchRegisterScope temps(lasm); |
| Register actual_addr = liftoff::CalculateActualAddress( |
| lasm, &temps, dst_addr, offset_reg, offset_imm); |
| |
| __ dmb(ISH); |
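  // Load-linked/store-conditional retry loop: the exclusive store writes 0
  // into {store_result} on success and 1 on failure, so we retry until the
  // read-modify-write has been applied atomically. The barriers before and
  // after make the operation sequentially consistent.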
| Label retry; |
| __ bind(&retry); |
| (lasm->*load)(result_reg, actual_addr, al); |
| op(lasm, temp, result_reg, value.gp()); |
| (lasm->*store)(store_result, temp, actual_addr, al); |
| __ cmp(store_result, Operand(0)); |
| __ b(ne, &retry); |
| __ dmb(ISH); |
| if (result_reg != result.gp()) { |
| __ mov(result.gp(), result_reg); |
| } |
| } |
| |
| inline void Add(LiftoffAssembler* lasm, Register dst, Register lhs, |
| Register rhs) { |
| __ add(dst, lhs, rhs); |
| } |
| |
| inline void Sub(LiftoffAssembler* lasm, Register dst, Register lhs, |
| Register rhs) { |
| __ sub(dst, lhs, rhs); |
| } |
| |
| inline void And(LiftoffAssembler* lasm, Register dst, Register lhs, |
| Register rhs) { |
| __ and_(dst, lhs, rhs); |
| } |
| |
| inline void Or(LiftoffAssembler* lasm, Register dst, Register lhs, |
| Register rhs) { |
| __ orr(dst, lhs, rhs); |
| } |
| |
| inline void Xor(LiftoffAssembler* lasm, Register dst, Register lhs, |
| Register rhs) { |
| __ eor(dst, lhs, rhs); |
| } |
| |
| inline void Exchange(LiftoffAssembler* lasm, Register dst, Register lhs, |
| Register rhs) { |
| __ mov(dst, rhs); |
| } |
| |
| inline void AtomicBinop32(LiftoffAssembler* lasm, Register dst_addr, |
| Register offset_reg, uint32_t offset_imm, |
| LiftoffRegister value, LiftoffRegister result, |
| StoreType type, |
| void (*op)(LiftoffAssembler*, Register, Register, |
| Register)) { |
| LiftoffRegList pinned = |
| LiftoffRegList::ForRegs(dst_addr, offset_reg, value, result); |
| switch (type.value()) { |
| case StoreType::kI64Store8: |
| __ LoadConstant(result.high(), WasmValue(0)); |
| result = result.low(); |
| value = value.low(); |
| V8_FALLTHROUGH; |
| case StoreType::kI32Store8: |
| liftoff::AtomicOp32(lasm, dst_addr, offset_reg, offset_imm, value, result, |
| pinned, &Assembler::ldrexb, &Assembler::strexb, op); |
| return; |
| case StoreType::kI64Store16: |
| __ LoadConstant(result.high(), WasmValue(0)); |
| result = result.low(); |
| value = value.low(); |
| V8_FALLTHROUGH; |
| case StoreType::kI32Store16: |
| liftoff::AtomicOp32(lasm, dst_addr, offset_reg, offset_imm, value, result, |
| pinned, &Assembler::ldrexh, &Assembler::strexh, op); |
| return; |
| case StoreType::kI64Store32: |
| __ LoadConstant(result.high(), WasmValue(0)); |
| result = result.low(); |
| value = value.low(); |
| V8_FALLTHROUGH; |
| case StoreType::kI32Store: |
| liftoff::AtomicOp32(lasm, dst_addr, offset_reg, offset_imm, value, result, |
| pinned, &Assembler::ldrex, &Assembler::strex, op); |
| return; |
| default: |
| UNREACHABLE(); |
| } |
| } |
| |
| inline void AtomicOp64(LiftoffAssembler* lasm, Register dst_addr, |
| Register offset_reg, uint32_t offset_imm, |
| LiftoffRegister value, |
| base::Optional<LiftoffRegister> result, |
| void (*op)(LiftoffAssembler*, LiftoffRegister, |
| LiftoffRegister, LiftoffRegister)) { |
  // strexd stores a 64-bit word from two registers. The first register needs
  // to have an even index, e.g. r8, and the second register needs to be the
  // one with the next higher index, e.g. r9 if the first register is r8. In
  // the following code we use the fixed register pair r8/r9 to make the code
  // here simpler, even though other register pairs would also be possible.
| constexpr Register dst_low = r8; |
| constexpr Register dst_high = r9; |
| |
| // Make sure {dst_low} and {dst_high} are not occupied by any other value. |
| Register value_low = value.low_gp(); |
| Register value_high = value.high_gp(); |
| LiftoffRegList pinned = LiftoffRegList::ForRegs( |
| dst_addr, offset_reg, value_low, value_high, dst_low, dst_high); |
| __ ClearRegister(dst_low, {&dst_addr, &offset_reg, &value_low, &value_high}, |
| pinned); |
| pinned = pinned | |
| LiftoffRegList::ForRegs(dst_addr, offset_reg, value_low, value_high); |
| __ ClearRegister(dst_high, {&dst_addr, &offset_reg, &value_low, &value_high}, |
| pinned); |
| pinned = pinned | |
| LiftoffRegList::ForRegs(dst_addr, offset_reg, value_low, value_high); |
| |
| // Make sure that {result}, if it exists, also does not overlap with |
| // {dst_low} and {dst_high}. We don't have to transfer the value stored in |
| // {result}. |
| Register result_low = no_reg; |
| Register result_high = no_reg; |
| if (result.has_value()) { |
| result_low = result.value().low_gp(); |
| if (pinned.has(result_low)) { |
| result_low = __ GetUnusedRegister(kGpReg, pinned).gp(); |
| } |
| pinned.set(result_low); |
| |
| result_high = result.value().high_gp(); |
| if (pinned.has(result_high)) { |
| result_high = __ GetUnusedRegister(kGpReg, pinned).gp(); |
| } |
| pinned.set(result_high); |
| } |
| |
| Register store_result = __ GetUnusedRegister(kGpReg, pinned).gp(); |
| |
| UseScratchRegisterScope temps(lasm); |
| Register actual_addr = liftoff::CalculateActualAddress( |
| lasm, &temps, dst_addr, offset_reg, offset_imm); |
| |
| __ dmb(ISH); |
| Label retry; |
| __ bind(&retry); |
| // {ldrexd} is needed here so that the {strexd} instruction below can |
| // succeed. We don't need the value we are reading. We use {dst_low} and |
| // {dst_high} as the destination registers because {ldrexd} has the same |
| // restrictions on registers as {strexd}, see the comment above. |
| __ ldrexd(dst_low, dst_high, actual_addr); |
| if (result.has_value()) { |
| __ mov(result_low, dst_low); |
| __ mov(result_high, dst_high); |
| } |
| op(lasm, LiftoffRegister::ForPair(dst_low, dst_high), |
| LiftoffRegister::ForPair(dst_low, dst_high), |
| LiftoffRegister::ForPair(value_low, value_high)); |
| __ strexd(store_result, dst_low, dst_high, actual_addr); |
| __ cmp(store_result, Operand(0)); |
| __ b(ne, &retry); |
| __ dmb(ISH); |
| |
| if (result.has_value()) { |
| if (result_low != result.value().low_gp()) { |
| __ mov(result.value().low_gp(), result_low); |
| } |
| if (result_high != result.value().high_gp()) { |
| __ mov(result.value().high_gp(), result_high); |
| } |
| } |
| } |
| |
| inline void I64Store(LiftoffAssembler* lasm, LiftoffRegister dst, |
| LiftoffRegister, LiftoffRegister src) { |
| __ mov(dst.low_gp(), src.low_gp()); |
| __ mov(dst.high_gp(), src.high_gp()); |
| } |
| |
| #undef __ |
| } // namespace liftoff |
| |
| void LiftoffAssembler::AtomicLoad(LiftoffRegister dst, Register src_addr, |
| Register offset_reg, uint32_t offset_imm, |
| LoadType type, LiftoffRegList pinned) { |
| if (type.value() != LoadType::kI64Load) { |
| Load(dst, src_addr, offset_reg, offset_imm, type, pinned, nullptr, true); |
| dmb(ISH); |
| return; |
| } |
| // ldrexd loads a 64 bit word into two registers. The first register needs to |
| // have an even index, e.g. r8, the second register needs to be the one with |
| // the next higher index, e.g. r9 if the first register is r8. In the |
| // following code we use the fixed register pair r8/r9 to make the code here |
| // simpler, even though other register pairs would also be possible. |
| constexpr Register dst_low = r8; |
| constexpr Register dst_high = r9; |
| if (cache_state()->is_used(LiftoffRegister(dst_low))) { |
| SpillRegister(LiftoffRegister(dst_low)); |
| } |
| if (cache_state()->is_used(LiftoffRegister(dst_high))) { |
| SpillRegister(LiftoffRegister(dst_high)); |
| } |
| { |
| UseScratchRegisterScope temps(this); |
| Register actual_addr = liftoff::CalculateActualAddress( |
| this, &temps, src_addr, offset_reg, offset_imm); |
| ldrexd(dst_low, dst_high, actual_addr); |
| dmb(ISH); |
| } |
| |
| ParallelRegisterMove( |
| {{dst, LiftoffRegister::ForPair(dst_low, dst_high), kWasmI64}}); |
| } |
| |
| void LiftoffAssembler::AtomicStore(Register dst_addr, Register offset_reg, |
| uint32_t offset_imm, LiftoffRegister src, |
| StoreType type, LiftoffRegList pinned) { |
| if (type.value() == StoreType::kI64Store) { |
| liftoff::AtomicOp64(this, dst_addr, offset_reg, offset_imm, src, {}, |
| liftoff::I64Store); |
| return; |
| } |
| |
| dmb(ISH); |
| Store(dst_addr, offset_reg, offset_imm, src, type, pinned, nullptr, true); |
| dmb(ISH); |
| return; |
| } |
| |
| void LiftoffAssembler::AtomicAdd(Register dst_addr, Register offset_reg, |
| uint32_t offset_imm, LiftoffRegister value, |
| LiftoffRegister result, StoreType type) { |
| if (type.value() == StoreType::kI64Store) { |
| liftoff::AtomicOp64(this, dst_addr, offset_reg, offset_imm, value, {result}, |
| liftoff::I64Binop<&Assembler::add, &Assembler::adc>); |
| return; |
| } |
| liftoff::AtomicBinop32(this, dst_addr, offset_reg, offset_imm, value, result, |
| type, &liftoff::Add); |
| } |
| |
| void LiftoffAssembler::AtomicSub(Register dst_addr, Register offset_reg, |
| uint32_t offset_imm, LiftoffRegister value, |
| LiftoffRegister result, StoreType type) { |
| if (type.value() == StoreType::kI64Store) { |
| liftoff::AtomicOp64(this, dst_addr, offset_reg, offset_imm, value, {result}, |
| liftoff::I64Binop<&Assembler::sub, &Assembler::sbc>); |
| return; |
| } |
| liftoff::AtomicBinop32(this, dst_addr, offset_reg, offset_imm, value, result, |
| type, &liftoff::Sub); |
| } |
| |
| void LiftoffAssembler::AtomicAnd(Register dst_addr, Register offset_reg, |
| uint32_t offset_imm, LiftoffRegister value, |
| LiftoffRegister result, StoreType type) { |
| if (type.value() == StoreType::kI64Store) { |
| liftoff::AtomicOp64(this, dst_addr, offset_reg, offset_imm, value, {result}, |
| liftoff::I64Binop<&Assembler::and_, &Assembler::and_>); |
| return; |
| } |
| liftoff::AtomicBinop32(this, dst_addr, offset_reg, offset_imm, value, result, |
| type, &liftoff::And); |
| } |
| |
| void LiftoffAssembler::AtomicOr(Register dst_addr, Register offset_reg, |
| uint32_t offset_imm, LiftoffRegister value, |
| LiftoffRegister result, StoreType type) { |
| if (type.value() == StoreType::kI64Store) { |
| liftoff::AtomicOp64(this, dst_addr, offset_reg, offset_imm, value, {result}, |
| liftoff::I64Binop<&Assembler::orr, &Assembler::orr>); |
| return; |
| } |
| liftoff::AtomicBinop32(this, dst_addr, offset_reg, offset_imm, value, result, |
| type, &liftoff::Or); |
| } |
| |
| void LiftoffAssembler::AtomicXor(Register dst_addr, Register offset_reg, |
| uint32_t offset_imm, LiftoffRegister value, |
| LiftoffRegister result, StoreType type) { |
| if (type.value() == StoreType::kI64Store) { |
| liftoff::AtomicOp64(this, dst_addr, offset_reg, offset_imm, value, {result}, |
| liftoff::I64Binop<&Assembler::eor, &Assembler::eor>); |
| return; |
| } |
| liftoff::AtomicBinop32(this, dst_addr, offset_reg, offset_imm, value, result, |
| type, &liftoff::Xor); |
| } |
| |
| void LiftoffAssembler::AtomicExchange(Register dst_addr, Register offset_reg, |
| uint32_t offset_imm, |
| LiftoffRegister value, |
| LiftoffRegister result, StoreType type) { |
| if (type.value() == StoreType::kI64Store) { |
| liftoff::AtomicOp64(this, dst_addr, offset_reg, offset_imm, value, {result}, |
| liftoff::I64Store); |
| return; |
| } |
| liftoff::AtomicBinop32(this, dst_addr, offset_reg, offset_imm, value, result, |
| type, &liftoff::Exchange); |
| } |
| |
| namespace liftoff { |
| #define __ lasm-> |
| |
| inline void AtomicI64CompareExchange(LiftoffAssembler* lasm, |
| Register dst_addr_reg, Register offset_reg, |
| uint32_t offset_imm, |
| LiftoffRegister expected, |
| LiftoffRegister new_value, |
| LiftoffRegister result) { |
  // To implement I64AtomicCompareExchange we need nearly all registers, and
  // some of them have special constraints: for {new_value} and {result} the
  // low-word register has to have an even register code, and the high word
  // has to be in the next higher register. To avoid complicated register
  // allocation code here, we just assign fixed registers to all values and
  // then move each value into its correct register.
| Register dst_addr = r0; |
| Register offset = r1; |
| Register result_low = r4; |
| Register result_high = r5; |
| Register new_value_low = r2; |
| Register new_value_high = r3; |
| Register store_result = r6; |
| Register expected_low = r8; |
| Register expected_high = r9; |
| |
| // We spill all registers, so that we can re-assign them afterwards. |
| __ SpillRegisters(dst_addr, offset, result_low, result_high, new_value_low, |
| new_value_high, store_result, expected_low, expected_high); |
| |
| __ ParallelRegisterMove( |
| {{LiftoffRegister::ForPair(new_value_low, new_value_high), new_value, |
| kWasmI64}, |
| {LiftoffRegister::ForPair(expected_low, expected_high), expected, |
| kWasmI64}, |
| {dst_addr, dst_addr_reg, kWasmI32}, |
| {offset, offset_reg != no_reg ? offset_reg : offset, kWasmI32}}); |
| |
| { |
| UseScratchRegisterScope temps(lasm); |
| Register temp = liftoff::CalculateActualAddress( |
| lasm, &temps, dst_addr, offset_reg == no_reg ? no_reg : offset, |
| offset_imm, dst_addr); |
| // Make sure the actual address is stored in the right register. |
| DCHECK_EQ(dst_addr, temp); |
| USE(temp); |
| } |
| |
| Label retry; |
| Label done; |
| __ dmb(ISH); |
| __ bind(&retry); |
| __ ldrexd(result_low, result_high, dst_addr); |
| __ cmp(result_low, expected_low); |
| __ b(ne, &done); |
| __ cmp(result_high, expected_high); |
| __ b(ne, &done); |
| __ strexd(store_result, new_value_low, new_value_high, dst_addr); |
| __ cmp(store_result, Operand(0)); |
| __ b(ne, &retry); |
| __ dmb(ISH); |
| __ bind(&done); |
| |
| __ ParallelRegisterMove( |
| {{result, LiftoffRegister::ForPair(result_low, result_high), kWasmI64}}); |
| } |
| #undef __ |
| } // namespace liftoff |
| |
| void LiftoffAssembler::AtomicCompareExchange( |
| Register dst_addr, Register offset_reg, uint32_t offset_imm, |
| LiftoffRegister expected, LiftoffRegister new_value, LiftoffRegister result, |
| StoreType type) { |
| if (type.value() == StoreType::kI64Store) { |
| liftoff::AtomicI64CompareExchange(this, dst_addr, offset_reg, offset_imm, |
| expected, new_value, result); |
| return; |
| } |
| |
| // The other versions of CompareExchange can share code, but need special load |
| // and store instructions. |
| void (Assembler::*load)(Register, Register, Condition) = nullptr; |
| void (Assembler::*store)(Register, Register, Register, Condition) = nullptr; |
| |
| LiftoffRegList pinned = LiftoffRegList::ForRegs(dst_addr, offset_reg); |
  // We need to remember the high word of {result} so that we can set it to
  // zero at the end if necessary.
| Register result_high = no_reg; |
| switch (type.value()) { |
| case StoreType::kI64Store8: |
| result_high = result.high_gp(); |
| result = result.low(); |
| new_value = new_value.low(); |
| expected = expected.low(); |
| V8_FALLTHROUGH; |
| case StoreType::kI32Store8: |
| load = &Assembler::ldrexb; |
| store = &Assembler::strexb; |
| // We have to clear the high bits of {expected}, as we can only do a |
| // 32-bit comparison. If the {expected} register is used, we spill it |
| // first. |
| if (cache_state()->is_used(expected)) { |
| SpillRegister(expected); |
| } |
| uxtb(expected.gp(), expected.gp()); |
| break; |
| case StoreType::kI64Store16: |
| result_high = result.high_gp(); |
| result = result.low(); |
| new_value = new_value.low(); |
| expected = expected.low(); |
| V8_FALLTHROUGH; |
| case StoreType::kI32Store16: |
| load = &Assembler::ldrexh; |
| store = &Assembler::strexh; |
| // We have to clear the high bits of {expected}, as we can only do a |
| // 32-bit comparison. If the {expected} register is used, we spill it |
| // first. |
| if (cache_state()->is_used(expected)) { |
| SpillRegister(expected); |
| } |
| uxth(expected.gp(), expected.gp()); |
| break; |
| case StoreType::kI64Store32: |
| result_high = result.high_gp(); |
| result = result.low(); |
| new_value = new_value.low(); |
| expected = expected.low(); |
| V8_FALLTHROUGH; |
| case StoreType::kI32Store: |
| load = &Assembler::ldrex; |
| store = &Assembler::strex; |
| break; |
| default: |
| UNREACHABLE(); |
| } |
| pinned.set(new_value); |
| pinned.set(expected); |
| |
| Register result_reg = result.gp(); |
| if (pinned.has(result)) { |
| result_reg = GetUnusedRegister(kGpReg, pinned).gp(); |
| } |
  pinned.set(LiftoffRegister(result_reg));
| Register store_result = GetUnusedRegister(kGpReg, pinned).gp(); |
| |
| UseScratchRegisterScope temps(this); |
| Register actual_addr = liftoff::CalculateActualAddress( |
| this, &temps, dst_addr, offset_reg, offset_imm); |
| |
| Label retry; |
| Label done; |
| dmb(ISH); |
| bind(&retry); |
  (this->*load)(result_reg, actual_addr, al);
  cmp(result_reg, expected.gp());
| b(ne, &done); |
| (this->*store)(store_result, new_value.gp(), actual_addr, al); |
| cmp(store_result, Operand(0)); |
| b(ne, &retry); |
| dmb(ISH); |
| bind(&done); |
| |
| if (result.gp() != result_reg) { |
| mov(result.gp(), result_reg); |
| } |
| if (result_high != no_reg) { |
| LoadConstant(LiftoffRegister(result_high), WasmValue(0)); |
| } |
| } |
| |
| void LiftoffAssembler::AtomicFence() { dmb(ISH); } |
| |
| void LiftoffAssembler::LoadCallerFrameSlot(LiftoffRegister dst, |
| uint32_t caller_slot_idx, |
| ValueType type) { |
| MemOperand src(fp, (caller_slot_idx + 1) * kSystemPointerSize); |
| liftoff::Load(this, dst, src, type); |
| } |
| |
| void LiftoffAssembler::StoreCallerFrameSlot(LiftoffRegister src, |
| uint32_t caller_slot_idx, |
| ValueType type) { |
| MemOperand dst(fp, (caller_slot_idx + 1) * kSystemPointerSize); |
| liftoff::Store(this, src, dst, type); |
| } |
| |
| void LiftoffAssembler::LoadReturnStackSlot(LiftoffRegister dst, int offset, |
| ValueType type) { |
| MemOperand src(sp, offset); |
| liftoff::Load(this, dst, src, type); |
| } |
| |
| void LiftoffAssembler::MoveStackValue(uint32_t dst_offset, uint32_t src_offset, |
| ValueType type) { |
| DCHECK_NE(dst_offset, src_offset); |
| LiftoffRegister reg = GetUnusedRegister(reg_class_for(type), {}); |
| Fill(reg, src_offset, type); |
| Spill(dst_offset, reg, type); |
| } |
| |
| void LiftoffAssembler::Move(Register dst, Register src, ValueType type) { |
| DCHECK_NE(dst, src); |
| DCHECK(type == kWasmI32 || type.is_reference_type()); |
| TurboAssembler::Move(dst, src); |
| } |
| |
| void LiftoffAssembler::Move(DoubleRegister dst, DoubleRegister src, |
| ValueType type) { |
| DCHECK_NE(dst, src); |
| if (type == kWasmF32) { |
| vmov(liftoff::GetFloatRegister(dst), liftoff::GetFloatRegister(src)); |
| } else if (type == kWasmF64) { |
| vmov(dst, src); |
| } else { |
| DCHECK_EQ(kWasmS128, type); |
| vmov(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(src)); |
| } |
| } |
| |
| void LiftoffAssembler::Spill(int offset, LiftoffRegister reg, ValueType type) { |
| // The {str} instruction needs a temp register when the immediate in the |
| // provided MemOperand does not fit into 12 bits. This happens for large stack |
| // frames. This DCHECK checks that the temp register is available when needed. |
| DCHECK(UseScratchRegisterScope{this}.CanAcquire()); |
| DCHECK_LT(0, offset); |
| RecordUsedSpillOffset(offset); |
| MemOperand dst(fp, -offset); |
| liftoff::Store(this, reg, dst, type); |
| } |
| |
| void LiftoffAssembler::Spill(int offset, WasmValue value) { |
| RecordUsedSpillOffset(offset); |
| MemOperand dst = liftoff::GetStackSlot(offset); |
| UseScratchRegisterScope temps(this); |
| Register src = no_reg; |
  // If the offset needs more than one instruction to encode, {str} itself
  // will require the assembler scratch register, so we cannot use it for the
  // value in that case.
| if (!ImmediateFitsAddrMode2Instruction(dst.offset())) { |
| src = GetUnusedRegister(kGpReg, {}).gp(); |
| } else { |
| src = temps.Acquire(); |
| } |
| switch (value.type().kind()) { |
| case ValueType::kI32: |
| mov(src, Operand(value.to_i32())); |
| str(src, dst); |
| break; |
| case ValueType::kI64: { |
| int32_t low_word = value.to_i64(); |
| mov(src, Operand(low_word)); |
| str(src, liftoff::GetHalfStackSlot(offset, kLowWord)); |
| int32_t high_word = value.to_i64() >> 32; |
| mov(src, Operand(high_word)); |
| str(src, liftoff::GetHalfStackSlot(offset, kHighWord)); |
| break; |
| } |
| default: |
| // We do not track f32 and f64 constants, hence they are unreachable. |
| UNREACHABLE(); |
| } |
| } |
| |
| void LiftoffAssembler::Fill(LiftoffRegister reg, int offset, ValueType type) { |
| liftoff::Load(this, reg, liftoff::GetStackSlot(offset), type); |
| } |
| |
| void LiftoffAssembler::FillI64Half(Register reg, int offset, RegPairHalf half) { |
| ldr(reg, liftoff::GetHalfStackSlot(offset, half)); |
| } |
| |
| void LiftoffAssembler::FillStackSlotsWithZero(int start, int size) { |
| DCHECK_LT(0, size); |
| DCHECK_EQ(0, size % 4); |
| RecordUsedSpillOffset(start + size); |
| |
| // We need a zero reg. Always use r0 for that, and push it before to restore |
| // its value afterwards. |
| push(r0); |
| mov(r0, Operand(0)); |
| |
| if (size <= 36) { |
| // Special straight-line code for up to 9 words. Generates one |
| // instruction per word. |
| for (int offset = 4; offset <= size; offset += 4) { |
| str(r0, liftoff::GetHalfStackSlot(start + offset, kLowWord)); |
| } |
| } else { |
| // General case for bigger counts (9 instructions). |
| // Use r1 for start address (inclusive), r2 for end address (exclusive). |
| push(r1); |
| push(r2); |
| sub(r1, fp, Operand(start + size)); |
| sub(r2, fp, Operand(start)); |
| |
| Label loop; |
| bind(&loop); |
| str(r0, MemOperand(r1, /* offset */ kSystemPointerSize, PostIndex)); |
| cmp(r1, r2); |
| b(&loop, ne); |
| |
| pop(r2); |
| pop(r1); |
| } |
| |
| pop(r0); |
| } |
| |
| #define I32_BINOP(name, instruction) \ |
| void LiftoffAssembler::emit_##name(Register dst, Register lhs, \ |
| Register rhs) { \ |
| instruction(dst, lhs, rhs); \ |
| } |
| #define I32_BINOP_I(name, instruction) \ |
| I32_BINOP(name, instruction) \ |
| void LiftoffAssembler::emit_##name##i(Register dst, Register lhs, \ |
| int32_t imm) { \ |
| instruction(dst, lhs, Operand(imm)); \ |
| } |
| #define I32_SHIFTOP(name, instruction) \ |
| void LiftoffAssembler::emit_##name(Register dst, Register src, \ |
| Register amount) { \ |
| UseScratchRegisterScope temps(this); \ |
| Register scratch = temps.Acquire(); \ |
| and_(scratch, amount, Operand(0x1f)); \ |
| instruction(dst, src, Operand(scratch)); \ |
| } \ |
| void LiftoffAssembler::emit_##name##i(Register dst, Register src, \ |
| int32_t amount) { \ |
| if (V8_LIKELY((amount & 31) != 0)) { \ |
| instruction(dst, src, Operand(amount & 31)); \ |
| } else if (dst != src) { \ |
| mov(dst, src); \ |
| } \ |
| } |
| #define FP32_UNOP(name, instruction) \ |
| void LiftoffAssembler::emit_##name(DoubleRegister dst, DoubleRegister src) { \ |
| instruction(liftoff::GetFloatRegister(dst), \ |
| liftoff::GetFloatRegister(src)); \ |
| } |
| #define FP32_BINOP(name, instruction) \ |
| void LiftoffAssembler::emit_##name(DoubleRegister dst, DoubleRegister lhs, \ |
| DoubleRegister rhs) { \ |
| instruction(liftoff::GetFloatRegister(dst), \ |
| liftoff::GetFloatRegister(lhs), \ |
| liftoff::GetFloatRegister(rhs)); \ |
| } |
| #define FP64_UNOP(name, instruction) \ |
| void LiftoffAssembler::emit_##name(DoubleRegister dst, DoubleRegister src) { \ |
| instruction(dst, src); \ |
| } |
| #define FP64_BINOP(name, instruction) \ |
| void LiftoffAssembler::emit_##name(DoubleRegister dst, DoubleRegister lhs, \ |
| DoubleRegister rhs) { \ |
| instruction(dst, lhs, rhs); \ |
| } |
| |
| I32_BINOP_I(i32_add, add) |
| I32_BINOP(i32_sub, sub) |
| I32_BINOP(i32_mul, mul) |
| I32_BINOP_I(i32_and, and_) |
| I32_BINOP_I(i32_or, orr) |
| I32_BINOP_I(i32_xor, eor) |
| I32_SHIFTOP(i32_shl, lsl) |
| I32_SHIFTOP(i32_sar, asr) |
| I32_SHIFTOP(i32_shr, lsr) |
| FP32_BINOP(f32_add, vadd) |
| FP32_BINOP(f32_sub, vsub) |
| FP32_BINOP(f32_mul, vmul) |
| FP32_BINOP(f32_div, vdiv) |
| FP32_UNOP(f32_abs, vabs) |
| FP32_UNOP(f32_neg, vneg) |
| FP32_UNOP(f32_sqrt, vsqrt) |
| FP64_BINOP(f64_add, vadd) |
| FP64_BINOP(f64_sub, vsub) |
| FP64_BINOP(f64_mul, vmul) |
| FP64_BINOP(f64_div, vdiv) |
| FP64_UNOP(f64_abs, vabs) |
| FP64_UNOP(f64_neg, vneg) |
| FP64_UNOP(f64_sqrt, vsqrt) |
| |
#undef I32_BINOP
#undef I32_BINOP_I
#undef I32_SHIFTOP
#undef FP32_UNOP
#undef FP32_BINOP
#undef FP64_UNOP
#undef FP64_BINOP
| |
| void LiftoffAssembler::emit_i32_clz(Register dst, Register src) { |
| clz(dst, src); |
| } |
| |
| void LiftoffAssembler::emit_i32_ctz(Register dst, Register src) { |
| rbit(dst, src); |
| clz(dst, dst); |
| } |
| |
| namespace liftoff { |
| inline void GeneratePopCnt(Assembler* assm, Register dst, Register src, |
| Register scratch1, Register scratch2) { |
| DCHECK(!AreAliased(dst, scratch1, scratch2)); |
| if (src == scratch1) std::swap(scratch1, scratch2); |
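  // Parallel bit count in five steps. Worked example (illustrative), for
  // src == 0xF0F0F0F0: the intermediate values are 0xA0A0A0A0 (2-bit sums),
  // 0x40404040 (4-bit sums), and 0x04040404 (byte sums); the final shifted
  // adds and mask yield 16.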
| // x = x - ((x & (0x55555555 << 1)) >> 1) |
| assm->and_(scratch1, src, Operand(0xaaaaaaaa)); |
| assm->sub(dst, src, Operand(scratch1, LSR, 1)); |
| // x = (x & 0x33333333) + ((x & (0x33333333 << 2)) >> 2) |
| assm->mov(scratch1, Operand(0x33333333)); |
| assm->and_(scratch2, dst, Operand(scratch1, LSL, 2)); |
| assm->and_(scratch1, dst, scratch1); |
| assm->add(dst, scratch1, Operand(scratch2, LSR, 2)); |
| // x = (x + (x >> 4)) & 0x0F0F0F0F |
| assm->add(dst, dst, Operand(dst, LSR, 4)); |
| assm->and_(dst, dst, Operand(0x0f0f0f0f)); |
| // x = x + (x >> 8) |
| assm->add(dst, dst, Operand(dst, LSR, 8)); |
| // x = x + (x >> 16) |
| assm->add(dst, dst, Operand(dst, LSR, 16)); |
| // x = x & 0x3F |
| assm->and_(dst, dst, Operand(0x3f)); |
| } |
| } // namespace liftoff |
| |
| bool LiftoffAssembler::emit_i32_popcnt(Register dst, Register src) { |
| LiftoffRegList pinned = LiftoffRegList::ForRegs(dst); |
| Register scratch1 = pinned.set(GetUnusedRegister(kGpReg, pinned)).gp(); |
| Register scratch2 = GetUnusedRegister(kGpReg, pinned).gp(); |
| liftoff::GeneratePopCnt(this, dst, src, scratch1, scratch2); |
| return true; |
| } |
| |
| void LiftoffAssembler::emit_i32_divs(Register dst, Register lhs, Register rhs, |
| Label* trap_div_by_zero, |
| Label* trap_div_unrepresentable) { |
| if (!CpuFeatures::IsSupported(SUDIV)) { |
| bailout(kMissingCPUFeature, "i32_divs"); |
| return; |
| } |
| CpuFeatureScope scope(this, SUDIV); |
| // Issue division early so we can perform the trapping checks whilst it |
| // completes. |
| bool speculative_sdiv = dst != lhs && dst != rhs; |
| if (speculative_sdiv) { |
| sdiv(dst, lhs, rhs); |
| } |
| Label noTrap; |
| // Check for division by zero. |
| cmp(rhs, Operand(0)); |
| b(trap_div_by_zero, eq); |
| // Check for kMinInt / -1. This is unrepresentable. |
| cmp(rhs, Operand(-1)); |
| b(&noTrap, ne); |
| cmp(lhs, Operand(kMinInt)); |
| b(trap_div_unrepresentable, eq); |
| bind(&noTrap); |
| if (!speculative_sdiv) { |
| sdiv(dst, lhs, rhs); |
| } |
| } |
| |
| void LiftoffAssembler::emit_i32_divu(Register dst, Register lhs, Register rhs, |
| Label* trap_div_by_zero) { |
| if (!CpuFeatures::IsSupported(SUDIV)) { |
| bailout(kMissingCPUFeature, "i32_divu"); |
| return; |
| } |
| CpuFeatureScope scope(this, SUDIV); |
| // Check for division by zero. |
| cmp(rhs, Operand(0)); |
| b(trap_div_by_zero, eq); |
| udiv(dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i32_rems(Register dst, Register lhs, Register rhs, |
| Label* trap_div_by_zero) { |
| if (!CpuFeatures::IsSupported(SUDIV)) { |
| // When this case is handled, a check for ARMv7 is required to use mls. |
| // Mls support is implied with SUDIV support. |
| bailout(kMissingCPUFeature, "i32_rems"); |
| return; |
| } |
| CpuFeatureScope scope(this, SUDIV); |
  // No need to check kMinInt / -1: sdiv yields kMinInt for it, and
  // kMinInt * -1 wraps back to kMinInt, so the mls result is 0 as required.
| UseScratchRegisterScope temps(this); |
| Register scratch = temps.Acquire(); |
| sdiv(scratch, lhs, rhs); |
| // Check for division by zero. |
| cmp(rhs, Operand(0)); |
| b(trap_div_by_zero, eq); |
| // Compute remainder. |
| mls(dst, scratch, rhs, lhs); |
| } |
| |
| void LiftoffAssembler::emit_i32_remu(Register dst, Register lhs, Register rhs, |
| Label* trap_div_by_zero) { |
| if (!CpuFeatures::IsSupported(SUDIV)) { |
    // If the non-SUDIV path is ever implemented, it will need an explicit
    // ARMv7 check in order to use mls; with SUDIV, mls support is implied.
| bailout(kMissingCPUFeature, "i32_remu"); |
| return; |
| } |
| CpuFeatureScope scope(this, SUDIV); |
  // Unsigned division has no kMinInt / -1 corner case, so no extra check is
  // needed before computing the remainder.
| UseScratchRegisterScope temps(this); |
| Register scratch = temps.Acquire(); |
| udiv(scratch, lhs, rhs); |
| // Check for division by zero. |
| cmp(rhs, Operand(0)); |
| b(trap_div_by_zero, eq); |
| // Compute remainder. |
| mls(dst, scratch, rhs, lhs); |
| } |
| |
| void LiftoffAssembler::emit_i64_add(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::I64Binop<&Assembler::add, &Assembler::adc>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i64_addi(LiftoffRegister dst, LiftoffRegister lhs, |
| int32_t imm) { |
| liftoff::I64BinopI<&Assembler::add, &Assembler::adc>(this, dst, lhs, imm); |
| } |
| |
| void LiftoffAssembler::emit_i64_sub(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::I64Binop<&Assembler::sub, &Assembler::sbc>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i64_mul(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| // Idea: |
| // [ lhs_hi | lhs_lo ] * [ rhs_hi | rhs_lo ] |
| // = [ lhs_hi * rhs_lo | ] (32 bit mul, shift 32) |
| // + [ lhs_lo * rhs_hi | ] (32 bit mul, shift 32) |
| // + [ lhs_lo * rhs_lo ] (32x32->64 mul, shift 0) |
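  // This works since, modulo 2^64,
  //   (2^32 * lhs_hi + lhs_lo) * (2^32 * rhs_hi + rhs_lo)
  //     = 2^32 * (lhs_hi * rhs_lo + lhs_lo * rhs_hi) + lhs_lo * rhs_lo,
  // so the lhs_hi * rhs_hi term can be dropped entirely.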
| UseScratchRegisterScope temps(this); |
| Register scratch = temps.Acquire(); |
| // scratch = lhs_hi * rhs_lo |
| mul(scratch, lhs.high_gp(), rhs.low_gp()); |
| // scratch += lhs_lo * rhs_hi |
| mla(scratch, lhs.low_gp(), rhs.high_gp(), scratch); |
| // TODO(arm): use umlal once implemented correctly in the simulator. |
| // [dst_hi|dst_lo] = lhs_lo * rhs_lo |
| umull(dst.low_gp(), dst.high_gp(), lhs.low_gp(), rhs.low_gp()); |
| // dst_hi += scratch |
| add(dst.high_gp(), dst.high_gp(), scratch); |
| } |
| |
| bool LiftoffAssembler::emit_i64_divs(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs, |
| Label* trap_div_by_zero, |
| Label* trap_div_unrepresentable) { |
| return false; |
| } |
| |
| bool LiftoffAssembler::emit_i64_divu(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs, |
| Label* trap_div_by_zero) { |
| return false; |
| } |
| |
| bool LiftoffAssembler::emit_i64_rems(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs, |
| Label* trap_div_by_zero) { |
| return false; |
| } |
| |
| bool LiftoffAssembler::emit_i64_remu(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs, |
| Label* trap_div_by_zero) { |
| return false; |
| } |
| |
| void LiftoffAssembler::emit_i64_shl(LiftoffRegister dst, LiftoffRegister src, |
| Register amount) { |
| liftoff::I64Shiftop<&TurboAssembler::LslPair, true>(this, dst, src, amount); |
| } |
| |
| void LiftoffAssembler::emit_i64_shli(LiftoffRegister dst, LiftoffRegister src, |
| int32_t amount) { |
| UseScratchRegisterScope temps(this); |
| // {src.low_gp()} will still be needed after writing {dst.high_gp()}. |
| Register src_low = |
| liftoff::EnsureNoAlias(this, src.low_gp(), dst.high_gp(), &temps); |
| |
| LslPair(dst.low_gp(), dst.high_gp(), src_low, src.high_gp(), amount & 63); |
| } |
| |
| void LiftoffAssembler::emit_i64_sar(LiftoffRegister dst, LiftoffRegister src, |
| Register amount) { |
| liftoff::I64Shiftop<&TurboAssembler::AsrPair, false>(this, dst, src, amount); |
| } |
| |
| void LiftoffAssembler::emit_i64_sari(LiftoffRegister dst, LiftoffRegister src, |
| int32_t amount) { |
| UseScratchRegisterScope temps(this); |
| // {src.high_gp()} will still be needed after writing {dst.low_gp()}. |
| Register src_high = |
| liftoff::EnsureNoAlias(this, src.high_gp(), dst.low_gp(), &temps); |
| |
| AsrPair(dst.low_gp(), dst.high_gp(), src.low_gp(), src_high, amount & 63); |
| } |
| |
| void LiftoffAssembler::emit_i64_shr(LiftoffRegister dst, LiftoffRegister src, |
| Register amount) { |
| liftoff::I64Shiftop<&TurboAssembler::LsrPair, false>(this, dst, src, amount); |
| } |
| |
| void LiftoffAssembler::emit_i64_shri(LiftoffRegister dst, LiftoffRegister src, |
| int32_t amount) { |
| UseScratchRegisterScope temps(this); |
| // {src.high_gp()} will still be needed after writing {dst.low_gp()}. |
| Register src_high = |
| liftoff::EnsureNoAlias(this, src.high_gp(), dst.low_gp(), &temps); |
| |
| LsrPair(dst.low_gp(), dst.high_gp(), src.low_gp(), src_high, amount & 63); |
| } |
| |
| void LiftoffAssembler::emit_i64_clz(LiftoffRegister dst, LiftoffRegister src) { |
| // return high == 0 ? 32 + CLZ32(low) : CLZ32(high); |
| Label done; |
| Label high_is_zero; |
| cmp(src.high_gp(), Operand(0)); |
| b(&high_is_zero, eq); |
| |
| clz(dst.low_gp(), src.high_gp()); |
| jmp(&done); |
| |
| bind(&high_is_zero); |
| clz(dst.low_gp(), src.low_gp()); |
| add(dst.low_gp(), dst.low_gp(), Operand(32)); |
| |
| bind(&done); |
| mov(dst.high_gp(), Operand(0)); // High word of result is always 0. |
| } |
| |
| void LiftoffAssembler::emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src) { |
| // return low == 0 ? 32 + CTZ32(high) : CTZ32(low); |
| // CTZ32(x) = CLZ(RBIT(x)) |
| Label done; |
| Label low_is_zero; |
| cmp(src.low_gp(), Operand(0)); |
| b(&low_is_zero, eq); |
| |
| rbit(dst.low_gp(), src.low_gp()); |
| clz(dst.low_gp(), dst.low_gp()); |
| jmp(&done); |
| |
| bind(&low_is_zero); |
| rbit(dst.low_gp(), src.high_gp()); |
| clz(dst.low_gp(), dst.low_gp()); |
| add(dst.low_gp(), dst.low_gp(), Operand(32)); |
| |
| bind(&done); |
| mov(dst.high_gp(), Operand(0)); // High word of result is always 0. |
| } |
| |
| bool LiftoffAssembler::emit_i64_popcnt(LiftoffRegister dst, |
| LiftoffRegister src) { |
| // Produce partial popcnts in the two dst registers, making sure not to |
| // overwrite the second src register before using it. |
| Register src1 = src.high_gp() == dst.low_gp() ? src.high_gp() : src.low_gp(); |
| Register src2 = src.high_gp() == dst.low_gp() ? src.low_gp() : src.high_gp(); |
| LiftoffRegList pinned = LiftoffRegList::ForRegs(dst, src2); |
| Register scratch1 = pinned.set(GetUnusedRegister(kGpReg, pinned)).gp(); |
| Register scratch2 = GetUnusedRegister(kGpReg, pinned).gp(); |
| liftoff::GeneratePopCnt(this, dst.low_gp(), src1, scratch1, scratch2); |
| liftoff::GeneratePopCnt(this, dst.high_gp(), src2, scratch1, scratch2); |
| // Now add the two into the lower dst reg and clear the higher dst reg. |
| add(dst.low_gp(), dst.low_gp(), dst.high_gp()); |
| mov(dst.high_gp(), Operand(0)); |
| return true; |
| } |
| |
| bool LiftoffAssembler::emit_f32_ceil(DoubleRegister dst, DoubleRegister src) { |
| if (CpuFeatures::IsSupported(ARMv8)) { |
| CpuFeatureScope scope(this, ARMv8); |
| vrintp(liftoff::GetFloatRegister(dst), liftoff::GetFloatRegister(src)); |
| return true; |
| } |
| return false; |
| } |
| |
| bool LiftoffAssembler::emit_f32_floor(DoubleRegister dst, DoubleRegister src) { |
| if (CpuFeatures::IsSupported(ARMv8)) { |
| CpuFeatureScope scope(this, ARMv8); |
| vrintm(liftoff::GetFloatRegister(dst), liftoff::GetFloatRegister(src)); |
| return true; |
| } |
| return false; |
| } |
| |
| bool LiftoffAssembler::emit_f32_trunc(DoubleRegister dst, DoubleRegister src) { |
| if (CpuFeatures::IsSupported(ARMv8)) { |
| CpuFeatureScope scope(this, ARMv8); |
| vrintz(liftoff::GetFloatRegister(dst), liftoff::GetFloatRegister(src)); |
| return true; |
| } |
| return false; |
| } |
| |
| bool LiftoffAssembler::emit_f32_nearest_int(DoubleRegister dst, |
| DoubleRegister src) { |
| if (CpuFeatures::IsSupported(ARMv8)) { |
| CpuFeatureScope scope(this, ARMv8); |
| vrintn(liftoff::GetFloatRegister(dst), liftoff::GetFloatRegister(src)); |
| return true; |
| } |
| return false; |
| } |
| |
| void LiftoffAssembler::emit_f32_min(DoubleRegister dst, DoubleRegister lhs, |
| DoubleRegister rhs) { |
| liftoff::EmitFloatMinOrMax( |
| this, liftoff::GetFloatRegister(dst), liftoff::GetFloatRegister(lhs), |
| liftoff::GetFloatRegister(rhs), liftoff::MinOrMax::kMin); |
| } |
| |
| void LiftoffAssembler::emit_f32_max(DoubleRegister dst, DoubleRegister lhs, |
| DoubleRegister rhs) { |
| liftoff::EmitFloatMinOrMax( |
| this, liftoff::GetFloatRegister(dst), liftoff::GetFloatRegister(lhs), |
| liftoff::GetFloatRegister(rhs), liftoff::MinOrMax::kMax); |
| } |
| |
| bool LiftoffAssembler::emit_f64_ceil(DoubleRegister dst, DoubleRegister src) { |
| if (CpuFeatures::IsSupported(ARMv8)) { |
| CpuFeatureScope scope(this, ARMv8); |
| vrintp(dst, src); |
| return true; |
| } |
| return false; |
| } |
| |
| bool LiftoffAssembler::emit_f64_floor(DoubleRegister dst, DoubleRegister src) { |
| if (CpuFeatures::IsSupported(ARMv8)) { |
| CpuFeatureScope scope(this, ARMv8); |
| vrintm(dst, src); |
| return true; |
| } |
| return false; |
| } |
| |
| bool LiftoffAssembler::emit_f64_trunc(DoubleRegister dst, DoubleRegister src) { |
| if (CpuFeatures::IsSupported(ARMv8)) { |
| CpuFeatureScope scope(this, ARMv8); |
| vrintz(dst, src); |
| return true; |
| } |
| return false; |
| } |
| |
| bool LiftoffAssembler::emit_f64_nearest_int(DoubleRegister dst, |
| DoubleRegister src) { |
| if (CpuFeatures::IsSupported(ARMv8)) { |
| CpuFeatureScope scope(this, ARMv8); |
| vrintn(dst, src); |
| return true; |
| } |
| return false; |
| } |
| |
| void LiftoffAssembler::emit_f64_min(DoubleRegister dst, DoubleRegister lhs, |
| DoubleRegister rhs) { |
| liftoff::EmitFloatMinOrMax(this, dst, lhs, rhs, liftoff::MinOrMax::kMin); |
| } |
| |
| void LiftoffAssembler::emit_f64_max(DoubleRegister dst, DoubleRegister lhs, |
| DoubleRegister rhs) { |
| liftoff::EmitFloatMinOrMax(this, dst, lhs, rhs, liftoff::MinOrMax::kMax); |
| } |
| |
| void LiftoffAssembler::emit_u32_to_intptr(Register dst, Register src) { |
| // This is a nop on arm. |
| } |
| |
| void LiftoffAssembler::emit_f32_copysign(DoubleRegister dst, DoubleRegister lhs, |
| DoubleRegister rhs) { |
| constexpr uint32_t kF32SignBit = uint32_t{1} << 31; |
| UseScratchRegisterScope temps(this); |
| Register scratch = GetUnusedRegister(kGpReg, {}).gp(); |
| Register scratch2 = temps.Acquire(); |
| VmovLow(scratch, lhs); |
| // Clear sign bit in {scratch}. |
| bic(scratch, scratch, Operand(kF32SignBit)); |
| VmovLow(scratch2, rhs); |
| // Isolate sign bit in {scratch2}. |
| and_(scratch2, scratch2, Operand(kF32SignBit)); |
| // Combine {scratch2} into {scratch}. |
| orr(scratch, scratch, scratch2); |
| VmovLow(dst, scratch); |
| } |
| |
| void LiftoffAssembler::emit_f64_copysign(DoubleRegister dst, DoubleRegister lhs, |
| DoubleRegister rhs) { |
| constexpr uint32_t kF64SignBitHighWord = uint32_t{1} << 31; |
  // On arm, we cannot hold the whole f64 value in a gp register, so we just
  // operate on the upper half, which contains the sign bit.
| UseScratchRegisterScope temps(this); |
| Register scratch = GetUnusedRegister(kGpReg, {}).gp(); |
| Register scratch2 = temps.Acquire(); |
| VmovHigh(scratch, lhs); |
| // Clear sign bit in {scratch}. |
| bic(scratch, scratch, Operand(kF64SignBitHighWord)); |
| VmovHigh(scratch2, rhs); |
| // Isolate sign bit in {scratch2}. |
| and_(scratch2, scratch2, Operand(kF64SignBitHighWord)); |
| // Combine {scratch2} into {scratch}. |
| orr(scratch, scratch, scratch2); |
| vmov(dst, lhs); |
| VmovHigh(dst, scratch); |
| } |
| |
| bool LiftoffAssembler::emit_type_conversion(WasmOpcode opcode, |
| LiftoffRegister dst, |
| LiftoffRegister src, Label* trap) { |
| switch (opcode) { |
| case kExprI32ConvertI64: |
| TurboAssembler::Move(dst.gp(), src.low_gp()); |
| return true; |
| case kExprI32SConvertF32: { |
| UseScratchRegisterScope temps(this); |
| SwVfpRegister scratch_f = temps.AcquireS(); |
| vcvt_s32_f32( |
| scratch_f, |
| liftoff::GetFloatRegister(src.fp())); // f32 -> i32 round to zero. |
| vmov(dst.gp(), scratch_f); |
      // Check underflow and NaN (an unordered compare also takes lt).
| vmov(scratch_f, Float32(static_cast<float>(INT32_MIN))); |
| VFPCompareAndSetFlags(liftoff::GetFloatRegister(src.fp()), scratch_f); |
| b(trap, lt); |
      // Check overflow: the conversion saturates to kMaxInt, so dst + 1
      // sets the overflow flag.
| cmp(dst.gp(), Operand(-1)); |
| b(trap, vs); |
| return true; |
| } |
| case kExprI32UConvertF32: { |
| UseScratchRegisterScope temps(this); |
| SwVfpRegister scratch_f = temps.AcquireS(); |
| vcvt_u32_f32( |
| scratch_f, |
| liftoff::GetFloatRegister(src.fp())); // f32 -> i32 round to zero. |
| vmov(dst.gp(), scratch_f); |
      // Check underflow and NaN (an unordered compare also takes le).
| vmov(scratch_f, Float32(-1.0f)); |
| VFPCompareAndSetFlags(liftoff::GetFloatRegister(src.fp()), scratch_f); |
| b(trap, le); |
      // Check overflow: the conversion saturates to 0xFFFFFFFF, which no
      // in-range f32 input can produce.
| cmp(dst.gp(), Operand(-1)); |
| b(trap, eq); |
| return true; |
| } |
| case kExprI32SConvertF64: { |
| UseScratchRegisterScope temps(this); |
| SwVfpRegister scratch_f = temps.AcquireS(); |
| vcvt_s32_f64(scratch_f, src.fp()); // f64 -> i32 round to zero. |
| vmov(dst.gp(), scratch_f); |
| // Check underflow and NaN. |
| DwVfpRegister scratch_d = temps.AcquireD(); |
| vmov(scratch_d, Double(static_cast<double>(INT32_MIN - 1.0))); |
| VFPCompareAndSetFlags(src.fp(), scratch_d); |
| b(trap, le); |
| // Check overflow. |
| vmov(scratch_d, Double(static_cast<double>(INT32_MAX + 1.0))); |
| VFPCompareAndSetFlags(src.fp(), scratch_d); |
| b(trap, ge); |
| return true; |
| } |
| case kExprI32UConvertF64: { |
| UseScratchRegisterScope temps(this); |
| SwVfpRegister scratch_f = temps.AcquireS(); |
| vcvt_u32_f64(scratch_f, src.fp()); // f64 -> i32 round to zero. |
| vmov(dst.gp(), scratch_f); |
| // Check underflow and NaN. |
| DwVfpRegister scratch_d = temps.AcquireD(); |
| vmov(scratch_d, Double(static_cast<double>(-1.0))); |
| VFPCompareAndSetFlags(src.fp(), scratch_d); |
| b(trap, le); |
| // Check overflow. |
| vmov(scratch_d, Double(static_cast<double>(UINT32_MAX + 1.0))); |
| VFPCompareAndSetFlags(src.fp(), scratch_d); |
| b(trap, ge); |
| return true; |
| } |
| case kExprI32SConvertSatF32: { |
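      // On arm, vcvt saturates out-of-range inputs and converts NaN to 0,
      // which matches Wasm's saturating conversion semantics, so no explicit
      // checks are needed (this also holds for the three cases below).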
| UseScratchRegisterScope temps(this); |
| SwVfpRegister scratch_f = temps.AcquireS(); |
| vcvt_s32_f32( |
| scratch_f, |
| liftoff::GetFloatRegister(src.fp())); // f32 -> i32 round to zero. |
| vmov(dst.gp(), scratch_f); |
| return true; |
| } |
| case kExprI32UConvertSatF32: { |
| UseScratchRegisterScope temps(this); |
| SwVfpRegister scratch_f = temps.AcquireS(); |
| vcvt_u32_f32( |
| scratch_f, |
| liftoff::GetFloatRegister(src.fp())); // f32 -> u32 round to zero. |
| vmov(dst.gp(), scratch_f); |
| return true; |
| } |
| case kExprI32SConvertSatF64: { |
| UseScratchRegisterScope temps(this); |
| SwVfpRegister scratch_f = temps.AcquireS(); |
| vcvt_s32_f64(scratch_f, src.fp()); // f64 -> i32 round to zero. |
| vmov(dst.gp(), scratch_f); |
| return true; |
| } |
| case kExprI32UConvertSatF64: { |
| UseScratchRegisterScope temps(this); |
| SwVfpRegister scratch_f = temps.AcquireS(); |
| vcvt_u32_f64(scratch_f, src.fp()); // f64 -> u32 round to zero. |
| vmov(dst.gp(), scratch_f); |
| return true; |
| } |
| case kExprI32ReinterpretF32: |
| vmov(dst.gp(), liftoff::GetFloatRegister(src.fp())); |
| return true; |
| case kExprI64SConvertI32: |
| if (dst.low_gp() != src.gp()) mov(dst.low_gp(), src.gp()); |
| mov(dst.high_gp(), Operand(src.gp(), ASR, 31)); |
| return true; |
| case kExprI64UConvertI32: |
| if (dst.low_gp() != src.gp()) mov(dst.low_gp(), src.gp()); |
| mov(dst.high_gp(), Operand(0)); |
| return true; |
| case kExprI64ReinterpretF64: |
| vmov(dst.low_gp(), dst.high_gp(), src.fp()); |
| return true; |
| case kExprF32SConvertI32: { |
| SwVfpRegister dst_float = liftoff::GetFloatRegister(dst.fp()); |
| vmov(dst_float, src.gp()); |
| vcvt_f32_s32(dst_float, dst_float); |
| return true; |
| } |
| case kExprF32UConvertI32: { |
| SwVfpRegister dst_float = liftoff::GetFloatRegister(dst.fp()); |
| vmov(dst_float, src.gp()); |
| vcvt_f32_u32(dst_float, dst_float); |
| return true; |
| } |
| case kExprF32ConvertF64: |
| vcvt_f32_f64(liftoff::GetFloatRegister(dst.fp()), src.fp()); |
| return true; |
| case kExprF32ReinterpretI32: |
| vmov(liftoff::GetFloatRegister(dst.fp()), src.gp()); |
| return true; |
| case kExprF64SConvertI32: { |
| vmov(liftoff::GetFloatRegister(dst.fp()), src.gp()); |
| vcvt_f64_s32(dst.fp(), liftoff::GetFloatRegister(dst.fp())); |
| return true; |
| } |
| case kExprF64UConvertI32: { |
| vmov(liftoff::GetFloatRegister(dst.fp()), src.gp()); |
| vcvt_f64_u32(dst.fp(), liftoff::GetFloatRegister(dst.fp())); |
| return true; |
| } |
| case kExprF64ConvertF32: |
| vcvt_f64_f32(dst.fp(), liftoff::GetFloatRegister(src.fp())); |
| return true; |
| case kExprF64ReinterpretI64: |
| vmov(dst.fp(), src.low_gp(), src.high_gp()); |
| return true; |
| case kExprF64SConvertI64: |
| case kExprF64UConvertI64: |
| case kExprI64SConvertF32: |
| case kExprI64UConvertF32: |
| case kExprI64SConvertSatF32: |
| case kExprI64UConvertSatF32: |
| case kExprF32SConvertI64: |
| case kExprF32UConvertI64: |
| case kExprI64SConvertF64: |
| case kExprI64UConvertF64: |
| case kExprI64SConvertSatF64: |
| case kExprI64UConvertSatF64: |
| // These cases can be handled by the C fallback function. |
| return false; |
| default: |
| UNREACHABLE(); |
| } |
| } |
| |
| void LiftoffAssembler::emit_i32_signextend_i8(Register dst, Register src) { |
| sxtb(dst, src); |
| } |
| |
| void LiftoffAssembler::emit_i32_signextend_i16(Register dst, Register src) { |
| sxth(dst, src); |
| } |
| |
| void LiftoffAssembler::emit_i64_signextend_i8(LiftoffRegister dst, |
| LiftoffRegister src) { |
| emit_i32_signextend_i8(dst.low_gp(), src.low_gp()); |
| mov(dst.high_gp(), Operand(dst.low_gp(), ASR, 31)); |
| } |
| |
| void LiftoffAssembler::emit_i64_signextend_i16(LiftoffRegister dst, |
| LiftoffRegister src) { |
| emit_i32_signextend_i16(dst.low_gp(), src.low_gp()); |
| mov(dst.high_gp(), Operand(dst.low_gp(), ASR, 31)); |
| } |
| |
| void LiftoffAssembler::emit_i64_signextend_i32(LiftoffRegister dst, |
| LiftoffRegister src) { |
| TurboAssembler::Move(dst.low_gp(), src.low_gp()); |
| mov(dst.high_gp(), Operand(src.low_gp(), ASR, 31)); |
| } |
| |
| void LiftoffAssembler::emit_jump(Label* label) { b(label); } |
| |
| void LiftoffAssembler::emit_jump(Register target) { bx(target); } |
| |
| void LiftoffAssembler::emit_cond_jump(Condition cond, Label* label, |
| ValueType type, Register lhs, |
| Register rhs) { |
| DCHECK_EQ(type, kWasmI32); |
| if (rhs == no_reg) { |
| cmp(lhs, Operand(0)); |
| } else { |
| cmp(lhs, rhs); |
| } |
| b(label, cond); |
| } |
| |
| void LiftoffAssembler::emit_i32_eqz(Register dst, Register src) { |
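  // clz yields 32 (0b100000) only for a zero input, so shifting the count
  // right by log2(32) = 5 bits produces exactly the eqz result.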
| clz(dst, src); |
| mov(dst, Operand(dst, LSR, kRegSizeInBitsLog2)); |
| } |
| |
| void LiftoffAssembler::emit_i32_set_cond(Condition cond, Register dst, |
| Register lhs, Register rhs) { |
| cmp(lhs, rhs); |
| mov(dst, Operand(0), LeaveCC); |
| mov(dst, Operand(1), LeaveCC, cond); |
| } |
| |
| void LiftoffAssembler::emit_i64_eqz(Register dst, LiftoffRegister src) { |
| orr(dst, src.low_gp(), src.high_gp()); |
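  // Same clz trick as in emit_i32_eqz, applied to the or of both halves.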
| clz(dst, dst); |
| mov(dst, Operand(dst, LSR, 5)); |
| } |
| |
| void LiftoffAssembler::emit_i64_set_cond(Condition cond, Register dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| // For signed i64 comparisons, we still need to use unsigned comparison for |
| // the low word (the only bit carrying signedness information is the MSB in |
| // the high word). |
| Condition unsigned_cond = liftoff::MakeUnsigned(cond); |
| Label set_cond; |
| Label cont; |
| LiftoffRegister dest = LiftoffRegister(dst); |
| bool speculative_move = !dest.overlaps(lhs) && !dest.overlaps(rhs); |
| if (speculative_move) { |
| mov(dst, Operand(0)); |
| } |
| // Compare high word first. If it differs, use it for the set_cond. If it's |
| // equal, compare the low word and use that for set_cond. |
| cmp(lhs.high_gp(), rhs.high_gp()); |
| if (unsigned_cond == cond) { |
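    // This cmp is conditional on eq: it only executes if the high words were
    // equal; otherwise the flags of the high-word cmp survive into the
    // conditional mov below.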
| cmp(lhs.low_gp(), rhs.low_gp(), kEqual); |
| if (!speculative_move) { |
| mov(dst, Operand(0)); |
| } |
| mov(dst, Operand(1), LeaveCC, cond); |
| } else { |
| // If the condition predicate for the low differs from that for the high |
| // word, the conditional move instructions must be separated. |
| b(ne, &set_cond); |
| cmp(lhs.low_gp(), rhs.low_gp()); |
| if (!speculative_move) { |
| mov(dst, Operand(0)); |
| } |
| mov(dst, Operand(1), LeaveCC, unsigned_cond); |
| b(&cont); |
| bind(&set_cond); |
| if (!speculative_move) { |
| mov(dst, Operand(0)); |
| } |
| mov(dst, Operand(1), LeaveCC, cond); |
| bind(&cont); |
| } |
| } |
| |
| void LiftoffAssembler::emit_f32_set_cond(Condition cond, Register dst, |
| DoubleRegister lhs, |
| DoubleRegister rhs) { |
| VFPCompareAndSetFlags(liftoff::GetFloatRegister(lhs), |
| liftoff::GetFloatRegister(rhs)); |
| mov(dst, Operand(0), LeaveCC); |
| mov(dst, Operand(1), LeaveCC, cond); |
| if (cond != ne) { |
    // If the V flag is set, at least one of the arguments was NaN -> false.
| mov(dst, Operand(0), LeaveCC, vs); |
| } |
| } |
| |
| void LiftoffAssembler::emit_f64_set_cond(Condition cond, Register dst, |
| DoubleRegister lhs, |
| DoubleRegister rhs) { |
| VFPCompareAndSetFlags(lhs, rhs); |
| mov(dst, Operand(0), LeaveCC); |
| mov(dst, Operand(1), LeaveCC, cond); |
| if (cond != ne) { |
    // If the V flag is set, at least one of the arguments was NaN -> false.
| mov(dst, Operand(0), LeaveCC, vs); |
| } |
| } |
| |
| bool LiftoffAssembler::emit_select(LiftoffRegister dst, Register condition, |
| LiftoffRegister true_value, |
| LiftoffRegister false_value, |
| ValueType type) { |
| return false; |
| } |
| |
| void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr, |
| Register offset_reg, uint32_t offset_imm, |
| LoadType type, |
| LoadTransformationKind transform, |
| uint32_t* protected_load_pc) { |
| UseScratchRegisterScope temps(this); |
| Register actual_src_addr = liftoff::CalculateActualAddress( |
| this, &temps, src_addr, offset_reg, offset_imm); |
| *protected_load_pc = pc_offset(); |
| MachineType memtype = type.mem_type(); |
| |
| if (transform == LoadTransformationKind::kExtend) { |
| if (memtype == MachineType::Int8()) { |
| vld1(Neon8, NeonListOperand(dst.low_fp()), |
| NeonMemOperand(actual_src_addr)); |
| vmovl(NeonS8, liftoff::GetSimd128Register(dst), dst.low_fp()); |
| } else if (memtype == MachineType::Uint8()) { |
| vld1(Neon8, NeonListOperand(dst.low_fp()), |
| NeonMemOperand(actual_src_addr)); |
| vmovl(NeonU8, liftoff::GetSimd128Register(dst), dst.low_fp()); |
| } else if (memtype == MachineType::Int16()) { |
| vld1(Neon16, NeonListOperand(dst.low_fp()), |
| NeonMemOperand(actual_src_addr)); |
| vmovl(NeonS16, liftoff::GetSimd128Register(dst), dst.low_fp()); |
| } else if (memtype == MachineType::Uint16()) { |
| vld1(Neon16, NeonListOperand(dst.low_fp()), |
| NeonMemOperand(actual_src_addr)); |
| vmovl(NeonU16, liftoff::GetSimd128Register(dst), dst.low_fp()); |
| } else if (memtype == MachineType::Int32()) { |
| vld1(Neon32, NeonListOperand(dst.low_fp()), |
| NeonMemOperand(actual_src_addr)); |
| vmovl(NeonS32, liftoff::GetSimd128Register(dst), dst.low_fp()); |
| } else if (memtype == MachineType::Uint32()) { |
| vld1(Neon32, NeonListOperand(dst.low_fp()), |
| NeonMemOperand(actual_src_addr)); |
| vmovl(NeonU32, liftoff::GetSimd128Register(dst), dst.low_fp()); |
| } |
| } else if (transform == LoadTransformationKind::kZeroExtend) { |
| Simd128Register dest = liftoff::GetSimd128Register(dst); |
| if (memtype == MachineType::Int32()) { |
| vmov(dest, 0); |
| vld1s(Neon32, NeonListOperand(dst.low_fp()), 0, |
| NeonMemOperand(actual_src_addr)); |
| } else { |
| DCHECK_EQ(MachineType::Int64(), memtype); |
| vmov(dest.high(), 0); |
| vld1(Neon64, NeonListOperand(dest.low()), |
| NeonMemOperand(actual_src_addr)); |
| } |
| } else { |
| DCHECK_EQ(LoadTransformationKind::kSplat, transform); |
| if (memtype == MachineType::Int8()) { |
| vld1r(Neon8, NeonListOperand(liftoff::GetSimd128Register(dst)), |
| NeonMemOperand(actual_src_addr)); |
| } else if (memtype == MachineType::Int16()) { |
| vld1r(Neon16, NeonListOperand(liftoff::GetSimd128Register(dst)), |
| NeonMemOperand(actual_src_addr)); |
| } else if (memtype == MachineType::Int32()) { |
| vld1r(Neon32, NeonListOperand(liftoff::GetSimd128Register(dst)), |
| NeonMemOperand(actual_src_addr)); |
| } else if (memtype == MachineType::Int64()) { |
| vld1(Neon32, NeonListOperand(dst.low_fp()), |
| NeonMemOperand(actual_src_addr)); |
| TurboAssembler::Move(dst.high_fp(), dst.low_fp()); |
| } |
| } |
| } |
| |
| void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| UseScratchRegisterScope temps(this); |
| |
| NeonListOperand table(liftoff::GetSimd128Register(lhs)); |
| if (dst == lhs) { |
| // dst will be overwritten, so keep the table somewhere else. |
| QwNeonRegister tbl = temps.AcquireQ(); |
| TurboAssembler::Move(tbl, liftoff::GetSimd128Register(lhs)); |
| table = NeonListOperand(tbl); |
| } |
| |
| vtbl(dst.low_fp(), table, rhs.low_fp()); |
| vtbl(dst.high_fp(), table, rhs.high_fp()); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_splat(LiftoffRegister dst, |
| LiftoffRegister src) { |
| TurboAssembler::Move(dst.low_fp(), src.fp()); |
| TurboAssembler::Move(dst.high_fp(), src.fp()); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_extract_lane(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| uint8_t imm_lane_idx) { |
| ExtractLane(dst.fp(), liftoff::GetSimd128Register(lhs), imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_replace_lane(LiftoffRegister dst, |
| LiftoffRegister src1, |
| LiftoffRegister src2, |
| uint8_t imm_lane_idx) { |
| ReplaceLane(liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src1), src2.fp(), imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_abs(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vabs(dst.low_fp(), src.low_fp()); |
| vabs(dst.high_fp(), src.high_fp()); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_neg(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vneg(dst.low_fp(), src.low_fp()); |
| vneg(dst.high_fp(), src.high_fp()); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_sqrt(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vsqrt(dst.low_fp(), src.low_fp()); |
| vsqrt(dst.high_fp(), src.high_fp()); |
| } |
| |
| bool LiftoffAssembler::emit_f64x2_ceil(LiftoffRegister dst, |
| LiftoffRegister src) { |
| if (!CpuFeatures::IsSupported(ARMv8)) { |
| return false; |
| } |
| |
| CpuFeatureScope scope(this, ARMv8); |
| vrintp(dst.low_fp(), src.low_fp()); |
| vrintp(dst.high_fp(), src.high_fp()); |
| return true; |
| } |
| |
| bool LiftoffAssembler::emit_f64x2_floor(LiftoffRegister dst, |
| LiftoffRegister src) { |
| if (!CpuFeatures::IsSupported(ARMv8)) { |
| return false; |
| } |
| |
| CpuFeatureScope scope(this, ARMv8); |
| vrintm(dst.low_fp(), src.low_fp()); |
| vrintm(dst.high_fp(), src.high_fp()); |
| return true; |
| } |
| |
| bool LiftoffAssembler::emit_f64x2_trunc(LiftoffRegister dst, |
| LiftoffRegister src) { |
| if (!CpuFeatures::IsSupported(ARMv8)) { |
| return false; |
| } |
| |
| CpuFeatureScope scope(this, ARMv8); |
| vrintz(dst.low_fp(), src.low_fp()); |
| vrintz(dst.high_fp(), src.high_fp()); |
| return true; |
| } |
| |
| bool LiftoffAssembler::emit_f64x2_nearest_int(LiftoffRegister dst, |
| LiftoffRegister src) { |
| if (!CpuFeatures::IsSupported(ARMv8)) { |
| return false; |
| } |
| |
| CpuFeatureScope scope(this, ARMv8); |
| vrintn(dst.low_fp(), src.low_fp()); |
| vrintn(dst.high_fp(), src.high_fp()); |
| return true; |
| } |
| |
| void LiftoffAssembler::emit_f64x2_add(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vadd(dst.low_fp(), lhs.low_fp(), rhs.low_fp()); |
| vadd(dst.high_fp(), lhs.high_fp(), rhs.high_fp()); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vsub(dst.low_fp(), lhs.low_fp(), rhs.low_fp()); |
| vsub(dst.high_fp(), lhs.high_fp(), rhs.high_fp()); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmul(dst.low_fp(), lhs.low_fp(), rhs.low_fp()); |
| vmul(dst.high_fp(), lhs.high_fp(), rhs.high_fp()); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vdiv(dst.low_fp(), lhs.low_fp(), rhs.low_fp()); |
| vdiv(dst.high_fp(), lhs.high_fp(), rhs.high_fp()); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| Simd128Register dest = liftoff::GetSimd128Register(dst); |
| Simd128Register left = liftoff::GetSimd128Register(lhs); |
| Simd128Register right = liftoff::GetSimd128Register(rhs); |
| |
| liftoff::EmitFloatMinOrMax(this, dest.low(), left.low(), right.low(), |
| liftoff::MinOrMax::kMin); |
| liftoff::EmitFloatMinOrMax(this, dest.high(), left.high(), right.high(), |
| liftoff::MinOrMax::kMin); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| Simd128Register dest = liftoff::GetSimd128Register(dst); |
| Simd128Register left = liftoff::GetSimd128Register(lhs); |
| Simd128Register right = liftoff::GetSimd128Register(rhs); |
| |
| liftoff::EmitFloatMinOrMax(this, dest.low(), left.low(), right.low(), |
| liftoff::MinOrMax::kMax); |
| liftoff::EmitFloatMinOrMax(this, dest.high(), left.high(), right.high(), |
| liftoff::MinOrMax::kMax); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_pmin(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| QwNeonRegister dest = liftoff::GetSimd128Register(dst); |
| QwNeonRegister left = liftoff::GetSimd128Register(lhs); |
| QwNeonRegister right = liftoff::GetSimd128Register(rhs); |
| |
| if (dst != rhs) { |
| vmov(dest, left); |
| } |
| |
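  // Select right where right < left. mi is false for an unordered compare,
  // so a NaN in either operand keeps left, matching Wasm pmin (b < a ? b : a).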
| VFPCompareAndSetFlags(right.low(), left.low()); |
| vmov(dest.low(), right.low(), mi); |
| VFPCompareAndSetFlags(right.high(), left.high()); |
| vmov(dest.high(), right.high(), mi); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_pmax(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| QwNeonRegister dest = liftoff::GetSimd128Register(dst); |
| QwNeonRegister left = liftoff::GetSimd128Register(lhs); |
| QwNeonRegister right = liftoff::GetSimd128Register(rhs); |
| |
| if (dst != rhs) { |
| vmov(dest, left); |
| } |
| |
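  // As in pmin, but select right where right > left; gt is also false for an
  // unordered compare, so a NaN keeps left (Wasm pmax: a < b ? b : a).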
| VFPCompareAndSetFlags(right.low(), left.low()); |
| vmov(dest.low(), right.low(), gt); |
| VFPCompareAndSetFlags(right.high(), left.high()); |
| vmov(dest.high(), right.high(), gt); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vdup(Neon32, liftoff::GetSimd128Register(dst), src.fp(), 0); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_extract_lane(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| uint8_t imm_lane_idx) { |
| ExtractLane(liftoff::GetFloatRegister(dst.fp()), |
| liftoff::GetSimd128Register(lhs), imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_replace_lane(LiftoffRegister dst, |
| LiftoffRegister src1, |
| LiftoffRegister src2, |
| uint8_t imm_lane_idx) { |
| ReplaceLane(liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src1), |
| liftoff::GetFloatRegister(src2.fp()), imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_abs(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vabs(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_neg(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vneg(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_sqrt(LiftoffRegister dst, |
| LiftoffRegister src) { |
  // The d registers available to us are d0 to d15, each of which always
  // maps to 2 s registers.
| LowDwVfpRegister dst_low = LowDwVfpRegister::from_code(dst.low_fp().code()); |
| LowDwVfpRegister src_low = LowDwVfpRegister::from_code(src.low_fp().code()); |
| |
| LowDwVfpRegister dst_high = LowDwVfpRegister::from_code(dst.high_fp().code()); |
| LowDwVfpRegister src_high = LowDwVfpRegister::from_code(src.high_fp().code()); |
| |
| vsqrt(dst_low.low(), src_low.low()); |
| vsqrt(dst_low.high(), src_low.high()); |
| vsqrt(dst_high.low(), src_high.low()); |
| vsqrt(dst_high.high(), src_high.high()); |
| } |
| |
| bool LiftoffAssembler::emit_f32x4_ceil(LiftoffRegister dst, |
| LiftoffRegister src) { |
| if (!CpuFeatures::IsSupported(ARMv8)) { |
| return false; |
| } |
| |
| CpuFeatureScope scope(this, ARMv8); |
| vrintp(NeonS32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| return true; |
| } |
| |
| bool LiftoffAssembler::emit_f32x4_floor(LiftoffRegister dst, |
| LiftoffRegister src) { |
| if (!CpuFeatures::IsSupported(ARMv8)) { |
| return false; |
| } |
| |
| CpuFeatureScope scope(this, ARMv8); |
| vrintm(NeonS32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| return true; |
| } |
| |
| bool LiftoffAssembler::emit_f32x4_trunc(LiftoffRegister dst, |
| LiftoffRegister src) { |
| if (!CpuFeatures::IsSupported(ARMv8)) { |
| return false; |
| } |
| |
| CpuFeatureScope scope(this, ARMv8); |
| vrintz(NeonS32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| return true; |
| } |
| |
| bool LiftoffAssembler::emit_f32x4_nearest_int(LiftoffRegister dst, |
| LiftoffRegister src) { |
| if (!CpuFeatures::IsSupported(ARMv8)) { |
| return false; |
| } |
| |
| CpuFeatureScope scope(this, ARMv8); |
| vrintn(NeonS32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| return true; |
| } |
| |
| void LiftoffAssembler::emit_f32x4_add(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vadd(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(lhs), |
| liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vsub(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(lhs), |
| liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmul(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(lhs), |
| liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
  // The d registers available to us are d0 to d15, each of which always
  // maps to 2 s registers.
| LowDwVfpRegister dst_low = LowDwVfpRegister::from_code(dst.low_fp().code()); |
| LowDwVfpRegister lhs_low = LowDwVfpRegister::from_code(lhs.low_fp().code()); |
| LowDwVfpRegister rhs_low = LowDwVfpRegister::from_code(rhs.low_fp().code()); |
| |
| LowDwVfpRegister dst_high = LowDwVfpRegister::from_code(dst.high_fp().code()); |
| LowDwVfpRegister lhs_high = LowDwVfpRegister::from_code(lhs.high_fp().code()); |
| LowDwVfpRegister rhs_high = LowDwVfpRegister::from_code(rhs.high_fp().code()); |
| |
| vdiv(dst_low.low(), lhs_low.low(), rhs_low.low()); |
| vdiv(dst_low.high(), lhs_low.high(), rhs_low.high()); |
| vdiv(dst_high.low(), lhs_high.low(), rhs_high.low()); |
| vdiv(dst_high.high(), lhs_high.high(), rhs_high.high()); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmin(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(lhs), |
| liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmax(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(lhs), |
| liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_pmin(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| UseScratchRegisterScope temps(this); |
| |
| QwNeonRegister tmp = liftoff::GetSimd128Register(dst); |
| if (dst == lhs || dst == rhs) { |
| tmp = temps.AcquireQ(); |
| } |
| |
| QwNeonRegister left = liftoff::GetSimd128Register(lhs); |
| QwNeonRegister right = liftoff::GetSimd128Register(rhs); |
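  // vcgt sets each lane of tmp to all ones where left > right; vbsl then
  // selects right where tmp is set: dest = (left > right) ? right : left.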
| vcgt(tmp, left, right); |
| vbsl(tmp, right, left); |
| |
| if (dst == lhs || dst == rhs) { |
| vmov(liftoff::GetSimd128Register(dst), tmp); |
| } |
| } |
| |
| void LiftoffAssembler::emit_f32x4_pmax(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| UseScratchRegisterScope temps(this); |
| |
| QwNeonRegister tmp = liftoff::GetSimd128Register(dst); |
| if (dst == lhs || dst == rhs) { |
| tmp = temps.AcquireQ(); |
| } |
| |
| QwNeonRegister left = liftoff::GetSimd128Register(lhs); |
| QwNeonRegister right = liftoff::GetSimd128Register(rhs); |
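  // Same selection trick as in pmin, with the comparison reversed:
  // dest = (right > left) ? right : left.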
| vcgt(tmp, right, left); |
| vbsl(tmp, right, left); |
| |
| if (dst == lhs || dst == rhs) { |
| vmov(liftoff::GetSimd128Register(dst), tmp); |
| } |
| } |
| |
| void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst, |
| LiftoffRegister src) { |
| Simd128Register dst_simd = liftoff::GetSimd128Register(dst); |
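  // Broadcast the low word to all four 32-bit lanes, then patch the high
  // word into the odd lanes so that both 64-bit lanes hold the full value.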
| vdup(Neon32, dst_simd, src.low_gp()); |
| ReplaceLane(dst_simd, dst_simd, src.high_gp(), NeonS32, 1); |
| ReplaceLane(dst_simd, dst_simd, src.high_gp(), NeonS32, 3); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_extract_lane(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| uint8_t imm_lane_idx) { |
| ExtractLane(dst.low_gp(), liftoff::GetSimd128Register(lhs), NeonS32, |
| imm_lane_idx * 2); |
| ExtractLane(dst.high_gp(), liftoff::GetSimd128Register(lhs), NeonS32, |
| imm_lane_idx * 2 + 1); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_replace_lane(LiftoffRegister dst, |
| LiftoffRegister src1, |
| LiftoffRegister src2, |
| uint8_t imm_lane_idx) { |
| Simd128Register dst_simd = liftoff::GetSimd128Register(dst); |
| Simd128Register src1_simd = liftoff::GetSimd128Register(src1); |
| ReplaceLane(dst_simd, src1_simd, src2.low_gp(), NeonS32, imm_lane_idx * 2); |
| ReplaceLane(dst_simd, dst_simd, src2.high_gp(), NeonS32, |
| imm_lane_idx * 2 + 1); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_neg(LiftoffRegister dst, |
| LiftoffRegister src) { |
| UseScratchRegisterScope temps(this); |
| QwNeonRegister zero = |
| dst == src ? temps.AcquireQ() : liftoff::GetSimd128Register(dst); |
| vmov(zero, uint64_t{0}); |
| vsub(Neon64, liftoff::GetSimd128Register(dst), zero, |
| liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_shl(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kLeft, NeonS64, Neon32>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_shli(LiftoffRegister dst, LiftoffRegister lhs, |
| int32_t rhs) { |
| vshl(NeonS64, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), rhs & 63); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_shr_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kRight, NeonS64, Neon32>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_shri_s(LiftoffRegister dst, |
| LiftoffRegister lhs, int32_t rhs) { |
| liftoff::EmitSimdShiftImmediate<liftoff::kRight, NeonS64>(this, dst, lhs, |
| rhs); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_shr_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kRight, NeonU64, Neon32>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_shri_u(LiftoffRegister dst, |
| LiftoffRegister lhs, int32_t rhs) { |
| liftoff::EmitSimdShiftImmediate<liftoff::kRight, NeonU64>(this, dst, lhs, |
| rhs); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_add(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vadd(Neon64, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vsub(Neon64, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| UseScratchRegisterScope temps(this); |
| |
| QwNeonRegister dst_neon = liftoff::GetSimd128Register(dst); |
| QwNeonRegister left = liftoff::GetSimd128Register(lhs); |
| QwNeonRegister right = liftoff::GetSimd128Register(rhs); |
| |
  // These temporary registers will be modified. We can directly modify lhs
  // and rhs if they are not used, saving on temporaries.
| QwNeonRegister tmp1 = left; |
| QwNeonRegister tmp2 = right; |
| |
| LiftoffRegList used_plus_dst = |
| cache_state()->used_registers | LiftoffRegList::ForRegs(dst); |
| |
| if (used_plus_dst.has(lhs) && used_plus_dst.has(rhs)) { |
| tmp1 = temps.AcquireQ(); |
| // We only have 1 scratch Q register, so acquire another ourselves. |
| LiftoffRegList pinned = LiftoffRegList::ForRegs(dst); |
| LiftoffRegister unused_pair = GetUnusedRegister(kFpRegPair, pinned); |
| tmp2 = liftoff::GetSimd128Register(unused_pair); |
| } else if (used_plus_dst.has(lhs)) { |
| tmp1 = temps.AcquireQ(); |
| } else if (used_plus_dst.has(rhs)) { |
| tmp2 = temps.AcquireQ(); |
| } |
| |
| // Algorithm from code-generator-arm.cc, refer to comments there for details. |
| if (tmp1 != left) { |
| vmov(tmp1, left); |
| } |
| if (tmp2 != right) { |
| vmov(tmp2, right); |
| } |
| |
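  // The vtrns split each 64-bit lane into its 32-bit halves: afterwards,
  // tmp1.low()/tmp2.low() hold the two low words and the high halves the two
  // high words. The multiplies below then compute, per 64-bit lane,
  // ((lo * hi') + (hi * lo')) << 32 + lo * lo', i.e. the product mod 2^64.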
| vtrn(Neon32, tmp1.low(), tmp1.high()); |
| vtrn(Neon32, tmp2.low(), tmp2.high()); |
| |
| vmull(NeonU32, dst_neon, tmp1.low(), tmp2.high()); |
| vmlal(NeonU32, dst_neon, tmp1.high(), tmp2.low()); |
| vshl(NeonU64, dst_neon, dst_neon, 32); |
| |
| vmlal(NeonU32, dst_neon, tmp1.low(), tmp2.low()); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vdup(Neon32, liftoff::GetSimd128Register(dst), src.gp()); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_extract_lane(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| uint8_t imm_lane_idx) { |
| ExtractLane(dst.gp(), liftoff::GetSimd128Register(lhs), NeonS32, |
| imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_replace_lane(LiftoffRegister dst, |
| LiftoffRegister src1, |
| LiftoffRegister src2, |
| uint8_t imm_lane_idx) { |
| ReplaceLane(liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src1), src2.gp(), NeonS32, |
| imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_neg(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vneg(Neon32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_v32x4_anytrue(LiftoffRegister dst, |
| LiftoffRegister src) { |
| liftoff::EmitAnyTrue(this, dst, src); |
| } |
| |
| void LiftoffAssembler::emit_v32x4_alltrue(LiftoffRegister dst, |
| LiftoffRegister src) { |
| UseScratchRegisterScope temps(this); |
| DwVfpRegister scratch = temps.AcquireD(); |
| vpmin(NeonU32, scratch, src.low_fp(), src.high_fp()); |
| vpmin(NeonU32, scratch, scratch, scratch); |
| ExtractLane(dst.gp(), scratch, NeonS32, 0); |
| cmp(dst.gp(), Operand(0)); |
| mov(dst.gp(), Operand(1), LeaveCC, ne); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_bitmask(LiftoffRegister dst, |
| LiftoffRegister src) { |
| UseScratchRegisterScope temps(this); |
| Simd128Register tmp = liftoff::GetSimd128Register(src); |
| Simd128Register mask = temps.AcquireQ(); |
| |
| if (cache_state()->is_used(src)) { |
| // We only have 1 scratch Q register, so try and reuse src. |
| LiftoffRegList pinned = LiftoffRegList::ForRegs(src); |
| LiftoffRegister unused_pair = GetUnusedRegister(kFpRegPair, pinned); |
| mask = liftoff::GetSimd128Register(unused_pair); |
| } |
| |
| vshr(NeonS32, tmp, liftoff::GetSimd128Register(src), 31); |
  // Set the i-th bit in lane i. ANDing with tmp (all ones for negative
  // lanes, zero otherwise) leaves 2^i in lane i iff its sign bit was set.
| vmov(mask.low(), Double((uint64_t)0x0000'0002'0000'0001)); |
| vmov(mask.high(), Double((uint64_t)0x0000'0008'0000'0004)); |
| vand(tmp, mask, tmp); |
| vpadd(Neon32, tmp.low(), tmp.low(), tmp.high()); |
| vpadd(Neon32, tmp.low(), tmp.low(), kDoubleRegZero); |
| VmovLow(dst.gp(), tmp.low()); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_shl(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kLeft, NeonS32, Neon32>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_shli(LiftoffRegister dst, LiftoffRegister lhs, |
| int32_t rhs) { |
| vshl(NeonS32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), rhs & 31); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_shr_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kRight, NeonS32, Neon32>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_shri_s(LiftoffRegister dst, |
| LiftoffRegister lhs, int32_t rhs) { |
| liftoff::EmitSimdShiftImmediate<liftoff::kRight, NeonS32>(this, dst, lhs, |
| rhs); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_shr_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kRight, NeonU32, Neon32>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_shri_u(LiftoffRegister dst, |
| LiftoffRegister lhs, int32_t rhs) { |
| liftoff::EmitSimdShiftImmediate<liftoff::kRight, NeonU32>(this, dst, lhs, |
| rhs); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_add(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vadd(Neon32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_sub(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vsub(Neon32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_mul(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmul(Neon32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_min_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmin(NeonS32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_min_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmin(NeonU32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_max_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmax(NeonS32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_max_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmax(NeonU32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_dot_i16x8_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| QwNeonRegister dest = liftoff::GetSimd128Register(dst); |
| QwNeonRegister left = liftoff::GetSimd128Register(lhs); |
| QwNeonRegister right = liftoff::GetSimd128Register(rhs); |
| |
| UseScratchRegisterScope temps(this); |
| Simd128Register scratch = temps.AcquireQ(); |
| |
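  // vmull widens each 16-bit product to 32 bits; each vpadd then sums
  // adjacent products, yielding the four pairwise dot products.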
| vmull(NeonS16, scratch, left.low(), right.low()); |
| vpadd(Neon32, dest.low(), scratch.low(), scratch.high()); |
| |
| vmull(NeonS16, scratch, left.high(), right.high()); |
| vpadd(Neon32, dest.high(), scratch.low(), scratch.high()); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vdup(Neon16, liftoff::GetSimd128Register(dst), src.gp()); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_neg(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vneg(Neon16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_v16x8_anytrue(LiftoffRegister dst, |
| LiftoffRegister src) { |
| liftoff::EmitAnyTrue(this, dst, src); |
| } |
| |
| void LiftoffAssembler::emit_v16x8_alltrue(LiftoffRegister dst, |
| LiftoffRegister src) { |
| UseScratchRegisterScope temps(this); |
| DwVfpRegister scratch = temps.AcquireD(); |
| vpmin(NeonU16, scratch, src.low_fp(), src.high_fp()); |
| vpmin(NeonU16, scratch, scratch, scratch); |
| vpmin(NeonU16, scratch, scratch, scratch); |
| ExtractLane(dst.gp(), scratch, NeonS16, 0); |
| cmp(dst.gp(), Operand(0)); |
| mov(dst.gp(), Operand(1), LeaveCC, ne); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_bitmask(LiftoffRegister dst, |
| LiftoffRegister src) { |
| UseScratchRegisterScope temps(this); |
| Simd128Register tmp = liftoff::GetSimd128Register(src); |
| Simd128Register mask = temps.AcquireQ(); |
| |
| if (cache_state()->is_used(src)) { |
| // We only have 1 scratch Q register, so try and reuse src. |
| LiftoffRegList pinned = LiftoffRegList::ForRegs(src); |
| LiftoffRegister unused_pair = GetUnusedRegister(kFpRegPair, pinned); |
| mask = liftoff::GetSimd128Register(unused_pair); |
| } |
| |
| vshr(NeonS16, tmp, liftoff::GetSimd128Register(src), 15); |
  // Set the i-th bit in lane i. ANDing with tmp (all ones for negative
  // lanes, zero otherwise) leaves 2^i in lane i iff its sign bit was set.
| vmov(mask.low(), Double((uint64_t)0x0008'0004'0002'0001)); |
| vmov(mask.high(), Double((uint64_t)0x0080'0040'0020'0010)); |
| vand(tmp, mask, tmp); |
| vpadd(Neon16, tmp.low(), tmp.low(), tmp.high()); |
| vpadd(Neon16, tmp.low(), tmp.low(), tmp.low()); |
| vpadd(Neon16, tmp.low(), tmp.low(), tmp.low()); |
| vmov(NeonU16, dst.gp(), tmp.low(), 0); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_shl(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kLeft, NeonS16, Neon16>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_shli(LiftoffRegister dst, LiftoffRegister lhs, |
| int32_t rhs) { |
| vshl(NeonS16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), rhs & 15); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_shr_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kRight, NeonS16, Neon16>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_shri_s(LiftoffRegister dst, |
| LiftoffRegister lhs, int32_t rhs) { |
| liftoff::EmitSimdShiftImmediate<liftoff::kRight, NeonS16>(this, dst, lhs, |
| rhs); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_shr_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kRight, NeonU16, Neon16>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_shri_u(LiftoffRegister dst, |
| LiftoffRegister lhs, int32_t rhs) { |
| liftoff::EmitSimdShiftImmediate<liftoff::kRight, NeonU16>(this, dst, lhs, |
| rhs); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_add(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vadd(Neon16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_add_sat_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vqadd(NeonS16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vsub(Neon16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_sub_sat_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vqsub(NeonS16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_sub_sat_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vqsub(NeonU16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmul(Neon16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_add_sat_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vqadd(NeonU16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_min_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmin(NeonS16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_min_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmin(NeonU16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_max_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmax(NeonS16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmax(NeonU16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_extract_lane_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| uint8_t imm_lane_idx) { |
| ExtractLane(dst.gp(), liftoff::GetSimd128Register(lhs), NeonU16, |
| imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_extract_lane_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| uint8_t imm_lane_idx) { |
| ExtractLane(dst.gp(), liftoff::GetSimd128Register(lhs), NeonS16, |
| imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_replace_lane(LiftoffRegister dst, |
| LiftoffRegister src1, |
| LiftoffRegister src2, |
| uint8_t imm_lane_idx) { |
| ReplaceLane(liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src1), src2.gp(), NeonS16, |
| imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs, |
| const uint8_t shuffle[16], |
| bool is_swizzle) { |
| Simd128Register dest = liftoff::GetSimd128Register(dst); |
| Simd128Register src1 = liftoff::GetSimd128Register(lhs); |
| Simd128Register src2 = liftoff::GetSimd128Register(rhs); |
| UseScratchRegisterScope temps(this); |
| Simd128Register scratch = temps.AcquireQ(); |
| if ((src1 != src2) && src1.code() + 1 != src2.code()) { |
    // vtbl requires the table operands to be consecutive registers or the
    // same register. If src1 and src2 are neither the same nor consecutive,
    // move them to q14 and q15, which are safe to clobber because they are
    // not allocatable in Liftoff. If they are the same, we build a smaller
    // list operand below (table_size = 2).
| static_assert(!(kLiftoffAssemblerFpCacheRegs & |
| (d28.bit() | d29.bit() | d30.bit() | d31.bit())), |
| "This only works if q14-q15 (d28-d31) are not used."); |
| vmov(q14, src1); |
| src1 = q14; |
| vmov(q15, src2); |
| src2 = q15; |
| } |
| |
| int table_size = src1 == src2 ? 2 : 4; |
| |
| int scratch_s_base = scratch.code() * 4; |
| for (int j = 0; j < 4; j++) { |
| uint32_t imm = 0; |
| for (int i = 3; i >= 0; i--) { |
| imm = (imm << 8) | shuffle[j * 4 + i]; |
| } |
    // Indices must be in [0, 15] if table_size is 2, or in [0, 31] if it
    // is 4.
    DCHECK_EQ(0, imm & (table_size == 2 ? 0xF0F0F0F0 : 0xE0E0E0E0));
| vmov(SwVfpRegister::from_code(scratch_s_base + j), Float32::FromBits(imm)); |
| } |
| |
| DwVfpRegister table_base = src1.low(); |
| NeonListOperand table(table_base, table_size); |
| |
| if (dest != src1 && dest != src2) { |
| vtbl(dest.low(), table, scratch.low()); |
| vtbl(dest.high(), table, scratch.high()); |
| } else { |
| vtbl(scratch.low(), table, scratch.low()); |
| vtbl(scratch.high(), table, scratch.high()); |
| vmov(dest, scratch); |
| } |
| } |
| |
| void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vdup(Neon8, liftoff::GetSimd128Register(dst), src.gp()); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_extract_lane_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| uint8_t imm_lane_idx) { |
| ExtractLane(dst.gp(), liftoff::GetSimd128Register(lhs), NeonU8, imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_extract_lane_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| uint8_t imm_lane_idx) { |
| ExtractLane(dst.gp(), liftoff::GetSimd128Register(lhs), NeonS8, imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_replace_lane(LiftoffRegister dst, |
| LiftoffRegister src1, |
| LiftoffRegister src2, |
| uint8_t imm_lane_idx) { |
| ReplaceLane(liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src1), src2.gp(), NeonS8, |
| imm_lane_idx); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_neg(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vneg(Neon8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_v8x16_anytrue(LiftoffRegister dst, |
| LiftoffRegister src) { |
| liftoff::EmitAnyTrue(this, dst, src); |
| } |
| |
| void LiftoffAssembler::emit_v8x16_alltrue(LiftoffRegister dst, |
| LiftoffRegister src) { |
| UseScratchRegisterScope temps(this); |
| DwVfpRegister scratch = temps.AcquireD(); |
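  // Fold all 16 byte lanes down to a single lane via successive pairwise
  // unsigned minimums; the final minimum is non-zero iff every lane was
  // non-zero.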
| vpmin(NeonU8, scratch, src.low_fp(), src.high_fp()); |
| vpmin(NeonU8, scratch, scratch, scratch); |
| vpmin(NeonU8, scratch, scratch, scratch); |
| vpmin(NeonU8, scratch, scratch, scratch); |
| ExtractLane(dst.gp(), scratch, NeonS8, 0); |
| cmp(dst.gp(), Operand(0)); |
| mov(dst.gp(), Operand(1), LeaveCC, ne); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_bitmask(LiftoffRegister dst, |
| LiftoffRegister src) { |
| UseScratchRegisterScope temps(this); |
| Simd128Register tmp = liftoff::GetSimd128Register(src); |
| Simd128Register mask = temps.AcquireQ(); |
| |
  if (cache_state()->is_used(src)) {
    // There is only one scratch Q register, so by default src is reused as
    // the working register. If src is still live, pick an unused register
    // pair for the working register instead, so src is not clobbered.
    LiftoffRegList pinned = LiftoffRegList::ForRegs(src);
    LiftoffRegister unused_pair = GetUnusedRegister(kFpRegPair, pinned);
    tmp = liftoff::GetSimd128Register(unused_pair);
  }
| |
| vshr(NeonS8, tmp, liftoff::GetSimd128Register(src), 7); |
  // Set the i-th bit of each lane i. ANDing with tmp then leaves the i-th
  // bit set in every lane whose sign bit was set, and 0 in all other lanes.
  vmov(mask.low(), Double(uint64_t{0x8040'2010'0804'0201}));
  vmov(mask.high(), Double(uint64_t{0x8040'2010'0804'0201}));
| vand(tmp, mask, tmp); |
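  // Rotate the 16 bytes by 8 and zip them so that each 16-bit element of tmp
  // pairs a byte from the low half (lanes 0-7) with the corresponding byte
  // from the high half (lanes 8-15), then pairwise-add three times to sum
  // everything into element 0. The byte sums never carry, since each bit
  // position is set at most once, so element 0 ends up holding the 16-bit
  // lane mask.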
| vext(mask, tmp, tmp, 8); |
| vzip(Neon8, mask, tmp); |
| vpadd(Neon16, tmp.low(), tmp.low(), tmp.high()); |
| vpadd(Neon16, tmp.low(), tmp.low(), tmp.low()); |
| vpadd(Neon16, tmp.low(), tmp.low(), tmp.low()); |
| vmov(NeonU16, dst.gp(), tmp.low(), 0); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kLeft, NeonS8, Neon8>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_shli(LiftoffRegister dst, LiftoffRegister lhs, |
| int32_t rhs) { |
| vshl(NeonS8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), rhs & 7); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_shr_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kRight, NeonS8, Neon8>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_shri_s(LiftoffRegister dst, |
| LiftoffRegister lhs, int32_t rhs) { |
| liftoff::EmitSimdShiftImmediate<liftoff::kRight, NeonS8>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_shr_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::EmitSimdShift<liftoff::kRight, NeonU8, Neon8>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_shri_u(LiftoffRegister dst, |
| LiftoffRegister lhs, int32_t rhs) { |
| liftoff::EmitSimdShiftImmediate<liftoff::kRight, NeonU8>(this, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vadd(Neon8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_add_sat_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vqadd(NeonS8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vsub(Neon8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_sub_sat_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vqsub(NeonS8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_sub_sat_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vqsub(NeonU8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmul(Neon8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_add_sat_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vqadd(NeonU8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_min_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmin(NeonS8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_min_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmin(NeonU8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_max_s(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmax(NeonS8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_max_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vmax(NeonU8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_eq(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vceq(Neon8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_ne(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
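  // NEON has no "not equal" compare; test for equality and invert the
  // result. The same pattern is used for the other integer and f32x4 "ne"
  // operations below.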
| vceq(Neon8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| vmvn(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(dst)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_gt_s(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcgt(NeonS8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_gt_u(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcgt(NeonU8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_ge_s(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcge(NeonS8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_ge_u(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcge(NeonU8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_eq(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vceq(Neon16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_ne(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vceq(Neon16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| vmvn(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(dst)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_gt_s(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcgt(NeonS16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_gt_u(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcgt(NeonU16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_ge_s(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcge(NeonS16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_ge_u(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcge(NeonU16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_eq(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vceq(Neon32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_ne(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vceq(Neon32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| vmvn(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(dst)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_gt_s(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcgt(NeonS32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_gt_u(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcgt(NeonU32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_ge_s(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcge(NeonS32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_ge_u(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcge(NeonU32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_eq(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vceq(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(lhs), |
| liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_ne(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vceq(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(lhs), |
| liftoff::GetSimd128Register(rhs)); |
| vmvn(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(dst)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_lt(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
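  // lhs < rhs is computed as rhs > lhs, since NEON only provides gt/ge.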
| vcgt(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(rhs), |
| liftoff::GetSimd128Register(lhs)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_le(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vcge(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(rhs), |
| liftoff::GetSimd128Register(lhs)); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_eq(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::F64x2Compare(this, dst, lhs, rhs, eq); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_ne(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::F64x2Compare(this, dst, lhs, rhs, ne); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_lt(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::F64x2Compare(this, dst, lhs, rhs, lt); |
| } |
| |
| void LiftoffAssembler::emit_f64x2_le(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::F64x2Compare(this, dst, lhs, rhs, le); |
| } |
| |
| void LiftoffAssembler::emit_s128_const(LiftoffRegister dst, |
| const uint8_t imms[16]) { |
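  // Load the 128-bit immediate via two 64-bit integer moves.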
| uint64_t vals[2]; |
| memcpy(vals, imms, sizeof(vals)); |
| vmov(dst.low_fp(), Double(vals[0])); |
| vmov(dst.high_fp(), Double(vals[1])); |
| } |
| |
| void LiftoffAssembler::emit_s128_not(LiftoffRegister dst, LiftoffRegister src) { |
| vmvn(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_s128_and(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vand(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(lhs), |
| liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_s128_or(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vorr(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(lhs), |
| liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_s128_xor(LiftoffRegister dst, LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| veor(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(lhs), |
| liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_s128_select(LiftoffRegister dst, |
| LiftoffRegister src1, |
| LiftoffRegister src2, |
| LiftoffRegister mask) { |
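  // vbsl selects bits from src1 where the corresponding mask bit is set and
  // from src2 where it is clear; it both reads and writes its first operand,
  // so the mask must be moved into dst first.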
| if (dst != mask) { |
| vmov(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(mask)); |
| } |
| vbsl(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(src1), |
| liftoff::GetSimd128Register(src2)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vcvt_s32_f32(liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vcvt_u32_f32(liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_sconvert_i32x4(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vcvt_f32_s32(liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_f32x4_uconvert_i32x4(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vcvt_f32_u32(liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_sconvert_i16x8(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
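  // Narrow the two i16x8 inputs into one i8x16 with signed saturation; the
  // unsigned variants below saturate the signed inputs to unsigned lanes.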
| liftoff::S128NarrowOp(this, NeonS8, NeonS8, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_uconvert_i16x8(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::S128NarrowOp(this, NeonU8, NeonS8, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_sconvert_i32x4(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::S128NarrowOp(this, NeonS16, NeonS16, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_uconvert_i32x4(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| liftoff::S128NarrowOp(this, NeonU16, NeonS16, dst, lhs, rhs); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_sconvert_i8x16_low(LiftoffRegister dst, |
| LiftoffRegister src) { |
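  // vmovl widens the eight low byte lanes to i16 with sign extension; the
  // _high variants below widen the upper half, and the _u variants
  // zero-extend instead.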
| vmovl(NeonS8, liftoff::GetSimd128Register(dst), src.low_fp()); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_sconvert_i8x16_high(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vmovl(NeonS8, liftoff::GetSimd128Register(dst), src.high_fp()); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_uconvert_i8x16_low(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vmovl(NeonU8, liftoff::GetSimd128Register(dst), src.low_fp()); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_uconvert_i8x16_high(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vmovl(NeonU8, liftoff::GetSimd128Register(dst), src.high_fp()); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_sconvert_i16x8_low(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vmovl(NeonS16, liftoff::GetSimd128Register(dst), src.low_fp()); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_sconvert_i16x8_high(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vmovl(NeonS16, liftoff::GetSimd128Register(dst), src.high_fp()); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_uconvert_i16x8_low(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vmovl(NeonU16, liftoff::GetSimd128Register(dst), src.low_fp()); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_uconvert_i16x8_high(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vmovl(NeonU16, liftoff::GetSimd128Register(dst), src.high_fp()); |
| } |
| |
| void LiftoffAssembler::emit_s128_and_not(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
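  // vbic computes lhs & ~rhs, which matches the s128.andnot semantics.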
| vbic(liftoff::GetSimd128Register(dst), liftoff::GetSimd128Register(lhs), |
| liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_rounding_average_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vrhadd(NeonU8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_rounding_average_u(LiftoffRegister dst, |
| LiftoffRegister lhs, |
| LiftoffRegister rhs) { |
| vrhadd(NeonU16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(lhs), liftoff::GetSimd128Register(rhs)); |
| } |
| |
| void LiftoffAssembler::emit_i8x16_abs(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vabs(Neon8, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_i16x8_abs(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vabs(Neon16, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::emit_i32x4_abs(LiftoffRegister dst, |
| LiftoffRegister src) { |
| vabs(Neon32, liftoff::GetSimd128Register(dst), |
| liftoff::GetSimd128Register(src)); |
| } |
| |
| void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) { |
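  // The register holds the address of the stack limit cell; load the limit,
  // compare sp against it (unsigned), and branch to the out-of-line code
  // when sp is at or below the limit.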
| ldr(limit_address, MemOperand(limit_address)); |
| cmp(sp, limit_address); |
| b(ool_code, ls); |
| } |
| |
| void LiftoffAssembler::CallTrapCallbackForTesting() { |
| PrepareCallCFunction(0, 0); |
| CallCFunction(ExternalReference::wasm_call_trap_callback_for_testing(), 0); |
| } |
| |
| void LiftoffAssembler::AssertUnreachable(AbortReason reason) { |
| // Asserts unreachable within the wasm code. |
| TurboAssembler::AssertUnreachable(reason); |
| } |
| |
| void LiftoffAssembler::PushRegisters(LiftoffRegList regs) { |
| RegList core_regs = regs.GetGpList(); |
| if (core_regs != 0) { |
| stm(db_w, sp, core_regs); |
| } |
| LiftoffRegList fp_regs = regs & kFpCacheRegList; |
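  // Store the FP registers in maximal runs of consecutive registers, since
  // vstm only accepts a contiguous range of up to 16 D registers.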
| while (!fp_regs.is_empty()) { |
| LiftoffRegister reg = fp_regs.GetFirstRegSet(); |
| DoubleRegister first = reg.fp(); |
| DoubleRegister last = first; |
| fp_regs.clear(reg); |
| while (!fp_regs.is_empty()) { |
| LiftoffRegister reg = fp_regs.GetFirstRegSet(); |
| int code = reg.fp().code(); |
      // vstm cannot store more than 16 registers, and the range must be
      // consecutive; stop extending the run when either condition would be
      // violated.
      if ((code != last.code() + 1) || ((code - first.code() + 1) > 16)) break;
| last = reg.fp(); |
| fp_regs.clear(reg); |
| } |
| vstm(db_w, sp, first, last); |
| } |
| } |
| |
| void LiftoffAssembler::PopRegisters(LiftoffRegList regs) { |
| LiftoffRegList fp_regs = regs & kFpCacheRegList; |
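  // Mirror PushRegisters: reload the FP registers in consecutive runs,
  // scanning from the highest register downwards so the loads match the
  // order in which the runs were pushed.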
| while (!fp_regs.is_empty()) { |
| LiftoffRegister reg = fp_regs.GetLastRegSet(); |
| DoubleRegister last = reg.fp(); |
| DoubleRegister first = last; |
| fp_regs.clear(reg); |
| while (!fp_regs.is_empty()) { |
| LiftoffRegister reg = fp_regs.GetLastRegSet(); |
| int code = reg.fp().code(); |
| if ((code != first.code() - 1) || ((last.code() - code + 1) > 16)) break; |
| first = reg.fp(); |
| fp_regs.clear(reg); |
| } |
| vldm(ia_w, sp, first, last); |
| } |
| RegList core_regs = regs.GetGpList(); |
| if (core_regs != 0) { |
| ldm(ia_w, sp, core_regs); |
| } |
| } |
| |
| void LiftoffAssembler::DropStackSlotsAndRet(uint32_t num_stack_slots) { |
| Drop(num_stack_slots); |
| Ret(); |
| } |
| |
| void LiftoffAssembler::CallC(const wasm::FunctionSig* sig, |
| const LiftoffRegister* args, |
| const LiftoffRegister* rets, |
| ValueType out_argument_type, int stack_bytes, |
| ExternalReference ext_ref) { |
| // Arguments are passed by pushing them all to the stack and then passing |
| // a pointer to them. |
| DCHECK(IsAligned(stack_bytes, kSystemPointerSize)); |
| // Reserve space in the stack. |
| AllocateStackSpace(stack_bytes); |
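  // Store each argument into the buffer at its offset; an i64 occupies two
  // stack words and an s128 four.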
| |
| int arg_bytes = 0; |
| for (ValueType param_type : sig->parameters()) { |
| switch (param_type.kind()) { |
| case ValueType::kI32: |
| str(args->gp(), MemOperand(sp, arg_bytes)); |
| break; |
| case ValueType::kI64: |
| str(args->low_gp(), MemOperand(sp, arg_bytes)); |
| str(args->high_gp(), MemOperand(sp, arg_bytes + kSystemPointerSize)); |
| break; |
| case ValueType::kF32: |
| vstr(liftoff::GetFloatRegister(args->fp()), MemOperand(sp, arg_bytes)); |
| break; |
| case ValueType::kF64: |
| vstr(args->fp(), MemOperand(sp, arg_bytes)); |
| break; |
| case ValueType::kS128: |
| vstr(args->low_fp(), MemOperand(sp, arg_bytes)); |
| vstr(args->high_fp(), |
| MemOperand(sp, arg_bytes + 2 * kSystemPointerSize)); |
| break; |
| default: |
| UNREACHABLE(); |
| } |
| args++; |
| arg_bytes += param_type.element_size_bytes(); |
| } |
| DCHECK_LE(arg_bytes, stack_bytes); |
| |
| // Pass a pointer to the buffer with the arguments to the C function. |
| mov(r0, sp); |
| |
| // Now call the C function. |
| constexpr int kNumCCallArgs = 1; |
| PrepareCallCFunction(kNumCCallArgs); |
| CallCFunction(ext_ref, kNumCCallArgs); |
| |
| // Move return value to the right register. |
| const LiftoffRegister* result_reg = rets; |
| if (sig->return_count() > 0) { |
| DCHECK_EQ(1, sig->return_count()); |
| constexpr Register kReturnReg = r0; |
| if (kReturnReg != rets->gp()) { |
| Move(*rets, LiftoffRegister(kReturnReg), sig->GetReturn(0)); |
| } |
| result_reg++; |
| } |
| |
| // Load potential output value from the buffer on the stack. |
| if (out_argument_type != kWasmStmt) { |
| switch (out_argument_type.kind()) { |
| case ValueType::kI32: |
| ldr(result_reg->gp(), MemOperand(sp)); |
| break; |
| case ValueType::kI64: |
| ldr(result_reg->low_gp(), MemOperand(sp)); |
| ldr(result_reg->high_gp(), MemOperand(sp, kSystemPointerSize)); |
| break; |
| case ValueType::kF32: |
| vldr(liftoff::GetFloatRegister(result_reg->fp()), MemOperand(sp)); |
| break; |
| case ValueType::kF64: |
| vldr(result_reg->fp(), MemOperand(sp)); |
| break; |
| case ValueType::kS128: |
| vld1(Neon8, NeonListOperand(result_reg->low_fp(), 2), |
| NeonMemOperand(sp)); |
| break; |
| default: |
| UNREACHABLE(); |
| } |
| } |
| add(sp, sp, Operand(stack_bytes)); |
| } |
| |
| void LiftoffAssembler::CallNativeWasmCode(Address addr) { |
| Call(addr, RelocInfo::WASM_CALL); |
| } |
| |
| void LiftoffAssembler::TailCallNativeWasmCode(Address addr) { |
| Jump(addr, RelocInfo::WASM_CALL); |
| } |
| |
| void LiftoffAssembler::CallIndirect(const wasm::FunctionSig* sig, |
| compiler::CallDescriptor* call_descriptor, |
| Register target) { |
| DCHECK(target != no_reg); |
| Call(target); |
| } |
| |
| void LiftoffAssembler::TailCallIndirect(Register target) { |
| DCHECK(target != no_reg); |
| Jump(target); |
| } |
| |
| void LiftoffAssembler::CallRuntimeStub(WasmCode::RuntimeStubId sid) { |
| // A direct call to a wasm runtime stub defined in this module. |
| // Just encode the stub index. This will be patched at relocation. |
| Call(static_cast<Address>(sid), RelocInfo::WASM_STUB_CALL); |
| } |
| |
| void LiftoffAssembler::AllocateStackSlot(Register addr, uint32_t size) { |
| AllocateStackSpace(size); |
| mov(addr, sp); |
| } |
| |
| void LiftoffAssembler::DeallocateStackSlot(uint32_t size) { |
| add(sp, sp, Operand(size)); |
| } |
| |
| void LiftoffStackSlots::Construct() { |
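  // Materialize each recorded argument slot on the machine stack, from a
  // spilled stack slot, a register, or an i32 constant, one word (or one
  // register) at a time.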
| for (auto& slot : slots_) { |
| const LiftoffAssembler::VarState& src = slot.src_; |
| switch (src.loc()) { |
| case LiftoffAssembler::VarState::kStack: { |
| switch (src.type().kind()) { |
          // i32 and i64 are handled alike here: an i64 has already been
          // split into two i32 half-slots.
| case ValueType::kI32: |
| case ValueType::kI64: |
| case ValueType::kF32: { |
| UseScratchRegisterScope temps(asm_); |
| Register scratch = temps.Acquire(); |
| asm_->ldr(scratch, |
| liftoff::GetHalfStackSlot(slot.src_offset_, slot.half_)); |
| asm_->Push(scratch); |
| } break; |
| case ValueType::kF64: { |
| UseScratchRegisterScope temps(asm_); |
| DwVfpRegister scratch = temps.AcquireD(); |
| asm_->vldr(scratch, liftoff::GetStackSlot(slot.src_offset_)); |
| asm_->vpush(scratch); |
| } break; |
| case ValueType::kS128: { |
| MemOperand mem_op = liftoff::GetStackSlot(slot.src_offset_); |
| UseScratchRegisterScope temps(asm_); |
| Register addr = liftoff::CalculateActualAddress( |
| asm_, &temps, mem_op.rn(), no_reg, mem_op.offset()); |
| QwNeonRegister scratch = temps.AcquireQ(); |
| asm_->vld1(Neon8, NeonListOperand(scratch), NeonMemOperand(addr)); |
| asm_->vpush(scratch); |
| break; |
| } |
| default: |
| UNREACHABLE(); |
| } |
| break; |
| } |
| case LiftoffAssembler::VarState::kRegister: |
| switch (src.type().kind()) { |
| case ValueType::kI64: { |
| LiftoffRegister reg = |
| slot.half_ == kLowWord ? src.reg().low() : src.reg().high(); |
| asm_->push(reg.gp()); |
| } break; |
| case ValueType::kI32: |
| asm_->push(src.reg().gp()); |
| break; |
| case ValueType::kF32: |
| asm_->vpush(liftoff::GetFloatRegister(src.reg().fp())); |
| break; |
| case ValueType::kF64: |
| asm_->vpush(src.reg().fp()); |
| break; |
| case ValueType::kS128: |
| asm_->vpush(liftoff::GetSimd128Register(src.reg())); |
| break; |
| default: |
| UNREACHABLE(); |
| } |
| break; |
| case LiftoffAssembler::VarState::kIntConst: { |
| DCHECK(src.type() == kWasmI32 || src.type() == kWasmI64); |
| UseScratchRegisterScope temps(asm_); |
| Register scratch = temps.Acquire(); |
| // The high word is the sign extension of the low word. |
| asm_->mov(scratch, |
| Operand(slot.half_ == kLowWord ? src.i32_const() |
| : src.i32_const() >> 31)); |
| asm_->push(scratch); |
| break; |
| } |
| } |
| } |
| } |
| |
| } // namespace wasm |
| } // namespace internal |
| } // namespace v8 |
| |
| #endif // V8_WASM_BASELINE_ARM_LIFTOFF_ASSEMBLER_ARM_H_ |