 // third_party/v8/src/compiler/backend/ia32/code-generator-ia32.cc - cobalt - Git at Google

 // Copyright 2013 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "src/base/overflowing-math.h"
 #include "src/codegen/assembler-inl.h"
 #include "src/codegen/callable.h"
 #include "src/codegen/ia32/assembler-ia32.h"
 #include "src/codegen/macro-assembler.h"
 #include "src/codegen/optimized-compilation-info.h"
 #include "src/compiler/backend/code-generator-impl.h"
 #include "src/compiler/backend/code-generator.h"
 #include "src/compiler/backend/gap-resolver.h"
 #include "src/compiler/node-matchers.h"
 #include "src/compiler/osr.h"
 #include "src/execution/frame-constants.h"
 #include "src/execution/frames.h"
 #include "src/heap/memory-chunk.h"
 #include "src/objects/smi.h"
 #include "src/wasm/wasm-code-manager.h"
 #include "src/wasm/wasm-objects.h"

 namespace v8 {
 namespace internal {
 namespace compiler {

 // Shorthand for emitting code: `__ foo(...)` expands to `tasm()->foo(...)`,
 // so it can only be used where a tasm() accessor is in scope.
 #define __ tasm()->

 // Alias for xmm0. NOTE(review): presumably the FP scratch register of this
 // code generator; it is not referenced in the part of the file reviewed here.
 #define kScratchDoubleReg xmm0

 // Adds IA-32 specific methods for decoding operands.
 //
 // Wraps a single Instruction and converts its InstructionOperands into the
 // Operand / Immediate / Register values consumed by the ia32 assembler.
 class IA32OperandConverter : public InstructionOperandConverter {
  public:
   IA32OperandConverter(CodeGenerator* gen, Instruction* instr)
       : InstructionOperandConverter(gen, instr) {}

   // Returns input |index| as an Operand. |extra| is added to the displacement
   // of stack-slot inputs and must be 0 for register inputs (see ToOperand).
   Operand InputOperand(size_t index, int extra = 0) {
     return ToOperand(instr_->InputAt(index), extra);
   }

   // Returns input |index| converted to an Immediate (see ToImmediate).
   Immediate InputImmediate(size_t index) {
     return ToImmediate(instr_->InputAt(index));
   }

   // Returns the instruction's output as an Operand.
   Operand OutputOperand() { return ToOperand(instr_->Output()); }

   // Converts |op| to an Operand. General and FP registers become register
   // operands (|extra| must then be 0); anything else must be a (FP) stack
   // slot and becomes a frame-relative memory operand displaced by |extra|.
   Operand ToOperand(InstructionOperand* op, int extra = 0) {
     if (op->IsRegister()) {
       DCHECK_EQ(0, extra);
       return Operand(ToRegister(op));
     } else if (op->IsFPRegister()) {
       DCHECK_EQ(0, extra);
       return Operand(ToDoubleRegister(op));
     }
     DCHECK(op->IsStackSlot() || op->IsFPStackSlot());
     return SlotToOperand(AllocatedOperand::cast(op)->index(), extra);
   }

   // Returns the memory operand for stack slot |slot|. The base register is
   // esp or ebp depending on what the frame access state reports for the slot;
   // |extra| is added to the slot's offset.
   Operand SlotToOperand(int slot, int extra = 0) {
     FrameOffset offset = frame_access_state()->GetFrameOffset(slot);
     return Operand(offset.from_stack_pointer() ? esp : ebp,
                    offset.offset() + extra);
   }

   // Converts the constant behind |operand| into an Immediate.
   Immediate ToImmediate(InstructionOperand* operand) {
     Constant constant = ToConstant(operand);
     // Int32 constants that are wasm references keep their relocation mode.
     if (constant.type() == Constant::kInt32 &&
         RelocInfo::IsWasmReference(constant.rmode())) {
       return Immediate(static_cast<Address>(constant.ToInt32()),
                        constant.rmode());
     }
     switch (constant.type()) {
       case Constant::kInt32:
         return Immediate(constant.ToInt32());
       case Constant::kFloat32:
         return Immediate::EmbeddedNumber(constant.ToFloat32());
       case Constant::kFloat64:
         return Immediate::EmbeddedNumber(constant.ToFloat64().value());
       case Constant::kExternalReference:
         return Immediate(constant.ToExternalReference());
       case Constant::kHeapObject:
         return Immediate(constant.ToHeapObject());
       case Constant::kCompressedHeapObject:
         // No Immediate form here; falls through to UNREACHABLE() below.
         break;
       case Constant::kDelayedStringConstant:
         return Immediate::EmbeddedStringConstant(
             constant.ToDelayedStringConstant());
       case Constant::kInt64:
         // No Immediate form here; falls through to UNREACHABLE() below.
         break;
       case Constant::kRpoNumber:
         return Immediate::CodeRelativeOffset(ToLabel(operand));
     }
     UNREACHABLE();
   }

   // Returns the current value of |*offset| and then increments |*offset|.
   static size_t NextOffset(size_t* offset) {
     size_t i = *offset;
     (*offset)++;
     return i;
   }

   // Returns the ScaleFactor for |mode| as its distance from |one|, the first
   // (times_1) addressing mode of the same family. The static asserts pin the
   // numeric values of times_1..times_8 that this arithmetic relies on.
   static ScaleFactor ScaleFor(AddressingMode one, AddressingMode mode) {
     STATIC_ASSERT(0 == static_cast<int>(times_1));
     STATIC_ASSERT(1 == static_cast<int>(times_2));
     STATIC_ASSERT(2 == static_cast<int>(times_4));
     STATIC_ASSERT(3 == static_cast<int>(times_8));
     int scale = static_cast<int>(mode - one);
     DCHECK(scale >= 0 && scale < 4);
     return static_cast<ScaleFactor>(scale);
   }

   // Builds the memory operand described by the addressing mode encoded in the
   // opcode, reading its components from the inputs starting at |*offset|.
   // |*offset| is advanced past every input consumed, so afterwards it indexes
   // the first input that is not part of the memory operand.
   Operand MemoryOperand(size_t* offset) {
     AddressingMode mode = AddressingModeField::decode(instr_->opcode());
     switch (mode) {
       case kMode_MR: {
         // [base]
         Register base = InputRegister(NextOffset(offset));
         int32_t disp = 0;
         return Operand(base, disp);
       }
       case kMode_MRI: {
         // [base + constant]
         Register base = InputRegister(NextOffset(offset));
         Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
         return Operand(base, ctant.ToInt32(), ctant.rmode());
       }
       case kMode_MR1:
       case kMode_MR2:
       case kMode_MR4:
       case kMode_MR8: {
         // [base + index * scale]
         Register base = InputRegister(NextOffset(offset));
         Register index = InputRegister(NextOffset(offset));
         ScaleFactor scale = ScaleFor(kMode_MR1, mode);
         int32_t disp = 0;
         return Operand(base, index, scale, disp);
       }
       case kMode_MR1I:
       case kMode_MR2I:
       case kMode_MR4I:
       case kMode_MR8I: {
         // [base + index * scale + constant]
         Register base = InputRegister(NextOffset(offset));
         Register index = InputRegister(NextOffset(offset));
         ScaleFactor scale = ScaleFor(kMode_MR1I, mode);
         Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
         return Operand(base, index, scale, ctant.ToInt32(), ctant.rmode());
       }
       case kMode_M1:
       case kMode_M2:
       case kMode_M4:
       case kMode_M8: {
         // [index * scale]
         Register index = InputRegister(NextOffset(offset));
         ScaleFactor scale = ScaleFor(kMode_M1, mode);
         int32_t disp = 0;
         return Operand(index, scale, disp);
       }
       case kMode_M1I:
       case kMode_M2I:
       case kMode_M4I:
       case kMode_M8I: {
         // [index * scale + constant]
         Register index = InputRegister(NextOffset(offset));
         ScaleFactor scale = ScaleFor(kMode_M1I, mode);
         Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
         return Operand(index, scale, ctant.ToInt32(), ctant.rmode());
       }
       case kMode_MI: {
         // [constant]
         Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
         return Operand(ctant.ToInt32(), ctant.rmode());
       }
       case kMode_Root: {
         // [kRootRegister + int32 input]
         Register base = kRootRegister;
         int32_t disp = InputInt32(NextOffset(offset));
         return Operand(base, disp);
       }
       case kMode_None:
         UNREACHABLE();
     }
     UNREACHABLE();
   }

   // Convenience overload: decodes the memory operand starting at input
   // |first_input| without reporting how many inputs were consumed.
   Operand MemoryOperand(size_t first_input = 0) {
     return MemoryOperand(&first_input);
   }

   // Returns the operand located 4 bytes after the one MemoryOperand(offset)
   // would produce. Only kMode_MR1 and kMode_MRI are supported; any other
   // addressing mode is UNREACHABLE().
   Operand NextMemoryOperand(size_t offset = 0) {
     AddressingMode mode = AddressingModeField::decode(instr_->opcode());
     Register base = InputRegister(NextOffset(&offset));
     const int32_t disp = 4;
     if (mode == kMode_MR1) {
       Register index = InputRegister(NextOffset(&offset));
       ScaleFactor scale = ScaleFor(kMode_MR1, kMode_MR1);
       return Operand(base, index, scale, disp);
     } else if (mode == kMode_MRI) {
       Constant ctant = ToConstant(instr_->InputAt(NextOffset(&offset)));
       return Operand(base, ctant.ToInt32() + disp, ctant.rmode());
     } else {
       UNREACHABLE();
     }
   }

   // Emits a move of |op| into |destination|, choosing the immediate,
   // register-to-register or memory form according to the kind of |op|.
   void MoveInstructionOperandToRegister(Register destination,
                                         InstructionOperand* op) {
     if (op->IsImmediate() || op->IsConstant()) {
       gen_->tasm()->mov(destination, ToImmediate(op));
     } else if (op->IsRegister()) {
       gen_->tasm()->Move(destination, ToRegister(op));
     } else {
       gen_->tasm()->mov(destination, ToOperand(op));
     }
   }
 };

 namespace {

 // Returns true when |instr| carries an addressing mode other than kMode_None.
 bool HasAddressingMode(Instruction* instr) {
   const AddressingMode mode = instr->addressing_mode();
   return mode != kMode_None;
 }

 // Returns true when input |index| of |instr| is an immediate operand.
 bool HasImmediateInput(Instruction* instr, size_t index) {
   const InstructionOperand* input = instr->InputAt(index);
   return input->IsImmediate();
 }

 // Returns true when input |index| of |instr| lives in a general register.
 bool HasRegisterInput(Instruction* instr, size_t index) {
   const InstructionOperand* input = instr->InputAt(index);
   return input->IsRegister();
 }

 // Out-of-line code that leaves a float32 NaN in the XMM register passed as
 // |result|, produced by dividing zero by zero.
 class OutOfLineLoadFloat32NaN final : public OutOfLineCode {
  public:
   OutOfLineLoadFloat32NaN(CodeGenerator* gen, XMMRegister result)
       : OutOfLineCode(gen), nan_reg_(result) {}

   void Generate() final {
     // Clear the register, then divide it by itself: 0.0f / 0.0f is NaN.
     __ xorps(nan_reg_, nan_reg_);
     __ divss(nan_reg_, nan_reg_);
   }

  private:
   // Register that receives the NaN.
   const XMMRegister nan_reg_;
 };

 // Out-of-line code that leaves a float64 NaN in the XMM register passed as
 // |result|, produced by dividing zero by zero.
 class OutOfLineLoadFloat64NaN final : public OutOfLineCode {
  public:
   OutOfLineLoadFloat64NaN(CodeGenerator* gen, XMMRegister result)
       : OutOfLineCode(gen), nan_reg_(result) {}

   void Generate() final {
     // Clear the register, then divide it by itself: 0.0 / 0.0 is NaN.
     __ xorpd(nan_reg_, nan_reg_);
     __ divsd(nan_reg_, nan_reg_);
   }

  private:
   // Register that receives the NaN.
   const XMMRegister nan_reg_;
 };

 // Out-of-line slow path that converts the double in |input| by calling
 // DoubleToI. The argument is spilled to a freshly allocated kDoubleSize stack
 // slot at [esp]; after the call the 32-bit value at [esp] is loaded into
 // |result| and the slot is released again.
 //
 // |stub_mode| selects how DoubleToI is reached: as a wasm runtime stub, or as
 // a builtin (inlined off-heap trampoline or regular code target).
 class OutOfLineTruncateDoubleToI final : public OutOfLineCode {
  public:
   OutOfLineTruncateDoubleToI(CodeGenerator* gen, Register result,
                              XMMRegister input, StubCallMode stub_mode)
       : OutOfLineCode(gen),
         result_(result),
         input_(input),
         stub_mode_(stub_mode),
         isolate_(gen->isolate()) {}

   void Generate() final {
     // Spill the input where the callee expects to find it.
     __ AllocateStackSpace(kDoubleSize);
     __ movsd(MemOperand(esp, 0), input_);
     if (stub_mode_ == StubCallMode::kCallWasmRuntimeStub) {
       // A direct call to a wasm runtime stub defined in this module.
       // Just encode the stub index. This will be patched when the code
       // is added to the native module and copied into wasm code space.
       __ wasm_call(wasm::WasmCode::kDoubleToI, RelocInfo::WASM_STUB_CALL);
     } else if (tasm()->options().inline_offheap_trampolines) {
       __ CallBuiltin(Builtins::kDoubleToI);
     } else {
       __ Call(BUILTIN_CODE(isolate_, DoubleToI), RelocInfo::CODE_TARGET);
     }
     // Read the result back from the same slot and pop the slot.
     __ mov(result_, MemOperand(esp, 0));
     __ add(esp, Immediate(kDoubleSize));
   }

  private:
   Register const result_;
   XMMRegister const input_;
   StubCallMode const stub_mode_;
   // Only needed to look up the builtin code object for the CODE_TARGET call.
   Isolate* const isolate_;
 };

 // Out-of-line part of a store with write barrier. After the inline code has
 // stored |value| to |operand| inside |object|, this code filters out values
 // that need no barrier and otherwise calls the record-write (or ephemeron
 // key) barrier with the address of the written slot.
 //
 // |scratch0| is used by the page-flag check and |scratch1| receives the slot
 // address; both are clobbered.
 class OutOfLineRecordWrite final : public OutOfLineCode {
  public:
   OutOfLineRecordWrite(CodeGenerator* gen, Register object, Operand operand,
                        Register value, Register scratch0, Register scratch1,
                        RecordWriteMode mode, StubCallMode stub_mode)
       : OutOfLineCode(gen),
         object_(object),
         operand_(operand),
         value_(value),
         scratch0_(scratch0),
         scratch1_(scratch1),
         mode_(mode),
         stub_mode_(stub_mode) {}

   void Generate() final {
     // For modes ordered after kValueIsPointer the value may be a Smi, which
     // needs no barrier.
     if (mode_ > RecordWriteMode::kValueIsPointer) {
       __ JumpIfSmi(value_, exit());
     }
     // Leave when the kPointersToHereAreInterestingMask page flag checked for
     // |value_| is not set.
     __ CheckPageFlag(value_, scratch0_,
                      MemoryChunk::kPointersToHereAreInterestingMask, zero,
                      exit());
     // The barrier takes the address of the slot, not the operand itself.
     __ lea(scratch1_, operand_);
     RememberedSetAction const remembered_set_action =
         mode_ > RecordWriteMode::kValueIsMap ? EMIT_REMEMBERED_SET
                                              : OMIT_REMEMBERED_SET;
     SaveFPRegsMode const save_fp_mode =
         frame()->DidAllocateDoubleRegisters() ? kSaveFPRegs : kDontSaveFPRegs;
     if (mode_ == RecordWriteMode::kValueIsEphemeronKey) {
       __ CallEphemeronKeyBarrier(object_, scratch1_, save_fp_mode);
     } else if (stub_mode_ == StubCallMode::kCallWasmRuntimeStub) {
       // A direct call to a wasm runtime stub defined in this module.
       // Just encode the stub index. This will be patched when the code
       // is added to the native module and copied into wasm code space.
       __ CallRecordWriteStub(object_, scratch1_, remembered_set_action,
                              save_fp_mode, wasm::WasmCode::kRecordWrite);
     } else {
       __ CallRecordWriteStub(object_, scratch1_, remembered_set_action,
                              save_fp_mode);
     }
   }

  private:
   Register const object_;
   Operand const operand_;
   Register const value_;
   Register const scratch0_;
   Register const scratch1_;
   RecordWriteMode const mode_;
   StubCallMode const stub_mode_;
 };

 }  // namespace

 // Emits the two-operand comparison |asm_instr| for the current instruction.
 // With an addressing mode, the left side is the decoded memory operand and
 // the right side is the first input after it (immediate or register).
 // Without one, input 0 is the left side and input 1 the right side, each
 // emitted in its register, immediate or Operand form. Expects |instr| and the
 // operand converter |i| to be in scope at the expansion site.
 #define ASSEMBLE_COMPARE(asm_instr)                              \
   do {                                                           \
     if (HasAddressingMode(instr)) {                              \
       size_t next_input = 0;                                     \
       Operand lhs = i.MemoryOperand(&next_input);                \
       if (HasImmediateInput(instr, next_input)) {                \
         __ asm_instr(lhs, i.InputImmediate(next_input));         \
       } else {                                                   \
         __ asm_instr(lhs, i.InputRegister(next_input));          \
       }                                                          \
     } else if (HasImmediateInput(instr, 1)) {                    \
       if (HasRegisterInput(instr, 0)) {                          \
         __ asm_instr(i.InputRegister(0), i.InputImmediate(1));   \
       } else {                                                   \
         __ asm_instr(i.InputOperand(0), i.InputImmediate(1));    \
       }                                                          \
     } else if (HasRegisterInput(instr, 1)) {                     \
       __ asm_instr(i.InputRegister(0), i.InputRegister(1));      \
     } else {                                                     \
       __ asm_instr(i.InputRegister(0), i.InputOperand(1));       \
     }                                                            \
   } while (false)

 // Calls the C function ExternalReference::ieee754_<name>_function() with the
 // double registers of inputs 0 and 1 as arguments and moves the result into
 // the output double register. The arguments are stored at esp + 0 and
 // esp + kDoubleSize; the x87 result in st(0) is bounced through a temporary
 // kDoubleSize stack slot because it cannot be moved to an XMM register
 // directly. eax is handed to PrepareCallCFunction as its scratch register.
 // NOTE(review): the count 4 is presumably "two doubles = four 32-bit
 // argument words" - confirm against PrepareCallCFunction.
 #define ASSEMBLE_IEEE754_BINOP(name)                                     \
   do {                                                                   \
     /* Pass two doubles as arguments on the stack. */                    \
     __ PrepareCallCFunction(4, eax);                                     \
     __ movsd(Operand(esp, 0 * kDoubleSize), i.InputDoubleRegister(0));   \
     __ movsd(Operand(esp, 1 * kDoubleSize), i.InputDoubleRegister(1));   \
     __ CallCFunction(ExternalReference::ieee754_##name##_function(), 4); \
     /* Return value is in st(0) on ia32. */                              \
     /* Store it into the result register. */                             \
     __ AllocateStackSpace(kDoubleSize);                                  \
     __ fstp_d(Operand(esp, 0));                                          \
     __ movsd(i.OutputDoubleRegister(), Operand(esp, 0));                 \
     __ add(esp, Immediate(kDoubleSize));                                 \
   } while (false)

 // Calls the C function ExternalReference::ieee754_<name>_function() with the
 // double register of input 0 as its single argument (stored at esp + 0) and
 // moves the result into the output double register. The x87 result in st(0)
 // is bounced through a temporary kDoubleSize stack slot. eax is handed to
 // PrepareCallCFunction as its scratch register.
 // NOTE(review): the count 2 is presumably "one double = two 32-bit argument
 // words" - confirm against PrepareCallCFunction.
 #define ASSEMBLE_IEEE754_UNOP(name)                                      \
   do {                                                                   \
     /* Pass one double as argument on the stack. */                      \
     __ PrepareCallCFunction(2, eax);                                     \
     __ movsd(Operand(esp, 0 * kDoubleSize), i.InputDoubleRegister(0));   \
     __ CallCFunction(ExternalReference::ieee754_##name##_function(), 2); \
     /* Return value is in st(0) on ia32. */                              \
     /* Store it into the result register. */                             \
     __ AllocateStackSpace(kDoubleSize);                                  \
     __ fstp_d(Operand(esp, 0));                                          \
     __ movsd(i.OutputDoubleRegister(), Operand(esp, 0));                 \
     __ add(esp, Immediate(kDoubleSize));                                 \
   } while (false)

 // Emits the two-operand instruction |asm_instr| for the current instruction.
 // With an addressing mode, the register of input 0 is combined with the
 // memory operand decoded from the inputs starting at index 1. Otherwise
 // input 1 is used as an immediate when it is one, and as an Operand when it
 // is not. Expects |instr| and the operand converter |i| to be in scope.
 #define ASSEMBLE_BINOP(asm_instr)                             \
   do {                                                        \
     if (HasAddressingMode(instr)) {                           \
       size_t first_mem_input = 1;                             \
       Operand rhs = i.MemoryOperand(&first_mem_input);        \
       __ asm_instr(i.InputRegister(0), rhs);                  \
     } else if (HasImmediateInput(instr, 1)) {                 \
       __ asm_instr(i.InputOperand(0), i.InputImmediate(1));   \
     } else {                                                  \
       __ asm_instr(i.InputRegister(0), i.InputOperand(1));    \
     }                                                         \
   } while (false)

 // Emits a compare-and-swap retry loop implementing an atomic
 // read-modify-write. Each iteration loads the memory operand (decoded from
 // the inputs starting at index 1) into eax with |mov_inst|, copies it to temp
 // register 0, applies |bin_inst| with the register of input 0, and tries to
 // publish the new value with a locked |cmpxchg_inst|. cmpxchg compares the
 // memory operand with eax, so the loop repeats (not_equal) whenever memory
 // changed between the load and the exchange. Clobbers eax and temp 0.
 #define ASSEMBLE_ATOMIC_BINOP(bin_inst, mov_inst, cmpxchg_inst) \
   do {                                                          \
     Label binop;                                                \
     __ bind(&binop);                                            \
     __ mov_inst(eax, i.MemoryOperand(1));                       \
     __ Move(i.TempRegister(0), eax);                            \
     __ bin_inst(i.TempRegister(0), i.InputRegister(0));         \
     __ lock();                                                  \
     __ cmpxchg_inst(i.MemoryOperand(1), i.TempRegister(0));     \
     __ j(not_equal, &binop);                                    \
   } while (false)

 // Emits a cmpxchg8b retry loop implementing a 64-bit atomic
 // read-modify-write on the memory operand decoded from the inputs starting
 // at index 2. Each iteration loads the word at the operand into eax and the
 // word 4 bytes after it into edx, saves ebx and the register of input 1 on
 // the stack, loads input 0 into ebx, applies |instr1| to ebx/eax and |instr2|
 // to input-1-register/edx, and then executes a locked cmpxchg8b. The saved
 // registers are restored before the retry branch; pop does not modify the
 // flags, so the not_equal test still sees the cmpxchg8b result.
 // NOTE(review): cmpxchg8b stores ecx:ebx, so input 1 is presumably required
 // to be ecx - confirm against the instruction selector.
 // NOTE(review): only the push of ebx is mirrored in the SP delta; the push
 // of the input 1 register is not.
 // NOTE(review): unlike the sibling macros this one ends in `while (false);`
 // with a trailing semicolon. It is left untouched here because expansion
 // sites outside this view may rely on it.
 #define ASSEMBLE_I64ATOMIC_BINOP(instr1, instr2)                \
   do {                                                          \
     Label binop;                                                \
     __ bind(&binop);                                            \
     __ mov(eax, i.MemoryOperand(2));                            \
     __ mov(edx, i.NextMemoryOperand(2));                        \
     __ push(ebx);                                               \
     frame_access_state()->IncreaseSPDelta(1);                   \
     i.MoveInstructionOperandToRegister(ebx, instr->InputAt(0)); \
     __ push(i.InputRegister(1));                                \
     __ instr1(ebx, eax);                                        \
     __ instr2(i.InputRegister(1), edx);                         \
     __ lock();                                                  \
     __ cmpxchg8b(i.MemoryOperand(2));                           \
     __ pop(i.InputRegister(1));                                 \
     __ pop(ebx);                                                \
     frame_access_state()->IncreaseSPDelta(-1);                  \
     __ j(not_equal, &binop);                                    \
   } while (false);

 // Emits |mov_instr| into the output register. The source is the decoded
 // memory operand when the instruction has an addressing mode, otherwise
 // input 0 in register form when it is a register and in Operand form when it
 // is not. Expects |instr| and the operand converter |i| to be in scope.
 #define ASSEMBLE_MOVX(mov_instr)                        \
   do {                                                  \
     Register movx_dst = i.OutputRegister();             \
     if (HasAddressingMode(instr)) {                     \
       __ mov_instr(movx_dst, i.MemoryOperand());        \
     } else if (HasRegisterInput(instr, 0)) {            \
       __ mov_instr(movx_dst, i.InputRegister(0));       \
     } else {                                            \
       __ mov_instr(movx_dst, i.InputOperand(0));        \
     }                                                   \
   } while (false)

 // Emits the SIMD instruction |opcode| on the Simd128 register of input 0 and
 // a second source operand: input 1 when the instruction has exactly two
 // inputs, input 0 again otherwise. With AVX the three-operand v<opcode> form
 // is used; without it the two-operand form is emitted, which requires the
 // output register to be the same as the first source (checked by DCHECK_EQ).
 #define ASSEMBLE_SIMD_PUNPCK_SHUFFLE(opcode)                         \
   do {                                                               \
     XMMRegister src0 = i.InputSimd128Register(0);                    \
     Operand src1 = i.InputOperand(instr->InputCount() == 2 ? 1 : 0); \
     if (CpuFeatures::IsSupported(AVX)) {                             \
       CpuFeatureScope avx_scope(tasm(), AVX);                        \
       __ v##opcode(i.OutputSimd128Register(), src0, src1);           \
     } else {                                                         \
       DCHECK_EQ(i.OutputSimd128Register(), src0);                    \
       __ opcode(i.OutputSimd128Register(), src1);                    \
     }                                                                \
   } while (false)

 // Emits the SIMD instruction |opcode| with the immediate |imm| on the Simd128
 // register of input 0 and the operand of input 1. With AVX the three-operand
 // v<opcode> form is used; otherwise the two-operand form is emitted inside a
 // CpuFeatureScope for |SSELevel|, which requires the output register to be
 // the same as input 0 (checked by DCHECK_EQ).
 // NOTE(review): unlike the sibling macros this one expands to a bare
 // if/else rather than a do { } while (false) statement, so it must not be
 // used as the unbraced body of an enclosing if that has its own else. It is
 // left untouched here because expansion sites are outside this view.
 #define ASSEMBLE_SIMD_IMM_SHUFFLE(opcode, SSELevel, imm)               \
   if (CpuFeatures::IsSupported(AVX)) {                                 \
     CpuFeatureScope avx_scope(tasm(), AVX);                            \
     __ v##opcode(i.OutputSimd128Register(), i.InputSimd128Register(0), \
                  i.InputOperand(1), imm);                              \
   } else {                                                             \
     CpuFeatureScope sse_scope(tasm(), SSELevel);                       \
     DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));   \
     __ opcode(i.OutputSimd128Register(), i.InputOperand(1), imm);      \
   }

 // Sets the output register to 1 or 0 depending on a SIMD test of input 0.
 // The output is cleared and temp register 0 is set to 1. The Simd128 temp
 // (temp index 1) is zeroed and combined with the input via |opcode|. Ptest of
 // that temp with itself sets the zero flag exactly when the temp is still
 // all zero, in which case cmov copies the 1 into the output.
 // NOTE(review): |opcode| is presumably a lane-wise compare-equal, so the
 // temp stays zero only when no input lane is zero - confirm at call sites.
 #define ASSEMBLE_SIMD_ALL_TRUE(opcode)               \
   do {                                               \
     Register dst = i.OutputRegister();               \
     Operand src = i.InputOperand(0);                 \
     Register tmp = i.TempRegister(0);                \
     XMMRegister tmp_simd = i.TempSimd128Register(1); \
     __ mov(tmp, Immediate(1));                       \
     __ xor_(dst, dst);                               \
     __ Pxor(tmp_simd, tmp_simd);                     \
     __ opcode(tmp_simd, src);                        \
     __ Ptest(tmp_simd, tmp_simd);                    \
     __ cmov(zero, dst, tmp);                         \
   } while (false)

 // Emits the SIMD shift |opcode| in place on the output register, which must
 // be the same register as input 0 (checked by DCHECK_EQ). |width| is pasted
 // into the accessor name i.InputInt<width>() used for an immediate shift
 // count, and is also the number of low bits kept from a register shift
 // count: the count is copied to temp register 1, masked with
 // (1 << width) - 1 and moved into the Simd128 temp (temp index 0), which is
 // then used as the shift operand.
 #define ASSEMBLE_SIMD_SHIFT(opcode, width)             \
   do {                                                 \
     XMMRegister dst = i.OutputSimd128Register();       \
     DCHECK_EQ(dst, i.InputSimd128Register(0));         \
     if (HasImmediateInput(instr, 1)) {                 \
       __ opcode(dst, dst, byte{i.InputInt##width(1)}); \
     } else {                                           \
       XMMRegister tmp = i.TempSimd128Register(0);      \
       Register tmp_shift = i.TempRegister(1);          \
       constexpr int mask = (1 << width) - 1;           \
       __ mov(tmp_shift, i.InputRegister(1));           \
       __ and_(tmp_shift, Immediate(mask));             \
       __ Movd(tmp, tmp_shift);                         \
       __ opcode(dst, dst, tmp);                        \
     }                                                  \
   } while (false)

 // Emits the frame teardown: esp is reset to ebp, then the value on top of
 // the stack is popped into ebp.
 void CodeGenerator::AssembleDeconstructFrame() {
   __ mov(esp, ebp);
   __ pop(ebp);
 }

 // Prepares the frame state for a tail call. When a frame exists, ebp is
 // reloaded from the slot it points at ([ebp + 0]); in every case subsequent
 // frame accesses are switched to be stack-pointer relative.
 void CodeGenerator::AssemblePrepareTailCall() {
   if (frame_access_state()->has_frame()) {
     __ mov(ebp, MemOperand(ebp, 0));
   }
   frame_access_state()->SetFrameAccessToSP();
 }

 // If the current frame is an arguments adaptor frame, loads the caller's
 // argument count from it and invokes PrepareForTailCall with |args_reg| as
 // the callee argument count. Does nothing for any other frame type. The three
 // unnamed register parameters are ignored on ia32.
 void CodeGenerator::AssemblePopArgumentsAdaptorFrame(Register args_reg,
                                                      Register, Register,
                                                      Register) {
   // There are not enough temp registers left on ia32 for a call instruction
   // so we pick some scratch registers and save/restore them manually here.
   constexpr int kSavedScratchCount = 3;
   const Register caller_args_count = esi;
   const Register temp_a = ecx;
   const Register temp_b = edx;
   DCHECK(!AreAliased(args_reg, caller_args_count, temp_a, temp_b));

   // Nothing to do unless the frame's context slot holds the arguments adaptor
   // frame marker.
   Label not_adaptor_frame;
   __ cmp(Operand(ebp, StandardFrameConstants::kContextOffset),
          Immediate(StackFrame::TypeToMarker(StackFrame::ARGUMENTS_ADAPTOR)));
   __ j(not_equal, &not_adaptor_frame, Label::kNear);

   // Save the scratch registers; they are restored in reverse order below.
   __ push(caller_args_count);
   __ push(temp_a);
   __ push(temp_b);

   // Load arguments count from current arguments adaptor frame (note, it
   // does not include receiver).
   __ mov(caller_args_count,
          Operand(ebp, ArgumentsAdaptorFrameConstants::kLengthOffset));
   __ SmiUntag(caller_args_count);

   // The last argument tells PrepareForTailCall how many values were pushed
   // above.
   __ PrepareForTailCall(args_reg, caller_args_count, temp_a, temp_b,
                         kSavedScratchCount);

   __ pop(temp_b);
   __ pop(temp_a);
   __ pop(caller_args_count);

   __ bind(&not_adaptor_frame);
 }

 namespace {

 // Moves esp so that |new_slot_above_sp| becomes the first slot above the
 // stack pointer, and records the change in |state|'s SP delta. The stack is
 // always grown when required; it is only shrunk when |allow_shrinkage| is
 // true.
 void AdjustStackPointerForTailCall(TurboAssembler* tasm,
                                    FrameAccessState* state,
                                    int new_slot_above_sp,
                                    bool allow_shrinkage = true) {
   const int sp_slot_offset = state->GetSPToFPSlotCount() +
                              StandardFrameConstants::kFixedSlotCountAboveFp;
   const int slot_delta = new_slot_above_sp - sp_slot_offset;

   // Already in position.
   if (slot_delta == 0) return;

   if (slot_delta > 0) {
     // More slots are needed: grow the stack.
     tasm->AllocateStackSpace(slot_delta * kSystemPointerSize);
     state->IncreaseSPDelta(slot_delta);
     return;
   }

   // Fewer slots are needed: release them only if the caller permits it.
   if (!allow_shrinkage) return;
   tasm->add(esp, Immediate(-slot_delta * kSystemPointerSize));
   state->IncreaseSPDelta(slot_delta);
 }

 #ifdef DEBUG
 // Debug-only check that an atomic pair instruction has eax and edx in its
 // result positions. With two outputs they must be eax and edx in that order;
 // with one output, the output and temp 0 must be eax and edx in either
 // order; with no outputs, temps 0 and 1 must be eax and edx in that order.
 bool VerifyOutputOfAtomicPairInstr(IA32OperandConverter* converter,
                                    const Instruction* instr) {
   switch (instr->OutputCount()) {
     case 2:
       return (converter->OutputRegister(0) == eax &&
               converter->OutputRegister(1) == edx);
     case 1:
       return (converter->OutputRegister(0) == eax &&
               converter->TempRegister(0) == edx) ||
              (converter->OutputRegister(0) == edx &&
               converter->TempRegister(0) == eax);
     default:
       DCHECK_EQ(instr->OutputCount(), 0);
       return (converter->TempRegister(0) == eax &&
               converter->TempRegister(1) == edx);
   }
 }
 #endif

 }  // namespace

 // Runs before the gap moves of a tail call. Gap moves that can be expressed
 // as pushes (immediates and scalars) are emitted as push instructions and
 // eliminated from the gap, provided the last such move targets the slot just
 // below |first_unused_stack_slot|. Afterwards the stack pointer is grown, but
 // never shrunk, to reach |first_unused_stack_slot|.
 void CodeGenerator::AssembleTailCallBeforeGap(Instruction* instr,
                                               int first_unused_stack_slot) {
   CodeGenerator::PushTypeFlags flags(kImmediatePush | kScalarPush);
   ZoneVector<MoveOperands*> pushes(zone());
   GetPushCompatibleMoves(instr, flags, &pushes);

   const bool pushes_end_below_first_unused_slot =
       !pushes.empty() &&
       (LocationOperand::cast(pushes.back()->destination()).index() + 1 ==
        first_unused_stack_slot);
   if (pushes_end_below_first_unused_slot) {
     IA32OperandConverter converter(this, instr);
     for (MoveOperands* push_move : pushes) {
       LocationOperand dst_slot(
           LocationOperand::cast(push_move->destination()));
       InstructionOperand src(push_move->source());
       // Position esp so that the push below lands in the destination slot.
       AdjustStackPointerForTailCall(tasm(), frame_access_state(),
                                     dst_slot.index());
       if (src.IsStackSlot()) {
         LocationOperand src_slot(LocationOperand::cast(src));
         __ push(converter.SlotToOperand(src_slot.index()));
       } else if (src.IsRegister()) {
         LocationOperand src_reg(LocationOperand::cast(src));
         __ push(src_reg.GetRegister());
       } else if (src.IsImmediate()) {
         __ Push(Immediate(ImmediateOperand::cast(src).inline_value()));
       } else {
         // Pushes of non-scalar data types is not supported.
         UNIMPLEMENTED();
       }
       // Account for the push and drop the move from the gap.
       frame_access_state()->IncreaseSPDelta(1);
       push_move->Eliminate();
     }
   }
   AdjustStackPointerForTailCall(tasm(), frame_access_state(),
                                 first_unused_stack_slot, false);
 }

 // Runs after the gap moves of a tail call: moves the stack pointer to
 // |first_unused_stack_slot|, growing or shrinking the stack as needed
 // (shrinkage is allowed here, unlike in AssembleTailCallBeforeGap). |instr|
 // is not used.
 void CodeGenerator::AssembleTailCallAfterGap(Instruction* instr,
                                              int first_unused_stack_slot) {
   AdjustStackPointerForTailCall(tasm(), frame_access_state(),
                                 first_unused_stack_slot);
 }

 // Check that {kJavaScriptCallCodeStartRegister} is correct.
 // Computes the code start address into eax, compares it with the register
 // and emits an Assert (AbortReason::kWrongFunctionCodeStart) on the result.
 // eax is saved and restored around the check.
 void CodeGenerator::AssembleCodeStartRegisterCheck() {
   __ push(eax);  // Push eax so we can use it as a scratch register.
   __ ComputeCodeStartAddress(eax);
   __ cmp(eax, kJavaScriptCallCodeStartRegister);
   __ Assert(equal, AbortReason::kWrongFunctionCodeStart);
   __ pop(eax);  // Restore eax.
 }

 // Emits a check of whether the running code object has been marked for
 // deoptimization and, if so, a jump to the CompileLazyDeoptimizedCode
 // builtin. The emitted code:
 //    1. loads the {CodeDataContainer} referenced from the code object, using
 //       {kJavaScriptCallCodeStartRegister} as the base address;
 //    2. tests kMarkedForDeoptimizationBit in the container's kind-specific
 //       flags word; and
 //    3. jumps to the builtin when the bit is set.
 void CodeGenerator::BailoutIfDeoptimized() {
   // The register holds the code start, i.e. the address just past the Code
   // header, hence the subtraction of kHeaderSize.
   const int container_offset =
       Code::kCodeDataContainerOffset - Code::kHeaderSize;

   __ push(eax);  // eax serves as scratch and is restored below.
   __ mov(eax, Operand(kJavaScriptCallCodeStartRegister, container_offset));
   __ test(FieldOperand(eax, CodeDataContainer::kKindSpecificFlagsOffset),
           Immediate(1 << Code::kMarkedForDeoptimizationBit));
   __ pop(eax);  // pop leaves the flags set by test untouched.

   Label not_marked;
   __ j(zero, &not_marked, Label::kNear);
   __ Jump(BUILTIN_CODE(isolate(), CompileLazyDeoptimizedCode),
           RelocInfo::CODE_TARGET);
   __ bind(&not_marked);
 }

 // Not implemented on ia32: reaching this function is a fatal error.
 void CodeGenerator::GenerateSpeculationPoisonFromCodeStartRegister() {
   // TODO(860429): Remove remaining poisoning infrastructure on ia32.
   UNREACHABLE();
 }

 // Not implemented on ia32: reaching this function is a fatal error.
 void CodeGenerator::AssembleRegisterArgumentPoisoning() {
   // TODO(860429): Remove remaining poisoning infrastructure on ia32.
   UNREACHABLE();
 }

 // Assembles an instruction after register allocation, producing machine code.
 CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     Instruction* instr) {
   IA32OperandConverter i(this, instr);
   InstructionCode opcode = instr->opcode();
   ArchOpcode arch_opcode = ArchOpcodeField::decode(opcode);
   switch (arch_opcode) {
     case kArchCallCodeObject: {
       InstructionOperand* op = instr->InputAt(0);
       if (op->IsImmediate()) {
         Handle<Code> code = i.InputCode(0);
         __ Call(code, RelocInfo::CODE_TARGET);
       } else {
         Register reg = i.InputRegister(0);
         DCHECK_IMPLIES(
             instr->HasCallDescriptorFlag(CallDescriptor::kFixedTargetRegister),
             reg == kJavaScriptCallCodeStartRegister);
         __ LoadCodeObjectEntry(reg, reg);
         if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
           __ RetpolineCall(reg);
         } else {
           __ call(reg);
         }
       }
       RecordCallPosition(instr);
       frame_access_state()->ClearSPDelta();
       break;
     }
     case kArchCallBuiltinPointer: {
       DCHECK(!HasImmediateInput(instr, 0));
       Register builtin_index = i.InputRegister(0);
       __ CallBuiltinByIndex(builtin_index);
       RecordCallPosition(instr);
       frame_access_state()->ClearSPDelta();
       break;
     }
     case kArchCallWasmFunction: {
       if (HasImmediateInput(instr, 0)) {
         Constant constant = i.ToConstant(instr->InputAt(0));
         Address wasm_code = static_cast<Address>(constant.ToInt32());
         if (DetermineStubCallMode() == StubCallMode::kCallWasmRuntimeStub) {
           __ wasm_call(wasm_code, constant.rmode());
         } else {
           if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
             __ RetpolineCall(wasm_code, constant.rmode());
           } else {
             __ call(wasm_code, constant.rmode());
           }
         }
       } else {
         Register reg = i.InputRegister(0);
         if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
           __ RetpolineCall(reg);
         } else {
           __ call(reg);
         }
       }
       RecordCallPosition(instr);
       frame_access_state()->ClearSPDelta();
       break;
     }
     case kArchTailCallCodeObjectFromJSFunction:
     case kArchTailCallCodeObject: {
       if (arch_opcode == kArchTailCallCodeObjectFromJSFunction) {
         AssemblePopArgumentsAdaptorFrame(kJavaScriptCallArgCountRegister,
                                          no_reg, no_reg, no_reg);
       }
       if (HasImmediateInput(instr, 0)) {
         Handle<Code> code = i.InputCode(0);
         __ Jump(code, RelocInfo::CODE_TARGET);
       } else {
         Register reg = i.InputRegister(0);
         DCHECK_IMPLIES(
             instr->HasCallDescriptorFlag(CallDescriptor::kFixedTargetRegister),
             reg == kJavaScriptCallCodeStartRegister);
         __ LoadCodeObjectEntry(reg, reg);
         if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
           __ RetpolineJump(reg);
         } else {
           __ jmp(reg);
         }
       }
       frame_access_state()->ClearSPDelta();
       frame_access_state()->SetFrameAccessToDefault();
       break;
     }
     case kArchTailCallWasm: {
       if (HasImmediateInput(instr, 0)) {
         Constant constant = i.ToConstant(instr->InputAt(0));
         Address wasm_code = static_cast<Address>(constant.ToInt32());
         __ jmp(wasm_code, constant.rmode());
       } else {
         Register reg = i.InputRegister(0);
         if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
           __ RetpolineJump(reg);
         } else {
           __ jmp(reg);
         }
       }
       frame_access_state()->ClearSPDelta();
       frame_access_state()->SetFrameAccessToDefault();
       break;
     }
     case kArchTailCallAddress: {
       CHECK(!HasImmediateInput(instr, 0));
       Register reg = i.InputRegister(0);
       DCHECK_IMPLIES(
           instr->HasCallDescriptorFlag(CallDescriptor::kFixedTargetRegister),
           reg == kJavaScriptCallCodeStartRegister);
       if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
         __ RetpolineJump(reg);
       } else {
         __ jmp(reg);
       }
       frame_access_state()->ClearSPDelta();
       frame_access_state()->SetFrameAccessToDefault();
       break;
     }
     case kArchCallJSFunction: {
       Register func = i.InputRegister(0);
       if (FLAG_debug_code) {
         // Check the function's context matches the context argument.
         __ cmp(esi, FieldOperand(func, JSFunction::kContextOffset));
         __ Assert(equal, AbortReason::kWrongFunctionContext);
       }
       static_assert(kJavaScriptCallCodeStartRegister == ecx, "ABI mismatch");
       __ mov(ecx, FieldOperand(func, JSFunction::kCodeOffset));
       __ CallCodeObject(ecx);
       RecordCallPosition(instr);
       frame_access_state()->ClearSPDelta();
       break;
     }
     case kArchPrepareCallCFunction: {
       // Frame alignment requires using FP-relative frame addressing.
       frame_access_state()->SetFrameAccessToFP();
       int const num_parameters = MiscField::decode(instr->opcode());
       __ PrepareCallCFunction(num_parameters, i.TempRegister(0));
       break;
     }
     case kArchSaveCallerRegisters: {
       fp_mode_ =
           static_cast<SaveFPRegsMode>(MiscField::decode(instr->opcode()));
       DCHECK(fp_mode_ == kDontSaveFPRegs || fp_mode_ == kSaveFPRegs);
       // kReturnRegister0 should have been saved before entering the stub.
       int bytes = __ PushCallerSaved(fp_mode_, kReturnRegister0);
       DCHECK(IsAligned(bytes, kSystemPointerSize));
       DCHECK_EQ(0, frame_access_state()->sp_delta());
       frame_access_state()->IncreaseSPDelta(bytes / kSystemPointerSize);
       DCHECK(!caller_registers_saved_);
       caller_registers_saved_ = true;
       break;
     }
     case kArchRestoreCallerRegisters: {
       DCHECK(fp_mode_ ==
              static_cast<SaveFPRegsMode>(MiscField::decode(instr->opcode())));
       DCHECK(fp_mode_ == kDontSaveFPRegs || fp_mode_ == kSaveFPRegs);
       // Don't overwrite the returned value.
       int bytes = __ PopCallerSaved(fp_mode_, kReturnRegister0);
       frame_access_state()->IncreaseSPDelta(-(bytes / kSystemPointerSize));
       DCHECK_EQ(0, frame_access_state()->sp_delta());
       DCHECK(caller_registers_saved_);
       caller_registers_saved_ = false;
       break;
     }
     case kArchPrepareTailCall:
       AssemblePrepareTailCall();
       break;
     case kArchCallCFunction: {
       int const num_parameters = MiscField::decode(instr->opcode());
       Label return_location;
       if (linkage()->GetIncomingDescriptor()->IsWasmCapiFunction()) {
         // Put the return address in a stack slot.
         Register scratch = eax;
         __ push(scratch);
         __ PushPC();
         int pc = __ pc_offset();
         __ pop(scratch);
         __ sub(scratch, Immediate(pc + Code::kHeaderSize - kHeapObjectTag));
         __ add(scratch, Immediate::CodeRelativeOffset(&return_location));
         __ mov(MemOperand(ebp, WasmExitFrameConstants::kCallingPCOffset),
                scratch);
         __ pop(scratch);
       }
       if (HasImmediateInput(instr, 0)) {
         ExternalReference ref = i.InputExternalReference(0);
         __ CallCFunction(ref, num_parameters);
       } else {
         Register func = i.InputRegister(0);
         __ CallCFunction(func, num_parameters);
       }
       __ bind(&return_location);
       if (linkage()->GetIncomingDescriptor()->IsWasmCapiFunction()) {
         RecordSafepoint(instr->reference_map(), Safepoint::kNoLazyDeopt);
       }
       frame_access_state()->SetFrameAccessToDefault();
       // Ideally, we should decrement SP delta to match the change of stack
       // pointer in CallCFunction. However, for certain architectures (e.g.
       // ARM), there may be more strict alignment requirement, causing old SP
       // to be saved on the stack. In those cases, we can not calculate the SP
       // delta statically.
       frame_access_state()->ClearSPDelta();
       if (caller_registers_saved_) {
         // Need to re-sync SP delta introduced in kArchSaveCallerRegisters.
         // Here, we assume the sequence to be:
         //   kArchSaveCallerRegisters;
         //   kArchCallCFunction;
         //   kArchRestoreCallerRegisters;
         int bytes =
             __ RequiredStackSizeForCallerSaved(fp_mode_, kReturnRegister0);
         frame_access_state()->IncreaseSPDelta(bytes / kSystemPointerSize);
       }
       break;
     }
     case kArchJmp:
       AssembleArchJump(i.InputRpo(0));
       break;
     case kArchBinarySearchSwitch:
       AssembleArchBinarySearchSwitch(instr);
       break;
     case kArchTableSwitch:
       AssembleArchTableSwitch(instr);
       break;
     case kArchComment:
       __ RecordComment(reinterpret_cast<const char*>(i.InputInt32(0)));
       break;
     case kArchAbortCSAAssert:
       DCHECK(i.InputRegister(0) == edx);
       {
         // We don't actually want to generate a pile of code for this, so just
         // claim there is a stack frame, without generating one.
         FrameScope scope(tasm(), StackFrame::NONE);
         __ Call(
             isolate()->builtins()->builtin_handle(Builtins::kAbortCSAAssert),
             RelocInfo::CODE_TARGET);
       }
       __ int3();
       break;
     case kArchDebugBreak:
       __ DebugBreak();
       break;
     case kArchNop:
     case kArchThrowTerminator:
       // don't emit code for nops.
       break;
     case kArchDeoptimize: {
       DeoptimizationExit* exit =
           BuildTranslation(instr, -1, 0, OutputFrameStateCombine::Ignore());
       __ jmp(exit->label());
       break;
     }
     case kArchRet:
       AssembleReturn(instr->InputAt(0));
       break;
     case kArchFramePointer:
       __ mov(i.OutputRegister(), ebp);
       break;
     case kArchParentFramePointer:
       if (frame_access_state()->has_frame()) {
         __ mov(i.OutputRegister(), Operand(ebp, 0));
       } else {
         __ mov(i.OutputRegister(), ebp);
       }
       break;
     case kArchStackPointerGreaterThan: {
       // Potentially apply an offset to the current stack pointer before the
       // comparison to consider the size difference of an optimized frame versus
       // the contained unoptimized frames.
       Register lhs_register = esp;
       uint32_t offset;

       if (ShouldApplyOffsetToStackCheck(instr, &offset)) {
         lhs_register = i.TempRegister(0);
         __ lea(lhs_register, Operand(esp, -1 * static_cast<int32_t>(offset)));
       }

       constexpr size_t kValueIndex = 0;
       if (HasAddressingMode(instr)) {
         __ cmp(lhs_register, i.MemoryOperand(kValueIndex));
       } else {
         __ cmp(lhs_register, i.InputRegister(kValueIndex));
       }
       break;
     }
     case kArchStackCheckOffset:
       __ Move(i.OutputRegister(), Smi::FromInt(GetStackCheckOffset()));
       break;
     case kArchTruncateDoubleToI: {
       // Truncates the double in input 0 to a 32-bit integer in the output
       // register, falling back to out-of-line code when the inline
       // conversion yields the failure sentinel.
       auto result = i.OutputRegister();
       auto input = i.InputDoubleRegister(0);
       auto ool = zone()->New<OutOfLineTruncateDoubleToI>(
           this, result, input, DetermineStubCallMode());
       __ cvttsd2si(result, Operand(input));
       // cvttsd2si writes 0x80000000 when the value cannot be converted.
       // Computing result - 1 sets the overflow flag only for that value, so
       // the out-of-line path is taken exactly when the sentinel is seen.
       __ cmp(result, 1);
       __ j(overflow, ool->entry());
       __ bind(ool->exit());
       break;
     }
     case kArchStoreWithWriteBarrier: {
       RecordWriteMode mode =
           static_cast<RecordWriteMode>(MiscField::decode(instr->opcode()));
       Register object = i.InputRegister(0);
       size_t index = 0;
       Operand operand = i.MemoryOperand(&index);
       Register value = i.InputRegister(index);
       Register scratch0 = i.TempRegister(0);
       Register scratch1 = i.TempRegister(1);
       auto ool = zone()->New<OutOfLineRecordWrite>(this, object, operand, value,
                                                    scratch0, scratch1, mode,
                                                    DetermineStubCallMode());
       __ mov(operand, value);
       __ CheckPageFlag(object, scratch0,
                        MemoryChunk::kPointersFromHereAreInterestingMask,
                        not_zero, ool->entry());
       __ bind(ool->exit());
       break;
     }
     case kArchStackSlot: {
       FrameOffset offset =
           frame_access_state()->GetFrameOffset(i.InputInt32(0));
       Register base = offset.from_stack_pointer() ? esp : ebp;
       __ lea(i.OutputRegister(), Operand(base, offset.offset()));
       break;
     }
     case kIeee754Float64Acos:
       ASSEMBLE_IEEE754_UNOP(acos);
       break;
     case kIeee754Float64Acosh:
       ASSEMBLE_IEEE754_UNOP(acosh);
       break;
     case kIeee754Float64Asin:
       ASSEMBLE_IEEE754_UNOP(asin);
       break;
     case kIeee754Float64Asinh:
       ASSEMBLE_IEEE754_UNOP(asinh);
       break;
     case kIeee754Float64Atan:
       ASSEMBLE_IEEE754_UNOP(atan);
       break;
     case kIeee754Float64Atanh:
       ASSEMBLE_IEEE754_UNOP(atanh);
       break;
     case kIeee754Float64Atan2:
       ASSEMBLE_IEEE754_BINOP(atan2);
       break;
     case kIeee754Float64Cbrt:
       ASSEMBLE_IEEE754_UNOP(cbrt);
       break;
     case kIeee754Float64Cos:
       ASSEMBLE_IEEE754_UNOP(cos);
       break;
     case kIeee754Float64Cosh:
       ASSEMBLE_IEEE754_UNOP(cosh);
       break;
     case kIeee754Float64Expm1:
       ASSEMBLE_IEEE754_UNOP(expm1);
       break;
     case kIeee754Float64Exp:
       ASSEMBLE_IEEE754_UNOP(exp);
       break;
     case kIeee754Float64Log:
       ASSEMBLE_IEEE754_UNOP(log);
       break;
     case kIeee754Float64Log1p:
       ASSEMBLE_IEEE754_UNOP(log1p);
       break;
     case kIeee754Float64Log2:
       ASSEMBLE_IEEE754_UNOP(log2);
       break;
     case kIeee754Float64Log10:
       ASSEMBLE_IEEE754_UNOP(log10);
       break;
     case kIeee754Float64Pow:
       ASSEMBLE_IEEE754_BINOP(pow);
       break;
     case kIeee754Float64Sin:
       ASSEMBLE_IEEE754_UNOP(sin);
       break;
     case kIeee754Float64Sinh:
       ASSEMBLE_IEEE754_UNOP(sinh);
       break;
     case kIeee754Float64Tan:
       ASSEMBLE_IEEE754_UNOP(tan);
       break;
     case kIeee754Float64Tanh:
       ASSEMBLE_IEEE754_UNOP(tanh);
       break;
     case kIA32Add:
       ASSEMBLE_BINOP(add);
       break;
     case kIA32And:
       ASSEMBLE_BINOP(and_);
       break;
     case kIA32Cmp:
       ASSEMBLE_COMPARE(cmp);
       break;
     case kIA32Cmp16:
       ASSEMBLE_COMPARE(cmpw);
       break;
     case kIA32Cmp8:
       ASSEMBLE_COMPARE(cmpb);
       break;
     case kIA32Test:
       ASSEMBLE_COMPARE(test);
       break;
     case kIA32Test16:
       ASSEMBLE_COMPARE(test_w);
       break;
     case kIA32Test8:
       ASSEMBLE_COMPARE(test_b);
       break;
     case kIA32Imul:
       if (HasImmediateInput(instr, 1)) {
         __ imul(i.OutputRegister(), i.InputOperand(0), i.InputInt32(1));
       } else {
         __ imul(i.OutputRegister(), i.InputOperand(1));
       }
       break;
     case kIA32ImulHigh:
       __ imul(i.InputRegister(1));
       break;
     case kIA32UmulHigh:
       __ mul(i.InputRegister(1));
       break;
     case kIA32Idiv:
       __ cdq();
       __ idiv(i.InputOperand(1));
       break;
     case kIA32Udiv:
       __ Move(edx, Immediate(0));
       __ div(i.InputOperand(1));
       break;
     case kIA32Not:
       __ not_(i.OutputOperand());
       break;
     case kIA32Neg:
       __ neg(i.OutputOperand());
       break;
     case kIA32Or:
       ASSEMBLE_BINOP(or_);
       break;
     case kIA32Xor:
       ASSEMBLE_BINOP(xor_);
       break;
     case kIA32Sub:
       ASSEMBLE_BINOP(sub);
       break;
     case kIA32Shl:
       if (HasImmediateInput(instr, 1)) {
         __ shl(i.OutputOperand(), i.InputInt5(1));
       } else {
         __ shl_cl(i.OutputOperand());
       }
       break;
     case kIA32Shr:
       if (HasImmediateInput(instr, 1)) {
         __ shr(i.OutputOperand(), i.InputInt5(1));
       } else {
         __ shr_cl(i.OutputOperand());
       }
       break;
     case kIA32Sar:
       if (HasImmediateInput(instr, 1)) {
         __ sar(i.OutputOperand(), i.InputInt5(1));
       } else {
         __ sar_cl(i.OutputOperand());
       }
       break;
     case kIA32AddPair: {
       // i.OutputRegister(0) == i.InputRegister(0) ... left low word.
       // i.InputRegister(1) ... left high word.
       // i.InputRegister(2) ... right low word.
       // i.InputRegister(3) ... right high word.
       bool use_temp = false;
       if ((HasRegisterInput(instr, 1) &&
            i.OutputRegister(0).code() == i.InputRegister(1).code()) ||
           i.OutputRegister(0).code() == i.InputRegister(3).code()) {
         // We cannot write to the output register directly, because it would
         // overwrite an input for adc. We have to use the temp register.
         use_temp = true;
         __ Move(i.TempRegister(0), i.InputRegister(0));
         __ add(i.TempRegister(0), i.InputRegister(2));
       } else {
         __ add(i.OutputRegister(0), i.InputRegister(2));
       }
       i.MoveInstructionOperandToRegister(i.OutputRegister(1),
                                          instr->InputAt(1));
       __ adc(i.OutputRegister(1), Operand(i.InputRegister(3)));
       if (use_temp) {
         __ Move(i.OutputRegister(0), i.TempRegister(0));
       }
       break;
     }
     case kIA32SubPair: {
       // Subtracts two word pairs: the low words with sub, then the high
       // words with sbb.
       // i.OutputRegister(0) == i.InputRegister(0) ... left low word.
       // i.InputRegister(1) ... left high word.
       // i.InputRegister(2) ... right low word.
       // i.InputRegister(3) ... right high word.
       bool use_temp = false;
       if ((HasRegisterInput(instr, 1) &&
            i.OutputRegister(0).code() == i.InputRegister(1).code()) ||
           i.OutputRegister(0).code() == i.InputRegister(3).code()) {
         // We cannot write to the output register directly, because it would
         // overwrite an input for sbb. We have to use the temp register.
         use_temp = true;
         __ Move(i.TempRegister(0), i.InputRegister(0));
         __ sub(i.TempRegister(0), i.InputRegister(2));
       } else {
         __ sub(i.OutputRegister(0), i.InputRegister(2));
       }
       i.MoveInstructionOperandToRegister(i.OutputRegister(1),
                                          instr->InputAt(1));
       // sbb subtracts the right high word together with the borrow left in
       // the carry flag by the low-word sub above.
       __ sbb(i.OutputRegister(1), Operand(i.InputRegister(3)));
       if (use_temp) {
         // Publish the low word only after the high word has been computed.
         __ Move(i.OutputRegister(0), i.TempRegister(0));
       }
       break;
     }
     case kIA32MulPair: {
       __ imul(i.OutputRegister(1), i.InputOperand(0));
       i.MoveInstructionOperandToRegister(i.TempRegister(0), instr->InputAt(1));
       __ imul(i.TempRegister(0), i.InputOperand(2));
       __ add(i.OutputRegister(1), i.TempRegister(0));
       __ mov(i.OutputRegister(0), i.InputOperand(0));
       // Multiplies the low words and stores them in eax and edx.
       __ mul(i.InputRegister(2));
       __ add(i.OutputRegister(1), i.TempRegister(0));

       break;
     }
     case kIA32ShlPair:
       if (HasImmediateInput(instr, 2)) {
         __ ShlPair(i.InputRegister(1), i.InputRegister(0), i.InputInt6(2));
       } else {
         // Shift has been loaded into CL by the register allocator.
         __ ShlPair_cl(i.InputRegister(1), i.InputRegister(0));
       }
       break;
     case kIA32ShrPair:
       if (HasImmediateInput(instr, 2)) {
         __ ShrPair(i.InputRegister(1), i.InputRegister(0), i.InputInt6(2));
       } else {
         // Shift has been loaded into CL by the register allocator.
         __ ShrPair_cl(i.InputRegister(1), i.InputRegister(0));
       }
       break;
     case kIA32SarPair:
       if (HasImmediateInput(instr, 2)) {
         __ SarPair(i.InputRegister(1), i.InputRegister(0), i.InputInt6(2));
       } else {
         // Shift has been loaded into CL by the register allocator.
         __ SarPair_cl(i.InputRegister(1), i.InputRegister(0));
       }
       break;
     case kIA32Rol:
       if (HasImmediateInput(instr, 1)) {
         __ rol(i.OutputOperand(), i.InputInt5(1));
       } else {
         __ rol_cl(i.OutputOperand());
       }
       break;
     case kIA32Ror:
       if (HasImmediateInput(instr, 1)) {
         __ ror(i.OutputOperand(), i.InputInt5(1));
       } else {
         __ ror_cl(i.OutputOperand());
       }
       break;
     case kIA32Lzcnt:
       __ Lzcnt(i.OutputRegister(), i.InputOperand(0));
       break;
     case kIA32Tzcnt:
       __ Tzcnt(i.OutputRegister(), i.InputOperand(0));
       break;
     case kIA32Popcnt:
       __ Popcnt(i.OutputRegister(), i.InputOperand(0));
       break;
     case kIA32Bswap:
       __ bswap(i.OutputRegister());
       break;
     case kArchWordPoisonOnSpeculation:
       // TODO(860429): Remove remaining poisoning infrastructure on ia32.
       UNREACHABLE();
     case kIA32MFence:
       __ mfence();
       break;
     case kIA32LFence:
       __ lfence();
       break;
     case kSSEFloat32Cmp:
       __ ucomiss(i.InputDoubleRegister(0), i.InputOperand(1));
       break;
     case kSSEFloat32Add:
       __ addss(i.InputDoubleRegister(0), i.InputOperand(1));
       break;
     case kSSEFloat32Sub:
       __ subss(i.InputDoubleRegister(0), i.InputOperand(1));
       break;
     case kSSEFloat32Mul:
       __ mulss(i.InputDoubleRegister(0), i.InputOperand(1));
       break;
     case kSSEFloat32Div:
       __ divss(i.InputDoubleRegister(0), i.InputOperand(1));
       // Don't delete this mov. It may improve performance on some CPUs,
       // when there is a (v)mulss depending on the result.
       __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kSSEFloat32Sqrt:
       __ sqrtss(i.OutputDoubleRegister(), i.InputOperand(0));
       break;
     case kSSEFloat32Abs: {
       // TODO(bmeurer): Use 128-bit constants.
       XMMRegister tmp = i.TempSimd128Register(0);
       __ pcmpeqd(tmp, tmp);
       __ psrlq(tmp, 33);
       __ andps(i.OutputDoubleRegister(), tmp);
       break;
     }
     case kSSEFloat32Neg: {
       // TODO(bmeurer): Use 128-bit constants.
       XMMRegister tmp = i.TempSimd128Register(0);
       __ pcmpeqd(tmp, tmp);
       __ psllq(tmp, 31);
       __ xorps(i.OutputDoubleRegister(), tmp);
       break;
     }
     case kSSEFloat32Round: {
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       RoundingMode const mode =
           static_cast<RoundingMode>(MiscField::decode(instr->opcode()));
       __ roundss(i.OutputDoubleRegister(), i.InputDoubleRegister(0), mode);
       break;
     }
     case kSSEFloat64Cmp:
       __ ucomisd(i.InputDoubleRegister(0), i.InputOperand(1));
       break;
     case kSSEFloat64Add:
       __ addsd(i.InputDoubleRegister(0), i.InputOperand(1));
       break;
     case kSSEFloat64Sub:
       __ subsd(i.InputDoubleRegister(0), i.InputOperand(1));
       break;
     case kSSEFloat64Mul:
       __ mulsd(i.InputDoubleRegister(0), i.InputOperand(1));
       break;
     case kSSEFloat64Div:
       __ divsd(i.InputDoubleRegister(0), i.InputOperand(1));
       // Don't delete this mov. It may improve performance on some CPUs,
       // when there is a (v)mulsd depending on the result.
       __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kSSEFloat32Max: {
       Label compare_swap, done_compare;
       if (instr->InputAt(1)->IsFPRegister()) {
         __ ucomiss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
       } else {
         __ ucomiss(i.InputDoubleRegister(0), i.InputOperand(1));
       }
       auto ool =
           zone()->New<OutOfLineLoadFloat32NaN>(this, i.OutputDoubleRegister());
       __ j(parity_even, ool->entry());
       __ j(above, &done_compare, Label::kNear);
       __ j(below, &compare_swap, Label::kNear);
       __ movmskps(i.TempRegister(0), i.InputDoubleRegister(0));
       __ test(i.TempRegister(0), Immediate(1));
       __ j(zero, &done_compare, Label::kNear);
       __ bind(&compare_swap);
       if (instr->InputAt(1)->IsFPRegister()) {
         __ movss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
       } else {
         __ movss(i.InputDoubleRegister(0), i.InputOperand(1));
       }
       __ bind(&done_compare);
       __ bind(ool->exit());
       break;
     }

     case kSSEFloat64Max: {
       Label compare_swap, done_compare;
       if (instr->InputAt(1)->IsFPRegister()) {
         __ ucomisd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
       } else {
         __ ucomisd(i.InputDoubleRegister(0), i.InputOperand(1));
       }
       auto ool =
           zone()->New<OutOfLineLoadFloat64NaN>(this, i.OutputDoubleRegister());
       __ j(parity_even, ool->entry());
       __ j(above, &done_compare, Label::kNear);
       __ j(below, &compare_swap, Label::kNear);
       __ movmskpd(i.TempRegister(0), i.InputDoubleRegister(0));
       __ test(i.TempRegister(0), Immediate(1));
       __ j(zero, &done_compare, Label::kNear);
       __ bind(&compare_swap);
       if (instr->InputAt(1)->IsFPRegister()) {
         __ movsd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
       } else {
         __ movsd(i.InputDoubleRegister(0), i.InputOperand(1));
       }
       __ bind(&done_compare);
       __ bind(ool->exit());
       break;
     }
     case kSSEFloat32Min: {
       Label compare_swap, done_compare;
       if (instr->InputAt(1)->IsFPRegister()) {
         __ ucomiss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
       } else {
         __ ucomiss(i.InputDoubleRegister(0), i.InputOperand(1));
       }
       auto ool =
           zone()->New<OutOfLineLoadFloat32NaN>(this, i.OutputDoubleRegister());
       __ j(parity_even, ool->entry());
       __ j(below, &done_compare, Label::kNear);
       __ j(above, &compare_swap, Label::kNear);
       if (instr->InputAt(1)->IsFPRegister()) {
         __ movmskps(i.TempRegister(0), i.InputDoubleRegister(1));
       } else {
         __ movss(kScratchDoubleReg, i.InputOperand(1));
         __ movmskps(i.TempRegister(0), kScratchDoubleReg);
       }
       __ test(i.TempRegister(0), Immediate(1));
       __ j(zero, &done_compare, Label::kNear);
       __ bind(&compare_swap);
       if (instr->InputAt(1)->IsFPRegister()) {
         __ movss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
       } else {
         __ movss(i.InputDoubleRegister(0), i.InputOperand(1));
       }
       __ bind(&done_compare);
       __ bind(ool->exit());
       break;
     }
     case kSSEFloat64Min: {
       Label compare_swap, done_compare;
       if (instr->InputAt(1)->IsFPRegister()) {
         __ ucomisd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
       } else {
         __ ucomisd(i.InputDoubleRegister(0), i.InputOperand(1));
       }
       auto ool =
           zone()->New<OutOfLineLoadFloat64NaN>(this, i.OutputDoubleRegister());
       __ j(parity_even, ool->entry());
       __ j(below, &done_compare, Label::kNear);
       __ j(above, &compare_swap, Label::kNear);
       if (instr->InputAt(1)->IsFPRegister()) {
         __ movmskpd(i.TempRegister(0), i.InputDoubleRegister(1));
       } else {
         __ movsd(kScratchDoubleReg, i.InputOperand(1));
         __ movmskpd(i.TempRegister(0), kScratchDoubleReg);
       }
       __ test(i.TempRegister(0), Immediate(1));
       __ j(zero, &done_compare, Label::kNear);
       __ bind(&compare_swap);
       if (instr->InputAt(1)->IsFPRegister()) {
         __ movsd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
       } else {
         __ movsd(i.InputDoubleRegister(0), i.InputOperand(1));
       }
       __ bind(&done_compare);
       __ bind(ool->exit());
       break;
     }
     case kSSEFloat64Mod: {
       Register tmp = i.TempRegister(1);
       __ mov(tmp, esp);
       __ AllocateStackSpace(kDoubleSize);
       __ and_(esp, -8);  // align to 8 byte boundary.
       // Move values to st(0) and st(1).
       __ movsd(Operand(esp, 0), i.InputDoubleRegister(1));
       __ fld_d(Operand(esp, 0));
       __ movsd(Operand(esp, 0), i.InputDoubleRegister(0));
       __ fld_d(Operand(esp, 0));
       // Loop while fprem isn't done.
       Label mod_loop;
       __ bind(&mod_loop);
       // This instruction traps on all kinds of inputs, but we are assuming the
       // floating point control word is set to ignore them all.
       __ fprem();
       // fnstsw_ax clobbers eax.
       DCHECK_EQ(eax, i.TempRegister(0));
       __ fnstsw_ax();
       __ sahf();
       __ j(parity_even, &mod_loop);
       // Move output to stack and clean up.
       __ fstp(1);
       __ fstp_d(Operand(esp, 0));
       __ movsd(i.OutputDoubleRegister(), Operand(esp, 0));
       __ mov(esp, tmp);
       break;
     }
     case kSSEFloat64Abs: {
       // TODO(bmeurer): Use 128-bit constants.
       XMMRegister tmp = i.TempSimd128Register(0);
       __ pcmpeqd(tmp, tmp);
       __ psrlq(tmp, 1);
       __ andpd(i.OutputDoubleRegister(), tmp);
       break;
     }
     case kSSEFloat64Neg: {
       // TODO(bmeurer): Use 128-bit constants.
       XMMRegister tmp = i.TempSimd128Register(0);
       __ pcmpeqd(tmp, tmp);
       __ psllq(tmp, 63);
       __ xorpd(i.OutputDoubleRegister(), tmp);
       break;
     }
     case kSSEFloat64Sqrt:
       __ sqrtsd(i.OutputDoubleRegister(), i.InputOperand(0));
       break;
     case kSSEFloat64Round: {
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       RoundingMode const mode =
           static_cast<RoundingMode>(MiscField::decode(instr->opcode()));
       __ roundsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0), mode);
       break;
     }
     case kSSEFloat32ToFloat64:
       __ cvtss2sd(i.OutputDoubleRegister(), i.InputOperand(0));
       break;
     case kSSEFloat64ToFloat32:
       __ cvtsd2ss(i.OutputDoubleRegister(), i.InputOperand(0));
       break;
     case kSSEFloat32ToInt32:
       __ cvttss2si(i.OutputRegister(), i.InputOperand(0));
       break;
     case kSSEFloat32ToUint32:
       __ Cvttss2ui(i.OutputRegister(), i.InputOperand(0),
                    i.TempSimd128Register(0));
       break;
     case kSSEFloat64ToInt32:
       __ cvttsd2si(i.OutputRegister(), i.InputOperand(0));
       break;
     case kSSEFloat64ToUint32:
       __ Cvttsd2ui(i.OutputRegister(), i.InputOperand(0),
                    i.TempSimd128Register(0));
       break;
     case kSSEInt32ToFloat32:
       __ cvtsi2ss(i.OutputDoubleRegister(), i.InputOperand(0));
       break;
     case kSSEUint32ToFloat32:
       __ Cvtui2ss(i.OutputDoubleRegister(), i.InputOperand(0),
                   i.TempRegister(0));
       break;
     case kSSEInt32ToFloat64:
       __ cvtsi2sd(i.OutputDoubleRegister(), i.InputOperand(0));
       break;
     case kSSEUint32ToFloat64:
       __ Cvtui2sd(i.OutputDoubleRegister(), i.InputOperand(0),
                   i.TempRegister(0));
       break;
     case kSSEFloat64ExtractLowWord32:
       if (instr->InputAt(0)->IsFPStackSlot()) {
         __ mov(i.OutputRegister(), i.InputOperand(0));
       } else {
         __ movd(i.OutputRegister(), i.InputDoubleRegister(0));
       }
       break;
     case kSSEFloat64ExtractHighWord32:
       if (instr->InputAt(0)->IsFPStackSlot()) {
         __ mov(i.OutputRegister(), i.InputOperand(0, kDoubleSize / 2));
       } else {
         __ Pextrd(i.OutputRegister(), i.InputDoubleRegister(0), 1);
       }
       break;
     case kSSEFloat64InsertLowWord32:
       __ Pinsrd(i.OutputDoubleRegister(), i.InputOperand(1), 0);
       break;
     case kSSEFloat64InsertHighWord32:
       __ Pinsrd(i.OutputDoubleRegister(), i.InputOperand(1), 1);
       break;
     case kSSEFloat64LoadLowWord32:
       __ movd(i.OutputDoubleRegister(), i.InputOperand(0));
       break;
     case kAVXFloat32Add: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vaddss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
       break;
     }
     case kAVXFloat32Sub: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vsubss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
       break;
     }
     case kAVXFloat32Mul: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vmulss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
       break;
     }
     case kAVXFloat32Div: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vdivss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
       // Don't delete this mov. It may improve performance on some CPUs,
       // when there is a (v)mulss depending on the result.
       __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     }
     case kAVXFloat64Add: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vaddsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
       break;
     }
     case kAVXFloat64Sub: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vsubsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
       break;
     }
     case kAVXFloat64Mul: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vmulsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
       break;
     }
     case kAVXFloat64Div: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vdivsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
       // Don't delete this mov. It may improve performance on some CPUs,
       // when there is a (v)mulsd depending on the result.
       __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     }
     case kAVXFloat32Abs: {
       // TODO(bmeurer): Use RIP relative 128-bit constants.
       XMMRegister tmp = i.TempSimd128Register(0);
       __ pcmpeqd(tmp, tmp);
       __ psrlq(tmp, 33);
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vandps(i.OutputDoubleRegister(), tmp, i.InputOperand(0));
       break;
     }
     case kAVXFloat32Neg: {
       // TODO(bmeurer): Use RIP relative 128-bit constants.
       XMMRegister tmp = i.TempSimd128Register(0);
       __ pcmpeqd(tmp, tmp);
       __ psllq(tmp, 31);
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vxorps(i.OutputDoubleRegister(), tmp, i.InputOperand(0));
       break;
     }
     case kAVXFloat64Abs: {
       // TODO(bmeurer): Use RIP relative 128-bit constants.
       XMMRegister tmp = i.TempSimd128Register(0);
       __ pcmpeqd(tmp, tmp);
       __ psrlq(tmp, 1);
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vandpd(i.OutputDoubleRegister(), tmp, i.InputOperand(0));
       break;
     }
     case kAVXFloat64Neg: {
       // TODO(bmeurer): Use RIP relative 128-bit constants.
       XMMRegister tmp = i.TempSimd128Register(0);
       __ pcmpeqd(tmp, tmp);
       __ psllq(tmp, 63);
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vxorpd(i.OutputDoubleRegister(), tmp, i.InputOperand(0));
       break;
     }
     case kSSEFloat64SilenceNaN:
       __ xorpd(kScratchDoubleReg, kScratchDoubleReg);
       __ subsd(i.InputDoubleRegister(0), kScratchDoubleReg);
       break;
     case kIA32Movsxbl:
       ASSEMBLE_MOVX(movsx_b);
       break;
     case kIA32Movzxbl:
       ASSEMBLE_MOVX(movzx_b);
       break;
     case kIA32Movb: {
       size_t index = 0;
       Operand operand = i.MemoryOperand(&index);
       if (HasImmediateInput(instr, index)) {
         __ mov_b(operand, i.InputInt8(index));
       } else {
         __ mov_b(operand, i.InputRegister(index));
       }
       break;
     }
     case kIA32Movsxwl:
       ASSEMBLE_MOVX(movsx_w);
       break;
     case kIA32Movzxwl:
       ASSEMBLE_MOVX(movzx_w);
       break;
     case kIA32Movw: {
       size_t index = 0;
       Operand operand = i.MemoryOperand(&index);
       if (HasImmediateInput(instr, index)) {
         __ mov_w(operand, i.InputInt16(index));
       } else {
         __ mov_w(operand, i.InputRegister(index));
       }
       break;
     }
     case kIA32Movl:
       if (instr->HasOutput()) {
         __ mov(i.OutputRegister(), i.MemoryOperand());
       } else {
         size_t index = 0;
         Operand operand = i.MemoryOperand(&index);
         if (HasImmediateInput(instr, index)) {
           __ mov(operand, i.InputImmediate(index));
         } else {
           __ mov(operand, i.InputRegister(index));
         }
       }
       break;
     case kIA32Movsd:
       if (instr->HasOutput()) {
         __ movsd(i.OutputDoubleRegister(), i.MemoryOperand());
       } else {
         size_t index = 0;
         Operand operand = i.MemoryOperand(&index);
         __ movsd(operand, i.InputDoubleRegister(index));
       }
       break;
     case kIA32Movss:
       if (instr->HasOutput()) {
         __ movss(i.OutputDoubleRegister(), i.MemoryOperand());
       } else {
         size_t index = 0;
         Operand operand = i.MemoryOperand(&index);
         __ movss(operand, i.InputDoubleRegister(index));
       }
       break;
     case kIA32Movdqu:
       if (instr->HasOutput()) {
         __ Movdqu(i.OutputSimd128Register(), i.MemoryOperand());
       } else {
         size_t index = 0;
         Operand operand = i.MemoryOperand(&index);
         __ Movdqu(operand, i.InputSimd128Register(index));
       }
       break;
     case kIA32BitcastFI:
       if (instr->InputAt(0)->IsFPStackSlot()) {
         __ mov(i.OutputRegister(), i.InputOperand(0));
       } else {
         __ movd(i.OutputRegister(), i.InputDoubleRegister(0));
       }
       break;
     case kIA32BitcastIF:
       if (HasRegisterInput(instr, 0)) {
         __ movd(i.OutputDoubleRegister(), i.InputRegister(0));
       } else {
         __ movss(i.OutputDoubleRegister(), i.InputOperand(0));
       }
       break;
     case kIA32Lea: {
       // Materializes an address computation into the output register. The
       // addressing mode is decoded from the instruction's opcode.
       AddressingMode mode = AddressingModeField::decode(instr->opcode());
       // Shorten "leal" to "addl", "subl" or "shll" if the register allocation
       // and addressing mode just happens to work out. The "addl"/"subl" forms
       // in these cases are faster based on measurements.
       if (mode == kMode_MI) {
         // Immediate-only mode: the result is simply the constant input.
         __ Move(i.OutputRegister(), Immediate(i.InputInt32(0)));
       } else if (i.InputRegister(0) == i.OutputRegister()) {
         // Input 0 already lives in the output register, so the computation can
         // be performed in place.
         if (mode == kMode_MRI) {
           int32_t constant_summand = i.InputInt32(1);
           // A zero summand emits no instruction at all: the output register
           // already holds the result.
           if (constant_summand > 0) {
             __ add(i.OutputRegister(), Immediate(constant_summand));
           } else if (constant_summand < 0) {
             // Subtract the negated constant instead of adding a negative one.
             __ sub(i.OutputRegister(),
                    Immediate(base::NegateWithWraparound(constant_summand)));
           }
         } else if (mode == kMode_MR1) {
           if (i.InputRegister(1) == i.OutputRegister()) {
             // Both inputs are the output register: x + x is a shift by one.
             __ shl(i.OutputRegister(), 1);
           } else {
             __ add(i.OutputRegister(), i.InputRegister(1));
           }
         } else if (mode == kMode_M2) {
           // The M2/M4/M8 modes are emitted as left shifts by 1/2/3.
           __ shl(i.OutputRegister(), 1);
         } else if (mode == kMode_M4) {
           __ shl(i.OutputRegister(), 2);
         } else if (mode == kMode_M8) {
           __ shl(i.OutputRegister(), 3);
         } else {
           // No shorter form for this mode; fall back to a real lea.
           __ lea(i.OutputRegister(), i.MemoryOperand());
         }
       } else if (mode == kMode_MR1 &&
                  i.InputRegister(1) == i.OutputRegister()) {
         // Input 1 (rather than input 0) aliases the output: addition is
         // commutative, so add input 0 into it.
         __ add(i.OutputRegister(), i.InputRegister(0));
       } else {
         __ lea(i.OutputRegister(), i.MemoryOperand());
       }
       break;
     }
     case kIA32PushFloat32: {
       // Pushes a single-precision value. The value is first brought into an
       // XMM register (the input register itself, or the scratch register for
       // immediates and stack operands) *before* the stack pointer moves, then
       // stored into the freshly reserved slot.
       XMMRegister value = kScratchDoubleReg;
       if (instr->InputAt(0)->IsFPRegister()) {
         value = i.InputDoubleRegister(0);
       } else if (HasImmediateInput(instr, 0)) {
         __ Move(kScratchDoubleReg, i.InputFloat32(0));
       } else {
         __ movss(kScratchDoubleReg, i.InputOperand(0));
       }
       __ AllocateStackSpace(kFloatSize);
       __ movss(Operand(esp, 0), value);
       // Keep the frame access state in sync with the adjusted stack pointer.
       frame_access_state()->IncreaseSPDelta(kFloatSize / kSystemPointerSize);
       break;
     }
     case kIA32PushFloat64: {
       // Pushes a double-precision value. The value is first brought into an
       // XMM register (the input register itself, or the scratch register for
       // immediates and stack operands) *before* the stack pointer moves, then
       // stored into the freshly reserved slot.
       XMMRegister value = kScratchDoubleReg;
       if (instr->InputAt(0)->IsFPRegister()) {
         value = i.InputDoubleRegister(0);
       } else if (HasImmediateInput(instr, 0)) {
         __ Move(kScratchDoubleReg, i.InputDouble(0));
       } else {
         __ movsd(kScratchDoubleReg, i.InputOperand(0));
       }
       __ AllocateStackSpace(kDoubleSize);
       __ movsd(Operand(esp, 0), value);
       // Keep the frame access state in sync with the adjusted stack pointer.
       frame_access_state()->IncreaseSPDelta(kDoubleSize / kSystemPointerSize);
       break;
     }
     case kIA32PushSimd128:
       if (instr->InputAt(0)->IsFPRegister()) {
         __ AllocateStackSpace(kSimd128Size);
         __ movups(Operand(esp, 0), i.InputSimd128Register(0));
       } else {
         __ movups(kScratchDoubleReg, i.InputOperand(0));
         __ AllocateStackSpace(kSimd128Size);
         __ movups(Operand(esp, 0), kScratchDoubleReg);
       }
       frame_access_state()->IncreaseSPDelta(kSimd128Size / kSystemPointerSize);
       break;
     case kIA32Push:
       // Pushes one value onto the machine stack and records the resulting
       // stack pointer change in the frame access state. The source is chosen
       // from, in order: a memory operand described by the addressing mode, an
       // FP register, an immediate, or a generic operand.
       if (HasAddressingMode(instr)) {
         size_t index = 0;
         Operand operand = i.MemoryOperand(&index);
         __ push(operand);
         frame_access_state()->IncreaseSPDelta(kFloatSize / kSystemPointerSize);
       } else if (instr->InputAt(0)->IsFPRegister()) {
         // NOTE(review): this reserves kFloatSize bytes but stores with movsd,
         // whereas kIA32PushFloat32 pairs kFloatSize with movss and
         // kIA32PushFloat64 pairs kDoubleSize with movsd. Confirm whether this
         // branch is reachable and which store width is intended.
         __ AllocateStackSpace(kFloatSize);
         __ movsd(Operand(esp, 0), i.InputDoubleRegister(0));
         frame_access_state()->IncreaseSPDelta(kFloatSize / kSystemPointerSize);
       } else if (HasImmediateInput(instr, 0)) {
         __ push(i.InputImmediate(0));
         frame_access_state()->IncreaseSPDelta(1);
       } else {
         __ push(i.InputOperand(0));
         frame_access_state()->IncreaseSPDelta(1);
       }
       break;
     case kIA32Poke: {
       int slot = MiscField::decode(instr->opcode());
       if (HasImmediateInput(instr, 0)) {
         __ mov(Operand(esp, slot * kSystemPointerSize), i.InputImmediate(0));
       } else {
         __ mov(Operand(esp, slot * kSystemPointerSize), i.InputRegister(0));
       }
       break;
     }
     case kIA32Peek: {
       int reverse_slot = i.InputInt32(0);
       int offset =
           FrameSlotToFPOffset(frame()->GetTotalFrameSlotCount() - reverse_slot);
       if (instr->OutputAt(0)->IsFPRegister()) {
         LocationOperand* op = LocationOperand::cast(instr->OutputAt(0));
         if (op->representation() == MachineRepresentation::kFloat64) {
           __ movsd(i.OutputDoubleRegister(), Operand(ebp, offset));
         } else if (op->representation() == MachineRepresentation::kFloat32) {
           __ movss(i.OutputFloatRegister(), Operand(ebp, offset));
         } else {
           DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
           __ movdqu(i.OutputSimd128Register(), Operand(ebp, offset));
         }
       } else {
         __ mov(i.OutputRegister(), Operand(ebp, offset));
       }
       break;
     }
     case kSSEF64x2Splat: {
       DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
       XMMRegister dst = i.OutputSimd128Register();
       __ shufpd(dst, dst, 0x0);
       break;
     }
     case kAVXF64x2Splat: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister src = i.InputDoubleRegister(0);
       __ vshufpd(i.OutputSimd128Register(), src, src, 0x0);
       break;
     }
     case kSSEF64x2ExtractLane: {
       // Extracts one double lane in place; the input vector must already be
       // allocated to the output register. Lane 0 needs no instruction.
       DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
       XMMRegister dst = i.OutputDoubleRegister();
       int8_t lane = i.InputInt8(1);
       if (lane != 0) {
         // An f64x2 vector has exactly two lanes. With identical source and
         // destination, only an immediate of 1 moves the high lane into the
         // low position; an immediate of 2 would silently return lane 0.
         DCHECK_LT(lane, 2);
         __ shufpd(dst, dst, lane);
       }
       break;
     }
     case kAVXF64x2ExtractLane: {
       // Extracts one double lane of input 0 into the output register using
       // the non-destructive AVX encodings.
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputDoubleRegister();
       XMMRegister src = i.InputSimd128Register(0);
       int8_t lane = i.InputInt8(1);
       if (lane == 0) {
         // Lane 0 is already in the low position; only a move is needed, and
         // none at all when source and destination coincide.
         if (dst != src) __ vmovapd(dst, src);
       } else {
         // An f64x2 vector has exactly two lanes, so the only other valid
         // lane index is 1.
         DCHECK_LT(lane, 2);
         __ vshufpd(dst, src, src, lane);
       }
       break;
     }
     case kSSEF64x2ReplaceLane: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       int8_t lane = i.InputInt8(1);
       DoubleRegister rep = i.InputDoubleRegister(2);

       // insertps takes a mask which contains (high to low):
       // - 2 bit specifying source float element to copy
       // - 2 bit specifying destination float element to write to
       // - 4 bits specifying which elements of the destination to zero
       DCHECK_LT(lane, 2);
       if (lane == 0) {
         __ insertps(dst, rep, 0b00000000);
         __ insertps(dst, rep, 0b01010000);
       } else {
         __ insertps(dst, rep, 0b00100000);
         __ insertps(dst, rep, 0b01110000);
       }
       break;
     }
     case kAVXF64x2ReplaceLane: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(0);
       int8_t lane = i.InputInt8(1);
       DoubleRegister rep = i.InputDoubleRegister(2);
       DCHECK_NE(dst, rep);

       DCHECK_LT(lane, 2);
       if (lane == 0) {
         __ vinsertps(dst, src, rep, 0b00000000);
         __ vinsertps(dst, dst, rep, 0b01010000);
       } else {
         __ vinsertps(dst, src, rep, 0b00100000);
         __ vinsertps(dst, dst, rep, 0b01110000);
       }
       break;
     }
     case kIA32F64x2Sqrt: {
       __ Sqrtpd(i.OutputSimd128Register(), i.InputOperand(0));
       break;
     }
     case kIA32F64x2Add: {
       __ Addpd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
       break;
     }
     case kIA32F64x2Sub: {
       __ Subpd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
       break;
     }
     case kIA32F64x2Mul: {
       __ Mulpd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
       break;
     }
     case kIA32F64x2Div: {
       __ Divpd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
       break;
     }
     case kIA32F64x2Min: {
       // Lane-wise double minimum of inputs 0 and 1, with explicit handling of
       // NaNs and signed zeros (see the comments below). Uses one SIMD temp.
       Operand src1 = i.InputOperand(1);
       XMMRegister dst = i.OutputSimd128Register(),
                   src = i.InputSimd128Register(0),
                   tmp = i.TempSimd128Register(0);
       // The minpd instruction doesn't propagate NaNs and +0's in its first
       // operand. Perform minpd in both orders, merge the results, and adjust.
       __ Movupd(tmp, src1);
       __ Minpd(tmp, tmp, src);
       __ Minpd(dst, src, src1);
       // Propagate -0's and NaNs, which may be non-canonical.
       __ Orpd(tmp, dst);
       // Canonicalize NaNs by quieting and clearing the payload.
       // dst becomes an all-ones mask in every lane that is unordered (NaN).
       __ Cmpunordpd(dst, dst, tmp);
       __ Orpd(tmp, dst);
       // Shift the mask right so that only its low 51 bits stay set, then use
       // and-not to clear exactly those bits from the merged result.
       __ Psrlq(dst, 13);
       __ Andnpd(dst, tmp);
       break;
     }
     case kIA32F64x2Max: {
       // Lane-wise double maximum of inputs 0 and 1, with explicit handling of
       // NaNs and signed zeros (see the comments below). Uses one SIMD temp.
       Operand src1 = i.InputOperand(1);
       XMMRegister dst = i.OutputSimd128Register(),
                   src = i.InputSimd128Register(0),
                   tmp = i.TempSimd128Register(0);
       // The maxpd instruction doesn't propagate NaNs and +0's in its first
       // operand. Perform maxpd in both orders, merge the results, and adjust.
       __ Movupd(tmp, src1);
       __ Maxpd(tmp, tmp, src);
       __ Maxpd(dst, src, src1);
       // Find discrepancies.
       __ Xorpd(dst, tmp);
       // Propagate NaNs, which may be non-canonical.
       __ Orpd(tmp, dst);
       // Propagate sign discrepancy and (subtle) quiet NaNs.
       __ Subpd(tmp, tmp, dst);
       // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
       // dst becomes an all-ones mask in unordered (NaN) lanes; shifting it
       // right leaves only the low 51 bits set, which and-not then clears.
       __ Cmpunordpd(dst, dst, tmp);
       __ Psrlq(dst, 13);
       __ Andnpd(dst, tmp);
       break;
     }
     case kIA32F64x2Eq: {
       __ Cmpeqpd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kIA32F64x2Ne: {
       __ Cmpneqpd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kIA32F64x2Lt: {
       __ Cmpltpd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kIA32F64x2Le: {
       __ Cmplepd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kIA32F64x2Pmin: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ Minpd(dst, dst, i.InputSimd128Register(1));
       break;
     }
     case kIA32F64x2Pmax: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ Maxpd(dst, dst, i.InputSimd128Register(1));
       break;
     }
     case kIA32F64x2Round: {
       RoundingMode const mode =
           static_cast<RoundingMode>(MiscField::decode(instr->opcode()));
       __ Roundpd(i.OutputSimd128Register(), i.InputDoubleRegister(0), mode);
       break;
     }
     case kIA32I64x2SplatI32Pair: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Pinsrd(dst, i.InputRegister(0), 0);
       __ Pinsrd(dst, i.InputOperand(1), 1);
       __ Pshufd(dst, dst, 0x44);
       break;
     }
     case kIA32I64x2ReplaceLaneI32Pair: {
       int8_t lane = i.InputInt8(1);
       __ Pinsrd(i.OutputSimd128Register(), i.InputOperand(2), lane * 2);
       __ Pinsrd(i.OutputSimd128Register(), i.InputOperand(3), lane * 2 + 1);
       break;
     }
     case kIA32I64x2Neg: {
       XMMRegister dst = i.OutputSimd128Register();
       Operand src = i.InputOperand(0);
       __ Pxor(dst, dst);
       __ Psubq(dst, src);
       break;
     }
     case kIA32I64x2Shl: {
       ASSEMBLE_SIMD_SHIFT(Psllq, 6);
       break;
     }
     case kIA32I64x2ShrS: {
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(0);
       XMMRegister tmp = i.TempSimd128Register(0);
       XMMRegister tmp2 = i.TempSimd128Register(1);
       Operand shift = i.InputOperand(1);

       // Take shift value modulo 64.
       __ and_(shift, Immediate(63));
       __ Movd(tmp, shift);

       // Set up a mask [0x80000000,0,0x80000000,0].
       __ Pcmpeqb(tmp2, tmp2);
       __ Psllq(tmp2, tmp2, 63);

       __ Psrlq(tmp2, tmp2, tmp);
       __ Psrlq(dst, src, tmp);
       __ Pxor(dst, tmp2);
       __ Psubq(dst, tmp2);
       break;
     }
     case kIA32I64x2Add: {
       __ Paddq(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
       break;
     }
     case kIA32I64x2Sub: {
       __ Psubq(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
       break;
     }
     case kIA32I64x2Mul: {
       // Lane-wise 64-bit multiply assembled from 32-bit partial products.
       // Per 64-bit lane the sequence computes
       //   lo(left) * lo(right) +
       //       ((hi(left) * lo(right) + hi(right) * lo(left)) << 32)
       // using two SIMD temporaries.
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister left = i.InputSimd128Register(0);
       XMMRegister right = i.InputSimd128Register(1);
       XMMRegister tmp1 = i.TempSimd128Register(0);
       XMMRegister tmp2 = i.TempSimd128Register(1);

       // Work on copies so that both inputs stay intact for the final product.
       __ Movaps(tmp1, left);
       __ Movaps(tmp2, right);

       // Multiply high dword of each qword of left with right.
       __ Psrlq(tmp1, 32);
       __ Pmuludq(tmp1, tmp1, right);

       // Multiply high dword of each qword of right with left.
       __ Psrlq(tmp2, 32);
       __ Pmuludq(tmp2, tmp2, left);

       // Sum the two cross products and move them into the upper dword.
       __ Paddq(tmp2, tmp2, tmp1);
       __ Psllq(tmp2, tmp2, 32);

       // Low-by-low product, plus the shifted cross terms.
       __ Pmuludq(dst, left, right);
       __ Paddq(dst, dst, tmp2);
       break;
     }
     case kIA32I64x2ShrU: {
       ASSEMBLE_SIMD_SHIFT(Psrlq, 6);
       break;
     }
     case kSSEF32x4Splat: {
       DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
       XMMRegister dst = i.OutputSimd128Register();
       __ shufps(dst, dst, 0x0);
       break;
     }
     case kAVXF32x4Splat: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister src = i.InputFloatRegister(0);
       __ vshufps(i.OutputSimd128Register(), src, src, 0x0);
       break;
     }
     case kSSEF32x4ExtractLane: {
       DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
       XMMRegister dst = i.OutputFloatRegister();
       int8_t lane = i.InputInt8(1);
       if (lane != 0) {
         DCHECK_LT(lane, 4);
         __ shufps(dst, dst, lane);
       }
       break;
     }
     case kAVXF32x4ExtractLane: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputFloatRegister();
       XMMRegister src = i.InputSimd128Register(0);
       int8_t lane = i.InputInt8(1);
       if (lane == 0) {
         if (dst != src) __ vmovaps(dst, src);
       } else {
         DCHECK_LT(lane, 4);
         __ vshufps(dst, src, src, lane);
       }
       break;
     }
     case kSSEF32x4ReplaceLane: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ insertps(i.OutputSimd128Register(), i.InputOperand(2),
                   i.InputInt8(1) << 4);
       break;
     }
     case kAVXF32x4ReplaceLane: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vinsertps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                    i.InputOperand(2), i.InputInt8(1) << 4);
       break;
     }
     case kIA32F32x4SConvertI32x4: {
       __ Cvtdq2ps(i.OutputSimd128Register(), i.InputOperand(0));
       break;
     }
     case kSSEF32x4UConvertI32x4: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       __ pxor(kScratchDoubleReg, kScratchDoubleReg);      // zeros
       __ pblendw(kScratchDoubleReg, dst, 0x55);           // get lo 16 bits
       __ psubd(dst, kScratchDoubleReg);                   // get hi 16 bits
       __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // convert lo exactly
       __ psrld(dst, 1);                  // divide by 2 to get in unsigned range
       __ cvtdq2ps(dst, dst);             // convert hi exactly
       __ addps(dst, dst);                // double hi, exactly
       __ addps(dst, kScratchDoubleReg);  // add hi and lo, may round.
       break;
     }
     case kAVXF32x4UConvertI32x4: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(0);
       __ vpxor(kScratchDoubleReg, kScratchDoubleReg,
                kScratchDoubleReg);  // zeros
       __ vpblendw(kScratchDoubleReg, kScratchDoubleReg, src,
                   0x55);                                   // get lo 16 bits
       __ vpsubd(dst, src, kScratchDoubleReg);              // get hi 16 bits
       __ vcvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // convert lo exactly
       __ vpsrld(dst, dst, 1);    // divide by 2 to get in unsigned range
       __ vcvtdq2ps(dst, dst);    // convert hi exactly
       __ vaddps(dst, dst, dst);  // double hi, exactly
       __ vaddps(dst, dst, kScratchDoubleReg);  // add hi and lo, may round.
       break;
     }
     case kSSEF32x4Abs: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(i.InputSimd128Register(0), dst);
       __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
       __ psrld(kScratchDoubleReg, 1);
       __ andps(dst, kScratchDoubleReg);
       break;
     }
     case kAVXF32x4Abs: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 1);
       __ vandps(i.OutputSimd128Register(), kScratchDoubleReg,
                 i.InputOperand(0));
       break;
     }
     case kSSEF32x4Neg: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
       __ pslld(kScratchDoubleReg, 31);
       __ xorps(dst, kScratchDoubleReg);
       break;
     }
     case kAVXF32x4Neg: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vpslld(kScratchDoubleReg, kScratchDoubleReg, 31);
       __ vxorps(i.OutputSimd128Register(), kScratchDoubleReg,
                 i.InputOperand(0));
       break;
     }
     case kSSEF32x4Sqrt: {
       __ sqrtps(i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;
     }
     case kAVXF32x4Sqrt: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vsqrtps(i.OutputSimd128Register(), i.InputOperand(0));
       break;
     }
     case kIA32F32x4RecipApprox: {
       __ Rcpps(i.OutputSimd128Register(), i.InputOperand(0));
       break;
     }
     case kIA32F32x4RecipSqrtApprox: {
       __ Rsqrtps(i.OutputSimd128Register(), i.InputOperand(0));
       break;
     }
     case kSSEF32x4Add: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ addps(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXF32x4Add: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vaddps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
       break;
     }
     case kSSEF32x4AddHoriz: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE3);
       __ haddps(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXF32x4AddHoriz: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vhaddps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEF32x4Sub: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ subps(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXF32x4Sub: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vsubps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
       break;
     }
     case kSSEF32x4Mul: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ mulps(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXF32x4Mul: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vmulps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
       break;
     }
     case kSSEF32x4Div: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ divps(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXF32x4Div: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vdivps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
       break;
     }
     case kSSEF32x4Min: {
       // Lane-wise float minimum computed in place: input 0 must share the
       // output register. Clobbers kScratchDoubleReg.
       XMMRegister src1 = i.InputSimd128Register(1),
                   dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       // The minps instruction doesn't propagate NaNs and +0's in its first
       // operand. Perform minps in both orders, merge the results, and adjust.
       __ movaps(kScratchDoubleReg, src1);
       __ minps(kScratchDoubleReg, dst);
       __ minps(dst, src1);
       // Propagate -0's and NaNs, which may be non-canonical.
       __ orps(kScratchDoubleReg, dst);
       // Canonicalize NaNs by quieting and clearing the payload.
       // Compare predicate 3 is "unordered" in the x86 CMPPS encoding, so dst
       // becomes an all-ones mask in every NaN lane.
       __ cmpps(dst, kScratchDoubleReg, 3);
       __ orps(kScratchDoubleReg, dst);
       // Shift the mask right so that only its low 22 bits stay set, then use
       // and-not to clear exactly those bits from the merged result.
       __ psrld(dst, 10);
       __ andnps(dst, kScratchDoubleReg);
       break;
     }
     case kAVXF32x4Min: {
       // Lane-wise float minimum using the non-destructive AVX encodings.
       // Clobbers kScratchDoubleReg.
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src0 = i.InputSimd128Register(0);
       Operand src1 = i.InputOperand(1);
       // See comment above for correction of minps.
       // Use the VEX-encoded load (as kAVXF32x4Max does) so that the whole
       // sequence stays in AVX encoding instead of mixing in legacy SSE.
       __ vmovups(kScratchDoubleReg, src1);
       __ vminps(kScratchDoubleReg, kScratchDoubleReg, src0);
       __ vminps(dst, src0, src1);
       // Merge both orderings to propagate -0's and NaNs.
       __ vorps(dst, dst, kScratchDoubleReg);
       // A lane compares unequal to itself only if it is NaN, so the scratch
       // register becomes an all-ones mask in NaN lanes.
       __ vcmpneqps(kScratchDoubleReg, dst, dst);
       __ vorps(dst, dst, kScratchDoubleReg);
       // Keep only the low 22 mask bits and clear them from the result.
       __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 10);
       __ vandnps(dst, kScratchDoubleReg, dst);
       break;
     }
     case kSSEF32x4Max: {
       // Lane-wise float maximum computed in place: input 0 must share the
       // output register. Clobbers kScratchDoubleReg.
       XMMRegister src1 = i.InputSimd128Register(1),
                   dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       // The maxps instruction doesn't propagate NaNs and +0's in its first
       // operand. Perform maxps in both orders, merge the results, and adjust.
       __ movaps(kScratchDoubleReg, src1);
       __ maxps(kScratchDoubleReg, dst);
       __ maxps(dst, src1);
       // Find discrepancies.
       __ xorps(dst, kScratchDoubleReg);
       // Propagate NaNs, which may be non-canonical.
       __ orps(kScratchDoubleReg, dst);
       // Propagate sign discrepancy and (subtle) quiet NaNs.
       __ subps(kScratchDoubleReg, dst);
       // Canonicalize NaNs by clearing the payload.
       // Compare predicate 3 is "unordered" in the x86 CMPPS encoding; the
       // resulting mask is shifted so only its low 22 bits remain, and those
       // bits are then cleared from the result with and-not.
       __ cmpps(dst, kScratchDoubleReg, 3);
       __ psrld(dst, 10);
       __ andnps(dst, kScratchDoubleReg);
       break;
     }
     case kAVXF32x4Max: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src0 = i.InputSimd128Register(0);
       Operand src1 = i.InputOperand(1);
       // See comment above for correction of maxps.
       __ vmovups(kScratchDoubleReg, src1);
       __ vmaxps(kScratchDoubleReg, kScratchDoubleReg, src0);
       __ vmaxps(dst, src0, src1);
       __ vxorps(dst, dst, kScratchDoubleReg);
       __ vorps(kScratchDoubleReg, kScratchDoubleReg, dst);
       __ vsubps(kScratchDoubleReg, kScratchDoubleReg, dst);
       __ vcmpneqps(dst, kScratchDoubleReg, kScratchDoubleReg);
       __ vpsrld(dst, dst, 10);
       __ vandnps(dst, dst, kScratchDoubleReg);
       break;
     }
     case kSSEF32x4Eq: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ cmpeqps(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXF32x4Eq: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vcmpeqps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kSSEF32x4Ne: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ cmpneqps(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXF32x4Ne: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vcmpneqps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                    i.InputOperand(1));
       break;
     }
     case kSSEF32x4Lt: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ cmpltps(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXF32x4Lt: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vcmpltps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kSSEF32x4Le: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ cmpleps(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXF32x4Le: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vcmpleps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kIA32F32x4Pmin: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ Minps(dst, dst, i.InputSimd128Register(1));
       break;
     }
     case kIA32F32x4Pmax: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ Maxps(dst, dst, i.InputSimd128Register(1));
       break;
     }
     case kIA32F32x4Round: {
       RoundingMode const mode =
           static_cast<RoundingMode>(MiscField::decode(instr->opcode()));
       __ Roundps(i.OutputSimd128Register(), i.InputDoubleRegister(0), mode);
       break;
     }
     case kIA32I32x4Splat: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Movd(dst, i.InputOperand(0));
       __ Pshufd(dst, dst, 0x0);
       break;
     }
     case kIA32I32x4ExtractLane: {
       __ Pextrd(i.OutputRegister(), i.InputSimd128Register(0), i.InputInt8(1));
       break;
     }
     case kSSEI32x4ReplaceLane: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pinsrd(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
       break;
     }
     case kAVXI32x4ReplaceLane: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpinsrd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(2), i.InputInt8(1));
       break;
     }
     case kSSEI32x4SConvertF32x4: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       XMMRegister dst = i.OutputSimd128Register();
       // NAN->0
       __ movaps(kScratchDoubleReg, dst);
       __ cmpeqps(kScratchDoubleReg, kScratchDoubleReg);
       __ pand(dst, kScratchDoubleReg);
       // Set top bit if >= 0 (but not -0.0!)
       __ pxor(kScratchDoubleReg, dst);
       // Convert
       __ cvttps2dq(dst, dst);
       // Set top bit if >=0 is now < 0
       __ pand(kScratchDoubleReg, dst);
       __ psrad(kScratchDoubleReg, 31);
       // Set positive overflow lanes to 0x7FFFFFFF
       __ pxor(dst, kScratchDoubleReg);
       break;
     }
     case kAVXI32x4SConvertF32x4: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(0);
       // NAN->0
       __ vcmpeqps(kScratchDoubleReg, src, src);
       __ vpand(dst, src, kScratchDoubleReg);
       // Set top bit if >= 0 (but not -0.0!)
       __ vpxor(kScratchDoubleReg, kScratchDoubleReg, dst);
       // Convert
       __ vcvttps2dq(dst, dst);
       // Set top bit if >=0 is now < 0
       __ vpand(kScratchDoubleReg, kScratchDoubleReg, dst);
       __ vpsrad(kScratchDoubleReg, kScratchDoubleReg, 31);
       // Set positive overflow lanes to 0x7FFFFFFF
       __ vpxor(dst, dst, kScratchDoubleReg);
       break;
     }
     case kIA32I32x4SConvertI16x8Low: {
       __ Pmovsxwd(i.OutputSimd128Register(), i.InputOperand(0));
       break;
     }
     case kIA32I32x4SConvertI16x8High: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Palignr(dst, i.InputOperand(0), 8);
       __ Pmovsxwd(dst, dst);
       break;
     }
     case kIA32I32x4Neg: {
       // Lane-wise negation of four 32-bit integers.
       XMMRegister result = i.OutputSimd128Register();
       Operand input = i.InputOperand(0);
       if (!input.is_reg(result)) {
         // Output and input are distinct: zero the output, then subtract the
         // input from it (0 - x).
         __ Pxor(result, result);
         __ Psubd(result, input);
         break;
       }
       // Output aliases the input, so it cannot be zeroed first. Build an
       // all-ones (-1 per lane) scratch and let Psignd flip each lane's sign.
       __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
       __ Psignd(result, kScratchDoubleReg);
       break;
     }
     case kIA32I32x4Shl: {
       ASSEMBLE_SIMD_SHIFT(Pslld, 5);
       break;
     }
     case kIA32I32x4ShrS: {
       ASSEMBLE_SIMD_SHIFT(Psrad, 5);
       break;
     }
     case kSSEI32x4Add: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ paddd(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI32x4Add: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpaddd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
       break;
     }
     case kSSEI32x4AddHoriz: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSSE3);
       __ phaddd(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI32x4AddHoriz: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vphaddd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI32x4Sub: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ psubd(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI32x4Sub: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpsubd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
       break;
     }
     case kSSEI32x4Mul: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pmulld(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI32x4Mul: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpmulld(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI32x4MinS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pminsd(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI32x4MinS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpminsd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI32x4MaxS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pmaxsd(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI32x4MaxS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpmaxsd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI32x4Eq: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pcmpeqd(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI32x4Eq: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpcmpeqd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kSSEI32x4Ne: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pcmpeqd(i.OutputSimd128Register(), i.InputOperand(1));
       __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
       __ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
       break;
     }
     case kAVXI32x4Ne: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpcmpeqd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vpxor(i.OutputSimd128Register(), i.OutputSimd128Register(),
                kScratchDoubleReg);
       break;
     }
     case kSSEI32x4GtS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pcmpgtd(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI32x4GtS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpcmpgtd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kSSEI32x4GeS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       Operand src = i.InputOperand(1);
       __ pminsd(dst, src);
       __ pcmpeqd(dst, src);
       break;
     }
     case kAVXI32x4GeS: {
       // Signed lane-wise lhs >= rhs, computed as min(lhs, rhs) == rhs.
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister lhs = i.InputSimd128Register(0);
       Operand rhs = i.InputOperand(1);
       // scratch = signed minimum of both operands.
       __ vpminsd(kScratchDoubleReg, lhs, rhs);
       // A lane is all-ones exactly when the minimum equals rhs.
       __ vpcmpeqd(i.OutputSimd128Register(), kScratchDoubleReg, rhs);
       break;
     }
     case kSSEI32x4UConvertF32x4: {
       // Converts four float lanes to unsigned 32-bit integer lanes in place
       // (output register == input register 0, see the DCHECK). Only a signed
       // float->int conversion is used, so lanes above max_signed are handled
       // by converting (src - max_signed) separately and adding it back.
       // Clobbers kScratchDoubleReg and the instruction's SIMD temp 0.
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister tmp = i.TempSimd128Register(0);
       // NAN->0, negative->0
       __ pxor(kScratchDoubleReg, kScratchDoubleReg);
       __ maxps(dst, kScratchDoubleReg);
       // scratch: float representation of max_signed
       // (all-ones shifted right by one, then int->float; 0x4f000000 is the
       // float bit pattern of 2^31).
       __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
       __ psrld(kScratchDoubleReg, 1);                     // 0x7fffffff
       __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // 0x4f000000
       // tmp: convert (src-max_signed).
       // Positive overflow lanes -> 0x7FFFFFFF
       // Negative lanes -> 0
       __ movaps(tmp, dst);
       __ subps(tmp, kScratchDoubleReg);
       // scratch = mask of lanes where (src-max_signed) itself is still
       // >= max_signed, i.e. lanes that overflow even the unsigned range.
       __ cmpleps(kScratchDoubleReg, tmp);
       __ cvttps2dq(tmp, tmp);
       // Those lanes converted to 0x80000000; XOR with the all-ones mask turns
       // them into 0x7FFFFFFF. Other lanes are XORed with zero.
       __ pxor(tmp, kScratchDoubleReg);
       // Clamp negative differences (src below max_signed) to 0.
       __ pxor(kScratchDoubleReg, kScratchDoubleReg);
       __ pmaxsd(tmp, kScratchDoubleReg);
       // convert. Overflow lanes above max_signed will be 0x80000000
       __ cvttps2dq(dst, dst);
       // Add (src-max_signed) for overflow lanes.
       // 0x80000000 + 0x7FFFFFFF = 0xFFFFFFFF for lanes beyond the unsigned
       // range; lanes at or below max_signed add 0.
       __ paddd(dst, tmp);
       break;
     }
     case kAVXI32x4UConvertF32x4: {
       // AVX form of the float -> unsigned 32-bit integer lane conversion,
       // done in place (output register == input register 0, see the DCHECK).
       // Only a signed float->int conversion is used, so lanes above
       // max_signed are handled by converting (src - max_signed) separately
       // and adding it back.
       // Clobbers kScratchDoubleReg and the instruction's SIMD temp 0.
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister tmp = i.TempSimd128Register(0);
       // NAN->0, negative->0
       __ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vmaxps(dst, dst, kScratchDoubleReg);
       // scratch: float representation of max_signed
       // (all-ones shifted right by one, then int->float; 0x4f000000 is the
       // float bit pattern of 2^31).
       __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 1);  // 0x7fffffff
       __ vcvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // 0x4f000000
       // tmp: convert (src-max_signed).
       // Positive overflow lanes -> 0x7FFFFFFF
       // Negative lanes -> 0
       __ vsubps(tmp, dst, kScratchDoubleReg);
       // scratch = mask of lanes where (src-max_signed) itself is still
       // >= max_signed, i.e. lanes that overflow even the unsigned range.
       __ vcmpleps(kScratchDoubleReg, kScratchDoubleReg, tmp);
       __ vcvttps2dq(tmp, tmp);
       // Those lanes converted to 0x80000000; XOR with the all-ones mask turns
       // them into 0x7FFFFFFF. Other lanes are XORed with zero.
       __ vpxor(tmp, tmp, kScratchDoubleReg);
       // Clamp negative differences (src below max_signed) to 0.
       __ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vpmaxsd(tmp, tmp, kScratchDoubleReg);
       // convert. Overflow lanes above max_signed will be 0x80000000
       __ vcvttps2dq(dst, dst);
       // Add (src-max_signed) for overflow lanes.
       // 0x80000000 + 0x7FFFFFFF = 0xFFFFFFFF for lanes beyond the unsigned
       // range; lanes at or below max_signed add 0.
       __ vpaddd(dst, dst, tmp);
       break;
     }
     case kIA32I32x4UConvertI16x8Low: {
       __ Pmovzxwd(i.OutputSimd128Register(), i.InputOperand(0));
       break;
     }
     case kIA32I32x4UConvertI16x8High: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Palignr(dst, i.InputOperand(0), 8);
       __ Pmovzxwd(dst, dst);
       break;
     }
     case kIA32I32x4ShrU: {
       ASSEMBLE_SIMD_SHIFT(Psrld, 5);
       break;
     }
     case kSSEI32x4MinU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pminud(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI32x4MinU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpminud(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI32x4MaxU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pmaxud(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI32x4MaxU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpmaxud(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI32x4GtU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       Operand src = i.InputOperand(1);
       __ pmaxud(dst, src);
       __ pcmpeqd(dst, src);
       __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
       __ pxor(dst, kScratchDoubleReg);
       break;
     }
     case kAVXI32x4GtU: {
       // Unsigned lane-wise lhs > rhs, computed as NOT(max(lhs, rhs) == rhs).
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister result = i.OutputSimd128Register();
       XMMRegister lhs = i.InputSimd128Register(0);
       Operand rhs = i.InputOperand(1);
       // result = (unsigned max == rhs), which is the mask for lhs <= rhs.
       __ vpmaxud(kScratchDoubleReg, lhs, rhs);
       __ vpcmpeqd(result, kScratchDoubleReg, rhs);
       // Invert the mask by XORing with all-ones.
       __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vpxor(result, result, kScratchDoubleReg);
       break;
     }
     case kSSEI32x4GeU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       Operand src = i.InputOperand(1);
       __ pminud(dst, src);
       __ pcmpeqd(dst, src);
       break;
     }
     case kAVXI32x4GeU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister src1 = i.InputSimd128Register(0);
       Operand src2 = i.InputOperand(1);
       __ vpminud(kScratchDoubleReg, src1, src2);
       __ vpcmpeqd(i.OutputSimd128Register(), kScratchDoubleReg, src2);
       break;
     }
     case kIA32I32x4Abs: {
       __ Pabsd(i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;
     }
     case kIA32I32x4BitMask: {
       __ Movmskps(i.OutputRegister(), i.InputSimd128Register(0));
       break;
     }
     case kIA32I32x4DotI16x8S: {
       __ Pmaddwd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputSimd128Register(1));
       break;
     }
     case kIA32I16x8Splat: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Movd(dst, i.InputOperand(0));
       __ Pshuflw(dst, dst, 0x0);
       __ Pshufd(dst, dst, 0x0);
       break;
     }
     case kIA32I16x8ExtractLaneU: {
       Register dst = i.OutputRegister();
       __ Pextrw(dst, i.InputSimd128Register(0), i.InputInt8(1));
       break;
     }
     case kIA32I16x8ExtractLaneS: {
       Register dst = i.OutputRegister();
       __ Pextrw(dst, i.InputSimd128Register(0), i.InputInt8(1));
       __ movsx_w(dst, dst);
       break;
     }
     case kSSEI16x8ReplaceLane: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pinsrw(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
       break;
     }
     case kAVXI16x8ReplaceLane: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpinsrw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(2), i.InputInt8(1));
       break;
     }
     case kIA32I16x8SConvertI8x16Low: {
       __ Pmovsxbw(i.OutputSimd128Register(), i.InputOperand(0));
       break;
     }
     case kIA32I16x8SConvertI8x16High: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Palignr(dst, i.InputOperand(0), 8);
       __ Pmovsxbw(dst, dst);
       break;
     }
     case kIA32I16x8Neg: {
       // Lane-wise negation of eight 16-bit integers.
       XMMRegister result = i.OutputSimd128Register();
       Operand input = i.InputOperand(0);
       if (!input.is_reg(result)) {
         // Output and input are distinct: zero the output, then subtract the
         // input from it (0 - x).
         __ Pxor(result, result);
         __ Psubw(result, input);
         break;
       }
       // Output aliases the input, so it cannot be zeroed first. Build an
       // all-ones (-1 per lane) scratch and let Psignw flip each lane's sign.
       __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
       __ Psignw(result, kScratchDoubleReg);
       break;
     }
     case kIA32I16x8Shl: {
       ASSEMBLE_SIMD_SHIFT(Psllw, 4);
       break;
     }
     case kIA32I16x8ShrS: {
       ASSEMBLE_SIMD_SHIFT(Psraw, 4);
       break;
     }
     case kSSEI16x8SConvertI32x4: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ packssdw(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kAVXI16x8SConvertI32x4: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpackssdw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                    i.InputSimd128Register(1));
       break;
     }
     case kSSEI16x8Add: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ paddw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8Add: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpaddw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
       break;
     }
     case kSSEI16x8AddSatS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ paddsw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8AddSatS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpaddsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI16x8AddHoriz: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSSE3);
       __ phaddw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8AddHoriz: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vphaddw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI16x8Sub: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ psubw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8Sub: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpsubw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
       break;
     }
     case kSSEI16x8SubSatS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ psubsw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8SubSatS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpsubsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI16x8Mul: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pmullw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8Mul: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpmullw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI16x8MinS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pminsw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8MinS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpminsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI16x8MaxS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pmaxsw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8MaxS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpmaxsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI16x8Eq: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pcmpeqw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8Eq: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpcmpeqw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kSSEI16x8Ne: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pcmpeqw(i.OutputSimd128Register(), i.InputOperand(1));
       __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
       __ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
       break;
     }
     case kAVXI16x8Ne: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpcmpeqw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       __ vpcmpeqw(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vpxor(i.OutputSimd128Register(), i.OutputSimd128Register(),
                kScratchDoubleReg);
       break;
     }
     case kSSEI16x8GtS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pcmpgtw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8GtS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpcmpgtw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kSSEI16x8GeS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       XMMRegister dst = i.OutputSimd128Register();
       Operand src = i.InputOperand(1);
       __ pminsw(dst, src);
       __ pcmpeqw(dst, src);
       break;
     }
     case kAVXI16x8GeS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister src1 = i.InputSimd128Register(0);
       Operand src2 = i.InputOperand(1);
       __ vpminsw(kScratchDoubleReg, src1, src2);
       __ vpcmpeqw(i.OutputSimd128Register(), kScratchDoubleReg, src2);
       break;
     }
     case kIA32I16x8UConvertI8x16Low: {
       __ Pmovzxbw(i.OutputSimd128Register(), i.InputOperand(0));
       break;
     }
     case kIA32I16x8UConvertI8x16High: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Palignr(dst, i.InputOperand(0), 8);
       __ Pmovzxbw(dst, dst);
       break;
     }
     case kIA32I16x8ShrU: {
       ASSEMBLE_SIMD_SHIFT(Psrlw, 4);
       break;
     }
     case kSSEI16x8UConvertI32x4: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ packusdw(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kAVXI16x8UConvertI32x4: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       __ vpackusdw(dst, dst, i.InputSimd128Register(1));
       break;
     }
     case kSSEI16x8AddSatU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ paddusw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8AddSatU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpaddusw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kSSEI16x8SubSatU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ psubusw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8SubSatU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpsubusw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kSSEI16x8MinU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pminuw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8MinU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpminuw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI16x8MaxU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pmaxuw(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI16x8MaxU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpmaxuw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI16x8GtU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       Operand src = i.InputOperand(1);
       __ pmaxuw(dst, src);
       __ pcmpeqw(dst, src);
       __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
       __ pxor(dst, kScratchDoubleReg);
       break;
     }
     case kAVXI16x8GtU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src1 = i.InputSimd128Register(0);
       Operand src2 = i.InputOperand(1);
       __ vpmaxuw(kScratchDoubleReg, src1, src2);
       __ vpcmpeqw(dst, kScratchDoubleReg, src2);
       __ vpcmpeqw(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vpxor(dst, dst, kScratchDoubleReg);
       break;
     }
     case kSSEI16x8GeU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       Operand src = i.InputOperand(1);
       __ pminuw(dst, src);
       __ pcmpeqw(dst, src);
       break;
     }
     case kAVXI16x8GeU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister src1 = i.InputSimd128Register(0);
       Operand src2 = i.InputOperand(1);
       __ vpminuw(kScratchDoubleReg, src1, src2);
       __ vpcmpeqw(i.OutputSimd128Register(), kScratchDoubleReg, src2);
       break;
     }
     case kIA32I16x8RoundingAverageU: {
       __ Pavgw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
       break;
     }
     case kIA32I16x8Abs: {
       __ Pabsw(i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;
     }
     case kIA32I16x8BitMask: {
       // Collects the sign bit of each of the eight 16-bit input lanes into
       // the low 8 bits of the output general-purpose register.
       Register dst = i.OutputRegister();
       XMMRegister tmp = i.TempSimd128Register(0);
       // Pack with signed saturation so each word's sign survives as a byte's
       // sign. The input words end up in the upper 8 bytes of tmp; the lower
       // 8 bytes come from tmp's previous contents, which are not set here.
       __ Packsswb(tmp, i.InputSimd128Register(0));
       // One bit per byte: 16 mask bits, the upper 8 belonging to the input.
       __ Pmovmskb(dst, tmp);
       // Drop the 8 bits that came from tmp's stale lower half.
       __ shr(dst, 8);
       break;
     }
     case kIA32I8x16Splat: {
       // Broadcasts the low byte of the scalar input to all sixteen byte lanes.
       XMMRegister splat = i.OutputSimd128Register();
       Operand scalar = i.InputOperand(0);
       // Place the scalar in the low dword of the output.
       __ Movd(splat, scalar);
       // An all-zero shuffle control selects byte 0 for every lane.
       __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
       __ Pshufb(splat, kScratchDoubleReg);
       break;
     }
     case kIA32I8x16ExtractLaneU: {
       Register dst = i.OutputRegister();
       __ Pextrb(dst, i.InputSimd128Register(0), i.InputInt8(1));
       break;
     }
     case kIA32I8x16ExtractLaneS: {
       Register dst = i.OutputRegister();
       __ Pextrb(dst, i.InputSimd128Register(0), i.InputInt8(1));
       __ movsx_b(dst, dst);
       break;
     }
     case kSSEI8x16ReplaceLane: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pinsrb(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
       break;
     }
     case kAVXI8x16ReplaceLane: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpinsrb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(2), i.InputInt8(1));
       break;
     }
     case kSSEI8x16SConvertI16x8: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ packsswb(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16SConvertI16x8: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpacksswb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                    i.InputOperand(1));
       break;
     }
     case kIA32I8x16Neg: {
       // Lane-wise negation of sixteen 8-bit integers.
       XMMRegister result = i.OutputSimd128Register();
       Operand input = i.InputOperand(0);
       if (!input.is_reg(result)) {
         // Output and input are distinct: zero the output, then subtract the
         // input from it (0 - x).
         __ Pxor(result, result);
         __ Psubb(result, input);
         break;
       }
       // Output aliases the input, so it cannot be zeroed first. Build an
       // all-ones (-1 per lane) scratch and let Psignb flip each lane's sign.
       __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
       __ Psignb(result, kScratchDoubleReg);
       break;
     }
     case kIA32I8x16Shl: {
       // Left-shifts each of the sixteen byte lanes, in place (output register
       // == input register 0, see the DCHECK). The code below only uses 16-bit
       // word shifts (Psllw), so bits that would cross from one byte into its
       // neighbour within a word are masked away explicitly.
       // Uses a general-purpose temp (TempAt(0)), a SIMD temp
       // (TempSimd128Register(1)) and, in the register-shift path,
       // kScratchDoubleReg.
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       Register tmp = i.ToRegister(instr->TempAt(0));
       XMMRegister tmp_simd = i.TempSimd128Register(1);

       if (HasImmediateInput(instr, 1)) {
         // Perform 16-bit shift, then mask away low bits.
         uint8_t shift = i.InputInt3(1);
         __ Psllw(dst, dst, byte{shift});

         // bmask keeps the bits of a byte that legitimately hold shifted data
         // (0xff << shift, truncated to 8 bits); replicate it into all four
         // bytes of a dword, then broadcast that dword to the whole register.
         uint8_t bmask = static_cast<uint8_t>(0xff << shift);
         uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
         __ mov(tmp, mask);
         __ Movd(tmp_simd, tmp);
         __ Pshufd(tmp_simd, tmp_simd, 0);
         __ Pand(dst, tmp_simd);
       } else {
         // Take shift value modulo 8.
         __ mov(tmp, i.InputRegister(1));
         __ and_(tmp, 7);
         // Mask off the unwanted bits before word-shifting.
         // Shifting all-ones words right by (shift + 8) leaves 0x00ff >> shift
         // in each word; packing to bytes yields 0xff >> shift per byte, i.e.
         // a mask that clears the top `shift` bits of every byte.
         __ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
         __ add(tmp, Immediate(8));
         __ Movd(tmp_simd, tmp);
         __ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
         __ Packuswb(kScratchDoubleReg, kScratchDoubleReg);
         __ Pand(dst, kScratchDoubleReg);
         // TODO(zhin): sub here to avoid asking for another temporary register,
         // examine codegen for other i8x16 shifts, they use less instructions.
         // tmp is restored to the plain (shift & 7) count for the real shift.
         __ sub(tmp, Immediate(8));
         __ Movd(tmp_simd, tmp);
         __ Psllw(dst, dst, tmp_simd);
       }
       break;
     }
     case kIA32I8x16ShrS: {
       // Arithmetic right shift of each of the sixteen byte lanes, in place
       // (output register == input register 0, see the DCHECK). Each byte is
       // widened into the high byte of a 16-bit word, shifted arithmetically
       // by (shift + 8) so the word's low byte is discarded, and the words are
       // packed back to bytes. kScratchDoubleReg holds the upper eight lanes.
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (HasImmediateInput(instr, 1)) {
         // High half: bytes 8..15 of dst land in the high byte of each scratch
         // word (the low bytes are stale scratch data and get shifted out).
         __ Punpckhbw(kScratchDoubleReg, dst);
         // Low half: bytes 0..7 of dst are duplicated into both word halves.
         __ Punpcklbw(dst, dst);
         // The extra 8 moves the high byte down to the low byte position.
         uint8_t shift = i.InputInt3(1) + 8;
         __ Psraw(kScratchDoubleReg, shift);
         __ Psraw(dst, shift);
         // Repack: dst words -> low 8 bytes, scratch words -> high 8 bytes.
         __ Packsswb(dst, kScratchDoubleReg);
       } else {
         Register tmp = i.ToRegister(instr->TempAt(0));
         XMMRegister tmp_simd = i.TempSimd128Register(1);
         // Unpack the bytes into words, do arithmetic shifts, and repack.
         __ Punpckhbw(kScratchDoubleReg, dst);
         __ Punpcklbw(dst, dst);
         __ mov(tmp, i.InputRegister(1));
         // Take shift value modulo 8.
         __ and_(tmp, 7);
         // The extra 8 moves the high byte down to the low byte position.
         __ add(tmp, Immediate(8));
         __ Movd(tmp_simd, tmp);
         __ Psraw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
         __ Psraw(dst, dst, tmp_simd);
         // Repack: dst words -> low 8 bytes, scratch words -> high 8 bytes.
         __ Packsswb(dst, kScratchDoubleReg);
       }
       break;
     }
     case kSSEI8x16Add: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ paddb(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16Add: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpaddb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
       break;
     }
     case kSSEI8x16AddSatS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ paddsb(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16AddSatS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpaddsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI8x16Sub: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ psubb(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16Sub: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpsubb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
       break;
     }
     case kSSEI8x16SubSatS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ psubsb(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16SubSatS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpsubsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI8x16Mul: {
       // Lane-wise multiply of sixteen 8-bit integers, in place (output
       // register == input register 0, see the DCHECK). The sequence uses
       // 16-bit multiplies (pmullw): the odd (high) and even (low) bytes of
       // every word are multiplied separately and the two low-byte results are
       // merged. In the diagrams below, upper-case letters denote the high
       // byte of a word, lower-case the low byte, and '_' a don't-care byte.
       // Clobbers kScratchDoubleReg and the instruction's SIMD temp 0.
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       XMMRegister right = i.InputSimd128Register(1);
       XMMRegister tmp = i.TempSimd128Register(0);

       // I16x8 view of I8x16
       // left = AAaa AAaa ... AAaa AAaa
       // right= BBbb BBbb ... BBbb BBbb

       // t = 00AA 00AA ... 00AA 00AA
       // s = 00BB 00BB ... 00BB 00BB
       // (t lives in tmp, s in kScratchDoubleReg.)
       __ movaps(tmp, dst);
       __ movaps(kScratchDoubleReg, right);
       __ psrlw(tmp, 8);
       __ psrlw(kScratchDoubleReg, 8);
       // dst = left * 256
       __ psllw(dst, 8);

       // t = I16x8Mul(t, s)
       //    => __PP __PP ...  __PP  __PP
       __ pmullw(tmp, kScratchDoubleReg);
       // dst = I16x8Mul(left * 256, right)
       //    => pp__ pp__ ...  pp__  pp__
       __ pmullw(dst, right);

       // t = I16x8Shl(t, 8)
       //    => PP00 PP00 ...  PP00  PP00
       __ psllw(tmp, 8);

       // dst = I16x8Shr(dst, 8)
       //    => 00pp 00pp ...  00pp  00pp
       __ psrlw(dst, 8);

       // dst = I16x8Or(dst, t)
       //    => PPpp PPpp ...  PPpp  PPpp
       __ por(dst, tmp);
       break;
     }
     case kAVXI8x16Mul: {
       // AVX form of the lane-wise multiply of sixteen 8-bit integers. The
       // sequence uses 16-bit multiplies (vpmullw): the odd (high) and even
       // (low) bytes of every word are multiplied separately and the two
       // low-byte results are merged. In the diagrams below, upper-case
       // letters denote the high byte of a word, lower-case the low byte, and
       // '_' a don't-care byte.
       // Clobbers kScratchDoubleReg and the instruction's SIMD temp 0.
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister left = i.InputSimd128Register(0);
       XMMRegister right = i.InputSimd128Register(1);
       XMMRegister tmp = i.TempSimd128Register(0);

       // I16x8 view of I8x16
       // left = AAaa AAaa ... AAaa AAaa
       // right= BBbb BBbb ... BBbb BBbb

       // t = 00AA 00AA ... 00AA 00AA
       // s = 00BB 00BB ... 00BB 00BB
       // (t lives in tmp, s in kScratchDoubleReg.)
       __ vpsrlw(tmp, left, 8);
       __ vpsrlw(kScratchDoubleReg, right, 8);

       // t = I16x8Mul(t, s)
       //    => __PP __PP ...  __PP  __PP
       __ vpmullw(tmp, tmp, kScratchDoubleReg);

       // s = left * 256
       // (kScratchDoubleReg is reused; the old s is no longer needed.)
       __ vpsllw(kScratchDoubleReg, left, 8);

       // dst = I16x8Mul(left * 256, right)
       //    => pp__ pp__ ...  pp__  pp__
       __ vpmullw(dst, kScratchDoubleReg, right);

       // dst = I16x8Shr(dst, 8)
       //    => 00pp 00pp ...  00pp  00pp
       __ vpsrlw(dst, dst, 8);

       // t = I16x8Shl(t, 8)
       //    => PP00 PP00 ...  PP00  PP00
       __ vpsllw(tmp, tmp, 8);

       // dst = I16x8Or(dst, t)
       //    => PPpp PPpp ...  PPpp  PPpp
       __ vpor(dst, dst, tmp);
       break;
     }
     case kSSEI8x16MinS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pminsb(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16MinS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpminsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI8x16MaxS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pmaxsb(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16MaxS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpmaxsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI8x16Eq: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pcmpeqb(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16Eq: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpcmpeqb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kSSEI8x16Ne: {
       // Byte-wise "not equal", produced as the bitwise complement of the
       // byte-wise equality mask. The result overwrites the first input.
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       Operand rhs = i.InputOperand(1);
       // dst = (dst == rhs) for every byte lane.
       __ pcmpeqb(dst, rhs);
       // Comparing a register with itself gives all ones; xor-ing with that
       // flips every bit of the equality mask.
       __ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg);
       __ pxor(dst, kScratchDoubleReg);
       break;
     }
     case kAVXI8x16Ne: {
       // AVX form of byte-wise "not equal": build the equality mask, then
       // invert it by xor-ing with an all-ones scratch register.
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister lhs = i.InputSimd128Register(0);
       Operand rhs = i.InputOperand(1);
       __ vpcmpeqb(dst, lhs, rhs);
       // scratch == scratch is true in every lane, i.e. all bits set.
       __ vpcmpeqb(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vpxor(dst, dst, kScratchDoubleReg);
       break;
     }
     case kSSEI8x16GtS: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pcmpgtb(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16GtS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpcmpgtb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kSSEI8x16GeS: {
       // Signed byte-wise lhs >= rhs. There is no direct "greater or equal"
       // compare, so use the identity: lhs >= rhs  <=>  min(lhs, rhs) == rhs.
       XMMRegister lhs = i.OutputSimd128Register();
       DCHECK_EQ(lhs, i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       Operand rhs = i.InputOperand(1);
       __ pminsb(lhs, rhs);
       __ pcmpeqb(lhs, rhs);
       break;
     }
     case kAVXI8x16GeS: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister src1 = i.InputSimd128Register(0);
       Operand src2 = i.InputOperand(1);
       __ vpminsb(kScratchDoubleReg, src1, src2);
       __ vpcmpeqb(i.OutputSimd128Register(), kScratchDoubleReg, src2);
       break;
     }
     case kSSEI8x16UConvertI16x8: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       __ packuswb(dst, i.InputOperand(1));
       break;
     }
     case kAVXI8x16UConvertI16x8: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       __ vpackuswb(dst, dst, i.InputOperand(1));
       break;
     }
     case kSSEI8x16AddSatU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ paddusb(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16AddSatU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpaddusb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kSSEI8x16SubSatU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ psubusb(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16SubSatU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpsubusb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
       break;
     }
     case kIA32I8x16ShrU: {
       // Logical (unsigned) right shift of every 8-bit lane. Both paths are
       // built from 16-bit shifts (Psrlw); the output overwrites input 0.
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       Register tmp = i.ToRegister(instr->TempAt(0));
       XMMRegister tmp_simd = i.TempSimd128Register(1);

       if (HasImmediateInput(instr, 1)) {
         // Perform 16-bit shift, then mask away high bits.
         uint8_t shift = i.InputInt3(1);
         __ Psrlw(dst, dst, byte{shift});

         // The 16-bit shift moved bits of each upper byte into the top of the
         // byte below it; keeping only the low (8 - shift) bits of every byte
         // removes them again.
         uint8_t bmask = 0xff >> shift;
         uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
         // Broadcast the 32-bit mask into all four dwords of tmp_simd.
         __ mov(tmp, mask);
         __ Movd(tmp_simd, tmp);
         __ Pshufd(tmp_simd, tmp_simd, 0);
         __ Pand(dst, tmp_simd);
       } else {
         // Unpack the bytes into words, do logical shifts, and repack.
         // After unpacking, every source byte sits in the high byte of a
         // 16-bit lane; whatever ends up in the low byte (old scratch contents
         // or a copy of the byte) is shifted out below.
         __ Punpckhbw(kScratchDoubleReg, dst);
         __ Punpcklbw(dst, dst);
         __ mov(tmp, i.InputRegister(1));
         // Take shift value modulo 8.
         __ and_(tmp, 7);
         // The extra 8 moves the high byte of each word down into the low byte.
         __ add(tmp, Immediate(8));
         __ Movd(tmp_simd, tmp);
         __ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
         __ Psrlw(dst, dst, tmp_simd);
         // Every word is now <= 0xff, so the unsigned-saturating pack is exact.
         __ Packuswb(dst, kScratchDoubleReg);
       }
       break;
     }
     case kSSEI8x16MinU: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ pminub(dst, i.InputOperand(1));
       break;
     }
     case kAVXI8x16MinU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpminub(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI8x16MaxU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pmaxub(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXI8x16MaxU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpmaxub(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
       break;
     }
     case kSSEI8x16GtU: {
       // Unsigned byte-wise lhs > rhs, computed as NOT(lhs <= rhs) where
       // lhs <= rhs  <=>  max(lhs, rhs) == rhs.
       XMMRegister lhs = i.OutputSimd128Register();
       DCHECK_EQ(lhs, i.InputSimd128Register(0));
       Operand rhs = i.InputOperand(1);
       __ pmaxub(lhs, rhs);
       __ pcmpeqb(lhs, rhs);
       // Invert the "less or equal" mask with an all-ones scratch register.
       __ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg);
       __ pxor(lhs, kScratchDoubleReg);
       break;
     }
     case kAVXI8x16GtU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src1 = i.InputSimd128Register(0);
       Operand src2 = i.InputOperand(1);
       __ vpmaxub(kScratchDoubleReg, src1, src2);
       __ vpcmpeqb(dst, kScratchDoubleReg, src2);
       __ vpcmpeqb(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vpxor(dst, dst, kScratchDoubleReg);
       break;
     }
     case kSSEI8x16GeU: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       XMMRegister dst = i.OutputSimd128Register();
       Operand src = i.InputOperand(1);
       __ pminub(dst, src);
       __ pcmpeqb(dst, src);
       break;
     }
     case kAVXI8x16GeU: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister src1 = i.InputSimd128Register(0);
       Operand src2 = i.InputOperand(1);
       __ vpminub(kScratchDoubleReg, src1, src2);
       __ vpcmpeqb(i.OutputSimd128Register(), kScratchDoubleReg, src2);
       break;
     }
     case kIA32I8x16RoundingAverageU: {
       __ Pavgb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
       break;
     }
     case kIA32I8x16Abs: {
       __ Pabsb(i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;
     }
     case kIA32I8x16BitMask: {
       __ Pmovmskb(i.OutputRegister(), i.InputSimd128Register(0));
       break;
     }
     case kIA32S128Const: {
       // Materializes a 128-bit constant given as four 32-bit immediates
       // (inputs 0..3, input 0 being the lowest dword).
       XMMRegister result = i.OutputSimd128Register();
       Register gp_tmp = i.TempRegister(0);
       // Inputs 0 and 1 form the low 64 bits and are loaded in one go.
       uint64_t lower_half = make_uint64(i.InputUint32(1), i.InputUint32(0));
       __ Move(result, lower_half);
       // Dwords 2 and 3 are inserted one at a time through the GP temp.
       __ Move(gp_tmp, Immediate(i.InputUint32(2)));
       __ Pinsrd(result, gp_tmp, 2);
       __ Move(gp_tmp, Immediate(i.InputUint32(3)));
       __ Pinsrd(result, gp_tmp, 3);
       break;
     }
     case kIA32S128Zero: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Pxor(dst, dst);
       break;
     }
     case kIA32S128AllOnes: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Pcmpeqd(dst, dst);
       break;
     }
     case kSSES128Not: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
       __ pxor(dst, kScratchDoubleReg);
       break;
     }
     case kAVXS128Not: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       __ vpxor(i.OutputSimd128Register(), kScratchDoubleReg, i.InputOperand(0));
       break;
     }
     case kSSES128And: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pand(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXS128And: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpand(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
       break;
     }
     case kSSES128Or: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ por(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXS128Or: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpor(i.OutputSimd128Register(), i.InputSimd128Register(0),
               i.InputOperand(1));
       break;
     }
     case kSSES128Xor: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pxor(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXS128Xor: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       __ vpxor(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
       break;
     }
     case kSSES128Select: {
       // Bitwise select: each result bit comes from input 1 where the mask bit
       // is set and from input 2 where it is clear. Implemented as
       //   result = ((in1 ^ in2) & mask) ^ in2
       // The mask arrives in the output register and is overwritten.
       XMMRegister mask_and_result = i.OutputSimd128Register();
       DCHECK_EQ(mask_and_result, i.InputSimd128Register(0));
       XMMRegister if_set = i.InputSimd128Register(1);
       XMMRegister if_clear = i.InputSimd128Register(2);
       __ movaps(kScratchDoubleReg, if_set);
       __ xorps(kScratchDoubleReg, if_clear);
       __ andps(mask_and_result, kScratchDoubleReg);
       __ xorps(mask_and_result, if_clear);
       break;
     }
     case kAVXS128Select: {
       // AVX bitwise select: result = ((in1 ^ in2) & mask) ^ in2, i.e. bits of
       // input 1 where the mask (input 0) is set, bits of input 2 elsewhere.
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister result = i.OutputSimd128Register();
       Operand mask = i.InputOperand(0);
       Operand if_set = i.InputOperand(1);
       XMMRegister if_clear = i.InputSimd128Register(2);
       __ vxorps(kScratchDoubleReg, if_clear, if_set);
       __ vandps(kScratchDoubleReg, kScratchDoubleReg, mask);
       __ vxorps(result, kScratchDoubleReg, if_clear);
       break;
     }
     case kIA32S128AndNot: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       // The inputs have been inverted by instruction selector, so we can call
       // andnps here without any modifications.
       XMMRegister src1 = i.InputSimd128Register(1);
       __ Andnps(dst, src1);
       break;
     }
     case kIA32I8x16Swizzle: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister mask = i.TempSimd128Register(0);

       // Out-of-range indices should return 0, add 112 so that any value > 15
       // saturates to 128 (top bit set), so pshufb will zero that lane.
       __ Move(mask, uint32_t{0x70707070});
       __ Pshufd(mask, mask, 0x0);
       __ Paddusb(mask, i.InputSimd128Register(1));
       __ Pshufb(dst, mask);
       break;
     }
     case kIA32I8x16Shuffle: {
       // General byte shuffle via pshufb. The 16-byte control mask is given as
       // four 32-bit immediates (the last four inputs); it is materialized on
       // the machine stack in a temporarily 16-byte-aligned area and used as a
       // memory operand.
       // NOTE(review): src0/src1 are read while esp is modified and no SP delta
       // is recorded here; this presumably relies on the operands not being
       // esp-relative stack slots -- confirm against the instruction selector.
       XMMRegister dst = i.OutputSimd128Register();
       Operand src0 = i.InputOperand(0);
       Register tmp = i.TempRegister(0);
       // Prepare 16 byte aligned buffer for shuffle control mask
       // (tmp keeps the original esp so it can be restored at the end).
       __ mov(tmp, esp);
       __ and_(esp, -16);
       if (instr->InputCount() == 5) {  // only one input operand
         DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
         // Push the mask words last-to-first so that input 1 ends up at the
         // lowest address, i.e. at [esp].
         for (int j = 4; j > 0; j--) {
           uint32_t mask = i.InputUint32(j);
           __ push(Immediate(mask));
         }
         __ Pshufb(dst, Operand(esp, 0));
       } else {  // two input operands
         DCHECK_EQ(6, instr->InputCount());
         // First pass: pick the lanes that come from src0 (indices 0..15);
         // every other lane gets 0x80, which makes pshufb write zero.
         __ movups(kScratchDoubleReg, src0);
         for (int j = 5; j > 1; j--) {
           uint32_t lanes = i.InputUint32(j);
           uint32_t mask = 0;
           for (int k = 0; k < 32; k += 8) {
             uint8_t lane = lanes >> k;
             mask |= (lane < kSimd128Size ? lane : 0x80) << k;
           }
           __ push(Immediate(mask));
         }
         __ Pshufb(kScratchDoubleReg, Operand(esp, 0));
         // Second pass: pick the lanes that come from src1 (indices 16..31,
         // reduced to 0..15); lanes belonging to src0 are zeroed. The second
         // mask is pushed below the first one, so esp stays 16-byte aligned.
         Operand src1 = i.InputOperand(1);
         if (!src1.is_reg(dst)) __ movups(dst, src1);
         for (int j = 5; j > 1; j--) {
           uint32_t lanes = i.InputUint32(j);
           uint32_t mask = 0;
           for (int k = 0; k < 32; k += 8) {
             uint8_t lane = lanes >> k;
             mask |= (lane >= kSimd128Size ? (lane & 0xF) : 0x80) << k;
           }
           __ push(Immediate(mask));
         }
         __ Pshufb(dst, Operand(esp, 0));
         // The two partial results have disjoint non-zero lanes; merge them.
         __ por(dst, kScratchDoubleReg);
       }
       // Drop the mask buffer(s) and undo the alignment in one step.
       __ mov(esp, tmp);
       break;
     }
     case kIA32S128Load8Splat: {
       __ Pinsrb(i.OutputSimd128Register(), i.MemoryOperand(), 0);
       __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
       __ Pshufb(i.OutputSimd128Register(), kScratchDoubleReg);
       break;
     }
     case kIA32S128Load16Splat: {
       __ Pinsrw(i.OutputSimd128Register(), i.MemoryOperand(), 0);
       __ Pshuflw(i.OutputSimd128Register(), i.OutputSimd128Register(),
                  uint8_t{0});
       __ Punpcklqdq(i.OutputSimd128Register(), i.OutputSimd128Register());
       break;
     }
     case kIA32S128Load32Splat: {
       __ Vbroadcastss(i.OutputSimd128Register(), i.MemoryOperand());
       break;
     }
     case kIA32S128Load64Splat: {
       __ Movddup(i.OutputSimd128Register(), i.MemoryOperand());
       break;
     }
     case kIA32S128Load8x8S: {
       __ Pmovsxbw(i.OutputSimd128Register(), i.MemoryOperand());
       break;
     }
     case kIA32S128Load8x8U: {
       __ Pmovzxbw(i.OutputSimd128Register(), i.MemoryOperand());
       break;
     }
     case kIA32S128Load16x4S: {
       __ Pmovsxwd(i.OutputSimd128Register(), i.MemoryOperand());
       break;
     }
     case kIA32S128Load16x4U: {
       __ Pmovzxwd(i.OutputSimd128Register(), i.MemoryOperand());
       break;
     }
     case kIA32S128Load32x2S: {
       __ Pmovsxdq(i.OutputSimd128Register(), i.MemoryOperand());
       break;
     }
     case kIA32S128Load32x2U: {
       __ Pmovzxdq(i.OutputSimd128Register(), i.MemoryOperand());
       break;
     }
     case kIA32S32x4Swizzle: {
       DCHECK_EQ(2, instr->InputCount());
       __ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(1));
       break;
     }
     case kIA32S32x4Shuffle: {
       // Two-input shuffle of 32-bit lanes: the same pshufd pattern (input 2)
       // is applied to both sources, then pblendw (mask in input 3) picks, per
       // 16-bit word, which of the two shuffled vectors supplies the result.
       DCHECK_EQ(4, instr->InputCount());  // Swizzles should be handled above.
       int8_t shuffle = i.InputInt8(2);
       // A simple blend should be handled below. 0xe4 is the identity pshufd
       // pattern; compare the unsigned byte value, because 0xe4 is not
       // representable in int8_t and a signed comparison could never fail.
       DCHECK_NE(0xe4, static_cast<uint8_t>(shuffle));
       __ Pshufd(kScratchDoubleReg, i.InputOperand(1), shuffle);
       __ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), shuffle);
       __ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputInt8(3));
       break;
     }
     case kIA32S16x8Blend:
       ASSEMBLE_SIMD_IMM_SHUFFLE(pblendw, SSE4_1, i.InputInt8(2));
       break;
     case kIA32S16x8HalfShuffle1: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Pshuflw(dst, i.InputOperand(0), i.InputInt8(1));
       __ Pshufhw(dst, dst, i.InputInt8(2));
       break;
     }
     case kIA32S16x8HalfShuffle2: {
       XMMRegister dst = i.OutputSimd128Register();
       __ Pshuflw(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2));
       __ Pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputInt8(3));
       __ Pshuflw(dst, i.InputOperand(0), i.InputInt8(2));
       __ Pshufhw(dst, dst, i.InputInt8(3));
       __ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4));
       break;
     }
     case kIA32S8x16Alignr:
       ASSEMBLE_SIMD_IMM_SHUFFLE(palignr, SSSE3, i.InputInt8(2));
       break;
     case kIA32S16x8Dup: {
       // Broadcasts one 16-bit lane (index in input 1, taken modulo 8) of
       // input 0 to all eight lanes of the output.
       XMMRegister dst = i.OutputSimd128Register();
       Operand src = i.InputOperand(0);
       int8_t lane = i.InputInt8(1) & 0x7;
       // Position of the lane within its 64-bit half.
       int8_t lane4 = lane & 0x3;
       // Shuffle immediate with lane4 in all four 2-bit selector fields.
       int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
       if (lane < 4) {
         // Replicate the word across the low half, then broadcast dword 0.
         __ Pshuflw(dst, src, half_dup);
         __ Pshufd(dst, dst, 0);
       } else {
         // Replicate the word across the high half, then broadcast dword 2
         // (0xaa selects dword 2 in every position).
         __ Pshufhw(dst, src, half_dup);
         __ Pshufd(dst, dst, 0xaa);
       }
       break;
     }
     case kIA32S8x16Dup: {
       // Broadcasts one byte lane (index in input 1, taken modulo 16) of
       // input 0 to all sixteen lanes. The byte is first doubled into a 16-bit
       // word by unpacking the source with itself, after which the word is
       // broadcast with word/dword shuffles.
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(0);
       int8_t lane = i.InputInt8(1) & 0xf;
       if (CpuFeatures::IsSupported(AVX)) {
         CpuFeatureScope avx_scope(tasm(), AVX);
         // Three-operand form: dst need not alias src.
         if (lane < 8) {
           __ vpunpcklbw(dst, src, src);
         } else {
           __ vpunpckhbw(dst, src, src);
         }
       } else {
         // Two-operand SSE form works in place.
         DCHECK_EQ(dst, src);
         if (lane < 8) {
           __ punpcklbw(dst, dst);
         } else {
           __ punpckhbw(dst, dst);
         }
       }
       // After unpacking, byte (lane % 8) of the chosen half fills word
       // (lane % 8) of dst.
       lane &= 0x7;
       int8_t lane4 = lane & 0x3;
       // Shuffle immediate with lane4 in all four 2-bit selector fields.
       int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
       if (lane < 4) {
         // Replicate the word across the low half, then broadcast dword 0.
         __ Pshuflw(dst, dst, half_dup);
         __ Pshufd(dst, dst, 0);
       } else {
         // Replicate the word across the high half, then broadcast dword 2.
         __ Pshufhw(dst, dst, half_dup);
         __ Pshufd(dst, dst, 0xaa);
       }
       break;
     }
     case kIA32S64x2UnpackHigh:
       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhqdq);
       break;
     case kIA32S32x4UnpackHigh:
       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhdq);
       break;
     case kIA32S16x8UnpackHigh:
       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhwd);
       break;
     case kIA32S8x16UnpackHigh:
       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhbw);
       break;
     case kIA32S64x2UnpackLow:
       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklqdq);
       break;
     case kIA32S32x4UnpackLow:
       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckldq);
       break;
     case kIA32S16x8UnpackLow:
       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklwd);
       break;
     case kIA32S8x16UnpackLow:
       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklbw);
       break;
     case kSSES16x8UnzipHigh: {
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src2 = dst;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (instr->InputCount() == 2) {
         __ movups(kScratchDoubleReg, i.InputOperand(1));
         __ psrld(kScratchDoubleReg, 16);
         src2 = kScratchDoubleReg;
       }
       __ psrld(dst, 16);
       __ packusdw(dst, src2);
       break;
     }
     case kAVXS16x8UnzipHigh: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src2 = dst;
       if (instr->InputCount() == 2) {
         __ vpsrld(kScratchDoubleReg, i.InputSimd128Register(1), 16);
         src2 = kScratchDoubleReg;
       }
       __ vpsrld(dst, i.InputSimd128Register(0), 16);
       __ vpackusdw(dst, dst, src2);
       break;
     }
     case kSSES16x8UnzipLow: {
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src2 = dst;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ pxor(kScratchDoubleReg, kScratchDoubleReg);
       if (instr->InputCount() == 2) {
         __ pblendw(kScratchDoubleReg, i.InputOperand(1), 0x55);
         src2 = kScratchDoubleReg;
       }
       __ pblendw(dst, kScratchDoubleReg, 0xaa);
       __ packusdw(dst, src2);
       break;
     }
     case kAVXS16x8UnzipLow: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src2 = dst;
       __ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
       if (instr->InputCount() == 2) {
         __ vpblendw(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(1),
                     0x55);
         src2 = kScratchDoubleReg;
       }
       __ vpblendw(dst, kScratchDoubleReg, i.InputSimd128Register(0), 0x55);
       __ vpackusdw(dst, dst, src2);
       break;
     }
     case kSSES8x16UnzipHigh: {
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src2 = dst;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (instr->InputCount() == 2) {
         __ movups(kScratchDoubleReg, i.InputOperand(1));
         __ psrlw(kScratchDoubleReg, 8);
         src2 = kScratchDoubleReg;
       }
       __ psrlw(dst, 8);
       __ packuswb(dst, src2);
       break;
     }
     case kAVXS8x16UnzipHigh: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src2 = dst;
       if (instr->InputCount() == 2) {
         __ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
         src2 = kScratchDoubleReg;
       }
       __ vpsrlw(dst, i.InputSimd128Register(0), 8);
       __ vpackuswb(dst, dst, src2);
       break;
     }
     case kSSES8x16UnzipLow: {
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src2 = dst;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (instr->InputCount() == 2) {
         __ movups(kScratchDoubleReg, i.InputOperand(1));
         __ psllw(kScratchDoubleReg, 8);
         __ psrlw(kScratchDoubleReg, 8);
         src2 = kScratchDoubleReg;
       }
       __ psllw(dst, 8);
       __ psrlw(dst, 8);
       __ packuswb(dst, src2);
       break;
     }
     case kAVXS8x16UnzipLow: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src2 = dst;
       if (instr->InputCount() == 2) {
         __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
         __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 8);
         src2 = kScratchDoubleReg;
       }
       __ vpsllw(dst, i.InputSimd128Register(0), 8);
       __ vpsrlw(dst, dst, 8);
       __ vpackuswb(dst, dst, src2);
       break;
     }
     case kSSES8x16TransposeLow: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ psllw(dst, 8);
       if (instr->InputCount() == 1) {
         __ movups(kScratchDoubleReg, dst);
       } else {
         DCHECK_EQ(2, instr->InputCount());
         __ movups(kScratchDoubleReg, i.InputOperand(1));
         __ psllw(kScratchDoubleReg, 8);
       }
       __ psrlw(dst, 8);
       __ por(dst, kScratchDoubleReg);
       break;
     }
     case kAVXS8x16TransposeLow: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       if (instr->InputCount() == 1) {
         __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(0), 8);
         __ vpsrlw(dst, kScratchDoubleReg, 8);
       } else {
         DCHECK_EQ(2, instr->InputCount());
         __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
         __ vpsllw(dst, i.InputSimd128Register(0), 8);
         __ vpsrlw(dst, dst, 8);
       }
       __ vpor(dst, dst, kScratchDoubleReg);
       break;
     }
     case kSSES8x16TransposeHigh: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ psrlw(dst, 8);
       if (instr->InputCount() == 1) {
         __ movups(kScratchDoubleReg, dst);
       } else {
         DCHECK_EQ(2, instr->InputCount());
         __ movups(kScratchDoubleReg, i.InputOperand(1));
         __ psrlw(kScratchDoubleReg, 8);
       }
       __ psllw(kScratchDoubleReg, 8);
       __ por(dst, kScratchDoubleReg);
       break;
     }
     case kAVXS8x16TransposeHigh: {
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       if (instr->InputCount() == 1) {
         __ vpsrlw(dst, i.InputSimd128Register(0), 8);
         __ vpsllw(kScratchDoubleReg, dst, 8);
       } else {
         DCHECK_EQ(2, instr->InputCount());
         __ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
         __ vpsrlw(dst, i.InputSimd128Register(0), 8);
         __ vpsllw(kScratchDoubleReg, kScratchDoubleReg, 8);
       }
       __ vpor(dst, dst, kScratchDoubleReg);
       break;
     }
     // Reverses the byte order inside every group of 8, 4 or 2 bytes (SSE,
     // in place). Done in two stages: reorder the 16-bit words within the
     // group, then swap the two bytes of every word.
     case kSSES8x8Reverse:
     case kSSES8x4Reverse:
     case kSSES8x2Reverse: {
       DCHECK_EQ(1, instr->InputCount());
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (arch_opcode != kSSES8x2Reverse) {
         // First shuffle words into position.
         // 0xB1 swaps adjacent words (4-byte groups); 0x1B reverses all four
         // words of a 64-bit half (8-byte groups).
         int8_t shuffle_mask = arch_opcode == kSSES8x4Reverse ? 0xB1 : 0x1B;
         __ pshuflw(dst, dst, shuffle_mask);
         __ pshufhw(dst, dst, shuffle_mask);
       }
       // Swap the bytes of each 16-bit lane: (x >> 8) | (x << 8).
       __ movaps(kScratchDoubleReg, dst);
       __ psrlw(kScratchDoubleReg, 8);
       __ psllw(dst, 8);
       __ por(dst, kScratchDoubleReg);
       break;
     }
     // AVX variant of the byte-group reversal: reorder the 16-bit words within
     // each 4- or 8-byte group (skipped for 2-byte groups), then swap the two
     // bytes of every word.
     case kAVXS8x2Reverse:
     case kAVXS8x4Reverse:
     case kAVXS8x8Reverse: {
       DCHECK_EQ(1, instr->InputCount());
       CpuFeatureScope avx_scope(tasm(), AVX);
       XMMRegister dst = i.OutputSimd128Register();
       // Source of the final byte swap: dst if the words were shuffled into
       // it first, otherwise the untouched input register.
       XMMRegister src = dst;
       if (arch_opcode != kAVXS8x2Reverse) {
         // First shuffle words into position.
         // 0xB1 swaps adjacent words (4-byte groups); 0x1B reverses all four
         // words of a 64-bit half (8-byte groups).
         int8_t shuffle_mask = arch_opcode == kAVXS8x4Reverse ? 0xB1 : 0x1B;
         __ vpshuflw(dst, i.InputOperand(0), shuffle_mask);
         __ vpshufhw(dst, dst, shuffle_mask);
       } else {
         src = i.InputSimd128Register(0);
       }
       // Reverse each 16 bit lane.
       __ vpsrlw(kScratchDoubleReg, src, 8);
       __ vpsllw(dst, src, 8);
       __ vpor(dst, dst, kScratchDoubleReg);
       break;
     }
     // "Any lane non-zero" does not depend on the lane width, so all three
     // shapes share one implementation: result is 1 unless the whole vector
     // is zero.
     case kIA32V32x4AnyTrue:
     case kIA32V16x8AnyTrue:
     case kIA32V8x16AnyTrue: {
       Register result = i.OutputRegister();
       XMMRegister vec = i.InputSimd128Register(0);
       Register zeroed = i.TempRegister(0);
       __ xor_(zeroed, zeroed);
       // Optimistically assume some bit is set.
       __ mov(result, Immediate(1));
       // Ptest sets ZF when (vec & vec) == 0, i.e. when every bit is clear;
       // in that case replace the result with 0.
       __ Ptest(vec, vec);
       __ cmov(zero, result, zeroed);
       break;
     }
     // Need to split up all the different lane structures because the
     // comparison instruction used matters, e.g. given 0xff00, pcmpeqb returns
     // 0x0011, pcmpeqw returns 0x0000, ptest will set ZF to 0 and 1
     // respectively.
     case kIA32V32x4AllTrue:
       ASSEMBLE_SIMD_ALL_TRUE(Pcmpeqd);
       break;
     case kIA32V16x8AllTrue:
       ASSEMBLE_SIMD_ALL_TRUE(pcmpeqw);
       break;
     case kIA32V8x16AllTrue: {
       ASSEMBLE_SIMD_ALL_TRUE(pcmpeqb);
       break;
     }
     case kIA32Word32AtomicPairLoad: {
       XMMRegister tmp = i.ToDoubleRegister(instr->TempAt(0));
       __ movq(tmp, i.MemoryOperand());
       __ Pextrd(i.OutputRegister(0), tmp, 0);
       __ Pextrd(i.OutputRegister(1), tmp, 1);
       break;
     }
     case kIA32Word32AtomicPairStore: {
       // 64-bit atomic store implemented as a lock cmpxchg8b retry loop: read
       // the current memory value, then try to replace it with the new value
       // until no other writer interferes.
       Label store;
       __ bind(&store);
       // Load the current 64-bit value as the comparand. cmpxchg8b compares
       // against edx:eax, so the temps are presumably pinned to eax/edx by the
       // instruction selector -- confirm there.
       __ mov(i.TempRegister(0), i.MemoryOperand(2));
       __ mov(i.TempRegister(1), i.NextMemoryOperand(2));
       // ebx is saved and temporarily loaded with input 0 (cmpxchg8b takes the
       // low word of the new value from ebx; the high word presumably already
       // sits in ecx). The SP delta is bumped while the extra word is on the
       // stack.
       __ push(ebx);
       frame_access_state()->IncreaseSPDelta(1);
       i.MoveInstructionOperandToRegister(ebx, instr->InputAt(0));
       __ lock();
       __ cmpxchg8b(i.MemoryOperand(2));
       // pop leaves the flags untouched, so ZF below is still cmpxchg8b's.
       __ pop(ebx);
       frame_access_state()->IncreaseSPDelta(-1);
       // ZF clear: memory changed since it was read -- retry.
       __ j(not_equal, &store);
       break;
     }
     case kWord32AtomicExchangeInt8: {
       __ xchg_b(i.InputRegister(0), i.MemoryOperand(1));
       __ movsx_b(i.InputRegister(0), i.InputRegister(0));
       break;
     }
     case kWord32AtomicExchangeUint8: {
       __ xchg_b(i.InputRegister(0), i.MemoryOperand(1));
       __ movzx_b(i.InputRegister(0), i.InputRegister(0));
       break;
     }
     case kWord32AtomicExchangeInt16: {
       __ xchg_w(i.InputRegister(0), i.MemoryOperand(1));
       __ movsx_w(i.InputRegister(0), i.InputRegister(0));
       break;
     }
     case kWord32AtomicExchangeUint16: {
       __ xchg_w(i.InputRegister(0), i.MemoryOperand(1));
       __ movzx_w(i.InputRegister(0), i.InputRegister(0));
       break;
     }
     case kWord32AtomicExchangeWord32: {
       __ xchg(i.InputRegister(0), i.MemoryOperand(1));
       break;
     }
     case kIA32Word32AtomicPairExchange: {
       // 64-bit atomic exchange as a lock cmpxchg8b retry loop. On success
       // edx:eax still holds the value that was read from memory before the
       // swap.
       DCHECK(VerifyOutputOfAtomicPairInstr(&i, instr));
       Label exchange;
       __ bind(&exchange);
       // Read the current value into edx:eax to use as the comparand.
       __ mov(eax, i.MemoryOperand(2));
       __ mov(edx, i.NextMemoryOperand(2));
       // Save ebx and load input 0 into it (low word of the new value for
       // cmpxchg8b; the high word is presumably already in ecx -- confirm in
       // the instruction selector). The SP delta tracks the extra stack word.
       __ push(ebx);
       frame_access_state()->IncreaseSPDelta(1);
       i.MoveInstructionOperandToRegister(ebx, instr->InputAt(0));
       __ lock();
       __ cmpxchg8b(i.MemoryOperand(2));
       // pop does not modify flags, so the branch tests cmpxchg8b's ZF.
       __ pop(ebx);
       frame_access_state()->IncreaseSPDelta(-1);
       // Retry if memory changed between the read and the cmpxchg8b.
       __ j(not_equal, &exchange);
       break;
     }
     case kWord32AtomicCompareExchangeInt8: {
       __ lock();
       __ cmpxchg_b(i.MemoryOperand(2), i.InputRegister(1));
       __ movsx_b(eax, eax);
       break;
     }
     case kWord32AtomicCompareExchangeUint8: {
       __ lock();
       __ cmpxchg_b(i.MemoryOperand(2), i.InputRegister(1));
       __ movzx_b(eax, eax);
       break;
     }
     case kWord32AtomicCompareExchangeInt16: {
       __ lock();
       __ cmpxchg_w(i.MemoryOperand(2), i.InputRegister(1));
       __ movsx_w(eax, eax);
       break;
     }
     case kWord32AtomicCompareExchangeUint16: {
       __ lock();
       __ cmpxchg_w(i.MemoryOperand(2), i.InputRegister(1));
       __ movzx_w(eax, eax);
       break;
     }
     case kWord32AtomicCompareExchangeWord32: {
       __ lock();
       __ cmpxchg(i.MemoryOperand(2), i.InputRegister(1));
       break;
     }
     case kIA32Word32AtomicPairCompareExchange: {
       // 64-bit atomic compare-exchange: a single lock cmpxchg8b, no retry
       // loop. cmpxchg8b compares memory with edx:eax and, if equal, stores
       // ecx:ebx; the expected value and the new high word are presumably
       // placed in edx:eax and ecx by the instruction selector -- confirm
       // there.
       // ebx is saved and loaded with input 2 just for the duration of the
       // instruction; the SP delta tracks the extra stack word.
       __ push(ebx);
       frame_access_state()->IncreaseSPDelta(1);
       i.MoveInstructionOperandToRegister(ebx, instr->InputAt(2));
       __ lock();
       __ cmpxchg8b(i.MemoryOperand(4));
       __ pop(ebx);
       frame_access_state()->IncreaseSPDelta(-1);
       break;
     }
 // Generates the five width variants (Int8, Uint8, Int16, Uint16, Word32) of
 // a 32-bit atomic read-modify-write case. Each variant runs
 // ASSEMBLE_ATOMIC_BINOP with the mov/cmpxchg flavour of the matching width;
 // the sub-word variants then sign- or zero-extend eax to 32 bits.
 #define ATOMIC_BINOP_CASE(op, inst)                \
   case kWord32Atomic##op##Int8: {                  \
     ASSEMBLE_ATOMIC_BINOP(inst, mov_b, cmpxchg_b); \
     __ movsx_b(eax, eax);                          \
     break;                                         \
   }                                                \
   case kWord32Atomic##op##Uint8: {                 \
     ASSEMBLE_ATOMIC_BINOP(inst, mov_b, cmpxchg_b); \
     __ movzx_b(eax, eax);                          \
     break;                                         \
   }                                                \
   case kWord32Atomic##op##Int16: {                 \
     ASSEMBLE_ATOMIC_BINOP(inst, mov_w, cmpxchg_w); \
     __ movsx_w(eax, eax);                          \
     break;                                         \
   }                                                \
   case kWord32Atomic##op##Uint16: {                \
     ASSEMBLE_ATOMIC_BINOP(inst, mov_w, cmpxchg_w); \
     __ movzx_w(eax, eax);                          \
     break;                                         \
   }                                                \
   case kWord32Atomic##op##Word32: {                \
     ASSEMBLE_ATOMIC_BINOP(inst, mov, cmpxchg);     \
     break;                                         \
   }
       ATOMIC_BINOP_CASE(Add, add)
       ATOMIC_BINOP_CASE(Sub, sub)
       ATOMIC_BINOP_CASE(And, and_)
       ATOMIC_BINOP_CASE(Or, or_)
       ATOMIC_BINOP_CASE(Xor, xor_)
 #undef ATOMIC_BINOP_CASE
 // Generates one 64-bit (register pair) atomic read-modify-write case.
 // instr1 and instr2 are forwarded to ASSEMBLE_I64ATOMIC_BINOP; for Add the
 // second one is adc, which suggests instr1 handles the low word and instr2
 // the high word. Sub is not instantiated here; it has a dedicated case.
 #define ATOMIC_BINOP_CASE(op, instr1, instr2)         \
   case kIA32Word32AtomicPair##op: {                   \
     DCHECK(VerifyOutputOfAtomicPairInstr(&i, instr)); \
     ASSEMBLE_I64ATOMIC_BINOP(instr1, instr2)          \
     break;                                            \
   }
       ATOMIC_BINOP_CASE(Add, add, adc)
       ATOMIC_BINOP_CASE(And, and_, and_)
       ATOMIC_BINOP_CASE(Or, or_, or_)
       ATOMIC_BINOP_CASE(Xor, xor_, xor_)
 #undef ATOMIC_BINOP_CASE
     case kIA32Word32AtomicPairSub: {
       DCHECK(VerifyOutputOfAtomicPairInstr(&i, instr));
       Label binop;
       __ bind(&binop);
       // Move memory operand into edx:eax
       __ mov(eax, i.MemoryOperand(2));
       __ mov(edx, i.NextMemoryOperand(2));
       // Save input registers temporarily on the stack.
       __ push(ebx);
       frame_access_state()->IncreaseSPDelta(1);
       i.MoveInstructionOperandToRegister(ebx, instr->InputAt(0));
       __ push(i.InputRegister(1));
       // Negate input in place
       __ neg(ebx);
       __ adc(i.InputRegister(1), 0);
       __ neg(i.InputRegister(1));
       // Add memory operand, negated input.
       __ add(ebx, eax);
       __ adc(i.InputRegister(1), edx);
       __ lock();
       __ cmpxchg8b(i.MemoryOperand(2));
       // Restore input registers
       __ pop(i.InputRegister(1));
       __ pop(ebx);
       frame_access_state()->IncreaseSPDelta(-1);
       __ j(not_equal, &binop);
       break;
     }
     case kWord32AtomicLoadInt8:
     case kWord32AtomicLoadUint8:
     case kWord32AtomicLoadInt16:
     case kWord32AtomicLoadUint16:
     case kWord32AtomicLoadWord32:
     case kWord32AtomicStoreWord8:
     case kWord32AtomicStoreWord16:
     case kWord32AtomicStoreWord32:
       UNREACHABLE();  // Won't be generated by instruction selector.
       break;
   }
   return kSuccess;
 }  // NOLINT(readability/fn_size)

 // Translates a machine-independent FlagsCondition into the IA-32 condition
 // code passed to the conditional jump / setcc emitters in this file. The
 // kUnordered* conditions map to the same code as their plain counterparts.
 // Any condition that is not listed ends in UNREACHABLE().
 static Condition FlagsConditionToCondition(FlagsCondition condition) {
   switch (condition) {
     // Equality.
     case kEqual:
     case kUnorderedEqual:
       return equal;
     case kNotEqual:
     case kUnorderedNotEqual:
       return not_equal;

     // Signed comparisons.
     case kSignedLessThan:
       return less;
     case kSignedGreaterThanOrEqual:
       return greater_equal;
     case kSignedLessThanOrEqual:
       return less_equal;
     case kSignedGreaterThan:
       return greater;

     // Unsigned comparisons.
     case kUnsignedLessThan:
       return below;
     case kUnsignedGreaterThanOrEqual:
       return above_equal;
     case kUnsignedLessThanOrEqual:
       return below_equal;
     case kUnsignedGreaterThan:
       return above;

     // Overflow flag.
     case kOverflow:
       return overflow;
     case kNotOverflow:
       return no_overflow;

     default:
       break;
   }
   UNREACHABLE();
 }

 // Assembles a branch after an instruction: emits the conditional jump(s) that
 // transfer control to the true or false label of |branch| based on the flags.
 void CodeGenerator::AssembleArchBranch(Instruction* instr, BranchInfo* branch) {
   Label* const if_true = branch->true_label;
   Label* const if_false = branch->false_label;

   // The two "unordered" conditions test the parity flag first (presumably set
   // by an unordered floating-point compare) and route that case explicitly
   // before the regular condition is examined.
   switch (branch->condition) {
     case kUnorderedEqual:
       // The false label is only known to be close when we fall through to it.
       __ j(parity_even, if_false,
            branch->fallthru ? Label::kNear : Label::kFar);
       break;
     case kUnorderedNotEqual:
       __ j(parity_even, if_true);
       break;
     default:
       break;
   }
   __ j(FlagsConditionToCondition(branch->condition), if_true);

   // When the false block does not directly follow, jump to it explicitly.
   if (branch->fallthru) return;
   __ jmp(if_false);
 }

 // Branch poisoning is not implemented by this backend: both parameters are
 // ignored and any call ends in UNREACHABLE().
 void CodeGenerator::AssembleBranchPoisoning(FlagsCondition condition,
                                             Instruction* instr) {
   // TODO(860429): Remove remaining poisoning infrastructure on ia32.
   UNREACHABLE();
 }

 // Deoptimization branches get no special treatment on this architecture; they
 // are emitted exactly like ordinary branches.
 void CodeGenerator::AssembleArchDeoptBranch(Instruction* instr,
                                             BranchInfo* branch) {
   AssembleArchBranch(instr, branch);
 }

 // Emits an unconditional jump to the block |target|. Nothing is emitted when
 // that block is next in assembly order, so control simply falls through.
 void CodeGenerator::AssembleArchJump(RpoNumber target) {
   if (IsNextInAssemblyOrder(target)) return;
   __ jmp(GetLabel(target));
 }

 // Emits a conditional jump to out-of-line code that raises a trap when
 // |condition| holds. The trap id is read from the last input of |instr|.
 void CodeGenerator::AssembleArchTrap(Instruction* instr,
                                      FlagsCondition condition) {
   class OutOfLineTrap final : public OutOfLineCode {
    public:
     OutOfLineTrap(CodeGenerator* gen, Instruction* instr)
         : OutOfLineCode(gen), instr_(instr), gen_(gen) {}

     void Generate() final {
       IA32OperandConverter operands(gen_, instr_);
       // The trap id is carried by the instruction's final input.
       const size_t id_index = instr_->InputCount() - 1;
       GenerateCallToTrap(static_cast<TrapId>(operands.InputInt32(id_index)));
     }

    private:
     void GenerateCallToTrap(TrapId trap_id) {
       if (trap_id != TrapId::kInvalid) {
         gen_->AssembleSourcePosition(instr_);
         // A direct call to a wasm runtime stub defined in this module.
         // Only the stub index is encoded here; it is patched when the code
         // is added to the native module and copied into wasm code space.
         __ wasm_call(static_cast<Address>(trap_id), RelocInfo::WASM_STUB_CALL);
         ReferenceMap* const safepoint_refs =
             gen_->zone()->New<ReferenceMap>(gen_->zone());
         gen_->RecordSafepoint(safepoint_refs, Safepoint::kNoLazyDeopt);
         __ AssertUnreachable(AbortReason::kUnexpectedReturnFromWasmTrap);
         return;
       }

       // TrapId::kInvalid: calls to the runtime cannot be tested in
       // cctest/test-run-wasm, so a call to C is emitted instead of a call to
       // the runtime.
       __ PrepareCallCFunction(0, esi);
       __ CallCFunction(
           ExternalReference::wasm_call_trap_callback_for_testing(), 0);
       __ LeaveFrame(StackFrame::WASM);
       const size_t pop_size =
           gen_->linkage()->GetIncomingDescriptor()->StackParameterCount() *
           kSystemPointerSize;
       // ecx serves as scratch register; we return immediately anyway.
       __ Ret(static_cast<int>(pop_size), ecx);
     }

     Instruction* instr_;
     CodeGenerator* gen_;
   };

   OutOfLineTrap* const trap_code = zone()->New<OutOfLineTrap>(this, instr);
   Label* const trap_entry = trap_code->entry();
   Label no_trap;

   // The "unordered" conditions consult the parity flag before the regular
   // condition: kUnorderedEqual skips the trap, kUnorderedNotEqual takes it.
   switch (condition) {
     case kUnorderedEqual:
       __ j(parity_even, &no_trap, Label::kNear);
       break;
     case kUnorderedNotEqual:
       __ j(parity_even, trap_entry);
       break;
     default:
       break;
   }
   __ j(FlagsConditionToCondition(condition), trap_entry);
   __ bind(&no_trap);
 }

 // Assembles boolean materializations after an instruction: writes a full
 // 32-bit 1 or 0 into the instruction's last output register depending on
 // whether |condition| holds for the current flags.
 void CodeGenerator::AssembleArchBoolean(Instruction* instr,
                                         FlagsCondition condition) {
   IA32OperandConverter i(this, instr);
   Label finished;
   Label test_condition;

   // The result register is always the last output of the instruction.
   DCHECK_NE(0u, instr->OutputCount());
   Register result = i.OutputRegister(instr->OutputCount() - 1);

   // For the "unordered" conditions the parity flag decides first: with parity
   // odd the regular condition is evaluated, otherwise the result is fixed
   // (0 for kUnorderedEqual, 1 for kUnorderedNotEqual).
   switch (condition) {
     case kUnorderedEqual:
       __ j(parity_odd, &test_condition, Label::kNear);
       __ Move(result, Immediate(0));
       __ jmp(&finished, Label::kNear);
       break;
     case kUnorderedNotEqual:
       __ j(parity_odd, &test_condition, Label::kNear);
       __ mov(result, Immediate(1));
       __ jmp(&finished, Label::kNear);
       break;
     default:
       break;
   }
   const Condition cc = FlagsConditionToCondition(condition);

   __ bind(&test_condition);
   if (!result.is_byte_register()) {
     // No byte form available for this register: branch to pick 1 or 0.
     Label store_one;
     __ j(cc, &store_one, Label::kNear);
     __ Move(result, Immediate(0));
     __ jmp(&finished, Label::kNear);
     __ bind(&store_one);
     __ mov(result, Immediate(1));
   } else {
     // Byte registers (al, bl, cl, dl) can use setcc, then zero-extend.
     __ setcc(cc, result);
     __ movzx_b(result, result);
   }
   __ bind(&finished);
 }

 // Collects the (case value, target label) pairs of a switch instruction and
 // hands them to AssembleArchBinarySearchSwitchRange(). Input 0 is the value
 // register, input 1 a block number (presumably the default target), and the
 // remaining inputs come in (value, block) pairs.
 void CodeGenerator::AssembleArchBinarySearchSwitch(Instruction* instr) {
   IA32OperandConverter i(this, instr);
   Register value = i.InputRegister(0);

   std::vector<std::pair<int32_t, Label*>> table;
   const size_t input_count = instr->InputCount();
   for (size_t pos = 2; pos < input_count; pos += 2) {
     const int32_t case_value = i.InputInt32(pos);
     Label* const case_target = GetLabel(i.InputRpo(pos + 1));
     table.emplace_back(case_value, case_target);
   }

   std::pair<int32_t, Label*>* const first = table.data();
   AssembleArchBinarySearchSwitchRange(value, i.InputRpo(1), first,
                                       first + table.size());
 }

 // Emits a jump-table based switch. Input 0 is the index register, input 1 the
 // block taken when the index is out of range, and every following input is
 // the target block of one table entry.
 void CodeGenerator::AssembleArchTableSwitch(Instruction* instr) {
   IA32OperandConverter i(this, instr);

   // Build the zone-allocated array of target labels and register the table.
   size_t const num_targets = instr->InputCount() - 2;
   Label** targets = zone()->NewArray<Label*>(num_targets);
   for (size_t n = 0; n != num_targets; ++n) {
     targets[n] = GetLabel(i.InputRpo(n + 2));
   }
   Label* const jump_table = AddJumpTable(targets, num_targets);

   // Unsigned bounds check, then dispatch through the table.
   Register index = i.InputRegister(0);
   __ cmp(index, Immediate(num_targets));
   __ j(above_equal, GetLabel(i.InputRpo(1)));
   __ jmp(Operand::JumpTable(index, times_system_pointer_size, jump_table));
 }

 // The calling convention for JSFunctions on IA32 passes arguments on the
 // stack and the JSFunction and context in EDI and ESI, respectively, thus
 // the steps of the call look as follows:

 // --{ before the call instruction }--------------------------------------------
 //                                                         |  caller frame |
 //                                                         ^ esp           ^ ebp

 // --{ push arguments and setup ESI, EDI }--------------------------------------
 //                                       | args + receiver |  caller frame |
 //                                       ^ esp                             ^ ebp
 //                 [edi = JSFunction, esi = context]

 // --{ call [edi + kCodeEntryOffset] }------------------------------------------
 //                                 | RET | args + receiver |  caller frame |
 //                                 ^ esp                                   ^ ebp

 // =={ prologue of called function }============================================
 // --{ push ebp }---------------------------------------------------------------
 //                            | FP | RET | args + receiver |  caller frame |
 //                            ^ esp                                        ^ ebp

 // --{ mov ebp, esp }-----------------------------------------------------------
 //                            | FP | RET | args + receiver |  caller frame |
 //                            ^ ebp,esp

 // --{ push esi }---------------------------------------------------------------
 //                      | CTX | FP | RET | args + receiver |  caller frame |
 //                      ^esp  ^ ebp

 // --{ push edi }---------------------------------------------------------------
 //                | FNC | CTX | FP | RET | args + receiver |  caller frame |
 //                ^esp        ^ ebp

 // --{ subi esp, #N }-----------------------------------------------------------
 // | callee frame | FNC | CTX | FP | RET | args + receiver |  caller frame |
 // ^esp                       ^ ebp

 // =={ body of called function }================================================

 // =={ epilogue of called function }============================================
 // --{ mov esp, ebp }-----------------------------------------------------------
 //                            | FP | RET | args + receiver |  caller frame |
 //                            ^ esp,ebp

 // --{ pop ebp }-----------------------------------------------------------
 // |                               | RET | args + receiver |  caller frame |
 //                                 ^ esp                                   ^ ebp

 // --{ ret #A+1 }-----------------------------------------------------------
 // |                                                       |  caller frame |
 //                                                         ^ esp           ^ ebp

 // Runtime function calls are accomplished by doing a stub call to the
 // CEntry (a real code object). On IA32, such a call passes the arguments on
 // the stack, the number of arguments in EAX, the address of the runtime
 // function in EBX, and the context in ESI.

 // --{ before the call instruction }--------------------------------------------
 //                                                         |  caller frame |
 //                                                         ^ esp           ^ ebp

 // --{ push arguments and setup EAX, EBX, and ESI }-----------------------------
 //                                       | args + receiver |  caller frame |
 //                                       ^ esp                             ^ ebp
 //              [eax = #args, ebx = runtime function, esi = context]

 // --{ call #CEntry }-----------------------------------------------------------
 //                                 | RET | args + receiver |  caller frame |
 //                                 ^ esp                                   ^ ebp

 // =={ body of runtime function }===============================================

 // --{ runtime returns }--------------------------------------------------------
 //                                                         |  caller frame |
 //                                                         ^ esp           ^ ebp

 // Other custom linkages (e.g. for calling directly into and out of C++) may
 // need to save callee-saved registers on the stack, which is done in the
 // function prologue of generated code.

 // --{ before the call instruction }--------------------------------------------
 //                                                         |  caller frame |
 //                                                         ^ esp           ^ ebp

 // --{ set up arguments in registers on stack }---------------------------------
 //                                                  | args |  caller frame |
 //                                                  ^ esp                  ^ ebp
 //                  [r0 = arg0, r1 = arg1, ...]

 // --{ call code }--------------------------------------------------------------
 //                                            | RET | args |  caller frame |
 //                                            ^ esp                        ^ ebp

 // =={ prologue of called function }============================================
 // --{ push ebp }---------------------------------------------------------------
 //                                       | FP | RET | args |  caller frame |
 //                                       ^ esp                             ^ ebp

 // --{ mov ebp, esp }-----------------------------------------------------------
 //                                       | FP | RET | args |  caller frame |
 //                                       ^ ebp,esp

 // --{ save registers }---------------------------------------------------------
 //                                | regs | FP | RET | args |  caller frame |
 //                                ^ esp  ^ ebp

 // --{ subi esp, #N }-----------------------------------------------------------
 //                 | callee frame | regs | FP | RET | args |  caller frame |
 //                 ^esp                  ^ ebp

 // =={ body of called function }================================================

 // =={ epilogue of called function }============================================
 // --{ restore registers }------------------------------------------------------
 //                                | regs | FP | RET | args |  caller frame |
 //                                ^ esp  ^ ebp

 // --{ mov esp, ebp }-----------------------------------------------------------
 //                                       | FP | RET | args |  caller frame |
 //                                       ^ esp,ebp

 // --{ pop ebp }----------------------------------------------------------------
 //                                            | RET | args |  caller frame |
 //                                            ^ esp                        ^ ebp

 // Reserves one frame slot for every callee-saved register named by the
 // incoming call descriptor. Does nothing when there are no such registers.
 void CodeGenerator::FinishFrame(Frame* frame) {
   auto call_descriptor = linkage()->GetIncomingDescriptor();
   const RegList saves = call_descriptor->CalleeSavedRegisters();
   if (saves == 0) return;  // No callee-saved registers to account for.

   DCHECK(!info()->is_osr());
   // Count the registers with the same population count that
   // AssembleConstructFrame() applies to this mask when it sizes the frame, so
   // slot reservation and prologue accounting cannot drift apart.
   frame->AllocateSavedCalleeRegisterSlots(
       static_cast<int>(base::bits::CountPopulation(saves)));
 }

 // Emits the function prologue. Depending on the incoming call descriptor this
 // sets up the frame (C, JS, or stub/wasm style), records the OSR entry point,
 // performs the early stack-overflow check for wasm functions with big frames,
 // allocates the remaining stack slots, pushes callee-saved registers, and
 // finally reserves the return slots.
 void CodeGenerator::AssembleConstructFrame() {
   auto call_descriptor = linkage()->GetIncomingDescriptor();
   if (frame_access_state()->has_frame()) {
     if (call_descriptor->IsCFunctionCall()) {
       // C linkage: frame pointer setup is emitted by hand.
       __ push(ebp);
       __ mov(ebp, esp);
       if (info()->GetOutputStackFrameType() == StackFrame::C_WASM_ENTRY) {
         __ Push(Immediate(StackFrame::TypeToMarker(StackFrame::C_WASM_ENTRY)));
         // Reserve stack space for saving the c_entry_fp later.
         __ AllocateStackSpace(kSystemPointerSize);
       }
     } else if (call_descriptor->IsJSFunctionCall()) {
       __ Prologue();
     } else {
       __ StubPrologue(info()->GetOutputStackFrameType());
       if (call_descriptor->IsWasmFunctionCall()) {
         __ push(kWasmInstanceRegister);
       } else if (call_descriptor->IsWasmImportWrapper() ||
                  call_descriptor->IsWasmCapiFunction()) {
         // Wasm import wrappers are passed a tuple in the place of the instance.
         // Unpack the tuple into the instance and the target callable.
         // This must be done here in the codegen because it cannot be expressed
         // properly in the graph.
         // Note: value2 is loaded first because the second load overwrites
         // kWasmInstanceRegister, which holds the tuple.
         __ mov(kJSFunctionRegister,
                Operand(kWasmInstanceRegister,
                        Tuple2::kValue2Offset - kHeapObjectTag));
         __ mov(kWasmInstanceRegister,
                Operand(kWasmInstanceRegister,
                        Tuple2::kValue1Offset - kHeapObjectTag));
         __ push(kWasmInstanceRegister);
         if (call_descriptor->IsWasmCapiFunction()) {
           // Reserve space for saving the PC later.
           __ AllocateStackSpace(kSystemPointerSize);
         }
       }
     }
   }

   // Slots that still have to be allocated beyond the fixed part of the frame.
   int required_slots =
       frame()->GetTotalFrameSlotCount() - frame()->GetFixedSlotCount();

   if (info()->is_osr()) {
     // TurboFan OSR-compiled functions cannot be entered directly.
     __ Abort(AbortReason::kShouldNotDirectlyEnterOsrFunction);

     // Unoptimized code jumps directly to this entrypoint while the unoptimized
     // frame is still on the stack. Optimized code uses OSR values directly from
     // the unoptimized frame. Thus, all that needs to be done is to allocate the
     // remaining stack slots.
     if (FLAG_code_comments) __ RecordComment("-- OSR entrypoint --");
     osr_pc_offset_ = __ pc_offset();
     required_slots -= osr_helper()->UnoptimizedFrameSlots();
   }

   const RegList saves = call_descriptor->CalleeSavedRegisters();
   if (required_slots > 0) {
     DCHECK(frame_access_state()->has_frame());
     if (info()->IsWasm() && required_slots > 128) {
       // For WebAssembly functions with big frames we have to do the stack
       // overflow check before we construct the frame. Otherwise we may not
       // have enough space on the stack to call the runtime for the stack
       // overflow.
       Label done;

       // If the frame is bigger than the stack, we throw the stack overflow
       // exception unconditionally. Thereby we can avoid the integer overflow
       // check in the condition code.
       if (required_slots * kSystemPointerSize < FLAG_stack_size * 1024) {
         // esi is borrowed as scratch and preserved via push/pop. The jump
         // below consumes the flags produced by the cmp, so the pop in between
         // is expected to leave them untouched.
         Register scratch = esi;
         __ push(scratch);
         __ mov(scratch,
                FieldOperand(kWasmInstanceRegister,
                             WasmInstanceObject::kRealStackLimitAddressOffset));
         __ mov(scratch, Operand(scratch, 0));
         __ add(scratch, Immediate(required_slots * kSystemPointerSize));
         __ cmp(esp, scratch);
         __ pop(scratch);
         __ j(above_equal, &done, Label::kNear);
       }

       // Not enough stack: call the stack-overflow stub, which must not return.
       __ wasm_call(wasm::WasmCode::kWasmStackOverflow,
                    RelocInfo::WASM_STUB_CALL);
       ReferenceMap* reference_map = zone()->New<ReferenceMap>(zone());
       RecordSafepoint(reference_map, Safepoint::kNoLazyDeopt);
       __ AssertUnreachable(AbortReason::kUnexpectedReturnFromWasmTrap);
       __ bind(&done);
     }

     // Skip callee-saved and return slots, which are created below.
     required_slots -= base::bits::CountPopulation(saves);
     required_slots -= frame()->GetReturnSlotCount();
     if (required_slots > 0) {
       __ AllocateStackSpace(required_slots * kSystemPointerSize);
     }
   }

   if (saves != 0) {  // Save callee-saved registers.
     DCHECK(!info()->is_osr());
     // Pushed from the highest register code down; AssembleReturn() pops them
     // in the opposite order.
     for (int i = Register::kNumRegisters - 1; i >= 0; i--) {
       if (((1 << i) & saves)) __ push(Register::from_code(i));
     }
   }

   // Allocate return slots (located after callee-saved).
   if (frame()->GetReturnSlotCount() > 0) {
     __ AllocateStackSpace(frame()->GetReturnSlotCount() * kSystemPointerSize);
   }
 }

 // Emits the function epilogue: drops the return slots and restores the
 // callee-saved registers, tears down the frame, and returns while popping the
 // stack parameters. |additional_pop_count| is either an immediate or a
 // register holding a number of extra slots to pop on return.
 void CodeGenerator::AssembleReturn(InstructionOperand* additional_pop_count) {
   auto call_descriptor = linkage()->GetIncomingDescriptor();

   const RegList saves = call_descriptor->CalleeSavedRegisters();
   // Restore registers.
   if (saves != 0) {
     const int returns = frame()->GetReturnSlotCount();
     if (returns != 0) {
       // The return slots sit below the saved registers; skip over them first.
       __ add(esp, Immediate(returns * kSystemPointerSize));
     }
     // Pop in ascending register-code order, the reverse of the push order
     // used in AssembleConstructFrame().
     for (int i = 0; i < Register::kNumRegisters; i++) {
       if (!((1 << i) & saves)) continue;
       __ pop(Register::from_code(i));
     }
   }

   // We might need ecx and edx for scratch.
   DCHECK_EQ(0u, call_descriptor->CalleeSavedRegisters() & edx.bit());
   DCHECK_EQ(0u, call_descriptor->CalleeSavedRegisters() & ecx.bit());
   IA32OperandConverter g(this, nullptr);
   int parameter_count =
       static_cast<int>(call_descriptor->StackParameterCount());

   // {additional_pop_count} is only greater than zero if {parameter_count = 0}.
   // Check RawMachineAssembler::PopAndReturn.
   if (parameter_count != 0) {
     if (additional_pop_count->IsImmediate()) {
       DCHECK_EQ(g.ToConstant(additional_pop_count).ToInt32(), 0);
     } else if (__ emit_debug_code()) {
       // Register case: the invariant can only be verified at run time.
       __ cmp(g.ToRegister(additional_pop_count), Immediate(0));
       __ Assert(equal, AbortReason::kUnexpectedAdditionalPopValue);
     }
   }

   Register argc_reg = ecx;
 #ifdef V8_NO_ARGUMENTS_ADAPTOR
   // Functions with JS linkage have at least one parameter (the receiver).
   // If {parameter_count} == 0, it means it is a builtin with
   // kDontAdaptArgumentsSentinel, which takes care of JS arguments popping
   // itself.
   const bool drop_jsargs = frame_access_state()->has_frame() &&
                            call_descriptor->IsJSFunctionCall() &&
                            parameter_count != 0;
 #else
   const bool drop_jsargs = false;
 #endif
   if (call_descriptor->IsCFunctionCall()) {
     AssembleDeconstructFrame();
   } else if (frame_access_state()->has_frame()) {
     // Canonicalize JSFunction return sites for now if they always have the same
     // number of return args.
     if (additional_pop_count->IsImmediate() &&
         g.ToConstant(additional_pop_count).ToInt32() == 0) {
       if (return_label_.is_bound()) {
         // A shared return sequence was already emitted; reuse it.
         __ jmp(&return_label_);
         return;
       } else {
         __ bind(&return_label_);
       }
     }
     if (drop_jsargs) {
       // Get the actual argument count.
       // It is read through ebp, so this must happen before the frame is
       // deconstructed below.
       __ mov(argc_reg, Operand(ebp, StandardFrameConstants::kArgCOffset));
     }
     AssembleDeconstructFrame();
   }

   if (drop_jsargs) {
     // We must pop all arguments from the stack (including the receiver). This
     // number of arguments is given by max(1 + argc_reg, parameter_count).
     int parameter_count_without_receiver =
         parameter_count - 1;  // Exclude the receiver to simplify the
                               // computation. We'll account for it at the end.
     Label mismatch_return;
     Register scratch_reg = edx;
     DCHECK_NE(argc_reg, scratch_reg);
     __ cmp(argc_reg, Immediate(parameter_count_without_receiver));
     __ j(greater, &mismatch_return, Label::kNear);
     // argc does not exceed the declared count: pop the static amount.
     __ Ret(parameter_count * kSystemPointerSize, scratch_reg);
     __ bind(&mismatch_return);
     // More arguments were passed than declared: pop argc + 1 slots.
     __ PopReturnAddressTo(scratch_reg);
     __ lea(esp, Operand(esp, argc_reg, times_system_pointer_size,
                         kSystemPointerSize));  // Also pop the receiver.
     // We use a return instead of a jump for better return address prediction.
     __ PushReturnAddressFrom(scratch_reg);
     __ Ret();
   } else if (additional_pop_count->IsImmediate()) {
     // Statically known pop size: parameters plus the immediate extra count.
     Register scratch_reg = ecx;
     int additional_count = g.ToConstant(additional_pop_count).ToInt32();
     size_t pop_size = (parameter_count + additional_count) * kSystemPointerSize;
     CHECK_LE(pop_size, static_cast<size_t>(std::numeric_limits<int>::max()));
     __ Ret(static_cast<int>(pop_size), scratch_reg);
   } else {
     // Dynamic extra pop count held in a register; pick a scratch register
     // that does not alias it.
     Register pop_reg = g.ToRegister(additional_pop_count);
     Register scratch_reg = pop_reg == ecx ? edx : ecx;
     int pop_size = static_cast<int>(parameter_count * kSystemPointerSize);
     __ PopReturnAddressTo(scratch_reg);
     __ lea(esp, Operand(esp, pop_reg, times_system_pointer_size,
                         static_cast<int>(pop_size)));
     __ PushReturnAddressFrom(scratch_reg);
     __ Ret();
   }
 }

 // Intentionally empty: this backend emits nothing in this hook.
 void CodeGenerator::FinishCode() {}

 // Intentionally empty: |exits| is ignored and nothing is emitted by this
 // backend in this hook.
 void CodeGenerator::PrepareForDeoptimizationExits(
     ZoneDeque<DeoptimizationExit*>* exits) {}

 // Emits the code that copies |source| into |destination|. The kind of move
 // (register/stack/constant to register/stack) is inferred from the operands;
 // floating-point and SIMD values travel through XMM registers, using
 // kScratchDoubleReg for slot-to-slot copies.
 void CodeGenerator::AssembleMove(InstructionOperand* source,
                                  InstructionOperand* destination) {
   IA32OperandConverter g(this, nullptr);

   // Loads an FP/SIMD value of representation |rep| from memory into |to|.
   auto load_fp = [this](MachineRepresentation rep, XMMRegister to,
                         Operand from) {
     switch (rep) {
       case MachineRepresentation::kFloat32:
         __ movss(to, from);
         break;
       case MachineRepresentation::kFloat64:
         __ movsd(to, from);
         break;
       default:
         DCHECK_EQ(MachineRepresentation::kSimd128, rep);
         __ movups(to, from);
         break;
     }
   };

   // Stores an FP/SIMD value of representation |rep| from |from| to memory.
   auto store_fp = [this](MachineRepresentation rep, Operand to,
                          XMMRegister from) {
     switch (rep) {
       case MachineRepresentation::kFloat32:
         __ movss(to, from);
         break;
       case MachineRepresentation::kFloat64:
         __ movsd(to, from);
         break;
       default:
         DCHECK_EQ(MachineRepresentation::kSimd128, rep);
         __ movups(to, from);
         break;
     }
   };

   // Dispatch on the source and destination operand kinds.
   switch (MoveType::InferMove(source, destination)) {
     case MoveType::kRegisterToRegister: {
       if (!source->IsRegister()) {
         DCHECK(source->IsFPRegister());
         __ movaps(g.ToDoubleRegister(destination), g.ToDoubleRegister(source));
         return;
       }
       __ mov(g.ToRegister(destination), g.ToRegister(source));
       return;
     }

     case MoveType::kRegisterToStack: {
       Operand slot = g.ToOperand(destination);
       if (!source->IsRegister()) {
         DCHECK(source->IsFPRegister());
         XMMRegister value = g.ToDoubleRegister(source);
         store_fp(LocationOperand::cast(source)->representation(), slot, value);
         return;
       }
       __ mov(slot, g.ToRegister(source));
       return;
     }

     case MoveType::kStackToRegister: {
       Operand slot = g.ToOperand(source);
       if (!source->IsStackSlot()) {
         DCHECK(source->IsFPStackSlot());
         XMMRegister target = g.ToDoubleRegister(destination);
         load_fp(LocationOperand::cast(source)->representation(), target, slot);
         return;
       }
       __ mov(g.ToRegister(destination), slot);
       return;
     }

     case MoveType::kStackToStack: {
       Operand from_slot = g.ToOperand(source);
       Operand to_slot = g.ToOperand(destination);
       if (source->IsStackSlot()) {
         // Word-sized slots are copied through the machine stack.
         __ push(from_slot);
         __ pop(to_slot);
         return;
       }
       // FP/SIMD slots are copied through the scratch XMM register.
       MachineRepresentation rep =
           LocationOperand::cast(source)->representation();
       load_fp(rep, kScratchDoubleReg, from_slot);
       store_fp(rep, to_slot, kScratchDoubleReg);
       return;
     }

     case MoveType::kConstantToRegister: {
       Constant constant = g.ToConstant(source);
       if (!destination->IsRegister()) {
         DCHECK(destination->IsFPRegister());
         XMMRegister target = g.ToDoubleRegister(destination);
         if (constant.type() == Constant::kFloat32) {
           // TODO(turbofan): Can we do better here?
           __ Move(target, constant.ToFloat32AsInt());
         } else {
           DCHECK_EQ(constant.type(), Constant::kFloat64);
           __ Move(target, constant.ToFloat64().AsUint64());
         }
         return;
       }
       Register target = g.ToRegister(destination);
       if (constant.type() == Constant::kHeapObject) {
         __ Move(target, constant.ToHeapObject());
       } else {
         __ Move(target, g.ToImmediate(source));
       }
       return;
     }

     case MoveType::kConstantToStack: {
       Constant constant = g.ToConstant(source);
       Operand slot = g.ToOperand(destination);
       if (destination->IsStackSlot()) {
         __ Move(slot, g.ToImmediate(source));
         return;
       }
       DCHECK(destination->IsFPStackSlot());
       if (constant.type() == Constant::kFloat32) {
         __ Move(slot, Immediate(constant.ToFloat32AsInt()));
         return;
       }
       DCHECK_EQ(constant.type(), Constant::kFloat64);
       // A 64-bit constant is written as two 32-bit halves, low word first.
       uint64_t bits = constant.ToFloat64().AsUint64();
       uint32_t low_word = static_cast<uint32_t>(bits);
       uint32_t high_word = static_cast<uint32_t>(bits >> 32);
       Operand high_slot = g.ToOperand(destination, kSystemPointerSize);
       __ Move(slot, Immediate(low_word));
       __ Move(high_slot, Immediate(high_word));
       return;
     }
   }
   UNREACHABLE();
 }

 void CodeGenerator::AssembleSwap(InstructionOperand* source,
                                  InstructionOperand* destination) {
   IA32OperandConverter g(this, nullptr);
   // Dispatch on the source and destination operand kinds.  Not all
   // combinations are possible.
   switch (MoveType::InferSwap(source, destination)) {
     case MoveType::kRegisterToRegister: {
       if (source->IsRegister()) {
         Register src = g.ToRegister(source);
         Register dst = g.ToRegister(destination);
         __ push(src);
         __ mov(src, dst);
         __ pop(dst);
       } else {
         DCHECK(source->IsFPRegister());
         XMMRegister src = g.ToDoubleRegister(source);
         XMMRegister dst = g.ToDoubleRegister(destination);
         __ movaps(kScratchDoubleReg, src);
         __ movaps(src, dst);
         __ movaps(dst, kScratchDoubleReg);
       }
       return;
     }
     case MoveType::kRegisterToStack: {
       if (source->IsRegister()) {
         Register src = g.ToRegister(source);
         __ push(src);
         frame_access_state()->IncreaseSPDelta(1);
         Operand dst = g.ToOperand(destination);
         __ mov(src, dst);
         frame_access_state()->IncreaseSPDelta(-1);
         dst = g.ToOperand(destination);
         __ pop(dst);
       } else {
         DCHECK(source->IsFPRegister());
         XMMRegister src = g.ToDoubleRegister(source);
         Operand dst = g.ToOperand(destination);
         MachineRepresentation rep =
             LocationOperand::cast(source)->representation();
         if (rep == MachineRepresentation::kFloat32) {
           __ movss(kScratchDoubleReg, dst);
           __ movss(dst, src);
           __ movaps(src, kScratchDoubleReg);
         } else if (rep == MachineRepresentation::kFloat64) {
           __ movsd(kScratchDoubleReg, dst);
           __ movsd(dst, src);
           __ movaps(src, kScratchDoubleReg);
         } else {
           DCHECK_EQ(MachineRepresentation::kSimd128, rep);
           __ movups(kScratchDoubleReg, dst);
           __ movups(dst, src);
           __ movups(src, kScratchDoubleReg);
         }
       }
       return;
     }
     case MoveType::kStackToStack: {
       if (source->IsStackSlot()) {
         Operand dst1 = g.ToOperand(destination);
         __ push(dst1);
         frame_access_state()->IncreaseSPDelta(1);
         Operand src1 = g.ToOperand(source);
         __ push(src1);
         Operand dst2 = g.ToOperand(destination);
         __ pop(dst2);
         frame_access_state()->IncreaseSPDelta(-1);
         Operand src2 = g.ToOperand(source);
         __ pop(src2);
       } else {
         DCHECK(source->IsFPStackSlot());
         Operand src0 = g.ToOperand(source);
         Operand dst0 = g.ToOperand(destination);
         MachineRepresentation rep =
             LocationOperand::cast(source)->representation();
         if (rep == MachineRepresentation::kFloat32) {
           __ movss(kScratchDoubleReg, dst0);  // Save dst in scratch register.
           __ push(src0);  // Then use stack to copy src to destination.
           __ pop(dst0);
           __ movss(src0, kScratchDoubleReg);
         } else if (rep == MachineRepresentation::kFloat64) {
           __ movsd(kScratchDoubleReg, dst0);  // Save dst in scratch register.
           __ push(src0);  // Then use stack to copy src to destination.
           __ pop(dst0);
           __ push(g.ToOperand(source, kSystemPointerSize));
           __ pop(g.ToOperand(destination, kSystemPointerSize));
           __ movsd(src0, kScratchDoubleReg);
         } else {
           DCHECK_EQ(MachineRepresentation::kSimd128, rep);
           __ movups(kScratchDoubleReg, dst0);  // Save dst in scratch register.
           __ push(src0);  // Then use stack to copy src to destination.
           __ pop(dst0);
           __ push(g.ToOperand(source, kSystemPointerSize));
           __ pop(g.ToOperand(destination, kSystemPointerSize));
           __ push(g.ToOperand(source, 2 * kSystemPointerSize));
           __ pop(g.ToOperand(destination, 2 * kSystemPointerSize));
           __ push(g.ToOperand(source, 3 * kSystemPointerSize));
           __ pop(g.ToOperand(destination, 3 * kSystemPointerSize));
           __ movups(src0, kScratchDoubleReg);
         }
       }
       return;
     }
     default:
       UNREACHABLE();
   }
 }

 // Writes a jump table into the instruction stream: one `dd` entry is emitted
 // for each label in |targets|, in ascending index order. Nothing is emitted
 // when |target_count| is zero.
 //
 // |targets|       array of label pointers, one per table entry.
 // |target_count|  number of entries in |targets|.
 void CodeGenerator::AssembleJumpTable(Label** targets, size_t target_count) {
   Label** const table_end = targets + target_count;
   for (Label** entry = targets; entry != table_end; ++entry) {
     __ dd(*entry);
   }
 }

 // Undefine the helper macros (`__`, kScratchDoubleReg and the ASSEMBLE_*
 // family) so that none of them remains defined past this point.
 #undef __
 #undef kScratchDoubleReg
 #undef ASSEMBLE_COMPARE
 #undef ASSEMBLE_IEEE754_BINOP
 #undef ASSEMBLE_IEEE754_UNOP
 #undef ASSEMBLE_BINOP
 #undef ASSEMBLE_ATOMIC_BINOP
 #undef ASSEMBLE_I64ATOMIC_BINOP
 #undef ASSEMBLE_MOVX
 #undef ASSEMBLE_SIMD_PUNPCK_SHUFFLE
 #undef ASSEMBLE_SIMD_IMM_SHUFFLE
 #undef ASSEMBLE_SIMD_ALL_TRUE
 #undef ASSEMBLE_SIMD_SHIFT

 }  // namespace compiler
 }  // namespace internal
 }  // namespace v8