cccccccccc10


nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
                                                methodHandle method,
                                                int compile_id,
                                                BasicType* in_sig_bt,
                                                VMRegPair* in_regs,
                                                BasicType ret_type) {
  if (method->is_method_handle_intrinsic()) {
    vmIntrinsics::ID iid = method->intrinsic_id();
    intptr_t start = (intptr_t)__ pc();
    int vep_offset = ((intptr_t)__ pc()) - start;
    gen_special_dispatch(masm,
                         method,
                         in_sig_bt,
                         in_regs);
    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
    __ flush();
    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
    return nmethod::new_native_nmethod(method,
                                       compile_id,
                                       masm->code(),
                                       vep_offset,
                                       frame_complete,
                                       stack_slots / VMRegImpl::slots_per_word,
                                       in_ByteSize(-1),
                                       in_ByteSize(-1),
                                       (OopMapSet*)NULL);
  }
  bool is_critical_native = true;
  address native_func = method->critical_native_function();
  if (native_func == NULL) {
    native_func = method->native_function();
    is_critical_native = false;
  }
  assert(native_func != NULL, "must have function");
  OopMapSet *oop_maps = new OopMapSet();
  intptr_t start = (intptr_t)__ pc();
  const int total_in_args = method->size_of_parameters();
  int total_c_args = total_in_args;
  if (!is_critical_native) {
    total_c_args += 1;
    if (method->is_static()) {
      total_c_args++;
    }
  } else {
    for (int i = 0; i < total_in_args; i++) {
      if (in_sig_bt[i] == T_ARRAY) {
        total_c_args++;
      }
    }
  }
  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
  BasicType* in_elem_bt = NULL;
  int argc = 0;
  if (!is_critical_native) {
    out_sig_bt[argc++] = T_ADDRESS;
    if (method->is_static()) {
      out_sig_bt[argc++] = T_OBJECT;
    }
    for (int i = 0; i < total_in_args ; i++ ) {
      out_sig_bt[argc++] = in_sig_bt[i];
    }
  } else {
    Thread* THREAD = Thread::current();
    in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
    SignatureStream ss(method->signature());
    for (int i = 0; i < total_in_args ; i++ ) {
      if (in_sig_bt[i] == T_ARRAY) {
        out_sig_bt[argc++] = T_INT;
        out_sig_bt[argc++] = T_ADDRESS;
        Symbol* atype = ss.as_symbol(CHECK_NULL);
        const char* at = atype->as_C_string();
        if (strlen(at) == 2) {
          assert(at[0] == '[', "must be");
          switch (at[1]) {
            case 'B': in_elem_bt[i]  = T_BYTE; break;
            case 'C': in_elem_bt[i]  = T_CHAR; break;
            case 'D': in_elem_bt[i]  = T_DOUBLE; break;
            case 'F': in_elem_bt[i]  = T_FLOAT; break;
            case 'I': in_elem_bt[i]  = T_INT; break;
            case 'J': in_elem_bt[i]  = T_LONG; break;
            case 'S': in_elem_bt[i]  = T_SHORT; break;
            case 'Z': in_elem_bt[i]  = T_BOOLEAN; break;
            default: ShouldNotReachHere();
          }
        }
      } else {
        out_sig_bt[argc++] = in_sig_bt[i];
        in_elem_bt[i] = T_VOID;
      }
      if (in_sig_bt[i] != T_VOID) {
        assert(in_sig_bt[i] == ss.type(), "must match");
        ss.next();
      }
    }
  }
  int out_arg_slots;
  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
  if (is_critical_native) {
    int double_slots = 0;
    int single_slots = 0;
    for ( int i = 0; i < total_in_args; i++) {
      if (in_regs[i].first()->is_Register()) {
        const Register reg = in_regs[i].first()->as_Register();
        switch (in_sig_bt[i]) {
          case T_BOOLEAN:
          case T_BYTE:
          case T_SHORT:
          case T_CHAR:
          case T_INT:  single_slots++; break;
          case T_ARRAY:  // specific to LP64 (7145024)
          case T_LONG: double_slots++; break;
          default:  ShouldNotReachHere();
        }
      } else if (in_regs[i].first()->is_XMMRegister()) {
        switch (in_sig_bt[i]) {
          case T_FLOAT:  single_slots++; break;
          case T_DOUBLE: double_slots++; break;
          default:  ShouldNotReachHere();
        }
      } else if (in_regs[i].first()->is_FloatRegister()) {
        ShouldNotReachHere();
      }
    }
    total_save_slots = double_slots * 2 + single_slots;
    if (double_slots != 0) {
      stack_slots = round_to(stack_slots, 2);
    }
  }
  int oop_handle_offset = stack_slots;
  stack_slots += total_save_slots;
  int klass_slot_offset = 0;
  int klass_offset = -1;
  int lock_slot_offset = 0;
  bool is_static = false;
  if (method->is_static()) {
    klass_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
    klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
    is_static = true;
  }
  if (method->is_synchronized()) {
    lock_slot_offset = stack_slots;
    stack_slots += VMRegImpl::slots_per_word;
  }
  stack_slots += 6;
  stack_slots = round_to(stack_slots, StackAlignmentInSlots);
  int stack_size = stack_slots * VMRegImpl::stack_slot_size;
  const Register ic_reg = rax;
  const Register receiver = j_rarg0;
  Label hit;
  Label exception_pending;
  assert_different_registers(ic_reg, receiver, rscratch1);
  __ verify_oop(receiver);
  __ load_klass(rscratch1, receiver);
  __ cmpq(ic_reg, rscratch1);
  __ jcc(Assembler::equal, hit);
  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  __ align(8);
  __ bind(hit);
  int vep_offset = ((intptr_t)__ pc()) - start;
  if (UseStackBanging) {
    __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
  } else {
    __ fat_nop();
  }
  __ enter();
  __ subptr(rsp, stack_size - 2*wordSize);
  int frame_complete = ((intptr_t)__ pc()) - start;
    if (UseRTMLocking) {
      __ xabort(0);
    }
#ifdef ASSERT
    {
      Label L;
      __ mov(rax, rsp);
      __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
      __ cmpptr(rax, rsp);
      __ jcc(Assembler::equal, L);
      __ stop("improperly aligned stack");
      __ bind(L);
    }
#endif /* ASSERT */
  const Register oop_handle_reg = r14;
  if (is_critical_native) {
    check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args,
                                       oop_handle_offset, oop_maps, in_regs, in_sig_bt);
  }
  int receiver_offset = -1;
  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
#ifdef ASSERT
  bool reg_destroyed[RegisterImpl::number_of_registers];
  bool freg_destroyed[XMMRegisterImpl::number_of_registers];
  for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
    reg_destroyed[r] = false;
  }
  for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
    freg_destroyed[f] = false;
  }
#endif /* ASSERT */
  GrowableArray<int> arg_order(2 * total_in_args);
  VMRegPair tmp_vmreg;
  tmp_vmreg.set2(rbx->as_VMReg());
  if (!is_critical_native) {
    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
      arg_order.push(i);
      arg_order.push(c_arg);
    }
  } else {
    ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
  }
  int temploc = -1;
  for (int ai = 0; ai < arg_order.length(); ai += 2) {
    int i = arg_order.at(ai);
    int c_arg = arg_order.at(ai + 1);
    __ block_comment(err_msg("move %d -> %d", i, c_arg));
    if (c_arg == -1) {
      assert(is_critical_native, "should only be required for critical natives");
      __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
      in_regs[i] = tmp_vmreg;
      temploc = i;
      continue;
    } else if (i == -1) {
      assert(is_critical_native, "should only be required for critical natives");
      assert(temploc != -1, "must be valid");
      i = temploc;
      temploc = -1;
    }
#ifdef ASSERT
    if (in_regs[i].first()->is_Register()) {
      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
    } else if (in_regs[i].first()->is_XMMRegister()) {
      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
    }
    if (out_regs[c_arg].first()->is_Register()) {
      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
    }
#endif /* ASSERT */
    switch (in_sig_bt[i]) {
      case T_ARRAY:
        if (is_critical_native) {
          unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
          c_arg++;
#ifdef ASSERT
          if (out_regs[c_arg].first()->is_Register()) {
            reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
          } else if (out_regs[c_arg].first()->is_XMMRegister()) {
            freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
          }
#endif
          break;
        }
      case T_OBJECT:
        assert(!is_critical_native, "no oop arguments");
        object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
                    ((i == 0) && (!is_static)),
                    &receiver_offset);
        break;
      case T_VOID:
        break;
      case T_FLOAT:
        float_move(masm, in_regs[i], out_regs[c_arg]);
          break;
      case T_DOUBLE:
        assert( i + 1 < total_in_args &&
                in_sig_bt[i + 1] == T_VOID &&
                out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
        double_move(masm, in_regs[i], out_regs[c_arg]);
        break;
      case T_LONG :
        long_move(masm, in_regs[i], out_regs[c_arg]);
        break;
      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
      default:
        move32_64(masm, in_regs[i], out_regs[c_arg]);
    }
  }
  int c_arg;
  if (!is_critical_native) {
    c_arg = total_c_args - total_in_args;
    if (method->is_static()) {
      __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
      __ movptr(Address(rsp, klass_offset), oop_handle_reg);
      map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
      __ lea(oop_handle_reg, Address(rsp, klass_offset));
      __ movptr(c_rarg1, oop_handle_reg);
      c_arg--;
    }
  } else {
    c_arg = 0;
  }
  intptr_t the_pc = (intptr_t) __ pc();
  oop_maps->add_gc_map(the_pc - start, map);
  __ set_last_Java_frame(rsp, noreg, (address)the_pc);
  {
    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }
  if (RC_TRACE_IN_RANGE(0x00001000, 0x00002000)) {
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
      CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
      r15_thread, c_rarg1);
    restore_args(masm, total_c_args, c_arg, out_regs);
  }
  const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
  const Register obj_reg  = rbx;  // Will contain the oop
  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
  const Register old_hdr  = r13;  // value of old header at unlock time
  Label slow_path_lock;
  Label lock_done;
  if (method->is_synchronized()) {
    assert(!is_critical_native, "unhandled");
    const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
    __ mov(oop_handle_reg, c_rarg1);
    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
    __ movptr(obj_reg, Address(oop_handle_reg, 0));
    if (UseBiasedLocking) {
      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, false, lock_done, &slow_path_lock);
    }
    __ movl(swap_reg, 1);
    __ orptr(swap_reg, Address(obj_reg, 0));
    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
    if (os::is_MP()) {
      __ lock();
    }
    __ cmpxchgptr(lock_reg, Address(obj_reg, 0));
    __ jcc(Assembler::equal, lock_done);
    __ subptr(swap_reg, rsp);
    __ andptr(swap_reg, 3 - os::vm_page_size());
    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
    __ jcc(Assembler::notEqual, slow_path_lock);
    __ bind(lock_done);
  }
  if (!is_critical_native) {
    __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
  }
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
  __ call(RuntimeAddress(native_func));
  __ restore_cpu_control_state_after_jni();
  switch (ret_type) {
  case T_BOOLEAN: __ c2bool(rax);            break;
  case T_CHAR   : __ movzwl(rax, rax);      break;
  case T_BYTE   : __ sign_extend_byte (rax); break;
  case T_SHORT  : __ sign_extend_short(rax); break;
  case T_INT    : /* nothing to do */        break;
  case T_DOUBLE :
  case T_FLOAT  :
    break;
  case T_ARRAY:                 // Really a handle
  case T_OBJECT:                // Really a handle
      break; // can't de-handlize until after safepoint check
  case T_VOID: break;
  case T_LONG: break;
  default       : ShouldNotReachHere();
  }
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
  if(os::is_MP()) {
    if (UseMembar) {
      __ membar(Assembler::Membar_mask_bits(
           Assembler::LoadLoad | Assembler::LoadStore |
           Assembler::StoreLoad | Assembler::StoreStore));
    } else {
      __ serialize_memory(r15_thread, rcx);
    }
  }
  Label after_transition;
  {
    Label Continue;
    __ cmp32(ExternalAddress((address)SafepointSynchronize::address_of_state()),
             SafepointSynchronize::_not_synchronized);
    Label L;
    __ jcc(Assembler::notEqual, L);
    __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
    __ jcc(Assembler::equal, Continue);
    __ bind(L);
    save_native_result(masm, ret_type, stack_slots);
    __ mov(c_rarg0, r15_thread);
    __ mov(r12, rsp); // remember sp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16); // align stack as required by ABI
    if (!is_critical_native) {
      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
    } else {
      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)));
    }
    __ mov(rsp, r12); // restore sp
    __ reinit_heapbase();
    restore_native_result(masm, ret_type, stack_slots);
    if (is_critical_native) {
      __ jmpb(after_transition);
    }
    __ bind(Continue);
  }
  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
  __ bind(after_transition);
  Label reguard;
  Label reguard_done;
  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), JavaThread::stack_guard_yellow_disabled);
  __ jcc(Assembler::equal, reguard);
  __ bind(reguard_done);
  Label unlock_done;
  Label slow_path_unlock;
  if (method->is_synchronized()) {
    __ movptr(obj_reg, Address(oop_handle_reg, 0));
    Label done;
    if (UseBiasedLocking) {
      __ biased_locking_exit(obj_reg, old_hdr, done);
    }
    __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, done);
    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
      save_native_result(masm, ret_type, stack_slots);
    }
    __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
    __ movptr(old_hdr, Address(rax, 0));
    if (os::is_MP()) {
      __ lock();
    }
    __ cmpxchgptr(old_hdr, Address(obj_reg, 0));
    __ jcc(Assembler::notEqual, slow_path_unlock);
    __ bind(unlock_done);
    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
      restore_native_result(masm, ret_type, stack_slots);
    }
    __ bind(done);
  }
  {
    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
    save_native_result(masm, ret_type, stack_slots);
    __ mov_metadata(c_rarg1, method());
    __ call_VM_leaf(
         CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
         r15_thread, c_rarg1);
    restore_native_result(masm, ret_type, stack_slots);
  }
  __ reset_last_Java_frame(false);
  if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
    __ resolve_jobject(rax /* value */,
                       r15_thread /* thread */,
                       rcx /* tmp */);
  }
  if (!is_critical_native) {
    __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
    __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
  }
  __ leave();
  if (!is_critical_native) {
    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::notEqual, exception_pending);
  }
  __ ret(0);
  if (!is_critical_native) {
    __ bind(exception_pending);
    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  }
  if (method->is_synchronized()) {
    __ bind(slow_path_lock);
    save_args(masm, total_c_args, c_arg, out_regs);
    __ mov(c_rarg0, obj_reg);
    __ mov(c_rarg1, lock_reg);
    __ mov(c_rarg2, r15_thread);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
    restore_args(masm, total_c_args, c_arg, out_regs);
#ifdef ASSERT
    { Label L;
    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
    __ jcc(Assembler::equal, L);
    __ stop("no pending exception allowed on exit from monitorenter");
    __ bind(L);
    }
#endif
    __ jmp(lock_done);
    __ bind(slow_path_unlock);
    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
      save_native_result(masm, ret_type, stack_slots);
    }
    __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
    __ mov(c_rarg0, obj_reg);
    __ mov(r12, rsp); // remember sp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16); // align stack as required by ABI
    __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
    __ mov(rsp, r12); // restore sp
    __ reinit_heapbase();
#ifdef ASSERT
    {
      Label L;
      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
      __ bind(L);
    }
#endif /* ASSERT */
    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
      restore_native_result(masm, ret_type, stack_slots);
    }
    __ jmp(unlock_done);
  } // synchronized
  __ bind(reguard);
  save_native_result(masm, ret_type, stack_slots);
  __ mov(r12, rsp); // remember sp
  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  __ andptr(rsp, -16); // align stack as required by ABI
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
  __ mov(rsp, r12); // restore sp
  __ reinit_heapbase();
  restore_native_result(masm, ret_type, stack_slots);
  __ jmp(reguard_done);
  __ flush();
  nmethod *nm = nmethod::new_native_nmethod(method,
                                            compile_id,
                                            masm->code(),
                                            vep_offset,
                                            frame_complete,
                                            stack_slots / VMRegImpl::slots_per_word,
                                            (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
                                            in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
                                            oop_maps);
  if (is_critical_native) {
    nm->set_lazy_critical_native(true);
  }
  return nm;
}
#ifdef HAVE_DTRACE_H
static int  fp_offset[ConcreteRegisterImpl::number_of_registers] = { 0 };
static bool offsets_initialized = false;
nmethod *SharedRuntime::generate_dtrace_nmethod(MacroAssembler *masm,
                                                methodHandle method) {
  assert(AdapterHandlerLibrary_lock->owned_by_self(), "must be");
  if (!offsets_initialized) {
    fp_offset[c_rarg0->as_VMReg()->value()] = -1 * wordSize;
    fp_offset[c_rarg1->as_VMReg()->value()] = -2 * wordSize;
    fp_offset[c_rarg2->as_VMReg()->value()] = -3 * wordSize;
    fp_offset[c_rarg3->as_VMReg()->value()] = -4 * wordSize;
    fp_offset[c_rarg4->as_VMReg()->value()] = -5 * wordSize;
    fp_offset[c_rarg5->as_VMReg()->value()] = -6 * wordSize;
    fp_offset[c_farg0->as_VMReg()->value()] = -7 * wordSize;
    fp_offset[c_farg1->as_VMReg()->value()] = -8 * wordSize;
    fp_offset[c_farg2->as_VMReg()->value()] = -9 * wordSize;
    fp_offset[c_farg3->as_VMReg()->value()] = -10 * wordSize;
    fp_offset[c_farg4->as_VMReg()->value()] = -11 * wordSize;
    fp_offset[c_farg5->as_VMReg()->value()] = -12 * wordSize;
    fp_offset[c_farg6->as_VMReg()->value()] = -13 * wordSize;
    fp_offset[c_farg7->as_VMReg()->value()] = -14 * wordSize;
    offsets_initialized = true;
  }
  int total_args_passed = method->size_of_parameters();
  BasicType* in_sig_bt  = NEW_RESOURCE_ARRAY(BasicType, total_args_passed);
  VMRegPair  *in_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed);
  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_args_passed * 2);
  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed * 2);
  int i=0;
  int total_strings = 0;
  int first_arg_to_pass = 0;
  int total_c_args = 0;
  if( !method->is_static() ) {
    in_sig_bt[i++] = T_OBJECT;
    first_arg_to_pass = 1;
  }
  SignatureStream ss(method->signature());
  for ( ; !ss.at_return_type(); ss.next()) {
    BasicType bt = ss.type();
    in_sig_bt[i++] = bt;  // Collect remaining bits of signature
    out_sig_bt[total_c_args++] = bt;
    if( bt == T_OBJECT) {
      Symbol* s = ss.as_symbol_or_null();   // symbol is created
      if (s == vmSymbols::java_lang_String()) {
        total_strings++;
        out_sig_bt[total_c_args-1] = T_ADDRESS;
      } else if (s == vmSymbols::java_lang_Boolean() ||
                 s == vmSymbols::java_lang_Character() ||
                 s == vmSymbols::java_lang_Byte() ||
                 s == vmSymbols::java_lang_Short() ||
                 s == vmSymbols::java_lang_Integer() ||
                 s == vmSymbols::java_lang_Float()) {
        out_sig_bt[total_c_args-1] = T_INT;
      } else if (s == vmSymbols::java_lang_Long() ||
                 s == vmSymbols::java_lang_Double()) {
        out_sig_bt[total_c_args-1] = T_LONG;
        out_sig_bt[total_c_args++] = T_VOID;
      }
    } else if ( bt == T_LONG || bt == T_DOUBLE ) {
      in_sig_bt[i++] = T_VOID;   // Longs & doubles take 2 Java slots
      out_sig_bt[total_c_args-1] = T_LONG;
      out_sig_bt[total_c_args++] = T_VOID;
    } else if ( bt == T_FLOAT) {
      out_sig_bt[total_c_args-1] = T_INT;
    }
  }
  assert(i==total_args_passed, "validly parsed signature");
  int comp_args_on_stack;
  comp_args_on_stack = SharedRuntime::java_calling_convention(
      in_sig_bt, in_regs, total_args_passed, false);
  int out_arg_slots;
  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  int* string_locs   = NEW_RESOURCE_ARRAY(int, total_strings + 1);
  for (i = 0; i < total_strings ; i++) {
    string_locs[i] = stack_slots;
    stack_slots += max_dtrace_string_size / VMRegImpl::stack_slot_size;
  }
  stack_slots += (Argument::n_int_register_parameters_c +
                  Argument::n_float_register_parameters_c) * 2;
  stack_slots += 4;
  stack_slots = round_to(stack_slots, 4 * VMRegImpl::slots_per_word);
  int stack_size = stack_slots * VMRegImpl::stack_slot_size;
  intptr_t start = (intptr_t)__ pc();
  const Register ic_reg = rax;
  const Register receiver = rcx;
  Label hit;
  Label exception_pending;
  __ verify_oop(receiver);
  __ cmpl(ic_reg, Address(receiver, oopDesc::klass_offset_in_bytes()));
  __ jcc(Assembler::equal, hit);
  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  __ align(8);
  __ bind(hit);
  int vep_offset = ((intptr_t)__ pc()) - start;
  if (UseStackBanging) {
    if (stack_size <= StackShadowPages*os::vm_page_size()) {
      __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
    } else {
      __ movl(rax, stack_size);
      __ bang_stack_size(rax, rbx);
    }
  } else {
    __ fat_nop();
  }
  assert(((uintptr_t)__ pc() - start - vep_offset) >= 5,
         "valid size for make_non_entrant");
  __ enter();
  if (stack_size - 2*wordSize != 0) {
    __ subq(rsp, stack_size - 2*wordSize);
  }
  int frame_complete = ((intptr_t)__ pc()) - start;
  int c_arg, j_arg;
  bool  live[ConcreteRegisterImpl::number_of_registers];
  live[j_rarg0->as_VMReg()->value()] = false;
  live[j_rarg1->as_VMReg()->value()] = false;
  live[j_rarg2->as_VMReg()->value()] = false;
  live[j_rarg3->as_VMReg()->value()] = false;
  live[j_rarg4->as_VMReg()->value()] = false;
  live[j_rarg5->as_VMReg()->value()] = false;
  live[j_farg0->as_VMReg()->value()] = false;
  live[j_farg1->as_VMReg()->value()] = false;
  live[j_farg2->as_VMReg()->value()] = false;
  live[j_farg3->as_VMReg()->value()] = false;
  live[j_farg4->as_VMReg()->value()] = false;
  live[j_farg5->as_VMReg()->value()] = false;
  live[j_farg6->as_VMReg()->value()] = false;
  live[j_farg7->as_VMReg()->value()] = false;
  bool rax_is_zero = false;
  for (j_arg = first_arg_to_pass, c_arg = 0 ;
       j_arg < total_args_passed ; j_arg++, c_arg++ ) {
    VMRegPair src = in_regs[j_arg];
    VMRegPair dst = out_regs[c_arg];
    int src_reg = src.first()->is_reg() ?
                  src.first()->value() :
                  rsp->as_VMReg()->value();
    bool useless =  in_sig_bt[j_arg] == T_ARRAY ||
                    (in_sig_bt[j_arg] == T_OBJECT &&
                     out_sig_bt[c_arg] != T_INT &&
                     out_sig_bt[c_arg] != T_ADDRESS &&
                     out_sig_bt[c_arg] != T_LONG);
    live[src_reg] = !useless;
    if (dst.first()->is_stack()) {
      live[src_reg] = false;
      switch (in_sig_bt[j_arg]) {
        case T_ARRAY:
        case T_OBJECT:
          {
            Address stack_dst(rsp, reg2offset_out(dst.first()));
            if (out_sig_bt[c_arg] == T_INT || out_sig_bt[c_arg] == T_LONG) {
              Register in_reg = rax;
              if ( src.first()->is_reg() ) {
                in_reg = src.first()->as_Register();
              } else {
                __ movq(rax, Address(rbp, reg2offset_in(src.first())));
                rax_is_zero = false;
              }
              Label skipUnbox;
              __ movptr(Address(rsp, reg2offset_out(dst.first())),
                        (int32_t)NULL_WORD);
              __ testq(in_reg, in_reg);
              __ jcc(Assembler::zero, skipUnbox);
              BasicType bt = out_sig_bt[c_arg];
              int box_offset = java_lang_boxing_object::value_offset_in_bytes(bt);
              Address src1(in_reg, box_offset);
              if ( bt == T_LONG ) {
                __ movq(in_reg,  src1);
                __ movq(stack_dst, in_reg);
                assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
                ++c_arg; // skip over T_VOID to keep the loop indices in sync
              } else {
                __ movl(in_reg,  src1);
                __ movl(stack_dst, in_reg);
              }
              __ bind(skipUnbox);
            } else if (out_sig_bt[c_arg] != T_ADDRESS) {
              if (!rax_is_zero) {
                __ xorq(rax, rax);
                rax_is_zero = true;
              }
              __ movq(stack_dst, rax);
            }
          }
          break;
        case T_VOID:
          break;
        case T_FLOAT:
          float_move(masm, src, dst);
          break;
        case T_DOUBLE:
          double_move(masm, src, dst);
          break;
        case T_LONG :
          long_move(masm, src, dst);
          break;
        case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
        default:
          move32_64(masm, src, dst);
      }
    }
  }
  int sid = 0;
  if (total_strings > 0 ) {
    for (j_arg = first_arg_to_pass, c_arg = 0 ;
         j_arg < total_args_passed ; j_arg++, c_arg++ ) {
      VMRegPair src = in_regs[j_arg];
      VMRegPair dst = out_regs[c_arg];
      if (src.first()->is_reg()) {
        Address src_tmp(rbp, fp_offset[src.first()->value()]);
        if (out_sig_bt[c_arg] == T_ADDRESS) {
          Address utf8_addr = Address(
              rsp, string_locs[sid++] * VMRegImpl::stack_slot_size);
          if (sid != 1) {
            __ movq(utf8_addr, src.first()->as_Register());
          }
        } else if (dst.first()->is_reg()) {
          if (in_sig_bt[j_arg] == T_FLOAT || in_sig_bt[j_arg] == T_DOUBLE) {
            XMMRegister f = src.first()->as_XMMRegister();
            if (in_sig_bt[j_arg] == T_FLOAT) {
              __ movflt(src_tmp, f);
            } else {
              __ movdbl(src_tmp, f);
            }
          } else {
            bool useless =  in_sig_bt[j_arg] == T_ARRAY ||
                            (in_sig_bt[j_arg] == T_OBJECT &&
                             out_sig_bt[c_arg] != T_INT &&
                             out_sig_bt[c_arg] != T_LONG);
            if (!useless) {
              __ movq(src_tmp, src.first()->as_Register());
            }
          }
        }
      }
      if (in_sig_bt[j_arg] == T_OBJECT && out_sig_bt[c_arg] == T_LONG) {
        assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
        ++c_arg; // skip over T_VOID to keep the loop indices in sync
      }
    }
    sid = 0;
    for (j_arg = first_arg_to_pass, c_arg = 0 ;
         j_arg < total_args_passed ; j_arg++, c_arg++ ) {
      if (out_sig_bt[c_arg] == T_ADDRESS) {
        Address utf8_addr = Address(
            rsp, string_locs[sid++] * VMRegImpl::stack_slot_size);
        VMReg src = in_regs[j_arg].first();
        int src_off = src->is_reg() ?
            fp_offset[src->value()] : reg2offset_in(src);
        VMRegPair dst = out_regs[c_arg];
        if (src->is_reg()) {
          if (sid == 1) {
            __ movq(c_rarg0, src->as_Register());
          } else {
            __ movq(c_rarg0, utf8_addr);
          }
        } else {
          __ movq(c_rarg0, Address(rbp, reg2offset_in(src)));
        }
        Label done, convert;
        __ testq(c_rarg0, c_rarg0);
        __ jcc(Assembler::notEqual, convert);
        if (dst.first()->is_reg()) {
          __ movq(Address(rbp, src_off), c_rarg0);
        } else {
          __ movq(Address(rsp, reg2offset_out(dst.first())), c_rarg0);
        }
        __ jmp(done);
        __ bind(convert);
        __ lea(c_rarg1, utf8_addr);
        if (dst.first()->is_reg()) {
          __ movq(Address(rbp, src_off), c_rarg1);
        } else {
          __ movq(Address(rsp, reg2offset_out(dst.first())), c_rarg1);
        }
        __ call(RuntimeAddress(
                CAST_FROM_FN_PTR(address, SharedRuntime::get_utf)));
        __ bind(done);
      }
      if (in_sig_bt[j_arg] == T_OBJECT && out_sig_bt[c_arg] == T_LONG) {
        assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
        ++c_arg; // skip over T_VOID to keep the loop indices in sync
      }
    }
    live[c_rarg0->as_VMReg()->value()] = false;
    live[c_rarg1->as_VMReg()->value()] = false;
    live[c_rarg2->as_VMReg()->value()] = false;
    live[c_rarg3->as_VMReg()->value()] = false;
    live[c_rarg4->as_VMReg()->value()] = false;
    live[c_rarg5->as_VMReg()->value()] = false;
    live[c_farg0->as_VMReg()->value()] = false;
    live[c_farg1->as_VMReg()->value()] = false;
    live[c_farg2->as_VMReg()->value()] = false;
    live[c_farg3->as_VMReg()->value()] = false;
    live[c_farg4->as_VMReg()->value()] = false;
    live[c_farg5->as_VMReg()->value()] = false;
    live[c_farg6->as_VMReg()->value()] = false;
    live[c_farg7->as_VMReg()->value()] = false;
  }
  rax_is_zero = false;
  for (j_arg = first_arg_to_pass, c_arg = 0 ;
       j_arg < total_args_passed ; j_arg++, c_arg++ ) {
    VMRegPair src = in_regs[j_arg];
    VMRegPair dst = out_regs[c_arg];
    if (dst.first()->is_reg()) {
      Register r =  dst.first()->as_Register();
      bool useless =  in_sig_bt[j_arg] == T_ARRAY ||
                      (in_sig_bt[j_arg] == T_OBJECT &&
                       out_sig_bt[c_arg] != T_INT &&
                       out_sig_bt[c_arg] != T_ADDRESS &&
                       out_sig_bt[c_arg] != T_LONG);
      if (live[dst.first()->value()]) {
        if (src.first() != dst.first()) {
          __ movq(Address(rbp, fp_offset[dst.first()->value()]), r);
        }
      }
      if (src.first()->is_reg()) {
        if (live[src.first()->value()] ) {
          if (in_sig_bt[j_arg] == T_FLOAT) {
            __ movdl(r, src.first()->as_XMMRegister());
          } else if (in_sig_bt[j_arg] == T_DOUBLE) {
            __ movdq(r, src.first()->as_XMMRegister());
          } else if (r != src.first()->as_Register()) {
            if (!useless) {
              __ movq(r, src.first()->as_Register());
            }
          }
        } else {
          if (!useless) {
            if (in_sig_bt[j_arg] == T_DOUBLE ||
                in_sig_bt[j_arg] == T_LONG  ||
                in_sig_bt[j_arg] == T_OBJECT ) {
              __ movq(r, Address(rbp, fp_offset[src.first()->value()]));
            } else {
              __ movl(r, Address(rbp, fp_offset[src.first()->value()]));
            }
          }
        }
        live[src.first()->value()] = false;
      } else if (!useless) {
        __ movq(r, Address(rbp, reg2offset_in(src.first())));
      }
      if (in_sig_bt[j_arg] == T_ARRAY || in_sig_bt[j_arg] == T_OBJECT ) {
        if (out_sig_bt[c_arg] == T_INT || out_sig_bt[c_arg] == T_LONG) {
          Label skip;
          __ testq(r, r);
          __ jcc(Assembler::equal, skip);
          BasicType bt = out_sig_bt[c_arg];
          int box_offset = java_lang_boxing_object::value_offset_in_bytes(bt);
          Address src1(r, box_offset);
          if ( bt == T_LONG ) {
            __ movq(r, src1);
          } else {
            __ movl(r, src1);
          }
          __ bind(skip);
        } else if (out_sig_bt[c_arg] != T_ADDRESS) {
          __ xorq(r, r);
        }
      }
      live[dst.first()->value()] = false;
    }
    if (in_sig_bt[j_arg] == T_OBJECT && out_sig_bt[c_arg] == T_LONG) {
      assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
      ++c_arg; // skip over T_VOID to keep the loop indices in sync
    }
  }
  int patch_offset = ((intptr_t)__ pc()) - start;
  __ nop();
  __ leave();
  __ ret(0);
  __ flush();
  nmethod *nm = nmethod::new_dtrace_nmethod(
      method, masm->code(), vep_offset, patch_offset, frame_complete,
      stack_slots / VMRegImpl::slots_per_word);
  return nm;
}
#endif // HAVE_DTRACE_H
int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
  return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
}
uint SharedRuntime::out_preserve_stack_slots() {
  return 0;
}
void SharedRuntime::generate_deopt_blob() {
  ResourceMark rm;
  CodeBuffer buffer("deopt_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);
  int frame_size_in_words;
  OopMap* map = NULL;
  OopMapSet *oop_maps = new OopMapSet();
  address start = __ pc();
  Label cont;
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
  __ jmp(cont);
  int reexecute_offset = __ pc() - start;
  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
  __ jmp(cont);
  int exception_offset = __ pc() - start;
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  int exception_in_tls_offset = __ pc() - start;
  __ push(0);
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(rbp, wordSize), rdx);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
#ifdef ASSERT
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  __ verify_oop(rax);
  Label no_pending_exception;
  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ testptr(rax, rax);
  __ jcc(Assembler::zero, no_pending_exception);
  __ stop("must not have pending exception here");
  __ bind(no_pending_exception);
#endif
  __ bind(cont);
  __ set_last_Java_frame(noreg, noreg, NULL);
#ifdef ASSERT
  { Label L;
    __ cmpptr(Address(r15_thread,
                    JavaThread::last_Java_fp_offset()),
            (int32_t)0);
    __ jcc(Assembler::equal, L);
    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
    __ bind(L);
  }
#endif // ASSERT
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
  oop_maps->add_gc_map(__ pc() - start, map);
  __ reset_last_Java_frame(false);
  __ mov(rdi, rax);
   Label noException;
  __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
  __ jcc(Assembler::notEqual, noException);
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
  __ verify_oop(rax);
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
  __ bind(noException);
  RegisterSaver::restore_result_registers(masm);
  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
  __ addptr(rsp, rcx);
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
#ifdef ASSERT
  if (UseStackBanging) {
    __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
    __ bang_stack_size(rbx, rcx);
  }
#endif
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
  __ addptr(rsp, wordSize);
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
  const Register sender_sp = r8;
  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       caller_adjustment_offset_in_bytes()));
  __ subptr(rsp, rbx);
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0));      // Load frame size
#ifdef CC_INTERP
  __ subptr(rbx, 4*wordSize);           // we'll push pc and ebp by hand and
#ifdef ASSERT
  __ push(0xDEADDEAD);                  // Make a recognizable pattern
  __ push(0xDEADDEAD);
#else /* ASSERT */
  __ subptr(rsp, 2*wordSize);           // skip the "static long no_param"
#endif /* ASSERT */
#else
  __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
#endif // CC_INTERP
  __ pushptr(Address(rcx, 0));          // Save return address
  __ enter();                           // Save old & set new ebp
  __ subptr(rsp, rbx);                  // Prolog
#ifdef CC_INTERP
  __ movptr(Address(rbp,
                  -(sizeof(BytecodeInterpreter)) + in_bytes(byte_offset_of(BytecodeInterpreter, _sender_sp))),
            sender_sp); // Make it walkable
#else /* CC_INTERP */
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
#endif /* CC_INTERP */
  __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
  __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
  __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
  __ decrementl(rdx);                   // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0));          // Save final return address
  __ enter();                           // Save old & set new ebp
  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc);
  __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, r14); // second arg: exec_mode
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
  __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
  oop_maps->add_gc_map(the_pc - start,
                       new OopMap( frame_size_in_words, 0 ));
  __ reset_last_Java_frame(true);
  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
  __ leave();                           // Epilog
  __ ret(0);
  masm->flush();
  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
}
#ifdef COMPILER2
void SharedRuntime::generate_uncommon_trap_blob() {
  ResourceMark rm;
  CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);
  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
  address start = __ pc();
  if (UseRTMLocking) {
    __ xabort(0);
  }
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog!
  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
  __ movl(c_rarg1, j_rarg0);
  __ set_last_Java_frame(noreg, noreg, NULL);
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
  oop_maps->add_gc_map(__ pc() - start, map);
  __ reset_last_Java_frame(false);
  __ mov(rdi, rax);
  __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
  __ movl(rcx, Address(rdi,
                       Deoptimization::UnrollBlock::
                       size_of_deoptimized_frame_offset_in_bytes()));
  __ addptr(rsp, rcx);
  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
#ifdef ASSERT
  if (UseStackBanging) {
    __ movl(rbx, Address(rdi ,Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
    __ bang_stack_size(rbx, rcx);
  }
#endif
  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
  __ addptr(rsp, wordSize);
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock:: frame_sizes_offset_in_bytes()));
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock:: number_of_frames_offset_in_bytes())); // (int)
  const Register sender_sp = r8;
  __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock:: caller_adjustment_offset_in_bytes())); // (int)
  __ subptr(rsp, rbx);
  Label loop;
  __ bind(loop);
  __ movptr(rbx, Address(rsi, 0)); // Load frame size
  __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
  __ pushptr(Address(rcx, 0));     // Save return address
  __ enter();                      // Save old & set new rbp
  __ subptr(rsp, rbx);             // Prolog
#ifdef CC_INTERP
  __ movptr(Address(rbp,
                  -(sizeof(BytecodeInterpreter)) + in_bytes(byte_offset_of(BytecodeInterpreter, _sender_sp))),
            sender_sp); // Make it walkable
#else // CC_INTERP
  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
            sender_sp);            // Make it walkable
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
#endif // CC_INTERP
  __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
  __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
  __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
  __ decrementl(rdx);              // Decrement counter
  __ jcc(Assembler::notZero, loop);
  __ pushptr(Address(rcx, 0));     // Save final return address
  __ enter();                 // Save old & set new rbp
  __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, rbp, the_pc);
  __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
  __ mov(c_rarg0, r15_thread);
  __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
  __ reset_last_Java_frame(true);
  __ leave();                 // Epilog
  __ ret(0);
  masm->flush();
  _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
                                                 SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
  assert(StubRoutines::forward_exception_entry() != NULL,
         "must be generated before");
  ResourceMark rm;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map;
  CodeBuffer buffer("handler_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);
  address start   = __ pc();
  address call_pc = NULL;
  int frame_size_in_words;
  bool cause_return = (poll_type == POLL_AT_RETURN);
  bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
  if (UseRTMLocking) {
    __ xabort(0);
  }
  if (!cause_return) {
    __ push(rbx);
  }
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
  __ set_last_Java_frame(noreg, noreg, NULL);
  if (!cause_return) {
    __ movptr(c_rarg0, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
    __ movptr(Address(rbp, wordSize), c_rarg0);
  }
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(call_ptr));
  oop_maps->add_gc_map( __ pc() - start, map);
  Label noException;
  __ reset_last_Java_frame(false);
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
  __ jcc(Assembler::equal, noException);
  RegisterSaver::restore_live_registers(masm, save_vectors);
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  __ bind(noException);
  RegisterSaver::restore_live_registers(masm, save_vectors);
  __ ret(0);
  masm->flush();
  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
}
RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
  ResourceMark rm;
  CodeBuffer buffer(name, 1000, 512);
  MacroAssembler* masm                = new MacroAssembler(&buffer);
  int frame_size_in_words;
  OopMapSet *oop_maps = new OopMapSet();
  OopMap* map = NULL;
  int start = __ offset();
  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
  int frame_complete = __ offset();
  __ set_last_Java_frame(noreg, noreg, NULL);
  __ mov(c_rarg0, r15_thread);
  __ call(RuntimeAddress(destination));
  oop_maps->add_gc_map( __ offset() - start, map);
  __ reset_last_Java_frame(false);
  Label pending;
  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
  __ jcc(Assembler::notEqual, pending);
  __ get_vm_result_2(rbx, r15_thread);
  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  RegisterSaver::restore_live_registers(masm);
  __ jmp(rax);
  __ bind(pending);
  RegisterSaver::restore_live_registers(masm);
  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  masm->flush();
  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}
#ifndef _WINDOWS
#define ASM_SUBTRACT
#ifdef ASM_SUBTRACT
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
  long i = 0, cnt = len;
  unsigned long tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}
#else // ASM_SUBTRACT
typedef int __attribute__((mode(TI))) int128;
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
  int128 tmp = 0;
  int i;
  for (i = 0; i < len; i++) {
    tmp += a[i];
    tmp -= b[i];
    a[i] = tmp;
    tmp >>= 64;
    assert(-1 <= tmp && tmp <= 0, "invariant");
  }
  return tmp + carry;
}
#endif // ! ASM_SUBTRACT
#define MACC(A, B, T0, T1, T2)                                      \
do {                                                                \
  unsigned long hi, lo;                                             \
  asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)      \
           : "r"(A), "a"(B) : "cc");                                \
 } while(0)
#define MACC2(A, B, T0, T1, T2)                                     \
do {                                                                \
  unsigned long hi, lo;                                             \
  asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4;"  \
           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"               \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)      \
           : "r"(A), "a"(B) : "cc");                                \
 } while(0)
static void __attribute__((noinline))
montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
                    unsigned long m[], unsigned long inv, int len) {
  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;
  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);
    assert(t0 == 0, "broken Montgomery multiply");
    t0 = t1; t1 = t2; t2 = 0;
  }
  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }
  while (t0)
    t0 = sub(m, n, t0, len);
}
static void __attribute__((noinline))
montgomery_square(unsigned long a[], unsigned long n[],
                  unsigned long m[], unsigned long inv, int len) {
  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;
  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);
    assert(t0 == 0, "broken Montgomery square");
    t0 = t1; t1 = t2; t2 = 0;
  }
  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }
  while (t0)
    t0 = sub(m, n, t0, len);
}
static unsigned long swap(unsigned long x) {
  return (x << 32) | (x >> 32);
}
static void reverse_words(unsigned long *s, unsigned long *d, int len) {
  d += len;
  while(len-- > 0) {
    d--;
    s++;
  }
}
#define MONTGOMERY_SQUARING_THRESHOLD 64
void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;
  int total_allocation = longwords * sizeof (unsigned long) * 4;
  guarantee(total_allocation <= 8192, "must be");
  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
  unsigned long
  reverse_words((unsigned long *)a_ints, a, longwords);
  reverse_words((unsigned long *)b_ints, b, longwords);
  reverse_words((unsigned long *)n_ints, n, longwords);
  ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
  reverse_words(m, (unsigned long *)m_ints, longwords);
}
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;
  int total_allocation = longwords * sizeof (unsigned long) * 3;
  guarantee(total_allocation <= 8192, "must be");
  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
  unsigned long
  reverse_words((unsigned long *)a_ints, a, longwords);
  reverse_words((unsigned long *)n_ints, n, longwords);
#ifndef SOLARIS
  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
#else
  if (0) {
#endif
    ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
  } else {
    ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
  }
  reverse_words(m, (unsigned long *)m_ints, longwords);
}
#endif // WINDOWS
#ifdef COMPILER2
void OptoRuntime::generate_exception_blob() {
  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
  ResourceMark rm;
  CodeBuffer buffer("exception_blob", 2048, 1024);
  MacroAssembler* masm = new MacroAssembler(&buffer);
  address start = __ pc();
  __ push(rdx);
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  address the_pc = __ pc();
  __ set_last_Java_frame(noreg, noreg, the_pc);
  __ mov(c_rarg0, r15_thread);
  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
  OopMapSet* oop_maps = new OopMapSet();
  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
  __ reset_last_Java_frame(false);
  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
  __ pop(rdx);                  // No need for exception pc anymore
  __ mov(r8, rax);
  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
#ifdef ASSERT
  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
#endif
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
  __ jmp(r8);
  masm->flush();
  _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
}
#endif // COMPILER2
C:\hotspot-69087d08d473\src\cpu\x86\vm/stubGenerator_x86_32.cpp
#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_x86.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#define __ _masm->
#define a__ ((Assembler*)_masm)->
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
const int MXCSR_MASK  = 0xFFC0;  // Mask out any pending exceptions
const int FPU_CNTRL_WRD_MASK = 0xFFFF;
static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc  = thread->saved_exception_pc();
  address npc = Assembler::locate_next_instruction(pc);
  thread->set_pending_unsafe_access_error();
  return npc;
}
class StubGenerator: public StubCodeGenerator {
 private:
#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif //PRODUCT
  void inc_copy_counter_np(BasicType t) {
#ifndef PRODUCT
    switch (t) {
    case T_BYTE:    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
    case T_SHORT:   inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
    case T_INT:     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
    case T_LONG:    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
    case T_OBJECT:  inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
    }
    ShouldNotReachHere();
#endif //PRODUCT
  }
  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();
    assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
    bool  sse_save = false;
    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
    const int     locals_count_in_bytes  (4*wordSize);
    const Address mxcsr_save    (rbp, -4 * wordSize);
    const Address saved_rbx     (rbp, -3 * wordSize);
    const Address saved_rsi     (rbp, -2 * wordSize);
    const Address saved_rdi     (rbp, -1 * wordSize);
    const Address result        (rbp,  3 * wordSize);
    const Address result_type   (rbp,  4 * wordSize);
    const Address method        (rbp,  5 * wordSize);
    const Address entry_point   (rbp,  6 * wordSize);
    const Address parameters    (rbp,  7 * wordSize);
    const Address parameter_size(rbp,  8 * wordSize);
    const Address thread        (rbp,  9 * wordSize); // same as in generate_catch_exception()!
    sse_save =  UseSSE > 0;
    __ enter();
    __ movptr(rcx, parameter_size);              // parameter counter
    __ shlptr(rcx, Interpreter::logStackElementSize); // convert parameter count to bytes
    __ addptr(rcx, locals_count_in_bytes);       // reserve space for register saves
    __ subptr(rsp, rcx);
    __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ movptr(saved_rbx, rbx);
    if (sse_save) {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }
    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
#ifdef ASSERT
    { Label L;
      __ movptr(rcx, thread);
      __ cmpptr(Address(rcx, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(rcx, parameter_size);  // parameter counter
    __ testl(rcx, rcx);
    __ jcc(Assembler::zero, parameters_done);
    Label loop;
    __ movptr(rdx, parameters);          // parameter pointer
    __ xorptr(rbx, rbx);
    __ BIND(loop);
    __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
    __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
                    Interpreter::expr_offset_in_bytes(0)), rax);          // store parameter
    __ increment(rbx);
    __ decrement(rcx);
    __ jcc(Assembler::notZero, loop);
    __ BIND(parameters_done);
    __ movptr(rbx, method);           // get Method*
    __ movptr(rax, entry_point);      // get entry_point
    __ mov(rsi, rsp);                 // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(rax);
    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();
#ifdef COMPILER2
    {
      Label L_skip;
      if (UseSSE >= 2) {
        __ verify_FPU(0, "call_stub_return");
      } else {
        for (int i = 1; i < 8; i++) {
          __ ffree(i);
        }
        __ movl(rsi, result_type);
        __ cmpl(rsi, T_DOUBLE);
        __ jcc(Assembler::equal, L_skip);
        if (UseSSE == 0) {
          __ cmpl(rsi, T_FLOAT);
          __ jcc(Assembler::equal, L_skip);
        }
        __ ffree(0);
      }
      __ BIND(L_skip);
    }
#endif // COMPILER2
    __ movptr(rdi, result);
    Label is_long, is_float, is_double, exit;
    __ movl(rsi, result_type);
    __ cmpl(rsi, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(rsi, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(rsi, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);
    __ movl(Address(rdi, 0), rax);
    __ BIND(exit);
    __ verify_FPU(0, "generate_call_stub");
    __ lea(rsp, rsp_after_call);
    if (sse_save) {
      __ ldmxcsr(mxcsr_save);
    }
    __ movptr(rbx, saved_rbx);
    __ movptr(rsi, saved_rsi);
    __ movptr(rdi, saved_rdi);
    __ addptr(rsp, 4*wordSize);
    __ pop(rbp);
    __ ret(0);
    __ BIND(is_long);
    __ movl(Address(rdi, 0 * wordSize), rax);
    __ movl(Address(rdi, 1 * wordSize), rdx);
    __ jmp(exit);
    __ BIND(is_float);
    if (UseSSE >= 1) {
      __ movflt(Address(rdi, 0), xmm0);
    } else {
      __ fstp_s(Address(rdi, 0));
    }
    __ jmp(exit);
    __ BIND(is_double);
    if (UseSSE >= 2) {
      __ movdbl(Address(rdi, 0), xmm0);
    } else {
      __ fstp_d(Address(rdi, 0));
    }
    __ jmp(exit);
    return start;
  }
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()!
    const Address thread        (rbp,  9 * wordSize); // same as in generate_call_stub()!
    address start = __ pc();
    __ movptr(rcx, thread);
#ifdef ASSERT
    { Label L;
      __ get_thread(rbx);
      __ cmpptr(rbx, rcx);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif
    __ verify_oop(rax);
    __ movptr(Address(rcx, Thread::pending_exception_offset()), rax          );
    __ lea(Address(rcx, Thread::exception_file_offset   ()),
           ExternalAddress((address)__FILE__));
    __ movl(Address(rcx, Thread::exception_line_offset   ()), __LINE__ );
    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
    return start;
  }
  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();
    const Register thread = rcx;
    const Register exception_oop = rax;
    const Register handler_addr  = rbx;
    const Register exception_pc  = rdx;
#ifdef ASSERT
    { Label L;
      __ get_thread(thread);
      __ cmpptr(Address(thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif
    __ get_thread(thread);
    __ movptr(exception_pc, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, exception_pc);
    __ mov(handler_addr, rax);
    __ get_thread(thread);
    __ pop(exception_pc);
    __ movptr(exception_oop, Address(thread, Thread::pending_exception_offset()));
    __ movptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);
#ifdef ASSERT
    { Label L;
      __ testptr(exception_oop, exception_oop);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    __ verify_oop(exception_oop);
    __ jmp(handler_addr);
    return start;
  }
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();
    __ push(rdx);
    Address exchange(rsp, 2 * wordSize);
    Address dest_addr(rsp, 3 * wordSize);
    __ movl(rax, exchange);
    __ movptr(rdx, dest_addr);
    __ xchgl(rax, Address(rdx, 0));
    __ pop(rdx);
    __ ret(0);
    return start;
  }
  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();
    const Address mxcsr_save(rsp, 0);
    if (CheckJNICalls && UseSSE > 0 ) {
      Label ok_ret;
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ push(rax);
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, ok_ret);
      __ warn("MXCSR changed by native JNI code.");
      __ ldmxcsr(mxcsr_std);
      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }
    __ ret(0);
    return start;
  }
  address generate_verify_fpu_cntrl_wrd() {
    StubCodeMark mark(this, "StubRoutines", "verify_spcw");
    address start = __ pc();
    const Address fpu_cntrl_wrd_save(rsp, 0);
    if (CheckJNICalls) {
      Label ok_ret;
      __ push(rax);
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ fnstcw(fpu_cntrl_wrd_save);
      __ movl(rax, fpu_cntrl_wrd_save);
      __ andl(rax, FPU_CNTRL_WRD_MASK);
      ExternalAddress fpu_std(StubRoutines::addr_fpu_cntrl_wrd_std());
      __ cmp32(rax, fpu_std);
      __ jcc(Assembler::equal, ok_ret);
      __ warn("Floating point control word changed by native JNI code.");
      __ fldcw(fpu_std);
      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }
    __ ret(0);
    return start;
  }
  address generate_d2i_wrapper(BasicType t, address fcn) {
    StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
    address start = __ pc();
  enum layout { FPUState_off         = 0,
                rbp_off              = FPUStateSizeInWords,
                rdi_off,
                rsi_off,
                rcx_off,
                rbx_off,
                saved_argument_off,
                saved_argument_off2, // 2nd half of double
                framesize
  };
  assert(FPUStateSizeInWords == 27, "update stack layout");
    __ subptr(rsp, wordSize * 2);
    __ fstp_d(Address(rsp, 0));
    __ push(rbx);
    __ push(rcx);
    __ push(rsi);
    __ push(rdi);
    __ push(rbp);
    __ push_FPU_state();
    __ fld_d(Address(rsp, saved_argument_off * wordSize));
    __ subptr(rsp, wordSize*2);
    __ fst_d(Address(rsp, 0));
    __ empty_FPU_stack();
    if (t == T_INT)
      { BLOCK_COMMENT("SharedRuntime::d2i"); }
    else if (t == T_LONG)
      { BLOCK_COMMENT("SharedRuntime::d2l"); }
    __ call_VM_leaf( fcn, 2 );
    __ pop_FPU_state();
    __ pop(rbp);
    __ pop(rdi);
    __ pop(rsi);
    __ pop(rcx);
    __ pop(rbx);
    __ addptr(rsp, wordSize * 2);
    __ ret(0);
    return start;
  }
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();
    __ push(0);                       // hole for return address-to-be
    __ pusha();                       // push registers
    Address next_pc(rsp, RegisterImpl::number_of_registers * BytesPerWord);
    BLOCK_COMMENT("call handle_unsafe_access");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, handle_unsafe_access)));
    __ movptr(next_pc, rax);          // stuff next address
    __ popa();
    __ ret(0);                        // jump to next address
    return start;
  }
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();
    Label exit, error;
    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ push(rdx);                                // save rdx
    __ movptr(rax, Address(rsp, 4 * wordSize));    // get object
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit);               // if obj is NULL it is ok
    const int oop_mask = Universe::verify_oop_mask();
    const int oop_bits = Universe::verify_oop_bits();
    __ mov(rdx, rax);
    __ andptr(rdx, oop_mask);
    __ cmpptr(rdx, oop_bits);
    __ jcc(Assembler::notZero, error);
    __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error);              // if klass is NULL it is broken
    __ bind(exit);
    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
    __ pop(rdx);                                 // restore rdx
    __ popf();                                   // restore EFLAGS
    __ ret(3 * wordSize);                        // pop arguments
    __ bind(error);
    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
    __ pop(rdx);                                 // get saved rdx back
    __ popf();                                   // get saved EFLAGS off stack -- will be ignored
    __ pusha();                                  // push registers (eip = return address & msg are already pushed)
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
    __ popa();
    __ ret(3 * wordSize);                        // pop arguments
    return start;
  }
  void  gen_write_ref_array_pre_barrier(Register start, Register count, bool uninitialized_target) {
    assert_different_registers(start, count);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        if (!uninitialized_target) {
           __ pusha();                      // push registers
           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre),
                           start, count);
           __ popa();
         }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default      :
        ShouldNotReachHere();
    }
  }
  void  gen_write_ref_array_post_barrier(Register start, Register count) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    assert_different_registers(start, count);
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        {
          __ pusha();                      // push registers
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post),
                          start, count);
          __ popa();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
          Label L_loop;
          const Register end = count;  // elements count; end == start+count-1
          assert_different_registers(start, end);
          __ lea(end,  Address(start, count, Address::times_ptr, -wordSize));
          __ shrptr(start, CardTableModRefBS::card_shift);
          __ shrptr(end,   CardTableModRefBS::card_shift);
          __ subptr(end, start); // end --> count
        __ BIND(L_loop);
          intptr_t disp = (intptr_t) ct->byte_map_base;
          Address cardtable(start, count, Address::times_1, disp);
          __ movb(cardtable, 0);
          __ decrement(count);
          __ jcc(Assembler::greaterEqual, L_loop);
        }
        break;
      case BarrierSet::ModRef:
        break;
      default      :
        ShouldNotReachHere();
    }
  }
  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
    assert( UseSSE >= 2, "supported cpu only" );
    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
    __ jmpb(L_copy_64_bytes);
    __ align(OptoLoopAlignment);
  __ BIND(L_copy_64_bytes_loop);
    if (UseUnalignedLoadStores) {
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from,  0));
        __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
        __ vmovdqu(xmm1, Address(from, 32));
        __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
      } else {
        __ movdqu(xmm0, Address(from, 0));
        __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
        __ movdqu(xmm1, Address(from, 16));
        __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
        __ movdqu(xmm2, Address(from, 32));
        __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
        __ movdqu(xmm3, Address(from, 48));
        __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
      }
    } else {
      __ movq(xmm0, Address(from, 0));
      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
      __ movq(xmm1, Address(from, 8));
      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
      __ movq(xmm2, Address(from, 16));
      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
      __ movq(xmm3, Address(from, 24));
      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
      __ movq(xmm4, Address(from, 32));
      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
      __ movq(xmm5, Address(from, 40));
      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
      __ movq(xmm6, Address(from, 48));
      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
      __ movq(xmm7, Address(from, 56));
      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
    }
    __ addl(from, 64);
  __ BIND(L_copy_64_bytes);
    __ subl(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
    if (UseUnalignedLoadStores && (UseAVX >= 2)) {
      __ vpxor(xmm0, xmm0);
      __ vpxor(xmm1, xmm1);
    }
    __ addl(qword_count, 8);
    __ jccb(Assembler::zero, L_exit);
  __ BIND(L_copy_8_bytes);
    __ movq(xmm0, Address(from, 0));
    __ movq(Address(from, to_from, Address::times_1), xmm0);
    __ addl(from, 8);
    __ decrement(qword_count);
    __ jcc(Assembler::greater, L_copy_8_bytes);
  __ BIND(L_exit);
  }
  void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
    assert( VM_Version::supports_mmx(), "supported cpu only" );
    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
    __ jmpb(L_copy_64_bytes);
    __ align(OptoLoopAlignment);
  __ BIND(L_copy_64_bytes_loop);
    __ movq(mmx0, Address(from, 0));
    __ movq(mmx1, Address(from, 8));
    __ movq(mmx2, Address(from, 16));
    __ movq(Address(from, to_from, Address::times_1, 0), mmx0);
    __ movq(mmx3, Address(from, 24));
    __ movq(Address(from, to_from, Address::times_1, 8), mmx1);
    __ movq(mmx4, Address(from, 32));
    __ movq(Address(from, to_from, Address::times_1, 16), mmx2);
    __ movq(mmx5, Address(from, 40));
    __ movq(Address(from, to_from, Address::times_1, 24), mmx3);
    __ movq(mmx6, Address(from, 48));
    __ movq(Address(from, to_from, Address::times_1, 32), mmx4);
    __ movq(mmx7, Address(from, 56));
    __ movq(Address(from, to_from, Address::times_1, 40), mmx5);
    __ movq(Address(from, to_from, Address::times_1, 48), mmx6);
    __ movq(Address(from, to_from, Address::times_1, 56), mmx7);
    __ addptr(from, 64);
  __ BIND(L_copy_64_bytes);
    __ subl(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
    __ addl(qword_count, 8);
    __ jccb(Assembler::zero, L_exit);
  __ BIND(L_copy_8_bytes);
    __ movq(mmx0, Address(from, 0));
    __ movq(Address(from, to_from, Address::times_1), mmx0);
    __ addptr(from, 8);
    __ decrement(qword_count);
    __ jcc(Assembler::greater, L_copy_8_bytes);
  __ BIND(L_exit);
    __ emms();
  }
  address generate_disjoint_copy(BasicType t, bool aligned,
                                 Address::ScaleFactor sf,
                                 address* entry, const char *name,
                                 bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;
    int shift = Address::times_ptr - sf;
    const Register from     = rsi;  // source array address
    const Register to       = rdi;  // destination array address
    const Register count    = rcx;  // elements count
    const Register to_from  = to;   // (to - from)
    const Register saved_to = rdx;  // saved destination array address
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ movptr(from , Address(rsp, 12+ 4));
    __ movptr(to   , Address(rsp, 12+ 8));
    __ movl(count, Address(rsp, 12+ 12));
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    if (t == T_OBJECT) {
      __ testl(count, count);
      __ jcc(Assembler::zero, L_0_count);
      gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
      __ mov(saved_to, to);          // save 'to'
    }
    __ subptr(to, from); // to --> to_from
    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
      if (t == T_BYTE) {
        __ testl(from, 1);
        __ jccb(Assembler::zero, L_skip_align1);
        __ movb(rax, Address(from, 0));
        __ movb(Address(from, to_from, Address::times_1, 0), rax);
        __ increment(from);
        __ decrement(count);
      __ BIND(L_skip_align1);
      }
      __ testl(from, 2);
      __ jccb(Assembler::zero, L_skip_align2);
      __ movw(rax, Address(from, 0));
      __ movw(Address(from, to_from, Address::times_1, 0), rax);
      __ addptr(from, 2);
      __ subl(count, 1<<(shift-1));
    __ BIND(L_skip_align2);
    }
    if (!VM_Version::supports_mmx()) {
      __ mov(rax, count);      // save 'count'
      __ shrl(count, shift); // bytes count
      __ addptr(to_from, from);// restore 'to'
      __ rep_mov();
      __ subptr(to_from, from);// restore 'to_from'
      __ mov(count, rax);      // restore 'count'
      __ jmpb(L_copy_2_bytes); // all dwords were copied
    } else {
      if (!UseUnalignedLoadStores) {
        __ testptr(from, 4);
        __ jccb(Assembler::zero, L_copy_64_bytes);
        __ movl(rax, Address(from, 0));
        __ movl(Address(from, to_from, Address::times_1, 0), rax);
        __ addptr(from, 4);
        __ subl(count, 1<<shift);
      }
    __ BIND(L_copy_64_bytes);
      __ mov(rax, count);
      __ shrl(rax, shift+1);  // 8 bytes chunk count
      if (UseXMMForArrayCopy) {
        xmm_copy_forward(from, to_from, rax);
      } else {
        mmx_copy_forward(from, to_from, rax);
      }
    }
  __ BIND(L_copy_4_bytes);
    __ testl(count, 1<<shift);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(from, 0));
    __ movl(Address(from, to_from, Address::times_1, 0), rax);
    if (t == T_BYTE || t == T_SHORT) {
      __ addptr(from, 4);
    __ BIND(L_copy_2_bytes);
      __ testl(count, 1<<(shift-1));
      __ jccb(Assembler::zero, L_copy_byte);
      __ movw(rax, Address(from, 0));
      __ movw(Address(from, to_from, Address::times_1, 0), rax);
      if (t == T_BYTE) {
        __ addptr(from, 2);
      __ BIND(L_copy_byte);
        __ testl(count, 1);
        __ jccb(Assembler::zero, L_exit);
        __ movb(rax, Address(from, 0));
        __ movb(Address(from, to_from, Address::times_1, 0), rax);
      __ BIND(L_exit);
      } else {
      __ BIND(L_copy_byte);
      }
    } else {
    __ BIND(L_copy_2_bytes);
    }
    if (t == T_OBJECT) {
      __ movl(count, Address(rsp, 12+12)); // reread 'count'
      __ mov(to, saved_to); // restore 'to'
      gen_write_ref_array_post_barrier(to, count);
    __ BIND(L_0_count);
    }
    inc_copy_counter_np(t);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    BLOCK_COMMENT("Entry:");
    const Register to       = rdi;  // source array address
    const Register value    = rdx;  // value
    const Register count    = rsi;  // elements count
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ movptr(to   , Address(rsp, 12+ 4));
    __ movl(value, Address(rsp, 12+ 8));
    __ movl(count, Address(rsp, 12+ 12));
    __ generate_fill(t, aligned, to, value, count, rax, xmm0);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  address generate_conjoint_copy(BasicType t, bool aligned,
                                 Address::ScaleFactor sf,
                                 address nooverlap_target,
                                 address* entry, const char *name,
                                 bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;
    int shift = Address::times_ptr - sf;
    const Register src   = rax;  // source array address
    const Register dst   = rdx;  // destination array address
    const Register from  = rsi;  // source array address
    const Register to    = rdi;  // destination array address
    const Register count = rcx;  // elements count
    const Register end   = rax;  // array end address
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ movptr(src  , Address(rsp, 12+ 4));   // from
    __ movptr(dst  , Address(rsp, 12+ 8));   // to
    __ movl2ptr(count, Address(rsp, 12+12)); // count
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    __ mov(from, src);
    __ mov(to  , dst);
    RuntimeAddress nooverlap(nooverlap_target);
    __ cmpptr(dst, src);
    __ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
    __ jump_cc(Assembler::belowEqual, nooverlap);
    __ cmpptr(dst, end);
    __ jump_cc(Assembler::aboveEqual, nooverlap);
    if (t == T_OBJECT) {
      __ testl(count, count);
      __ jcc(Assembler::zero, L_0_count);
      gen_write_ref_array_pre_barrier(dst, count, dest_uninitialized);
    }
    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
    if (t == T_BYTE || t == T_SHORT) {
      __ lea(end, Address(dst, count, sf, 0));
      if (t == T_BYTE) {
        __ testl(end, 1);
        __ jccb(Assembler::zero, L_skip_align1);
        __ decrement(count);
        __ movb(rdx, Address(from, count, sf, 0));
        __ movb(Address(to, count, sf, 0), rdx);
      __ BIND(L_skip_align1);
      }
      __ testl(end, 2);
      __ jccb(Assembler::zero, L_skip_align2);
      __ subptr(count, 1<<(shift-1));
      __ movw(rdx, Address(from, count, sf, 0));
      __ movw(Address(to, count, sf, 0), rdx);
    __ BIND(L_skip_align2);
      __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
      __ jcc(Assembler::below, L_copy_4_bytes);
    }
    if (!VM_Version::supports_mmx()) {
      __ std();
      __ mov(rax, count); // Save 'count'
      __ mov(rdx, to);    // Save 'to'
      __ lea(rsi, Address(from, count, sf, -4));
      __ lea(rdi, Address(to  , count, sf, -4));
      __ shrptr(count, shift); // bytes count
      __ rep_mov();
      __ cld();
      __ mov(count, rax); // restore 'count'
      __ andl(count, (1<<shift)-1);      // mask the number of rest elements
      __ movptr(from, Address(rsp, 12+4)); // reread 'from'
      __ mov(to, rdx);   // restore 'to'
      __ jmpb(L_copy_2_bytes); // all dword were copied
   } else {
      __ testptr(end, 4);
      __ jccb(Assembler::zero, L_copy_8_bytes);
      __ subl(count, 1<<shift);
      __ movl(rdx, Address(from, count, sf, 0));
      __ movl(Address(to, count, sf, 0), rdx);
      __ jmpb(L_copy_8_bytes);
      __ align(OptoLoopAlignment);
    __ BIND(L_copy_8_bytes_loop);
      if (UseXMMForArrayCopy) {
        __ movq(xmm0, Address(from, count, sf, 0));
        __ movq(Address(to, count, sf, 0), xmm0);
      } else {
        __ movq(mmx0, Address(from, count, sf, 0));
        __ movq(Address(to, count, sf, 0), mmx0);
      }
    __ BIND(L_copy_8_bytes);
      __ subl(count, 2<<shift);
      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
      __ addl(count, 2<<shift);
      if (!UseXMMForArrayCopy) {
        __ emms();
      }
    }
  __ BIND(L_copy_4_bytes);
    __ testl(count, 1<<shift);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rdx, Address(from, count, sf, -4));
    __ movl(Address(to, count, sf, -4), rdx);
    if (t == T_BYTE || t == T_SHORT) {
        __ subl(count, (1<<shift));
      __ BIND(L_copy_2_bytes);
        __ testl(count, 1<<(shift-1));
        __ jccb(Assembler::zero, L_copy_byte);
        __ movw(rdx, Address(from, count, sf, -2));
        __ movw(Address(to, count, sf, -2), rdx);
        if (t == T_BYTE) {
          __ subl(count, 1<<(shift-1));
        __ BIND(L_copy_byte);
          __ testl(count, 1);
          __ jccb(Assembler::zero, L_exit);
          __ movb(rdx, Address(from, 0));
          __ movb(Address(to, 0), rdx);
        __ BIND(L_exit);
        } else {
        __ BIND(L_copy_byte);
        }
    } else {
    __ BIND(L_copy_2_bytes);
    }
    if (t == T_OBJECT) {
      __ movl2ptr(count, Address(rsp, 12+12)); // reread count
      gen_write_ref_array_post_barrier(to, count);
    __ BIND(L_0_count);
    }
    inc_copy_counter_np(t);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }
  address generate_disjoint_long_copy(address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_copy_8_bytes, L_copy_8_bytes_loop;
    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count
    const Register to_from    = rdx;  // (to - from)
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ movptr(from , Address(rsp, 8+0));       // from
    __ movptr(to   , Address(rsp, 8+4));       // to
    __ movl2ptr(count, Address(rsp, 8+8));     // count
    BLOCK_COMMENT("Entry:");
    __ subptr(to, from); // to --> to_from
    if (VM_Version::supports_mmx()) {
      if (UseXMMForArrayCopy) {
        xmm_copy_forward(from, to_from, count);
      } else {
        mmx_copy_forward(from, to_from, count);
      }
    } else {
      __ jmpb(L_copy_8_bytes);
      __ align(OptoLoopAlignment);
    __ BIND(L_copy_8_bytes_loop);
      __ fild_d(Address(from, 0));
      __ fistp_d(Address(from, to_from, Address::times_1));
      __ addptr(from, 8);
    __ BIND(L_copy_8_bytes);
      __ decrement(count);
      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
    }
    inc_copy_counter_np(T_LONG);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }
  address generate_conjoint_long_copy(address nooverlap_target,
                                      address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_copy_8_bytes, L_copy_8_bytes_loop;
    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count
    const Register end_from   = rax;  // source array end address
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ movptr(from , Address(rsp, 8+0));       // from
    __ movptr(to   , Address(rsp, 8+4));       // to
    __ movl2ptr(count, Address(rsp, 8+8));     // count
    BLOCK_COMMENT("Entry:");
    __ cmpptr(to, from);
    RuntimeAddress nooverlap(nooverlap_target);
    __ jump_cc(Assembler::belowEqual, nooverlap);
    __ lea(end_from, Address(from, count, Address::times_8, 0));
    __ cmpptr(to, end_from);
    __ movptr(from, Address(rsp, 8));  // from
    __ jump_cc(Assembler::aboveEqual, nooverlap);
    __ jmpb(L_copy_8_bytes);
    __ align(OptoLoopAlignment);
  __ BIND(L_copy_8_bytes_loop);
    if (VM_Version::supports_mmx()) {
      if (UseXMMForArrayCopy) {
        __ movq(xmm0, Address(from, count, Address::times_8));
        __ movq(Address(to, count, Address::times_8), xmm0);
      } else {
        __ movq(mmx0, Address(from, count, Address::times_8));
        __ movq(Address(to, count, Address::times_8), mmx0);
      }
    } else {
      __ fild_d(Address(from, count, Address::times_8));
      __ fistp_d(Address(to, count, Address::times_8));
    }
  __ BIND(L_copy_8_bytes);
    __ decrement(count);
    __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
      __ emms();
    }
    inc_copy_counter_np(T_LONG);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }
  void generate_type_check(Register sub_klass,
                           Address& super_check_offset_addr,
                           Address& super_klass_addr,
                           Register temp,
                           Label* L_success, Label* L_failure) {
    BLOCK_COMMENT("type_check:");
    Label L_fallthrough;
#define LOCAL_JCC(assembler_con, label_ptr)                             \
    if (label_ptr != NULL)  __ jcc(assembler_con, *(label_ptr));        \
    else                    __ jcc(assembler_con, L_fallthrough) /*omit semi*/
    assert_different_registers(sub_klass, temp);
    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
    __ cmpptr(sub_klass, super_klass_addr);
    LOCAL_JCC(Assembler::equal, L_success);
    __ movl2ptr(temp, super_check_offset_addr);
    Address super_check_addr(sub_klass, temp, Address::times_1, 0);
    __ movptr(temp, super_check_addr); // load displayed supertype
    __ cmpptr(temp, super_klass_addr); // test the super type
    LOCAL_JCC(Assembler::equal, L_success);
    __ cmpl(super_check_offset_addr, sc_offset);
    LOCAL_JCC(Assembler::notEqual, L_failure);
    Register super_klass = temp;
    __ movptr(super_klass, super_klass_addr);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
                                     L_success, L_failure);
    __ bind(L_fallthrough);
    if (L_success == NULL) { BLOCK_COMMENT("L_success:"); }
    if (L_failure == NULL) { BLOCK_COMMENT("L_failure:"); }
#undef LOCAL_JCC
  }
  address generate_checkcast_copy(const char *name, address* entry, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_load_element, L_store_element, L_do_card_marks, L_done;
    const Register from       = rax;    // source array address
    const Register to         = rdx;    // destination array address
    const Register length     = rcx;    // elements count
    const Register elem       = rdi;    // each oop copied
    const Register elem_klass = rsi;    // each elem._klass (sub_klass)
    const Register temp       = rbx;    // lone remaining temp
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ push(rbx);
    Address   from_arg(rsp, 16+ 4);     // from
    Address     to_arg(rsp, 16+ 8);     // to
    Address length_arg(rsp, 16+12);     // elements count
    Address  ckoff_arg(rsp, 16+16);     // super_check_offset
    Address  ckval_arg(rsp, 16+20);     // super_klass
    __ movptr(from,     from_arg);
    __ movptr(to,         to_arg);
    __ movl2ptr(length, length_arg);
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    Address end_from_addr(from, length, Address::times_ptr, 0);
    Address   end_to_addr(to,   length, Address::times_ptr, 0);
    Register end_from = from;           // re-use
    Register end_to   = to;             // re-use
    Register count    = length;         // re-use
    Address from_element_addr(end_from, count, Address::times_ptr, 0);
    Address   to_element_addr(end_to,   count, Address::times_ptr, 0);
    Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());
    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
    __ lea(end_from, end_from_addr);
    __ lea(end_to,   end_to_addr);
    assert(length == count, "");        // else fix next line:
    __ negptr(count);                   // negate and test the length
    __ jccb(Assembler::notZero, L_load_element);
    __ xorptr(rax, rax);                  // return 0 on (trivial) success
    __ jmp(L_done);
    __ align(OptoLoopAlignment);
    __ BIND(L_store_element);
    __ movptr(to_element_addr, elem);     // store the oop
    __ increment(count);                // increment the count toward zero
    __ jccb(Assembler::zero, L_do_card_marks);
    __ BIND(L_load_element);
    __ movptr(elem, from_element_addr);   // load the oop
    __ testptr(elem, elem);
    __ jccb(Assembler::zero, L_store_element);
    __ movptr(elem_klass, elem_klass_addr); // query the object klass
    generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
                        &L_store_element, NULL);
    assert_different_registers(to, count, rax);
    Label L_post_barrier;
    __ addl(count, length_arg);         // transfers = (length - remaining)
    __ movl2ptr(rax, count);            // save the value
    __ notptr(rax);                     // report (-1^K) to caller (does not affect flags)
    __ jccb(Assembler::notZero, L_post_barrier);
    __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
    __ BIND(L_do_card_marks);
    __ xorptr(rax, rax);                // return 0 on success
    __ movl2ptr(count, length_arg);
    __ BIND(L_post_barrier);
    __ movptr(to, to_arg);              // reload
    gen_write_ref_array_post_barrier(to, count);
    __ BIND(L_done);
    __ pop(rbx);
    __ pop(rdi);
    __ pop(rsi);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    Label L_long_aligned, L_int_aligned, L_short_aligned;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    Address  from_arg(rsp, 12+ 4);      // from
    Address    to_arg(rsp, 12+ 8);      // to
    Address count_arg(rsp, 12+12);      // byte count
    __ movptr(from ,  from_arg);
    __ movptr(to   ,    to_arg);
    __ movl2ptr(count, count_arg);
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
    const Register bits = rsi;
    __ mov(bits, from);
    __ orptr(bits, to);
    __ orptr(bits, count);
    __ testl(bits, BytesPerLong-1);
    __ jccb(Assembler::zero, L_long_aligned);
    __ testl(bits, BytesPerInt-1);
    __ jccb(Assembler::zero, L_int_aligned);
    __ testl(bits, BytesPerShort-1);
    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
    __ BIND(L_short_aligned);
    __ shrptr(count, LogBytesPerShort); // size => short_count
    __ movl(count_arg, count);          // update 'count'
    __ jump(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ shrptr(count, LogBytesPerInt); // size => int_count
    __ movl(count_arg, count);          // update 'count'
    __ jump(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ shrptr(count, LogBytesPerLong); // size => qword_count
    __ movl(count_arg, count);          // update 'count'
    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
    __ pop(rsi);
    __ jump(RuntimeAddress(long_copy_entry));
    return start;
  }
  void arraycopy_range_checks(Register src,
                              Register src_pos,
                              Register dst,
                              Register dst_pos,
                              Address& length,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");
    const Register src_end = src_pos;   // source array end position
    const Register dst_end = dst_pos;   // destination array end position
    __ addl(src_end, length); // src_pos + length
    __ addl(dst_end, length); // dst_pos + length
    __ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed);
    __ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed);
    BLOCK_COMMENT("arraycopy_range_checks done");
  }
  address generate_generic_copy(const char *name,
                                address entry_jbyte_arraycopy,
                                address entry_jshort_arraycopy,
                                address entry_jint_arraycopy,
                                address entry_oop_arraycopy,
                                address entry_jlong_arraycopy,
                                address entry_checkcast_arraycopy) {
    Label L_failed, L_failed_0, L_objArray;
    { int modulus = CodeEntryAlignment;
      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
      int advance = target - (__ offset() % modulus);
      if (advance < 0)  advance += modulus;
      if (advance > 0)  __ nop(advance);
    }
    StubCodeMark mark(this, "StubRoutines", name);
    __ BIND(L_failed_0);
    __ jmp(L_failed);
    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
    __ align(CodeEntryAlignment);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
    Address SRC     (rsp, 12+ 4);
    Address SRC_POS (rsp, 12+ 8);
    Address DST     (rsp, 12+12);
    Address DST_POS (rsp, 12+16);
    Address LENGTH  (rsp, 12+20);
    const Register src     = rax;       // source array oop
    const Register src_pos = rsi;
    const Register dst     = rdx;       // destination array oop
    const Register dst_pos = rdi;
    const Register length  = rcx;       // transfer count
    __ movptr(src, SRC);      // src oop
    __ testptr(src, src);
    __ jccb(Assembler::zero, L_failed_0);
    __ movl2ptr(src_pos, SRC_POS);  // src_pos
    __ testl(src_pos, src_pos);
    __ jccb(Assembler::negative, L_failed_0);
    __ movptr(dst, DST);      // dst oop
    __ testptr(dst, dst);
    __ jccb(Assembler::zero, L_failed_0);
    __ movl2ptr(dst_pos, DST_POS);  // dst_pos
    __ testl(dst_pos, dst_pos);
    __ jccb(Assembler::negative, L_failed_0);
    __ movl2ptr(length, LENGTH);   // length
    __ testl(length, length);
    __ jccb(Assembler::negative, L_failed_0);
    Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
    Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
    const Register rcx_src_klass = rcx;    // array klass
    __ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));
#ifdef ASSERT
    BLOCK_COMMENT("assert klasses not null");
    { Label L1, L2;
      __ testptr(rcx_src_klass, rcx_src_klass);
      __ jccb(Assembler::notZero, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ cmpptr(dst_klass_addr, (int32_t)NULL_WORD);
      __ jccb(Assembler::equal, L1);      // this would be broken also
      BLOCK_COMMENT("assert done");
    }
#endif //ASSERT
    int lh_offset = in_bytes(Klass::layout_helper_offset());
    Address src_klass_lh_addr(rcx_src_klass, lh_offset);
    jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ cmpl(src_klass_lh_addr, objArray_lh);
    __ jcc(Assembler::equal, L_objArray);
    __ cmpptr(rcx_src_klass, dst_klass_addr);
    __ jccb(Assembler::notEqual, L_failed_0);
    const Register rcx_lh = rcx;  // layout helper
    assert(rcx_lh == rcx_src_klass, "known alias");
    __ movl(rcx_lh, src_klass_lh_addr);
    __ cmpl(rcx_lh, Klass::_lh_neutral_value);
    __ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp
#ifdef ASSERT
    { Label L;
      __ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
      __ jcc(Assembler::greaterEqual, L); // signed cmp
      __ stop("must be a primitive array");
      __ bind(L);
    }
#endif
    assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
    const Register rsi_offset = rsi; // array offset
    const Register src_array  = src; // src array offset
    const Register dst_array  = dst; // dst array offset
    const Register rdi_elsize = rdi; // log2 element size
    __ mov(rsi_offset, rcx_lh);
    __ shrptr(rsi_offset, Klass::_lh_header_size_shift);
    __ andptr(rsi_offset, Klass::_lh_header_size_mask);   // array_offset
    __ addptr(src_array, rsi_offset);  // src array offset
    __ addptr(dst_array, rsi_offset);  // dst array offset
    __ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize
    const Register from       = src; // source array address
    const Register to         = dst; // destination array address
    const Register count      = rcx; // elements count
#define FROM   Address(rsp, 12+ 4)
#define TO     Address(rsp, 12+ 8)   // Not used now
#define COUNT  Address(rsp, 12+12)   // Only for oop arraycopy
    BLOCK_COMMENT("scale indexes to element size");
    __ movl2ptr(rsi, SRC_POS);  // src_pos
    __ shlptr(rsi);             // src_pos << rcx (log2 elsize)
    assert(src_array == from, "");
    __ addptr(from, rsi);       // from = src_array + SRC_POS << log2 elsize
    __ movl2ptr(rdi, DST_POS);  // dst_pos
    __ shlptr(rdi);             // dst_pos << rcx (log2 elsize)
    assert(dst_array == to, "");
    __ addptr(to,  rdi);        // to   = dst_array + DST_POS << log2 elsize
    __ movptr(FROM, from);      // src_addr
    __ mov(rdi_elsize, rcx_lh); // log2 elsize
    __ movl2ptr(count, LENGTH); // elements count
    BLOCK_COMMENT("choose copy loop based on element size");
    __ cmpl(rdi_elsize, 0);
    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
    __ cmpl(rdi_elsize, LogBytesPerShort);
    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
    __ cmpl(rdi_elsize, LogBytesPerInt);
    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
#ifdef ASSERT
    __ cmpl(rdi_elsize, LogBytesPerLong);
    __ jccb(Assembler::notEqual, L_failed);
#endif
    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
    __ pop(rsi);
    __ jump(RuntimeAddress(entry_jlong_arraycopy));
  __ BIND(L_failed);
    __ xorptr(rax, rax);
    __ notptr(rax); // return -1
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
  __ BIND(L_objArray);
    Label L_plain_copy, L_checkcast_copy;
    __ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality
    __ jccb(Assembler::notEqual, L_checkcast_copy);
    assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass);
    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
  __ BIND(L_plain_copy);
    __ movl2ptr(count, LENGTH); // elements count
    __ movl2ptr(src_pos, SRC_POS);  // reload src_pos
    __ lea(from, Address(src, src_pos, Address::times_ptr,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
    __ movl2ptr(dst_pos, DST_POS);  // reload dst_pos
    __ lea(to,   Address(dst, dst_pos, Address::times_ptr,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
    __ movptr(FROM,  from);   // src_addr
    __ movptr(TO,    to);     // dst_addr
    __ movl(COUNT, count);  // count
    __ jump(RuntimeAddress(entry_oop_arraycopy));
  __ BIND(L_checkcast_copy);
    {
      int  ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      Register rsi_dst_klass = rsi;
      Register rdi_temp      = rdi;
      assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos");
      assert(rdi_temp      == dst_pos, "expected alias w/ dst_pos");
      Address dst_klass_lh_addr(rsi_dst_klass, lh_offset);
      __ movptr(rsi_dst_klass, dst_klass_addr);
      __ cmpl(dst_klass_lh_addr, objArray_lh);
      __ jccb(Assembler::notEqual, L_failed);
      __ movl2ptr(src_pos, SRC_POS);        // reload rsi
      arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
      __ push(rbx);
      Register rbx_src_klass = rbx;
      __ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx
      __ movptr(rsi_dst_klass, dst_klass_addr);
      Address super_check_offset_addr(rsi_dst_klass, sco_offset);
      Label L_fail_array_check;
      generate_type_check(rbx_src_klass,
                          super_check_offset_addr, dst_klass_addr,
                          rdi_temp, NULL, &L_fail_array_check);
      __ pop(rbx);
      __ jmp(L_plain_copy);
      __ BIND(L_fail_array_check);
      Address   from_arg(rsp, 16+ 4);   // from
      Address     to_arg(rsp, 16+ 8);   // to
      Address length_arg(rsp, 16+12);   // elements count
      Address  ckoff_arg(rsp, 16+16);   // super_check_offset
      Address  ckval_arg(rsp, 16+20);   // super_klass
      Address SRC_POS_arg(rsp, 16+ 8);
      Address DST_POS_arg(rsp, 16+16);
      Address  LENGTH_arg(rsp, 16+20);
      __ movptr(rbx, Address(rsi_dst_klass, ek_offset));
      __ movl2ptr(length, LENGTH_arg);    // reload elements count
      __ movl2ptr(src_pos, SRC_POS_arg);  // reload src_pos
      __ movl2ptr(dst_pos, DST_POS_arg);  // reload dst_pos
      __ movptr(ckval_arg, rbx);          // destination element type
      __ movl(rbx, Address(rbx, sco_offset));
      __ movl(ckoff_arg, rbx);          // corresponding class check offset
      __ movl(length_arg, length);      // outgoing length argument
      __ lea(from, Address(src, src_pos, Address::times_ptr,
                            arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ movptr(from_arg, from);
      __ lea(to, Address(dst, dst_pos, Address::times_ptr,
                          arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ movptr(to_arg, to);
      __ jump(RuntimeAddress(entry_checkcast_arraycopy));
    }
    return start;
  }
  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
        generate_disjoint_copy(T_BYTE,  true, Address::times_1, &entry,
                               "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy =
        generate_conjoint_copy(T_BYTE,  true, Address::times_1,  entry,
                               NULL, "arrayof_jbyte_arraycopy");
    StubRoutines::_jbyte_disjoint_arraycopy =
        generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
                               "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy =
        generate_conjoint_copy(T_BYTE, false, Address::times_1,  entry,
                               &entry_jbyte_arraycopy, "jbyte_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy =
        generate_disjoint_copy(T_SHORT,  true, Address::times_2, &entry,
                               "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy =
        generate_conjoint_copy(T_SHORT,  true, Address::times_2,  entry,
                               NULL, "arrayof_jshort_arraycopy");
    StubRoutines::_jshort_disjoint_arraycopy =
        generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
                               "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy =
        generate_conjoint_copy(T_SHORT, false, Address::times_2,  entry,
                               &entry_jshort_arraycopy, "jshort_arraycopy");
    StubRoutines::_jint_disjoint_arraycopy =
        generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
                               "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy =
        generate_conjoint_copy(T_INT, true, Address::times_4,  entry,
                               &entry_jint_arraycopy, "jint_arraycopy");
    StubRoutines::_oop_disjoint_arraycopy =
        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
                               "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy =
        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
                               &entry_oop_arraycopy, "oop_arraycopy");
    StubRoutines::_oop_disjoint_arraycopy_uninit =
        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
                               "oop_disjoint_arraycopy_uninit",
    StubRoutines::_oop_arraycopy_uninit =
        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
                               NULL, "oop_arraycopy_uninit",
    StubRoutines::_jlong_disjoint_arraycopy =
        generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
    StubRoutines::_jlong_arraycopy =
        generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
                                    "jlong_arraycopy");
    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
    StubRoutines::_arrayof_jint_disjoint_arraycopy       = StubRoutines::_jint_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_disjoint_arraycopy        = StubRoutines::_oop_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
    StubRoutines::_arrayof_jlong_disjoint_arraycopy      = StubRoutines::_jlong_disjoint_arraycopy;
    StubRoutines::_arrayof_jint_arraycopy       = StubRoutines::_jint_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy        = StubRoutines::_oop_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
    StubRoutines::_arrayof_jlong_arraycopy      = StubRoutines::_jlong_arraycopy;
    StubRoutines::_checkcast_arraycopy =
        generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit =
        generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, /*dest_uninitialized*/true);
    StubRoutines::_unsafe_arraycopy =
        generate_unsafe_copy("unsafe_arraycopy",
                               entry_jbyte_arraycopy,
                               entry_jshort_arraycopy,
                               entry_jint_arraycopy,
                               entry_jlong_arraycopy);
    StubRoutines::_generic_arraycopy =
        generate_generic_copy("generic_arraycopy",
                               entry_jbyte_arraycopy,
                               entry_jshort_arraycopy,
                               entry_jint_arraycopy,
                               entry_oop_arraycopy,
                               entry_jlong_arraycopy,
                               entry_checkcast_arraycopy);
  }
  void generate_math_stubs() {
    {
      StubCodeMark mark(this, "StubRoutines", "log");
      StubRoutines::_intrinsic_log = (double (*)(double)) __ pc();
      __ fld_d(Address(rsp, 4));
      __ flog();
      __ ret(0);
    }
    {
      StubCodeMark mark(this, "StubRoutines", "log10");
      StubRoutines::_intrinsic_log10 = (double (*)(double)) __ pc();
      __ fld_d(Address(rsp, 4));
      __ flog10();
      __ ret(0);
    }
    {
      StubCodeMark mark(this, "StubRoutines", "sin");
      StubRoutines::_intrinsic_sin = (double (*)(double))  __ pc();
      __ fld_d(Address(rsp, 4));
      __ trigfunc('s');
      __ ret(0);
    }
    {
      StubCodeMark mark(this, "StubRoutines", "cos");
      StubRoutines::_intrinsic_cos = (double (*)(double)) __ pc();
      __ fld_d(Address(rsp, 4));
      __ trigfunc('c');
      __ ret(0);
    }
    {
      StubCodeMark mark(this, "StubRoutines", "tan");
      StubRoutines::_intrinsic_tan = (double (*)(double)) __ pc();
      __ fld_d(Address(rsp, 4));
      __ trigfunc('t');
      __ ret(0);
    }
    {
      StubCodeMark mark(this, "StubRoutines", "exp");
      StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
      __ fld_d(Address(rsp, 4));
      __ exp_with_fallback(0);
      __ ret(0);
    }
    {
      StubCodeMark mark(this, "StubRoutines", "pow");
      StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
      __ fld_d(Address(rsp, 12));
      __ fld_d(Address(rsp, 4));
      __ pow_with_fallback(0);
      __ ret(0);
    }
  }
  enum {AESBlockSize = 16};
  address generate_key_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
    address start = __ pc();
    __ emit_data(0x00010203, relocInfo::none, 0 );
    __ emit_data(0x04050607, relocInfo::none, 0 );
    __ emit_data(0x08090a0b, relocInfo::none, 0 );
    __ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
    return start;
  }
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    __ movdqu(xmmdst, Address(key, offset));
    if (xmm_shuf_mask != NULL) {
      __ pshufb(xmmdst, xmm_shuf_mask);
    } else {
      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    }
  }
  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesenc(xmmdst, xmmtmp);
  }
  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesdec(xmmdst, xmmtmp);
  }
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();
    const Register from        = rdx;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register keylen      = rax;
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    const XMMRegister xmm_temp1  = xmm2;
    const XMMRegister xmm_temp2  = xmm3;
    const XMMRegister xmm_temp3  = xmm4;
    const XMMRegister xmm_temp4  = xmm5;
    __ enter();   // required for proper stackwalking of RuntimeStub frame
    __ movptr(from, from_param);
    __ movptr(key, key_param);
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
    __ movptr(to, to_param);
    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp1);
    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);
    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);
    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
    __ BIND(L_doLast);
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenclast(xmm_result, xmm_temp2);
    __ movdqu(Address(to, 0), xmm_result);        // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();
    const Register from        = rdx;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register keylen      = rax;
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    const XMMRegister xmm_temp1  = xmm2;
    const XMMRegister xmm_temp2  = xmm3;
    const XMMRegister xmm_temp3  = xmm4;
    const XMMRegister xmm_temp4  = xmm5;
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ movptr(from, from_param);
    __ movptr(key, key_param);
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));
    __ movptr(to, to_param);
    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
    __ pxor  (xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);
    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);
    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);
    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);
    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
    __ BIND(L_doLast);
    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdeclast(xmm_result, xmm_temp3);
    __ movdqu(Address(to, 0), xmm_result);  // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  void handleSOERegisters(bool saving) {
    const int saveFrameSizeInBytes = 4 * wordSize;
    const Address saved_rbx     (rbp, -3 * wordSize);
    const Address saved_rsi     (rbp, -2 * wordSize);
    const Address saved_rdi     (rbp, -1 * wordSize);
    if (saving) {
      __ subptr(rsp, saveFrameSizeInBytes);
      __ movptr(saved_rsi, rsi);
      __ movptr(saved_rdi, rdi);
      __ movptr(saved_rbx, rbx);
    } else {
      __ movptr(rsi, saved_rsi);
      __ movptr(rdi, saved_rdi);
      __ movptr(rbx, saved_rbx);
    }
  }
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();
    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from        = rsi;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register rvec        = rdi;      // r byte array initialized from initvector array address
    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
    const Register pos         = rax;
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 7;
    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    handleSOERegisters(true /*saving*/);
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);
    const Address  rvec_param (rbp, 8+12);
    const Address  len_param  (rbp, 8+16);
    __ movptr(from , from_param);
    __ movptr(to   , to_param);
    __ movptr(key  , key_param);
    __ movptr(rvec , rvec_param);
    __ movptr(len_reg , len_param);
    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xa0);
    __ aesenclast(xmm_result, xmm_temp);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_128);
    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
    handleSOERegisters(false /*restoring*/);
    __ movptr(rax, len_param); // return length
    __ leave();                                  // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    __ BIND(L_key_192_256);
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xc0);
    __ aesenclast(xmm_result, xmm_temp);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);
    __ BIND(L_key_256);
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);
    return start;
  }
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();
    Label L_exit, L_key_192_256, L_key_256;
    Label L_singleBlock_loopTop_128;
    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
    const Register from        = rsi;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register rvec        = rdi;      // r byte array initialized from initvector array address
    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
    const Register pos         = rax;
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 7;
    const int FIRST_NON_REG_KEY_offset = 0x70;
    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    handleSOERegisters(true /*saving*/);
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);
    const Address  rvec_param (rbp, 8+12);
    const Address  len_param  (rbp, 8+16);
    __ movptr(from , from_param);
    __ movptr(to   , to_param);
    __ movptr(key  , key_param);
    __ movptr(rvec , rvec_param);
    __ movptr(len_reg , len_param);
    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    const Register  prev_block_cipher_ptr = rvec;
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_128);
    __ cmpptr(len_reg, 0);           // any blocks left??
    __ jcc(Assembler::equal, L_exit);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) {   // 128-bit runs up to key offset a0
      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
    __ aesdeclast(xmm_result, xmm_temp);
    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jmp(L_singleBlock_loopTop_128);
    __ BIND(L_exit);
    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
    __ movptr(rvec , rvec_param);                                     // restore this since used in loop
    __ movdqu(Address(rvec, 0), xmm_temp);                            // final value of r stored in rvec of CipherBlockChaining object
    handleSOERegisters(false /*restoring*/);
    __ movptr(rax, len_param); // return length
    __ leave();                                                       // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    __ BIND(L_key_192_256);
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_192);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) {   // 192-bit runs up to key offset c0
      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
    __ aesdeclast(xmm_result, xmm_temp);
    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
    __ jmp(L_exit);
    __ BIND(L_key_256);
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_256);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) {   // 256-bit runs up to key offset e0
      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
    __ aesdeclast(xmm_result, xmm_temp);
    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
    __ jmp(L_exit);
    return start;
  }
  address generate_ghash_long_swap_mask() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
    address start = __ pc();
    __ emit_data(0x0b0a0908, relocInfo::none, 0);
    __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
    __ emit_data(0x03020100, relocInfo::none, 0);
    __ emit_data(0x07060504, relocInfo::none, 0);
  return start;
  }
  address generate_ghash_byte_swap_mask() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
    address start = __ pc();
    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
    __ emit_data(0x08090a0b, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x00010203, relocInfo::none, 0);
  return start;
  }
  address generate_ghash_processBlocks() {
    assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
    __ align(CodeEntryAlignment);
    Label L_ghash_loop, L_exit;
    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    address start = __ pc();
    const Register state        = rdi;
    const Register subkeyH      = rsi;
    const Register data         = rdx;
    const Register blocks       = rcx;
    const Address  state_param(rbp, 8+0);
    const Address  subkeyH_param(rbp, 8+4);
    const Address  data_param(rbp, 8+8);
    const Address  blocks_param(rbp, 8+12);
    const XMMRegister xmm_temp0 = xmm0;
    const XMMRegister xmm_temp1 = xmm1;
    const XMMRegister xmm_temp2 = xmm2;
    const XMMRegister xmm_temp3 = xmm3;
    const XMMRegister xmm_temp4 = xmm4;
    const XMMRegister xmm_temp5 = xmm5;
    const XMMRegister xmm_temp6 = xmm6;
    const XMMRegister xmm_temp7 = xmm7;
    __ enter();
    handleSOERegisters(true);  // Save registers
    __ movptr(state, state_param);
    __ movptr(subkeyH, subkeyH_param);
    __ movptr(data, data_param);
    __ movptr(blocks, blocks_param);
    __ movdqu(xmm_temp0, Address(state, 0));
    __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    __ movdqu(xmm_temp1, Address(subkeyH, 0));
    __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    __ BIND(L_ghash_loop);
    __ movdqu(xmm_temp2, Address(data, 0));
    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
    __ pxor(xmm_temp0, xmm_temp2);
    __ movdqu(xmm_temp3, xmm_temp0);
    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
    __ movdqu(xmm_temp4, xmm_temp0);
    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
    __ movdqu(xmm_temp5, xmm_temp0);
    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
    __ movdqu(xmm_temp6, xmm_temp0);
    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
    __ psrldq(xmm_temp4, 8);    // shift by xmm4 64 bits to the right
    __ pslldq(xmm_temp5, 8);    // shift by xmm5 64 bits to the left
    __ pxor(xmm_temp3, xmm_temp5);
    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp4, xmm_temp6);
    __ pslld (xmm_temp3, 1);
    __ pslld(xmm_temp6, 1);
    __ psrld(xmm_temp7, 31);
    __ psrld(xmm_temp4, 31);
    __ movdqu(xmm_temp5, xmm_temp7);
    __ pslldq(xmm_temp4, 4);
    __ pslldq(xmm_temp7, 4);
    __ psrldq(xmm_temp5, 12);
    __ por(xmm_temp3, xmm_temp7);
    __ por(xmm_temp6, xmm_temp4);
    __ por(xmm_temp6, xmm_temp5);
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp4, xmm_temp3);
    __ movdqu(xmm_temp5, xmm_temp3);
    __ pslld(xmm_temp7, 31);    // packed right shift shifting << 31
    __ pslld(xmm_temp4, 30);    // packed right shift shifting << 30
    __ pslld(xmm_temp5, 25);    // packed right shift shifting << 25
    __ pxor(xmm_temp7, xmm_temp4);      // xor the shifted versions
    __ pxor(xmm_temp7, xmm_temp5);
    __ movdqu(xmm_temp4, xmm_temp7);
    __ pslldq(xmm_temp7, 12);
    __ psrldq(xmm_temp4, 4);
    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
    __ movdqu(xmm_temp2, xmm_temp3);
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp5, xmm_temp3);
    __ psrld(xmm_temp2, 1);     // packed left shifting >> 1
    __ psrld(xmm_temp7, 2);     // packed left shifting >> 2
    __ psrld(xmm_temp5, 7);     // packed left shifting >> 7
    __ pxor(xmm_temp2, xmm_temp7);      // xor the shifted versions
    __ pxor(xmm_temp2, xmm_temp5);
    __ pxor(xmm_temp2, xmm_temp4);
    __ pxor(xmm_temp3, xmm_temp2);
    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
    __ decrement(blocks);
    __ jcc(Assembler::zero, L_exit);
    __ movdqu(xmm_temp0, xmm_temp6);
    __ addptr(data, 16);
    __ jmp(L_ghash_loop);
    __ BIND(L_exit);
    __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    __ movdqu(Address(state, 0), xmm_temp6);   // store the result
    handleSOERegisters(false);  // restore registers
    __ leave();
    __ ret(0);
    return start;
  }
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
    address start = __ pc();
    const Register crc   = rdx;  // crc
    const Register buf   = rsi;  // source java byte array address
    const Register len   = rcx;  // length
    const Register table = rdi;  // crc_table address (reuse register)
    const Register tmp   = rbx;
    assert_different_registers(crc, buf, len, table, tmp, rax);
    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ push(rbx);
    Address crc_arg(rbp, 8 + 0);
    Address buf_arg(rbp, 8 + 4);
    Address len_arg(rbp, 8 + 8);
    __ movl(crc,   crc_arg);
    __ movptr(buf, buf_arg);
    __ movl(len,   len_arg);
    __ kernel_crc32(crc, buf, len, table, tmp);
    __ movl(rax, crc);
    __ pop(rbx);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    StubCodeMark mark(this, "StubRoutines", name);
    __ movl(rax, Address(rsp, 0x8));
    __ movl(rcx, Address(rsp, 0x4));
    switch (size) {
      case 4:
        __ movl(rax, Address(rcx, 0));
        break;
      case 8:
        Unimplemented();
        break;
      default:
        ShouldNotReachHere();
    }
    __ ret(0);
  }
 public:
  enum layout {
    thread_off,    // last_java_sp
    arg1_off,
    arg2_off,
    rbp_off,       // callee saved register
    ret_pc,
    framesize
  };
 private:
#undef  __
#define __ masm->
  address generate_throw_exception(const char* name, address runtime_entry,
                                   Register arg1 = noreg, Register arg2 = noreg) {
    int insts_size = 256;
    int locs_size  = 32;
    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);
    address start = __ pc();
    Register java_thread = rbx;
    __ get_thread(java_thread);
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ subptr(rsp, (framesize-2) * wordSize); // prolog
    int frame_complete = __ pc() - start;
    __ movptr(Address(rsp, thread_off * wordSize), java_thread);
    if (arg1 != noreg) {
      __ movptr(Address(rsp, arg1_off * wordSize), arg1);
    }
    if (arg2 != noreg) {
      assert(arg1 != noreg, "missing reg arg");
      __ movptr(Address(rsp, arg2_off * wordSize), arg2);
    }
    __ set_last_Java_frame(java_thread, rsp, rbp, NULL);
    BLOCK_COMMENT("call runtime_entry");
    __ call(RuntimeAddress(runtime_entry));
    OopMap* map =  new OopMap(framesize, 0);
    oop_maps->add_gc_map(__ pc() - start, map);
    __ get_thread(java_thread);
    __ reset_last_Java_frame(java_thread, true);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
#ifdef ASSERT
    Label L;
    __ cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
    __ jcc(Assembler::notEqual, L);
    __ should_not_reach_here();
    __ bind(L);
#endif /* ASSERT */
    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
    return stub->entry_point();
  }
  void create_control_words() {
    StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
    StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
    StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
    StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
    StubRoutines::_mxcsr_std           = 0x1F80;
    StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
    StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
    StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
    StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
    StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
    StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
  }
  void generate_initial() {
    StubRoutines::_forward_exception_entry      = generate_forward_exception();
    StubRoutines::_call_stub_entry              =
      generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry        = generate_catch_exception();
    StubRoutines::_atomic_xchg_entry            = generate_atomic_xchg();
    StubRoutines::_handler_for_unsafe_access_entry =
      generate_handler_for_unsafe_access();
    create_control_words();
    StubRoutines::x86::_verify_mxcsr_entry                 = generate_verify_mxcsr();
    StubRoutines::x86::_verify_fpu_cntrl_wrd_entry         = generate_verify_fpu_cntrl_wrd();
    StubRoutines::_d2i_wrapper                              = generate_d2i_wrapper(T_INT,
                                                                                   CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
    StubRoutines::_d2l_wrapper                              = generate_d2i_wrapper(T_LONG,
                                                                                   CAST_FROM_FN_PTR(address, SharedRuntime::d2l));
    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
    if (UseCRC32Intrinsics) {
      StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }
  void generate_all() {
    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
    generate_arraycopy_stubs();
    generate_math_stubs();
    if (UseAESIntrinsics) {
      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // might be needed by the others
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }
    if (UseGHASHIntrinsics) {
      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                                                   &StubRoutines::_safefetch32_fault_pc,
                                                   &StubRoutines::_safefetch32_continuation_pc);
    StubRoutines::_safefetchN_entry           = StubRoutines::_safefetch32_entry;
    StubRoutines::_safefetchN_fault_pc        = StubRoutines::_safefetch32_fault_pc;
    StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
  }
 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration
void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
C:\hotspot-69087d08d473\src\cpu\x86\vm/stubGenerator_x86_64.cpp
#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_x86.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#define __ _masm->
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
#define a__ ((Assembler*)_masm)->
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc = thread->saved_exception_pc();
  address npc = Assembler::locate_next_instruction(pc);
  thread->set_pending_unsafe_access_error();
  return npc;
}
class StubGenerator: public StubCodeGenerator {
 private:
#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif
  enum call_stub_layout {
#ifdef _WIN64
    xmm_save_first     = 6,  // save from xmm6
    xmm_save_last      = 15, // to xmm15
    xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
    r15_off            = -7,
    r14_off            = -6,
    r13_off            = -5,
    r12_off            = -4,
    rdi_off            = -3,
    rsi_off            = -2,
    rbx_off            = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    call_wrapper_off   =  2,
    result_off         =  3,
    result_type_off    =  4,
    method_off         =  5,
    entry_point_off    =  6,
    parameters_off     =  7,
    parameter_size_off =  8,
    thread_off         =  9
#else
    rsp_after_call_off = -12,
    mxcsr_off          = rsp_after_call_off,
    r15_off            = -11,
    r14_off            = -10,
    r13_off            = -9,
    r12_off            = -8,
    rbx_off            = -7,
    call_wrapper_off   = -6,
    result_off         = -5,
    result_type_off    = -4,
    method_off         = -3,
    entry_point_off    = -2,
    parameters_off     = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    parameter_size_off =  2,
    thread_off         =  3
#endif
  };
#ifdef _WIN64
  Address xmm_save(int reg) {
    assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
    return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
  }
#endif
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
    const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
    const Address result        (rbp, result_off         * wordSize);
    const Address result_type   (rbp, result_type_off    * wordSize);
    const Address method        (rbp, method_off         * wordSize);
    const Address entry_point   (rbp, entry_point_off    * wordSize);
    const Address parameters    (rbp, parameters_off     * wordSize);
    const Address parameter_size(rbp, parameter_size_off * wordSize);
    const Address thread        (rbp, thread_off         * wordSize);
    const Address r15_save(rbp, r15_off * wordSize);
    const Address r14_save(rbp, r14_off * wordSize);
    const Address r13_save(rbp, r13_off * wordSize);
    const Address r12_save(rbp, r12_off * wordSize);
    const Address rbx_save(rbp, rbx_off * wordSize);
    __ enter();
    __ subptr(rsp, -rsp_after_call_off * wordSize);
#ifndef _WIN64
    __ movptr(parameters,   c_rarg5); // parameters
    __ movptr(entry_point,  c_rarg4); // entry_point
#endif
    __ movptr(method,       c_rarg3); // method
    __ movl(result_type,  c_rarg2);   // result type
    __ movptr(result,       c_rarg1); // result
    __ movptr(call_wrapper, c_rarg0); // call wrapper
    __ movptr(rbx_save, rbx);
    __ movptr(r12_save, r12);
    __ movptr(r13_save, r13);
    __ movptr(r14_save, r14);
    __ movptr(r15_save, r15);
#ifdef _WIN64
    for (int i = 6; i <= 15; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
    const Address rdi_save(rbp, rdi_off * wordSize);
    const Address rsi_save(rbp, rsi_off * wordSize);
    __ movptr(rsi_save, rsi);
    __ movptr(rdi_save, rdi);
#else
    const Address mxcsr_save(rbp, mxcsr_off * wordSize);
    {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }
#endif
    __ movptr(r15_thread, thread);
    __ reinit_heapbase();
#ifdef ASSERT
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(c_rarg3, parameter_size);
    __ testl(c_rarg3, c_rarg3);
    __ jcc(Assembler::zero, parameters_done);
    Label loop;
    __ movptr(c_rarg2, parameters);       // parameter pointer
    __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
    __ BIND(loop);
    __ movptr(rax, Address(c_rarg2, 0));// get parameter
    __ addptr(c_rarg2, wordSize);       // advance to next parameter
    __ decrementl(c_rarg1);             // decrement counter
    __ push(rax);                       // pass parameter
    __ jcc(Assembler::notZero, loop);
    __ BIND(parameters_done);
    __ movptr(rbx, method);             // get Method*
    __ movptr(c_rarg1, entry_point);    // get entry_point
    __ mov(r13, rsp);                   // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(c_rarg1);
    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();
    __ movptr(c_rarg0, result);
    Label is_long, is_float, is_double, exit;
    __ movl(c_rarg1, result_type);
    __ cmpl(c_rarg1, T_OBJECT);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(c_rarg1, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);
    __ movl(Address(c_rarg0, 0), rax);
    __ BIND(exit);
    __ lea(rsp, rsp_after_call);
#ifdef ASSERT
    {
      Label L, S;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::notEqual, S);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L);
      __ bind(S);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ bind(L);
    }
#endif
#ifdef _WIN64
    for (int i = 15; i >= 6; i--) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movptr(r15, r15_save);
    __ movptr(r14, r14_save);
    __ movptr(r13, r13_save);
    __ movptr(r12, r12_save);
    __ movptr(rbx, rbx_save);
#ifdef _WIN64
    __ movptr(rdi, rdi_save);
    __ movptr(rsi, rsi_save);
#else
    __ ldmxcsr(mxcsr_save);
#endif
    __ addptr(rsp, -rsp_after_call_off * wordSize);
    __ pop(rbp);
    __ ret(0);
    __ BIND(is_long);
    __ movq(Address(c_rarg0, 0), rax);
    __ jmp(exit);
    __ BIND(is_float);
    __ movflt(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);
    __ BIND(is_double);
    __ movdbl(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);
    return start;
  }
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
    const Address thread        (rbp, thread_off         * wordSize);
#ifdef ASSERT
    {
      Label L, S;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::notEqual, S);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif
    __ verify_oop(rax);
    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
    __ lea(rscratch1, ExternalAddress((address)__FILE__));
    __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
    __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
    return start;
  }
  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();
#ifdef ASSERT
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif
    __ movptr(c_rarg0, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    r15_thread, c_rarg0);
    __ mov(rbx, rax);
    __ pop(rdx);
    __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
#ifdef ASSERT
    {
      Label L;
      __ testptr(rax, rax);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    __ verify_oop(rax);
    __ jmp(rbx);
    return start;
  }
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();
    __ movl(rax, c_rarg0); // Copy to eax we need a return value anyhow
    __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);
    return start;
  }
  address generate_atomic_xchg_ptr() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg_ptr");
    address start = __ pc();
    __ movptr(rax, c_rarg0); // Copy to eax we need a return value anyhow
    __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);
    return start;
  }
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();
    __ movl(rax, c_rarg2);
   if ( os::is_MP() ) __ lock();
    __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);
    return start;
  }
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();
    __ movq(rax, c_rarg2);
   if ( os::is_MP() ) __ lock();
    __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);
    return start;
  }
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();
    __ movl(rax, c_rarg0);
   if ( os::is_MP() ) __ lock();
    __ xaddl(Address(c_rarg1, 0), c_rarg0);
    __ addl(rax, c_rarg0);
    __ ret(0);
    return start;
  }
  address generate_atomic_add_ptr() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add_ptr");
    address start = __ pc();
    __ movptr(rax, c_rarg0); // Copy to eax we need a return value anyhow
   if ( os::is_MP() ) __ lock();
    __ xaddptr(Address(c_rarg1, 0), c_rarg0);
    __ addptr(rax, c_rarg0);
    __ ret(0);
    return start;
  }
  address generate_orderaccess_fence() {
    StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
    address start = __ pc();
    __ membar(Assembler::StoreLoad);
    __ ret(0);
    return start;
  }
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp(rbp, 0);
    const Address older_fp(rax, 0);
    address start = __ pc();
    __ enter();
    __ movptr(rax, old_fp); // callers fp
    __ movptr(rax, older_fp); // the frame for ps()
    __ pop(rbp);
    __ ret(0);
    return start;
  }
  address generate_get_previous_sp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
    address start = __ pc();
    __ movptr(rax, rsp);
    __ addptr(rax, 8); // return address is at the top of the stack.
    __ ret(0);
    return start;
  }
  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();
    const Address mxcsr_save(rsp, 0);
    if (CheckJNICalls) {
      Label ok_ret;
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ push(rax);
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, ok_ret);
      __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
      __ ldmxcsr(mxcsr_std);
      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }
    __ ret(0);
    return start;
  }
  address generate_f2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves
    address start = __ pc();
    Label L;
    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
    __ movl(c_rarg3, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmovl(Assembler::positive, c_rarg3, rax);
    __ bind(L);
    __ movptr(inout, c_rarg3);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);
    __ ret(0);
    return start;
  }
  address generate_f2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves
    address start = __ pc();
    Label L;
    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg3, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmov(Assembler::positive, c_rarg3, rax);
    __ bind(L);
    __ movptr(inout, c_rarg3);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);
    __ ret(0);
    return start;
  }
  address generate_d2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves
    address start = __ pc();
    Label L;
    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);
    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
    __ movl(c_rarg2, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmov(Assembler::positive, c_rarg2, rax);
    __ bind(L);
    __ movptr(inout, c_rarg2);
    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);
    __ ret(0);
    return start;
  }
  address generate_d2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves
    address start = __ pc();
    Label L;
    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);
    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg2, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmovq(Assembler::positive, c_rarg2, rax);
    __ bind(L);
    __ movq(inout, c_rarg2);
    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);
    __ ret(0);
    return start;
  }
  address generate_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    __ emit_data64( mask, relocInfo::none );
    __ emit_data64( mask, relocInfo::none );
    return start;
  }
  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();
    __ push(0);                       // hole for return address-to-be
    __ pusha();                       // push registers
    Address next_pc(rsp, RegisterImpl::number_of_registers * BytesPerWord);
    __ subptr(rsp, frame::arg_reg_save_area_bytes);
    BLOCK_COMMENT("call handle_unsafe_access");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, handle_unsafe_access)));
    __ addptr(rsp, frame::arg_reg_save_area_bytes);
    __ movptr(next_pc, rax);          // stuff next address
    __ popa();
    __ ret(0);                        // jump to next address
    return start;
  }
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();
    Label exit, error;
    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ push(r12);
    __ push(c_rarg2);
    __ push(c_rarg3);
    enum {
           oop_to_verify = 6 * wordSize,
           saved_rax     = 7 * wordSize,
           saved_r10     = 8 * wordSize,
           return_addr   = 16 * wordSize,
           error_msg     = 17 * wordSize
    };
    __ movptr(rax, Address(rsp, oop_to_verify));
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
    __ movptr(c_rarg2, rax);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andptr(c_rarg2, c_rarg3);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
    __ cmpptr(c_rarg2, c_rarg3);
    __ jcc(Assembler::notZero, error);
    __ reinit_heapbase();
    __ load_klass(rax, rax);  // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error); // if klass is NULL it is broken
    __ bind(exit);
    __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                             // restore c_rarg3
    __ pop(c_rarg2);                             // restore c_rarg2
    __ pop(r12);                                 // restore r12
    __ popf();                                   // restore flags
    __ ret(4 * wordSize);                        // pop caller saved stuff
    __ bind(error);
    __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                             // get saved c_rarg3 back
    __ pop(c_rarg2);                             // get saved c_rarg2 back
    __ pop(r12);                                 // get saved r12 back
    __ popf();                                   // get saved flags off stack --
    __ pusha();                                  // push registers
    __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
    __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
    __ movq(c_rarg2, rsp);                          // pass address of regs on stack
    __ mov(r12, rsp);                               // remember rsp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16);                            // align stack as required by ABI
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
    __ mov(rsp, r12);                               // restore rsp
    __ popa();                                      // pop registers (includes r12)
    __ ret(4 * wordSize);                           // pop caller saved stuff
    return start;
  }
  void assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
    Label L;
    assert_different_registers(Rtmp, Rint);
    __ movslq(Rtmp, Rint);
    __ cmpq(Rtmp, Rint);
    __ jcc(Assembler::equal, L);
    __ stop("high 32-bits of int value are not 0");
    __ bind(L);
#endif
  }
  void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, sf);
  }
  void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
    array_overlap_test(NULL, &L_no_overlap, sf);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
    const Register from     = c_rarg0;
    const Register to       = c_rarg1;
    const Register count    = c_rarg2;
    const Register end_from = rax;
    __ cmpptr(to, from);
    __ lea(end_from, Address(from, count, sf, 0));
    if (NOLp == NULL) {
      ExternalAddress no_overlap(no_overlap_target);
      __ jump_cc(Assembler::belowEqual, no_overlap);
      __ cmpptr(to, end_from);
      __ jump_cc(Assembler::aboveEqual, no_overlap);
    } else {
      __ jcc(Assembler::belowEqual, (*NOLp));
      __ cmpptr(to, end_from);
      __ jcc(Assembler::aboveEqual, (*NOLp));
    }
  }
  void setup_arg_regs(int nargs = 3) {
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
    assert(nargs == 3 || nargs == 4, "else fix");
#ifdef _WIN64
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    if (nargs >= 4)
      __ mov(rax, r9);  // r9 is also saved_rdi
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
    if (nargs >= 4)
      __ mov(rcx, rax); // c_rarg3 (via rax)
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
  }
  void restore_arg_regs() {
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
#ifdef _WIN64
    __ movptr(rdi, saved_rdi);
    __ movptr(rsi, saved_rsi);
#endif
  }
  void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        if (!dest_uninitialized) {
           __ pusha();                      // push registers
           if (count == c_rarg0) {
             if (addr == c_rarg1) {
               __ xchgptr(c_rarg1, c_rarg0);
             } else {
               __ movptr(c_rarg1, count);
               __ movptr(c_rarg0, addr);
             }
           } else {
             __ movptr(c_rarg0, addr);
             __ movptr(c_rarg1, count);
           }
           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
           __ popa();
        }
         break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }
  void  gen_write_ref_array_post_barrier(Register start, Register count, Register scratch) {
    assert_different_registers(start, count, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        {
          __ pusha();             // push registers (overkill)
          if (c_rarg0 == count) { // On win64 c_rarg0 == rcx
            assert_different_registers(c_rarg1, start);
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, start);
          } else {
            assert_different_registers(c_rarg0, count);
            __ mov(c_rarg0, start);
            __ mov(c_rarg1, count);
          }
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ popa();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
          Label L_loop;
          const Register end = count;
          __ leaq(end, Address(start, count, TIMES_OOP, 0));  // end == start+count*oop_size
          __ subptr(end, BytesPerHeapOop); // end - 1 to make inclusive
          __ shrptr(start, CardTableModRefBS::card_shift);
          __ shrptr(end,   CardTableModRefBS::card_shift);
          __ subptr(end, start); // end --> cards count
          int64_t disp = (int64_t) ct->byte_map_base;
          __ mov64(scratch, disp);
          __ addptr(start, scratch);
        __ BIND(L_loop);
          __ movb(Address(start, count, Address::times_1), 0);
          __ decrement(count);
          __ jcc(Assembler::greaterEqual, L_loop);
        }
        break;
      default:
        ShouldNotReachHere();
    }
  }
  void copy_bytes_forward(Register end_from, Register end_to,
                             Register qword_count, Register to,
                             Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      __ BIND(L_loop);
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
        __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
        __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
      }
      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 8);
      __ jcc(Assembler::lessEqual, L_loop);
      __ subptr(qword_count, 4);  // sub(8) and add(4)
      __ jccb(Assembler::greater, L_end);
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
      }
      __ addptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      __ BIND(L_loop);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
      __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
      __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
      __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
      __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 4);
      __ jcc(Assembler::lessEqual, L_loop);
    }
    __ subptr(qword_count, 4);
    __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
  }
  void copy_bytes_backward(Register from, Register dest,
                              Register qword_count, Register to,
                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      __ BIND(L_loop);
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
      } else {
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
        __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
        __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
        __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
        __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
        __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
      }
      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 8);
      __ jcc(Assembler::greaterEqual, L_loop);
      __ addptr(qword_count, 4);  // add(8) and sub(4)
      __ jccb(Assembler::less, L_end);
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
      } else {
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
        __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
      }
      __ subptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      __ BIND(L_loop);
      __ movq(to, Address(from, qword_count, Address::times_8, 24));
      __ movq(Address(dest, qword_count, Address::times_8, 24), to);
      __ movq(to, Address(from, qword_count, Address::times_8, 16));
      __ movq(Address(dest, qword_count, Address::times_8, 16), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  8));
      __ movq(Address(dest, qword_count, Address::times_8,  8), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  0));
      __ movq(Address(dest, qword_count, Address::times_8,  0), to);
      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 4);
      __ jcc(Assembler::greaterEqual, L_loop);
    }
    __ addptr(qword_count, 4);
    __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
  }
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
    Label L_copy_byte, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
    __ movptr(byte_count, count);
    __ shrptr(count, 3); // count => qword_count
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count); // make the count negative
    __ jmp(L_copy_bytes);
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);
    __ addptr(end_from, 4);
    __ addptr(end_to, 4);
  __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jccb(Assembler::zero, L_copy_byte);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);
    __ addptr(end_from, 2);
    __ addptr(end_to, 2);
  __ BIND(L_copy_byte);
    __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movb(rax, Address(end_from, 8));
    __ movb(Address(end_to, 8), rax);
  __ BIND(L_exit);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    __ jmp(L_copy_4_bytes);
    return start;
  }
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    array_overlap_test(nooverlap_target, Address::times_1);
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
    __ movptr(byte_count, count);
    __ shrptr(count, 3);   // count => qword_count
    __ testl(byte_count, 1);
    __ jcc(Assembler::zero, L_copy_2_bytes);
    __ movb(rax, Address(from, byte_count, Address::times_1, -1));
    __ movb(Address(to, byte_count, Address::times_1, -1), rax);
    __ decrement(byte_count); // Adjust for possible trailing word
  __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jcc(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, byte_count, Address::times_1, -2));
    __ movw(Address(to, byte_count, Address::times_1, -2), rax);
  __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register word_count  = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    __ jmp(L_copy_bytes);
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);
    __ addptr(end_from, 4);
    __ addptr(end_to, 4);
  __ BIND(L_copy_2_bytes);
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);
  __ BIND(L_exit);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    __ jmp(L_copy_4_bytes);
    return start;
  }
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    BLOCK_COMMENT("Entry:");
    const Register to       = c_rarg0;  // source array address
    const Register value    = c_rarg1;  // value
    const Register count    = c_rarg2;  // elements count
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ generate_fill(t, aligned, to, value, count, rax, xmm0);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register word_count  = rcx;
    const Register qword_count = count;
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    array_overlap_test(nooverlap_target, Address::times_2);
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, word_count, Address::times_2, -2));
    __ movw(Address(to, word_count, Address::times_2, -2), rax);
  __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
                                         const char *name, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register dword_count = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    const Register saved_to    = r11;  // saved destination array address
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
    if (is_oop) {
      __ movq(saved_to, to);
      gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
    }
    __ movptr(dword_count, count);
    __ shrptr(count, 1); // count => qword_count
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    __ jmp(L_copy_bytes);
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  __ BIND(L_copy_4_bytes);
    __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
    __ jccb(Assembler::zero, L_exit);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);
  __ BIND(L_exit);
    if (is_oop) {
      gen_write_ref_array_post_barrier(saved_to, dword_count, rax);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    __ jmp(L_copy_4_bytes);
    return start;
  }
  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
                                         address *entry, const char *name,
                                         bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register dword_count = rcx;
    const Register qword_count = count;
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    array_overlap_test(nooverlap_target, Address::times_4);
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
    if (is_oop) {
      gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
    }
    assert_clean_int(count, rax); // Make sure 'count' is clean int.
    __ movptr(dword_count, count);
    __ shrptr(count, 1); // count => qword_count
    __ testl(dword_count, 1);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, dword_count, Address::times_4, -4));
    __ movl(Address(to, dword_count, Address::times_4, -4), rax);
    __ jmp(L_copy_bytes);
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
    if (is_oop) {
      __ jmp(L_exit);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
  __ BIND(L_exit);
    if (is_oop) {
      gen_write_ref_array_post_barrier(to, dword_count, rax);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
                                          const char *name, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_copy_bytes, L_copy_8_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register qword_count = rdx;  // elements count
    const Register end_from    = from; // source array end address
    const Register end_to      = rcx;  // destination array end address
    const Register saved_to    = to;
    const Register saved_count = r11;
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
    if (is_oop) {
      __ movptr(saved_count, qword_count);
      gen_write_ref_array_pre_barrier(to, qword_count, dest_uninitialized);
    }
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    __ jmp(L_copy_bytes);
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
    if (is_oop) {
      __ jmp(L_exit);
    } else {
      restore_arg_regs();
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
      __ xorptr(rax, rax); // return 0
      __ leave(); // required for proper stackwalking of RuntimeStub frame
      __ ret(0);
    }
    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    if (is_oop) {
    __ BIND(L_exit);
      gen_write_ref_array_post_barrier(saved_to, saved_count, rax);
    }
    restore_arg_regs();
    if (is_oop) {
      inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
    } else {
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
    }
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
                                          address nooverlap_target, address *entry,
                                          const char *name, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    Label L_copy_bytes, L_copy_8_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register qword_count = rdx;  // elements count
    const Register saved_count = rcx;
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    array_overlap_test(nooverlap_target, Address::times_8);
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
    if (is_oop) {
      __ movptr(saved_count, qword_count);
      gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
    }
    __ jmp(L_copy_bytes);
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
    if (is_oop) {
      __ jmp(L_exit);
    } else {
      restore_arg_regs();
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
      __ xorptr(rax, rax); // return 0
      __ leave(); // required for proper stackwalking of RuntimeStub frame
      __ ret(0);
    }
    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    if (is_oop) {
    __ BIND(L_exit);
      gen_write_ref_array_post_barrier(to, saved_count, rax);
    }
    restore_arg_regs();
    if (is_oop) {
      inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
    } else {
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
    }
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);
    BLOCK_COMMENT("type_check:");
    Label L_miss;
    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
    __ BIND(L_miss);
  }
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {
    Label L_load_element, L_store_element, L_do_card_marks, L_done;
    const Register from        = rdi;   // source array address
    const Register to          = rsi;   // destination array address
    const Register length      = rdx;   // elements count
    const Register ckoff       = rcx;   // super_check_offset
    const Register ckval       = r8;    // super_klass
    const Register end_from    = from;  // source array end address
    const Register end_to      = r13;   // destination array end address
    const Register count       = rdx;   // -(count_remaining)
    const Register r14_length  = r14;   // saved copy of length
    const Register rax_oop    = rax;    // actual oop copied
    const Register r11_klass  = r11;    // oop._klass
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef ASSERT
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT
    setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
#ifdef _WIN64
    __ movptr(ckval, Address(rsp, 6 * wordSize));
#endif
    if (entry != NULL) {
      BLOCK_COMMENT("Entry:");
    }
    enum {
      saved_r13_offset,
      saved_r14_offset,
      saved_rbp_offset
    };
    __ subptr(rsp, saved_rbp_offset * wordSize);
    __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
    __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
    assert_clean_int(length, rax);
    assert_clean_int(ckoff, rax);
#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ cmpl(ckoff, Address(ckval, sco_offset));
      __ jcc(Assembler::equal, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT
    Address end_from_addr(from, length, TIMES_OOP, 0);
    Address   end_to_addr(to,   length, TIMES_OOP, 0);
    Address from_element_addr(end_from, count, TIMES_OOP, 0);
    Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
    __ lea(end_from, end_from_addr);
    __ lea(end_to,   end_to_addr);
    __ movptr(r14_length, length);        // save a copy of the length
    assert(length == count, "");          // else fix next line:
    __ negptr(count);                     // negate and test the length
    __ jcc(Assembler::notZero, L_load_element);
    __ xorptr(rax, rax);                  // return 0 on (trivial) success
    __ jmp(L_done);
    __ align(OptoLoopAlignment);
    __ BIND(L_store_element);
    __ store_heap_oop(to_element_addr, rax_oop);  // store the oop
    __ increment(count);               // increment the count toward zero
    __ jcc(Assembler::zero, L_do_card_marks);
    __ BIND(L_load_element);
    __ load_heap_oop(rax_oop, from_element_addr); // load the oop
    __ testptr(rax_oop, rax_oop);
    __ jcc(Assembler::zero, L_store_element);
    __ load_klass(r11_klass, rax_oop);// query the object klass
    generate_type_check(r11_klass, ckoff, ckval, L_store_element);
    assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
    Label L_post_barrier;
    __ addptr(r14_length, count);     // K = (original - remaining) oops
    __ movptr(rax, r14_length);       // save the value
    __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
    __ jccb(Assembler::notZero, L_post_barrier);
    __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
    __ BIND(L_do_card_marks);
    __ xorptr(rax, rax);              // return 0 on success
    __ BIND(L_post_barrier);
    gen_write_ref_array_post_barrier(to, r14_length, rscratch1);
    __ BIND(L_done);
    __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
    __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值