Jit in luajit
Luajit是一款高性能的lua解释器,与官方的lua解释器相比,luajit的高速除了将解释器直接以汇编代码实现外,还支持jit模式(Just in time)。Jit模式即将luajit的字节码编译成处理器能够直接执行的机器码,从而比解释执行速度更快。
Luajit存在97个字节码指令,例如 FORL指令对应一个数字类型的for循环语句,同时还有IFORL指令(强制解释模式执行)和JFORL指令(Jit模式执行),同时解释器实现了对各个字节码指令的翻译,这里以X86的翻译器为例。
Luajit优化一段指令序列,当一个指令的地址被识别为hot后,并开始跟踪记录指令线性序列、在退出跟踪时将指令序列编译成机器码。但是luajit只对FUNCF、FORL、ITERL、LOOP这四个指令进行了跟踪,即循环和一个函数的开始,例如,在解释执行FORL指令:
case BC_FORL:
|.if JIT
| hotloop RB
|.endif
| // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
break;
它首先判断是否是JIT模式,如果是jit模式,则调用hotloop块进行热点判断,同样的,如果是FUNCF指令,则调用hotcall块:
case BC_FUNCF:
|.if JIT
| hotcall RB
|.endif
case BC_FUNCV: /* NYI: compiled vararg functions. */
| // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow and ins_AD is a no-op.
break;
hotloop块的定义如下:
|// Decrement hashed hotcount and trigger trace recorder if zero.
|.macro hotloop, reg
| mov reg, PC
| shr reg, 1
| and reg, HOTCOUNT_PCMASK
| sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_LOOP
| jb ->vm_hotloop
|.endmacro
它将当前指令的地址右移一位,并与HOTCOUNT_PCMASK与操作,得到一个索引(哈希运算),根据这个索引在数值中找到计数值,减去HOTCOUNT_LOOP,当这个计数值小于0时,跳转到vm_hotloop继续执行。
|->vm_hotloop: // Hot loop counter underflow.
|.if JIT
| mov LFUNC:RB, [BASE-8] // Same as curr_topL(L).
| mov RB, LFUNC:RB->pc
| movzx RD, byte [RB+PC2PROTO(framesize)]
| lea RD, [BASE+RD*8]
| mov L:RB, SAVE_L
| mov L:RB->base, BASE
| mov L:RB->top, RD
| mov FCARG2, PC
| lea FCARG1, [DISPATCH+GG_DISP2J]
| mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
| mov SAVE_PC, PC
| call extern lj_trace_hot@8 // (jit_State *J, const BCIns *pc)
| jmp <3
|.endif
首先获取当前的函数,并得到字节码PC指针,获取栈大小并保存到RD中,接着讲top的位置保存到RD中,在进行一些参数设置后,调用lj_trace_hot用于跟踪热点,该函数位于lj_trace.c中:
/* A hotcount triggered. Start recording a root trace. */
void LJ_FASTCALL lj_trace_hot(jit_State *J, const BCIns *pc)
{
/* Note: pc is the interpreter bytecode PC here. It's offset by 1. */
ERRNO_SAVE
/* Reset hotcount. */
hotcount_set(J2GG(J), pc, J->param[JIT_P_hotloop]*HOTCOUNT_LOOP);
/* Only start a new trace if not recording or inside __gc call or vmevent. */
if (J->state == LJ_TRACE_IDLE &&
!(J2G(J)->hookmask & (HOOK_GC|HOOK_VMEVENT))) {
J->parent = 0; /* Root trace. */
J->exitno = 0;
J->state = LJ_TRACE_START;
lj_trace_ins(J, pc-1);
}
ERRNO_RESTORE
}
它将状态设置为LJ_TRACE_START后,开始调用lj_trace_ins进行热点跟踪:
/* A bytecode instruction is about to be executed. Record it. */
void lj_trace_ins(jit_State *J, const BCIns *pc)
{
/* Note: J->L must already be set. pc is the true bytecode PC here. */
J->pc = pc;
J->fn = curr_func(J->L);
J->pt = isluafunc(J->fn) ? funcproto(J->fn) : NULL;
while (lj_vm_cpcall(J->L, NULL, (void *)J, trace_state) != 0)
J->state = LJ_TRACE_ERR;
}
这里的pc是指向的字节码指令,在循环中不断执行和跟踪,这里的跟踪通过trace_state函数实现,这个函数存在7种状态:
/* Trace compiler state. */
typedef enum {
LJ_TRACE_IDLE, /* Trace compiler idle. */
LJ_TRACE_ACTIVE = 0x10,
LJ_TRACE_RECORD, /* Bytecode recording active. */
LJ_TRACE_START, /* New trace started. */
LJ_TRACE_END, /* End of trace. */
LJ_TRACE_ASM, /* Assemble trace. */
LJ_TRACE_ERR /* Trace aborted with error. */
} TraceState;
IDLE表示空闲、RECORD表示正在跟踪记录、END表示结束、ASM表示开始编译机器指令,这个状态转换函数的实现如下:
/* State machine for the trace compiler. Protected callback. */
static TValue *trace_state(lua_State *L, lua_CFunction dummy, void *ud)
{
jit_State *J = (jit_State *)ud;
UNUSED(dummy);
do {
retry:
switch (J->state) {
case LJ_TRACE_START:
J->state = LJ_TRACE_RECORD; /* trace_start() may change state. */
trace_start(J);
lj_dispatch_update(J2G(J));
break;
case LJ_TRACE_RECORD:
trace_pendpatch(J, 0);
setvmstate(J2G(J), RECORD);
lj_vmevent_send_(L, RECORD,
/* Save/restore tmptv state for trace recorder. */
TValue savetv = J2G(J)->tmptv;
TValue savetv2 = J2G(J)->tmptv2;
setintV(L->top++, J->cur.traceno);
setfuncV(L, L->top++, J->fn);
setintV(L->top++, J->pt ? (int32_t)proto_bcpos(J->pt, J->pc) : -1);
setintV(L->top++, J->framedepth);
J2G(J)->tmptv = savetv;
J2G(J)->tmptv2 = savetv2;
);
lj_record_ins(J);
break;
case LJ_TRACE_END:
trace_pendpatch(J, 1);
J->loopref = 0;
if ((J->flags & JIT_F_OPT_LOOP) &&
J->cur.link == J->cur.traceno && J->framedepth + J->retdepth == 0) {
setvmstate(J2G(J), OPT);
lj_opt_dce(J);
if (lj_opt_loop(J)) { /* Loop optimization failed? */
J->cur.link = 0;
J->cur.linktype = LJ_TRLINK_NONE;
J->loopref = J->cur.nins;
J->state = LJ_TRACE_RECORD; /* Try to continue recording. */
break;
}
J->loopref = J->chain[IR_LOOP]; /* Needed by assembler. */
}
lj_opt_split(J);
lj_opt_sink(J);
if (!J->loopref) J->cur.snap[J->cur.nsnap-1].count = SNAPCOUNT_DONE;
J->state = LJ_TRACE_ASM;
break;
case LJ_TRACE_ASM:
setvmstate(J2G(J), ASM);
lj_asm_trace(J, &J->cur);
trace_stop(J);
setvmstate(J2G(J), INTERP);
J->state = LJ_TRACE_IDLE;
lj_dispatch_update(J2G(J));
return NULL;
default: /* Trace aborted asynchronously. */
setintV(L->top++, (int32_t)LJ_TRERR_RECERR);
/* fallthrough */
case LJ_TRACE_ERR:
trace_pendpatch(J, 1);
if (trace_abort(J))
goto retry;
setvmstate(J2G(J), INTERP);
J->state = LJ_TRACE_IDLE;
lj_dispatch_update(J2G(J));
return NULL;
}
} while (J->state > LJ_TRACE_RECORD);
return NULL;
}
它根据不同的状态执行不同的操作函数,我们可以简化为:
/* State machine for the trace compiler. Protected callback. */
static TValue *trace_state(lua_State *L, lua_CFunction dummy, void *ud)
{
jit_State *J = (jit_State *)ud;
UNUSED(dummy);
do {
retry:
switch (J->state) {
case LJ_TRACE_START:
J->state = LJ_TRACE_RECORD; /* trace_start() may change state. */
trace_start(J);
lj_dispatch_update(J2G(J));
break;
case LJ_TRACE_RECORD:
lj_record_ins(J);
break;
case LJ_TRACE_END:
trace_pendpatch(J, 1);
J->state = LJ_TRACE_ASM;
break;
case LJ_TRACE_ASM:
setvmstate(J2G(J), ASM);
lj_asm_trace(J, &J->cur);
trace_stop(J);
setvmstate(J2G(J), INTERP);
J->state = LJ_TRACE_IDLE;
lj_dispatch_update(J2G(J));
return NULL;
default: /* Trace aborted asynchronously. */
setintV(L->top++, (int32_t)LJ_TRERR_RECERR);
case LJ_TRACE_ERR:
trace_pendpatch(J, 1);
if (trace_abort(J))
goto retry;
setvmstate(J2G(J), INTERP);
J->state = LJ_TRACE_IDLE;
lj_dispatch_update(J2G(J));
return NULL;
}
} while (J->state > LJ_TRACE_RECORD);
return NULL;
}
Trace_start用于初始化trace结构,分配一个traceno等,它是一个数组的下标,其中比较重要的是lj_record_ins函数,它用于记录一个字节码指令,并保存为一个SSA中间代码IR形式,IR的定义在lj_ir.c中:
/* -- IR instructions ----------------------------------------------------- */
/* IR instruction definition. Order matters, see below. ORDER IR */
#define IRDEF(_) \
/* Guarded assertions. */ \
/* Must be properly aligned to flip opposites (^1) and (un)ordered (^4). */ \
_(LT, N , ref, ref) \
_(GE, N , ref, ref) \
_(LE, N , ref, ref) \
_(GT, N , ref, ref) \
\
_(ULT, N , ref, ref) \
_(UGE, N , ref, ref) \
_(ULE, N , ref, ref) \
_(UGT, N , ref, ref) \
\
_(EQ, C , ref, ref) \
_(NE, C , ref, ref) \
\
_(ABC, N , ref, ref) \
_(RETF, S , ref, ref) \
\
/* Miscellaneous ops. */ \
_(NOP, N , ___, ___) \
_(BASE, N , lit, lit) \
_(PVAL, N , lit, ___) \
_(GCSTEP, S , ___, ___) \
_(HIOP, S , ref, ref) \
_(LOOP, S , ___, ___) \
_(USE, S , ref, ___) \
_(PHI, S , ref, ref) \
_(RENAME, S , ref, lit) \
_(PROF, S , ___, ___) \
\
/* Constants. */ \
_(KPRI, N , ___, ___) \
_(KINT, N , cst, ___) \
_(KGC, N , cst, ___) \
_(KPTR, N , cst, ___) \
_(KKPTR, N , cst, ___) \
_(KNULL, N , cst, ___) \
_(KNUM, N , cst, ___) \
_(KINT64, N , cst, ___) \
_(KSLOT, N , ref, lit) \
\
/* Bit ops. */ \
_(BNOT, N , ref, ___) \
_(BSWAP, N , ref, ___) \
_(BAND, C , ref, ref) \
_(BOR, C , ref, ref) \
_(BXOR, C , ref, ref) \
_(BSHL, N , ref, ref) \
_(BSHR, N , ref, ref) \
_(BSAR, N , ref, ref) \
_(BROL, N , ref, ref) \
_(BROR, N , ref, ref) \
\
/* Arithmetic ops. ORDER ARITH */ \
_(ADD, C , ref, ref) \
_(SUB, N , ref, ref) \
_(MUL, C , ref, ref) \
_(DIV, N , ref, ref) \
_(MOD, N , ref, ref) \
_(POW, N , ref, ref) \
_(NEG, N , ref, ref) \
\
_(ABS, N , ref, ref) \
_(ATAN2, N , ref, ref) \
_(LDEXP, N , ref, ref) \
_(MIN, C , ref, ref) \
_(MAX, C , ref, ref) \
_(FPMATH, N , ref, lit) \
\
/* Overflow-checking arithmetic ops. */ \
_(ADDOV, CW, ref, ref) \
_(SUBOV, NW, ref, ref) \
_(MULOV, CW, ref, ref) \
\
/* Memory ops. A = array, H = hash, U = upvalue, F = field, S = stack. */ \
\
/* Memory references. */ \
_(AREF, R , ref, ref) \
_(HREFK, R , ref, ref) \
_(HREF, L , ref, ref) \
_(NEWREF, S , ref, ref) \
_(UREFO, LW, ref, lit) \
_(UREFC, LW, ref, lit) \
_(FREF, R , ref, lit) \
_(STRREF, N , ref, ref) \
_(LREF, L , ___, ___) \
\
/* Loads and Stores. These must be in the same order. */ \
_(ALOAD, L , ref, ___) \
_(HLOAD, L , ref, ___) \
_(ULOAD, L , ref, ___) \
_(FLOAD, L , ref, lit) \
_(XLOAD, L , ref, lit) \
_(SLOAD, L , lit, lit) \
_(VLOAD, L , ref, ___) \
\
_(ASTORE, S , ref, ref) \
_(HSTORE, S , ref, ref) \
_(USTORE, S , ref, ref) \
_(FSTORE, S , ref, ref) \
_(XSTORE, S , ref, ref) \
\
/* Allocations. */ \
_(SNEW, N , ref, ref) /* CSE is ok, not marked as A. */ \
_(XSNEW, A , ref, ref) \
_(TNEW, AW, lit, lit) \
_(TDUP, AW, ref, ___) \
_(CNEW, AW, ref, ref) \
_(CNEWI, NW, ref, ref) /* CSE is ok, not marked as A. */ \
\
/* Buffer operations. */ \
_(BUFHDR, L , ref, lit) \
_(BUFPUT, L , ref, ref) \
_(BUFSTR, A , ref, ref) \
\
/* Barriers. */ \
_(TBAR, S , ref, ___) \
_(OBAR, S , ref, ref) \
_(XBAR, S , ___, ___) \
\
/* Type conversions. */ \
_(CONV, NW, ref, lit) \
_(TOBIT, N , ref, ref) \
_(TOSTR, N , ref, lit) \
_(STRTO, N , ref, ___) \
\
/* Calls. */ \
_(CALLN, N , ref, lit) \
_(CALLA, A , ref, lit) \
_(CALLL, L , ref, lit) \
_(CALLS, S , ref, lit) \
_(CALLXS, S , ref, ref) \
_(CARG, N , ref, ref) \
\
/* End of list. */
多种情况都会出现结束记录的情况,如遇到了已经编译的指令。在LJ_TRACE_ASM状态下会进行代码的编译操作lj_asm_trace函数位于lj_asm.c中,函数中有一个循环如下:
/* Assemble a trace in linear backwards order. */
for (as->curins--; as->curins > as->stopins; as->curins--) {
IRIns *ir = IR(as->curins);
lua_assert(!(LJ_32 && irt_isint64(ir->t))); /* Handled by SPLIT. */
if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE))
continue; /* Dead-code elimination can be soooo easy. */
if (irt_isguard(ir->t))
asm_snap_prep(as);
RA_DBG_REF();
checkmclim(as);
asm_ir(as, ir);
}
它调用asm_ir将所有的ir指令转换成机器码,在lj_asm_trace函数后,接着调用trace_stop函数结束一个跟踪,该函数实现如下:
/* Stop tracing. */
static void trace_stop(jit_State *J)
{
BCIns *pc = mref(J->cur.startpc, BCIns);
BCOp op = bc_op(J->cur.startins);
GCproto *pt = &gcref(J->cur.startpt)->pt;
TraceNo traceno = J->cur.traceno;
GCtrace *T = J->curfinal;
lua_State *L;
switch (op) {
case BC_FORL:
setbc_op(pc+bc_j(J->cur.startins), BC_JFORI); /* Patch FORI, too. */
/* fallthrough */
case BC_LOOP:
case BC_ITERL:
case BC_FUNCF:
/* Patch bytecode of starting instruction in root trace. */
setbc_op(pc, (int)op+(int)BC_JLOOP-(int)BC_LOOP);
setbc_d(pc, traceno);
addroot:
/* Add to root trace chain in prototype. */
J->cur.nextroot = pt->trace;
pt->trace = (TraceNo1)traceno;
break;
case BC_RET:
case BC_RET0:
case BC_RET1:
*pc = BCINS_AD(BC_JLOOP, J->cur.snap[0].nslots, traceno);
goto addroot;
case BC_JMP:
/* Patch exit branch in parent to side trace entry. */
lua_assert(J->parent != 0 && J->cur.root != 0);
lj_asm_patchexit(J, traceref(J, J->parent), J->exitno, J->cur.mcode);
/* Avoid compiling a side trace twice (stack resizing uses parent exit). */
traceref(J, J->parent)->snap[J->exitno].count = SNAPCOUNT_DONE;
/* Add to side trace chain in root trace. */
{
GCtrace *root = traceref(J, J->cur.root);
root->nchild++;
J->cur.nextside = root->nextside;
root->nextside = (TraceNo1)traceno;
}
break;
case BC_CALLM:
case BC_CALL:
case BC_ITERC:
/* Trace stitching: patch link of previous trace. */
traceref(J, J->exitno)->link = traceno;
break;
default:
lua_assert(0);
break;
}
/* Commit new mcode only after all patching is done. */
lj_mcode_commit(J, J->cur.mcode);
J->postproc = LJ_POST_NONE;
trace_save(J, T);
L = J->L;
lj_vmevent_send(L, TRACE,
setstrV(L, L->top++, lj_str_newlit(L, "stop"));
setintV(L->top++, traceno);
setfuncV(L, L->top++, J->fn);
);
}
它通过如下两个函数:
setbc_op(pc, (int)op+(int)BC_JLOOP-(int)BC_LOOP);
setbc_d(pc, traceno);
重新设置指令的opcode,即J_op = op + BC_JLOOP – BC_LOOP,那么如果将lj_bc.h中的指令随意打乱会影响到这里的正确性。
修改后的指令为:j_op traceno
同时可以看到pt->trace字段记录的是一个traceno
pt->trace = (TraceNo1)traceno;
那么接下来看解释器中对JFORL的实现:
case BC_JFORI:
case BC_JFORL:
#if !LJ_HASJIT
break;
#endif
case BC_FORI:
case BC_IFORL:
vk = (op == BC_IFORL || op == BC_JFORL);
| ins_AJ // RA = base, RD = target (after end of loop or start of loop)
| lea RA, [BASE+RA*8]
if (LJ_DUALNUM) {
| cmp FOR_TIDX, LJ_TISNUM; jne >9
if (!vk) {
| cmp FOR_TSTOP, LJ_TISNUM; jne ->vmeta_for
| cmp FOR_TSTEP, LJ_TISNUM; jne ->vmeta_for
| mov RB, dword FOR_IDX
| cmp dword FOR_STEP, 0; jl >5
} else {
#ifdef LUA_USE_ASSERT
| cmp FOR_TSTOP, LJ_TISNUM; jne ->assert_bad_for_arg_type
| cmp FOR_TSTEP, LJ_TISNUM; jne ->assert_bad_for_arg_type
#endif
| mov RB, dword FOR_STEP
| test RB, RB; js >5
| add RB, dword FOR_IDX; jo >1
| mov dword FOR_IDX, RB
}
| cmp RB, dword FOR_STOP
| mov FOR_TEXT, LJ_TISNUM
| mov dword FOR_EXT, RB
if (op == BC_FORI) {
| jle >7
|1:
|6:
| branchPC RD
} else if (op == BC_JFORI) {
| branchPC RD
| movzx RD, PC_RD
| jle =>BC_JLOOP
|1:
|6:
} else if (op == BC_IFORL) {
| jg >7
|6:
| branchPC RD
|1:
} else {
| jle =>BC_JLOOP
|1:
|6:
}
当op = JFORL时,跳转到BC_JLOOP,如下:
case BC_JLOOP:
|.if JIT
| ins_AD // RA = base (ignored), RD = traceno
| mov RA, [DISPATCH+DISPATCH_J(trace)]
| mov TRACE:RD, [RA+RD*4]
| mov RDa, TRACE:RD->mcode
| mov L:RB, SAVE_L
| mov [DISPATCH+DISPATCH_GL(jit_base)], BASE
| mov [DISPATCH+DISPATCH_GL(tmpbuf.L)], L:RB
| // Save additional callee-save registers only used in compiled code.
|.if X64WIN
| mov TMPQ, r12
| mov TMPa, r13
| mov CSAVE_4, r14
| mov CSAVE_3, r15
| mov RAa, rsp
| sub rsp, 9*16+4*8
| movdqa [RAa], xmm6
| movdqa [RAa-1*16], xmm7
| movdqa [RAa-2*16], xmm8
| movdqa [RAa-3*16], xmm9
| movdqa [RAa-4*16], xmm10
| movdqa [RAa-5*16], xmm11
| movdqa [RAa-6*16], xmm12
| movdqa [RAa-7*16], xmm13
| movdqa [RAa-8*16], xmm14
| movdqa [RAa-9*16], xmm15
|.elif X64
| mov TMPQ, r12
| mov TMPa, r13
| sub rsp, 16
|.endif
| jmp RDa
|.endif
break;
先根据RD中保存的traceno获取到trace结构,并将trace结构中保存的机器码赋值在Rda中,进行堆栈转换后,jmp Rda直接跳转到机器码处执行。
在x86中,当字节码执行结束,继续执行下一个字节码时,都会使用ins_next块,它的定义如下:
|.macro ins_NEXT
| mov RC, [PC]
| movzx RA, RCH
| movzx OP, RCL
| add PC, 4
| shr RC, 16
|.if X64
| jmp aword [DISPATCH+OP*8]
|.else
| jmp aword [DISPATCH+OP*4]
|.endif
|.endmacro
它从PC指向的字节码中获取了opcode,并跳转到DISPATCH + OP *4的地方执行,可以看出OP实质上保存的是数组的下标而这些数组元素都指向了vm_record汇编块:
|->vm_record: // Dispatch target for recording phase.
|.if JIT
| movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
| test RDL, HOOK_VMEVENT // No recording while in vmevent.
| jnz >5
| // Decrement the hookcount for consistency, but always do the call.
| test RDL, HOOK_ACTIVE
| jnz >1
| test RDL, LUA_MASKLINE|LUA_MASKCOUNT
| jz >1
| dec dword [DISPATCH+DISPATCH_GL(hookcount)]
| jmp >1
|.endif
|
|->vm_rethook: // Dispatch target for return hooks.
| movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
| test RDL, HOOK_ACTIVE // Hook already active?
| jnz >5
| jmp >1
|
|->vm_inshook: // Dispatch target for instr/line hooks.
| movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
| test RDL, HOOK_ACTIVE // Hook already active?
| jnz >5
|
| test RDL, LUA_MASKLINE|LUA_MASKCOUNT
| jz >5
| dec dword [DISPATCH+DISPATCH_GL(hookcount)]
| jz >1
| test RDL, LUA_MASKLINE
| jz >5
|1:
| mov L:RB, SAVE_L
| mov L:RB->base, BASE
| mov FCARG2, PC // Caveat: FCARG2 == BASE
| mov FCARG1, L:RB
| // SAVE_PC must hold the _previous_ PC. The callee updates it with PC.
| call extern lj_dispatch_ins@8 // (lua_State *L, const BCIns *pc)
|3:
| mov BASE, L:RB->base
|4:
| movzx RA, PC_RA
|5:
| movzx OP, PC_OP
| movzx RD, PC_RD
|.if X64
| jmp aword [DISPATCH+OP*8+GG_DISP2STATIC] // Re-dispatch to static ins.
|.else
| jmp aword [DISPATCH+OP*4+GG_DISP2STATIC] // Re-dispatch to static ins.
|.endif
调用lj_dispatch_ins后,最终跳转到DISPATCH+OP*4+GG_DISP2STATIC这个地址继续执行,这个地址正是每个opcode对应的解释器汇编块。
Jit的正常运行还涉及堆栈状态的转换、jit模式到解释模式的跳转等(SSA守护代码),远不止这些。