tarantool / luajit / 7119175127 (github push)

06 Dec 2023 06:58PM UTC coverage: 88.591% (-0.03%) from 88.621%

igormunkin:
Fix HREFK forwarding vs. table.clear().

Reported by XmiliaH.

(cherry-picked from commit d5a237eae)

When the HREFK (and also ALOAD, HLOAD) forwarding optimization is performed,
a `table.clear()` call may occur on the table operand of the HREFK between
the table's creation and the IR from which the value is forwarded. This call
isn't taken into account, so it may lead to overly optimistic value
forwarding from NEWREF (and also ASTORE, HSTORE), or to an omitted type
guard for the HREFK operation. This results in incorrect trace behaviour
(for example, reading a non-nil value from the cleared table).

This patch adds the necessary checks for `table.clear()` calls.
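
For illustration only, a minimal Lua sketch of the kind of pattern this
affects (this is not the actual regression test added by the patch; the
function name, key name and loop count are arbitrary). Once the loop is
compiled to a trace, the load after `table.clear()` must observe nil rather
than the value forwarded from the preceding store:

-- Hypothetical reproducer sketch, assuming LuaJIT's table.clear extension.
local table_clear = require("table.clear")

local function probe(n)
  local last
  for i = 1, n do
    local t = {}        -- table created on the trace
    t.key = i           -- store to a constant key
    table_clear(t)      -- clears the table between the store and the load
    last = t.key        -- the load must see nil, not the stale value i
  end
  return last
end

-- With the fix, the compiled trace agrees with the interpreter: nil.
assert(probe(100) == nil)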

Sergey Kaplun:
* added the description and the test for the problem

Part of tarantool/tarantool#9145

Reviewed-by: Maxim Kokryashkin <m.kokryashkin@tarantool.org>
Reviewed-by: Sergey Bronnikov <sergeyb@tarantool.org>
Signed-off-by: Igor Munkin <imun@tarantool.org>

5377 of 5987 branches covered (89.81%)

Branch coverage included in aggregate %.

12 of 12 new or added lines in 1 file covered. (100.0%)

24 existing lines in 5 files now uncovered.

20619 of 23357 relevant lines covered (88.28%)

2754697.77 hits per line

Source File

/src/lj_asm_x86.h (92.07% covered)
1
/*
2
** x86/x64 IR assembler (SSA IR -> machine code).
3
** Copyright (C) 2005-2017 Mike Pall. See Copyright Notice in luajit.h
4
*/
5

6
/* -- Guard handling ------------------------------------------------------ */
7

8
/* Generate an exit stub group at the bottom of the reserved MCode memory. */
9
static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
339✔
10
{
11
  ExitNo i, groupofs = (group*EXITSTUBS_PER_GROUP) & 0xff;
339✔
12
  MCode *mxp = as->mcbot;
339✔
13
  MCode *mxpstart = mxp;
339✔
14
  if (mxp + (2+2)*EXITSTUBS_PER_GROUP+8+5 >= as->mctop)
339✔
15
    asm_mclimit(as);
×
16
  /* Push low byte of exitno for each exit stub. */
17
  *mxp++ = XI_PUSHi8; *mxp++ = (MCode)groupofs;
339✔
18
  for (i = 1; i < EXITSTUBS_PER_GROUP; i++) {
10,848✔
19
    *mxp++ = XI_JMPs; *mxp++ = (MCode)((2+2)*(EXITSTUBS_PER_GROUP - i) - 2);
10,509✔
20
    *mxp++ = XI_PUSHi8; *mxp++ = (MCode)(groupofs + i);
10,509✔
21
  }
22
  /* Push the high byte of the exitno for each exit stub group. */
23
  *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8);
339✔
24
#if !LJ_GC64
25
  /* Store DISPATCH at original stack slot 0. Account for the two push ops. */
26
  *mxp++ = XI_MOVmi;
27
  *mxp++ = MODRM(XM_OFS8, 0, RID_ESP);
28
  *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
29
  *mxp++ = 2*sizeof(void *);
30
  *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4;
31
#endif
32
  /* Jump to exit handler which fills in the ExitState. */
33
  *mxp++ = XI_JMP; mxp += 4;
339✔
34
  *((int32_t *)(mxp-4)) = jmprel(as->J, mxp, (MCode *)(void *)lj_vm_exit_handler);
339✔
35
  /* Commit the code for this group (even if assembly fails later on). */
36
  lj_mcode_commitbot(as->J, mxp);
339✔
37
  as->mcbot = mxp;
339✔
38
  as->mclim = as->mcbot + MCLIM_REDZONE;
339✔
39
  return mxpstart;
339✔
40
}
41

42
/* Setup all needed exit stubs. */
43
static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
5,327✔
44
{
45
  ExitNo i;
5,327✔
46
  if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR)
5,327✔
47
    lj_trace_err(as->J, LJ_TRERR_SNAPOV);
×
48
  for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++)
11,398✔
49
    if (as->J->exitstubgroup[i] == NULL)
6,071✔
50
      as->J->exitstubgroup[i] = asm_exitstub_gen(as, i);
339✔
51
}
5,327✔
52

53
/* Emit conditional branch to exit for guard.
54
** It's important to emit this *after* all registers have been allocated,
55
** because rematerializations may invalidate the flags.
56
*/
57
static void asm_guardcc(ASMState *as, int cc)
295,036✔
58
{
59
  MCode *target = exitstub_addr(as->J, as->snapno);
295,036✔
60
  MCode *p = as->mcp;
295,036✔
61
  if (LJ_UNLIKELY(p == as->invmcp)) {
295,036✔
62
    as->loopinv = 1;
2,401✔
63
    *(int32_t *)(p+1) = jmprel(as->J, p+5, target);
2,401✔
64
    target = p;
2,401✔
65
    cc ^= 1;
2,401✔
66
    if (as->realign) {
2,401✔
67
      if (LJ_GC64 && LJ_UNLIKELY(as->mrm.base == RID_RIP))
1,090✔
68
        as->mrm.ofs += 2;  /* Fixup RIP offset for pending fused load. */
×
69
      emit_sjcc(as, cc, target);
1,090✔
70
      return;
1,090✔
71
    }
72
  }
73
  if (LJ_GC64 && LJ_UNLIKELY(as->mrm.base == RID_RIP))
293,946✔
74
    as->mrm.ofs += 6;  /* Fixup RIP offset for pending fused load. */
21✔
75
  emit_jcc(as, cc, target);
293,946✔
76
}
77

78
/* -- Memory operand fusion ----------------------------------------------- */
79

80
/* Limit linear search to this distance. Avoids O(n^2) behavior. */
81
#define CONFLICT_SEARCH_LIM        31
82

83
/* Check if a reference is a signed 32 bit constant. */
84
static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
143,385✔
85
{
86
  if (irref_isk(ref)) {
143,385✔
87
    IRIns *ir = IR(ref);
139,857✔
88
#if LJ_GC64
89
    if (ir->o == IR_KNULL || !irt_is64(ir->t)) {
139,857✔
90
      *k = ir->i;
133,910✔
91
      return 1;
133,910✔
92
    } else if (checki32((int64_t)ir_k64(ir)->u64)) {
5,947✔
93
      *k = (int32_t)ir_k64(ir)->u64;
712✔
94
      return 1;
712✔
95
    }
96
#else
97
    if (ir->o != IR_KINT64) {
98
      *k = ir->i;
99
      return 1;
100
    } else if (checki32((int64_t)ir_kint64(ir)->u64)) {
101
      *k = (int32_t)ir_kint64(ir)->u64;
102
      return 1;
103
    }
104
#endif
105
  }
106
  return 0;
107
}
108

109
/* Check if there's no conflicting instruction between curins and ref.
110
** Also avoid fusing loads if there are multiple references.
111
*/
112
static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
16,979✔
113
{
114
  IRIns *ir = as->ir;
16,979✔
115
  IRRef i = as->curins;
16,979✔
116
  if (i > ref + CONFLICT_SEARCH_LIM)
16,979✔
117
    return 0;  /* Give up, ref is too far away. */
118
  while (--i > ref) {
21,466✔
119
    if (ir[i].o == conflict)
4,863✔
120
      return 0;  /* Conflict found. */
121
    else if (!noload && (ir[i].op1 == ref || ir[i].op2 == ref))
2,941✔
122
      return 0;
123
  }
124
  return 1;  /* Ok, no conflict. */
125
}
126

127
/* Fuse array base into memory operand. */
128
static IRRef asm_fuseabase(ASMState *as, IRRef ref)
2,647✔
129
{
130
  IRIns *irb = IR(ref);
2,647✔
131
  as->mrm.ofs = 0;
2,647✔
132
  if (irb->o == IR_FLOAD) {
2,647✔
133
    IRIns *ira = IR(irb->op1);
2,622✔
134
    lj_assertA(irb->op2 == IRFL_TAB_ARRAY, "expected FLOAD TAB_ARRAY");
2,622✔
135
    /* We can avoid the FLOAD of t->array for colocated arrays. */
136
    if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
2,622✔
137
        !neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 1)) {
472✔
138
      as->mrm.ofs = (int32_t)sizeof(GCtab);  /* Ofs to colocated array. */
225✔
139
      return irb->op1;  /* Table obj. */
225✔
140
    }
141
  } else if (irb->o == IR_ADD && irref_isk(irb->op2)) {
25✔
142
    /* Fuse base offset (vararg load). */
143
    as->mrm.ofs = IR(irb->op2)->i;
25✔
144
    return irb->op1;
25✔
145
  }
146
  return ref;  /* Otherwise use the given array base. */
147
}
148

149
/* Fuse array reference into memory operand. */
150
static void asm_fusearef(ASMState *as, IRIns *ir, RegSet allow)
151
{
152
  IRIns *irx;
153
  lj_assertA(ir->o == IR_AREF, "expected AREF");
154
  as->mrm.base = (uint8_t)ra_alloc1(as, asm_fuseabase(as, ir->op1), allow);
155
  irx = IR(ir->op2);
156
  if (irref_isk(ir->op2)) {
157
    as->mrm.ofs += 8*irx->i;
158
    as->mrm.idx = RID_NONE;
159
  } else {
160
    rset_clear(allow, as->mrm.base);
161
    as->mrm.scale = XM_SCALE8;
162
    /* Fuse a constant ADD (e.g. t[i+1]) into the offset.
163
    ** Doesn't help much without ABCelim, but reduces register pressure.
164
    */
165
    if (!LJ_64 &&  /* Has bad effects with negative index on x64. */
166
        mayfuse(as, ir->op2) && ra_noreg(irx->r) &&
167
        irx->o == IR_ADD && irref_isk(irx->op2)) {
168
      as->mrm.ofs += 8*IR(irx->op2)->i;
169
      as->mrm.idx = (uint8_t)ra_alloc1(as, irx->op1, allow);
170
    } else {
171
      as->mrm.idx = (uint8_t)ra_alloc1(as, ir->op2, allow);
172
    }
173
  }
174
}
175

176
/* Fuse array/hash/upvalue reference into memory operand.
177
** Caveat: this may allocate GPRs for the base/idx registers. Be sure to
178
** pass the final allow mask, excluding any GPRs used for other inputs.
179
** In particular: 2-operand GPR instructions need to call ra_dest() first!
180
*/
181
static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow)
96,054✔
182
{
183
  IRIns *ir = IR(ref);
96,054✔
184
  if (ra_noreg(ir->r)) {
96,054✔
185
    switch ((IROp)ir->o) {
95,925✔
186
    case IR_AREF:
2,657✔
187
      if (mayfuse(as, ref)) {
2,657✔
188
        asm_fusearef(as, ir, allow);
2,612✔
189
        return;
2,612✔
190
      }
191
      break;
192
    case IR_HREFK:
91,415✔
193
      if (mayfuse(as, ref)) {
91,415✔
194
        as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
17,469✔
195
        as->mrm.ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
17,469✔
196
        as->mrm.idx = RID_NONE;
17,469✔
197
        return;
17,469✔
198
      }
199
      break;
200
    case IR_UREFC:
464✔
201
      if (irref_isk(ir->op1)) {
464✔
202
        GCfunc *fn = ir_kfunc(IR(ir->op1));
281✔
203
        GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv;
281✔
204
#if LJ_GC64
205
        int64_t ofs = dispofs(as, &uv->tv);
281✔
206
        if (checki32(ofs) && checki32(ofs+4)) {
281✔
207
          as->mrm.ofs = (int32_t)ofs;
281✔
208
          as->mrm.base = RID_DISPATCH;
281✔
209
          as->mrm.idx = RID_NONE;
281✔
210
          return;
281✔
211
        }
212
#else
213
        as->mrm.ofs = ptr2addr(&uv->tv);
214
        as->mrm.base = as->mrm.idx = RID_NONE;
215
        return;
216
#endif
217
      }
218
      break;
219
    default:
220
      lj_assertA(ir->o == IR_HREF || ir->o == IR_NEWREF || ir->o == IR_UREFO ||
221
                 ir->o == IR_KKPTR,
222
                 "bad IR op %d", ir->o);
223
      break;
224
    }
225
  }
129✔
226
  as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow);
75,692✔
227
  as->mrm.ofs = 0;
75,692✔
228
  as->mrm.idx = RID_NONE;
75,692✔
229
}
230

231
/* Fuse FLOAD/FREF reference into memory operand. */
232
static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow)
233
{
234
  lj_assertA(ir->o == IR_FLOAD || ir->o == IR_FREF,
235
             "bad IR op %d", ir->o);
236
  as->mrm.idx = RID_NONE;
237
  if (ir->op1 == REF_NIL) {  /* FLOAD from GG_State with offset. */
238
#if LJ_GC64
239
    as->mrm.ofs = (int32_t)(ir->op2 << 2) - GG_OFS(dispatch);
240
    as->mrm.base = RID_DISPATCH;
241
#else
242
    as->mrm.ofs = (int32_t)(ir->op2 << 2) + ptr2addr(J2GG(as->J));
243
    as->mrm.base = RID_NONE;
244
#endif
245
    return;
246
  }
247
  as->mrm.ofs = field_ofs[ir->op2];
248
  if (irref_isk(ir->op1)) {
249
    IRIns *op1 = IR(ir->op1);
250
#if LJ_GC64
251
    if (ir->op1 == REF_NIL) {
252
      as->mrm.ofs -= GG_OFS(dispatch);
253
      as->mrm.base = RID_DISPATCH;
254
      return;
255
    } else if (op1->o == IR_KPTR || op1->o == IR_KKPTR) {
256
      intptr_t ofs = dispofs(as, ir_kptr(op1));
257
      if (checki32(as->mrm.ofs + ofs)) {
258
        as->mrm.ofs += (int32_t)ofs;
259
        as->mrm.base = RID_DISPATCH;
260
        return;
261
      }
262
    }
263
#else
264
    as->mrm.ofs += op1->i;
265
    as->mrm.base = RID_NONE;
266
    return;
267
#endif
268
  }
269
  as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow);
270
}
271

272
/* Fuse string reference into memory operand. */
273
static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow)
274
{
275
  IRIns *irr;
276
  lj_assertA(ir->o == IR_STRREF, "bad IR op %d", ir->o);
277
  as->mrm.base = as->mrm.idx = RID_NONE;
278
  as->mrm.scale = XM_SCALE1;
279
  as->mrm.ofs = sizeof(GCstr);
280
  if (!LJ_GC64 && irref_isk(ir->op1)) {
281
    as->mrm.ofs += IR(ir->op1)->i;
282
  } else {
283
    Reg r = ra_alloc1(as, ir->op1, allow);
284
    rset_clear(allow, r);
285
    as->mrm.base = (uint8_t)r;
286
  }
287
  irr = IR(ir->op2);
288
  if (irref_isk(ir->op2)) {
289
    as->mrm.ofs += irr->i;
290
  } else {
291
    Reg r;
292
    /* Fuse a constant add into the offset, e.g. string.sub(s, i+10). */
293
    if (!LJ_64 &&  /* Has bad effects with negative index on x64. */
294
        mayfuse(as, ir->op2) && irr->o == IR_ADD && irref_isk(irr->op2)) {
295
      as->mrm.ofs += IR(irr->op2)->i;
296
      r = ra_alloc1(as, irr->op1, allow);
297
    } else {
298
      r = ra_alloc1(as, ir->op2, allow);
299
    }
300
    if (as->mrm.base == RID_NONE)
301
      as->mrm.base = (uint8_t)r;
302
    else
303
      as->mrm.idx = (uint8_t)r;
304
  }
305
}
306

307
static void asm_fusexref(ASMState *as, IRRef ref, RegSet allow)
1,089✔
308
{
309
  IRIns *ir = IR(ref);
1,089✔
310
  as->mrm.idx = RID_NONE;
1,089✔
311
  if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
1,089✔
312
#if LJ_GC64
313
    intptr_t ofs = dispofs(as, ir_kptr(ir));
73✔
314
    if (checki32(ofs)) {
73✔
315
      as->mrm.ofs = (int32_t)ofs;
73✔
316
      as->mrm.base = RID_DISPATCH;
73✔
317
      return;
73✔
318
    }
319
  } if (0) {
1,016✔
320
#else
321
    as->mrm.ofs = ir->i;
322
    as->mrm.base = RID_NONE;
323
  } else if (ir->o == IR_STRREF) {
324
    asm_fusestrref(as, ir, allow);
325
#endif
326
  } else {
327
    as->mrm.ofs = 0;
1,016✔
328
    if (canfuse(as, ir) && ir->o == IR_ADD && ra_noreg(ir->r)) {
1,016✔
329
      /* Gather (base+idx*sz)+ofs as emitted by cdata ptr/array indexing. */
330
      IRIns *irx;
740✔
331
      IRRef idx;
740✔
332
      Reg r;
740✔
333
      if (asm_isk32(as, ir->op2, &as->mrm.ofs)) {  /* Recognize x+ofs. */
1,452✔
334
        ref = ir->op1;
712✔
335
        ir = IR(ref);
712✔
336
        if (!(ir->o == IR_ADD && canfuse(as, ir) && ra_noreg(ir->r)))
712✔
337
          goto noadd;
317✔
338
      }
339
      as->mrm.scale = XM_SCALE1;
423✔
340
      idx = ir->op1;
423✔
341
      ref = ir->op2;
423✔
342
      irx = IR(idx);
423✔
343
      if (!(irx->o == IR_BSHL || irx->o == IR_ADD)) {  /* Try other operand. */
423✔
344
        idx = ir->op2;
78✔
345
        ref = ir->op1;
78✔
346
        irx = IR(idx);
78✔
347
      }
348
      if (canfuse(as, irx) && ra_noreg(irx->r)) {
423✔
349
        if (irx->o == IR_BSHL && irref_isk(irx->op2) && IR(irx->op2)->i <= 3) {
308✔
350
          /* Recognize idx<<b with b = 0-3, corresponding to sz = (1),2,4,8. */
351
          idx = irx->op1;
202✔
352
          as->mrm.scale = (uint8_t)(IR(irx->op2)->i << 6);
202✔
353
        } else if (irx->o == IR_ADD && irx->op1 == irx->op2) {
106✔
354
          /* FOLD does idx*2 ==> idx<<1 ==> idx+idx. */
355
          idx = irx->op1;
38✔
356
          as->mrm.scale = XM_SCALE2;
38✔
357
        }
358
      }
359
      r = ra_alloc1(as, idx, allow);
423✔
360
      rset_clear(allow, r);
423✔
361
      as->mrm.idx = (uint8_t)r;
423✔
362
    }
363
  noadd:
276✔
364
    as->mrm.base = (uint8_t)ra_alloc1(as, ref, allow);
1,016✔
365
  }
366
}
367

368
/* Fuse load of 64 bit IR constant into memory operand. */
369
static Reg asm_fuseloadk64(ASMState *as, IRIns *ir)
2,932✔
370
{
371
  const uint64_t *k = &ir_k64(ir)->u64;
2,932✔
372
  if (!LJ_GC64 || checki32((intptr_t)k)) {
2,932✔
373
    as->mrm.ofs = ptr2addr(k);
×
374
    as->mrm.base = RID_NONE;
×
375
#if LJ_GC64
376
  } else if (checki32(dispofs(as, k))) {
2,932✔
377
    as->mrm.ofs = (int32_t)dispofs(as, k);
818✔
378
    as->mrm.base = RID_DISPATCH;
818✔
379
  } else if (checki32(mcpofs(as, k)) && checki32(mcpofs(as, k+1)) &&
2,114✔
380
             checki32(mctopofs(as, k)) && checki32(mctopofs(as, k+1))) {
×
381
    as->mrm.ofs = (int32_t)mcpofs(as, k);
×
382
    as->mrm.base = RID_RIP;
×
383
  } else {  /* Intern 64 bit constant at bottom of mcode. */
384
    if (ir->i) {
2,114✔
385
      lj_assertA(*k == *(uint64_t*)(as->mctop - ir->i),
386
                 "bad interned 64 bit constant");
387
    } else {
388
      while ((uintptr_t)as->mcbot & 7) *as->mcbot++ = XI_INT3;
1,083✔
389
      *(uint64_t*)as->mcbot = *k;
1,079✔
390
      ir->i = (int32_t)(as->mctop - as->mcbot);
1,079✔
391
      as->mcbot += 8;
1,079✔
392
      as->mclim = as->mcbot + MCLIM_REDZONE;
1,079✔
393
      lj_mcode_commitbot(as->J, as->mcbot);
1,079✔
394
    }
395
    as->mrm.ofs = (int32_t)mcpofs(as, as->mctop - ir->i);
2,114✔
396
    as->mrm.base = RID_RIP;
2,114✔
397
#endif
398
  }
399
  as->mrm.idx = RID_NONE;
2,932✔
400
  return RID_MRM;
2,932✔
401
}
402

403
/* Fuse load into memory operand.
404
**
405
** Important caveat: this may emit RIP-relative loads! So don't place any
406
** code emitters between this function and the use of its result.
407
** The only permitted exception is asm_guardcc().
408
*/
409
static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
153,675✔
410
{
411
  IRIns *ir = IR(ref);
153,675✔
412
  if (ra_hasreg(ir->r)) {
153,675✔
413
    if (allow != RSET_EMPTY) {  /* Fast path. */
10,363✔
414
      ra_noweak(as, ir->r);
10,363✔
415
      return ir->r;
10,363✔
416
    }
417
  fusespill:
×
418
    /* Force a spill if only memory operands are allowed (asm_x87load). */
419
    as->mrm.base = RID_ESP;
320✔
420
    as->mrm.ofs = ra_spill(as, ir);
320✔
421
    as->mrm.idx = RID_NONE;
320✔
422
    return RID_MRM;
320✔
423
  }
424
  if (ir->o == IR_KNUM) {
143,312✔
425
    RegSet avail = as->freeset & ~as->modset & RSET_FPR;
5,810✔
426
    lj_assertA(allow != RSET_EMPTY, "no register allowed");
5,810✔
427
    if (!(avail & (avail-1)))  /* Fuse if less than two regs available. */
5,810✔
428
      return asm_fuseloadk64(as, ir);
2,930✔
429
  } else if (ref == REF_BASE || ir->o == IR_KINT64) {
137,502✔
430
    RegSet avail = as->freeset & ~as->modset & RSET_GPR;
87✔
431
    lj_assertA(allow != RSET_EMPTY, "no register allowed");
87✔
432
    if (!(avail & (avail-1))) {  /* Fuse if less than two regs available. */
87✔
433
      if (ref == REF_BASE) {
26✔
434
#if LJ_GC64
435
        as->mrm.ofs = (int32_t)dispofs(as, &J2G(as->J)->jit_base);
24✔
436
        as->mrm.base = RID_DISPATCH;
24✔
437
#else
438
        as->mrm.ofs = ptr2addr(&J2G(as->J)->jit_base);
439
        as->mrm.base = RID_NONE;
440
#endif
441
        as->mrm.idx = RID_NONE;
24✔
442
        return RID_MRM;
24✔
443
      } else {
444
        return asm_fuseloadk64(as, ir);
2✔
445
      }
446
    }
447
  } else if (mayfuse(as, ref)) {
137,415✔
448
    RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
22,586✔
449
    if (ir->o == IR_SLOAD) {
22,586✔
450
      if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) &&
766✔
451
          noconflict(as, ref, IR_RETF, 0) &&
1,193✔
452
          !(LJ_GC64 && irt_isaddr(ir->t))) {
481✔
453
        as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
453✔
454
        as->mrm.ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
453✔
455
                      (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0);
456
        as->mrm.idx = RID_NONE;
453✔
457
        return RID_MRM;
453✔
458
      }
459
    } else if (ir->o == IR_FLOAD) {
21,820✔
460
      /* Generic fusion is only ok for 32 bit operand (but see asm_comp). */
461
      if ((irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t)) &&
13,149✔
462
          noconflict(as, ref, IR_FSTORE, 0)) {
25,955✔
463
        asm_fusefref(as, ir, xallow);
12,892✔
464
        return RID_MRM;
12,892✔
465
      }
466
    } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
8,671✔
467
      if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0) &&
5,813✔
468
          !(LJ_GC64 && irt_isaddr(ir->t))) {
2,815✔
469
        asm_fuseahuref(as, ir->op1, xallow);
2,807✔
470
        return RID_MRM;
2,807✔
471
      }
472
    } else if (ir->o == IR_XLOAD) {
5,763✔
473
      /* Generic fusion is not ok for 8/16 bit operands (but see asm_comp).
474
      ** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
475
      */
476
      if ((!irt_typerange(ir->t, IRT_I8, IRT_U16)) &&
349✔
477
          noconflict(as, ref, IR_XSTORE, 0)) {
474✔
478
        asm_fusexref(as, ir->op1, xallow);
190✔
479
        return RID_MRM;
190✔
480
      }
481
    } else if (ir->o == IR_VLOAD && !(LJ_GC64 && irt_isaddr(ir->t))) {
5,414✔
482
      asm_fuseahuref(as, ir->op1, xallow);
6✔
483
      return RID_MRM;
6✔
484
    }
485
  }
486
  if (ir->o == IR_FLOAD && ir->op1 == REF_NIL) {
124,008✔
487
    asm_fusefref(as, ir, RSET_EMPTY);
6✔
488
    return RID_MRM;
6✔
489
  }
490
  if (!(as->freeset & allow) && !emit_canremat(ref) &&
124,002✔
491
      (allow == RSET_EMPTY || ra_hasspill(ir->s) || iscrossref(as, ref)))
397✔
492
    goto fusespill;
320✔
493
  return ra_allocref(as, ref, allow);
123,682✔
494
}
495

496
#if LJ_64
497
/* Don't fuse a 32 bit load into a 64 bit operation. */
498
static Reg asm_fuseloadm(ASMState *as, IRRef ref, RegSet allow, int is64)
138,641✔
499
{
500
  if (is64 && !irt_is64(IR(ref)->t))
138,641✔
501
    return ra_alloc1(as, ref, allow);
23✔
502
  return asm_fuseload(as, ref, allow);
138,618✔
503
}
504
#else
505
#define asm_fuseloadm(as, ref, allow, is64)  asm_fuseload(as, (ref), (allow))
506
#endif
507

508
/* -- Calls --------------------------------------------------------------- */
509

510
/* Count the required number of stack slots for a call. */
511
static int asm_count_call_slots(ASMState *as, const CCallInfo *ci, IRRef *args)
512
{
513
  uint32_t i, nargs = CCI_XNARGS(ci);
514
  int nslots = 0;
515
#if LJ_64
516
  if (LJ_ABI_WIN) {
517
    nslots = (int)(nargs*2);  /* Only matters for more than four args. */
518
  } else {
519
    int ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
520
    for (i = 0; i < nargs; i++)
521
      if (args[i] && irt_isfp(IR(args[i])->t)) {
522
        if (nfpr > 0) nfpr--; else nslots += 2;
523
      } else {
524
        if (ngpr > 0) ngpr--; else nslots += 2;
525
      }
526
  }
527
#else
528
  int ngpr = 0;
529
  if ((ci->flags & CCI_CC_MASK) == CCI_CC_FASTCALL)
530
    ngpr = 2;
531
  else if ((ci->flags & CCI_CC_MASK) == CCI_CC_THISCALL)
532
    ngpr = 1;
533
  for (i = 0; i < nargs; i++)
534
    if (args[i] && irt_isfp(IR(args[i])->t)) {
535
      nslots += irt_isnum(IR(args[i])->t) ? 2 : 1;
536
    } else {
537
      if (ngpr > 0) ngpr--; else nslots++;
538
    }
539
#endif
540
  return nslots;
541
}
542

543
/* Generate a call to a C function. */
544
static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
545
{
546
  uint32_t n, nargs = CCI_XNARGS(ci);
547
  int32_t ofs = STACKARG_OFS;
548
#if LJ_64
549
  uint32_t gprs = REGARG_GPRS;
550
  Reg fpr = REGARG_FIRSTFPR;
551
#if !LJ_ABI_WIN
552
  MCode *patchnfpr = NULL;
553
#endif
554
#else
555
  uint32_t gprs = 0;
556
  if ((ci->flags & CCI_CC_MASK) != CCI_CC_CDECL) {
557
    if ((ci->flags & CCI_CC_MASK) == CCI_CC_THISCALL)
558
      gprs = (REGARG_GPRS & 31);
559
    else if ((ci->flags & CCI_CC_MASK) == CCI_CC_FASTCALL)
560
      gprs = REGARG_GPRS;
561
  }
562
#endif
563
  if ((void *)ci->func)
564
    emit_call(as, ci->func);
565
#if LJ_64
566
  if ((ci->flags & CCI_VARARG)) {  /* Special handling for vararg calls. */
567
#if LJ_ABI_WIN
568
    for (n = 0; n < 4 && n < nargs; n++) {
569
      IRIns *ir = IR(args[n]);
570
      if (irt_isfp(ir->t))  /* Duplicate FPRs in GPRs. */
571
        emit_rr(as, XO_MOVDto, (irt_isnum(ir->t) ? REX_64 : 0) | (fpr+n),
572
                ((gprs >> (n*5)) & 31));  /* Either MOVD or MOVQ. */
573
    }
574
#else
575
    patchnfpr = --as->mcp;  /* Indicate number of used FPRs in register al. */
576
    *--as->mcp = XI_MOVrib | RID_EAX;
577
#endif
578
  }
579
#endif
580
  for (n = 0; n < nargs; n++) {  /* Setup args. */
581
    IRRef ref = args[n];
582
    IRIns *ir = IR(ref);
583
    Reg r;
584
#if LJ_64 && LJ_ABI_WIN
585
    /* Windows/x64 argument registers are strictly positional. */
586
    r = irt_isfp(ir->t) ? (fpr <= REGARG_LASTFPR ? fpr : 0) : (gprs & 31);
587
    fpr++; gprs >>= 5;
588
#elif LJ_64
589
    /* POSIX/x64 argument registers are used in order of appearance. */
590
    if (irt_isfp(ir->t)) {
591
      r = fpr <= REGARG_LASTFPR ? fpr++ : 0;
592
    } else {
593
      r = gprs & 31; gprs >>= 5;
594
    }
595
#else
596
    if (ref && irt_isfp(ir->t)) {
597
      r = 0;
598
    } else {
599
      r = gprs & 31; gprs >>= 5;
600
      if (!ref) continue;
601
    }
602
#endif
603
    if (r) {  /* Argument is in a register. */
604
      if (r < RID_MAX_GPR && ref < ASMREF_TMP1) {
605
#if LJ_64
606
        if (LJ_GC64 ? !(ir->o == IR_KINT || ir->o == IR_KNULL) : ir->o == IR_KINT64)
607
          emit_loadu64(as, r, ir_k64(ir)->u64);
608
        else
609
#endif
610
          emit_loadi(as, r, ir->i);
611
      } else {
612
        /* Must have been evicted. */
613
        lj_assertA(rset_test(as->freeset, r), "reg %d not free", r);
614
        if (ra_hasreg(ir->r)) {
615
          ra_noweak(as, ir->r);
616
          emit_movrr(as, ir, r, ir->r);
617
        } else {
618
          ra_allocref(as, ref, RID2RSET(r));
619
        }
620
      }
621
    } else if (irt_isfp(ir->t)) {  /* FP argument is on stack. */
622
      lj_assertA(!(irt_isfloat(ir->t) && irref_isk(ref)),
623
                 "unexpected float constant");
624
      if (LJ_32 && (ofs & 4) && irref_isk(ref)) {
625
        /* Split stores for unaligned FP consts. */
626
        emit_movmroi(as, RID_ESP, ofs, (int32_t)ir_knum(ir)->u32.lo);
627
        emit_movmroi(as, RID_ESP, ofs+4, (int32_t)ir_knum(ir)->u32.hi);
628
      } else {
629
        r = ra_alloc1(as, ref, RSET_FPR);
630
        emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSDto : XO_MOVSSto,
631
                  r, RID_ESP, ofs);
632
      }
633
      ofs += (LJ_32 && irt_isfloat(ir->t)) ? 4 : 8;
634
    } else {  /* Non-FP argument is on stack. */
635
      if (LJ_32 && ref < ASMREF_TMP1) {
636
        emit_movmroi(as, RID_ESP, ofs, ir->i);
637
      } else {
638
        r = ra_alloc1(as, ref, RSET_GPR);
639
        emit_movtomro(as, REX_64 + r, RID_ESP, ofs);
640
      }
641
      ofs += sizeof(intptr_t);
642
    }
643
    checkmclim(as);
644
  }
645
#if LJ_64 && !LJ_ABI_WIN
646
  if (patchnfpr) *patchnfpr = fpr - REGARG_FIRSTFPR;
647
#endif
648
}
649

650
/* Setup result reg/sp for call. Evict scratch regs. */
651
static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
652
{
653
  RegSet drop = RSET_SCRATCH;
654
  int hiop = (LJ_32 && (ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t));
655
  if ((ci->flags & CCI_NOFPRCLOBBER))
656
    drop &= ~RSET_FPR;
657
  if (ra_hasreg(ir->r))
658
    rset_clear(drop, ir->r);  /* Dest reg handled below. */
659
  if (hiop && ra_hasreg((ir+1)->r))
660
    rset_clear(drop, (ir+1)->r);  /* Dest reg handled below. */
661
  ra_evictset(as, drop);  /* Evictions must be performed first. */
662
  if (ra_used(ir)) {
663
    if (irt_isfp(ir->t)) {
664
      int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
665
#if LJ_64
666
      if ((ci->flags & CCI_CASTU64)) {
667
        Reg dest = ir->r;
668
        if (ra_hasreg(dest)) {
669
          ra_free(as, dest);
670
          ra_modified(as, dest);
671
          emit_rr(as, XO_MOVD, dest|REX_64, RID_RET);  /* Really MOVQ. */
672
        }
673
        if (ofs) emit_movtomro(as, RID_RET|REX_64, RID_ESP, ofs);
674
      } else {
675
        ra_destreg(as, ir, RID_FPRET);
676
      }
677
#else
678
      /* Number result is in x87 st0 for x86 calling convention. */
679
      Reg dest = ir->r;
680
      if (ra_hasreg(dest)) {
681
        ra_free(as, dest);
682
        ra_modified(as, dest);
683
        emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSD : XO_MOVSS,
684
                  dest, RID_ESP, ofs);
685
      }
686
      if ((ci->flags & CCI_CASTU64)) {
687
        emit_movtomro(as, RID_RETLO, RID_ESP, ofs);
688
        emit_movtomro(as, RID_RETHI, RID_ESP, ofs+4);
689
      } else {
690
        emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
691
                  irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
692
      }
693
#endif
694
#if LJ_32
695
    } else if (hiop) {
696
      ra_destpair(as, ir);
697
#endif
698
    } else {
699
      lj_assertA(!irt_ispri(ir->t), "PRI dest");
700
      ra_destreg(as, ir, RID_RET);
701
    }
702
  } else if (LJ_32 && irt_isfp(ir->t) && !(ci->flags & CCI_CASTU64)) {
703
    emit_x87op(as, XI_FPOP);  /* Pop unused result from x87 st0. */
704
  }
705
}
706

707
/* Return a constant function pointer or NULL for indirect calls. */
708
static void *asm_callx_func(ASMState *as, IRIns *irf, IRRef func)
32✔
709
{
710
#if LJ_32
711
  UNUSED(as);
712
  if (irref_isk(func))
713
    return (void *)irf->i;
714
#else
715
  if (irref_isk(func)) {
32✔
716
    MCode *p;
32✔
717
    if (irf->o == IR_KINT64)
32✔
718
      p = (MCode *)(void *)ir_k64(irf)->u64;
32✔
719
    else
720
      p = (MCode *)(void *)(uintptr_t)(uint32_t)irf->i;
×
721
    if (p - as->mcp == (int32_t)(p - as->mcp))
32✔
722
      return p;  /* Call target is still in +-2GB range. */
×
723
    /* Avoid the indirect case of emit_call(). Try to hoist func addr. */
724
  }
725
#endif
726
  return NULL;
727
}
728

729
static void asm_callx(ASMState *as, IRIns *ir)
32✔
730
{
731
  IRRef args[CCI_NARGS_MAX*2];
32✔
732
  CCallInfo ci;
32✔
733
  IRRef func;
32✔
734
  IRIns *irf;
32✔
735
  int32_t spadj = 0;
32✔
736
  ci.flags = asm_callx_flags(as, ir);
32✔
737
  asm_collectargs(as, ir, &ci, args);
32✔
738
  asm_setupresult(as, ir, &ci);
32✔
739
#if LJ_32
740
  /* Have to readjust stack after non-cdecl calls due to callee cleanup. */
741
  if ((ci.flags & CCI_CC_MASK) != CCI_CC_CDECL)
742
    spadj = 4 * asm_count_call_slots(as, &ci, args);
743
#endif
744
  func = ir->op2; irf = IR(func);
32✔
745
  if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }
32✔
746
  ci.func = (ASMFunction)asm_callx_func(as, irf, func);
32✔
747
  if (!(void *)ci.func) {
32✔
748
    /* Use a (hoistable) non-scratch register for indirect calls. */
749
    RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
32✔
750
    Reg r = ra_alloc1(as, func, allow);
32✔
751
    if (LJ_32) emit_spsub(as, spadj);  /* Above code may cause restores! */
32✔
752
    emit_rr(as, XO_GROUP5, XOg_CALL, r);
32✔
753
  } else if (LJ_32) {
754
    emit_spsub(as, spadj);
755
  }
756
  asm_gencall(as, &ci, args);
32✔
757
}
32✔
758

759
/* -- Returns ------------------------------------------------------------- */
760

761
/* Return to lower frame. Guard that it goes to the right spot. */
762
static void asm_retf(ASMState *as, IRIns *ir)
763
{
764
  Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
765
#if LJ_FR2
766
  Reg rpc = ra_scratch(as, rset_exclude(RSET_GPR, base));
767
#endif
768
  void *pc = ir_kptr(IR(ir->op2));
769
  int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1));
770
  as->topslot -= (BCReg)delta;
771
  if ((int32_t)as->topslot < 0) as->topslot = 0;
772
  irt_setmark(IR(REF_BASE)->t);  /* Children must not coalesce with BASE reg. */
773
  emit_setgl(as, base, jit_base);
774
  emit_addptr(as, base, -8*delta);
775
  asm_guardcc(as, CC_NE);
776
#if LJ_FR2
777
  emit_rmro(as, XO_CMP, rpc|REX_GC64, base, -8);
778
  emit_loadu64(as, rpc, u64ptr(pc));
779
#else
780
  emit_gmroi(as, XG_ARITHi(XOg_CMP), base, -4, ptr2addr(pc));
781
#endif
782
}
783

784
/* -- Type conversions ---------------------------------------------------- */
785

786
static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
311✔
787
{
788
  Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
311✔
789
  Reg dest = ra_dest(as, ir, RSET_GPR);
311✔
790
  asm_guardcc(as, CC_P);
311✔
791
  asm_guardcc(as, CC_NE);
311✔
792
  emit_rr(as, XO_UCOMISD, left, tmp);
311✔
793
  emit_rr(as, XO_CVTSI2SD, tmp, dest);
311✔
794
  emit_rr(as, XO_XORPS, tmp, tmp);  /* Avoid partial register stall. */
311✔
795
  emit_rr(as, XO_CVTTSD2SI, dest, left);
311✔
796
  /* Can't fuse since left is needed twice. */
797
}
311✔
798

799
static void asm_tobit(ASMState *as, IRIns *ir)
63✔
800
{
801
  Reg dest = ra_dest(as, ir, RSET_GPR);
63✔
802
  Reg tmp = ra_noreg(IR(ir->op1)->r) ?
126✔
803
              ra_alloc1(as, ir->op1, RSET_FPR) :
63✔
UNCOV
804
              ra_scratch(as, RSET_FPR);
×
805
  Reg right;
63✔
806
  emit_rr(as, XO_MOVDto, tmp, dest);
63✔
807
  right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp));
63✔
808
  emit_mrm(as, XO_ADDSD, tmp, right);
63✔
809
  ra_left(as, tmp, ir->op1);
63✔
810
}
63✔
811

812
static void asm_conv(ASMState *as, IRIns *ir)
2,159✔
813
{
814
  IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
2,159✔
815
  int st64 = (st == IRT_I64 || st == IRT_U64 || (LJ_64 && st == IRT_P64));
2,159✔
816
  int stfp = (st == IRT_NUM || st == IRT_FLOAT);
2,159✔
817
  IRRef lref = ir->op1;
2,159✔
818
  lj_assertA(irt_type(ir->t) != st, "inconsistent types for CONV");
2,159✔
819
  lj_assertA(!(LJ_32 && (irt_isint64(ir->t) || st64)),
2,159✔
820
             "IR %04d has unsplit 64 bit type",
821
             (int)(ir - as->ir) - REF_BIAS);
822
  if (irt_isfp(ir->t)) {
2,159✔
823
    Reg dest = ra_dest(as, ir, RSET_FPR);
1,551✔
824
    if (stfp) {  /* FP to FP conversion. */
1,551✔
825
      Reg left = asm_fuseload(as, lref, RSET_FPR);
35✔
826
      emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left);
61✔
827
      if (left == dest) return;  /* Avoid the XO_XORPS. */
35✔
828
    } else if (LJ_32 && st == IRT_U32) {  /* U32 to FP conversion on x86. */
1,516✔
829
      /* number = (2^52+2^51 .. u32) - (2^52+2^51) */
830
      cTValue *k = &as->J->k64[LJ_K64_TOBIT];
831
      Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest));
832
      if (irt_isfloat(ir->t))
833
        emit_rr(as, XO_CVTSD2SS, dest, dest);
834
      emit_rr(as, XO_SUBSD, dest, bias);  /* Subtract 2^52+2^51 bias. */
835
      emit_rr(as, XO_XORPS, dest, bias);  /* Merge bias and integer. */
836
      emit_rma(as, XO_MOVSD, bias, k);
837
      emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
838
      return;
839
    } else {  /* Integer to FP conversion. */
840
      Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ?
3,032✔
841
                 ra_alloc1(as, lref, RSET_GPR) :
1,516✔
842
                 asm_fuseloadm(as, lref, RSET_GPR, st64);
1,466✔
843
      if (LJ_64 && st == IRT_U64) {
1,516✔
844
        MCLabel l_end = emit_label(as);
38✔
845
        cTValue *k = &as->J->k64[LJ_K64_2P64];
38✔
846
        emit_rma(as, XO_ADDSD, dest, k);  /* Add 2^64 to compensate. */
38✔
847
        emit_sjcc(as, CC_NS, l_end);
38✔
848
        emit_rr(as, XO_TEST, left|REX_64, left);  /* Check if u64 >= 2^63. */
38✔
849
      }
850
      emit_mrm(as, irt_isnum(ir->t) ? XO_CVTSI2SD : XO_CVTSI2SS,
1,532✔
851
               dest|((LJ_64 && (st64 || st == IRT_U32)) ? REX_64 : 0), left);
1,516✔
852
    }
853
    emit_rr(as, XO_XORPS, dest, dest);  /* Avoid partial register stall. */
1,535✔
854
  } else if (stfp) {  /* FP to integer conversion. */
608✔
855
    if (irt_isguard(ir->t)) {
531✔
856
      /* Checked conversions are only supported from number to int. */
857
      lj_assertA(irt_isint(ir->t) && st == IRT_NUM,
113✔
858
                 "bad type for checked CONV");
859
      asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
113✔
860
    } else {
861
      Reg dest = ra_dest(as, ir, RSET_GPR);
418✔
862
      x86Op op = st == IRT_NUM ? XO_CVTTSD2SI : XO_CVTTSS2SI;
418✔
863
      if (LJ_64 ? irt_isu64(ir->t) : irt_isu32(ir->t)) {
418✔
864
        /* LJ_64: For inputs >= 2^63 add -2^64, convert again. */
865
        /* LJ_32: For inputs >= 2^31 add -2^31, convert again and add 2^31. */
866
        Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) :
78✔
867
                                          ra_scratch(as, RSET_FPR);
56✔
868
        MCLabel l_end = emit_label(as);
78✔
869
        if (LJ_32)
78✔
870
          emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000);
871
        emit_rr(as, op, dest|REX_64, tmp);
78✔
872
        if (st == IRT_NUM)
78✔
873
          emit_rma(as, XO_ADDSD, tmp, &as->J->k64[LJ_K64_M2P64_31]);
78✔
874
        else
875
          emit_rma(as, XO_ADDSS, tmp, &as->J->k32[LJ_K32_M2P64_31]);
×
876
        emit_sjcc(as, CC_NS, l_end);
78✔
877
        emit_rr(as, XO_TEST, dest|REX_64, dest);  /* Check if dest negative. */
78✔
878
        emit_rr(as, op, dest|REX_64, tmp);
78✔
879
        ra_left(as, tmp, lref);
78✔
880
      } else {
881
        if (LJ_64 && irt_isu32(ir->t))
340✔
882
          emit_rr(as, XO_MOV, dest, dest);  /* Zero hiword. */
×
883
        emit_mrm(as, op,
680✔
884
                 dest|((LJ_64 &&
283✔
885
                        (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0),
340✔
886
                 asm_fuseload(as, lref, RSET_FPR));
887
      }
888
    }
889
  } else if (st >= IRT_I8 && st <= IRT_U16) {  /* Extend to 32 bit integer. */
77✔
890
    Reg left, dest = ra_dest(as, ir, RSET_GPR);
12✔
891
    RegSet allow = RSET_GPR;
12✔
892
    x86Op op;
12✔
893
    lj_assertA(irt_isint(ir->t) || irt_isu32(ir->t), "bad type for CONV EXT");
12✔
894
    if (st == IRT_I8) {
12✔
895
      op = XO_MOVSXb; allow = RSET_GPR8; dest |= FORCE_REX;
6✔
896
    } else if (st == IRT_U8) {
6✔
897
      op = XO_MOVZXb; allow = RSET_GPR8; dest |= FORCE_REX;
6✔
898
    } else if (st == IRT_I16) {
×
899
      op = XO_MOVSXw;
900
    } else {
901
      op = XO_MOVZXw;
×
902
    }
903
    left = asm_fuseload(as, lref, allow);
12✔
904
    /* Add extra MOV if source is already in wrong register. */
905
    if (!LJ_64 && left != RID_MRM && !rset_test(allow, left)) {
12✔
906
      Reg tmp = ra_scratch(as, allow);
907
      emit_rr(as, op, dest, tmp);
908
      emit_rr(as, XO_MOV, tmp, left);
909
    } else {
910
      emit_mrm(as, op, dest, left);
12✔
911
    }
912
  } else {  /* 32/64 bit integer conversions. */
913
    if (LJ_32) {  /* Only need to handle 32/32 bit no-op (cast) on x86. */
65✔
914
      Reg dest = ra_dest(as, ir, RSET_GPR);
915
      ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
916
    } else if (irt_is64(ir->t)) {
65✔
917
      Reg dest = ra_dest(as, ir, RSET_GPR);
52✔
918
      if (st64 || !(ir->op2 & IRCONV_SEXT)) {
52✔
919
        /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
920
        ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
23✔
921
      } else {  /* 32 to 64 bit sign extension. */
922
        Reg left = asm_fuseload(as, lref, RSET_GPR);
29✔
923
        emit_mrm(as, XO_MOVSXd, dest|REX_64, left);
29✔
924
      }
925
    } else {
926
      Reg dest = ra_dest(as, ir, RSET_GPR);
13✔
927
      if (st64) {
13✔
928
        Reg left = asm_fuseload(as, lref, RSET_GPR);
9✔
929
        /* This is either a 32 bit reg/reg mov which zeroes the hiword
930
        ** or a load of the loword from a 64 bit address.
931
        */
932
        emit_mrm(as, XO_MOV, dest, left);
9✔
933
      } else {  /* 32/32 bit no-op (cast). */
934
        ra_left(as, dest, lref);  /* Do nothing, but may need to move regs. */
4✔
935
      }
936
    }
937
  }
938
}
939

940
#if LJ_32 && LJ_HASFFI
941
/* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */
942

943
/* 64 bit integer to FP conversion in 32 bit mode. */
944
static void asm_conv_fp_int64(ASMState *as, IRIns *ir)
945
{
946
  Reg hi = ra_alloc1(as, ir->op1, RSET_GPR);
947
  Reg lo = ra_alloc1(as, (ir-1)->op1, rset_exclude(RSET_GPR, hi));
948
  int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
949
  Reg dest = ir->r;
950
  if (ra_hasreg(dest)) {
951
    ra_free(as, dest);
952
    ra_modified(as, dest);
953
    emit_rmro(as, irt_isnum(ir->t) ? XO_MOVSD : XO_MOVSS, dest, RID_ESP, ofs);
954
  }
955
  emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd,
956
            irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs);
957
  if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) {
958
    /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */
959
    MCLabel l_end = emit_label(as);
960
    emit_rma(as, XO_FADDq, XOg_FADDq, &as->J->k64[LJ_K64_2P64]);
961
    emit_sjcc(as, CC_NS, l_end);
962
    emit_rr(as, XO_TEST, hi, hi);  /* Check if u64 >= 2^63. */
963
  } else {
964
    lj_assertA(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64, "bad type for CONV");
965
  }
966
  emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0);
967
  /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */
968
  emit_rmro(as, XO_MOVto, hi, RID_ESP, 4);
969
  emit_rmro(as, XO_MOVto, lo, RID_ESP, 0);
970
}
971

972
/* FP to 64 bit integer conversion in 32 bit mode. */
973
static void asm_conv_int64_fp(ASMState *as, IRIns *ir)
974
{
975
  IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK);
976
  IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH);
977
  Reg lo, hi;
978
  lj_assertA(st == IRT_NUM || st == IRT_FLOAT, "bad type for CONV");
979
  lj_assertA(dt == IRT_I64 || dt == IRT_U64, "bad type for CONV");
980
  hi = ra_dest(as, ir, RSET_GPR);
981
  lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi));
982
  if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0);
983
  /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */
984
  if (!(as->flags & JIT_F_SSE3)) {  /* Set FPU rounding mode to default. */
985
    emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 4);
986
    emit_rmro(as, XO_MOVto, lo, RID_ESP, 4);
987
    emit_gri(as, XG_ARITHi(XOg_AND), lo, 0xf3ff);
988
  }
989
  if (dt == IRT_U64) {
990
    /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */
991
    MCLabel l_pop, l_end = emit_label(as);
992
    emit_x87op(as, XI_FPOP);
993
    l_pop = emit_label(as);
994
    emit_sjmp(as, l_end);
995
    emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
996
    if ((as->flags & JIT_F_SSE3))
997
      emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
998
    else
999
      emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
1000
    emit_rma(as, XO_FADDq, XOg_FADDq, &as->J->k64[LJ_K64_M2P64]);
1001
    emit_sjcc(as, CC_NS, l_pop);
1002
    emit_rr(as, XO_TEST, hi, hi);  /* Check if out-of-range (2^63). */
1003
  }
1004
  emit_rmro(as, XO_MOV, hi, RID_ESP, 4);
1005
  if ((as->flags & JIT_F_SSE3)) {  /* Truncation is easy with SSE3. */
1006
    emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0);
1007
  } else {  /* Otherwise set FPU rounding mode to truncate before the store. */
1008
    emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0);
1009
    emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 0);
1010
    emit_rmro(as, XO_MOVtow, lo, RID_ESP, 0);
1011
    emit_rmro(as, XO_ARITHw(XOg_OR), lo, RID_ESP, 0);
1012
    emit_loadi(as, lo, 0xc00);
1013
    emit_rmro(as, XO_FNSTCW, XOg_FNSTCW, RID_ESP, 0);
1014
  }
1015
  if (dt == IRT_U64)
1016
    emit_x87op(as, XI_FDUP);
1017
  emit_mrm(as, st == IRT_NUM ? XO_FLDq : XO_FLDd,
1018
           st == IRT_NUM ? XOg_FLDq: XOg_FLDd,
1019
           asm_fuseload(as, ir->op1, RSET_EMPTY));
1020
}
1021

1022
static void asm_conv64(ASMState *as, IRIns *ir)
1023
{
1024
  if (irt_isfp(ir->t))
1025
    asm_conv_fp_int64(as, ir);
1026
  else
1027
    asm_conv_int64_fp(as, ir);
1028
}
1029
#endif
1030

1031
static void asm_strto(ASMState *as, IRIns *ir)
1032
{
1033
  /* Force a spill slot for the destination register (if any). */
1034
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
1035
  IRRef args[2];
1036
  RegSet drop = RSET_SCRATCH;
1037
  if ((drop & RSET_FPR) != RSET_FPR && ra_hasreg(ir->r))
1038
    rset_set(drop, ir->r);  /* WIN64 doesn't spill all FPRs. */
1039
  ra_evictset(as, drop);
1040
  asm_guardcc(as, CC_E);
1041
  emit_rr(as, XO_TEST, RID_RET, RID_RET);  /* Test return status. */
1042
  args[0] = ir->op1;      /* GCstr *str */
1043
  args[1] = ASMREF_TMP1;  /* TValue *n  */
1044
  asm_gencall(as, ci, args);
1045
  /* Store the result to the spill slot or temp slots. */
1046
  emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1)|REX_64,
1047
            RID_ESP, sps_scale(ir->s));
1048
}
1049

1050
/* -- Memory references --------------------------------------------------- */
1051

1052
/* Get pointer to TValue. */
1053
static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
1,235✔
1054
{
1055
  IRIns *ir = IR(ref);
1,235✔
1056
  if (irt_isnum(ir->t)) {
1,235✔
1057
    /* For numbers use the constant itself or a spill slot as a TValue. */
1058
    if (irref_isk(ref))
289✔
1059
      emit_loada(as, dest, ir_knum(ir));
27✔
1060
    else
1061
      emit_rmro(as, XO_LEA, dest|REX_64, RID_ESP, ra_spill(as, ir));
262✔
1062
  } else {
1063
    /* Otherwise use g->tmptv to hold the TValue. */
1064
#if LJ_GC64
1065
    if (irref_isk(ref)) {
946✔
1066
      TValue k;
933✔
1067
      lj_ir_kvalue(as->J->L, &k, ir);
933✔
1068
      emit_movmroi(as, dest, 4, k.u32.hi);
933✔
1069
      emit_movmroi(as, dest, 0, k.u32.lo);
933✔
1070
    } else {
1071
      /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
1072
      Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
13✔
1073
      if (irt_is64(ir->t)) {
13✔
1074
        emit_u32(as, irt_toitype(ir->t) << 15);
13✔
1075
        emit_rmro(as, XO_ARITHi, XOg_OR, dest, 4);
13✔
1076
      } else {
1077
        /* Currently, no caller passes integers that might end up here. */
1078
        emit_movmroi(as, dest, 4, (irt_toitype(ir->t) << 15));
×
1079
      }
1080
      emit_movtomro(as, REX_64IR(ir, src), dest, 0);
13✔
1081
    }
1082
#else
1083
    if (!irref_isk(ref)) {
1084
      Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest));
1085
      emit_movtomro(as, REX_64IR(ir, src), dest, 0);
1086
    } else if (!irt_ispri(ir->t)) {
1087
      emit_movmroi(as, dest, 0, ir->i);
1088
    }
1089
    if (!(LJ_64 && irt_islightud(ir->t)))
1090
      emit_movmroi(as, dest, 4, irt_toitype(ir->t));
1091
#endif
1092
    emit_loada(as, dest, &J2G(as->J)->tmptv);
946✔
1093
  }
1094
}
1,235✔
1095

1096
static void asm_aref(ASMState *as, IRIns *ir)
35✔
1097
{
1098
  Reg dest = ra_dest(as, ir, RSET_GPR);
35✔
1099
  asm_fusearef(as, ir, RSET_GPR);
35✔
1100
  if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0))
35✔
1101
    emit_mrm(as, XO_LEA, dest|REX_GC64, RID_MRM);
35✔
1102
  else if (as->mrm.base != dest)
×
1103
    emit_rr(as, XO_MOV, dest|REX_GC64, as->mrm.base);
×
1104
}
35✔
1105

1106
/* Inlined hash lookup. Specialized for key type and for const keys.
1107
** The equivalent C code is:
1108
**   Node *n = hashkey(t, key);
1109
**   do {
1110
**     if (lj_obj_equal(&n->key, key)) return &n->val;
1111
**   } while ((n = nextnode(n)));
1112
**   return niltv(L);
1113
*/
1114
static void asm_href(ASMState *as, IRIns *ir, IROp merge)
1,665✔
1115
{
1116
  RegSet allow = RSET_GPR;
1,665✔
1117
  int destused = ra_used(ir);
1,665✔
1118
  Reg dest = ra_dest(as, ir, allow);
1,665✔
1119
  Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
1,665✔
1120
  Reg key = RID_NONE, tmp = RID_NONE;
1,665✔
1121
  IRIns *irkey = IR(ir->op2);
1,665✔
1122
  int isk = irref_isk(ir->op2);
1,665✔
1123
  IRType1 kt = irkey->t;
1,665✔
1124
  uint32_t khash;
1,665✔
1125
  MCLabel l_end, l_loop, l_next;
1,665✔
1126

1127
  if (!isk) {
1,665✔
1128
    rset_clear(allow, tab);
155✔
1129
    key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
207✔
1130
    if (LJ_GC64 || !irt_isstr(kt))
155✔
1131
      tmp = ra_scratch(as, rset_exclude(allow, key));
155✔
1132
  }
1133

1134
  /* Key not found in chain: jump to exit (if merged) or load niltv. */
1135
  l_end = emit_label(as);
1,665✔
1136
  if (merge == IR_NE)
1,665✔
1137
    asm_guardcc(as, CC_E);  /* XI_JMP is not found by lj_asm_patchexit. */
19✔
1138
  else if (destused)
1,646✔
1139
    emit_loada(as, dest, niltvg(J2G(as->J)));
65✔
1140

1141
  /* Follow hash chain until the end. */
1142
  l_loop = emit_sjcc_label(as, CC_NZ);
1,665✔
1143
  emit_rr(as, XO_TEST, dest|REX_GC64, dest);
1,665✔
1144
  emit_rmro(as, XO_MOV, dest|REX_GC64, dest, offsetof(Node, next));
1,665✔
1145
  l_next = emit_label(as);
1,665✔
1146

1147
  /* Type and value comparison. */
1148
  if (merge == IR_EQ)
1,665✔
1149
    asm_guardcc(as, CC_E);
1,581✔
1150
  else
1151
    emit_sjcc(as, CC_E, l_end);
84✔
1152
  if (irt_isnum(kt)) {
1,665✔
1153
    if (isk) {
114✔
1154
      /* Assumes -0.0 is already canonicalized to +0.0. */
1155
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo),
11✔
1156
                 (int32_t)ir_knum(irkey)->u32.lo);
11✔
1157
      emit_sjcc(as, CC_NE, l_next);
11✔
1158
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi),
11✔
1159
                 (int32_t)ir_knum(irkey)->u32.hi);
11✔
1160
    } else {
1161
      emit_sjcc(as, CC_P, l_next);
103✔
1162
      emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n));
103✔
1163
      emit_sjcc(as, CC_AE, l_next);
103✔
1164
      /* The type check avoids NaN penalties and complaints from Valgrind. */
1165
#if LJ_64 && !LJ_GC64
1166
      emit_u32(as, LJ_TISNUM);
1167
      emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it));
1168
#else
1169
      emit_i8(as, LJ_TISNUM);
103✔
1170
      emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
103✔
1171
#endif
1172
    }
1173
#if LJ_64 && !LJ_GC64
1174
  } else if (irt_islightud(kt)) {
1175
    emit_rmro(as, XO_CMP, key|REX_64, dest, offsetof(Node, key.u64));
1176
#endif
1177
#if LJ_GC64
1178
  } else if (irt_isaddr(kt)) {
1,551✔
1179
    if (isk) {
1,551✔
1180
      TValue k;
1,499✔
1181
      k.u64 = ((uint64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64;
1,499✔
1182
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo),
1,499✔
1183
                 k.u32.lo);
1184
      emit_sjcc(as, CC_NE, l_next);
1,499✔
1185
      emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi),
1,499✔
1186
                 k.u32.hi);
1187
    } else {
1188
      emit_rmro(as, XO_CMP, tmp|REX_64, dest, offsetof(Node, key.u64));
52✔
1189
    }
1190
  } else {
1191
    lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type");
×
1192
    emit_u32(as, (irt_toitype(kt)<<15)|0x7fff);
×
1193
    emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it));
×
1194
#else
1195
  } else {
1196
    if (!irt_ispri(kt)) {
1197
      lj_assertA(irt_isaddr(kt), "bad HREF key type");
1198
      if (isk)
1199
        emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.gcr),
1200
                   ptr2addr(ir_kgc(irkey)));
1201
      else
1202
        emit_rmro(as, XO_CMP, key, dest, offsetof(Node, key.gcr));
1203
      emit_sjcc(as, CC_NE, l_next);
1204
    }
1205
    lj_assertA(!irt_isnil(kt), "bad HREF key type");
1206
    emit_i8(as, irt_toitype(kt));
1207
    emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it));
1208
#endif
1209
  }
1210
  emit_sfixup(as, l_loop);
1,665✔
1211
  checkmclim(as);
1,665✔
1212
#if LJ_GC64
1213
  if (!isk && irt_isaddr(kt)) {
1,665✔
1214
    emit_rr(as, XO_OR, tmp|REX_64, key);
52✔
1215
    emit_loadu64(as, tmp, (uint64_t)irt_toitype(kt) << 47);
52✔
1216
  }
1217
#endif
1218

1219
  /* Load main position relative to tab->node into dest. */
1220
  khash = isk ? ir_khash(as, irkey) : 1;
1,665✔
1221
  if (khash == 0) {
1,510✔
1222
    emit_rmro(as, XO_MOV, dest|REX_GC64, tab, offsetof(GCtab, node));
×
1223
  } else {
1224
    emit_rmro(as, XO_ARITH(XOg_ADD), dest|REX_GC64, tab, offsetof(GCtab,node));
1,665✔
1225
    if ((as->flags & JIT_F_PREFER_IMUL)) {
1,665✔
1226
      emit_i8(as, sizeof(Node));
×
1227
      emit_rr(as, XO_IMULi8, dest, dest);
×
1228
    } else {
1229
      emit_shifti(as, XOg_SHL, dest, 3);
1,665✔
1230
      emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0);
1,665✔
1231
    }
1232
    if (isk) {
1,665✔
1233
      emit_gri(as, XG_ARITHi(XOg_AND), dest, (int32_t)khash);
1,510✔
1234
      emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
1,510✔
1235
    } else if (irt_isstr(kt)) {
155✔
1236
      emit_rmro(as, XO_ARITH(XOg_AND), dest, key, offsetof(GCstr, hash));
41✔
1237
      emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
41✔
1238
    } else {  /* Must match with hashrot() in lj_tab.c. */
1239
      emit_rmro(as, XO_ARITH(XOg_AND), dest, tab, offsetof(GCtab, hmask));
114✔
1240
      emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp);
114✔
1241
      emit_shifti(as, XOg_ROL, tmp, HASH_ROT3);
114✔
1242
      emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp);
114✔
1243
      emit_shifti(as, XOg_ROL, dest, HASH_ROT2);
114✔
1244
      emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest);
114✔
1245
      emit_shifti(as, XOg_ROL, dest, HASH_ROT1);
114✔
1246
      emit_rr(as, XO_ARITH(XOg_XOR), tmp, dest);
114✔
1247
      if (irt_isnum(kt)) {
114✔
1248
        emit_rr(as, XO_ARITH(XOg_ADD), dest, dest);
103✔
1249
#if LJ_64
1250
        emit_shifti(as, XOg_SHR|REX_64, dest, 32);
103✔
1251
        emit_rr(as, XO_MOV, tmp, dest);
103✔
1252
        emit_rr(as, XO_MOVDto, key|REX_64, dest);
103✔
1253
#else
1254
        emit_rmro(as, XO_MOV, dest, RID_ESP, ra_spill(as, irkey)+4);
1255
        emit_rr(as, XO_MOVDto, key, tmp);
1256
#endif
1257
      } else {
1258
        emit_rr(as, XO_MOV, tmp, key);
11✔
1259
#if LJ_GC64
1260
        checkmclim(as);
11✔
1261
        emit_gri(as, XG_ARITHi(XOg_XOR), dest, irt_toitype(kt) << 15);
11✔
1262
        if ((as->flags & JIT_F_BMI2)) {
11✔
1263
          emit_i8(as, 32);
11✔
1264
          emit_mrm(as, XV_RORX|VEX_64, dest, key);
11✔
1265
        } else {
1266
          emit_shifti(as, XOg_SHR|REX_64, dest, 32);
×
1267
          emit_rr(as, XO_MOV, dest|REX_64, key|REX_64);
×
1268
        }
1269
#else
1270
        emit_rmro(as, XO_LEA, dest, key, HASH_BIAS);
1271
#endif
1272
      }
1273
    }
1274
  }
1275
}
1,665✔
1276

1277
static void asm_hrefk(ASMState *as, IRIns *ir)
86,611✔
1278
{
1279
  IRIns *kslot = IR(ir->op2);
86,611✔
1280
  IRIns *irkey = IR(kslot->op1);
86,611✔
1281
  int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
86,611✔
1282
  Reg dest = ra_used(ir) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
86,611✔
1283
  Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
86,611✔
1284
#if !LJ_64
1285
  MCLabel l_exit;
1286
#endif
1287
  lj_assertA(ofs % sizeof(Node) == 0, "unaligned HREFK slot");
86,611✔
1288
  if (ra_hasreg(dest)) {
86,611✔
1289
    if (ofs != 0) {
73,941✔
1290
      if (dest == node && !(as->flags & JIT_F_LEA_AGU))
71,018✔
1291
        emit_gri(as, XG_ARITHi(XOg_ADD), dest|REX_GC64, ofs);
70,968✔
1292
      else
1293
        emit_rmro(as, XO_LEA, dest|REX_GC64, node, ofs);
50✔
1294
    } else if (dest != node) {
2,923✔
1295
      emit_rr(as, XO_MOV, dest|REX_GC64, node);
3✔
1296
    }
1297
  }
1298
  asm_guardcc(as, CC_NE);
86,611✔
1299
#if LJ_64
1300
  if (!irt_ispri(irkey->t)) {
86,611✔
1301
    Reg key = ra_scratch(as, rset_exclude(RSET_GPR, node));
86,611✔
1302
    emit_rmro(as, XO_CMP, key|REX_64, node,
86,611✔
1303
               ofs + (int32_t)offsetof(Node, key.u64));
1304
    lj_assertA(irt_isnum(irkey->t) || irt_isgcv(irkey->t),
86,611✔
1305
               "bad HREFK key type");
1306
    /* Assumes -0.0 is already canonicalized to +0.0. */
1307
    emit_loadu64(as, key, irt_isnum(irkey->t) ? ir_knum(irkey)->u64 :
173,201✔
1308
#if LJ_GC64
1309
                          ((uint64_t)irt_toitype(irkey->t) << 47) |
86,590✔
1310
                          (uint64_t)ir_kgc(irkey));
86,590✔
1311
#else
1312
                          ((uint64_t)irt_toitype(irkey->t) << 32) |
1313
                          (uint64_t)(uint32_t)ptr2addr(ir_kgc(irkey)));
1314
#endif
1315
  } else {
1316
    lj_assertA(!irt_isnil(irkey->t), "bad HREFK key type");
×
1317
#if LJ_GC64
1318
    emit_i32(as, (irt_toitype(irkey->t)<<15)|0x7fff);
×
1319
    emit_rmro(as, XO_ARITHi, XOg_CMP, node,
×
1320
              ofs + (int32_t)offsetof(Node, key.it));
1321
#else
1322
    emit_i8(as, irt_toitype(irkey->t));
1323
    emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
1324
              ofs + (int32_t)offsetof(Node, key.it));
1325
#endif
1326
  }
1327
#else
1328
  l_exit = emit_label(as);
1329
  if (irt_isnum(irkey->t)) {
1330
    /* Assumes -0.0 is already canonicalized to +0.0. */
1331
    emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
1332
               ofs + (int32_t)offsetof(Node, key.u32.lo),
1333
               (int32_t)ir_knum(irkey)->u32.lo);
1334
    emit_sjcc(as, CC_NE, l_exit);
1335
    emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
1336
               ofs + (int32_t)offsetof(Node, key.u32.hi),
1337
               (int32_t)ir_knum(irkey)->u32.hi);
1338
  } else {
1339
    if (!irt_ispri(irkey->t)) {
1340
      lj_assertA(irt_isgcv(irkey->t), "bad HREFK key type");
1341
      emit_gmroi(as, XG_ARITHi(XOg_CMP), node,
1342
                 ofs + (int32_t)offsetof(Node, key.gcr),
1343
                 ptr2addr(ir_kgc(irkey)));
1344
      emit_sjcc(as, CC_NE, l_exit);
1345
    }
1346
    lj_assertA(!irt_isnil(irkey->t), "bad HREFK key type");
1347
    emit_i8(as, irt_toitype(irkey->t));
1348
    emit_rmro(as, XO_ARITHi8, XOg_CMP, node,
1349
              ofs + (int32_t)offsetof(Node, key.it));
1350
  }
1351
#endif
1352
}
86,611✔
1353

1354
static void asm_uref(ASMState *as, IRIns *ir)
506✔
1355
{
1356
  Reg dest = ra_dest(as, ir, RSET_GPR);
506✔
1357
  if (irref_isk(ir->op1)) {
506✔
1358
    GCfunc *fn = ir_kfunc(IR(ir->op1));
253✔
1359
    MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
253✔
1360
    emit_rma(as, XO_MOV, dest|REX_GC64, v);
253✔
1361
  } else {
1362
    Reg uv = ra_scratch(as, RSET_GPR);
253✔
1363
    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
253✔
1364
    if (ir->o == IR_UREFC) {
253✔
1365
      emit_rmro(as, XO_LEA, dest|REX_GC64, uv, offsetof(GCupval, tv));
184✔
1366
      asm_guardcc(as, CC_NE);
184✔
1367
      emit_i8(as, 1);
184✔
1368
      emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
184✔
1369
    } else {
1370
      emit_rmro(as, XO_MOV, dest|REX_GC64, uv, offsetof(GCupval, v));
69✔
1371
    }
1372
    emit_rmro(as, XO_MOV, uv|REX_GC64, func,
253✔
1373
              (int32_t)offsetof(GCfuncL, uvptr) +
253✔
1374
              (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
253✔
1375
  }
1376
}
506✔
1377

1378
static void asm_fref(ASMState *as, IRIns *ir)
×
1379
{
1380
  Reg dest = ra_dest(as, ir, RSET_GPR);
×
1381
  asm_fusefref(as, ir, RSET_GPR);
×
1382
  emit_mrm(as, XO_LEA, dest, RID_MRM);
×
1383
}
×
1384

1385
static void asm_strref(ASMState *as, IRIns *ir)
340✔
1386
{
1387
  Reg dest = ra_dest(as, ir, RSET_GPR);
340✔
1388
  asm_fusestrref(as, ir, RSET_GPR);
340✔
1389
  if (as->mrm.base == RID_NONE)
340✔
1390
    emit_loadi(as, dest, as->mrm.ofs);
×
1391
  else if (as->mrm.base == dest && as->mrm.idx == RID_NONE)
340✔
1392
    emit_gri(as, XG_ARITHi(XOg_ADD), dest|REX_GC64, as->mrm.ofs);
15✔
1393
  else
1394
    emit_mrm(as, XO_LEA, dest|REX_GC64, RID_MRM);
325✔
1395
}
340✔
1396

1397
/* -- Loads and stores ---------------------------------------------------- */
1398

1399
static void asm_fxload(ASMState *as, IRIns *ir)
197,745✔
1400
{
1401
  Reg dest = ra_dest(as, ir, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
395,414✔
1402
  x86Op xo;
197,745✔
1403
  if (ir->o == IR_FLOAD)
197,745✔
1404
    asm_fusefref(as, ir, RSET_GPR);
197,258✔
1405
  else
1406
    asm_fusexref(as, ir->op1, RSET_GPR);
487✔
1407
  /* ir->op2 is ignored -- unaligned loads are ok on x86. */
1408
  switch (irt_type(ir->t)) {
197,745✔
1409
  case IRT_I8: xo = XO_MOVSXb; break;
1410
  case IRT_U8: xo = XO_MOVZXb; break;
261✔
1411
  case IRT_I16: xo = XO_MOVSXw; break;
4✔
1412
  case IRT_U16: xo = XO_MOVZXw; break;
156✔
1413
  case IRT_NUM: xo = XO_MOVSD; break;
69✔
1414
  case IRT_FLOAT: xo = XO_MOVSS; break;
7✔
1415
  default:
197,229✔
1416
    if (LJ_64 && irt_is64(ir->t))
197,229✔
1417
      dest |= REX_64;
122,504✔
1418
    else
1419
      lj_assertA(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t),
1420
                 "unsplit 64 bit load");
1421
    xo = XO_MOV;
1422
    break;
1423
  }
1424
  emit_mrm(as, xo, dest, RID_MRM);
197,745✔
1425
}
197,745✔
1426

1427
#define asm_fload(as, ir)        asm_fxload(as, ir)
1428
#define asm_xload(as, ir)        asm_fxload(as, ir)
1429

1430
static void asm_fxstore(ASMState *as, IRIns *ir)
692✔
1431
{
1432
  RegSet allow = RSET_GPR;
692✔
1433
  Reg src = RID_NONE, osrc = RID_NONE;
692✔
1434
  int32_t k = 0;
692✔
1435
  if (ir->r == RID_SINK)
692✔
1436
    return;
692✔
1437
  /* The IRT_I16/IRT_U16 stores should never be simplified for constant
1438
  ** values since mov word [mem], imm16 has a length-changing prefix.
1439
  */
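  /* (The 0x66 operand-size prefix together with a 16 bit immediate forms a
  ** length-changing prefix, which stalls the pre-decoder on many x86 cores,
  ** so such stores always go through a register below.)
  */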
1440
  if (irt_isi16(ir->t) || irt_isu16(ir->t) || irt_isfp(ir->t) ||
467✔
1441
      !asm_isk32(as, ir->op2, &k)) {
397✔
1442
    RegSet allow8 = irt_isfp(ir->t) ? RSET_FPR :
395✔
1443
                    (irt_isi8(ir->t) || irt_isu8(ir->t)) ? RSET_GPR8 : RSET_GPR;
1444
    src = osrc = ra_alloc1(as, ir->op2, allow8);
395✔
1445
    if (!LJ_64 && !rset_test(allow8, src)) {  /* Already in wrong register. */
395✔
1446
      rset_clear(allow, osrc);
1447
      src = ra_scratch(as, allow8);
1448
    }
1449
    rset_clear(allow, src);
395✔
1450
  }
1451
  if (ir->o == IR_FSTORE) {
467✔
1452
    asm_fusefref(as, IR(ir->op1), allow);
55✔
1453
  } else {
1454
    asm_fusexref(as, ir->op1, allow);
412✔
1455
    if (LJ_32 && ir->o == IR_HIOP) as->mrm.ofs += 4;
412✔
1456
  }
1457
  if (ra_hasreg(src)) {
467✔
1458
    x86Op xo;
395✔
1459
    switch (irt_type(ir->t)) {
395✔
1460
    case IRT_I8: case IRT_U8: xo = XO_MOVtob; src |= FORCE_REX; break;
48✔
1461
    case IRT_I16: case IRT_U16: xo = XO_MOVtow; break;
1462
    case IRT_NUM: xo = XO_MOVSDto; break;
65✔
1463
    case IRT_FLOAT: xo = XO_MOVSSto; break;
19✔
1464
#if LJ_64 && !LJ_GC64
1465
    case IRT_LIGHTUD:
1466
      /* NYI: mask 64 bit lightuserdata. */
1467
      lj_assertA(0, "store of lightuserdata");
1468
#endif
1469
    default:
221✔
1470
      if (LJ_64 && irt_is64(ir->t))
221✔
1471
        src |= REX_64;
90✔
1472
      else
1473
        lj_assertA(irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t),
1474
                   "unsplit 64 bit store");
1475
      xo = XO_MOVto;
1476
      break;
1477
    }
1478
    emit_mrm(as, xo, src, RID_MRM);
395✔
1479
    if (!LJ_64 && src != osrc) {
395✔
1480
      ra_noweak(as, osrc);
1481
      emit_rr(as, XO_MOV, src, osrc);
1482
    }
1483
  } else {
1484
    if (irt_isi8(ir->t) || irt_isu8(ir->t)) {
72✔
1485
      emit_i8(as, k);
33✔
1486
      emit_mrm(as, XO_MOVmib, 0, RID_MRM);
33✔
1487
    } else {
1488
      lj_assertA(irt_is64(ir->t) || irt_isint(ir->t) || irt_isu32(ir->t) ||
39✔
1489
                 irt_isaddr(ir->t), "bad store type");
1490
      emit_i32(as, k);
39✔
1491
      emit_mrm(as, XO_MOVmi, REX_64IR(ir, 0), RID_MRM);
62✔
1492
    }
1493
  }
1494
}
1495

1496
#define asm_fstore(as, ir)        asm_fxstore(as, ir)
1497
#define asm_xstore(as, ir)        asm_fxstore(as, ir)
1498

1499
#if LJ_64 && !LJ_GC64
1500
static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck)
1501
{
1502
  if (ra_used(ir) || typecheck) {
1503
    Reg dest = ra_dest(as, ir, RSET_GPR);
1504
    if (typecheck) {
1505
      Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, dest));
1506
      asm_guardcc(as, CC_NE);
1507
      emit_i8(as, -2);
1508
      emit_rr(as, XO_ARITHi8, XOg_CMP, tmp);
1509
      emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
1510
      emit_rr(as, XO_MOV, tmp|REX_64, dest);
1511
    }
1512
    return dest;
1513
  } else {
1514
    return RID_NONE;
1515
  }
1516
}
1517
#endif
1518

1519
static void asm_ahuvload(ASMState *as, IRIns *ir)
49,444✔
1520
{
1521
#if LJ_GC64
1522
  Reg tmp = RID_NONE;
49,444✔
1523
#endif
1524
  lj_assertA(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
49,444✔
1525
             (LJ_DUALNUM && irt_isint(ir->t)),
1526
             "bad load type %d", irt_type(ir->t));
1527
#if LJ_64 && !LJ_GC64
1528
  if (irt_islightud(ir->t)) {
1529
    Reg dest = asm_load_lightud64(as, ir, 1);
1530
    if (ra_hasreg(dest)) {
1531
      asm_fuseahuref(as, ir->op1, RSET_GPR);
1532
      emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
1533
    }
1534
    return;
1535
  } else
1536
#endif
1537
  if (ra_used(ir)) {
89,455✔
1538
    RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
46,180✔
1539
    Reg dest = ra_dest(as, ir, allow);
46,180✔
1540
    asm_fuseahuref(as, ir->op1, RSET_GPR);
46,180✔
1541
#if LJ_GC64
1542
    if (irt_isaddr(ir->t)) {
46,180✔
1543
      emit_shifti(as, XOg_SHR|REX_64, dest, 17);
6,169✔
1544
      asm_guardcc(as, CC_NE);
6,169✔
1545
      emit_i8(as, irt_toitype(ir->t));
6,169✔
1546
      emit_rr(as, XO_ARITHi8, XOg_CMP, dest);
6,169✔
1547
      emit_i8(as, XI_O16);
6,169✔
1548
      if ((as->flags & JIT_F_BMI2)) {
6,169✔
1549
        emit_i8(as, 47);
6,169✔
1550
        emit_mrm(as, XV_RORX|VEX_64, dest, RID_MRM);
6,169✔
1551
      } else {
1552
        emit_shifti(as, XOg_ROR|REX_64, dest, 47);
×
1553
        emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
×
1554
      }
1555
      return;
6,169✔
1556
    } else
1557
#endif
1558
    emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XO_MOVSD, dest, RID_MRM);
80,022✔
1559
  } else {
1560
    RegSet gpr = RSET_GPR;
3,264✔
1561
#if LJ_GC64
1562
    if (irt_isaddr(ir->t)) {
3,264✔
1563
      tmp = ra_scratch(as, RSET_GPR);
15✔
1564
      gpr = rset_exclude(gpr, tmp);
15✔
1565
    }
1566
#endif
1567
    asm_fuseahuref(as, ir->op1, gpr);
3,264✔
1568
  }
1569
  /* Always do the type check, even if the load result is unused. */
1570
  as->mrm.ofs += 4;
43,275✔
1571
  asm_guardcc(as, irt_isnum(ir->t) ? CC_AE : CC_NE);
43,696✔
1572
  if (LJ_64 && irt_type(ir->t) >= IRT_NUM) {
43,275✔
1573
    lj_assertA(irt_isinteger(ir->t) || irt_isnum(ir->t),
42,854✔
1574
               "bad load type %d", irt_type(ir->t));
1575
#if LJ_GC64
1576
    emit_u32(as, LJ_TISNUM << 15);
42,854✔
1577
#else
1578
    emit_u32(as, LJ_TISNUM);
1579
#endif
1580
    emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM);
42,854✔
1581
#if LJ_GC64
1582
  } else if (irt_isaddr(ir->t)) {
421✔
1583
    as->mrm.ofs -= 4;
15✔
1584
    emit_i8(as, irt_toitype(ir->t));
15✔
1585
    emit_mrm(as, XO_ARITHi8, XOg_CMP, tmp);
15✔
1586
    emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
15✔
1587
    emit_mrm(as, XO_MOV, tmp|REX_64, RID_MRM);
15✔
1588
  } else if (irt_isnil(ir->t)) {
406✔
1589
    as->mrm.ofs -= 4;
342✔
1590
    emit_i8(as, -1);
342✔
1591
    emit_mrm(as, XO_ARITHi8, XOg_CMP|REX_64, RID_MRM);
342✔
1592
  } else {
1593
    emit_u32(as, (irt_toitype(ir->t) << 15) | 0x7fff);
64✔
1594
    emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM);
64✔
1595
#else
1596
  } else {
1597
    emit_i8(as, irt_toitype(ir->t));
1598
    emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM);
1599
#endif
1600
  }
1601
}
1602

1603
static void asm_ahustore(ASMState *as, IRIns *ir)
44,013✔
1604
{
1605
  if (ir->r == RID_SINK)
44,013✔
1606
    return;
1607
  if (irt_isnum(ir->t)) {
43,797✔
1608
    Reg src = ra_alloc1(as, ir->op2, RSET_FPR);
40,911✔
1609
    asm_fuseahuref(as, ir->op1, RSET_GPR);
40,911✔
1610
    emit_mrm(as, XO_MOVSDto, src, RID_MRM);
40,911✔
1611
#if LJ_64 && !LJ_GC64
1612
  } else if (irt_islightud(ir->t)) {
1613
    Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
1614
    asm_fuseahuref(as, ir->op1, rset_exclude(RSET_GPR, src));
1615
    emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM);
1616
#endif
1617
#if LJ_GC64
1618
  } else if (irref_isk(ir->op2)) {
2,886✔
1619
    TValue k;
421✔
1620
    lj_ir_kvalue(as->J->L, &k, IR(ir->op2));
421✔
1621
    asm_fuseahuref(as, ir->op1, RSET_GPR);
421✔
1622
    if (tvisnil(&k)) {
421✔
1623
      emit_i32(as, -1);
359✔
1624
      emit_mrm(as, XO_MOVmi, REX_64, RID_MRM);
359✔
1625
    } else {
1626
      emit_u32(as, k.u32.lo);
62✔
1627
      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
62✔
1628
      as->mrm.ofs += 4;
62✔
1629
      emit_u32(as, k.u32.hi);
62✔
1630
      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
62✔
1631
    }
1632
#endif
1633
  } else {
1634
    IRIns *irr = IR(ir->op2);
2,465✔
1635
    RegSet allow = RSET_GPR;
2,465✔
1636
    Reg src = RID_NONE;
2,465✔
1637
    if (!irref_isk(ir->op2)) {
2,465✔
1638
      src = ra_alloc1(as, ir->op2, allow);
2,465✔
1639
      rset_clear(allow, src);
2,465✔
1640
    }
1641
    asm_fuseahuref(as, ir->op1, allow);
2,465✔
1642
    if (ra_hasreg(src)) {
2,465✔
1643
#if LJ_GC64
1644
      if (!(LJ_DUALNUM && irt_isinteger(ir->t))) {
2,465✔
1645
        /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
1646
        as->mrm.ofs += 4;
2,465✔
1647
        emit_u32(as, irt_toitype(ir->t) << 15);
2,465✔
1648
        emit_mrm(as, XO_ARITHi, XOg_OR, RID_MRM);
2,465✔
1649
        as->mrm.ofs -= 4;
2,465✔
1650
        emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM);
2,465✔
1651
        return;
2,465✔
1652
      }
1653
#endif
1654
      emit_mrm(as, XO_MOVto, src, RID_MRM);
1655
    } else if (!irt_ispri(irr->t)) {
×
1656
      lj_assertA(irt_isaddr(ir->t) || (LJ_DUALNUM && irt_isinteger(ir->t)),
×
1657
                 "bad store type");
1658
      emit_i32(as, irr->i);
×
1659
      emit_mrm(as, XO_MOVmi, 0, RID_MRM);
×
1660
    }
1661
    as->mrm.ofs += 4;
×
1662
#if LJ_GC64
1663
    lj_assertA(LJ_DUALNUM && irt_isinteger(ir->t), "bad store type");
×
1664
    emit_i32(as, LJ_TNUMX << 15);
×
1665
#else
1666
    emit_i32(as, (int32_t)irt_toitype(ir->t));
1667
#endif
1668
    emit_mrm(as, XO_MOVmi, 0, RID_MRM);
×
1669
  }
1670
}
1671

1672
static void asm_sload(ASMState *as, IRIns *ir)
16,590✔
1673
{
1674
  int32_t ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
16,590✔
1675
                (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0);
1676
  IRType1 t = ir->t;
16,590✔
1677
  Reg base;
16,590✔
1678
  lj_assertA(!(ir->op2 & IRSLOAD_PARENT),
16,590✔
1679
             "bad parent SLOAD"); /* Handled by asm_head_side(). */
1680
  lj_assertA(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK),
16,590✔
1681
             "inconsistent SLOAD variant");
1682
  lj_assertA(LJ_DUALNUM ||
16,590✔
1683
             !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME)),
1684
             "bad SLOAD type");
1685
  if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
16,788✔
1686
    Reg left = ra_scratch(as, RSET_FPR);
198✔
1687
    asm_tointg(as, ir, left);  /* Frees dest reg. Do this before base alloc. */
198✔
1688
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
198✔
1689
    emit_rmro(as, XO_MOVSD, left, base, ofs);
198✔
1690
    t.irt = IRT_NUM;  /* Continue with a regular number type check. */
198✔
1691
#if LJ_64 && !LJ_GC64
1692
  } else if (irt_islightud(t)) {
1693
    Reg dest = asm_load_lightud64(as, ir, (ir->op2 & IRSLOAD_TYPECHECK));
1694
    if (ra_hasreg(dest)) {
1695
      base = ra_alloc1(as, REF_BASE, RSET_GPR);
1696
      emit_rmro(as, XO_MOV, dest|REX_64, base, ofs);
1697
    }
1698
    return;
1699
#endif
1700
  } else if (ra_used(ir)) {
16,392✔
1701
    RegSet allow = irt_isnum(t) ? RSET_FPR : RSET_GPR;
14,842✔
1702
    Reg dest = ra_dest(as, ir, allow);
14,842✔
1703
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
14,842✔
1704
    lj_assertA(irt_isnum(t) || irt_isint(t) || irt_isaddr(t),
14,842✔
1705
               "bad SLOAD type %d", irt_type(t));
1706
    if ((ir->op2 & IRSLOAD_CONVERT)) {
14,842✔
1707
      t.irt = irt_isint(t) ? IRT_NUM : IRT_INT;  /* Check for original type. */
1,261✔
1708
      emit_rmro(as, irt_isint(t) ? XO_CVTSI2SD : XO_CVTTSD2SI, dest, base, ofs);
2,522✔
1709
    } else {
1710
#if LJ_GC64
1711
      if (irt_isaddr(t)) {
13,581✔
1712
        /* LJ_GC64 type check + tag removal without BMI2 and with BMI2:
1713
        **
1714
        **  mov r64, [addr]    rorx r64, [addr], 47
1715
        **  ror r64, 47
1716
        **  cmp r16, itype     cmp r16, itype
1717
        **  jne ->exit         jne ->exit
1718
        **  shr r64, 16        shr r64, 16
1719
        */
1720
        emit_shifti(as, XOg_SHR|REX_64, dest, 17);
10,506✔
1721
        if ((ir->op2 & IRSLOAD_TYPECHECK)) {
10,506✔
1722
          asm_guardcc(as, CC_NE);
7,776✔
1723
          emit_i8(as, irt_toitype(t));
7,776✔
1724
          emit_rr(as, XO_ARITHi8, XOg_CMP, dest);
7,776✔
1725
          emit_i8(as, XI_O16);
7,776✔
1726
        }
1727
        if ((as->flags & JIT_F_BMI2)) {
10,506✔
1728
          emit_i8(as, 47);
10,506✔
1729
          emit_rmro(as, XV_RORX|VEX_64, dest, base, ofs);
10,506✔
1730
        } else {
1731
          if ((ir->op2 & IRSLOAD_TYPECHECK))
×
1732
            emit_shifti(as, XOg_ROR|REX_64, dest, 47);
×
1733
          else
1734
            emit_shifti(as, XOg_SHL|REX_64, dest, 17);
×
1735
          emit_rmro(as, XO_MOV, dest|REX_64, base, ofs);
×
1736
        }
1737
        return;
10,506✔
1738
      } else
1739
#endif
1740
      emit_rmro(as, irt_isnum(t) ? XO_MOVSD : XO_MOV, dest, base, ofs);
3,086✔
1741
    }
1742
  } else {
1743
    if (!(ir->op2 & IRSLOAD_TYPECHECK))
1,550✔
1744
      return;  /* No type check: avoid base alloc. */
1745
    base = ra_alloc1(as, REF_BASE, RSET_GPR);
1,547✔
1746
  }
1747
  if ((ir->op2 & IRSLOAD_TYPECHECK)) {
6,081✔
1748
    /* Need type check, even if the load result is unused. */
1749
    asm_guardcc(as, irt_isnum(t) ? CC_AE : CC_NE);
3,957✔
1750
    if (LJ_64 && irt_type(t) >= IRT_NUM) {
3,778✔
1751
      lj_assertA(irt_isinteger(t) || irt_isnum(t),
3,599✔
1752
                 "bad SLOAD type %d", irt_type(t));
1753
#if LJ_GC64
1754
      emit_u32(as, LJ_TISNUM << 15);
3,599✔
1755
#else
1756
      emit_u32(as, LJ_TISNUM);
1757
#endif
1758
      emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4);
3,599✔
1759
#if LJ_GC64
1760
    } else if (irt_isnil(t)) {
179✔
1761
      /* LJ_GC64 type check for nil:
1762
      **
1763
      **   cmp qword [addr], -1
1764
      **   jne ->exit
1765
      */
1766
      emit_i8(as, -1);
70✔
1767
      emit_rmro(as, XO_ARITHi8, XOg_CMP|REX_64, base, ofs);
70✔
1768
    } else if (irt_ispri(t)) {
109✔
1769
      emit_u32(as, (irt_toitype(t) << 15) | 0x7fff);
97✔
1770
      emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4);
97✔
1771
    } else {
1772
      /* LJ_GC64 type check only:
1773
      **
1774
      **   mov r64, [addr]
1775
      **   sar r64, 47
1776
      **   cmp r32, itype
1777
      **   jne ->exit
1778
      */
1779
      Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, base));
12✔
1780
      emit_i8(as, irt_toitype(t));
12✔
1781
      emit_rr(as, XO_ARITHi8, XOg_CMP, tmp);
12✔
1782
      emit_shifti(as, XOg_SAR|REX_64, tmp, 47);
12✔
1783
      emit_rmro(as, XO_MOV, tmp|REX_64, base, ofs);
12✔
1784
#else
1785
    } else {
1786
      emit_i8(as, irt_toitype(t));
1787
      emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4);
1788
#endif
1789
    }
1790
  }
1791
}
1792

1793
/* -- Allocations --------------------------------------------------------- */
1794

1795
#if LJ_HASFFI
1796
static void asm_cnew(ASMState *as, IRIns *ir)
1,661✔
1797
{
1798
  CTState *cts = ctype_ctsG(J2G(as->J));
1,661✔
1799
  CTypeID id = (CTypeID)IR(ir->op1)->i;
1,661✔
1800
  CTSize sz;
1,661✔
1801
  CTInfo info = lj_ctype_info(cts, id, &sz);
1,661✔
1802
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
1,661✔
1803
  IRRef args[4];
1,661✔
1804
  lj_assertA(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL),
1,661✔
1805
             "bad CNEW/CNEWI operands");
1806

1807
  as->gcsteps++;
1,661✔
1808
  asm_setupresult(as, ir, ci);  /* GCcdata * */
1,661✔
1809

1810
  /* Initialize immutable cdata object. */
1811
  if (ir->o == IR_CNEWI) {
1,661✔
1812
    RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
1,639✔
1813
#if LJ_64
1814
    Reg r64 = sz == 8 ? REX_64 : 0;
1,639✔
1815
    if (irref_isk(ir->op2)) {
1,639✔
1816
      IRIns *irk = IR(ir->op2);
156✔
1817
      uint64_t k = (irk->o == IR_KINT64 ||
312✔
1818
                    (LJ_GC64 && (irk->o == IR_KPTR || irk->o == IR_KKPTR))) ?
1819
                   ir_k64(irk)->u64 : (uint64_t)(uint32_t)irk->i;
156✔
1820
      if (sz == 4 || checki32((int64_t)k)) {
156✔
1821
        emit_i32(as, (int32_t)k);
156✔
1822
        emit_rmro(as, XO_MOVmi, r64, RID_RET, sizeof(GCcdata));
156✔
1823
      } else {
1824
        emit_movtomro(as, RID_ECX + r64, RID_RET, sizeof(GCcdata));
×
1825
        emit_loadu64(as, RID_ECX, k);
×
1826
      }
1827
    } else {
1828
      Reg r = ra_alloc1(as, ir->op2, allow);
1,483✔
1829
      emit_movtomro(as, r + r64, RID_RET, sizeof(GCcdata));
1,483✔
1830
    }
1831
#else
1832
    int32_t ofs = sizeof(GCcdata);
1833
    if (sz == 8) {
1834
      ofs += 4; ir++;
1835
      lj_assertA(ir->o == IR_HIOP, "missing CNEWI HIOP");
1836
    }
1837
    do {
1838
      if (irref_isk(ir->op2)) {
1839
        emit_movmroi(as, RID_RET, ofs, IR(ir->op2)->i);
1840
      } else {
1841
        Reg r = ra_alloc1(as, ir->op2, allow);
1842
        emit_movtomro(as, r, RID_RET, ofs);
1843
        rset_clear(allow, r);
1844
      }
1845
      if (ofs == sizeof(GCcdata)) break;
1846
      ofs -= 4; ir--;
1847
    } while (1);
1848
#endif
1849
    lj_assertA(sz == 4 || sz == 8, "bad CNEWI size %d", sz);
1850
  } else if (ir->op2 != REF_NIL) {  /* Create VLA/VLS/aligned cdata. */
22✔
1851
    ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv];
6✔
1852
    args[0] = ASMREF_L;     /* lua_State *L */
6✔
1853
    args[1] = ir->op1;      /* CTypeID id   */
6✔
1854
    args[2] = ir->op2;      /* CTSize sz    */
6✔
1855
    args[3] = ASMREF_TMP1;  /* CTSize align */
6✔
1856
    asm_gencall(as, ci, args);
6✔
1857
    emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info));
6✔
1858
    return;
6✔
1859
  }
1860

1861
  /* Increment cdatanum counter by address directly. */
1862
  emit_i8(as, 1);
1,655✔
1863
#if LJ_GC64
1864
  emit_rmro(as, XO_ARITHi8, XOg_ADD|REX_64, RID_DISPATCH,
1,655✔
1865
            dispofs(as, &J2G(as->J)->gc.cdatanum));
1866
#else
1867
  emit_rmro(as, XO_ARITHi8, XOg_ADD, RID_NONE,
1868
            ptr2addr(&J2G(as->J)->gc.cdatanum));
1869
#endif
1870
  /* Combine initialization of marked, gct and ctypeid. */
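  /* The combined 32 bit store below writes: byte 0 = marked (current white
  ** bits), byte 1 = gct (~LJ_TCDATA), bytes 2-3 = ctypeid.
  */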
1871
  emit_movtomro(as, RID_ECX, RID_RET, offsetof(GCcdata, marked));
1,655✔
1872
  emit_gri(as, XG_ARITHi(XOg_OR), RID_ECX,
1,655✔
1873
           (int32_t)((~LJ_TCDATA<<8)+(id<<16)));
1,655✔
1874
  emit_gri(as, XG_ARITHi(XOg_AND), RID_ECX, LJ_GC_WHITES);
1,655✔
1875
  emit_opgl(as, XO_MOVZXb, RID_ECX, gc.currentwhite);
1,655✔
1876

1877
  args[0] = ASMREF_L;     /* lua_State *L */
1,655✔
1878
  args[1] = ASMREF_TMP1;  /* MSize size   */
1,655✔
1879
  asm_gencall(as, ci, args);
1,655✔
1880
  emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)(sz+sizeof(GCcdata)));
1,655✔
1881
}
1882
#endif
1883

1884
/* -- Write barriers ------------------------------------------------------ */
1885

1886
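/* Table write barrier: at runtime, if the table is black, clear its black
** bit and link the table onto g->gc.grayagain via its gclist field.
*/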
static void asm_tbar(ASMState *as, IRIns *ir)
1887
{
1888
  Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
1889
  Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab));
1890
  MCLabel l_end = emit_label(as);
1891
  emit_movtomro(as, tmp|REX_GC64, tab, offsetof(GCtab, gclist));
1892
  emit_setgl(as, tab, gc.grayagain);
1893
  emit_getgl(as, tmp, gc.grayagain);
1894
  emit_i8(as, ~LJ_GC_BLACK);
1895
  emit_rmro(as, XO_ARITHib, XOg_AND, tab, offsetof(GCtab, marked));
1896
  emit_sjcc(as, CC_Z, l_end);
1897
  emit_i8(as, LJ_GC_BLACK);
1898
  emit_rmro(as, XO_GROUP3b, XOg_TEST, tab, offsetof(GCtab, marked));
1899
}
1900

1901
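/* Upvalue write barrier: lj_gc_barrieruv(g, tv) is only called when the
** closed upvalue is black and the stored GC object is white.
*/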
static void asm_obar(ASMState *as, IRIns *ir)
1902
{
1903
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
1904
  IRRef args[2];
1905
  MCLabel l_end;
1906
  Reg obj;
1907
  /* No need for other object barriers (yet). */
1908
  lj_assertA(IR(ir->op1)->o == IR_UREFC, "bad OBAR type");
1909
  ra_evictset(as, RSET_SCRATCH);
1910
  l_end = emit_label(as);
1911
  args[0] = ASMREF_TMP1;  /* global_State *g */
1912
  args[1] = ir->op1;      /* TValue *tv      */
1913
  asm_gencall(as, ci, args);
1914
  emit_loada(as, ra_releasetmp(as, ASMREF_TMP1), J2G(as->J));
1915
  obj = IR(ir->op1)->r;
1916
  emit_sjcc(as, CC_Z, l_end);
1917
  emit_i8(as, LJ_GC_WHITES);
1918
  if (irref_isk(ir->op2)) {
1919
    GCobj *vp = ir_kgc(IR(ir->op2));
1920
    emit_rma(as, XO_GROUP3b, XOg_TEST, &vp->gch.marked);
1921
  } else {
1922
    Reg val = ra_alloc1(as, ir->op2, rset_exclude(RSET_SCRATCH&RSET_GPR, obj));
1923
    emit_rmro(as, XO_GROUP3b, XOg_TEST, val, (int32_t)offsetof(GChead, marked));
1924
  }
1925
  emit_sjcc(as, CC_Z, l_end);
1926
  emit_i8(as, LJ_GC_BLACK);
1927
  emit_rmro(as, XO_GROUP3b, XOg_TEST, obj,
1928
            (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
1929
}
1930

1931
/* -- FP/int arithmetic and logic operations ------------------------------ */
1932

1933
/* Load reference onto x87 stack. Force a spill to memory if needed. */
1934
static void asm_x87load(ASMState *as, IRRef ref)
540✔
1935
{
1936
  IRIns *ir = IR(ref);
540✔
1937
  if (ir->o == IR_KNUM) {
540✔
1938
    cTValue *tv = ir_knum(ir);
270✔
1939
    if (tvispzero(tv))  /* Use fldz only for +0. */
270✔
1940
      emit_x87op(as, XI_FLDZ);
×
1941
    else if (tvispone(tv))
270✔
1942
      emit_x87op(as, XI_FLD1);
×
1943
    else
1944
      emit_rma(as, XO_FLDq, XOg_FLDq, tv);
270✔
1945
  } else if (ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT && !ra_used(ir) &&
270✔
1946
             !irref_isk(ir->op1) && mayfuse(as, ir->op1)) {
360✔
1947
    IRIns *iri = IR(ir->op1);
90✔
1948
    emit_rmro(as, XO_FILDd, XOg_FILDd, RID_ESP, ra_spill(as, iri));
90✔
1949
  } else {
1950
    emit_mrm(as, XO_FLDq, XOg_FLDq, asm_fuseload(as, ref, RSET_EMPTY));
180✔
1951
  }
1952
}
540✔
1953

1954
static void asm_fpmath(ASMState *as, IRIns *ir)
158✔
1955
{
1956
  IRFPMathOp fpm = (IRFPMathOp)ir->op2;
158✔
1957
  if (fpm == IRFPM_SQRT) {
158✔
1958
    Reg dest = ra_dest(as, ir, RSET_FPR);
3✔
1959
    Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
3✔
1960
    emit_mrm(as, XO_SQRTSD, dest, left);
3✔
1961
  } else if (fpm <= IRFPM_TRUNC) {
155✔
1962
    if (as->flags & JIT_F_SSE4_1) {  /* SSE4.1 has a rounding instruction. */
155✔
1963
      Reg dest = ra_dest(as, ir, RSET_FPR);
155✔
1964
      Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
155✔
1965
      /* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op.
1966
      ** Let's pretend it's a 3-byte opcode, and compensate afterwards.
1967
      ** This is atrocious, but the alternatives are much worse.
1968
      */
1969
      /* Round down/up/trunc == 1001/1010/1011. */
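      /* (ROUNDSD imm8: bit 3 suppresses the precision exception, bit 2 = 0
      ** selects the immediate rounding mode, bits 1:0 = 01/10/11 for
      ** down/up/truncate.)
      */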
1970
      emit_i8(as, 0x09 + fpm);
155✔
1971
      emit_mrm(as, XO_ROUNDSD, dest, left);
155✔
1972
      if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
155✔
1973
        as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f;  /* Swap 0F and REX. */
1✔
1974
      }
1975
      *--as->mcp = 0x66;  /* 1st byte of ROUNDSD opcode. */
155✔
1976
    } else {  /* Call helper functions for SSE2 variant. */
1977
      /* The modified regs must match with the *.dasc implementation. */
1978
      RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
×
1979
      if (ra_hasreg(ir->r))
×
1980
        rset_clear(drop, ir->r);  /* Dest reg handled below. */
×
1981
      ra_evictset(as, drop);
×
1982
      ra_destreg(as, ir, RID_XMM0);
×
1983
      emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
×
1984
                    fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
1985
      ra_left(as, RID_XMM0, ir->op1);
×
1986
    }
1987
  } else {
1988
    asm_callid(as, ir, IRCALL_lj_vm_floor + fpm);
×
1989
  }
1990
}
158✔
1991

1992
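/* ldexp(x, e) via x87 FSCALE: at runtime the exponent (op2) is pushed first
** and x (op1) second, so FSCALE computes ST(0) * 2^trunc(ST(1)) = x * 2^e.
*/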
static void asm_ldexp(ASMState *as, IRIns *ir)
270✔
1993
{
1994
  int32_t ofs = sps_scale(ir->s);  /* Use spill slot or temp slots. */
270✔
1995
  Reg dest = ir->r;
270✔
1996
  if (ra_hasreg(dest)) {
270✔
1997
    ra_free(as, dest);
270✔
1998
    ra_modified(as, dest);
270✔
1999
    emit_rmro(as, XO_MOVSD, dest, RID_ESP, ofs);
270✔
2000
  }
2001
  emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
270✔
2002
  emit_x87op(as, XI_FPOP1);
270✔
2003
  emit_x87op(as, XI_FSCALE);
270✔
2004
  asm_x87load(as, ir->op1);
270✔
2005
  asm_x87load(as, ir->op2);
270✔
2006
}
270✔
2007

2008
static int asm_swapops(ASMState *as, IRIns *ir)
9,641✔
2009
{
2010
  IRIns *irl = IR(ir->op1);
9,641✔
2011
  IRIns *irr = IR(ir->op2);
9,641✔
2012
  lj_assertA(ra_noreg(irr->r), "bad usage");
9,641✔
2013
  if (!irm_iscomm(lj_ir_mode[ir->o]))
9,641✔
2014
    return 0;  /* Can't swap non-commutative operations. */
2015
  if (irref_isk(ir->op2))
6,024✔
2016
    return 0;  /* Don't swap constants to the left. */
2017
  if (ra_hasreg(irl->r))
2,046✔
2018
    return 1;  /* Swap if left already has a register. */
2019
  if (ra_samehint(ir->r, irr->r))
1,904✔
2020
    return 1;  /* Swap if dest and right have matching hints. */
2021
  if (as->curins > as->loopref) {  /* In variant part? */
1,678✔
2022
    if (ir->op2 < as->loopref && !irt_isphi(irr->t))
1,264✔
2023
      return 0;  /* Keep invariants on the right. */
2024
    if (ir->op1 < as->loopref && !irt_isphi(irl->t))
909✔
2025
      return 1;  /* Swap invariants to the right. */
2026
  }
2027
  if (opisfusableload(irl->o))
1,323✔
2028
    return 1;  /* Swap fusable loads to the right. */
1,051✔
2029
  return 0;  /* Otherwise don't swap. */
2030
}
2031

2032
static void asm_fparith(ASMState *as, IRIns *ir, x86Op xo)
47,705✔
2033
{
2034
  IRRef lref = ir->op1;
47,705✔
2035
  IRRef rref = ir->op2;
47,705✔
2036
  RegSet allow = RSET_FPR;
47,705✔
2037
  Reg dest;
47,705✔
2038
  Reg right = IR(rref)->r;
47,705✔
2039
  if (ra_hasreg(right)) {
47,705✔
2040
    rset_clear(allow, right);
38,430✔
2041
    ra_noweak(as, right);
38,430✔
2042
  }
2043
  dest = ra_dest(as, ir, allow);
47,705✔
2044
  if (lref == rref) {
47,705✔
2045
    right = dest;
2046
  } else if (ra_noreg(right)) {
47,688✔
2047
    if (asm_swapops(as, ir)) {
9,262✔
2048
      IRRef tmp = lref; lref = rref; rref = tmp;
1,312✔
2049
    }
2050
    right = asm_fuseload(as, rref, rset_clear(allow, dest));
9,262✔
2051
  }
2052
  emit_mrm(as, xo, dest, right);
47,705✔
2053
  ra_left(as, dest, lref);
47,705✔
2054
}
47,705✔
2055

2056
static void asm_intarith(ASMState *as, IRIns *ir, x86Arith xa)
5,415✔
2057
{
2058
  IRRef lref = ir->op1;
5,415✔
2059
  IRRef rref = ir->op2;
5,415✔
2060
  RegSet allow = RSET_GPR;
5,415✔
2061
  Reg dest, right;
5,415✔
2062
  int32_t k = 0;
5,415✔
2063
  if (as->flagmcp == as->mcp) {  /* Drop test r,r instruction. */
5,415✔
2064
    MCode *p = as->mcp + ((LJ_64 && *as->mcp < XI_TESTb) ? 3 : 2);
226✔
2065
    MCode *q = p[0] == 0x0f ? p+1 : p;
226✔
2066
    if ((*q & 15) < 14) {
226✔
2067
      if ((*q & 15) >= 12) *q -= 4;  /* L <->S, NL <-> NS */
217✔
2068
      as->flagmcp = NULL;
217✔
2069
      as->mcp = p;
217✔
2070
    }  /* else: cannot transform LE/NLE to cc without use of OF. */
2071
  }
2072
  right = IR(rref)->r;
5,415✔
2073
  if (ra_hasreg(right)) {
5,415✔
2074
    rset_clear(allow, right);
243✔
2075
    ra_noweak(as, right);
243✔
2076
  }
2077
  dest = ra_dest(as, ir, allow);
5,415✔
2078
  if (lref == rref) {
5,415✔
2079
    right = dest;
2080
  } else if (ra_noreg(right) && !asm_isk32(as, rref, &k)) {
10,087✔
2081
    if (asm_swapops(as, ir)) {
379✔
2082
      IRRef tmp = lref; lref = rref; rref = tmp;
107✔
2083
    }
2084
    right = asm_fuseloadm(as, rref, rset_clear(allow, dest), irt_is64(ir->t));
379✔
2085
  }
2086
  if (irt_isguard(ir->t))  /* For IR_ADDOV etc. */
5,415✔
2087
    asm_guardcc(as, CC_O);
130✔
2088
  if (xa != XOg_X_IMUL) {
5,415✔
2089
    if (ra_hasreg(right))
5,389✔
2090
      emit_mrm(as, XO_ARITH(xa), REX_64IR(ir, dest), right);
872✔
2091
    else
2092
      emit_gri(as, XG_ARITHi(xa), REX_64IR(ir, dest), k);
9,329✔
2093
  } else if (ra_hasreg(right)) {  /* IMUL r, mrm. */
26✔
2094
    emit_mrm(as, XO_IMUL, REX_64IR(ir, dest), right);
22✔
2095
  } else {  /* IMUL r, r, k. */
2096
    /* NYI: use lea/shl/add/sub (FOLD only does 2^k) depending on CPU. */
2097
    Reg left = asm_fuseloadm(as, lref, RSET_GPR, irt_is64(ir->t));
5✔
2098
    x86Op xo;
5✔
2099
    if (checki8(k)) { emit_i8(as, k); xo = XO_IMULi8;
5✔
2100
    } else { emit_i32(as, k); xo = XO_IMULi; }
×
2101
    emit_mrm(as, xo, REX_64IR(ir, dest), left);
5✔
2102
    return;
5✔
2103
  }
2104
  ra_left(as, dest, lref);
5,410✔
2105
}
2106

2107
/* LEA is really a 4-operand ADD with an independent destination register,
2108
** up to two source registers and an immediate. One register can be scaled
2109
** by 1, 2, 4 or 8. This can be used to avoid moves or to fuse several
2110
** instructions.
2111
**
2112
** Currently only a few common cases are supported:
2113
** - 3-operand ADD:    y = a+b; y = a+k   with a and b already allocated
2114
** - Left ADD fusion:  y = (a+b)+k; y = (a+k)+b
2115
** - Right ADD fusion: y = a+(b+k)
2116
** The omitted variants have already been reduced by FOLD.
2117
**
2118
** There are more fusion opportunities, like gathering shifts or joining
2119
** common references. But these are probably not worth the trouble, since
2120
** array indexing is not decomposed and already makes use of all fields
2121
** of the ModRM operand.
2122
*/
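/* Illustrative encodings for the supported cases (operand names are
** placeholders): y = a+b -> lea y, [a+b]; y = a+k -> lea y, [a+k];
** y = (a+b)+k and y = a+(b+k) -> lea y, [a+b+k].
*/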
2123
static int asm_lea(ASMState *as, IRIns *ir)
4,332✔
2124
{
2125
  IRIns *irl = IR(ir->op1);
4,332✔
2126
  IRIns *irr = IR(ir->op2);
4,332✔
2127
  RegSet allow = RSET_GPR;
4,332✔
2128
  Reg dest;
4,332✔
2129
  as->mrm.base = as->mrm.idx = RID_NONE;
4,332✔
2130
  as->mrm.scale = XM_SCALE1;
4,332✔
2131
  as->mrm.ofs = 0;
4,332✔
2132
  if (ra_hasreg(irl->r)) {
4,332✔
2133
    rset_clear(allow, irl->r);
332✔
2134
    ra_noweak(as, irl->r);
332✔
2135
    as->mrm.base = irl->r;
332✔
2136
    if (irref_isk(ir->op2) || ra_hasreg(irr->r)) {
332✔
2137
      /* The PHI renaming logic does a better job in some cases. */
2138
      if (ra_hasreg(ir->r) &&
325✔
2139
          ((irt_isphi(irl->t) && as->phireg[ir->r] == ir->op1) ||
324✔
2140
           (irt_isphi(irr->t) && as->phireg[ir->r] == ir->op2)))
224✔
2141
        return 0;
2142
      if (irref_isk(ir->op2)) {
225✔
2143
        as->mrm.ofs = irr->i;
225✔
2144
      } else {
2145
        rset_clear(allow, irr->r);
×
2146
        ra_noweak(as, irr->r);
×
2147
        as->mrm.idx = irr->r;
×
2148
      }
2149
    } else if (irr->o == IR_ADD && mayfuse(as, ir->op2) &&
7✔
2150
               irref_isk(irr->op2)) {
×
2151
      Reg idx = ra_alloc1(as, irr->op1, allow);
×
2152
      rset_clear(allow, idx);
×
2153
      as->mrm.idx = (uint8_t)idx;
×
2154
      as->mrm.ofs = IR(irr->op2)->i;
×
2155
    } else {
2156
      return 0;
2157
    }
2158
  } else if (ir->op1 != ir->op2 && irl->o == IR_ADD && mayfuse(as, ir->op1) &&
4,000✔
2159
             (irref_isk(ir->op2) || irref_isk(irl->op2))) {
13✔
2160
    Reg idx, base = ra_alloc1(as, irl->op1, allow);
10✔
2161
    rset_clear(allow, base);
10✔
2162
    as->mrm.base = (uint8_t)base;
10✔
2163
    if (irref_isk(ir->op2)) {
10✔
2164
      as->mrm.ofs = irr->i;
10✔
2165
      idx = ra_alloc1(as, irl->op2, allow);
10✔
2166
    } else {
2167
      as->mrm.ofs = IR(irl->op2)->i;
×
2168
      idx = ra_alloc1(as, ir->op2, allow);
×
2169
    }
2170
    rset_clear(allow, idx);
10✔
2171
    as->mrm.idx = (uint8_t)idx;
10✔
2172
  } else {
2173
    return 0;
2174
  }
2175
  dest = ra_dest(as, ir, allow);
235✔
2176
  emit_mrm(as, XO_LEA, dest, RID_MRM);
235✔
2177
  return 1;  /* Success. */
235✔
2178
}
2179

2180
static void asm_add(ASMState *as, IRIns *ir)
30,428✔
2181
{
2182
  if (irt_isnum(ir->t))
30,428✔
2183
    asm_fparith(as, ir, XO_ADDSD);
25,621✔
2184
  else if ((as->flags & JIT_F_LEA_AGU) || as->flagmcp == as->mcp ||
4,807✔
2185
           irt_is64(ir->t) || !asm_lea(as, ir))
4,651✔
2186
    asm_intarith(as, ir, XOg_ADD);
4,572✔
2187
}
30,428✔
2188

2189
static void asm_sub(ASMState *as, IRIns *ir)
1,739✔
2190
{
2191
  if (irt_isnum(ir->t))
1,739✔
2192
    asm_fparith(as, ir, XO_SUBSD);
1,459✔
2193
  else  /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */
2194
    asm_intarith(as, ir, XOg_SUB);
280✔
2195
}
1,739✔
2196

2197
static void asm_mul(ASMState *as, IRIns *ir)
18,417✔
2198
{
2199
  if (irt_isnum(ir->t))
18,417✔
2200
    asm_fparith(as, ir, XO_MULSD);
18,393✔
2201
  else
2202
    asm_intarith(as, ir, XOg_X_IMUL);
24✔
2203
}
18,417✔
2204

2205
#define asm_fpdiv(as, ir)        asm_fparith(as, ir, XO_DIVSD)
2206

2207
static void asm_neg_not(ASMState *as, IRIns *ir, x86Group3 xg)
30✔
2208
{
2209
  Reg dest = ra_dest(as, ir, RSET_GPR);
30✔
2210
  emit_rr(as, XO_GROUP3, REX_64IR(ir, xg), dest);
57✔
2211
  ra_left(as, dest, ir->op1);
30✔
2212
}
30✔
2213

2214
static void asm_neg(ASMState *as, IRIns *ir)
29✔
2215
{
2216
  if (irt_isnum(ir->t))
29✔
2217
    asm_fparith(as, ir, XO_XORPS);
2✔
2218
  else
2219
    asm_neg_not(as, ir, XOg_NEG);
27✔
2220
}
29✔
2221

2222
#define asm_abs(as, ir)                asm_fparith(as, ir, XO_ANDPS)
2223

2224
static void asm_intmin_max(ASMState *as, IRIns *ir, int cc)
×
2225
{
2226
  Reg right, dest = ra_dest(as, ir, RSET_GPR);
×
2227
  IRRef lref = ir->op1, rref = ir->op2;
×
2228
  if (irref_isk(rref)) { lref = rref; rref = ir->op1; }
×
2229
  right = ra_alloc1(as, rref, rset_exclude(RSET_GPR, dest));
×
2230
  emit_rr(as, XO_CMOV + (cc<<24), REX_64IR(ir, dest), right);
×
2231
  emit_rr(as, XO_CMP, REX_64IR(ir, dest), right);
×
2232
  ra_left(as, dest, lref);
×
2233
}
×
2234

2235
static void asm_min(ASMState *as, IRIns *ir)
19✔
2236
{
2237
  if (irt_isnum(ir->t))
19✔
2238
    asm_fparith(as, ir, XO_MINSD);
19✔
2239
  else
2240
    asm_intmin_max(as, ir, CC_G);
×
2241
}
19✔
2242

2243
static void asm_max(ASMState *as, IRIns *ir)
2,070✔
2244
{
2245
  if (irt_isnum(ir->t))
2,070✔
2246
    asm_fparith(as, ir, XO_MAXSD);
2,070✔
2247
  else
2248
    asm_intmin_max(as, ir, CC_L);
×
2249
}
2,070✔
2250

2251
/* Note: don't use LEA for overflow-checking arithmetic! */
2252
#define asm_addov(as, ir)        asm_intarith(as, ir, XOg_ADD)
2253
#define asm_subov(as, ir)        asm_intarith(as, ir, XOg_SUB)
2254
#define asm_mulov(as, ir)        asm_intarith(as, ir, XOg_X_IMUL)
2255

2256
#define asm_bnot(as, ir)        asm_neg_not(as, ir, XOg_NOT)
2257

2258
static void asm_bswap(ASMState *as, IRIns *ir)
9✔
2259
{
2260
  Reg dest = ra_dest(as, ir, RSET_GPR);
9✔
2261
  as->mcp = emit_op(XO_BSWAP + ((dest&7) << 24),
9✔
2262
                    REX_64IR(ir, 0), dest, 0, as->mcp, 1);
9✔
2263
  ra_left(as, dest, ir->op1);
9✔
2264
}
9✔
2265

2266
#define asm_band(as, ir)        asm_intarith(as, ir, XOg_AND)
2267
#define asm_bor(as, ir)                asm_intarith(as, ir, XOg_OR)
2268
#define asm_bxor(as, ir)        asm_intarith(as, ir, XOg_XOR)
2269

2270
static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs, x86Op xv)
153✔
2271
{
2272
  IRRef rref = ir->op2;
153✔
2273
  IRIns *irr = IR(rref);
153✔
2274
  Reg dest;
153✔
2275
  if (irref_isk(rref)) {  /* Constant shifts. */
153✔
2276
    int shift;
125✔
2277
    dest = ra_dest(as, ir, RSET_GPR);
125✔
2278
    shift = irr->i & (irt_is64(ir->t) ? 63 : 31);
125✔
2279
    if (!xv && shift && (as->flags & JIT_F_BMI2)) {
125✔
2280
      Reg left = asm_fuseloadm(as, ir->op1, RSET_GPR, irt_is64(ir->t));
15✔
2281
      if (left != dest) {  /* BMI2 rotate right by constant. */
15✔
2282
        emit_i8(as, xs == XOg_ROL ? -shift : shift);
13✔
2283
        emit_mrm(as, VEX_64IR(ir, XV_RORX), dest, left);
13✔
2284
        return;
13✔
2285
      }
2286
    }
2287
    switch (shift) {
112✔
2288
    case 0: break;
2289
    case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break;
×
2290
    default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break;
171✔
2291
    }
2292
  } else if ((as->flags & JIT_F_BMI2) && xv) {        /* BMI2 variable shifts. */
28✔
2293
    Reg left, right;
17✔
2294
    dest = ra_dest(as, ir, RSET_GPR);
17✔
2295
    right = ra_alloc1(as, rref, RSET_GPR);
17✔
2296
    left = asm_fuseloadm(as, ir->op1, rset_exclude(RSET_GPR, right),
34✔
2297
                         irt_is64(ir->t));
17✔
2298
    emit_mrm(as, VEX_64IR(ir, xv) ^ (right << 19), dest, left);
17✔
2299
    return;
17✔
2300
  } else {  /* Variable shifts implicitly use register cl (i.e. ecx). */
2301
    Reg right;
11✔
2302
    dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX));
11✔
2303
    if (dest == RID_ECX) {
11✔
2304
      dest = ra_scratch(as, rset_exclude(RSET_GPR, RID_ECX));
2✔
2305
      emit_rr(as, XO_MOV, REX_64IR(ir, RID_ECX), dest);
2✔
2306
    }
2307
    right = irr->r;
11✔
2308
    if (ra_noreg(right))
11✔
2309
      right = ra_allocref(as, rref, RID2RSET(RID_ECX));
8✔
2310
    else if (right != RID_ECX)
3✔
2311
      ra_scratch(as, RID2RSET(RID_ECX));
×
2312
    emit_rr(as, XO_SHIFTcl, REX_64IR(ir, xs), dest);
17✔
2313
    ra_noweak(as, right);
11✔
2314
    if (right != RID_ECX)
11✔
2315
      emit_rr(as, XO_MOV, RID_ECX, right);
×
2316
  }
2317
  ra_left(as, dest, ir->op1);
123✔
2318
  /*
2319
  ** Note: avoid using the flags resulting from a shift or rotate!
2320
  ** All of them cause a partial flag stall, except for r,1 shifts
2321
  ** (but not rotates). And a shift count of 0 leaves the flags unmodified.
2322
  */
2323
}
2324

2325
#define asm_bshl(as, ir)        asm_bitshift(as, ir, XOg_SHL, XV_SHLX)
2326
#define asm_bshr(as, ir)        asm_bitshift(as, ir, XOg_SHR, XV_SHRX)
2327
#define asm_bsar(as, ir)        asm_bitshift(as, ir, XOg_SAR, XV_SARX)
2328
#define asm_brol(as, ir)        asm_bitshift(as, ir, XOg_ROL, 0)
2329
#define asm_bror(as, ir)        asm_bitshift(as, ir, XOg_ROR, 0)
2330

2331
/* -- Comparisons --------------------------------------------------------- */
2332

2333
/* Virtual flags for unordered FP comparisons. */
2334
#define VCC_U        0x1000                /* Unordered. */
2335
#define VCC_P        0x2000                /* Needs extra CC_P branch. */
2336
#define VCC_S        0x4000                /* Swap avoids CC_P branch. */
2337
#define VCC_PS        (VCC_P|VCC_S)
2338

2339
/* Map of comparisons to flags. ORDER IR. */
2340
#define COMPFLAGS(ci, cin, cu, cf)        ((ci)+((cu)<<4)+((cin)<<8)+(cf))
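/* COMPFLAGS packs bits 0-3 = signed cc, bits 4-7 = unsigned cc,
** bits 8-11 = non-equality cc, bits 12 and up = VCC_* flags.
*/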
2341
static const uint16_t asm_compmap[IR_ABC+1] = {
2342
  /*                 signed non-eq unsigned flags */
2343
  /* LT  */ COMPFLAGS(CC_GE, CC_G,  CC_AE, VCC_PS),
2344
  /* GE  */ COMPFLAGS(CC_L,  CC_L,  CC_B,  0),
2345
  /* LE  */ COMPFLAGS(CC_G,  CC_G,  CC_A,  VCC_PS),
2346
  /* GT  */ COMPFLAGS(CC_LE, CC_L,  CC_BE, 0),
2347
  /* ULT */ COMPFLAGS(CC_AE, CC_A,  CC_AE, VCC_U),
2348
  /* UGE */ COMPFLAGS(CC_B,  CC_B,  CC_B,  VCC_U|VCC_PS),
2349
  /* ULE */ COMPFLAGS(CC_A,  CC_A,  CC_A,  VCC_U),
2350
  /* UGT */ COMPFLAGS(CC_BE, CC_B,  CC_BE, VCC_U|VCC_PS),
2351
  /* EQ  */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P),
2352
  /* NE  */ COMPFLAGS(CC_E,  CC_E,  CC_E,  VCC_U|VCC_P),
2353
  /* ABC */ COMPFLAGS(CC_BE, CC_B,  CC_BE, VCC_U|VCC_PS)  /* Same as UGT. */
2354
};
2355

2356
/* FP and integer comparisons. */
2357
static void asm_comp(ASMState *as, IRIns *ir)
141,728✔
2358
{
2359
  uint32_t cc = asm_compmap[ir->o];
141,728✔
2360
  if (irt_isnum(ir->t)) {
141,728✔
2361
    IRRef lref = ir->op1;
4,635✔
2362
    IRRef rref = ir->op2;
4,635✔
2363
    Reg left, right;
4,635✔
2364
    MCLabel l_around;
4,635✔
2365
    /*
2366
    ** An extra CC_P branch is required to preserve ordered/unordered
2367
    ** semantics for FP comparisons. This can be avoided by swapping
2368
    ** the operands and inverting the condition (except for EQ and UNE).
2369
    ** So always try to swap if possible.
2370
    **
2371
    ** Another option would be to swap operands to achieve better memory
2372
    ** operand fusion. But it's unlikely that this outweighs the cost
2373
    ** of the extra branches.
2374
    */
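    /* (UCOMISD sets ZF/PF/CF like an unsigned compare and sets all three for
    ** unordered operands. E.g. JB would also trigger on a NaN operand, while
    ** JA with swapped operands cannot, so the extra JP can be omitted.)
    */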
2375
    if (cc & VCC_S) {  /* Swap? */
4,635✔
2376
      IRRef tmp = lref; lref = rref; rref = tmp;
2,583✔
2377
      cc ^= (VCC_PS|(5<<4));  /* A <-> B, AE <-> BE, PS <-> none */
2,583✔
2378
    }
2379
    left = ra_alloc1(as, lref, RSET_FPR);
4,635✔
2380
    l_around = emit_label(as);
4,635✔
2381
    asm_guardcc(as, cc >> 4);
4,635✔
2382
    if (cc & VCC_P) {  /* Extra CC_P branch required? */
4,635✔
2383
      if (!(cc & VCC_U)) {
310✔
2384
        asm_guardcc(as, CC_P);  /* Branch to exit for ordered comparisons. */
230✔
2385
      } else if (l_around != as->invmcp) {
80✔
2386
        emit_sjcc(as, CC_P, l_around);  /* Branch around for unordered. */
80✔
2387
      } else {
2388
        /* Patched to mcloop by asm_loop_fixup. */
UNCOV
2389
        as->loopinv = 2;
×
UNCOV
2390
        if (as->realign)
×
UNCOV
2391
          emit_sjcc(as, CC_P, as->mcp);
×
2392
        else
UNCOV
2393
          emit_jcc(as, CC_P, as->mcp);
×
2394
      }
2395
    }
2396
    right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left));
4,635✔
2397
    emit_mrm(as, XO_UCOMISD, left, right);
4,635✔
2398
  } else {
2399
    IRRef lref = ir->op1, rref = ir->op2;
137,093✔
2400
    IROp leftop = (IROp)(IR(lref)->o);
137,093✔
2401
    Reg r64 = REX_64IR(ir, 0);
137,093✔
2402
    int32_t imm = 0;
137,093✔
2403
    lj_assertA(irt_is64(ir->t) || irt_isint(ir->t) ||
137,093✔
2404
               irt_isu32(ir->t) || irt_isaddr(ir->t) || irt_isu8(ir->t),
2405
               "bad comparison data type %d", irt_type(ir->t));
2406
    /* Swap constants (only for ABC) and fusable loads to the right. */
2407
    if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) {
137,093✔
2408
      if ((cc & 0xc) == 0xc) cc ^= 0x53;  /* L <-> G, LE <-> GE */
2,586✔
2409
      else if ((cc & 0xa) == 0x2) cc ^= 0x55;  /* A <-> B, AE <-> BE */
2,585✔
2410
      lref = ir->op2; rref = ir->op1;
2411
    }
2412
    if (asm_isk32(as, rref, &imm)) {
266,231✔
2413
      IRIns *irl = IR(lref);
129,229✔
2414
      /* Check whether we can use test ins. Not for unsigned, since CF=0. */
2415
      int usetest = (imm == 0 && (cc & 0xa) != 0x2);
129,229✔
2416
      if (usetest && irl->o == IR_BAND && irl+1 == ir && !ra_used(irl)) {
41,604✔
2417
        /* Combine comp(BAND(ref, r/imm), 0) into test mrm, r/imm. */
2418
        Reg right, left = RID_NONE;
41✔
2419
        RegSet allow = RSET_GPR;
41✔
2420
        if (!asm_isk32(as, irl->op2, &imm)) {
82✔
2421
          left = ra_alloc1(as, irl->op2, allow);
×
2422
          rset_clear(allow, left);
×
2423
        } else {  /* Try to Fuse IRT_I8/IRT_U8 loads, too. See below. */
2424
          IRIns *irll = IR(irl->op1);
41✔
2425
          if (opisfusableload((IROp)irll->o) &&
41✔
2426
              (irt_isi8(irll->t) || irt_isu8(irll->t))) {
39✔
2427
            IRType1 origt = irll->t;  /* Temporarily flip types. */
35✔
2428
            irll->t.irt = (irll->t.irt & ~IRT_TYPE) | IRT_INT;
35✔
2429
            as->curins--;  /* Skip to BAND to avoid failing in noconflict(). */
35✔
2430
            right = asm_fuseload(as, irl->op1, RSET_GPR);
35✔
2431
            as->curins++;
35✔
2432
            irll->t = origt;
35✔
2433
            if (right != RID_MRM) goto test_nofuse;
35✔
2434
            /* Fusion succeeded, emit test byte mrm, imm8. */
2435
            asm_guardcc(as, cc);
×
2436
            emit_i8(as, (imm & 0xff));
×
2437
            emit_mrm(as, XO_GROUP3b, XOg_TEST, RID_MRM);
×
2438
            return;
×
2439
          }
2440
        }
2441
        as->curins--;  /* Skip to BAND to avoid failing in noconflict(). */
6✔
2442
        right = asm_fuseloadm(as, irl->op1, allow, r64);
6✔
2443
        as->curins++;  /* Undo the above. */
6✔
2444
      test_nofuse:
41✔
2445
        asm_guardcc(as, cc);
41✔
2446
        if (ra_noreg(left)) {
41✔
2447
          emit_i32(as, imm);
41✔
2448
          emit_mrm(as, XO_GROUP3, r64 + XOg_TEST, right);
41✔
2449
        } else {
2450
          emit_mrm(as, XO_TEST, r64 + left, right);
×
2451
        }
2452
      } else {
2453
        Reg left;
129,188✔
2454
        if (opisfusableload((IROp)irl->o) &&
129,188✔
2455
            ((irt_isu8(irl->t) && checku8(imm)) ||
124,825✔
2456
             ((irt_isi8(irl->t) || irt_isi16(irl->t)) && checki8(imm)) ||
124,792✔
2457
             (irt_isu16(irl->t) && checku16(imm) && checki8((int16_t)imm)))) {
353✔
2458
          /* Only the IRT_INT case is fused by asm_fuseload.
2459
          ** The IRT_I8/IRT_U8 loads and some IRT_I16/IRT_U16 loads
2460
          ** are handled here.
2461
          ** Note that cmp word [mem], imm16 should not be generated,
2462
          ** since it has a length-changing prefix. Compares of a word
2463
          ** against a sign-extended imm8 are ok, however.
2464
          */
2465
          IRType1 origt = irl->t;  /* Temporarily flip types. */
299✔
2466
          irl->t.irt = (irl->t.irt & ~IRT_TYPE) | IRT_INT;
299✔
2467
          left = asm_fuseload(as, lref, RSET_GPR);
299✔
2468
          irl->t = origt;
299✔
2469
          if (left == RID_MRM) {  /* Fusion succeeded? */
299✔
2470
            if (irt_isu8(irl->t) || irt_isu16(irl->t))
278✔
2471
              cc >>= 4;  /* Need unsigned compare. */
223✔
2472
            asm_guardcc(as, cc);
278✔
2473
            emit_i8(as, imm);
278✔
2474
            emit_mrm(as, (irt_isi8(origt) || irt_isu8(origt)) ?
278✔
2475
                         XO_ARITHib : XO_ARITHiw8, r64 + XOg_CMP, RID_MRM);
2476
            return;
278✔
2477
          }  /* Otherwise handle register case as usual. */
2478
        } else {
2479
          left = asm_fuseloadm(as, lref,
128,889✔
2480
                               irt_isu8(ir->t) ? RSET_GPR8 : RSET_GPR, r64);
2481
        }
2482
        asm_guardcc(as, cc);
128,910✔
2483
        if (usetest && left != RID_MRM) {
128,910✔
2484
          /* Use test r,r instead of cmp r,0. */
2485
          x86Op xo = XO_TEST;
38,273✔
2486
          if (irt_isu8(ir->t)) {
38,273✔
2487
            lj_assertA(ir->o == IR_EQ || ir->o == IR_NE, "bad usage");
×
2488
            xo = XO_TESTb;
×
2489
            if (!rset_test(RSET_RANGE(RID_EAX, RID_EBX+1), left)) {
×
2490
              if (LJ_64) {
×
2491
                left |= FORCE_REX;
×
2492
              } else {
2493
                emit_i32(as, 0xff);
2494
                emit_mrm(as, XO_GROUP3, XOg_TEST, left);
2495
                return;
2496
              }
2497
            }
2498
          }
2499
          emit_rr(as, xo, r64 + left, left);
38,273✔
2500
          if (irl+1 == ir)  /* Referencing previous ins? */
38,273✔
2501
            as->flagmcp = as->mcp;  /* Set flag to drop test r,r if possible. */
38,153✔
2502
        } else {
2503
          emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm);
90,637✔
2504
        }
2505
      }
2506
    } else {
2507
      Reg left = ra_alloc1(as, lref, RSET_GPR);
7,864✔
2508
      Reg right = asm_fuseloadm(as, rref, rset_exclude(RSET_GPR, left), r64);
7,864✔
2509
      asm_guardcc(as, cc);
7,864✔
2510
      emit_mrm(as, XO_CMP, r64 + left, right);
7,864✔
2511
    }
2512
  }
2513
}
2514

2515
#define asm_equal(as, ir)        asm_comp(as, ir)
2516

2517
#if LJ_32 && LJ_HASFFI
2518
/* 64 bit integer comparisons in 32 bit mode. */
2519
static void asm_comp_int64(ASMState *as, IRIns *ir)
2520
{
2521
  uint32_t cc = asm_compmap[(ir-1)->o];
2522
  RegSet allow = RSET_GPR;
2523
  Reg lefthi = RID_NONE, leftlo = RID_NONE;
2524
  Reg righthi = RID_NONE, rightlo = RID_NONE;
2525
  MCLabel l_around;
2526
  x86ModRM mrm;
2527

2528
  as->curins--;  /* Skip loword ins. Avoids failing in noconflict(), too. */
2529

2530
  /* Allocate/fuse hiword operands. */
2531
  if (irref_isk(ir->op2)) {
2532
    lefthi = asm_fuseload(as, ir->op1, allow);
2533
  } else {
2534
    lefthi = ra_alloc1(as, ir->op1, allow);
2535
    rset_clear(allow, lefthi);
2536
    righthi = asm_fuseload(as, ir->op2, allow);
2537
    if (righthi == RID_MRM) {
2538
      if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base);
2539
      if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx);
2540
    } else {
2541
      rset_clear(allow, righthi);
2542
    }
2543
  }
2544
  mrm = as->mrm;  /* Save state for hiword instruction. */
2545

2546
  /* Allocate/fuse loword operands. */
2547
  if (irref_isk((ir-1)->op2)) {
2548
    leftlo = asm_fuseload(as, (ir-1)->op1, allow);
2549
  } else {
2550
    leftlo = ra_alloc1(as, (ir-1)->op1, allow);
2551
    rset_clear(allow, leftlo);
2552
    rightlo = asm_fuseload(as, (ir-1)->op2, allow);
2553
  }
2554

2555
  /* All register allocations must be performed _before_ this point. */
2556
  l_around = emit_label(as);
2557
  as->invmcp = as->flagmcp = NULL;  /* Cannot use these optimizations. */
2558

2559
  /* Loword comparison and branch. */
2560
  asm_guardcc(as, cc >> 4);  /* Always use unsigned compare for loword. */
2561
  if (ra_noreg(rightlo)) {
2562
    int32_t imm = IR((ir-1)->op2)->i;
2563
    if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM)
2564
      emit_rr(as, XO_TEST, leftlo, leftlo);
2565
    else
2566
      emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm);
2567
  } else {
2568
    emit_mrm(as, XO_CMP, leftlo, rightlo);
2569
  }
2570

2571
  /* Hiword comparison and branches. */
2572
  if ((cc & 15) != CC_NE)
2573
    emit_sjcc(as, CC_NE, l_around);  /* Hiword unequal: skip loword compare. */
2574
  if ((cc & 15) != CC_E)
2575
    asm_guardcc(as, cc >> 8);  /* Hiword compare without equality check. */
2576
  as->mrm = mrm;  /* Restore state. */
2577
  if (ra_noreg(righthi)) {
2578
    int32_t imm = IR(ir->op2)->i;
2579
    if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM)
2580
      emit_rr(as, XO_TEST, lefthi, lefthi);
2581
    else
2582
      emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm);
2583
  } else {
2584
    emit_mrm(as, XO_CMP, lefthi, righthi);
2585
  }
2586
}
2587
#endif
2588

2589
/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
2590

2591
/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
2592
static void asm_hiop(ASMState *as, IRIns *ir)
2593
{
2594
#if LJ_32 && LJ_HASFFI
2595
  /* HIOP is marked as a store because it needs its own DCE logic. */
2596
  int uselo = ra_used(ir-1), usehi = ra_used(ir);  /* Loword/hiword used? */
2597
  if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1;
2598
  if ((ir-1)->o == IR_CONV) {  /* Conversions to/from 64 bit. */
2599
    as->curins--;  /* Always skip the CONV. */
2600
    if (usehi || uselo)
2601
      asm_conv64(as, ir);
2602
    return;
2603
  } else if ((ir-1)->o <= IR_NE) {  /* 64 bit integer comparisons. ORDER IR. */
2604
    asm_comp_int64(as, ir);
2605
    return;
2606
  } else if ((ir-1)->o == IR_XSTORE) {
2607
    if ((ir-1)->r != RID_SINK)
2608
      asm_fxstore(as, ir);
2609
    return;
2610
  }
2611
  if (!usehi) return;  /* Skip unused hiword op for all remaining ops. */
2612
  switch ((ir-1)->o) {
2613
  case IR_ADD:
2614
    as->flagmcp = NULL;
2615
    as->curins--;
2616
    asm_intarith(as, ir, XOg_ADC);
2617
    asm_intarith(as, ir-1, XOg_ADD);
2618
    break;
2619
  case IR_SUB:
2620
    as->flagmcp = NULL;
2621
    as->curins--;
2622
    asm_intarith(as, ir, XOg_SBB);
2623
    asm_intarith(as, ir-1, XOg_SUB);
2624
    break;
2625
  case IR_NEG: {
2626
    Reg dest = ra_dest(as, ir, RSET_GPR);
2627
    emit_rr(as, XO_GROUP3, XOg_NEG, dest);
2628
    emit_i8(as, 0);
2629
    emit_rr(as, XO_ARITHi8, XOg_ADC, dest);
2630
    ra_left(as, dest, ir->op1);
2631
    as->curins--;
2632
    asm_neg_not(as, ir-1, XOg_NEG);
2633
    break;
2634
    }
2635
  case IR_CALLN:
2636
  case IR_CALLXS:
2637
    if (!uselo)
2638
      ra_allocref(as, ir->op1, RID2RSET(RID_RETLO));  /* Mark lo op as used. */
2639
    break;
2640
  case IR_CNEWI:
2641
    /* Nothing to do here. Handled by CNEWI itself. */
2642
    break;
2643
  default: lj_assertA(0, "bad HIOP for op %d", (ir-1)->o); break;
2644
  }
2645
#else
2646
  /* Unused on x64 or without FFI. */
2647
  UNUSED(as); UNUSED(ir); lj_assertA(0, "unexpected HIOP");
2648
#endif
2649
}
2650

2651
/* -- Profiling ----------------------------------------------------------- */
2652

2653
static void asm_prof(ASMState *as, IRIns *ir)
2654
{
2655
  UNUSED(ir);
2656
  asm_guardcc(as, CC_NE);
2657
  emit_i8(as, HOOK_PROFILE);
2658
  emit_rma(as, XO_GROUP3b, XOg_TEST, &J2G(as->J)->hookmask);
2659
}
2660

2661
/* -- Stack handling ------------------------------------------------------ */
2662

2663
/* Check Lua stack size for overflow. Use exit handler as fallback. */
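/* The generated check takes the exit if (L->maxstack - base) < 8*topslot,
** i.e. if fewer than topslot stack slots are available.
*/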
2664
static void asm_stack_check(ASMState *as, BCReg topslot,
399✔
2665
                            IRIns *irp, RegSet allow, ExitNo exitno)
2666
{
2667
  /* Try to get an unused temp. register, otherwise spill/restore eax. */
2668
  Reg pbase = irp ? irp->r : RID_BASE;
399✔
2669
  Reg r = allow ? rset_pickbot(allow) : RID_EAX;
399✔
2670
  emit_jcc(as, CC_B, exitstub_addr(as->J, exitno));
399✔
2671
  if (allow == RSET_EMPTY)  /* Restore temp. register. */
399✔
2672
    emit_rmro(as, XO_MOV, r|REX_64, RID_ESP, 0);
×
2673
  else
2674
    ra_modified(as, r);
399✔
2675
  emit_gri(as, XG_ARITHi(XOg_CMP), r|REX_GC64, (int32_t)(8*topslot));
399✔
2676
  if (ra_hasreg(pbase) && pbase != r)
399✔
2677
    emit_rr(as, XO_ARITH(XOg_SUB), r|REX_GC64, pbase);
274✔
2678
  else
2679
#if LJ_GC64
2680
    emit_rmro(as, XO_ARITH(XOg_SUB), r|REX_64, RID_DISPATCH,
125✔
2681
              (int32_t)dispofs(as, &J2G(as->J)->jit_base));
2682
#else
2683
    emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE,
2684
              ptr2addr(&J2G(as->J)->jit_base));
2685
#endif
2686
  emit_rmro(as, XO_MOV, r|REX_GC64, r, offsetof(lua_State, maxstack));
399✔
2687
  emit_getgl(as, r, cur_L);
399✔
2688
  if (allow == RSET_EMPTY)  /* Spill temp. register. */
399✔
2689
    emit_rmro(as, XO_MOVto, r|REX_64, RID_ESP, 0);
×
2690
}
399✔
2691

2692
/* Restore Lua stack from on-trace state. */
2693
static void asm_stack_restore(ASMState *as, SnapShot *snap)
2694
{
2695
  SnapEntry *map = &as->T->snapmap[snap->mapofs];
2696
#if !LJ_FR2 || defined(LUA_USE_ASSERT)
2697
  SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2];
2698
#endif
2699
  MSize n, nent = snap->nent;
2700
  /* Store the value of all modified slots to the Lua stack. */
2701
  for (n = 0; n < nent; n++) {
2702
    SnapEntry sn = map[n];
2703
    BCReg s = snap_slot(sn);
2704
    int32_t ofs = 8*((int32_t)s-1-LJ_FR2);
2705
    IRRef ref = snap_ref(sn);
2706
    IRIns *ir = IR(ref);
2707
    if ((sn & SNAP_NORESTORE))
2708
      continue;
2709
    if (irt_isnum(ir->t)) {
2710
      Reg src = ra_alloc1(as, ref, RSET_FPR);
2711
      emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs);
2712
    } else {
2713
      lj_assertA(irt_ispri(ir->t) || irt_isaddr(ir->t) ||
2714
                 (LJ_DUALNUM && irt_isinteger(ir->t)),
2715
                 "restore of IR type %d", irt_type(ir->t));
2716
      if (!irref_isk(ref)) {
2717
        Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE));
2718
#if LJ_GC64
2719
        if (irt_is64(ir->t)) {
2720
          /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */
2721
          emit_u32(as, irt_toitype(ir->t) << 15);
2722
          emit_rmro(as, XO_ARITHi, XOg_OR, RID_BASE, ofs+4);
2723
        } else if (LJ_DUALNUM && irt_isinteger(ir->t)) {
2724
          emit_movmroi(as, RID_BASE, ofs+4, LJ_TISNUM << 15);
2725
        } else {
2726
          emit_movmroi(as, RID_BASE, ofs+4, (irt_toitype(ir->t)<<15)|0x7fff);
2727
        }
2728
#endif
2729
        emit_movtomro(as, REX_64IR(ir, src), RID_BASE, ofs);
2730
#if LJ_GC64
2731
      } else {
2732
        TValue k;
2733
        lj_ir_kvalue(as->J->L, &k, ir);
2734
        if (tvisnil(&k)) {
2735
          emit_i32(as, -1);
2736
          emit_rmro(as, XO_MOVmi, REX_64, RID_BASE, ofs);
2737
        } else {
2738
          emit_movmroi(as, RID_BASE, ofs+4, k.u32.hi);
2739
          emit_movmroi(as, RID_BASE, ofs, k.u32.lo);
2740
        }
2741
#else
2742
      } else if (!irt_ispri(ir->t)) {
2743
        emit_movmroi(as, RID_BASE, ofs, ir->i);
2744
#endif
2745
      }
2746
      if ((sn & (SNAP_CONT|SNAP_FRAME))) {
2747
#if !LJ_FR2
2748
        if (s != 0)  /* Do not overwrite link to previous frame. */
2749
          emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*flinks--));
2750
#endif
2751
#if !LJ_GC64
2752
      } else {
2753
        if (!(LJ_64 && irt_islightud(ir->t)))
2754
          emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t));
2755
#endif
2756
      }
2757
    }
2758
    checkmclim(as);
2759
  }
2760
  lj_assertA(map + nent == flinks, "inconsistent frames in snapshot");
2761
}
2762
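/* Note: the loop above writes each modified snapshot slot back to the Lua
** stack at a BASE-relative offset of 8*(slot-1-LJ_FR2). Numbers are stored
** as raw doubles from an FPR; other values also get their type tag written
** into the upper 32 bits of the TValue (on LJ_GC64 via irt_toitype(t) << 15
** of the high word). Slots marked SNAP_NORESTORE are skipped entirely.
*/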

2763
/* -- GC handling --------------------------------------------------------- */
2764

2765
/* Check GC threshold and do one or more GC steps. */
2766
static void asm_gc_check(ASMState *as)
821✔
2767
{
2768
  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
821✔
2769
  IRRef args[2];
821✔
2770
  MCLabel l_end;
821✔
2771
  Reg tmp;
821✔
2772
  ra_evictset(as, RSET_SCRATCH);
821✔
2773
  l_end = emit_label(as);
821✔
2774
  /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
2775
  asm_guardcc(as, CC_NE);  /* Assumes asm_snap_prep() already done. */
821✔
2776
  emit_rr(as, XO_TEST, RID_RET, RID_RET);
821✔
2777
  args[0] = ASMREF_TMP1;  /* global_State *g */
821✔
2778
  args[1] = ASMREF_TMP2;  /* MSize steps     */
821✔
2779
  asm_gencall(as, ci, args);
821✔
2780
  tmp = ra_releasetmp(as, ASMREF_TMP1);
821✔
2781
#if LJ_GC64
2782
  emit_rmro(as, XO_LEA, tmp|REX_64, RID_DISPATCH, GG_DISP2G);
821✔
2783
#else
2784
  emit_loada(as, tmp, J2G(as->J));
2785
#endif
2786
  emit_loadi(as, ra_releasetmp(as, ASMREF_TMP2), as->gcsteps);
821✔
2787
  /* Jump around GC step if GC total < GC threshold. */
2788
  emit_sjcc(as, CC_B, l_end);
821✔
2789
  emit_opgl(as, XO_ARITH(XOg_CMP), tmp|REX_GC64, gc.threshold);
821✔
2790
  emit_getgl(as, tmp, gc.total);
821✔
2791
  as->gcsteps = 0;
821✔
2792
  checkmclim(as);
821✔
2793
}
821✔
2794
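/* Note: read bottom-up, the GC check above executes roughly as
**   mov tmp, g->gc.total; cmp tmp, g->gc.threshold; jb >skip
**   call lj_gc_step_jit(g, steps); test eax, eax; jnz ->exit
** skip:
** so the GC step is only taken once the threshold is reached, and the trace
** exits if the collector is in its atomic or finalize phase.
*/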

2795
/* -- Loop handling ------------------------------------------------------- */
2796

2797
/* Fixup the loop branch. */
2798
static void asm_loop_fixup(ASMState *as)
2,467✔
2799
{
2800
  MCode *p = as->mctop;
2,467✔
2801
  MCode *target = as->mcp;
2,467✔
2802
  if (as->realign) {  /* Realigned loops use short jumps. */
2,467✔
2803
    as->realign = NULL;  /* Stop another retry. */
1,116✔
2804
    lj_assertA(((intptr_t)target & 15) == 0, "loop realign failed");
1,116✔
2805
    if (as->loopinv) {  /* Inverted loop branch? */
1,116✔
2806
      p -= 5;
1,090✔
2807
      p[0] = XI_JMP;
1,090✔
2808
      lj_assertA(target - p >= -128, "loop realign failed");
1,090✔
2809
      p[-1] = (MCode)(target - p);  /* Patch sjcc. */
1,090✔
2810
      if (as->loopinv == 2)
1,090✔
UNCOV
2811
        p[-3] = (MCode)(target - p + 2);  /* Patch opt. short jp. */
×
2812
    } else {
2813
      lj_assertA(target - p >= -128, "loop realign failed");
26✔
2814
      p[-1] = (MCode)(int8_t)(target - p);  /* Patch short jmp. */
26✔
2815
      p[-2] = XI_JMPs;
26✔
2816
    }
2817
  } else {
2818
    MCode *newloop;
1,351✔
2819
    p[-5] = XI_JMP;
1,351✔
2820
    if (as->loopinv) {  /* Inverted loop branch? */
1,351✔
2821
      /* asm_guardcc already inverted the jcc and patched the jmp. */
2822
      p -= 5;
1,310✔
2823
      newloop = target+4;
1,310✔
2824
      *(int32_t *)(p-4) = (int32_t)(target - p);  /* Patch jcc. */
1,310✔
2825
      if (as->loopinv == 2) {
1,310✔
UNCOV
2826
        *(int32_t *)(p-10) = (int32_t)(target - p + 6);  /* Patch opt. jp. */
×
UNCOV
2827
        newloop = target+8;
×
2828
      }
2829
    } else {  /* Otherwise just patch jmp. */
2830
      *(int32_t *)(p-4) = (int32_t)(target - p);
41✔
2831
      newloop = target+3;
41✔
2832
    }
2833
    /* Realign small loops and shorten the loop branch. */
2834
    if (newloop >= p - 128) {
1,351✔
2835
      as->realign = newloop;  /* Force a retry and remember alignment. */
1,116✔
2836
      as->curins = as->stopins;  /* Abort asm_trace now. */
1,116✔
2837
      as->T->nins = as->orignins;  /* Remove any added renames. */
1,116✔
2838
    }
2839
  }
2840
}
2,467✔
2841
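/* Note: the else-branch above first patches a 32-bit jmp (or the inverted
** jcc) back to the loop start; if the backward distance then fits in a
** short jump (newloop >= p-128), as->realign is set to force a second
** assembly pass in which the loop start is aligned to a 16-byte boundary
** and the short-jump forms from the if-branch are used instead.
*/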

2842
/* -- Head of trace ------------------------------------------------------- */
2843

2844
/* Coalesce BASE register for a root trace. */
2845
static void asm_head_root_base(ASMState *as)
2,235✔
2846
{
2847
  IRIns *ir = IR(REF_BASE);
2,235✔
2848
  Reg r = ir->r;
2,235✔
2849
  if (ra_hasreg(r)) {
2,235✔
2850
    ra_free(as, r);
1,946✔
2851
    if (rset_test(as->modset, r) || irt_ismarked(ir->t))
1,946✔
2852
      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
680✔
2853
    if (r != RID_BASE)
1,946✔
2854
      emit_rr(as, XO_MOV, r|REX_GC64, RID_BASE);
76✔
2855
  }
2856
}
2,235✔
2857
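/* Note: for a root trace the IR BASE reference is simply coalesced with
** RID_BASE; if the chosen register was modified on the trace, inheritance
** is dropped (ir->r = RID_INIT), and a mov from RID_BASE is emitted only
** when a different register was picked.
*/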

2858
/* Coalesce or reload BASE register for a side trace. */
2859
static Reg asm_head_side_base(ASMState *as, IRIns *irp)
2860
{
2861
  IRIns *ir = IR(REF_BASE);
2862
  Reg r = ir->r;
2863
  if (ra_hasreg(r)) {
2864
    ra_free(as, r);
2865
    if (rset_test(as->modset, r) || irt_ismarked(ir->t))
2866
      ir->r = RID_INIT;  /* No inheritance for modified BASE register. */
2867
    if (irp->r == r) {
2868
      return r;  /* Same BASE register already coalesced. */
2869
    } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
2870
      /* Move from coalesced parent reg. */
2871
      emit_rr(as, XO_MOV, r|REX_GC64, irp->r);
2872
      return irp->r;
2873
    } else {
2874
      emit_getgl(as, r, jit_base);  /* Otherwise reload BASE. */
2875
    }
2876
  }
2877
  return RID_NONE;
2878
}
2879
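/* Note: for a side trace the BASE register is either inherited directly
** from the parent trace (same register), moved over from the parent's still
** free register, or reloaded from g->jit_base as a fallback; the return
** value indicates which parent register, if any, was coalesced.
*/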

2880
/* -- Tail of trace ------------------------------------------------------- */
2881

2882
/* Fixup the tail code. */
2883
static void asm_tail_fixup(ASMState *as, TraceNo lnk)
3,903✔
2884
{
2885
  /* Note: don't use as->mcp swap + emit_*: emit_op overwrites more bytes. */
2886
  MCode *p = as->mctop;
3,903✔
2887
  MCode *target, *q;
3,903✔
2888
  int32_t spadj = as->T->spadjust;
3,903✔
2889
  if (spadj == 0) {
3,903✔
2890
    p -= ((as->flags & JIT_F_LEA_AGU) ? 7 : 6) + (LJ_64 ? 1 : 0);
6,560✔
2891
  } else {
2892
    MCode *p1;
623✔
2893
    /* Patch stack adjustment. */
2894
    if (checki8(spadj)) {
623✔
2895
      p -= 3;
589✔
2896
      p1 = p-6;
589✔
2897
      *p1 = (MCode)spadj;
589✔
2898
    } else {
2899
      p1 = p-9;
34✔
2900
      *(int32_t *)p1 = spadj;
34✔
2901
    }
2902
    if ((as->flags & JIT_F_LEA_AGU)) {
623✔
2903
#if LJ_64
2904
      p1[-4] = 0x48;
×
2905
#endif
2906
      p1[-3] = (MCode)XI_LEA;
×
2907
      p1[-2] = MODRM(checki8(spadj) ? XM_OFS8 : XM_OFS32, RID_ESP, RID_ESP);
×
2908
      p1[-1] = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
×
2909
    } else {
2910
#if LJ_64
2911
      p1[-3] = 0x48;
623✔
2912
#endif
2913
      p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi);
623✔
2914
      p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP);
623✔
2915
    }
2916
  }
2917
  /* Patch exit branch. */
2918
  target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp;
3,903✔
2919
  *(int32_t *)(p-4) = jmprel(as->J, p, target);
3,903✔
2920
  p[-5] = XI_JMP;
3,903✔
2921
  /* Drop unused mcode tail. Fill with NOPs to make the prefetcher happy. */
2922
  for (q = as->mctop-1; q >= p; q--)
28,630✔
2923
    *q = XI_NOP;
24,727✔
2924
  as->mctop = p;
3,903✔
2925
}
3,903✔
2926
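/* Note: the tail fixup above back-patches the epilogue bytes reserved by
** asm_tail_prep(): it fills in the stack adjustment (add esp, imm8/imm32,
** or lea esp, [esp+imm] on LEA-AGU CPUs), points the final jmp at the
** linked trace or at lj_vm_exit_interp, and NOP-pads the dropped space up
** to the old mctop.
*/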

2927
/* Prepare tail of code. */
2928
static void asm_tail_prep(ASMState *as)
6,506✔
2929
{
2930
  MCode *p = as->mctop;
6,506✔
2931
  /* Realign and leave room for backwards loop branch or exit branch. */
2932
  if (as->realign) {
6,506✔
2933
    int i = ((int)(intptr_t)as->realign) & 15;
1,116✔
2934
    /* Fill unused mcode tail with NOPs to make the prefetcher happy. */
2935
    while (i-- > 0)
9,285✔
2936
      *--p = XI_NOP;
8,169✔
2937
    as->mctop = p;
1,116✔
2938
    p -= (as->loopinv ? 5 : 2);  /* Space for short/near jmp. */
1,142✔
2939
  } else {
2940
    p -= 5;  /* Space for exit branch (near jmp). */
5,390✔
2941
  }
2942
  if (as->loopref) {
6,506✔
2943
    as->invmcp = as->mcp = p;
2,475✔
2944
  } else {
2945
    /* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */
2946
    as->mcp = p - (((as->flags & JIT_F_LEA_AGU) ? 7 : 6)  + (LJ_64 ? 1 : 0));
4,031✔
2947
    as->invmcp = NULL;
4,031✔
2948
  }
2949
}
6,506✔
2950
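/* Note: asm_tail_prep() only reserves space in the tail: 5 bytes for the
** exit branch (2 or 5 bytes plus alignment NOPs when realigning a loop)
** and, for non-looping traces, another 6-8 bytes for the esp adjustment;
** asm_tail_fixup() or asm_loop_fixup() patches the real instructions in
** later.
*/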

2951
/* -- Trace setup --------------------------------------------------------- */
2952

2953
/* Ensure there are enough stack slots for call arguments. */
2954
static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
677✔
2955
{
2956
  IRRef args[CCI_NARGS_MAX*2];
677✔
2957
  int nslots;
677✔
2958
  asm_collectargs(as, ir, ci, args);
677✔
2959
  nslots = asm_count_call_slots(as, ci, args);
677✔
2960
  if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
677✔
2961
    as->evenspill = nslots;
1✔
2962
#if LJ_64
2963
  return irt_isfp(ir->t) ? REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET);
677✔
2964
#else
2965
  return irt_isfp(ir->t) ? REGSP_INIT : REGSP_HINT(RID_RET);
2966
#endif
2967
}
2968
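/* Note: this pre-pass only makes sure as->evenspill is large enough for the
** outgoing stack arguments of a call and hints the result register
** (RID_FPRET for FP results on x64, RID_RET otherwise).
*/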

2969
/* Target-specific setup. */
2970
static void asm_setup_target(ASMState *as)
5,327✔
2971
{
2972
  asm_exitstub_setup(as, as->T->nsnap);
5,327✔
2973
  as->mrm.base = 0;
5,327✔
2974
}
5,327✔
2975

2976
/* -- Trace patching ------------------------------------------------------ */
2977

2978
static const uint8_t map_op1[256] = {
2979
0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x20,
2980
0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,
2981
0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,
2982
0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,
2983
#if LJ_64
2984
0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x14,0x14,0x14,0x14,0x14,0x14,0x14,0x14,
2985
#else
2986
0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,
2987
#endif
2988
0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,
2989
0x51,0x51,0x92,0x92,0x10,0x10,0x12,0x11,0x45,0x86,0x52,0x93,0x51,0x51,0x51,0x51,
2990
0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,
2991
0x93,0x86,0x93,0x93,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,
2992
0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x47,0x51,0x51,0x51,0x51,0x51,
2993
#if LJ_64
2994
0x59,0x59,0x59,0x59,0x51,0x51,0x51,0x51,0x52,0x45,0x51,0x51,0x51,0x51,0x51,0x51,
2995
#else
2996
0x55,0x55,0x55,0x55,0x51,0x51,0x51,0x51,0x52,0x45,0x51,0x51,0x51,0x51,0x51,0x51,
2997
#endif
2998
0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x05,0x05,0x05,0x05,0x05,0x05,0x05,0x05,
2999
0x93,0x93,0x53,0x51,0x70,0x71,0x93,0x86,0x54,0x51,0x53,0x51,0x51,0x52,0x51,0x51,
3000
0x92,0x92,0x92,0x92,0x52,0x52,0x51,0x51,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,
3001
0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x45,0x45,0x47,0x52,0x51,0x51,0x51,0x51,
3002
0x10,0x51,0x10,0x10,0x51,0x51,0x63,0x66,0x51,0x51,0x51,0x51,0x51,0x51,0x92,0x92
3003
};
3004

3005
static const uint8_t map_op2[256] = {
3006
0x93,0x93,0x93,0x93,0x52,0x52,0x52,0x52,0x52,0x52,0x51,0x52,0x51,0x93,0x52,0x94,
3007
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
3008
0x53,0x53,0x53,0x53,0x53,0x53,0x53,0x53,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
3009
0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x34,0x51,0x35,0x51,0x51,0x51,0x51,0x51,
3010
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
3011
0x53,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
3012
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
3013
0x94,0x54,0x54,0x54,0x93,0x93,0x93,0x52,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
3014
0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,
3015
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
3016
0x52,0x52,0x52,0x93,0x94,0x93,0x51,0x51,0x52,0x52,0x52,0x93,0x94,0x93,0x93,0x93,
3017
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x94,0x93,0x93,0x93,0x93,0x93,
3018
0x93,0x93,0x94,0x93,0x94,0x94,0x94,0x93,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,
3019
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
3020
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,
3021
0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x52
3022
};
3023
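/* Note: each table entry encodes an opcode class in its high nibble and a
** base length in its low nibble, which asm_x86_inslen() below interprets
** roughly as: 1 = prefix byte, 2 = 0F escape into map_op2, 4/5 = opcode
** plus immediate, 6 = group-3 opcodes (F6/F7), 7 = VEX C4/C5, 8/9 = a
** ModR/M byte (and possibly SIB/displacement) follows; classes 4 and 8
** shrink by two bytes under an operand-size prefix.
*/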

3024
static uint32_t asm_x86_inslen(const uint8_t* p)
1,380,348✔
3025
{
3026
  uint32_t result = 0;
1,380,348✔
3027
  uint32_t prefixes = 0;
1,380,348✔
3028
  uint32_t x = map_op1[*p];
1,380,348✔
3029
  for (;;) {
2,810,901✔
3030
    switch (x >> 4) {
2,810,901✔
3031
    case 0: return result + x + (prefixes & 4);
88,671✔
3032
    case 1: prefixes |= x; x = map_op1[*++p]; result++; break;
997,319✔
3033
    case 2: x = map_op2[*++p]; break;
433,234✔
3034
    case 3: p++; goto mrm;
123✔
3035
    case 4: result -= (prefixes & 2);  /* fallthrough */
286,307✔
3036
    case 5: return result + (x & 15);
328,797✔
3037
    case 6:  /* Group 3. */
39,218✔
3038
      if (p[1] & 0x38) x = 2;
39,218✔
3039
      else if ((prefixes & 2) && (x == 0x66)) x = 4;
39,202✔
3040
      goto mrm;
39,218✔
3041
    case 7: /* VEX c4/c5. */
3042
      if (LJ_32 && p[1] < 0xc0) {
13,774✔
3043
        x = 2;
3044
        goto mrm;
3045
      }
3046
      if (x == 0x70) {
13,774✔
3047
        x = *++p & 0x1f;
13,774✔
3048
        result++;
13,774✔
3049
        if (x >= 2) {
13,774✔
3050
          p += 2;
13,774✔
3051
          result += 2;
13,774✔
3052
          goto mrm;
13,774✔
3053
        }
3054
      }
3055
      p++;
×
3056
      result++;
×
3057
      x = map_op2[*++p];
×
3058
      break;
×
3059
    case 8: result -= (prefixes & 2);  /* fallthrough */
122,150✔
3060
    case 9: mrm:  /* ModR/M and possibly SIB. */
962,880✔
3061
      result += (x & 15);
962,880✔
3062
      x = *++p;
962,880✔
3063
      switch (x >> 6) {
962,880✔
3064
      case 0: if ((x & 7) == 5) return result + 4; break;
79,301✔
3065
      case 1: result++; break;
431,615✔
3066
      case 2: result += 4; break;
179,065✔
3067
      case 3: return result;
3068
      }
3069
      if ((x & 7) == 4) {
689,959✔
3070
        result++;
11,188✔
3071
        if (x < 0x40 && (p[1] & 7) == 5) result += 4;
11,188✔
3072
      }
3073
      return result;
3074
    }
3075
  }
3076
}
3077

3078
/* Patch exit jumps of existing machine code to a new target. */
3079
void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
3,009✔
3080
{
3081
  MCode *p = T->mcode;
3,009✔
3082
  MCode *mcarea = lj_mcode_patch(J, p, 0);
3,009✔
3083
  MSize len = T->szmcode;
3,009✔
3084
  MCode *px = exitstub_addr(J, exitno) - 6;
3,009✔
3085
  MCode *pe = p+len-6;
3,009✔
3086
#if LJ_GC64
3087
  uint32_t statei = (uint32_t)(GG_OFS(g.vmstate) - GG_OFS(dispatch));
3,009✔
3088
#else
3089
  uint32_t statei = u32ptr(&J2G(J)->vmstate);
3090
#endif
3091
  if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px)
3,009✔
3092
    *(int32_t *)(p+len-4) = jmprel(J, p+len, target);
34✔
3093
  /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */
3094
  for (; p < pe; p += asm_x86_inslen(p)) {
4,119✔
3095
    intptr_t ofs = LJ_GC64 ? (p[0] & 0xf0) == 0x40 : LJ_64;
4,119✔
3096
    if (*(uint32_t *)(p+2+ofs) == statei && p[ofs+LJ_GC64-LJ_64] == XI_MOVmi)
4,119✔
3097
      break;
3098
  }
3099
  lj_assertJ(p < pe, "instruction length decoder failed");
3100
  for (; p < pe; p += asm_x86_inslen(p))
1,382,247✔
3101
    if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px)
1,379,238✔
3102
      *(int32_t *)(p+2) = jmprel(J, p+6, target);
10,444✔
3103
  lj_mcode_sync(T->mcode, T->mcode + T->szmcode);
3,009✔
3104
  lj_mcode_patch(J, mcarea, 1);
3,009✔
3105
}
3,009✔
3106
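/* Note: lj_asm_patchexit() walks the trace's machine code with
** asm_x86_inslen(), first skipping ahead to the vmstate update (so the
** stack-check branch to the parent exit is left alone), then redirects
** every long jcc that targets the stub for this exitno to the new target
** (plus a trailing fallthrough jmp to that stub, if present), and finally
** syncs the instruction cache via lj_mcode_sync().
*/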
