commit 2d02070af019e9896ecf2a63bedc098092fd395d
parent 436d0fc07e4551dd4265cfed0b0bc459c71ed8ce
Author: Quentin Carbonneaux <quentin@c9x.me>
Date: Tue, 16 May 2017 11:47:46 -0400
new hinting in the register allocator
The previous heuristics were ad hoc and it was
hard to understand why they worked at all.
This patch can be summarized in three points:
1. When a register is freed (an instruction
assigns it), we try to find if a temporary
would like to be in it, and if we find one,
we move it in the newly freed register.
I call this an "eager move".
2. Temporaries now remember in what register
they were last allocated; this information
is stored in the field Tmp.visit, and
prevails on the field Tmp.hint when it is
set. (This makes having the same hint for
interfering temporaries not so disastrous.)
3. Blocks are now allocated in "onion" order,
from the innermost loop to the outermost.
This is the change I am the least sure
about; it should be evaluated thorougly.
Diffstat:
M | all.h | | | 5 | +++-- |
M | rega.c | | | 420 | ++++++++++++++++++++++++++++++++++++++++++++++++------------------------------- |
2 files changed, 261 insertions(+), 164 deletions(-)
diff --git a/all.h b/all.h
@@ -288,8 +288,9 @@ struct Tmp {
short slot; /* -1 for unset */
short cls;
struct {
- int r;
- bits m;
+ int r; /* register or -1 */
+ int w; /* weight */
+ bits m; /* avoid these registers */
} hint;
int phi;
Alias alias;
diff --git a/rega.c b/rega.c
@@ -10,6 +10,7 @@ typedef struct RMap RMap;
struct RMap {
int t[Tmp0];
int r[Tmp0];
+ int w[Tmp0]; /* wait list, for unmatched hints */
BSet b[1];
int n;
};
@@ -20,8 +21,12 @@ static Mem *mem; /* function mem references */
static struct {
Ref src, dst;
int cls;
-} *pm; /* parallel move constructed */
-static int cpm, npm; /* capacity and size of pm */
+} pm[Tmp0]; /* parallel move constructed */
+static int npm; /* size of pm */
+static int loop; /* current loop level */
+
+static uint stmov; /* stats: added moves */
+static uint stblk; /* stats: added blocks */
static int *
hint(int t)
@@ -32,21 +37,21 @@ hint(int t)
static void
sethint(int t, int r)
{
- bits m;
+ Tmp *p;
- m = tmp[phicls(t, tmp)].hint.m;
- if (*hint(t) == -1)
- if (!(BIT(r) & m))
- *hint(t) = r;
+ p = &tmp[phicls(t, tmp)];
+ if (p->hint.r == -1 || p->hint.w > loop) {
+ p->hint.r = r;
+ p->hint.w = loop;
+ tmp[t].visit = -1;
+ }
}
static void
rcopy(RMap *ma, RMap *mb)
{
- memcpy(ma->t, mb->t, sizeof ma->t);
- memcpy(ma->r, mb->r, sizeof ma->r);
+ memcpy(ma, mb, sizeof *ma);
bscopy(ma->b, mb->b);
- ma->n = mb->n;
}
static int
@@ -93,10 +98,10 @@ radd(RMap *m, int t, int r)
}
static Ref
-ralloc(RMap *m, int t)
+ralloctry(RMap *m, int t, int try)
{
bits regs;
- int r, r0, r1;
+ int h, r, r0, r1;
if (t < Tmp0) {
assert(bshas(m->b, t));
@@ -107,8 +112,12 @@ ralloc(RMap *m, int t)
assert(r != -1);
return TMP(r);
}
- r = *hint(t);
+ r = tmp[t].visit;
+ if (r == -1 || bshas(m->b, r))
+ r = *hint(t);
if (r == -1 || bshas(m->b, r)) {
+ if (try)
+ return R;
regs = tmp[phicls(t, tmp)].hint.m;
regs |= m->b->t[0];
if (KBASE(tmp[t].cls) == 0) {
@@ -129,9 +138,19 @@ ralloc(RMap *m, int t)
Found:
radd(m, t, r);
sethint(t, r);
+ tmp[t].visit = r;
+ h = *hint(t);
+ if (h != -1 && h != r)
+ m->w[h] = t;
return TMP(r);
}
+static inline Ref
+ralloc(RMap *m, int t)
+{
+ return ralloctry(m, t, 0);
+}
+
static int
rfree(RMap *m, int t)
{
@@ -168,12 +187,8 @@ mdump(RMap *m)
static void
pmadd(Ref src, Ref dst, int k)
{
- if (npm == cpm) {
- cpm = cpm * 2 + 16;
- pm = realloc(pm, cpm * sizeof pm[0]);
- if (!pm)
- die("pmadd, out of memory");
- }
+ if (npm == Tmp0)
+ die("cannot have more moves than registers");
pm[npm].src = src;
pm[npm].dst = dst;
pm[npm].cls = k;
@@ -182,77 +197,61 @@ pmadd(Ref src, Ref dst, int k)
enum PMStat { ToMove, Moving, Moved };
-static Ref
+static int
pmrec(enum PMStat *status, int i, int *k)
{
- Ref swp, swp1;
- int j, k1;
+ int j, c;
/* note, this routine might emit
- * too many large instructions:
- *
- * , x -- x
- * x -- x -- x |
- * ` x -- x
- *
- * if only the first move is wide
- * the whole cycle will be wide,
- * this is safe but not necessary
+ * too many large instructions
*/
-
- if (req(pm[i].src, pm[i].dst))
- return R;
- status[i] = Moving;
- assert(KBASE(*k) == KBASE(pm[i].cls));
- assert((Kw|1) == Kl && (Ks|1) == Kd);
- *k |= KWIDE(pm[i].cls); /* see above */
- swp = R;
- for (j=0; j<npm; j++) {
- if (req(pm[j].src, pm[i].dst))
- switch (status[j]) {
- case ToMove:
- k1 = *k;
- swp1 = pmrec(status, j, &k1);
- if (!req(swp1, R)) {
- assert(req(swp, R));
- swp = swp1;
- *k = k1;
- }
- break;
- case Moving:
- assert(req(swp, R));
- swp = pm[i].dst;
- break;
- case Moved:
- break;
- }
+ if (req(pm[i].src, pm[i].dst)) {
+ status[i] = Moved;
+ return -1;
+ }
+ assert(KBASE(pm[i].cls) == KBASE(*k));
+ assert((Kw|Kl) == Kl && (Ks|Kd) == Kd);
+ *k |= pm[i].cls;
+ for (j=0; j<npm; j++)
+ if (req(pm[j].dst, pm[i].src))
+ break;
+ switch (j == npm ? Moved : status[j]) {
+ case Moving:
+ c = j; /* start of cycle */
+ emit(Oswap, *k, R, pm[i].src, pm[i].dst);
+ break;
+ case ToMove:
+ status[i] = Moving;
+ c = pmrec(status, j, k);
+ if (c == i) {
+ c = -1; /* end of cycle */
+ break;
+ }
+ if (c != -1) {
+ emit(Oswap, *k, R, pm[i].src, pm[i].dst);
+ break;
+ }
+ /* fall through */
+ case Moved:
+ c = -1;
+ emit(Ocopy, pm[i].cls, pm[i].dst, pm[i].src, R);
+ break;
}
status[i] = Moved;
- if (req(swp, R)) {
- *curi++ = (Ins){Ocopy, pm[i].dst, {pm[i].src}, pm[i].cls};
- return R;
- } else if (!req(swp, pm[i].src)) {
- *curi++ = (Ins){Oswap, R, {pm[i].src, pm[i].dst}, *k};
- return swp;
- } else
- return R;
-
+ return c;
}
static void
pmgen()
{
- int i, k;
+ int i;
enum PMStat *status;
status = alloc(npm * sizeof status[0]);
assert(!npm || status[npm-1] == ToMove);
- curi = insb;
for (i=0; i<npm; i++)
- if (status[i] == ToMove) {
- k = pm[i].cls;
- pmrec(status, i, &k);
- }
+ if (status[i] == ToMove)
+ pmrec(status, i, (int[]){pm[i].cls});
}
static void
@@ -261,8 +260,9 @@ move(int r, Ref to, RMap *m)
int n, t, r1;
r1 = req(to, R) ? -1 : rfree(m, to.val);
- if (bshas(m->b, r) && r1 != r) {
+ if (bshas(m->b, r)) {
/* r is used and not by to */
+ assert(r1 != r);
for (n=0; m->r[n] != r; n++)
assert(n+1 < m->n);
t = m->t[n];
@@ -286,10 +286,11 @@ dopm(Blk *b, Ins *i, RMap *m)
{
RMap m0;
int n, r, r1, t, s;
- Ins *i0, *i1, *ip, *ir;
+ Ins *i1, *ip;
bits def;
- m0 = *m;
+ m0 = *m; /* okay since we don't use m0.b */
+ m0.b->t = 0;
i1 = ++i;
do {
i--;
@@ -320,21 +321,11 @@ dopm(Blk *b, Ins *i, RMap *m)
radd(m, r, r);
}
pmgen();
-#ifdef TEST_PMOV
- return 0;
-#endif
- n = b->nins - (i1 - i) + (curi - insb);
- i0 = alloc(n * sizeof(Ins));
- ip = icpy(ip = i0, b->ins, i - b->ins);
- ip = icpy(ir = ip, insb, curi - insb);
- ip = icpy(ip, i1, &b->ins[b->nins] - i1);
- b->nins = n;
- b->ins = i0;
- return ir;
+ return i;
}
static int
-prio(Ref r1, Ref r2)
+prio1(Ref r1, Ref r2)
{
/* trivial heuristic to begin with,
* later we can use the distance to
@@ -350,7 +341,7 @@ insert(Ref *r, Ref **rs, int p)
int i;
rs[i = p] = r;
- while (i-- > 0 && prio(*r, *rs[i])) {
+ while (i-- > 0 && prio1(*r, *rs[i])) {
rs[i+1] = rs[i];
rs[i] = r;
}
@@ -359,9 +350,9 @@ insert(Ref *r, Ref **rs, int p)
static void
doblk(Blk *b, RMap *cur)
{
- int x, r, nr;
+ int t, x, r, rf, rt, nr;
bits rs;
- Ins *i;
+ Ins *i, *i1;
Mem *m;
Ref *ra[4];
@@ -369,8 +360,12 @@ doblk(Blk *b, RMap *cur)
radd(cur, r, r);
if (rtype(b->jmp.arg) == RTmp)
b->jmp.arg = ralloc(cur, b->jmp.arg.val);
- for (i=&b->ins[b->nins]; i!=b->ins;) {
- switch ((--i)->op) {
+ curi = &insb[NIns];
+ for (i1=&b->ins[b->nins]; i1!=b->ins;) {
+ emiti(*--i1);
+ i = curi;
+ rf = -1;
+ switch (i->op) {
case Ocall:
rs = T.argregs(i->arg[1], 0) | T.rglob;
for (r=0; T.rsave[r]>=0; r++)
@@ -378,8 +373,10 @@ doblk(Blk *b, RMap *cur)
rfree(cur, T.rsave[r]);
break;
case Ocopy:
- if (isreg(i->arg[0])) {
- i = dopm(b, i, cur);
+ if (regcpy(i)) {
+ curi++;
+ i1 = dopm(b, i1, cur);
+ stmov += i+1 - curi;
continue;
}
if (isreg(i->to))
@@ -390,14 +387,15 @@ doblk(Blk *b, RMap *cur)
if (!req(i->to, R)) {
assert(rtype(i->to) == RTmp);
r = i->to.val;
- if (r >= Tmp0 || !(BIT(r) & T.rglob))
- r = rfree(cur, r);
- if (r == -1) {
+ if (r < Tmp0 && (BIT(r) & T.rglob))
+ break;
+ rf = rfree(cur, r);
+ if (rf == -1) {
assert(!isreg(i->to));
- *i = (Ins){.op = Onop};
+ curi++;
continue;
}
- i->to = TMP(r);
+ i->to = TMP(rf);
}
break;
}
@@ -416,7 +414,57 @@ doblk(Blk *b, RMap *cur)
}
for (r=0; r<nr; r++)
*ra[r] = ralloc(cur, ra[r]->val);
+
+ /* try to change the register of a hinted
+ * temporary if rf is available */
+ x = 1;
+ if (rf != -1 && (t = cur->w[rf]) != 0)
+ if (!bshas(cur->b, rf) && *hint(t) == rf
+ && (rt = rfree(cur, t)) != -1) {
+ tmp[t].visit = -1;
+ ralloc(cur, t);
+ assert(bshas(cur->b, rf));
+ emit(Ocopy, tmp[t].cls, TMP(rt), TMP(rf), R);
+ stmov += 1;
+ cur->w[rf] = 0;
+ for (r=0; r<nr; r++)
+ if (req(*ra[r], TMP(rt)))
+ *ra[r] = TMP(rf);
+ /* one could iterate this logic with
+ * the newly freed rt, but in this case
+ * the above loop must be changed */
+ }
}
+ b->nins = &insb[NIns] - curi;
+ idup(&b->ins, curi, b->nins);
+}
+
+/* qsort() comparison function to peel
+ * loop nests from inside out */
+static int
+carve(const void *a, const void *b)
+{
+ Blk *ba, *bb;
+
+ /* todo, evaluate if this order is really
+ * better than the simple postorder */
+ ba = *(Blk**)a;
+ bb = *(Blk**)b;
+ if (ba->loop == bb->loop)
+ return ba->id > bb->id ? -1 : ba->id < bb->id;
+ return ba->loop > bb->loop ? -1 : +1;
+}
+
+/* comparison function to order temporaries
+ * for allocation at the end of blocks */
+static int
+prio2(int t1, int t2)
+{
+ if ((tmp[t1].visit ^ tmp[t2].visit) < 0) /* != signs */
+ return tmp[t1].visit != -1 ? +1 : -1;
+ if ((*hint(t1) ^ *hint(t2)) < 0)
+ return *hint(t1) != -1 ? +1 : -1;
+ return tmp[t1].cost - tmp[t2].cost;
}
/* register allocation
@@ -425,18 +473,21 @@ doblk(Blk *b, RMap *cur)
void
rega(Fn *fn)
{
- int j, t, r, r1, x, rl[Tmp0];
- Blk *b, *b1, *s, ***ps, *blist;
- RMap *end, *beg, cur, old;
+ int j, t, r, x, rl[Tmp0];
+ Blk *b, *b1, *s, ***ps, *blist, **blk, **bp;
+ RMap *end, *beg, cur, old, *m;
Ins *i;
Phi *p;
uint u, n;
Ref src, dst;
/* 1. setup */
+ stmov = 0;
+ stblk = 0;
regu = 0;
tmp = fn->tmp;
mem = fn->mem;
+ blk = alloc(fn->nblk * sizeof blk[0]);
end = alloc(fn->nblk * sizeof end[0]);
beg = alloc(fn->nblk * sizeof beg[0]);
for (n=0; n<fn->nblk; n++) {
@@ -446,8 +497,15 @@ rega(Fn *fn)
bsinit(cur.b, fn->ntmp);
bsinit(old.b, fn->ntmp);
- for (t=0; t<fn->ntmp; t++)
- *hint(t) = t < Tmp0 ? t : -1;
+ loop = INT_MAX;
+ for (t=0; t<fn->ntmp; t++) {
+ tmp[t].hint.r = t < Tmp0 ? t : -1;
+ tmp[t].hint.w = loop;
+ tmp[t].visit = -1;
+ }
+ for (bp=blk, b=fn->start; b; b=b->link)
+ *bp++ = b;
+ qsort(blk, fn->nblk, sizeof blk[0], carve);
for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
if (i->op != Ocopy || !isreg(i->arg[0]))
break;
@@ -456,71 +514,103 @@ rega(Fn *fn)
sethint(i->to.val, i->arg[0].val);
}
- /* 2. assign registers following post-order */
- for (n=fn->nblk-1; n!=-1u; n--) {
- b = fn->rpo[n];
+ /* 2. assign registers */
+ for (bp=blk; bp<&blk[fn->nblk]; bp++) {
+ b = *bp;
+ n = b->id;
+ loop = b->loop;
cur.n = 0;
bszero(cur.b);
- for (x=0; x<2; x++)
- for (t=Tmp0; bsiter(b->out, &t); t++)
- if (x || (r=*hint(t)) != -1)
- if (x || !bshas(cur.b, r))
- ralloc(&cur, t);
+ memset(cur.w, 0, sizeof cur.w);
+ for (x=0, t=Tmp0; bsiter(b->out, &t); t++) {
+ j = x++;
+ rl[j] = t;
+ while (j-- > 0 && prio2(t, rl[j]) > 0) {
+ rl[j+1] = rl[j];
+ rl[j] = t;
+ }
+ }
+ for (j=0; j<x; j++)
+ ralloctry(&cur, rl[j], 1);
+ for (j=0; j<x; j++)
+ ralloc(&cur, rl[j]);
rcopy(&end[n], &cur);
doblk(b, &cur);
bscopy(b->in, cur.b);
for (p=b->phi; p; p=p->link)
- if (rtype(p->to) == RTmp) {
+ if (rtype(p->to) == RTmp)
bsclr(b->in, p->to.val);
- /* heuristic 0:
- * if the phi destination has an
- * argument from a frequent block
- * that was already allocated to
- * 'r', use 'r' as the new hint
- */
- memset(rl, 0, sizeof rl);
- for (u=0; u<p->narg; u++) {
- t = p->arg[u].val;
- b1 = p->blk[u];
- if (rtype(p->arg[u]) == RTmp)
- if ((r=rfind(&end[b1->id], t)) != -1)
- rl[r] += b1->loop;
- }
- for (x=0, j=0; j<Tmp0; j++)
- if (rl[j] > rl[x])
- x = j;
- if (rl[x] >= b->loop)
- *hint(p->to.val) = x;
+ rcopy(&beg[n], &cur);
+ }
+
+ /* 3. emit copies shared by multiple edges
+ * to the same block */
+ for (s=fn->start; s; s=s->link) {
+ if (s->npred <= 1)
+ continue;
+ m = &beg[s->id];
+
+ /* rl maps a register that is live at the
+ * beginning of s to the one used in all
+ * predecessors (if any, -1 otherwise) */
+ memset(rl, 0, sizeof rl);
+
+ /* to find the register of a phi in a
+ * predecessor, we have to find the
+ * corresponding argument */
+ for (p=s->phi; p; p=p->link) {
+ r = rfind(m, p->to.val);
+ if (r == -1)
+ continue;
+ for (u=0; u<p->narg; u++) {
+ b = p->blk[u];
+ src = p->arg[u];
+ if (rtype(src) != RTmp)
+ continue;
+ x = rfind(&end[b->id], src.val);
+ assert(x != -1);
+ rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
}
- if (b->npred > 1) {
- /* heuristic 1:
- * attempt to satisfy hints
- * when it's simple and we have
- * multiple predecessors
- */
- rcopy(&old, &cur);
- curi = &insb[NIns];
- for (j=0; j<old.n; j++) {
- t = old.t[j];
- r = *hint(t);
- r1 = rfind(&cur, t);
- if (r != -1 && r != r1)
- if (!bshas(cur.b, r)) {
- rfree(&cur, t);
- radd(&cur, t, r);
- x = tmp[t].cls;
- emit(Ocopy, x, TMP(r1), TMP(r), R);
- }
+ if (rl[r] == 0)
+ rl[r] = -1;
+ }
+
+ /* process non-phis temporaries */
+ for (j=0; j<m->n; j++) {
+ t = m->t[j];
+ r = m->r[j];
+ if (rl[r] || t < Tmp0 /* todo, remove this */)
+ continue;
+ for (bp=s->pred; bp<&s->pred[s->npred]; bp++) {
+ x = rfind(&end[(*bp)->id], t);
+ assert(x != -1);
+ rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
}
- if ((j = &insb[NIns] - curi)) {
- b->nins += j;
- i = alloc(b->nins * sizeof(Ins));
- icpy(icpy(i, curi, j), b->ins, b->nins-j);
- b->ins = i;
+ }
+
+ npm = 0;
+ for (j=0; j<m->n; j++) {
+ t = m->t[j];
+ r = m->r[j];
+ x = rl[r];
+ assert(x != 0 || t < Tmp0 /* todo, ditto */);
+ if (x > 0) {
+ pmadd(TMP(x), TMP(r), tmp[t].cls);
+ m->r[j] = x;
}
}
- rcopy(&beg[n], &cur);
+ curi = &insb[NIns];
+ pmgen();
+ j = &insb[NIns] - curi;
+ if (j == 0)
+ continue;
+ stmov += j;
+ s->nins += j;
+ i = alloc(s->nins * sizeof(Ins));
+ icpy(icpy(i, curi, j), s->ins, s->nins-j);
+ s->ins = i;
}
+
if (debug['R']) {
fprintf(stderr, "\n> Register mappings:\n");
for (n=0; n<fn->nblk; n++) {
@@ -533,7 +623,7 @@ rega(Fn *fn)
fprintf(stderr, "\n");
}
- /* 3. compose glue code */
+ /* 4. emit remaining copies in new blocks */
blist = 0;
for (b=fn->start;; b=b->link) {
ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}};
@@ -560,8 +650,9 @@ rega(Fn *fn)
dst = rref(&beg[s->id], t);
pmadd(src, dst, tmp[t].cls);
}
+ curi = &insb[NIns];
pmgen();
- if (curi == insb)
+ if (curi == &insb[NIns])
continue;
b1 = blknew();
b1->loop = (b->loop+s->loop) / 2;
@@ -569,8 +660,10 @@ rega(Fn *fn)
blist = b1;
fn->nblk++;
sprintf(b1->name, "%s_%s", b->name, s->name);
- b1->nins = curi - insb;
- idup(&b1->ins, insb, b1->nins);
+ b1->nins = &insb[NIns] - curi;
+ stmov += b1->nins;
+ stblk += 1;
+ idup(&b1->ins, curi, b1->nins);
b1->jmp.type = Jjmp;
b1->s1 = s;
**ps = b1;
@@ -585,6 +678,9 @@ rega(Fn *fn)
fn->reg = regu;
if (debug['R']) {
+ fprintf(stderr, "\n> Register allocation statistics:\n");
+ fprintf(stderr, "\tnew moves: %d\n", stmov);
+ fprintf(stderr, "\tnew blocks: %d\n", stblk);
fprintf(stderr, "\n> After register allocation:\n");
printfn(fn, stderr);
}