new hinting in the register allocator - qbe - Internal scc patchset buffer for QBE

commit 2d02070af019e9896ecf2a63bedc098092fd395d
parent 436d0fc07e4551dd4265cfed0b0bc459c71ed8ce
Author: Quentin Carbonneaux <quentin@c9x.me>
Date:   Tue, 16 May 2017 11:47:46 -0400

new hinting in the register allocator

The previous heuristics were ad hoc and it was
hard to understand why they worked at all.

This patch can be summarized in three points:

 1. When a register is freed (an instruction
    assigns it), we try to find if a temporary
    would like to be in it, and if we find one,
    we move it in the newly freed register.
    I call this an "eager move".

 2. Temporaries now remember in what register
    they were last allocated; this information
    is stored in the field Tmp.visit, and
    prevails on the field Tmp.hint when it is
    set.  (This makes having the same hint for
    interfering temporaries not so disastrous.)

 3. Blocks are now allocated in "onion" order,
    from the innermost loop to the outermost.
    This is the change I am the least sure
    about; it should be evaluated thorougly.

Diffstat:
M all.h  | 5 +++--
M rega.c  | 420 ++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------

2 files changed, 261 insertions(+), 164 deletions(-)
diff --git a/all.h b/all.h
@@ -288,8 +288,9 @@ struct Tmp {
 	short slot; /* -1 for unset */
 	short cls;
 	struct {
-		int r;
-		bits m;
+		int r;  /* register or -1 */
+		int w;  /* weight */
+		bits m; /* avoid these registers */
 	} hint;
 	int phi;
 	Alias alias;
diff --git a/rega.c b/rega.c
@@ -10,6 +10,7 @@ typedef struct RMap RMap;
 struct RMap {
 	int t[Tmp0];
 	int r[Tmp0];
+	int w[Tmp0];   /* wait list, for unmatched hints */
 	BSet b[1];
 	int n;
 };
@@ -20,8 +21,12 @@ static Mem *mem;       /* function mem references */
 static struct {
 	Ref src, dst;
 	int cls;
-} *pm;                 /* parallel move constructed */
-static int cpm, npm;   /* capacity and size of pm */
+} pm[Tmp0];            /* parallel move constructed */
+static int npm;        /* size of pm */
+static int loop;       /* current loop level */
+
+static uint stmov;     /* stats: added moves */
+static uint stblk;     /* stats: added blocks */
 
 static int *
 hint(int t)
@@ -32,21 +37,21 @@ hint(int t)
 static void
 sethint(int t, int r)
 {
-	bits m;
+	Tmp *p;
 
-	m = tmp[phicls(t, tmp)].hint.m;
-	if (*hint(t) == -1)
-	if (!(BIT(r) & m))
-		*hint(t) = r;
+	p = &tmp[phicls(t, tmp)];
+	if (p->hint.r == -1 || p->hint.w > loop) {
+		p->hint.r = r;
+		p->hint.w = loop;
+		tmp[t].visit = -1;
+	}
 }
 
 static void
 rcopy(RMap *ma, RMap *mb)
 {
-	memcpy(ma->t, mb->t, sizeof ma->t);
-	memcpy(ma->r, mb->r, sizeof ma->r);
+	memcpy(ma, mb, sizeof *ma);
 	bscopy(ma->b, mb->b);
-	ma->n = mb->n;
 }
 
 static int
@@ -93,10 +98,10 @@ radd(RMap *m, int t, int r)
 }
 
 static Ref
-ralloc(RMap *m, int t)
+ralloctry(RMap *m, int t, int try)
 {
 	bits regs;
-	int r, r0, r1;
+	int h, r, r0, r1;
 
 	if (t < Tmp0) {
 		assert(bshas(m->b, t));
@@ -107,8 +112,12 @@ ralloc(RMap *m, int t)
 		assert(r != -1);
 		return TMP(r);
 	}
-	r = *hint(t);
+	r = tmp[t].visit;
+	if (r == -1 || bshas(m->b, r))
+		r = *hint(t);
 	if (r == -1 || bshas(m->b, r)) {
+		if (try)
+			return R;
 		regs = tmp[phicls(t, tmp)].hint.m;
 		regs |= m->b->t[0];
 		if (KBASE(tmp[t].cls) == 0) {
@@ -129,9 +138,19 @@ ralloc(RMap *m, int t)
 Found:
 	radd(m, t, r);
 	sethint(t, r);
+	tmp[t].visit = r;
+	h = *hint(t);
+	if (h != -1 && h != r)
+		m->w[h] = t;
 	return TMP(r);
 }
 
+static inline Ref
+ralloc(RMap *m, int t)
+{
+	return ralloctry(m, t, 0);
+}
+
 static int
 rfree(RMap *m, int t)
 {
@@ -168,12 +187,8 @@ mdump(RMap *m)
 static void
 pmadd(Ref src, Ref dst, int k)
 {
-	if (npm == cpm) {
-		cpm = cpm * 2 + 16;
-		pm = realloc(pm, cpm * sizeof pm[0]);
-		if (!pm)
-			die("pmadd, out of memory");
-	}
+	if (npm == Tmp0)
+		die("cannot have more moves than registers");
 	pm[npm].src = src;
 	pm[npm].dst = dst;
 	pm[npm].cls = k;
@@ -182,77 +197,61 @@ pmadd(Ref src, Ref dst, int k)
 
 enum PMStat { ToMove, Moving, Moved };
 
-static Ref
+static int
 pmrec(enum PMStat *status, int i, int *k)
 {
-	Ref swp, swp1;
-	int j, k1;
+	int j, c;
 
 	/* note, this routine might emit
-	 * too many large instructions:
-	 *
-	 *                  , x -- x
-	 *      x -- x -- x        |
-	 *                  ` x -- x
-	 *
-	 * if only the first move is wide
-	 * the whole cycle will be wide,
-	 * this is safe but not necessary
+	 * too many large instructions
 	 */
-
-	if (req(pm[i].src, pm[i].dst))
-		return R;
-	status[i] = Moving;
-	assert(KBASE(*k) == KBASE(pm[i].cls));
-	assert((Kw|1) == Kl && (Ks|1) == Kd);
-	*k |= KWIDE(pm[i].cls); /* see above */
-	swp = R;
-	for (j=0; j<npm; j++) {
-		if (req(pm[j].src, pm[i].dst))
-			switch (status[j]) {
-			case ToMove:
-				k1 = *k;
-				swp1 = pmrec(status, j, &k1);
-				if (!req(swp1, R)) {
-					assert(req(swp, R));
-					swp = swp1;
-					*k = k1;
-				}
-				break;
-			case Moving:
-				assert(req(swp, R));
-				swp = pm[i].dst;
-				break;
-			case Moved:
-				break;
-			}
+	if (req(pm[i].src, pm[i].dst)) {
+		status[i] = Moved;
+		return -1;
+	}
+	assert(KBASE(pm[i].cls) == KBASE(*k));
+	assert((Kw|Kl) == Kl && (Ks|Kd) == Kd);
+	*k |= pm[i].cls;
+	for (j=0; j<npm; j++)
+		if (req(pm[j].dst, pm[i].src))
+			break;
+	switch (j == npm ? Moved : status[j]) {
+	case Moving:
+		c = j; /* start of cycle */
+		emit(Oswap, *k, R, pm[i].src, pm[i].dst);
+		break;
+	case ToMove:
+		status[i] = Moving;
+		c = pmrec(status, j, k);
+		if (c == i) {
+			c = -1; /* end of cycle */
+			break;
+		}
+		if (c != -1) {
+			emit(Oswap, *k, R, pm[i].src, pm[i].dst);
+			break;
+		}
+		/* fall through */
+	case Moved:
+		c = -1;
+		emit(Ocopy, pm[i].cls, pm[i].dst, pm[i].src, R);
+		break;
 	}
 	status[i] = Moved;
-	if (req(swp, R)) {
-		*curi++ = (Ins){Ocopy, pm[i].dst, {pm[i].src}, pm[i].cls};
-		return R;
-	} else if (!req(swp, pm[i].src)) {
-		*curi++ = (Ins){Oswap, R, {pm[i].src, pm[i].dst}, *k};
-		return swp;
-	} else
-		return R;
-
+	return c;
 }
 
 static void
 pmgen()
 {
-	int i, k;
+	int i;
 	enum PMStat *status;
 
 	status = alloc(npm * sizeof status[0]);
 	assert(!npm || status[npm-1] == ToMove);
-	curi = insb;
 	for (i=0; i<npm; i++)
-		if (status[i] == ToMove) {
-			k = pm[i].cls;
-			pmrec(status, i, &k);
-		}
+		if (status[i] == ToMove)
+			pmrec(status, i, (int[]){pm[i].cls});
 }
 
 static void
@@ -261,8 +260,9 @@ move(int r, Ref to, RMap *m)
 	int n, t, r1;
 
 	r1 = req(to, R) ? -1 : rfree(m, to.val);
-	if (bshas(m->b, r) && r1 != r) {
+	if (bshas(m->b, r)) {
 		/* r is used and not by to */
+		assert(r1 != r);
 		for (n=0; m->r[n] != r; n++)
 			assert(n+1 < m->n);
 		t = m->t[n];
@@ -286,10 +286,11 @@ dopm(Blk *b, Ins *i, RMap *m)
 {
 	RMap m0;
 	int n, r, r1, t, s;
-	Ins *i0, *i1, *ip, *ir;
+	Ins *i1, *ip;
 	bits def;
 
-	m0 = *m;
+	m0 = *m; /* okay since we don't use m0.b */
+	m0.b->t = 0;
 	i1 = ++i;
 	do {
 		i--;
@@ -320,21 +321,11 @@ dopm(Blk *b, Ins *i, RMap *m)
 			radd(m, r, r);
 	}
 	pmgen();
-#ifdef TEST_PMOV
-	return 0;
-#endif
-	n = b->nins - (i1 - i) + (curi - insb);
-	i0 = alloc(n * sizeof(Ins));
-	ip = icpy(ip = i0, b->ins, i - b->ins);
-	ip = icpy(ir = ip, insb, curi - insb);
-	ip = icpy(ip, i1, &b->ins[b->nins] - i1);
-	b->nins = n;
-	b->ins = i0;
-	return ir;
+	return i;
 }
 
 static int
-prio(Ref r1, Ref r2)
+prio1(Ref r1, Ref r2)
 {
 	/* trivial heuristic to begin with,
 	 * later we can use the distance to
@@ -350,7 +341,7 @@ insert(Ref *r, Ref **rs, int p)
 	int i;
 
 	rs[i = p] = r;
-	while (i-- > 0 && prio(*r, *rs[i])) {
+	while (i-- > 0 && prio1(*r, *rs[i])) {
 		rs[i+1] = rs[i];
 		rs[i] = r;
 	}
@@ -359,9 +350,9 @@ insert(Ref *r, Ref **rs, int p)
 static void
 doblk(Blk *b, RMap *cur)
 {
-	int x, r, nr;
+	int t, x, r, rf, rt, nr;
 	bits rs;
-	Ins *i;
+	Ins *i, *i1;
 	Mem *m;
 	Ref *ra[4];
 
@@ -369,8 +360,12 @@ doblk(Blk *b, RMap *cur)
 		radd(cur, r, r);
 	if (rtype(b->jmp.arg) == RTmp)
 		b->jmp.arg = ralloc(cur, b->jmp.arg.val);
-	for (i=&b->ins[b->nins]; i!=b->ins;) {
-		switch ((--i)->op) {
+	curi = &insb[NIns];
+	for (i1=&b->ins[b->nins]; i1!=b->ins;) {
+		emiti(*--i1);
+		i = curi;
+		rf = -1;
+		switch (i->op) {
 		case Ocall:
 			rs = T.argregs(i->arg[1], 0) | T.rglob;
 			for (r=0; T.rsave[r]>=0; r++)
@@ -378,8 +373,10 @@ doblk(Blk *b, RMap *cur)
 					rfree(cur, T.rsave[r]);
 			break;
 		case Ocopy:
-			if (isreg(i->arg[0])) {
-				i = dopm(b, i, cur);
+			if (regcpy(i)) {
+				curi++;
+				i1 = dopm(b, i1, cur);
+				stmov += i+1 - curi;
 				continue;
 			}
 			if (isreg(i->to))
@@ -390,14 +387,15 @@ doblk(Blk *b, RMap *cur)
 			if (!req(i->to, R)) {
 				assert(rtype(i->to) == RTmp);
 				r = i->to.val;
-				if (r >= Tmp0 || !(BIT(r) & T.rglob))
-					r = rfree(cur, r);
-				if (r == -1) {
+				if (r < Tmp0 && (BIT(r) & T.rglob))
+					break;
+				rf = rfree(cur, r);
+				if (rf == -1) {
 					assert(!isreg(i->to));
-					*i = (Ins){.op = Onop};
+					curi++;
 					continue;
 				}
-				i->to = TMP(r);
+				i->to = TMP(rf);
 			}
 			break;
 		}
@@ -416,7 +414,57 @@ doblk(Blk *b, RMap *cur)
 			}
 		for (r=0; r<nr; r++)
 			*ra[r] = ralloc(cur, ra[r]->val);
+
+		/* try to change the register of a hinted
+		 * temporary if rf is available */
+		x = 1;
+		if (rf != -1 && (t = cur->w[rf]) != 0)
+		if (!bshas(cur->b, rf) && *hint(t) == rf
+		&& (rt = rfree(cur, t)) != -1) {
+			tmp[t].visit = -1;
+			ralloc(cur, t);
+			assert(bshas(cur->b, rf));
+			emit(Ocopy, tmp[t].cls, TMP(rt), TMP(rf), R);
+			stmov += 1;
+			cur->w[rf] = 0;
+			for (r=0; r<nr; r++)
+				if (req(*ra[r], TMP(rt)))
+					*ra[r] = TMP(rf);
+			/* one could iterate this logic with
+			 * the newly freed rt, but in this case
+			 * the above loop must be changed */
+		}
 	}
+	b->nins = &insb[NIns] - curi;
+	idup(&b->ins, curi, b->nins);
+}
+
+/* qsort() comparison function to peel
+ * loop nests from inside out */
+static int
+carve(const void *a, const void *b)
+{
+	Blk *ba, *bb;
+
+	/* todo, evaluate if this order is really
+	 * better than the simple postorder */
+	ba = *(Blk**)a;
+	bb = *(Blk**)b;
+	if (ba->loop == bb->loop)
+		return ba->id > bb->id ? -1 : ba->id < bb->id;
+	return ba->loop > bb->loop ? -1 : +1;
+}
+
+/* comparison function to order temporaries
+ * for allocation at the end of blocks */
+static int
+prio2(int t1, int t2)
+{
+	if ((tmp[t1].visit ^ tmp[t2].visit) < 0)  /* != signs */
+		return tmp[t1].visit != -1 ? +1 : -1;
+	if ((*hint(t1) ^ *hint(t2)) < 0)
+		return *hint(t1) != -1 ? +1 : -1;
+	return tmp[t1].cost - tmp[t2].cost;
 }
 
 /* register allocation
@@ -425,18 +473,21 @@ doblk(Blk *b, RMap *cur)
 void
 rega(Fn *fn)
 {
-	int j, t, r, r1, x, rl[Tmp0];
-	Blk *b, *b1, *s, ***ps, *blist;
-	RMap *end, *beg, cur, old;
+	int j, t, r, x, rl[Tmp0];
+	Blk *b, *b1, *s, ***ps, *blist, **blk, **bp;
+	RMap *end, *beg, cur, old, *m;
 	Ins *i;
 	Phi *p;
 	uint u, n;
 	Ref src, dst;
 
 	/* 1. setup */
+	stmov = 0;
+	stblk = 0;
 	regu = 0;
 	tmp = fn->tmp;
 	mem = fn->mem;
+	blk = alloc(fn->nblk * sizeof blk[0]);
 	end = alloc(fn->nblk * sizeof end[0]);
 	beg = alloc(fn->nblk * sizeof beg[0]);
 	for (n=0; n<fn->nblk; n++) {
@@ -446,8 +497,15 @@ rega(Fn *fn)
 	bsinit(cur.b, fn->ntmp);
 	bsinit(old.b, fn->ntmp);
 
-	for (t=0; t<fn->ntmp; t++)
-		*hint(t) = t < Tmp0 ? t : -1;
+	loop = INT_MAX;
+	for (t=0; t<fn->ntmp; t++) {
+		tmp[t].hint.r = t < Tmp0 ? t : -1;
+		tmp[t].hint.w = loop;
+		tmp[t].visit = -1;
+	}
+	for (bp=blk, b=fn->start; b; b=b->link)
+		*bp++ = b;
+	qsort(blk, fn->nblk, sizeof blk[0], carve);
 	for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
 		if (i->op != Ocopy || !isreg(i->arg[0]))
 			break;
@@ -456,71 +514,103 @@ rega(Fn *fn)
 			sethint(i->to.val, i->arg[0].val);
 		}
 
-	/* 2. assign registers following post-order */
-	for (n=fn->nblk-1; n!=-1u; n--) {
-		b = fn->rpo[n];
+	/* 2. assign registers */
+	for (bp=blk; bp<&blk[fn->nblk]; bp++) {
+		b = *bp;
+		n = b->id;
+		loop = b->loop;
 		cur.n = 0;
 		bszero(cur.b);
-		for (x=0; x<2; x++)
-			for (t=Tmp0; bsiter(b->out, &t); t++)
-				if (x || (r=*hint(t)) != -1)
-				if (x || !bshas(cur.b, r))
-					ralloc(&cur, t);
+		memset(cur.w, 0, sizeof cur.w);
+		for (x=0, t=Tmp0; bsiter(b->out, &t); t++) {
+			j = x++;
+			rl[j] = t;
+			while (j-- > 0 && prio2(t, rl[j]) > 0) {
+				rl[j+1] = rl[j];
+				rl[j] = t;
+			}
+		}
+		for (j=0; j<x; j++)
+			ralloctry(&cur, rl[j], 1);
+		for (j=0; j<x; j++)
+			ralloc(&cur, rl[j]);
 		rcopy(&end[n], &cur);
 		doblk(b, &cur);
 		bscopy(b->in, cur.b);
 		for (p=b->phi; p; p=p->link)
-			if (rtype(p->to) == RTmp) {
+			if (rtype(p->to) == RTmp)
 				bsclr(b->in, p->to.val);
-				/* heuristic 0:
-				 * if the phi destination has an
-				 * argument from a frequent block
-				 * that was already allocated to
-				 * 'r', use 'r' as the new hint
-				 */
-				memset(rl, 0, sizeof rl);
-				for (u=0; u<p->narg; u++) {
-					t = p->arg[u].val;
-					b1 = p->blk[u];
-					if (rtype(p->arg[u]) == RTmp)
-					if ((r=rfind(&end[b1->id], t)) != -1)
-						rl[r] += b1->loop;
-				}
-				for (x=0, j=0; j<Tmp0; j++)
-					if (rl[j] > rl[x])
-						x = j;
-				if (rl[x] >= b->loop)
-					*hint(p->to.val) = x;
+		rcopy(&beg[n], &cur);
+	}
+
+	/* 3. emit copies shared by multiple edges
+	 * to the same block */
+	for (s=fn->start; s; s=s->link) {
+		if (s->npred <= 1)
+			continue;
+		m = &beg[s->id];
+
+		/* rl maps a register that is live at the
+		 * beginning of s to the one used in all
+		 * predecessors (if any, -1 otherwise) */
+		memset(rl, 0, sizeof rl);
+
+		/* to find the register of a phi in a
+		 * predecessor, we have to find the
+		 * corresponding argument */
+		for (p=s->phi; p; p=p->link) {
+			r = rfind(m, p->to.val);
+			if (r == -1)
+				continue;
+			for (u=0; u<p->narg; u++) {
+				b = p->blk[u];
+				src = p->arg[u];
+				if (rtype(src) != RTmp)
+					continue;
+				x = rfind(&end[b->id], src.val);
+				assert(x != -1);
+				rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
 			}
-		if (b->npred > 1) {
-			/* heuristic 1:
-			 * attempt to satisfy hints
-			 * when it's simple and we have
-			 * multiple predecessors
-			 */
-			rcopy(&old, &cur);
-			curi = &insb[NIns];
-			for (j=0; j<old.n; j++) {
-				t = old.t[j];
-				r = *hint(t);
-				r1 = rfind(&cur, t);
-				if (r != -1 && r != r1)
-				if (!bshas(cur.b, r)) {
-					rfree(&cur, t);
-					radd(&cur, t, r);
-					x = tmp[t].cls;
-					emit(Ocopy, x, TMP(r1), TMP(r), R);
-				}
+			if (rl[r] == 0)
+				rl[r] = -1;
+		}
+
+		/* process non-phis temporaries */
+		for (j=0; j<m->n; j++) {
+			t = m->t[j];
+			r = m->r[j];
+			if (rl[r] || t < Tmp0 /* todo, remove this */)
+				continue;
+			for (bp=s->pred; bp<&s->pred[s->npred]; bp++) {
+				x = rfind(&end[(*bp)->id], t);
+				assert(x != -1);
+				rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
 			}
-			if ((j = &insb[NIns] - curi)) {
-				b->nins += j;
-				i = alloc(b->nins * sizeof(Ins));
-				icpy(icpy(i, curi, j), b->ins, b->nins-j);
-				b->ins = i;
+		}
+
+		npm = 0;
+		for (j=0; j<m->n; j++) {
+			t = m->t[j];
+			r = m->r[j];
+			x = rl[r];
+			assert(x != 0 || t < Tmp0 /* todo, ditto */);
+			if (x > 0) {
+				pmadd(TMP(x), TMP(r), tmp[t].cls);
+				m->r[j] = x;
 			}
 		}
-		rcopy(&beg[n], &cur);
+		curi = &insb[NIns];
+		pmgen();
+		j = &insb[NIns] - curi;
+		if (j == 0)
+			continue;
+		stmov += j;
+		s->nins += j;
+		i = alloc(s->nins * sizeof(Ins));
+		icpy(icpy(i, curi, j), s->ins, s->nins-j);
+		s->ins = i;
 	}
+
 	if (debug['R'])  {
 		fprintf(stderr, "\n> Register mappings:\n");
 		for (n=0; n<fn->nblk; n++) {
@@ -533,7 +623,7 @@ rega(Fn *fn)
 		fprintf(stderr, "\n");
 	}
 
-	/* 3. compose glue code */
+	/* 4. emit remaining copies in new blocks */
 	blist = 0;
 	for (b=fn->start;; b=b->link) {
 		ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}};
@@ -560,8 +650,9 @@ rega(Fn *fn)
 				dst = rref(&beg[s->id], t);
 				pmadd(src, dst, tmp[t].cls);
 			}
+			curi = &insb[NIns];
 			pmgen();
-			if (curi == insb)
+			if (curi == &insb[NIns])
 				continue;
 			b1 = blknew();
 			b1->loop = (b->loop+s->loop) / 2;
@@ -569,8 +660,10 @@ rega(Fn *fn)
 			blist = b1;
 			fn->nblk++;
 			sprintf(b1->name, "%s_%s", b->name, s->name);
-			b1->nins = curi - insb;
-			idup(&b1->ins, insb, b1->nins);
+			b1->nins = &insb[NIns] - curi;
+			stmov += b1->nins;
+			stblk += 1;
+			idup(&b1->ins, curi, b1->nins);
 			b1->jmp.type = Jjmp;
 			b1->s1 = s;
 			**ps = b1;
@@ -585,6 +678,9 @@ rega(Fn *fn)
 	fn->reg = regu;
 
 	if (debug['R']) {
+		fprintf(stderr, "\n> Register allocation statistics:\n");
+		fprintf(stderr, "\tnew moves:  %d\n", stmov);
+		fprintf(stderr, "\tnew blocks: %d\n", stblk);
 		fprintf(stderr, "\n> After register allocation:\n");
 		printfn(fn, stderr);
 	}

	qbe Internal scc patchset buffer for QBE
	Log \| Files \| Refs \| README \| LICENSE

M	all.h	\|	5	+++--
M	rega.c	\|	420	++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------