new copy elimination pass - qbe - Internal scc patchset buffer for QBE

commit b2ea8c11b61014cb90e2a025d605ac77a1c7d6bc
parent dadf6d69d8ef24ada3461ddc81cf56418cfdc91e
Author: Quentin Carbonneaux <quentin@c9x.me>
Date:   Thu, 21 Feb 2019 22:35:12 +0100

new copy elimination pass

The sparse data-flow analysis used for
copy elimination before this patch
could sometimes diverge.  The core
reason for this behavior is that the
visitphi() function was not monotonic
in the following copy-of lattice:

     top   (represented as the temp
    / | \   itself)
   x  y  z ...
    \ | /
     bot   (represented as R)

This monotonicity defect could be
fixed by reverting 2f41ff03, but
then the pass would end up missing
some redundant phis.

This patch re-implements the pass
from scratch using a different
approach.  The new algorithm should
get rid of all redundant copies.  On
the other hand, it can run slower
than the monotonic sparse data-flow
analysis because, in the worst case,
an instruction in a phi cluster can
be visited as many times as there
are phis in the input program.

Thanks to Michael Forney for reviewing
and testing the new pass.

Diffstat:
M copy.c  | 220 ++++++++++++++++++++++++++++++++++++++++---------------------------------------

1 file changed, 112 insertions(+), 108 deletions(-)
diff --git a/copy.c b/copy.c
@@ -1,58 +1,7 @@
 #include "all.h"
 
-typedef struct RList RList;
-struct RList {
-	int t;
-	RList *l;
-};
-
-static Ref
-copyof(Ref r, Ref *cp)
-{
-	if (rtype(r) == RTmp)
-		return cp[r.val];
-	else
-		return r;
-}
-
-static void
-update(Ref r, Ref rcp, Ref *cp, RList ***pw)
-{
-	RList *l;
-
-	if (!req(cp[r.val], rcp)) {
-		cp[r.val] = rcp;
-		l = emalloc(sizeof *l);
-		l->t = r.val;
-		l->l = 0;
-		**pw = l;
-		*pw = &l->l;
-	}
-}
-
-static void
-visitphi(Phi *p, Ref *cp, RList ***pw)
-{
-	uint a;
-	Ref r, r1;
-
-	r = R;
-	for (a=0; a<p->narg; a++) {
-		r1 = copyof(p->arg[a], cp);
-		if (req(r1, R) || req(r1, p->to))
-			continue;
-		if (req(r, R) || req(r, r1))
-			r = r1;
-		else {
-			r = p->to;
-			break;
-		}
-	}
-	update(p->to, r, cp, pw);
-}
-
 static int
-iscopy(Ins *i, Ref r, Fn *fn)
+iscopy(Ins *i, Ref r, Tmp *tmp)
 {
 	static bits extcpy[] = {
 		[WFull] = 0,
@@ -74,7 +23,7 @@ iscopy(Ins *i, Ref r, Fn *fn)
 	if (i->cls == Kw)
 		return 1;
 
-	t = &fn->tmp[r.val];
+	t = &tmp[r.val];
 	assert(KBASE(t->cls) == 0);
 	if (i->cls == Kl && t->cls == Kw)
 		return 0;
@@ -82,105 +31,160 @@ iscopy(Ins *i, Ref r, Fn *fn)
 	return (BIT(Wsb + (i->op-Oextsb)) & b) != 0;
 }
 
+static Ref
+copyof(Ref r, Ref *cpy)
+{
+	if (rtype(r) == RTmp && !req(cpy[r.val], R))
+		return cpy[r.val];
+	return r;
+}
+
+/* detects a cluster of phis/copies redundant with 'r';
+ * the algorithm is inspired by Section 3.2 of "Simple
+ * and Efficient SSA Construction" by Braun M. et al.
+ */
 static void
-visitins(Ins *i, Ref *cp, RList ***pw, Fn *fn)
+phisimpl(Phi *p, Ref r, Ref *cpy, Use ***pstk, BSet *ts, BSet *as, Tmp *tmp)
 {
-	Ref r;
+	Use **stk, *u, *u1;
+	uint nstk, a;
+	int t;
+	Ref r1;
 
-	r = copyof(i->arg[0], cp);
-	if (iscopy(i, r, fn)) {
-		update(i->to, r, cp, pw);
-	} else if (!req(i->to, R)) {
-		assert(rtype(i->to) == RTmp);
-		update(i->to, i->to, cp, pw);
+	bszero(ts);
+	bszero(as);
+	stk = *pstk;
+	nstk = 1;
+	stk[0] = &(Use){.type = UPhi, .u.phi = p};
+	while (nstk) {
+		u = stk[--nstk];
+		if (u->type == UIns && iscopy(u->u.ins, r, tmp)) {
+			p = &(Phi){.narg = 0};
+			t = u->u.ins->to.val;
+		}
+		else if (u->type == UPhi) {
+			p = u->u.phi;
+			t = p->to.val;
+		}
+		else
+			continue;
+		if (bshas(ts, t))
+			continue;
+		bsset(ts, t);
+		for (a=0; a<p->narg; a++) {
+			r1 = copyof(p->arg[a], cpy);
+			if (req(r1, r))
+				continue;
+			if (rtype(r1) != RTmp)
+				return;
+			bsset(as, r1.val);
+		}
+		u = tmp[t].use;
+		u1 = &u[tmp[t].nuse];
+		vgrow(pstk, nstk+(u1-u));
+		stk = *pstk;
+		for (; u<u1; u++)
+			stk[nstk++] = u;
 	}
+	bsdiff(as, ts);
+	if (!bscount(as))
+		for (t=0; bsiter(ts, &t); t++)
+			cpy[t] = r;
 }
 
 static void
-subst(Ref *r, Ref *cp)
+subst(Ref *pr, Ref *cpy)
 {
-	assert((rtype(*r) != RTmp || !req(copyof(*r, cp), R)) && "ssa invariant broken");
-	*r = copyof(*r, cp);
+	assert(rtype(*pr) != RTmp || !req(cpy[pr->val], R));
+	*pr = copyof(*pr, cpy);
 }
 
+/* requires use and rpo, breaks use */
 void
 copy(Fn *fn)
 {
-	Blk *b;
-	Ref *cp, r;
-	RList *w, *w1, **pw;
-	Use *u, *u1;
-	Ins *i;
+	BSet ts[1], as[1];
+	Use **stk;
 	Phi *p, **pp;
-	uint a;
+	Ins *i;
+	Blk *b;
+	uint n, a;
+	Ref *cpy, r;
 	int t;
 
-	w = 0;
-	pw = &w;
-	cp = emalloc(fn->ntmp * sizeof cp[0]);
-	for (b=fn->start; b; b=b->link) {
-		for (p=b->phi; p; p=p->link)
-			visitphi(p, cp, &pw);
-		for (i=b->ins; i<&b->ins[b->nins]; i++)
-			visitins(i, cp, &pw, fn);
-	}
-	while ((w1=w)) {
-		t = w->t;
-		u = fn->tmp[t].use;
-		u1 = u + fn->tmp[t].nuse;
-		for (; u<u1; u++)
-			switch (u->type) {
-			case UPhi:
-				visitphi(u->u.phi, cp, &pw);
-				break;
-			case UIns:
-				visitins(u->u.ins, cp, &pw, fn);
-				break;
-			case UJmp:
-				break;
-			default:
-				die("invalid use %d", u->type);
-			}
-		w = w->l;
-		free(w1);
+	bsinit(ts, fn->ntmp);
+	bsinit(as, fn->ntmp);
+	cpy = emalloc(fn->ntmp * sizeof cpy[0]);
+	stk = vnew(10, sizeof stk[0], Pheap);
+
+	/* 1. build the copy-of map */
+	for (n=0; n<fn->nblk; n++) {
+		b = fn->rpo[n];
+		for (p=b->phi; p; p=p->link) {
+			assert(rtype(p->to) == RTmp);
+			if (!req(cpy[p->to.val], R))
+				continue;
+			r = R;
+			for (a=0; a<p->narg; a++)
+				if (p->blk[a]->id < n)
+					r = copyof(p->arg[a], cpy);
+			assert(!req(r, R));
+			cpy[p->to.val] = p->to;
+			phisimpl(p, r, cpy, &stk, ts, as, fn->tmp);
+		}
+		for (i=b->ins; i<&b->ins[b->nins]; i++) {
+			assert(rtype(i->to) <= RTmp);
+			if (!req(cpy[i->to.val], R))
+				continue;
+			r = copyof(i->arg[0], cpy);
+			if (iscopy(i, r, fn->tmp))
+				cpy[i->to.val] = r;
+			else
+				cpy[i->to.val] = i->to;
+		}
 	}
+
+	/* 2. remove redundant phis/copies
+	 * and rewrite their uses */
 	for (b=fn->start; b; b=b->link) {
 		for (pp=&b->phi; (p=*pp);) {
-			r = cp[p->to.val];
+			r = cpy[p->to.val];
 			if (!req(r, p->to)) {
 				*pp = p->link;
 				continue;
 			}
 			for (a=0; a<p->narg; a++)
-				subst(&p->arg[a], cp);
+				subst(&p->arg[a], cpy);
 			pp=&p->link;
 		}
 		for (i=b->ins; i<&b->ins[b->nins]; i++) {
-			r = copyof(i->to, cp);
+			r = cpy[i->to.val];
 			if (!req(r, i->to)) {
 				*i = (Ins){.op = Onop};
 				continue;
 			}
-			for (a=0; a<2; a++)
-				subst(&i->arg[a], cp);
+			subst(&i->arg[0], cpy);
+			subst(&i->arg[1], cpy);
 		}
-		subst(&b->jmp.arg, cp);
+		subst(&b->jmp.arg, cpy);
 	}
+
 	if (debug['C']) {
 		fprintf(stderr, "\n> Copy information:");
 		for (t=Tmp0; t<fn->ntmp; t++) {
-			if (req(cp[t], R)) {
+			if (req(cpy[t], R)) {
 				fprintf(stderr, "\n%10s not seen!",
 					fn->tmp[t].name);
 			}
-			else if (!req(cp[t], TMP(t))) {
+			else if (!req(cpy[t], TMP(t))) {
 				fprintf(stderr, "\n%10s copy of ",
 					fn->tmp[t].name);
-				printref(cp[t], fn, stderr);
+				printref(cpy[t], fn, stderr);
 			}
 		}
 		fprintf(stderr, "\n\n> After copy elimination:\n");
 		printfn(fn, stderr);
 	}
-	free(cp);
+	vfree(stk);
+	free(cpy);
 }

	qbe Internal scc patchset buffer for QBE
	Log \| Files \| Refs \| README \| LICENSE