commit 674901b71834739cccf3f8665833dc125d1172b7
parent f5007126f3140a8c932d393c5b217308b80bd46a
Author: Quentin Carbonneaux <quentin@c9x.me>
Date: Sun, 20 Nov 2022 22:14:41 +0100
new slot coalescing pass
This pass limits stack usage when
many small aggregates are allocated
on the stack. A fast liveness
analysis figures out which slots
interfere and the pass then fuses
slots that do not interfere. The
pass also kills stack slots that
are only ever assigned.
On the hare stdlib test suite, this
fusion pass managed to reduce the
total eligible slot bytes count
by 84%.
The slots considered for fusion
must not escape and not exceed
64 bytes in size.
Diffstat:
M | all.h | | | 3 | ++- |
M | main.c | | | 5 | ++++- |
M | mem.c | | | 303 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- |
3 files changed, 306 insertions(+), 5 deletions(-)
diff --git a/all.h b/all.h
@@ -514,7 +514,8 @@ void fillloop(Fn *);
void simpljmp(Fn *);
/* mem.c */
-void memopt(Fn *);
+void promote(Fn *);
+void coalesce(Fn *);
/* alias.c */
void fillalias(Fn *);
diff --git a/main.c b/main.c
@@ -62,7 +62,7 @@ func(Fn *fn)
fillrpo(fn);
fillpreds(fn);
filluse(fn);
- memopt(fn);
+ promote(fn);
filluse(fn);
ssa(fn);
filluse(fn);
@@ -70,6 +70,9 @@ func(Fn *fn)
fillalias(fn);
loadopt(fn);
filluse(fn);
+ fillalias(fn);
+ coalesce(fn);
+ filluse(fn);
ssacheck(fn);
copy(fn);
filluse(fn);
diff --git a/mem.c b/mem.c
@@ -1,8 +1,11 @@
#include "all.h"
+typedef struct Range Range;
+typedef struct Slot Slot;
+
/* require use, maintains use counts */
void
-memopt(Fn *fn)
+promote(Fn *fn)
{
Blk *b;
Ins *i, *l;
@@ -22,7 +25,7 @@ memopt(Fn *fn)
goto Skip;
k = -1;
s = -1;
- for (u=t->use; u != &t->use[t->nuse]; u++) {
+ for (u=t->use; u<&t->use[t->nuse]; u++) {
if (u->type != UIns)
goto Skip;
l = u->u.ins;
@@ -82,7 +85,301 @@ memopt(Fn *fn)
Skip:;
}
if (debug['M']) {
- fprintf(stderr, "\n> After memory optimization:\n");
+ fprintf(stderr, "\n> After slot promotion:\n");
printfn(fn, stderr);
}
}
+
+/* [a, b) with 0 <= a */
+struct Range {
+ int a, b;
+};
+
+struct Slot {
+ int t;
+ int sz;
+ bits m;
+ bits l;
+ Range r;
+ Slot *s;
+};
+
+static inline int
+rin(Range r, int n)
+{
+ return r.a <= n && n < r.b;
+}
+
+static inline int
+rovlap(Range r0, Range r1)
+{
+ return r0.b && r1.b && r0.a < r1.b && r1.a < r0.b;
+}
+
+static void
+radd(Range *r, int n)
+{
+ if (!r->b)
+ *r = (Range){n, n+1};
+ else if (n < r->a)
+ r->a = n;
+ else if (n >= r->b)
+ r->b = n+1;
+}
+
+static int
+slot(Slot **ps, int64_t *off, Ref r, Fn *fn, Slot *sl)
+{
+ Alias a;
+ Tmp *t;
+
+ getalias(&a, r, fn);
+ if (a.type != ALoc)
+ return 0;
+ t = &fn->tmp[a.base];
+ if (t->visit < 0)
+ return 0;
+ *off = a.offset;
+ *ps = &sl[t->visit];
+ return 1;
+}
+
+static void
+memr(Ref r, bits x, int ip, Fn *fn, Slot *sl)
+{
+ int64_t off;
+ Slot *s;
+
+ if (slot(&s, &off, r, fn, sl)) {
+ s->l |= x << off;
+ s->l &= s->m;
+ if (s->l)
+ radd(&s->r, ip);
+ }
+}
+
+static void
+memw(Ref r, bits x, int ip, Fn *fn, Slot *sl)
+{
+ int64_t off;
+ Slot *s;
+
+ if (slot(&s, &off, r, fn, sl)) {
+ if (s->l)
+ radd(&s->r, ip);
+ s->l &= ~(x << off);
+ }
+}
+
+static int
+scmp(const void *pa, const void *pb)
+{
+ Slot *a, *b;
+
+ a = (Slot *)pa, b = (Slot *)pb;
+ if (a->sz != b->sz)
+ return b->sz - a->sz;
+ return a->r.a - b->r.a;
+}
+
+static void
+maxrpo(Blk *hd, Blk *b)
+{
+ if (hd->loop < (int)b->id)
+ hd->loop = b->id;
+}
+
+void
+coalesce(Fn *fn)
+{
+ Range r, *br;
+ Slot *s, *s0, *sl;
+ Blk *b, **ps, *succ[3];
+ Ins *i;
+ Use *u;
+ Tmp *t;
+ Ref *arg;
+ bits x;
+ int n, m, nsl, ip, *kill;
+ uint total, freed, fused;
+
+ /* minimize the stack usage
+ * by coalescing slots
+ */
+ nsl = 0;
+ sl = vnew(0, sizeof sl[0], PHeap);
+ for (n=Tmp0; n<fn->ntmp; n++) {
+ t = &fn->tmp[n];
+ t->visit = -1;
+ if (t->alias.type == ALoc)
+ if (t->alias.slot == &t->alias)
+ if (t->alias.u.loc.sz != -1) {
+ t->visit = nsl;
+ vgrow(&sl, ++nsl);
+ s = &sl[nsl-1];
+ s->t = n;
+ s->sz = t->alias.u.loc.sz;
+ s->m = t->alias.u.loc.m;
+ s->s = 0;
+ }
+ }
+
+ /* one-pass liveness analysis */
+ for (b=fn->start; b; b=b->link)
+ b->loop = -1;
+ loopiter(fn, maxrpo);
+ br = vnew(fn->nblk, sizeof br[0], PHeap);
+ ip = INT_MAX - 1;
+ for (n=fn->nblk-1; n>=0; n--) {
+ b = fn->rpo[n];
+ succ[0] = b->s1;
+ succ[1] = b->s2;
+ succ[2] = 0;
+ br[n].b = ip--;
+ for (s=sl; s<&sl[nsl]; s++) {
+ s->l = 0;
+ for (ps=succ; *ps; ps++) {
+ m = (*ps)->id;
+ if (m > n && rin(s->r, br[m].a)) {
+ s->l = s->m;
+ radd(&s->r, ip);
+ }
+ }
+ }
+ for (i=&b->ins[b->nins]; i!=b->ins;) {
+ arg = (--i)->arg;
+ if (i->op == Oargc) {
+ memr(arg[1], -1, --ip, fn, sl);
+ }
+ if (isload(i->op)) {
+ x = BIT(loadsz(i)) - 1;
+ memr(arg[0], x, --ip, fn, sl);
+ }
+ if (isstore(i->op)) {
+ x = BIT(storesz(i)) - 1;
+ memw(arg[1], x, ip--, fn, sl);
+ }
+ }
+ for (s=sl; s<&sl[nsl]; s++)
+ if (s->l) {
+ radd(&s->r, ip);
+ if (b->loop != -1) {
+ assert(b->loop > n);
+ radd(&s->r, br[b->loop].b - 1);
+ }
+ }
+ br[n].a = ip;
+ }
+ vfree(br);
+
+ /* kill slots with an empty live range */
+ total = 0;
+ freed = 0;
+ kill = vnew(0, sizeof kill[0], PHeap);
+ n = 0;
+ for (s=s0=sl; s<&sl[nsl]; s++) {
+ total += s->sz;
+ if (!s->r.b) {
+ vgrow(&kill, ++n);
+ kill[n-1] = s->t;
+ freed += s->sz;
+ } else
+ *s0++ = *s;
+ }
+ nsl = s0-sl;
+ if (debug['M']) {
+ fputs("\n> Slot coalescing:\n", stderr);
+ if (n) {
+ fputs("\tDEAD", stderr);
+ for (m=0; m<n; m++)
+ fprintf(stderr, " %%%s",
+ fn->tmp[kill[m]].name);
+ fputc('\n', stderr);
+ }
+ }
+ while (n--) {
+ t = &fn->tmp[kill[n]];
+ assert(t->ndef == 1 && t->ins);
+ *t->ins = (Ins){.op = Onop};
+ for (u=t->use; u<&t->use[t->nuse]; u++) {
+ assert(u->type == UIns);
+ i = u->u.ins;
+ if (!req(i->to, R)) {
+ assert(rtype(i->to) == RTmp);
+ vgrow(&kill, ++n);
+ kill[n-1] = i->to.val;
+ } else
+ *i = (Ins){.op = Onop};
+ }
+ }
+ vfree(kill);
+
+ /* fuse slots by decreasing size */
+ qsort(sl, nsl, sizeof *sl, scmp);
+ fused = 0;
+ for (n=0; n<nsl; n++) {
+ s0 = &sl[n];
+ if (s0->s)
+ continue;
+ s0->s = s0;
+ r = s0->r;
+ for (s=s0+1; s<&sl[nsl]; s++) {
+ if (s->s || !s->r.b)
+ goto Skip;
+ if (rovlap(r, s->r))
+ /* O(n^2) can be approximated
+ * by 'goto Skip;' if need be
+ */
+ for (m=n; &sl[m]<s; m++)
+ if (sl[m].s == s0)
+ if (rovlap(sl[m].r, s->r))
+ goto Skip;
+ radd(&r, s->r.a);
+ radd(&r, s->r.b - 1);
+ s->s = s0;
+ fused += s->sz;
+ Skip:;
+ }
+ }
+
+ /* substitute fused slots */
+ for (s=sl; s<&sl[nsl]; s++) {
+ if (s->s == s)
+ continue;
+ t = &fn->tmp[s->t];
+ assert(t->ndef == 1 && t->ins);
+ *t->ins = (Ins){.op = Onop};
+ for (u=t->use; u<&t->use[t->nuse]; u++) {
+ assert(u->type == UIns);
+ arg = u->u.ins->arg;
+ for (n=0; n<2; n++)
+ if (req(arg[n], TMP(s->t)))
+ arg[n] = TMP(s->s->t);
+ }
+ }
+
+ if (debug['M']) {
+ for (s0=sl; s0<&sl[nsl]; s0++) {
+ if (s0->s != s0)
+ continue;
+ fprintf(stderr, "\tLOCL (% 3db) %s:",
+ s0->sz, fn->tmp[s0->t].name);
+ for (s=s0; s<&sl[nsl]; s++) {
+ if (s->s != s0)
+ continue;
+ fprintf(stderr, " %%%s", fn->tmp[s->t].name);
+ if (s->r.b)
+ fprintf(stderr, "[%d,%d)",
+ s->r.a-ip, s->r.b-ip);
+ else
+ fputs("{}", stderr);
+ }
+ fputc('\n', stderr);
+ }
+ fprintf(stderr, "\tSUMS %u/%u/%u (freed/fused/total)\n\n",
+ freed, fused, total);
+ printfn(fn, stderr);
+ }
+
+ vfree(sl);
+}