qbe

Internal scc patchset buffer for QBE
Log | Files | Refs | README | LICENSE

mem.c (9306B)


      1 #include "all.h"
      2 
      3 typedef struct Range Range;
      4 typedef struct Store Store;
      5 typedef struct Slot Slot;
      6 
      7 /* require use, maintains use counts */
      8 void
      9 promote(Fn *fn)
     10 {
     11 	Blk *b;
     12 	Ins *i, *l;
     13 	Tmp *t;
     14 	Use *u, *ue;
     15 	int s, k;
     16 
     17 	/* promote uniform stack slots to temporaries */
     18 	b = fn->start;
     19 	for (i=b->ins; i<&b->ins[b->nins]; i++) {
     20 		if (Oalloc > i->op || i->op > Oalloc1)
     21 			continue;
     22 		/* specific to NAlign == 3 */
     23 		assert(rtype(i->to) == RTmp);
     24 		t = &fn->tmp[i->to.val];
     25 		if (t->ndef != 1)
     26 			goto Skip;
     27 		k = -1;
     28 		s = -1;
     29 		for (u=t->use; u<&t->use[t->nuse]; u++) {
     30 			if (u->type != UIns)
     31 				goto Skip;
     32 			l = u->u.ins;
     33 			if (isload(l->op))
     34 			if (s == -1 || s == loadsz(l)) {
     35 				s = loadsz(l);
     36 				continue;
     37 			}
     38 			if (isstore(l->op))
     39 			if (req(i->to, l->arg[1]) && !req(i->to, l->arg[0]))
     40 			if (s == -1 || s == storesz(l))
     41 			if (k == -1 || k == optab[l->op].argcls[0][0]) {
     42 				s = storesz(l);
     43 				k = optab[l->op].argcls[0][0];
     44 				continue;
     45 			}
     46 			goto Skip;
     47 		}
     48 		/* get rid of the alloc and replace uses */
     49 		*i = (Ins){.op = Onop};
     50 		t->ndef--;
     51 		ue = &t->use[t->nuse];
     52 		for (u=t->use; u!=ue; u++) {
     53 			l = u->u.ins;
     54 			if (isstore(l->op)) {
     55 				l->cls = k;
     56 				l->op = Ocopy;
     57 				l->to = l->arg[1];
     58 				l->arg[1] = R;
     59 				t->nuse--;
     60 				t->ndef++;
     61 			} else {
     62 				if (k == -1)
     63 					err("slot %%%s is read but never stored to",
     64 						fn->tmp[l->arg[0].val].name);
     65 				/* try to turn loads into copies so we
     66 				 * can eliminate them later */
     67 				switch(l->op) {
     68 				case Oloadsw:
     69 				case Oloaduw:
     70 					if (k == Kl)
     71 						goto Extend;
     72 					/* fall through */
     73 				case Oload:
     74 					if (KBASE(k) != KBASE(l->cls))
     75 						l->op = Ocast;
     76 					else
     77 						l->op = Ocopy;
     78 					break;
     79 				default:
     80 				Extend:
     81 					l->op = Oextsb + (l->op - Oloadsb);
     82 					break;
     83 				}
     84 			}
     85 		}
     86 	Skip:;
     87 	}
     88 	if (debug['M']) {
     89 		fprintf(stderr, "\n> After slot promotion:\n");
     90 		printfn(fn, stderr);
     91 	}
     92 }
     93 
     94 /* [a, b) with 0 <= a */
     95 struct Range {
     96 	int a, b;
     97 };
     98 
     99 struct Store {
    100 	int ip;
    101 	Ins *i;
    102 };
    103 
    104 struct Slot {
    105 	int t;
    106 	int sz;
    107 	bits m;
    108 	bits l;
    109 	Range r;
    110 	Slot *s;
    111 	Store *st;
    112 	int nst;
    113 };
    114 
    115 static inline int
    116 rin(Range r, int n)
    117 {
    118 	return r.a <= n && n < r.b;
    119 }
    120 
    121 static inline int
    122 rovlap(Range r0, Range r1)
    123 {
    124 	return r0.b && r1.b && r0.a < r1.b && r1.a < r0.b;
    125 }
    126 
    127 static void
    128 radd(Range *r, int n)
    129 {
    130 	if (!r->b)
    131 		*r = (Range){n, n+1};
    132 	else if (n < r->a)
    133 		r->a = n;
    134 	else if (n >= r->b)
    135 		r->b = n+1;
    136 }
    137 
    138 static int
    139 slot(Slot **ps, int64_t *off, Ref r, Fn *fn, Slot *sl)
    140 {
    141 	Alias a;
    142 	Tmp *t;
    143 
    144 	getalias(&a, r, fn);
    145 	if (a.type != ALoc)
    146 		return 0;
    147 	t = &fn->tmp[a.base];
    148 	if (t->visit < 0)
    149 		return 0;
    150 	*off = a.offset;
    151 	*ps = &sl[t->visit];
    152 	return 1;
    153 }
    154 
    155 static void
    156 load(Ref r, bits x, int ip, Fn *fn, Slot *sl)
    157 {
    158 	int64_t off;
    159 	Slot *s;
    160 
    161 	if (slot(&s, &off, r, fn, sl)) {
    162 		s->l |= x << off;
    163 		s->l &= s->m;
    164 		if (s->l)
    165 			radd(&s->r, ip);
    166 	}
    167 }
    168 
    169 static void
    170 store(Ref r, bits x, int ip, Ins *i, Fn *fn, Slot *sl)
    171 {
    172 	int64_t off;
    173 	Slot *s;
    174 
    175 	if (slot(&s, &off, r, fn, sl)) {
    176 		if (s->l) {
    177 			radd(&s->r, ip);
    178 			s->l &= ~(x << off);
    179 		} else {
    180 			vgrow(&s->st, ++s->nst);
    181 			s->st[s->nst-1].ip = ip;
    182 			s->st[s->nst-1].i = i;
    183 		}
    184 	}
    185 }
    186 
    187 static int
    188 scmp(const void *pa, const void *pb)
    189 {
    190 	Slot *a, *b;
    191 
    192 	a = (Slot *)pa, b = (Slot *)pb;
    193 	if (a->sz != b->sz)
    194 		return b->sz - a->sz;
    195 	return a->r.a - b->r.a;
    196 }
    197 
    198 static void
    199 maxrpo(Blk *hd, Blk *b)
    200 {
    201 	if (hd->loop < (int)b->id)
    202 		hd->loop = b->id;
    203 }
    204 
    205 void
    206 coalesce(Fn *fn)
    207 {
    208 	Range r, *br;
    209 	Slot *s, *s0, *sl;
    210 	Blk *b, **ps, *succ[3];
    211 	Ins *i, **bl;
    212 	Use *u;
    213 	Tmp *t, *ts;
    214 	Ref *arg;
    215 	bits x;
    216 	int64_t off0, off1;
    217 	int n, m, ip, sz, nsl, nbl, *stk;
    218 	uint total, freed, fused;
    219 
    220 	/* minimize the stack usage
    221 	 * by coalescing slots
    222 	 */
    223 	nsl = 0;
    224 	sl = vnew(0, sizeof sl[0], PHeap);
    225 	for (n=Tmp0; n<fn->ntmp; n++) {
    226 		t = &fn->tmp[n];
    227 		t->visit = -1;
    228 		if (t->alias.type == ALoc)
    229 		if (t->alias.slot == &t->alias)
    230 		if (t->bid == fn->start->id)
    231 		if (t->alias.u.loc.sz != -1) {
    232 			t->visit = nsl;
    233 			vgrow(&sl, ++nsl);
    234 			s = &sl[nsl-1];
    235 			s->t = n;
    236 			s->sz = t->alias.u.loc.sz;
    237 			s->m = t->alias.u.loc.m;
    238 			s->s = 0;
    239 			s->st = vnew(0, sizeof s->st[0], PHeap);
    240 			s->nst = 0;
    241 		}
    242 	}
    243 
    244 	/* one-pass liveness analysis */
    245 	for (b=fn->start; b; b=b->link)
    246 		b->loop = -1;
    247 	loopiter(fn, maxrpo);
    248 	nbl = 0;
    249 	bl = vnew(0, sizeof bl[0], PHeap);
    250 	br = emalloc(fn->nblk * sizeof br[0]);
    251 	ip = INT_MAX - 1;
    252 	for (n=fn->nblk-1; n>=0; n--) {
    253 		b = fn->rpo[n];
    254 		succ[0] = b->s1;
    255 		succ[1] = b->s2;
    256 		succ[2] = 0;
    257 		br[n].b = ip--;
    258 		for (s=sl; s<&sl[nsl]; s++) {
    259 			s->l = 0;
    260 			for (ps=succ; *ps; ps++) {
    261 				m = (*ps)->id;
    262 				if (m > n && rin(s->r, br[m].a)) {
    263 					s->l = s->m;
    264 					radd(&s->r, ip);
    265 				}
    266 			}
    267 		}
    268 		if (b->jmp.type == Jretc)
    269 			load(b->jmp.arg, -1, --ip, fn, sl);
    270 		for (i=&b->ins[b->nins]; i!=b->ins;) {
    271 			--i;
    272 			arg = i->arg;
    273 			if (i->op == Oargc) {
    274 				load(arg[1], -1, --ip, fn, sl);
    275 			}
    276 			if (isload(i->op)) {
    277 				x = BIT(loadsz(i)) - 1;
    278 				load(arg[0], x, --ip, fn, sl);
    279 			}
    280 			if (isstore(i->op)) {
    281 				x = BIT(storesz(i)) - 1;
    282 				store(arg[1], x, ip--, i, fn, sl);
    283 			}
    284 			if (i->op == Oblit0) {
    285 				assert((i+1)->op == Oblit1);
    286 				assert(rtype((i+1)->arg[0]) == RInt);
    287 				sz = abs(rsval((i+1)->arg[0]));
    288 				x = sz >= NBit ? (bits)-1 : BIT(sz) - 1;
    289 				store(arg[1], x, ip--, i, fn, sl);
    290 				load(arg[0], x, ip, fn, sl);
    291 				vgrow(&bl, ++nbl);
    292 				bl[nbl-1] = i;
    293 			}
    294 		}
    295 		for (s=sl; s<&sl[nsl]; s++)
    296 			if (s->l) {
    297 				radd(&s->r, ip);
    298 				if (b->loop != -1) {
    299 					assert(b->loop > n);
    300 					radd(&s->r, br[b->loop].b - 1);
    301 				}
    302 			}
    303 		br[n].a = ip;
    304 	}
    305 	free(br);
    306 
    307 	/* kill dead stores */
    308 	for (s=sl; s<&sl[nsl]; s++)
    309 		for (n=0; n<s->nst; n++)
    310 			if (!rin(s->r, s->st[n].ip)) {
    311 				i = s->st[n].i;
    312 				if (i->op == Oblit0)
    313 					*(i+1) = (Ins){.op = Onop};
    314 				*i = (Ins){.op = Onop};
    315 			}
    316 
    317 	/* kill slots with an empty live range */
    318 	total = 0;
    319 	freed = 0;
    320 	stk = vnew(0, sizeof stk[0], PHeap);
    321 	n = 0;
    322 	for (s=s0=sl; s<&sl[nsl]; s++) {
    323 		total += s->sz;
    324 		if (!s->r.b) {
    325 			vfree(s->st);
    326 			vgrow(&stk, ++n);
    327 			stk[n-1] = s->t;
    328 			freed += s->sz;
    329 		} else
    330 			*s0++ = *s;
    331 	}
    332 	nsl = s0-sl;
    333 	if (debug['M']) {
    334 		fputs("\n> Slot coalescing:\n", stderr);
    335 		if (n) {
    336 			fputs("\tkill [", stderr);
    337 			for (m=0; m<n; m++)
    338 				fprintf(stderr, " %%%s",
    339 					fn->tmp[stk[m]].name);
    340 			fputs(" ]\n", stderr);
    341 		}
    342 	}
    343 	while (n--) {
    344 		t = &fn->tmp[stk[n]];
    345 		assert(t->ndef == 1 && t->def);
    346 		i = t->def;
    347 		if (isload(i->op)) {
    348 			i->op = Ocopy;
    349 			i->arg[0] = UNDEF;
    350 			continue;
    351 		}
    352 		*i = (Ins){.op = Onop};
    353 		for (u=t->use; u<&t->use[t->nuse]; u++) {
    354 			if (u->type == UJmp) {
    355 				b = fn->rpo[u->bid];
    356 				assert(isret(b->jmp.type));
    357 				b->jmp.type = Jret0;
    358 				b->jmp.arg = R;
    359 				continue;
    360 			}
    361 			assert(u->type == UIns);
    362 			i = u->u.ins;
    363 			if (!req(i->to, R)) {
    364 				assert(rtype(i->to) == RTmp);
    365 				vgrow(&stk, ++n);
    366 				stk[n-1] = i->to.val;
    367 			} else if (isarg(i->op)) {
    368 				assert(i->op == Oargc);
    369 				i->arg[1] = CON_Z;  /* crash */
    370 			} else {
    371 				if (i->op == Oblit0)
    372 					*(i+1) = (Ins){.op = Onop};
    373 				*i = (Ins){.op = Onop};
    374 			}
    375 		}
    376 	}
    377 	vfree(stk);
    378 
    379 	/* fuse slots by decreasing size */
    380 	qsort(sl, nsl, sizeof *sl, scmp);
    381 	fused = 0;
    382 	for (n=0; n<nsl; n++) {
    383 		s0 = &sl[n];
    384 		if (s0->s)
    385 			continue;
    386 		s0->s = s0;
    387 		r = s0->r;
    388 		for (s=s0+1; s<&sl[nsl]; s++) {
    389 			if (s->s || !s->r.b)
    390 				goto Skip;
    391 			if (rovlap(r, s->r))
    392 				/* O(n); can be approximated
    393 				 * by 'goto Skip;' if need be
    394 				 */
    395 				for (m=n; &sl[m]<s; m++)
    396 					if (sl[m].s == s0)
    397 					if (rovlap(sl[m].r, s->r))
    398 						goto Skip;
    399 			radd(&r, s->r.a);
    400 			radd(&r, s->r.b - 1);
    401 			s->s = s0;
    402 			fused += s->sz;
    403 		Skip:;
    404 		}
    405 	}
    406 
    407 	/* substitute fused slots */
    408 	for (s=sl; s<&sl[nsl]; s++) {
    409 		t = &fn->tmp[s->t];
    410 		/* the visit link is stale,
    411 		 * reset it before the slot()
    412 		 * calls below
    413 		 */
    414 		t->visit = s-sl;
    415 		assert(t->ndef == 1 && t->def);
    416 		if (s->s == s)
    417 			continue;
    418 		*t->def = (Ins){.op = Onop};
    419 		ts = &fn->tmp[s->s->t];
    420 		assert(t->bid == ts->bid);
    421 		if (t->def < ts->def) {
    422 			/* make sure the slot we
    423 			 * selected has a def that
    424 			 * dominates its new uses
    425 			 */
    426 			*t->def = *ts->def;
    427 			*ts->def = (Ins){.op = Onop};
    428 			ts->def = t->def;
    429 		}
    430 		for (u=t->use; u<&t->use[t->nuse]; u++) {
    431 			if (u->type == UJmp) {
    432 				b = fn->rpo[u->bid];
    433 				b->jmp.arg = TMP(s->s->t);
    434 				continue;
    435 			}
    436 			assert(u->type == UIns);
    437 			arg = u->u.ins->arg;
    438 			for (n=0; n<2; n++)
    439 				if (req(arg[n], TMP(s->t)))
    440 					arg[n] = TMP(s->s->t);
    441 		}
    442 	}
    443 
    444 	/* fix newly overlapping blits */
    445 	for (n=0; n<nbl; n++) {
    446 		i = bl[n];
    447 		if (i->op == Oblit0)
    448 		if (slot(&s, &off0, i->arg[0], fn, sl))
    449 		if (slot(&s0, &off1, i->arg[1], fn, sl))
    450 		if (s->s == s0->s && off0 < off1) {
    451 			sz = rsval((i+1)->arg[0]);
    452 			assert(sz >= 0);
    453 			(i+1)->arg[0] = INT(-sz);
    454 		}
    455 	}
    456 	vfree(bl);
    457 
    458 	if (debug['M']) {
    459 		for (s0=sl; s0<&sl[nsl]; s0++) {
    460 			if (s0->s != s0)
    461 				continue;
    462 			fprintf(stderr, "\tfuse (% 3db) [", s0->sz);
    463 			for (s=s0; s<&sl[nsl]; s++) {
    464 				if (s->s != s0)
    465 					continue;
    466 				fprintf(stderr, " %%%s", fn->tmp[s->t].name);
    467 				if (s->r.b)
    468 					fprintf(stderr, "[%d,%d)",
    469 						s->r.a-ip, s->r.b-ip);
    470 				else
    471 					fputs("{}", stderr);
    472 			}
    473 			fputs(" ]\n", stderr);
    474 		}
    475 		fprintf(stderr, "\tsums %u/%u/%u (killed/fused/total)\n\n",
    476 			freed, fused, total);
    477 		printfn(fn, stderr);
    478 	}
    479 
    480 	for (s=sl; s<&sl[nsl]; s++)
    481 		vfree(s->st);
    482 	vfree(sl);
    483 }