qbe

Internal scc patchset buffer for QBE
Log | Files | Refs | README | LICENSE

spill.c (10666B)


      1 #include "all.h"
      2 
      3 static void
      4 aggreg(Blk *hd, Blk *b)
      5 {
      6 	int k;
      7 
      8 	/* aggregate looping information at
      9 	 * loop headers */
     10 	bsunion(hd->gen, b->gen);
     11 	for (k=0; k<2; k++)
     12 		if (b->nlive[k] > hd->nlive[k])
     13 			hd->nlive[k] = b->nlive[k];
     14 }
     15 
     16 static void
     17 tmpuse(Ref r, int use, int loop, Fn *fn)
     18 {
     19 	Mem *m;
     20 	Tmp *t;
     21 
     22 	if (rtype(r) == RMem) {
     23 		m = &fn->mem[r.val];
     24 		tmpuse(m->base, 1, loop, fn);
     25 		tmpuse(m->index, 1, loop, fn);
     26 	}
     27 	else if (rtype(r) == RTmp && r.val >= Tmp0) {
     28 		t = &fn->tmp[r.val];
     29 		t->nuse += use;
     30 		t->ndef += !use;
     31 		t->cost += loop;
     32 	}
     33 }
     34 
     35 /* evaluate spill costs of temporaries,
     36  * this also fills usage information
     37  * requires rpo, preds
     38  */
     39 void
     40 fillcost(Fn *fn)
     41 {
     42 	int n;
     43 	uint a;
     44 	Blk *b;
     45 	Ins *i;
     46 	Tmp *t;
     47 	Phi *p;
     48 
     49 	loopiter(fn, aggreg);
     50 	if (debug['S']) {
     51 		fprintf(stderr, "\n> Loop information:\n");
     52 		for (b=fn->start; b; b=b->link) {
     53 			for (a=0; a<b->npred; ++a)
     54 				if (b->id <= b->pred[a]->id)
     55 					break;
     56 			if (a != b->npred) {
     57 				fprintf(stderr, "\t%-10s", b->name);
     58 				fprintf(stderr, " (% 3d ", b->nlive[0]);
     59 				fprintf(stderr, "% 3d) ", b->nlive[1]);
     60 				dumpts(b->gen, fn->tmp, stderr);
     61 			}
     62 		}
     63 	}
     64 	for (t=fn->tmp; t-fn->tmp < fn->ntmp; t++) {
     65 		t->cost = t-fn->tmp < Tmp0 ? UINT_MAX : 0;
     66 		t->nuse = 0;
     67 		t->ndef = 0;
     68 	}
     69 	for (b=fn->start; b; b=b->link) {
     70 		for (p=b->phi; p; p=p->link) {
     71 			t = &fn->tmp[p->to.val];
     72 			tmpuse(p->to, 0, 0, fn);
     73 			for (a=0; a<p->narg; a++) {
     74 				n = p->blk[a]->loop;
     75 				t->cost += n;
     76 				tmpuse(p->arg[a], 1, n, fn);
     77 			}
     78 		}
     79 		n = b->loop;
     80 		for (i=b->ins; i<&b->ins[b->nins]; i++) {
     81 			tmpuse(i->to, 0, n, fn);
     82 			tmpuse(i->arg[0], 1, n, fn);
     83 			tmpuse(i->arg[1], 1, n, fn);
     84 		}
     85 		tmpuse(b->jmp.arg, 1, n, fn);
     86 	}
     87 	if (debug['S']) {
     88 		fprintf(stderr, "\n> Spill costs:\n");
     89 		for (n=Tmp0; n<fn->ntmp; n++)
     90 			fprintf(stderr, "\t%-10s %d\n",
     91 				fn->tmp[n].name,
     92 				fn->tmp[n].cost);
     93 		fprintf(stderr, "\n");
     94 	}
     95 }
     96 
     97 static BSet *fst; /* temps to prioritize in registers (for tcmp1) */
     98 static Tmp *tmp;  /* current temporaries (for tcmpX) */
     99 static int ntmp;  /* current # of temps (for limit) */
    100 static int locs;  /* stack size used by locals */
    101 static int slot4; /* next slot of 4 bytes */
    102 static int slot8; /* ditto, 8 bytes */
    103 static BSet mask[2][1]; /* class masks */
    104 
    105 static int
    106 tcmp0(const void *pa, const void *pb)
    107 {
    108 	uint ca, cb;
    109 
    110 	ca = tmp[*(int *)pa].cost;
    111 	cb = tmp[*(int *)pb].cost;
    112 	return (cb < ca) ? -1 : (cb > ca);
    113 }
    114 
    115 static int
    116 tcmp1(const void *pa, const void *pb)
    117 {
    118 	int c;
    119 
    120 	c = bshas(fst, *(int *)pb) - bshas(fst, *(int *)pa);
    121 	return c ? c : tcmp0(pa, pb);
    122 }
    123 
    124 static Ref
    125 slot(int t)
    126 {
    127 	int s;
    128 
    129 	assert(t >= Tmp0 && "cannot spill register");
    130 	s = tmp[t].slot;
    131 	if (s == -1) {
    132 		/* specific to NAlign == 3 */
    133 		/* nice logic to pack stack slots
    134 		 * on demand, there can be only
    135 		 * one hole and slot4 points to it
    136 		 *
    137 		 * invariant: slot4 <= slot8
    138 		 */
    139 		if (KWIDE(tmp[t].cls)) {
    140 			s = slot8;
    141 			if (slot4 == slot8)
    142 				slot4 += 2;
    143 			slot8 += 2;
    144 		} else {
    145 			s = slot4;
    146 			if (slot4 == slot8) {
    147 				slot8 += 2;
    148 				slot4 += 1;
    149 			} else
    150 				slot4 = slot8;
    151 		}
    152 		s += locs;
    153 		tmp[t].slot = s;
    154 	}
    155 	return SLOT(s);
    156 }
    157 
    158 /* restricts b to hold at most k
    159  * temporaries, preferring those
    160  * present in f (if given), then
    161  * those with the largest spill
    162  * cost
    163  */
    164 static void
    165 limit(BSet *b, int k, BSet *f)
    166 {
    167 	static int *tarr, maxt;
    168 	int i, t, nt;
    169 
    170 	nt = bscount(b);
    171 	if (nt <= k)
    172 		return;
    173 	if (nt > maxt) {
    174 		free(tarr);
    175 		tarr = emalloc(nt * sizeof tarr[0]);
    176 		maxt = nt;
    177 	}
    178 	for (i=0, t=0; bsiter(b, &t); t++) {
    179 		bsclr(b, t);
    180 		tarr[i++] = t;
    181 	}
    182 	if (nt > 1) {
    183 		if (!f)
    184 			qsort(tarr, nt, sizeof tarr[0], tcmp0);
    185 		else {
    186 			fst = f;
    187 			qsort(tarr, nt, sizeof tarr[0], tcmp1);
    188 		}
    189 	}
    190 	for (i=0; i<k && i<nt; i++)
    191 		bsset(b, tarr[i]);
    192 	for (; i<nt; i++)
    193 		slot(tarr[i]);
    194 }
    195 
    196 /* spills temporaries to fit the
    197  * target limits using the same
    198  * preferences as limit(); assumes
    199  * that k1 gprs and k2 fprs are
    200  * currently in use
    201  */
    202 static void
    203 limit2(BSet *b1, int k1, int k2, BSet *f)
    204 {
    205 	BSet b2[1];
    206 
    207 	bsinit(b2, ntmp); /* todo, free those */
    208 	bscopy(b2, b1);
    209 	bsinter(b1, mask[0]);
    210 	bsinter(b2, mask[1]);
    211 	limit(b1, T.ngpr - k1, f);
    212 	limit(b2, T.nfpr - k2, f);
    213 	bsunion(b1, b2);
    214 }
    215 
    216 static void
    217 sethint(BSet *u, bits r)
    218 {
    219 	int t;
    220 
    221 	for (t=Tmp0; bsiter(u, &t); t++)
    222 		tmp[phicls(t, tmp)].hint.m |= r;
    223 }
    224 
    225 /* reloads temporaries in u that are
    226  * not in v from their slots
    227  */
    228 static void
    229 reloads(BSet *u, BSet *v)
    230 {
    231 	int t;
    232 
    233 	for (t=Tmp0; bsiter(u, &t); t++)
    234 		if (!bshas(v, t))
    235 			emit(Oload, tmp[t].cls, TMP(t), slot(t), R);
    236 }
    237 
    238 static void
    239 store(Ref r, int s)
    240 {
    241 	if (s != -1)
    242 		emit(Ostorew + tmp[r.val].cls, 0, R, r, SLOT(s));
    243 }
    244 
    245 static int
    246 regcpy(Ins *i)
    247 {
    248 	return i->op == Ocopy && isreg(i->arg[0]);
    249 }
    250 
    251 static Ins *
    252 dopm(Blk *b, Ins *i, BSet *v)
    253 {
    254 	int n, t;
    255 	BSet u[1];
    256 	Ins *i1;
    257 	bits r;
    258 
    259 	bsinit(u, ntmp); /* todo, free those */
    260 	/* consecutive copies from
    261 	 * registers need to be handled
    262 	 * as one large instruction
    263 	 *
    264 	 * fixme: there is an assumption
    265 	 * that calls are always followed
    266 	 * by copy instructions here, this
    267 	 * might not be true if previous
    268 	 * passes change
    269 	 */
    270 	i1 = ++i;
    271 	do {
    272 		i--;
    273 		t = i->to.val;
    274 		if (!req(i->to, R))
    275 		if (bshas(v, t)) {
    276 			bsclr(v, t);
    277 			store(i->to, tmp[t].slot);
    278 		}
    279 		bsset(v, i->arg[0].val);
    280 	} while (i != b->ins && regcpy(i-1));
    281 	bscopy(u, v);
    282 	if (i != b->ins && (i-1)->op == Ocall) {
    283 		v->t[0] &= ~T.retregs((i-1)->arg[1], 0);
    284 		limit2(v, T.nrsave[0], T.nrsave[1], 0);
    285 		for (n=0, r=0; T.rsave[n]>=0; n++)
    286 			r |= BIT(T.rsave[n]);
    287 		v->t[0] |= T.argregs((i-1)->arg[1], 0);
    288 	} else {
    289 		limit2(v, 0, 0, 0);
    290 		r = v->t[0];
    291 	}
    292 	sethint(v, r);
    293 	reloads(u, v);
    294 	do
    295 		emiti(*--i1);
    296 	while (i1 != i);
    297 	return i;
    298 }
    299 
    300 static void
    301 merge(BSet *u, Blk *bu, BSet *v, Blk *bv)
    302 {
    303 	int t;
    304 
    305 	if (bu->loop <= bv->loop)
    306 		bsunion(u, v);
    307 	else
    308 		for (t=0; bsiter(v, &t); t++)
    309 			if (tmp[t].slot == -1)
    310 				bsset(u, t);
    311 }
    312 
    313 /* spill code insertion
    314  * requires spill costs, rpo, liveness
    315  *
    316  * Note: this will replace liveness
    317  * information (in, out) with temporaries
    318  * that must be in registers at block
    319  * borders
    320  *
    321  * Be careful with:
    322  * - Ocopy instructions to ensure register
    323  *   constraints
    324  */
    325 void
    326 spill(Fn *fn)
    327 {
    328 	Blk *b, *s1, *s2, *hd, **bp;
    329 	int j, l, t, k, lvarg[2];
    330 	uint n;
    331 	BSet u[1], v[1], w[1];
    332 	Ins *i;
    333 	Phi *p;
    334 	Mem *m;
    335 	bits r;
    336 
    337 	tmp = fn->tmp;
    338 	ntmp = fn->ntmp;
    339 	bsinit(u, ntmp);
    340 	bsinit(v, ntmp);
    341 	bsinit(w, ntmp);
    342 	bsinit(mask[0], ntmp);
    343 	bsinit(mask[1], ntmp);
    344 	locs = fn->slot;
    345 	slot4 = 0;
    346 	slot8 = 0;
    347 	for (t=0; t<ntmp; t++) {
    348 		k = 0;
    349 		if (t >= T.fpr0 && t < T.fpr0 + T.nfpr)
    350 			k = 1;
    351 		if (t >= Tmp0)
    352 			k = KBASE(tmp[t].cls);
    353 		bsset(mask[k], t);
    354 	}
    355 
    356 	for (bp=&fn->rpo[fn->nblk]; bp!=fn->rpo;) {
    357 		b = *--bp;
    358 		/* invariant: all blocks with bigger rpo got
    359 		 * their in,out updated. */
    360 
    361 		/* 1. find temporaries in registers at
    362 		 * the end of the block (put them in v) */
    363 		curi = 0;
    364 		s1 = b->s1;
    365 		s2 = b->s2;
    366 		hd = 0;
    367 		if (s1 && s1->id <= b->id)
    368 			hd = s1;
    369 		if (s2 && s2->id <= b->id)
    370 		if (!hd || s2->id >= hd->id)
    371 			hd = s2;
    372 		if (hd) {
    373 			/* back-edge */
    374 			bszero(v);
    375 			hd->gen->t[0] |= T.rglob; /* don't spill registers */
    376 			for (k=0; k<2; k++) {
    377 				n = k == 0 ? T.ngpr : T.nfpr;
    378 				bscopy(u, b->out);
    379 				bsinter(u, mask[k]);
    380 				bscopy(w, u);
    381 				bsinter(u, hd->gen);
    382 				bsdiff(w, hd->gen);
    383 				if (bscount(u) < n) {
    384 					j = bscount(w); /* live through */
    385 					l = hd->nlive[k];
    386 					limit(w, n - (l - j), 0);
    387 					bsunion(u, w);
    388 				} else
    389 					limit(u, n, 0);
    390 				bsunion(v, u);
    391 			}
    392 		} else if (s1) {
    393 			/* avoid reloading temporaries
    394 			 * in the middle of loops */
    395 			bszero(v);
    396 			liveon(w, b, s1);
    397 			merge(v, b, w, s1);
    398 			if (s2) {
    399 				liveon(u, b, s2);
    400 				merge(v, b, u, s2);
    401 				bsinter(w, u);
    402 			}
    403 			limit2(v, 0, 0, w);
    404 		} else {
    405 			bscopy(v, b->out);
    406 			if (rtype(b->jmp.arg) == RCall)
    407 				v->t[0] |= T.retregs(b->jmp.arg, 0);
    408 		}
    409 		for (t=Tmp0; bsiter(b->out, &t); t++)
    410 			if (!bshas(v, t))
    411 				slot(t);
    412 		bscopy(b->out, v);
    413 
    414 		/* 2. process the block instructions */
    415 		if (rtype(b->jmp.arg) == RTmp) {
    416 			t = b->jmp.arg.val;
    417 			assert(KBASE(tmp[t].cls) == 0);
    418 			lvarg[0] = bshas(v, t);
    419 			bsset(v, t);
    420 			bscopy(u, v);
    421 			limit2(v, 0, 0, NULL);
    422 			if (!bshas(v, t)) {
    423 				if (!lvarg[0])
    424 					bsclr(u, t);
    425 				b->jmp.arg = slot(t);
    426 			}
    427 			reloads(u, v);
    428 		}
    429 		curi = &insb[NIns];
    430 		for (i=&b->ins[b->nins]; i!=b->ins;) {
    431 			i--;
    432 			if (regcpy(i)) {
    433 				i = dopm(b, i, v);
    434 				continue;
    435 			}
    436 			bszero(w);
    437 			if (!req(i->to, R)) {
    438 				assert(rtype(i->to) == RTmp);
    439 				t = i->to.val;
    440 				if (bshas(v, t))
    441 					bsclr(v, t);
    442 				else {
    443 					/* make sure we have a reg
    444 					 * for the result */
    445 					assert(t >= Tmp0 && "dead reg");
    446 					bsset(v, t);
    447 					bsset(w, t);
    448 				}
    449 			}
    450 			j = T.memargs(i->op);
    451 			for (n=0; n<2; n++)
    452 				if (rtype(i->arg[n]) == RMem)
    453 					j--;
    454 			for (n=0; n<2; n++)
    455 				switch (rtype(i->arg[n])) {
    456 				case RMem:
    457 					t = i->arg[n].val;
    458 					m = &fn->mem[t];
    459 					if (rtype(m->base) == RTmp) {
    460 						bsset(v, m->base.val);
    461 						bsset(w, m->base.val);
    462 					}
    463 					if (rtype(m->index) == RTmp) {
    464 						bsset(v, m->index.val);
    465 						bsset(w, m->index.val);
    466 					}
    467 					break;
    468 				case RTmp:
    469 					t = i->arg[n].val;
    470 					lvarg[n] = bshas(v, t);
    471 					bsset(v, t);
    472 					if (j-- <= 0)
    473 						bsset(w, t);
    474 					break;
    475 				}
    476 			bscopy(u, v);
    477 			limit2(v, 0, 0, w);
    478 			for (n=0; n<2; n++)
    479 				if (rtype(i->arg[n]) == RTmp) {
    480 					t = i->arg[n].val;
    481 					if (!bshas(v, t)) {
    482 						/* do not reload if the
    483 						 * argument is dead
    484 						 */
    485 						if (!lvarg[n])
    486 							bsclr(u, t);
    487 						i->arg[n] = slot(t);
    488 					}
    489 				}
    490 			reloads(u, v);
    491 			if (!req(i->to, R)) {
    492 				t = i->to.val;
    493 				store(i->to, tmp[t].slot);
    494 				if (t >= Tmp0)
    495 					/* in case i->to was a
    496 					 * dead temporary */
    497 					bsclr(v, t);
    498 			}
    499 			emiti(*i);
    500 			r = v->t[0]; /* Tmp0 is NBit */
    501 			if (r)
    502 				sethint(v, r);
    503 		}
    504 		if (b == fn->start)
    505 			assert(v->t[0] == (T.rglob | fn->reg));
    506 		else
    507 			assert(v->t[0] == T.rglob);
    508 
    509 		for (p=b->phi; p; p=p->link) {
    510 			assert(rtype(p->to) == RTmp);
    511 			t = p->to.val;
    512 			if (bshas(v, t)) {
    513 				bsclr(v, t);
    514 				store(p->to, tmp[t].slot);
    515 			} else if (bshas(b->in, t))
    516 				/* only if the phi is live */
    517 				p->to = slot(p->to.val);
    518 		}
    519 		bscopy(b->in, v);
    520 		b->nins = &insb[NIns] - curi;
    521 		idup(&b->ins, curi, b->nins);
    522 	}
    523 
    524 	/* align the locals to a 16 byte boundary */
    525 	/* specific to NAlign == 3 */
    526 	slot8 += slot8 & 3;
    527 	fn->slot += slot8;
    528 
    529 	if (debug['S']) {
    530 		fprintf(stderr, "\n> Block information:\n");
    531 		for (b=fn->start; b; b=b->link) {
    532 			fprintf(stderr, "\t%-10s (% 5d) ", b->name, b->loop);
    533 			dumpts(b->out, fn->tmp, stderr);
    534 		}
    535 		fprintf(stderr, "\n> After spilling:\n");
    536 		printfn(fn, stderr);
    537 	}
    538 }