spill.c (10666B)
1 #include "all.h" 2 3 static void 4 aggreg(Blk *hd, Blk *b) 5 { 6 int k; 7 8 /* aggregate looping information at 9 * loop headers */ 10 bsunion(hd->gen, b->gen); 11 for (k=0; k<2; k++) 12 if (b->nlive[k] > hd->nlive[k]) 13 hd->nlive[k] = b->nlive[k]; 14 } 15 16 static void 17 tmpuse(Ref r, int use, int loop, Fn *fn) 18 { 19 Mem *m; 20 Tmp *t; 21 22 if (rtype(r) == RMem) { 23 m = &fn->mem[r.val]; 24 tmpuse(m->base, 1, loop, fn); 25 tmpuse(m->index, 1, loop, fn); 26 } 27 else if (rtype(r) == RTmp && r.val >= Tmp0) { 28 t = &fn->tmp[r.val]; 29 t->nuse += use; 30 t->ndef += !use; 31 t->cost += loop; 32 } 33 } 34 35 /* evaluate spill costs of temporaries, 36 * this also fills usage information 37 * requires rpo, preds 38 */ 39 void 40 fillcost(Fn *fn) 41 { 42 int n; 43 uint a; 44 Blk *b; 45 Ins *i; 46 Tmp *t; 47 Phi *p; 48 49 loopiter(fn, aggreg); 50 if (debug['S']) { 51 fprintf(stderr, "\n> Loop information:\n"); 52 for (b=fn->start; b; b=b->link) { 53 for (a=0; a<b->npred; ++a) 54 if (b->id <= b->pred[a]->id) 55 break; 56 if (a != b->npred) { 57 fprintf(stderr, "\t%-10s", b->name); 58 fprintf(stderr, " (% 3d ", b->nlive[0]); 59 fprintf(stderr, "% 3d) ", b->nlive[1]); 60 dumpts(b->gen, fn->tmp, stderr); 61 } 62 } 63 } 64 for (t=fn->tmp; t-fn->tmp < fn->ntmp; t++) { 65 t->cost = t-fn->tmp < Tmp0 ? UINT_MAX : 0; 66 t->nuse = 0; 67 t->ndef = 0; 68 } 69 for (b=fn->start; b; b=b->link) { 70 for (p=b->phi; p; p=p->link) { 71 t = &fn->tmp[p->to.val]; 72 tmpuse(p->to, 0, 0, fn); 73 for (a=0; a<p->narg; a++) { 74 n = p->blk[a]->loop; 75 t->cost += n; 76 tmpuse(p->arg[a], 1, n, fn); 77 } 78 } 79 n = b->loop; 80 for (i=b->ins; i<&b->ins[b->nins]; i++) { 81 tmpuse(i->to, 0, n, fn); 82 tmpuse(i->arg[0], 1, n, fn); 83 tmpuse(i->arg[1], 1, n, fn); 84 } 85 tmpuse(b->jmp.arg, 1, n, fn); 86 } 87 if (debug['S']) { 88 fprintf(stderr, "\n> Spill costs:\n"); 89 for (n=Tmp0; n<fn->ntmp; n++) 90 fprintf(stderr, "\t%-10s %d\n", 91 fn->tmp[n].name, 92 fn->tmp[n].cost); 93 fprintf(stderr, "\n"); 94 } 95 } 96 97 static BSet *fst; /* temps to prioritize in registers (for tcmp1) */ 98 static Tmp *tmp; /* current temporaries (for tcmpX) */ 99 static int ntmp; /* current # of temps (for limit) */ 100 static int locs; /* stack size used by locals */ 101 static int slot4; /* next slot of 4 bytes */ 102 static int slot8; /* ditto, 8 bytes */ 103 static BSet mask[2][1]; /* class masks */ 104 105 static int 106 tcmp0(const void *pa, const void *pb) 107 { 108 uint ca, cb; 109 110 ca = tmp[*(int *)pa].cost; 111 cb = tmp[*(int *)pb].cost; 112 return (cb < ca) ? -1 : (cb > ca); 113 } 114 115 static int 116 tcmp1(const void *pa, const void *pb) 117 { 118 int c; 119 120 c = bshas(fst, *(int *)pb) - bshas(fst, *(int *)pa); 121 return c ? c : tcmp0(pa, pb); 122 } 123 124 static Ref 125 slot(int t) 126 { 127 int s; 128 129 assert(t >= Tmp0 && "cannot spill register"); 130 s = tmp[t].slot; 131 if (s == -1) { 132 /* specific to NAlign == 3 */ 133 /* nice logic to pack stack slots 134 * on demand, there can be only 135 * one hole and slot4 points to it 136 * 137 * invariant: slot4 <= slot8 138 */ 139 if (KWIDE(tmp[t].cls)) { 140 s = slot8; 141 if (slot4 == slot8) 142 slot4 += 2; 143 slot8 += 2; 144 } else { 145 s = slot4; 146 if (slot4 == slot8) { 147 slot8 += 2; 148 slot4 += 1; 149 } else 150 slot4 = slot8; 151 } 152 s += locs; 153 tmp[t].slot = s; 154 } 155 return SLOT(s); 156 } 157 158 /* restricts b to hold at most k 159 * temporaries, preferring those 160 * present in f (if given), then 161 * those with the largest spill 162 * cost 163 */ 164 static void 165 limit(BSet *b, int k, BSet *f) 166 { 167 static int *tarr, maxt; 168 int i, t, nt; 169 170 nt = bscount(b); 171 if (nt <= k) 172 return; 173 if (nt > maxt) { 174 free(tarr); 175 tarr = emalloc(nt * sizeof tarr[0]); 176 maxt = nt; 177 } 178 for (i=0, t=0; bsiter(b, &t); t++) { 179 bsclr(b, t); 180 tarr[i++] = t; 181 } 182 if (nt > 1) { 183 if (!f) 184 qsort(tarr, nt, sizeof tarr[0], tcmp0); 185 else { 186 fst = f; 187 qsort(tarr, nt, sizeof tarr[0], tcmp1); 188 } 189 } 190 for (i=0; i<k && i<nt; i++) 191 bsset(b, tarr[i]); 192 for (; i<nt; i++) 193 slot(tarr[i]); 194 } 195 196 /* spills temporaries to fit the 197 * target limits using the same 198 * preferences as limit(); assumes 199 * that k1 gprs and k2 fprs are 200 * currently in use 201 */ 202 static void 203 limit2(BSet *b1, int k1, int k2, BSet *f) 204 { 205 BSet b2[1]; 206 207 bsinit(b2, ntmp); /* todo, free those */ 208 bscopy(b2, b1); 209 bsinter(b1, mask[0]); 210 bsinter(b2, mask[1]); 211 limit(b1, T.ngpr - k1, f); 212 limit(b2, T.nfpr - k2, f); 213 bsunion(b1, b2); 214 } 215 216 static void 217 sethint(BSet *u, bits r) 218 { 219 int t; 220 221 for (t=Tmp0; bsiter(u, &t); t++) 222 tmp[phicls(t, tmp)].hint.m |= r; 223 } 224 225 /* reloads temporaries in u that are 226 * not in v from their slots 227 */ 228 static void 229 reloads(BSet *u, BSet *v) 230 { 231 int t; 232 233 for (t=Tmp0; bsiter(u, &t); t++) 234 if (!bshas(v, t)) 235 emit(Oload, tmp[t].cls, TMP(t), slot(t), R); 236 } 237 238 static void 239 store(Ref r, int s) 240 { 241 if (s != -1) 242 emit(Ostorew + tmp[r.val].cls, 0, R, r, SLOT(s)); 243 } 244 245 static int 246 regcpy(Ins *i) 247 { 248 return i->op == Ocopy && isreg(i->arg[0]); 249 } 250 251 static Ins * 252 dopm(Blk *b, Ins *i, BSet *v) 253 { 254 int n, t; 255 BSet u[1]; 256 Ins *i1; 257 bits r; 258 259 bsinit(u, ntmp); /* todo, free those */ 260 /* consecutive copies from 261 * registers need to be handled 262 * as one large instruction 263 * 264 * fixme: there is an assumption 265 * that calls are always followed 266 * by copy instructions here, this 267 * might not be true if previous 268 * passes change 269 */ 270 i1 = ++i; 271 do { 272 i--; 273 t = i->to.val; 274 if (!req(i->to, R)) 275 if (bshas(v, t)) { 276 bsclr(v, t); 277 store(i->to, tmp[t].slot); 278 } 279 bsset(v, i->arg[0].val); 280 } while (i != b->ins && regcpy(i-1)); 281 bscopy(u, v); 282 if (i != b->ins && (i-1)->op == Ocall) { 283 v->t[0] &= ~T.retregs((i-1)->arg[1], 0); 284 limit2(v, T.nrsave[0], T.nrsave[1], 0); 285 for (n=0, r=0; T.rsave[n]>=0; n++) 286 r |= BIT(T.rsave[n]); 287 v->t[0] |= T.argregs((i-1)->arg[1], 0); 288 } else { 289 limit2(v, 0, 0, 0); 290 r = v->t[0]; 291 } 292 sethint(v, r); 293 reloads(u, v); 294 do 295 emiti(*--i1); 296 while (i1 != i); 297 return i; 298 } 299 300 static void 301 merge(BSet *u, Blk *bu, BSet *v, Blk *bv) 302 { 303 int t; 304 305 if (bu->loop <= bv->loop) 306 bsunion(u, v); 307 else 308 for (t=0; bsiter(v, &t); t++) 309 if (tmp[t].slot == -1) 310 bsset(u, t); 311 } 312 313 /* spill code insertion 314 * requires spill costs, rpo, liveness 315 * 316 * Note: this will replace liveness 317 * information (in, out) with temporaries 318 * that must be in registers at block 319 * borders 320 * 321 * Be careful with: 322 * - Ocopy instructions to ensure register 323 * constraints 324 */ 325 void 326 spill(Fn *fn) 327 { 328 Blk *b, *s1, *s2, *hd, **bp; 329 int j, l, t, k, lvarg[2]; 330 uint n; 331 BSet u[1], v[1], w[1]; 332 Ins *i; 333 Phi *p; 334 Mem *m; 335 bits r; 336 337 tmp = fn->tmp; 338 ntmp = fn->ntmp; 339 bsinit(u, ntmp); 340 bsinit(v, ntmp); 341 bsinit(w, ntmp); 342 bsinit(mask[0], ntmp); 343 bsinit(mask[1], ntmp); 344 locs = fn->slot; 345 slot4 = 0; 346 slot8 = 0; 347 for (t=0; t<ntmp; t++) { 348 k = 0; 349 if (t >= T.fpr0 && t < T.fpr0 + T.nfpr) 350 k = 1; 351 if (t >= Tmp0) 352 k = KBASE(tmp[t].cls); 353 bsset(mask[k], t); 354 } 355 356 for (bp=&fn->rpo[fn->nblk]; bp!=fn->rpo;) { 357 b = *--bp; 358 /* invariant: all blocks with bigger rpo got 359 * their in,out updated. */ 360 361 /* 1. find temporaries in registers at 362 * the end of the block (put them in v) */ 363 curi = 0; 364 s1 = b->s1; 365 s2 = b->s2; 366 hd = 0; 367 if (s1 && s1->id <= b->id) 368 hd = s1; 369 if (s2 && s2->id <= b->id) 370 if (!hd || s2->id >= hd->id) 371 hd = s2; 372 if (hd) { 373 /* back-edge */ 374 bszero(v); 375 hd->gen->t[0] |= T.rglob; /* don't spill registers */ 376 for (k=0; k<2; k++) { 377 n = k == 0 ? T.ngpr : T.nfpr; 378 bscopy(u, b->out); 379 bsinter(u, mask[k]); 380 bscopy(w, u); 381 bsinter(u, hd->gen); 382 bsdiff(w, hd->gen); 383 if (bscount(u) < n) { 384 j = bscount(w); /* live through */ 385 l = hd->nlive[k]; 386 limit(w, n - (l - j), 0); 387 bsunion(u, w); 388 } else 389 limit(u, n, 0); 390 bsunion(v, u); 391 } 392 } else if (s1) { 393 /* avoid reloading temporaries 394 * in the middle of loops */ 395 bszero(v); 396 liveon(w, b, s1); 397 merge(v, b, w, s1); 398 if (s2) { 399 liveon(u, b, s2); 400 merge(v, b, u, s2); 401 bsinter(w, u); 402 } 403 limit2(v, 0, 0, w); 404 } else { 405 bscopy(v, b->out); 406 if (rtype(b->jmp.arg) == RCall) 407 v->t[0] |= T.retregs(b->jmp.arg, 0); 408 } 409 for (t=Tmp0; bsiter(b->out, &t); t++) 410 if (!bshas(v, t)) 411 slot(t); 412 bscopy(b->out, v); 413 414 /* 2. process the block instructions */ 415 if (rtype(b->jmp.arg) == RTmp) { 416 t = b->jmp.arg.val; 417 assert(KBASE(tmp[t].cls) == 0); 418 lvarg[0] = bshas(v, t); 419 bsset(v, t); 420 bscopy(u, v); 421 limit2(v, 0, 0, NULL); 422 if (!bshas(v, t)) { 423 if (!lvarg[0]) 424 bsclr(u, t); 425 b->jmp.arg = slot(t); 426 } 427 reloads(u, v); 428 } 429 curi = &insb[NIns]; 430 for (i=&b->ins[b->nins]; i!=b->ins;) { 431 i--; 432 if (regcpy(i)) { 433 i = dopm(b, i, v); 434 continue; 435 } 436 bszero(w); 437 if (!req(i->to, R)) { 438 assert(rtype(i->to) == RTmp); 439 t = i->to.val; 440 if (bshas(v, t)) 441 bsclr(v, t); 442 else { 443 /* make sure we have a reg 444 * for the result */ 445 assert(t >= Tmp0 && "dead reg"); 446 bsset(v, t); 447 bsset(w, t); 448 } 449 } 450 j = T.memargs(i->op); 451 for (n=0; n<2; n++) 452 if (rtype(i->arg[n]) == RMem) 453 j--; 454 for (n=0; n<2; n++) 455 switch (rtype(i->arg[n])) { 456 case RMem: 457 t = i->arg[n].val; 458 m = &fn->mem[t]; 459 if (rtype(m->base) == RTmp) { 460 bsset(v, m->base.val); 461 bsset(w, m->base.val); 462 } 463 if (rtype(m->index) == RTmp) { 464 bsset(v, m->index.val); 465 bsset(w, m->index.val); 466 } 467 break; 468 case RTmp: 469 t = i->arg[n].val; 470 lvarg[n] = bshas(v, t); 471 bsset(v, t); 472 if (j-- <= 0) 473 bsset(w, t); 474 break; 475 } 476 bscopy(u, v); 477 limit2(v, 0, 0, w); 478 for (n=0; n<2; n++) 479 if (rtype(i->arg[n]) == RTmp) { 480 t = i->arg[n].val; 481 if (!bshas(v, t)) { 482 /* do not reload if the 483 * argument is dead 484 */ 485 if (!lvarg[n]) 486 bsclr(u, t); 487 i->arg[n] = slot(t); 488 } 489 } 490 reloads(u, v); 491 if (!req(i->to, R)) { 492 t = i->to.val; 493 store(i->to, tmp[t].slot); 494 if (t >= Tmp0) 495 /* in case i->to was a 496 * dead temporary */ 497 bsclr(v, t); 498 } 499 emiti(*i); 500 r = v->t[0]; /* Tmp0 is NBit */ 501 if (r) 502 sethint(v, r); 503 } 504 if (b == fn->start) 505 assert(v->t[0] == (T.rglob | fn->reg)); 506 else 507 assert(v->t[0] == T.rglob); 508 509 for (p=b->phi; p; p=p->link) { 510 assert(rtype(p->to) == RTmp); 511 t = p->to.val; 512 if (bshas(v, t)) { 513 bsclr(v, t); 514 store(p->to, tmp[t].slot); 515 } else if (bshas(b->in, t)) 516 /* only if the phi is live */ 517 p->to = slot(p->to.val); 518 } 519 bscopy(b->in, v); 520 b->nins = &insb[NIns] - curi; 521 idup(&b->ins, curi, b->nins); 522 } 523 524 /* align the locals to a 16 byte boundary */ 525 /* specific to NAlign == 3 */ 526 slot8 += slot8 & 3; 527 fn->slot += slot8; 528 529 if (debug['S']) { 530 fprintf(stderr, "\n> Block information:\n"); 531 for (b=fn->start; b; b=b->link) { 532 fprintf(stderr, "\t%-10s (% 5d) ", b->name, b->loop); 533 dumpts(b->out, fn->tmp, stderr); 534 } 535 fprintf(stderr, "\n> After spilling:\n"); 536 printfn(fn, stderr); 537 } 538 }