mem.c (9306B)
1 #include "all.h" 2 3 typedef struct Range Range; 4 typedef struct Store Store; 5 typedef struct Slot Slot; 6 7 /* require use, maintains use counts */ 8 void 9 promote(Fn *fn) 10 { 11 Blk *b; 12 Ins *i, *l; 13 Tmp *t; 14 Use *u, *ue; 15 int s, k; 16 17 /* promote uniform stack slots to temporaries */ 18 b = fn->start; 19 for (i=b->ins; i<&b->ins[b->nins]; i++) { 20 if (Oalloc > i->op || i->op > Oalloc1) 21 continue; 22 /* specific to NAlign == 3 */ 23 assert(rtype(i->to) == RTmp); 24 t = &fn->tmp[i->to.val]; 25 if (t->ndef != 1) 26 goto Skip; 27 k = -1; 28 s = -1; 29 for (u=t->use; u<&t->use[t->nuse]; u++) { 30 if (u->type != UIns) 31 goto Skip; 32 l = u->u.ins; 33 if (isload(l->op)) 34 if (s == -1 || s == loadsz(l)) { 35 s = loadsz(l); 36 continue; 37 } 38 if (isstore(l->op)) 39 if (req(i->to, l->arg[1]) && !req(i->to, l->arg[0])) 40 if (s == -1 || s == storesz(l)) 41 if (k == -1 || k == optab[l->op].argcls[0][0]) { 42 s = storesz(l); 43 k = optab[l->op].argcls[0][0]; 44 continue; 45 } 46 goto Skip; 47 } 48 /* get rid of the alloc and replace uses */ 49 *i = (Ins){.op = Onop}; 50 t->ndef--; 51 ue = &t->use[t->nuse]; 52 for (u=t->use; u!=ue; u++) { 53 l = u->u.ins; 54 if (isstore(l->op)) { 55 l->cls = k; 56 l->op = Ocopy; 57 l->to = l->arg[1]; 58 l->arg[1] = R; 59 t->nuse--; 60 t->ndef++; 61 } else { 62 if (k == -1) 63 err("slot %%%s is read but never stored to", 64 fn->tmp[l->arg[0].val].name); 65 /* try to turn loads into copies so we 66 * can eliminate them later */ 67 switch(l->op) { 68 case Oloadsw: 69 case Oloaduw: 70 if (k == Kl) 71 goto Extend; 72 /* fall through */ 73 case Oload: 74 if (KBASE(k) != KBASE(l->cls)) 75 l->op = Ocast; 76 else 77 l->op = Ocopy; 78 break; 79 default: 80 Extend: 81 l->op = Oextsb + (l->op - Oloadsb); 82 break; 83 } 84 } 85 } 86 Skip:; 87 } 88 if (debug['M']) { 89 fprintf(stderr, "\n> After slot promotion:\n"); 90 printfn(fn, stderr); 91 } 92 } 93 94 /* [a, b) with 0 <= a */ 95 struct Range { 96 int a, b; 97 }; 98 99 struct Store { 100 int ip; 101 Ins *i; 102 }; 103 104 struct Slot { 105 int t; 106 int sz; 107 bits m; 108 bits l; 109 Range r; 110 Slot *s; 111 Store *st; 112 int nst; 113 }; 114 115 static inline int 116 rin(Range r, int n) 117 { 118 return r.a <= n && n < r.b; 119 } 120 121 static inline int 122 rovlap(Range r0, Range r1) 123 { 124 return r0.b && r1.b && r0.a < r1.b && r1.a < r0.b; 125 } 126 127 static void 128 radd(Range *r, int n) 129 { 130 if (!r->b) 131 *r = (Range){n, n+1}; 132 else if (n < r->a) 133 r->a = n; 134 else if (n >= r->b) 135 r->b = n+1; 136 } 137 138 static int 139 slot(Slot **ps, int64_t *off, Ref r, Fn *fn, Slot *sl) 140 { 141 Alias a; 142 Tmp *t; 143 144 getalias(&a, r, fn); 145 if (a.type != ALoc) 146 return 0; 147 t = &fn->tmp[a.base]; 148 if (t->visit < 0) 149 return 0; 150 *off = a.offset; 151 *ps = &sl[t->visit]; 152 return 1; 153 } 154 155 static void 156 load(Ref r, bits x, int ip, Fn *fn, Slot *sl) 157 { 158 int64_t off; 159 Slot *s; 160 161 if (slot(&s, &off, r, fn, sl)) { 162 s->l |= x << off; 163 s->l &= s->m; 164 if (s->l) 165 radd(&s->r, ip); 166 } 167 } 168 169 static void 170 store(Ref r, bits x, int ip, Ins *i, Fn *fn, Slot *sl) 171 { 172 int64_t off; 173 Slot *s; 174 175 if (slot(&s, &off, r, fn, sl)) { 176 if (s->l) { 177 radd(&s->r, ip); 178 s->l &= ~(x << off); 179 } else { 180 vgrow(&s->st, ++s->nst); 181 s->st[s->nst-1].ip = ip; 182 s->st[s->nst-1].i = i; 183 } 184 } 185 } 186 187 static int 188 scmp(const void *pa, const void *pb) 189 { 190 Slot *a, *b; 191 192 a = (Slot *)pa, b = (Slot *)pb; 193 if (a->sz != b->sz) 194 return b->sz - a->sz; 195 return a->r.a - b->r.a; 196 } 197 198 static void 199 maxrpo(Blk *hd, Blk *b) 200 { 201 if (hd->loop < (int)b->id) 202 hd->loop = b->id; 203 } 204 205 void 206 coalesce(Fn *fn) 207 { 208 Range r, *br; 209 Slot *s, *s0, *sl; 210 Blk *b, **ps, *succ[3]; 211 Ins *i, **bl; 212 Use *u; 213 Tmp *t, *ts; 214 Ref *arg; 215 bits x; 216 int64_t off0, off1; 217 int n, m, ip, sz, nsl, nbl, *stk; 218 uint total, freed, fused; 219 220 /* minimize the stack usage 221 * by coalescing slots 222 */ 223 nsl = 0; 224 sl = vnew(0, sizeof sl[0], PHeap); 225 for (n=Tmp0; n<fn->ntmp; n++) { 226 t = &fn->tmp[n]; 227 t->visit = -1; 228 if (t->alias.type == ALoc) 229 if (t->alias.slot == &t->alias) 230 if (t->bid == fn->start->id) 231 if (t->alias.u.loc.sz != -1) { 232 t->visit = nsl; 233 vgrow(&sl, ++nsl); 234 s = &sl[nsl-1]; 235 s->t = n; 236 s->sz = t->alias.u.loc.sz; 237 s->m = t->alias.u.loc.m; 238 s->s = 0; 239 s->st = vnew(0, sizeof s->st[0], PHeap); 240 s->nst = 0; 241 } 242 } 243 244 /* one-pass liveness analysis */ 245 for (b=fn->start; b; b=b->link) 246 b->loop = -1; 247 loopiter(fn, maxrpo); 248 nbl = 0; 249 bl = vnew(0, sizeof bl[0], PHeap); 250 br = emalloc(fn->nblk * sizeof br[0]); 251 ip = INT_MAX - 1; 252 for (n=fn->nblk-1; n>=0; n--) { 253 b = fn->rpo[n]; 254 succ[0] = b->s1; 255 succ[1] = b->s2; 256 succ[2] = 0; 257 br[n].b = ip--; 258 for (s=sl; s<&sl[nsl]; s++) { 259 s->l = 0; 260 for (ps=succ; *ps; ps++) { 261 m = (*ps)->id; 262 if (m > n && rin(s->r, br[m].a)) { 263 s->l = s->m; 264 radd(&s->r, ip); 265 } 266 } 267 } 268 if (b->jmp.type == Jretc) 269 load(b->jmp.arg, -1, --ip, fn, sl); 270 for (i=&b->ins[b->nins]; i!=b->ins;) { 271 --i; 272 arg = i->arg; 273 if (i->op == Oargc) { 274 load(arg[1], -1, --ip, fn, sl); 275 } 276 if (isload(i->op)) { 277 x = BIT(loadsz(i)) - 1; 278 load(arg[0], x, --ip, fn, sl); 279 } 280 if (isstore(i->op)) { 281 x = BIT(storesz(i)) - 1; 282 store(arg[1], x, ip--, i, fn, sl); 283 } 284 if (i->op == Oblit0) { 285 assert((i+1)->op == Oblit1); 286 assert(rtype((i+1)->arg[0]) == RInt); 287 sz = abs(rsval((i+1)->arg[0])); 288 x = sz >= NBit ? (bits)-1 : BIT(sz) - 1; 289 store(arg[1], x, ip--, i, fn, sl); 290 load(arg[0], x, ip, fn, sl); 291 vgrow(&bl, ++nbl); 292 bl[nbl-1] = i; 293 } 294 } 295 for (s=sl; s<&sl[nsl]; s++) 296 if (s->l) { 297 radd(&s->r, ip); 298 if (b->loop != -1) { 299 assert(b->loop > n); 300 radd(&s->r, br[b->loop].b - 1); 301 } 302 } 303 br[n].a = ip; 304 } 305 free(br); 306 307 /* kill dead stores */ 308 for (s=sl; s<&sl[nsl]; s++) 309 for (n=0; n<s->nst; n++) 310 if (!rin(s->r, s->st[n].ip)) { 311 i = s->st[n].i; 312 if (i->op == Oblit0) 313 *(i+1) = (Ins){.op = Onop}; 314 *i = (Ins){.op = Onop}; 315 } 316 317 /* kill slots with an empty live range */ 318 total = 0; 319 freed = 0; 320 stk = vnew(0, sizeof stk[0], PHeap); 321 n = 0; 322 for (s=s0=sl; s<&sl[nsl]; s++) { 323 total += s->sz; 324 if (!s->r.b) { 325 vfree(s->st); 326 vgrow(&stk, ++n); 327 stk[n-1] = s->t; 328 freed += s->sz; 329 } else 330 *s0++ = *s; 331 } 332 nsl = s0-sl; 333 if (debug['M']) { 334 fputs("\n> Slot coalescing:\n", stderr); 335 if (n) { 336 fputs("\tkill [", stderr); 337 for (m=0; m<n; m++) 338 fprintf(stderr, " %%%s", 339 fn->tmp[stk[m]].name); 340 fputs(" ]\n", stderr); 341 } 342 } 343 while (n--) { 344 t = &fn->tmp[stk[n]]; 345 assert(t->ndef == 1 && t->def); 346 i = t->def; 347 if (isload(i->op)) { 348 i->op = Ocopy; 349 i->arg[0] = UNDEF; 350 continue; 351 } 352 *i = (Ins){.op = Onop}; 353 for (u=t->use; u<&t->use[t->nuse]; u++) { 354 if (u->type == UJmp) { 355 b = fn->rpo[u->bid]; 356 assert(isret(b->jmp.type)); 357 b->jmp.type = Jret0; 358 b->jmp.arg = R; 359 continue; 360 } 361 assert(u->type == UIns); 362 i = u->u.ins; 363 if (!req(i->to, R)) { 364 assert(rtype(i->to) == RTmp); 365 vgrow(&stk, ++n); 366 stk[n-1] = i->to.val; 367 } else if (isarg(i->op)) { 368 assert(i->op == Oargc); 369 i->arg[1] = CON_Z; /* crash */ 370 } else { 371 if (i->op == Oblit0) 372 *(i+1) = (Ins){.op = Onop}; 373 *i = (Ins){.op = Onop}; 374 } 375 } 376 } 377 vfree(stk); 378 379 /* fuse slots by decreasing size */ 380 qsort(sl, nsl, sizeof *sl, scmp); 381 fused = 0; 382 for (n=0; n<nsl; n++) { 383 s0 = &sl[n]; 384 if (s0->s) 385 continue; 386 s0->s = s0; 387 r = s0->r; 388 for (s=s0+1; s<&sl[nsl]; s++) { 389 if (s->s || !s->r.b) 390 goto Skip; 391 if (rovlap(r, s->r)) 392 /* O(n); can be approximated 393 * by 'goto Skip;' if need be 394 */ 395 for (m=n; &sl[m]<s; m++) 396 if (sl[m].s == s0) 397 if (rovlap(sl[m].r, s->r)) 398 goto Skip; 399 radd(&r, s->r.a); 400 radd(&r, s->r.b - 1); 401 s->s = s0; 402 fused += s->sz; 403 Skip:; 404 } 405 } 406 407 /* substitute fused slots */ 408 for (s=sl; s<&sl[nsl]; s++) { 409 t = &fn->tmp[s->t]; 410 /* the visit link is stale, 411 * reset it before the slot() 412 * calls below 413 */ 414 t->visit = s-sl; 415 assert(t->ndef == 1 && t->def); 416 if (s->s == s) 417 continue; 418 *t->def = (Ins){.op = Onop}; 419 ts = &fn->tmp[s->s->t]; 420 assert(t->bid == ts->bid); 421 if (t->def < ts->def) { 422 /* make sure the slot we 423 * selected has a def that 424 * dominates its new uses 425 */ 426 *t->def = *ts->def; 427 *ts->def = (Ins){.op = Onop}; 428 ts->def = t->def; 429 } 430 for (u=t->use; u<&t->use[t->nuse]; u++) { 431 if (u->type == UJmp) { 432 b = fn->rpo[u->bid]; 433 b->jmp.arg = TMP(s->s->t); 434 continue; 435 } 436 assert(u->type == UIns); 437 arg = u->u.ins->arg; 438 for (n=0; n<2; n++) 439 if (req(arg[n], TMP(s->t))) 440 arg[n] = TMP(s->s->t); 441 } 442 } 443 444 /* fix newly overlapping blits */ 445 for (n=0; n<nbl; n++) { 446 i = bl[n]; 447 if (i->op == Oblit0) 448 if (slot(&s, &off0, i->arg[0], fn, sl)) 449 if (slot(&s0, &off1, i->arg[1], fn, sl)) 450 if (s->s == s0->s && off0 < off1) { 451 sz = rsval((i+1)->arg[0]); 452 assert(sz >= 0); 453 (i+1)->arg[0] = INT(-sz); 454 } 455 } 456 vfree(bl); 457 458 if (debug['M']) { 459 for (s0=sl; s0<&sl[nsl]; s0++) { 460 if (s0->s != s0) 461 continue; 462 fprintf(stderr, "\tfuse (% 3db) [", s0->sz); 463 for (s=s0; s<&sl[nsl]; s++) { 464 if (s->s != s0) 465 continue; 466 fprintf(stderr, " %%%s", fn->tmp[s->t].name); 467 if (s->r.b) 468 fprintf(stderr, "[%d,%d)", 469 s->r.a-ip, s->r.b-ip); 470 else 471 fputs("{}", stderr); 472 } 473 fputs(" ]\n", stderr); 474 } 475 fprintf(stderr, "\tsums %u/%u/%u (killed/fused/total)\n\n", 476 freed, fused, total); 477 printfn(fn, stderr); 478 } 479 480 for (s=sl; s<&sl[nsl]; s++) 481 vfree(s->st); 482 vfree(sl); 483 }