great renaming campain! - qbe - Internal scc patchset buffer for QBE

commit 62e238a6ef151d56b79e1f076a57463f2e1fb020
parent 97b58def96d47d937d86849380d8316ddb16bed8
Author: Quentin Carbonneaux <quentin.carbonneaux@yale.edu>
Date:   Fri, 25 Mar 2016 14:02:43 -0400

great renaming campain!

Diffstat:
D lisc/.gitignore  | 5 -----
D lisc/Makefile  | 17 -----------------
D lisc/copy.c  | 159 -------------------------------------------------------------------------------
D lisc/emit.c  | 666 -------------------------------------------------------------------------------
D lisc/isel.c  | 1135 -------------------------------------------------------------------------------
D lisc/live.c  | 174 -------------------------------------------------------------------------------
D lisc/main.c  | 117 -------------------------------------------------------------------------------
D lisc/mem.c  | 81 -------------------------------------------------------------------------------
D lisc/parse.c  | 1081 -------------------------------------------------------------------------------
D lisc/rega.c  | 597 -------------------------------------------------------------------------------
D lisc/spill.c  | 507 -------------------------------------------------------------------------------
D lisc/ssa.c  | 516 -------------------------------------------------------------------------------
D lisc/test/go.sh  | 116 -------------------------------------------------------------------------------
D lisc/tools/abitest.sh  | 104 -------------------------------------------------------------------------------
D lisc/tools/regress.sh  | 17 -----------------
D lisc/util.c  | 329 -------------------------------------------------------------------------------
M minic/mcc  | 2 +-
A src/.gitignore  | 5 +++++
R lisc/.tag -> src/.tag  | 0 
A src/Makefile  | 17 +++++++++++++++++
R lisc/lisc.h -> src/all.h  | 0 
A src/copy.c  | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/emit.c  | 666 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/isel.c  | 1135 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/live.c  | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/main.c  | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/mem.c  | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/parse.c  | 1081 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/rega.c  | 598 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/spill.c  | 507 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/ssa.c  | 516 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R lisc/test/_alt.ssa -> src/test/_alt.ssa  | 0 
R lisc/test/_dragon.ssa -> src/test/_dragon.ssa  | 0 
R lisc/test/_fix1.ssa -> src/test/_fix1.ssa  | 0 
R lisc/test/_fix2.ssa -> src/test/_fix2.ssa  | 0 
R lisc/test/_fix3.ssa -> src/test/_fix3.ssa  | 0 
R lisc/test/_fix4.ssa -> src/test/_fix4.ssa  | 0 
R lisc/test/_live.ssa -> src/test/_live.ssa  | 0 
R lisc/test/_rpo.ssa -> src/test/_rpo.ssa  | 0 
R lisc/test/_spill1.ssa -> src/test/_spill1.ssa  | 0 
R lisc/test/_spill2.ssa -> src/test/_spill2.ssa  | 0 
R lisc/test/_spill3.ssa -> src/test/_spill3.ssa  | 0 
R lisc/test/abi1.ssa -> src/test/abi1.ssa  | 0 
R lisc/test/abi2.ssa -> src/test/abi2.ssa  | 0 
R lisc/test/abi3.ssa -> src/test/abi3.ssa  | 0 
R lisc/test/abi4.ssa -> src/test/abi4.ssa  | 0 
R lisc/test/abi5.ssa -> src/test/abi5.ssa  | 0 
R lisc/test/align.ssa -> src/test/align.ssa  | 0 
R lisc/test/collatz.ssa -> src/test/collatz.ssa  | 0 
R lisc/test/cprime.ssa -> src/test/cprime.ssa  | 0 
R lisc/test/cup.ssa -> src/test/cup.ssa  | 0 
R lisc/test/dark.ssa -> src/test/dark.ssa  | 0 
R lisc/test/double.ssa -> src/test/double.ssa  | 0 
R lisc/test/echo.ssa -> src/test/echo.ssa  | 0 
R lisc/test/eucl.ssa -> src/test/eucl.ssa  | 0 
R lisc/test/euclc.ssa -> src/test/euclc.ssa  | 0 
R lisc/test/fpcnv.ssa -> src/test/fpcnv.ssa  | 0 
A src/test/go.sh  | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R lisc/test/loop.ssa -> src/test/loop.ssa  | 0 
R lisc/test/mandel.ssa -> src/test/mandel.ssa  | 0 
R lisc/test/max.ssa -> src/test/max.ssa  | 0 
R lisc/test/prime.ssa -> src/test/prime.ssa  | 0 
R lisc/test/puts10.ssa -> src/test/puts10.ssa  | 0 
R lisc/test/sum.ssa -> src/test/sum.ssa  | 0 
R lisc/tools/abi.ml -> src/tools/abi.ml  | 0 
A src/tools/abitest.sh  | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R lisc/tools/fptox.c -> src/tools/fptox.c  | 0 
R lisc/tools/pmov.c -> src/tools/pmov.c  | 0 
A src/tools/regress.sh  | 17 +++++++++++++++++
A src/util.c  | 329 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

70 files changed, 5623 insertions(+), 5622 deletions(-)
diff --git a/lisc/.gitignore b/lisc/.gitignore
@@ -1,5 +0,0 @@
-lisc
-doc
-.comfile
-*.o
-*.out
diff --git a/lisc/Makefile b/lisc/Makefile
@@ -1,17 +0,0 @@
-BIN = lisc
-OBJ = main.o util.o parse.o mem.o ssa.o copy.o live.o isel.o spill.o rega.o emit.o
-
-CFLAGS = -Wall -Wextra -std=c99 -g -pedantic
-
-$(BIN): $(OBJ)
-	$(CC) $(LDFLAGS) $(OBJ) -o $@
-
-$(OBJ): lisc.h
-
-.PHONY: clean check syndoc
-clean:
-	rm -f $(BIN) $(OBJ)
-check: $(BIN)
-	test/go.sh all
-syndoc:
-	unison -auto doc ssh://qcar@h/data/d/ssa-doc
diff --git a/lisc/copy.c b/lisc/copy.c
@@ -1,159 +0,0 @@
-#include "lisc.h"
-
-typedef struct RList RList;
-struct RList {
-	int t;
-	RList *l;
-};
-
-static Ref
-copyof(Ref r, Ref *cp)
-{
-	if (rtype(r) == RTmp)
-		return cp[r.val];
-	else
-		return r;
-}
-
-static void
-update(Ref r, Ref rcp, Ref *cp, RList **w)
-{
-	RList *l;
-
-	if (!req(cp[r.val], rcp)) {
-		cp[r.val] = rcp;
-		l = emalloc(sizeof *l);
-		l->t = r.val;
-		l->l = *w;
-		*w = l;
-	}
-}
-
-static void
-visitphi(Phi *p, Ref *cp, RList **w)
-{
-	uint a;
-	Ref r, r1;
-
-	r = R;
-	for (a=0; a<p->narg; a++) {
-		r1 = copyof(p->arg[a], cp);
-		if (req(r1, R))
-			continue;
-		if (req(r, R) || req(r, r1))
-			r = r1;
-		else {
-			r = p->to;
-			break;
-		}
-	}
-	assert(!req(r, R));
-	update(p->to, r, cp, w);
-}
-
-static void
-visitins(Ins *i, Ref *cp, RList **w)
-{
-	Ref r;
-
-	if (i->op == OCopy) {
-		r = copyof(i->arg[0], cp);
-		update(i->to, r, cp, w);
-	} else if (!req(i->to, R)) {
-		assert(rtype(i->to) == RTmp);
-		update(i->to, i->to, cp, w);
-	}
-}
-
-void
-copy(Fn *fn)
-{
-	Blk *b;
-	Ref *cp, r;
-	RList *w, *w1;
-	Use *u, *u1;
-	Ins *i;
-	Phi *p, **pp;
-	uint a;
-	int t;
-
-	w = 0;
-	cp = emalloc(fn->ntmp * sizeof cp[0]);
-	for (b=fn->start; b; b=b->link) {
-		for (p=b->phi; p; p=p->link)
-			visitphi(p, cp, &w);
-		for (i=b->ins; i-b->ins < b->nins; i++)
-			visitins(i, cp, &w);
-	}
-	while ((w1=w)) {
-		t = w->t;
-		w = w->l;
-		free(w1);
-		u = fn->tmp[t].use;
-		u1 = u + fn->tmp[t].nuse;
-		for (; u<u1; u++)
-			switch (u->type) {
-			default:
-				diag("copy: invalid use");
-			case UPhi:
-				visitphi(u->u.phi, cp, &w);
-				break;
-			case UIns:
-				visitins(u->u.ins, cp, &w);
-				break;
-			case UJmp:
-				break;
-			}
-	}
-	for (b=fn->start; b; b=b->link) {
-		for (pp=&b->phi; (p=*pp);) {
-			r = cp[p->to.val];
-			if (!req(r, p->to)) {
-				*pp = p->link;
-				continue;
-			}
-			for (a=0; a<p->narg; a++)
-				if (rtype(p->arg[a]) == RTmp) {
-					r = cp[p->arg[a].val];
-					assert(!req(r, R));
-					p->arg[a] = r;
-				}
-			pp=&p->link;
-		}
-		for (i=b->ins; i-b->ins < b->nins; i++) {
-			r = cp[i->to.val];
-			if (!req(r, i->to)) {
-				*i = (Ins){.op = ONop};
-				continue;
-			}
-			for (a=0; a<2; a++)
-				if (rtype(i->arg[a]) == RTmp) {
-					r = cp[i->arg[a].val];
-					assert(!req(r, R));
-					i->arg[a] = r;
-				}
-		}
-		if (rtype(b->jmp.arg) == RTmp) {
-			r = cp[b->jmp.arg.val];
-			assert(!req(r, R));
-			b->jmp.arg = r;
-		}
-	}
-	if (debug['C']) {
-		fprintf(stderr, "\n> Copy information:");
-		for (t=Tmp0; t<fn->ntmp; t++) {
-			if (req(cp[t], R)) {
-				fprintf(stderr, "\n%10s not seen!",
-					fn->tmp[t].name);
-			}
-			else if (!req(cp[t], TMP(t))) {
-				fprintf(stderr, "\n%10s copy of ",
-					fn->tmp[t].name);
-				printref(cp[t], fn, stderr);
-			}
-		}
-		fprintf(stderr, "\n\n> After copy elimination:\n");
-		printfn(fn, stderr);
-	}
-	free(cp);
-}
diff --git a/lisc/emit.c b/lisc/emit.c
@@ -1,666 +0,0 @@
-#include "lisc.h"
-
-enum {
-	SLong = 0,
-	SWord = 1,
-	SShort = 2,
-	SByte = 3,
-
-	Ki = -1, /* matches Kw and Kl */
-	Ka = -2, /* matches all classes */
-};
-
-/* Instruction format strings:
- *
- * if the format string starts with -, the instruction
- * is assumed to be 3-address and is put in 2-address
- * mode using an extra mov if necessary
- *
- * if the format string starts with +, the same as the
- * above applies, but commutativity is also assumed
- *
- * %k  is used to set the class of the instruction,
- *     it'll expand to "l", "q", "ss", "sd", depending
- *     on the instruction class
- * %0  designates the first argument
- * %1  designates the second argument
- * %=  designates the result
- *
- * if %k is not used, a prefix to 0, 1, or = must be
- * added, it can be:
- *   M - memory reference
- *   L - long  (64 bits)
- *   W - word  (32 bits)
- *   H - short (16 bits)
- *   B - byte  (8 bits)
- *   S - single precision float
- *   D - double precision float
- */
-static struct {
-	short op;
-	short cls;
-	char *asm;
-} omap[] = {
-	{ OAdd,    Ka, "+add%k %1, %=" },
-	{ OSub,    Ka, "-sub%k %1, %=" },
-	{ OAnd,    Ki, "+and%k %1, %=" },
-	{ OOr,     Ki, "+or%k %1, %=" },
-	{ OXor,    Ki, "+xor%k %1, %=" },
-	{ OSar,    Ki, "-sar%k %B1, %=" },
-	{ OShr,    Ki, "-shr%k %B1, %=" },
-	{ OShl,    Ki, "-shl%k %B1, %=" },
-	{ OMul,    Ki, "+imul%k %1, %=" },
-	{ OMul,    Ks, "+mulss %1, %=" }, /* fixme */
-	{ OMul,    Kd, "+mulsd %1, %=" },
-	{ ODiv,    Ka, "-div%k %1, %=" },
-	{ OStorel, Ka, "movq %L0, %M1" },
-	{ OStorew, Ka, "movl %W0, %M1" },
-	{ OStoreh, Ka, "movw %H0, %M1" },
-	{ OStoreb, Ka, "movb %B0, %M1" },
-	{ OStores, Ka, "movss %S0, %M1" },
-	{ OStored, Ka, "movsd %D0, %M1" },
-	{ OLoad,   Ka, "mov%k %M0, %=" },
-	{ OLoadsw, Kl, "movslq %M0, %L=" },
-	{ OLoadsw, Kw, "movl %M0, %W=" },
-	{ OLoaduw, Ki, "movl %M0, %W=" },
-	{ OLoadsh, Ki, "movsw%k %M0, %=" },
-	{ OLoaduh, Ki, "movzw%k %M0, %=" },
-	{ OLoadsb, Ki, "movsb%k %M0, %=" },
-	{ OLoadub, Ki, "movzb%k %M0, %=" },
-	{ OExtsw,  Kl, "movslq %W0, %L=" },
-	{ OExtuw,  Kl, "movl %W0, %W=" },
-	{ OExtsh,  Ki, "movsw%k %H0, %=" },
-	{ OExtuh,  Ki, "movzw%k %H0, %=" },
-	{ OExtsb,  Ki, "movsb%k %B0, %=" },
-	{ OExtub,  Ki, "movzb%k %B0, %=" },
-
-	{ OExts,   Kd, "cvtss2sd %0, %=" },  /* see if factorization is possible */
-	{ OTruncd, Ks, "cvttsd2ss %0, %=" },
-	{ OFtosi,  Kw, "cvttss2si %0, %=" },
-	{ OFtosi,  Kl, "cvttsd2si %0, %=" },
-	{ OSitof,  Ks, "cvtsi2ss %W0, %=" },
-	{ OSitof,  Kd, "cvtsi2sd %L0, %=" },
-	{ OCast,   Ki, "movq %D0, %L=" },
-	{ OCast,   Ka, "movq %L0, %D=" },
-
-	{ OAddr,   Ki, "lea%k %M0, %=" },
-	{ OSwap,   Ki, "xchg%k %0, %1" },
-	{ OSign,   Kl, "cqto" },
-	{ OSign,   Kw, "cltd" },
-	{ OXDiv,   Ki, "div%k %0" },
-	{ OXIDiv,  Ki, "idiv%k %0" },
-	{ OXCmp,   Ks, "comiss %S0, %S1" },  /* fixme, Kf */
-	{ OXCmp,   Kd, "comisd %D0, %D1" },
-	{ OXCmp,   Ki, "cmp%k %0, %1" },
-	{ OXTest,  Ki, "test%k %0, %1" },
-	{ OXSet+ICeq,  Ki, "setz %B=\n\tmovzb%k %B=, %=" },
-	{ OXSet+ICsle, Ki, "setle %B=\n\tmovzb%k %B=, %=" },
-	{ OXSet+ICslt, Ki, "setl %B=\n\tmovzb%k %B=, %=" },
-	{ OXSet+ICsgt, Ki, "setg %B=\n\tmovzb%k %B=, %=" },
-	{ OXSet+ICsge, Ki, "setge %B=\n\tmovzb%k %B=, %=" },
-	{ OXSet+ICne,  Ki, "setnz %B=\n\tmovzb%k %B=, %=" },
-	{ OXSet+ICXnp, Ki, "setnp %B=\n\tmovsb%k %B=, %=" },
-	{ OXSet+ICXp,  Ki, "setp %B=\n\tmovsb%k %B=, %=" },
-	{ NOp, 0, 0 }
-};
-
-static char *rname[][4] = {
-	[RAX] = {"rax", "eax", "ax", "al"},
-	[RBX] = {"rbx", "ebx", "bx", "bl"},
-	[RCX] = {"rcx", "ecx", "cx", "cl"},
-	[RDX] = {"rdx", "edx", "dx", "dl"},
-	[RSI] = {"rsi", "esi", "si", "sil"},
-	[RDI] = {"rdi", "edi", "di", "dil"},
-	[RBP] = {"rbp", "ebp", "bp", "bpl"},
-	[RSP] = {"rsp", "esp", "sp", "spl"},
-	[R8 ] = {"r8" , "r8d", "r8w", "r8b"},
-	[R9 ] = {"r9" , "r9d", "r9w", "r9b"},
-	[R10] = {"r10", "r10d", "r10w", "r10b"},
-	[R11] = {"r11", "r11d", "r11w", "r11b"},
-	[R12] = {"r12", "r12d", "r12w", "r12b"},
-	[R13] = {"r13", "r13d", "r13w", "r13b"},
-	[R14] = {"r14", "r14d", "r14w", "r14b"},
-	[R15] = {"r15", "r15d", "r15w", "r15b"},
-};
-
-
-static int
-slot(int s, Fn *fn)
-{
-	struct { int i:14; } x;
-
-	/* sign extend s using a bitfield */
-	x.i = s;
-	assert(NAlign == 3);
-	if (x.i < 0)
-		return -4 * x.i;
-	else {
-		assert(fn->slot >= x.i);
-		return -4 * (fn->slot - x.i);
-	}
-}
-
-static void
-emitcon(Con *con, FILE *f)
-{
-	switch (con->type) {
-	default:
-		diag("emit: invalid constant");
-	case CAddr:
-		fputs(con->label, f);
-		if (con->bits.i)
-			fprintf(f, "%+"PRId64, con->bits.i);
-		break;
-	case CBits:
-		fprintf(f, "%"PRId64, con->bits.i);
-		break;
-	}
-}
-
-static char *
-regtoa(int reg, int sz)
-{
-	static char buf[6];
-
-	if (reg >= XMM0) {
-		sprintf(buf, "xmm%d", reg-XMM0);
-		return buf;
-	} else
-		return rname[reg][sz];
-}
-
-static Ref
-getarg(char c, Ins *i)
-{
-	switch (c) {
-	default:
-		diag("emit: 0, 1, = expected in format");
-	case '0':
-		return i->arg[0];
-	case '1':
-		return i->arg[1];
-	case '=':
-		return i->to;
-	}
-}
-
-static void emitins(Ins, Fn *, FILE *);
-
-static void
-emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f)
-{
-	Ins icp;
-
-	icp.op = OCopy;
-	icp.arg[0] = r2;
-	icp.to = r1;
-	icp.cls = k;
-	emitins(icp, fn, f);
-}
-
-static void
-emitf(char *s, Ins *i, Fn *fn, FILE *f)
-{
-	static char clstoa[][3] = {"l", "q", "ss", "sd"};
-	char c;
-	int sz;
-	Ref ref;
-	Mem *m;
-	Con off;
-
-	switch (*s) {
-	case '+':
-		if (req(i->arg[1], i->to)) {
-			ref = i->arg[0];
-			i->arg[0] = i->arg[1];
-			i->arg[1] = ref;
-		}
-		/* fall through */
-	case '-':
-		if (req(i->arg[1], i->to) && !req(i->arg[0], i->to))
-			diag("emit: cannot convert to 2-address");
-		emitcopy(i->to, i->arg[0], i->cls, fn, f);
-		s++;
-		break;
-	}
-
-	fputc('\t', f);
-Next:
-	while ((c = *s++) != '%')
-		if (!c) {
-			fputc('\n', f);
-			return;
-		} else
-			fputc(c, f);
-	switch ((c = *s++)) {
-	default:
-		diag("emit: invalid escape");
-	case '%':
-		fputc('%', f);
-		break;
-	case 'k':
-		fputs(clstoa[i->cls], f);
-		break;
-	case '0':
-	case '1':
-	case '=':
-		sz = KWIDE(i->cls) ? SLong : SWord;
-		s--;
-		/* fall through */
-	case 'D':
-	case 'S':
-	Ref:
-		c = *s++;
-		ref = getarg(c, i);
-		switch (rtype(ref)) {
-		default:
-			diag("emit: invalid reference");
-		case RTmp:
-			assert(isreg(ref));
-			fprintf(f, "%%%s", regtoa(ref.val, sz));
-			break;
-		case RSlot:
-			fprintf(f, "%d(%%rbp)", slot(ref.val, fn));
-			break;
-		case RAMem:
-		Mem:
-			m = &fn->mem[ref.val & AMask];
-			if (rtype(m->base) == RSlot) {
-				off.type = CBits;
-				off.bits.i = slot(m->base.val, fn);
-				addcon(&m->offset, &off);
-				m->base = TMP(RBP);
-			}
-			if (m->offset.type != CUndef)
-				emitcon(&m->offset, f);
-			if (req(m->base, R) && req(m->index, R))
-				break;
-			fputc('(', f);
-			if (!req(m->base, R))
-				fprintf(f, "%%%s", regtoa(m->base.val, SLong));
-			if (!req(m->index, R))
-				fprintf(f, ", %%%s, %d",
-					regtoa(m->index.val, SLong),
-					m->scale
-				);
-			fputc(')', f);
-			break;
-		case RCon:
-			fputc('$', f);
-			emitcon(&fn->con[ref.val], f);
-			break;
-		}
-		break;
-	case 'L':
-		sz = SLong;
-		goto Ref;
-	case 'W':
-		sz = SWord;
-		goto Ref;
-	case 'H':
-		sz = SShort;
-		goto Ref;
-	case 'B':
-		sz = SByte;
-		goto Ref;
-	case 'M':
-		c = *s++;
-		ref = getarg(c, i);
-		switch (rtype(ref)) {
-		default:
-			diag("emit: invalid memory reference");
-		case RAMem:
-			goto Mem;
-		case RSlot:
-			fprintf(f, "%d(%%rbp)", slot(ref.val, fn));
-			break;
-		case RCon:
-			emitcon(&fn->con[ref.val], f);
-			fprintf(f, "(%%rip)");
-			break;
-		case RTmp:
-			assert(isreg(ref));
-			fprintf(f, "(%%%s)", regtoa(ref.val, SLong));
-			break;
-		}
-		break;
-	}
-	goto Next;
-}
-
-static void
-emitins(Ins i, Fn *fn, FILE *f)
-{
-	Ref r;
-	int64_t val;
-	int o;
-
-	switch (i.op) {
-	default:
-	Table:
-		/* most instructions are just pulled out of
-		 * the table omap[], some special cases are
-		 * detailed below */
-		for (o=0;; o++) {
-			/* this linear search should really be a binary
-			 * search */
-			if (omap[o].op == NOp)
-				diag("emit: no entry found for instruction");
-			if (omap[o].op == i.op)
-			if (omap[o].cls == i.cls
-			|| (omap[o].cls == Ki && KBASE(i.cls) == 0)
-			|| (omap[o].cls == Ka))
-				break;
-		}
-		emitf(omap[o].asm, &i, fn, f);
-		break;
-	case ONop:
-		/* just do nothing for nops, they are inserted
-		 * by some passes */
-		break;
-	case OMul:
-		/* here, we try to use the 3-addresss form
-		 * of multiplication when possible */
-		if (rtype(i.arg[1]) == RCon) {
-			r = i.arg[0];
-			i.arg[0] = i.arg[1];
-			i.arg[1] = r;
-		}
-		if (KBASE(i.cls) == 0 /* only available for ints */
-		&& rtype(i.arg[0]) == RCon
-		&& rtype(i.arg[1]) == RTmp) {
-			emitf("imul%k %0, %1, %=", &i, fn, f);
-			break;
-		}
-		goto Table;
-	case OSub:
-		/* we have to use the negation trick to handle
-		 * some 3-address substractions */
-		if (req(i.to, i.arg[1])) {
-			emitf("neg%k %=", &i, fn, f);
-			emitf("add%k %0, %=", &i, fn, f);
-			break;
-		}
-		goto Table;
-	case OCopy:
-		/* make sure we don't emit useless copies,
-		 * also, we can use a trick to load 64-bits
-		 * registers, it's detailed in my note below
-		 * http://c9x.me/art/notes.html?09/19/2015 */
-		if (req(i.to, R) || req(i.arg[0], R))
-			break;
-		if (isreg(i.to)
-		&& rtype(i.arg[0]) == RCon
-		&& i.cls == Kl
-		&& fn->con[i.arg[0].val].type == CBits
-		&& (val = fn->con[i.arg[0].val].bits.i) >= 0
-		&& val <= UINT32_MAX) {
-			emitf("movl %W0, %W=", &i, fn, f);
-		} else if (!req(i.arg[0], i.to))
-			emitf("mov%k %0, %=", &i, fn, f);
-		break;
-	case OCall:
-		/* calls simply have a weird syntax in AT&T
-		 * assembly... */
-		switch (rtype(i.arg[0])) {
-		default:
-			diag("emit: invalid call instruction");
-		case RCon:
-			fprintf(f, "\tcallq ");
-			emitcon(&fn->con[i.arg[0].val], f);
-			fprintf(f, "\n");
-			break;
-		case RTmp:
-			emitf("callq *%L0", &i, fn, f);
-			break;
-		}
-		break;
-	case OSAlloc:
-		/* there is no good reason why this is here
-		 * maybe we should split OSAlloc in 2 different
-		 * instructions depending on the result
-		 */
-		emitf("subq %L0, %%rsp", &i, fn, f);
-		if (!req(i.to, R))
-			emitcopy(i.to, TMP(RSP), Kl, fn, f);
-		break;
-	case OSwap:
-		if (KBASE(i.cls) == 0)
-			goto Table;
-		/* for floats, there is no swap instruction
-		 * so we use xmm15 as a temporary
-		 */
-		emitcopy(TMP(XMM0+15), i.arg[0], i.cls, fn, f);
-		emitcopy(i.arg[0], i.arg[1], i.cls, fn, f);
-		emitcopy(i.arg[1], TMP(XMM0+15), i.cls, fn, f);
-		break;
-	}
-}
-
-static int
-cneg(int cmp)
-{
-	switch (cmp) {
-	default:   diag("emit: cneg() unhandled comparison");
-	case ICule: return ICugt;
-	case ICult: return ICuge;
-	case ICsle: return ICsgt;
-	case ICslt: return ICsge;
-	case ICsgt: return ICsle;
-	case ICsge: return ICslt;
-	case ICugt: return ICule;
-	case ICuge: return ICult;
-	case ICeq:  return ICne;
-	case ICne:  return ICeq;
-	case ICXnp: return ICXp;
-	case ICXp:  return ICXnp;
-	}
-}
-
-static int
-framesz(Fn *fn)
-{
-	int i, o, f;
-
-	assert(NAlign == 3);
-	for (i=0, o=0; i<NRClob; i++)
-		o ^= 1 & (fn->reg >> rclob[i]);
-	f = fn->slot;
-	f = (f + 3) & -4;
-	return 4*f + 8*o;
-}
-
-void
-emitfn(Fn *fn, FILE *f)
-{
-	static char *ctoa[] = {
-		[ICeq]  = "z",
-		[ICule] = "be",
-		[ICult] = "b",
-		[ICsle] = "le",
-		[ICslt] = "l",
-		[ICsgt] = "g",
-		[ICsge] = "ge",
-		[ICugt] = "a",
-		[ICuge] = "ae",
-		[ICne]  = "nz",
-		[ICXnp] = "np",
-		[ICXp]  = "p"
-	};
-	Blk *b, *s;
-	Ins *i, itmp;
-	int *r, c, fs;
-
-	fprintf(f,
-		".text\n"
-		".globl %s\n"
-		".type %s, @function\n"
-		"%s:\n"
-		"\tpush %%rbp\n"
-		"\tmov %%rsp, %%rbp\n",
-		fn->name, fn->name, fn->name
-	);
-	fs = framesz(fn);
-	if (fs)
-		fprintf(f, "\tsub $%d, %%rsp\n", fs);
-	for (r=rclob; r-rclob < NRClob; r++)
-		if (fn->reg & BIT(*r)) {
-			itmp.arg[0] = TMP(*r);
-			emitf("pushq %L0", &itmp, fn, f);
-		}
-
-	for (b=fn->start; b; b=b->link) {
-		fprintf(f, ".L%s:\n", b->name);
-		for (i=b->ins; i!=&b->ins[b->nins]; i++)
-			emitins(*i, fn, f);
-		switch (b->jmp.type) {
-		case JRet0:
-			for (r=&rclob[NRClob]; r>rclob;)
-				if (fn->reg & BIT(*--r)) {
-					itmp.arg[0] = TMP(*r);
-					emitf("popq %L0", &itmp, fn, f);
-				}
-			fprintf(f,
-				"\tleave\n"
-				"\tret\n"
-			);
-			break;
-		case JJmp:
-			if (b->s1 != b->link)
-				fprintf(f, "\tjmp .L%s\n", b->s1->name);
-			break;
-		default:
-			c = b->jmp.type - JXJc;
-			if (0 <= c && c <= NXICmp) {
-				if (b->link == b->s2) {
-					s = b->s1;
-				} else if (b->link == b->s1) {
-					c = cneg(c);
-					s = b->s2;
-				} else
-					diag("emit: unhandled jump (1)");
-				fprintf(f, "\tj%s .L%s\n", ctoa[c], s->name);
-				break;
-			}
-			diag("emit: unhandled jump (2)");
-		}
-	}
-
-}
-
-void
-emitdat(Dat *d, FILE *f)
-{
-	static int align;
-	static char *dtoa[] = {
-		[DAlign] = ".align",
-		[DB] = "\t.byte",
-		[DH] = "\t.value",
-		[DW] = "\t.long",
-		[DL] = "\t.quad"
-	};
-
-	switch (d->type) {
-	case DStart:
-		align = 0;
-		fprintf(f, ".data\n");
-		break;
-	case DEnd:
-		break;
-	case DName:
-		if (!align)
-			fprintf(f, ".align 8\n");
-		fprintf(f,
-			".globl %s\n"
-			".type %s, @object\n"
-			"%s:\n",
-			d->u.str, d->u.str, d->u.str
-		);
-		break;
-	case DZ:
-		fprintf(f, "\t.fill %"PRId64",1,0\n", d->u.num);
-		break;
-	default:
-		if (d->type == DAlign)
-			align = 1;
-
-		if (d->isstr) {
-			if (d->type != DB)
-				err("strings only supported for 'b' currently");
-			fprintf(f, "\t.ascii \"%s\"\n", d->u.str);
-		}
-		else if (d->isref) {
-			fprintf(f, "%s %s%+"PRId64"\n",
-				dtoa[d->type], d->u.ref.nam,
-				d->u.ref.off);
-		}
-		else {
-			fprintf(f, "%s %"PRId64"\n",
-				dtoa[d->type], d->u.num);
-		}
-		break;
-	}
-}
-
-typedef struct FBits FBits;
-
-struct FBits {
-	int64_t bits;
-	int wide;
-	FBits *link;
-};
-
-static FBits *stash;
-
-int
-stashfp(int64_t n, int w)
-{
-	FBits **pb, *b;
-	int i;
-
-	/* does a dumb de-dup of fp constants
-	 * this should be the linker's job */
-	for (pb=&stash, i=0; (b=*pb); pb=&b->link, i++)
-		if (n == b->bits && w == b->wide)
-			return i;
-	b = emalloc(sizeof *b);
-	b->bits = n;
-	b->wide = w;
-	b->link = 0;
-	*pb = b;
-	return i;
-}
-
-void
-emitfin(FILE *f)
-{
-	FBits *b;
-	int i;
-
-	if (!stash)
-		return;
-	fprintf(f, "/* floating point constants */\n");
-	fprintf(f, ".data\n.align 8\n");
-	for (b=stash, i=0; b; b=b->link, i++)
-		if (b->wide)
-			fprintf(f,
-				".Lfp%d:\n"
-				"\t.quad %"PRId64
-				" /* %f */\n",
-				i, b->bits,
-				*(double *)&b->bits
-			);
-	for (b=stash, i=0; b; b=b->link, i++)
-		if (!b->wide)
-			fprintf(f,
-				".Lfp%d:\n"
-				"\t.long %"PRId64
-				" /* %lf */\n",
-				i, b->bits & 0xffffffff,
-				*(float *)&b->bits
-			);
-	while ((b=stash)) {
-		stash = b->link;
-		free(b);
-	}
-}
diff --git a/lisc/isel.c b/lisc/isel.c
@@ -1,1135 +0,0 @@
-#include "lisc.h"
-#include <limits.h>
-
-/* For x86_64, do the following:
- *
- * - lower calls
- * - check that constants are used only in
- *   places allowed
- * - ensure immediates always fit in 32b
- * - explicit machine register contraints
- *   on instructions like division.
- * - implement fast locals (the streak of
- *   constant allocX in the first basic block)
- * - recognize complex addressing modes
- *
- * Invariant: the use counts that are used
- *            in sel() must be sound.  This
- *            is not so trivial, maybe the
- *            dce should be moved out...
- */
-
-typedef struct ANum ANum;
-typedef struct AClass AClass;
-typedef struct RAlloc RAlloc;
-
-struct ANum {
-	char n, l, r;
-	Ins *i;
-	Ref mem;
-};
-
-static void amatch(Addr *, Ref, ANum *, Fn *, int);
-
-static int
-fcmptoi(int fc)
-{
-	switch (fc) {
-	default:   diag("isel: fcmptoi defaulted");
-	case FCle: return ICule;
-	case FClt: return ICult;
-	case FCgt: return ICugt;
-	case FCge: return ICuge;
-	case FCne: return ICne;
-	case FCeq: return ICeq;
-	case FCo:  return ICXnp;
-	case FCuo: return ICXp;
-	}
-}
-
-static int
-iscmp(int op, int *pk, int *pc)
-{
-	int k, c;
-
-	if (OCmpw <= op && op <= OCmpw1) {
-		c = op - OCmpw;
-		k = Kw;
-	}
-	else if (OCmpl <= op && op <= OCmpl1) {
-		c = op - OCmpl;
-		k = Kl;
-	}
-	else if (OCmps <= op && op <= OCmps1) {
-		c = fcmptoi(op - OCmps);
-		k = Ks;
-	}
-	else if (OCmpd <= op && op <= OCmpd1) {
-		c = fcmptoi(op - OCmpd);
-		k = Kd;
-	}
-	else
-		return 0;
-	if (pk)
-		*pk = k;
-	if (pc)
-		*pc = c;
-	return 1;
-}
-
-static int
-noimm(Ref r, Fn *fn)
-{
-	int64_t val;
-
-	if (rtype(r) != RCon)
-		return 0;
-	switch (fn->con[r.val].type) {
-	default:
-		diag("isel: invalid constant");
-	case CAddr:
-		/* we only support the 'small'
-		 * code model of the ABI, this
-		 * means that we can always
-		 * address data with 32bits
-		 */
-		return 0;
-	case CBits:
-		val = fn->con[r.val].bits.i;
-		return (val < INT32_MIN || val > INT32_MAX);
-	}
-}
-
-static int
-rslot(Ref r, Fn *fn)
-{
-	if (rtype(r) != RTmp)
-		return -1;
-	return fn->tmp[r.val].slot;
-}
-
-static int
-argcls(Ins *i, int n)
-{
-	return opdesc[i->op].argcls[n][i->cls];
-}
-
-static void
-fixarg(Ref *r, int k, int phi, Fn *fn)
-{
-	Addr a;
-	Ref r0, r1;
-	int s, n;
-
-	r1 = r0 = *r;
-	s = rslot(r0, fn);
-	if (KBASE(k) == 1 && rtype(r0) == RCon) {
-		/* load floating points from memory
-		 * slots, they can't be used as
-		 * immediates
-		 */
-		r1 = MEM(fn->nmem);
-		vgrow(&fn->mem, ++fn->nmem);
-		memset(&a, 0, sizeof a);
-		a.offset.type = CAddr;
-		n = stashfp(fn->con[r0.val].bits.i, KWIDE(k));
-		sprintf(a.offset.label, ".Lfp%d", n);
-		fn->mem[fn->nmem-1] = a;
-	}
-	else if (!phi && k == Kl && noimm(r0, fn)) {
-		/* load constants that do not fit in
-		 * a 32bit signed integer into a
-		 * long temporary
-		 */
-		r1 = newtmp("isel", Kl, fn);
-		emit(OCopy, Kl, r1, r0, R);
-	}
-	else if (s != -1) {
-		/* load fast locals' addresses into
-		 * temporaries right before the
-		 * instruction
-		 */
-		r1 = newtmp("isel", Kl, fn);
-		emit(OAddr, Kl, r1, SLOT(s), R);
-	}
-	*r = r1;
-}
-
-static void
-chuse(Ref r, int du, Fn *fn)
-{
-	if (rtype(r) == RTmp)
-		fn->tmp[r.val].nuse += du;
-}
-
-static void
-seladdr(Ref *r, ANum *an, Fn *fn)
-{
-	Addr a;
-	Ref r0, r1;
-
-	r0 = *r;
-	if (rtype(r0) == RTmp) {
-		chuse(r0, -1, fn);
-		r1 = an[r0.val].mem;
-		if (req(r1, R)) {
-			amatch(&a, r0, an, fn, 1);
-			vgrow(&fn->mem, ++fn->nmem);
-			fn->mem[fn->nmem-1] = a;
-			r1 = MEM(fn->nmem-1);
-			chuse(a.base, +1, fn);
-			chuse(a.index, +1, fn);
-			if (rtype(a.base) != RTmp)
-			if (rtype(a.index) != RTmp)
-				an[r0.val].mem = r1;
-		}
-		*r = r1;
-	}
-}
-
-static void
-selcmp(Ref arg[2], int k, Fn *fn)
-{
-	Ref r;
-
-	if (rtype(arg[0]) == RCon) {
-		r = arg[1];
-		arg[1] = arg[0];
-		arg[0] = r;
-	}
-	assert(rtype(arg[0]) != RCon);
-	emit(OXCmp, k, R, arg[1], arg[0]);
-	fixarg(&curi->arg[0], k, 0, fn);
-}
-
-static void
-sel(Ins i, ANum *an, Fn *fn)
-{
-	Ref r0, r1;
-	int x, k, kc;
-	int64_t val;
-	Ins *i0;
-
-	if (rtype(i.to) == RTmp)
-	if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
-	if (fn->tmp[i.to.val].nuse == 0) {
-		chuse(i.arg[0], -1, fn);
-		chuse(i.arg[1], -1, fn);
-		return;
-	}
-	i0 = curi;
-	k = i.cls;
-	switch (i.op) {
-	case ODiv:
-	case ORem:
-	case OUDiv:
-	case OURem:
-		if (i.op == ODiv || i.op == OUDiv)
-			r0 = TMP(RAX), r1 = TMP(RDX);
-		else
-			r0 = TMP(RDX), r1 = TMP(RAX);
-		emit(OCopy, k, i.to, r0, R);
-		emit(OCopy, k, R, r1, R);
-		if (rtype(i.arg[1]) == RCon) {
-			/* immediates not allowed for
-			 * divisions in x86
-			 */
-			r0 = newtmp("isel", k, fn);
-		} else
-			r0 = i.arg[1];
-		if (i.op == ODiv || i.op == ORem) {
-			emit(OXIDiv, k, R, r0, R);
-			emit(OSign, k, TMP(RDX), TMP(RAX), R);
-		} else {
-			emit(OXDiv, k, R, r0, R);
-			emit(OCopy, k, TMP(RDX), CON_Z, R);
-		}
-		emit(OCopy, k, TMP(RAX), i.arg[0], R);
-		if (rtype(i.arg[1]) == RCon)
-			emit(OCopy, k, r0, i.arg[1], R);
-		break;
-	case OSar:
-	case OShr:
-	case OShl:
-		if (rtype(i.arg[1]) == RCon)
-			goto Emit;
-		r0 = i.arg[1];
-		i.arg[1] = TMP(RCX);
-		emit(OCopy, Kw, R, TMP(RCX), R);
-		emiti(i);
-		emit(OCopy, Kw, TMP(RCX), r0, R);
-		break;
-	case ONop:
-		break;
-	case OStored:
-	case OStores:
-	case OStorel:
-	case OStorew:
-	case OStoreh:
-	case OStoreb:
-		if (rtype(i.arg[0]) == RCon) {
-			if (i.op == OStored)
-				i.op = OStorel;
-			if (i.op == OStores)
-				i.op = OStorew;
-		}
-		seladdr(&i.arg[1], an, fn);
-		goto Emit;
-	case_OLoad:
-		seladdr(&i.arg[0], an, fn);
-		goto Emit;
-	case OCall:
-	case OSAlloc:
-	case OCopy:
-	case OAdd:
-	case OSub:
-	case OMul:
-	case OAnd:
-	case OOr:
-	case OXor:
-	case OXTest:
-	case OFtosi:
-	case OSitof:
-	case OExts:
-	case OTruncd:
-	case OCast:
-	case_OExt:
-Emit:
-		emiti(i);
-		fixarg(&curi->arg[0], argcls(curi, 0), 0, fn);
-		fixarg(&curi->arg[1], argcls(curi, 1), 0, fn);
-		break;
-	case OAlloc:
-	case OAlloc+1:
-	case OAlloc+2: /* == OAlloc1 */
-		/* we need to make sure
-		 * the stack remains aligned
-		 * (rsp = 0) mod 16
-		 */
-		if (rtype(i.arg[0]) == RCon) {
-			assert(fn->con[i.arg[0].val].type == CBits);
-			val = fn->con[i.arg[0].val].bits.i;
-			val = (val + 15)  & ~INT64_C(15);
-			if (val < 0 || val > INT32_MAX)
-				diag("isel: alloc too large");
-			emit(OSAlloc, Kl, i.to, getcon(val, fn), R);
-		} else {
-			/* r0 = (i.arg[0] + 15) & -16 */
-			r0 = newtmp("isel", Kl, fn);
-			r1 = newtmp("isel", Kl, fn);
-			emit(OSAlloc, Kl, i.to, r0, R);
-			emit(OAnd, Kl, r0, r1, getcon(-16, fn));
-			emit(OAdd, Kl, r1, i.arg[0], getcon(15, fn));
-		}
-		break;
-	default:
-		if (isext(i.op))
-			goto case_OExt;
-		if (isload(i.op))
-			goto case_OLoad;
-		if (iscmp(i.op, &kc, &x)) {
-			if (rtype(i.arg[0]) == RCon)
-				x = icmpop(x);
-			emit(OXSet+x, k, i.to, R, R);
-			selcmp(i.arg, kc, fn);
-			break;
-		}
-		diag("isel: non-exhaustive implementation");
-	}
-
-	while (i0 > curi && --i0)
-		if (rslot(i0->arg[0], fn) != -1
-		||  rslot(i0->arg[1], fn) != -1)
-			diag("isel: usupported address argument");
-}
-
-static Ins *
-flagi(Ins *i0, Ins *i)
-{
-	while (i>i0) {
-		i--;
-		if (opdesc[i->op].sflag)
-			return i;
-		if (opdesc[i->op].lflag)
-			continue;
-		return 0;
-	}
-	return 0;
-}
-
-struct AClass {
-	int inmem;
-	int align;
-	uint size;
-	int cls[2];
-};
-
-static void
-aclass(AClass *a, Typ *t)
-{
-	int e, s, n, cls;
-	uint sz, al;
-
-	sz = t->size;
-	al = 1u << t->align;
-
-	/* the ABI requires sizes to be rounded
-	 * up to the nearest multiple of 8, moreover
-	 * it makes it easy load and store structures
-	 * in registers
-	 */
-	if (al < 8)
-		al = 8;
-	sz = (sz + al-1) & -al;
-
-	a->size = sz;
-	a->align = t->align;
-
-	if (t->dark || sz > 16) {
-		/* large or unaligned structures are
-		 * required to be passed in memory
-		 */
-		a->inmem = 1;
-		return;
-	}
-
-	a->inmem = 0;
-	for (e=0, s=0; e<2; e++) {
-		cls = -1;
-		for (n=0; n<8 && t->seg[s].len; s++) {
-			if (t->seg[s].ispad) {
-				/* don't change anything */
-			}
-			else if (t->seg[s].isflt) {
-				if (cls == -1)
-					cls = Kd;
-			}
-			else
-				cls = Kl;
-			n += t->seg[s].len;
-		}
-		assert(n <= 8);
-		a->cls[e] = cls;
-	}
-}
-
-static void
-blit(Ref rstk, uint soff, Ref rsrc, uint sz, Fn *fn)
-{
-	Ref r, r1;
-	uint boff;
-
-	/* it's an impolite blit, we might go across the end
-	 * of the source object a little bit... */
-	for (boff=0; sz>0; sz-=8, soff+=8, boff+=8) {
-		r = newtmp("abi", Kl, fn);
-		r1 = newtmp("abi", Kl, fn);
-		emit(OStorel, 0, R, r, r1);
-		emit(OAdd, Kl, r1, rstk, getcon(soff, fn));
-		r1 = newtmp("abi", Kl, fn);
-		emit(OLoad, Kl, r, r1, R);
-		emit(OAdd, Kl, r1, rsrc, getcon(boff, fn));
-		chuse(rsrc, +1, fn);
-		chuse(rstk, +1, fn);
-	}
-}
-
-static int
-retr(Ref reg[2], AClass *aret)
-{
-	static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}};
-	int n, k, ca, nr[2];
-
-	nr[0] = nr[1] = 0;
-	ca = 0;
-	for (n=0; aret->cls[n]>=0 && n<2; n++) {
-		k = KBASE(aret->cls[n]);
-		reg[n] = TMP(retreg[k][nr[k]++]);
-		ca += 1 << (2 * k);
-	}
-	return ca;
-}
-
-static void
-selret(Blk *b, Fn *fn)
-{
-	int j, k, ca;
-	Ref r, r0, reg[2];
-	AClass aret;
-
-	j = b->jmp.type;
-
-	if (!isret(j) || j == JRet0)
-		return;
-
-	r0 = b->jmp.arg;
-	b->jmp.type = JRet0;
-
-	if (j == JRetc) {
-		aclass(&aret, &typ[fn->retty]);
-		if (aret.inmem) {
-			assert(rtype(fn->retr) == RTmp);
-			emit(OCopy, Kl, TMP(RAX), fn->retr, R);
-			chuse(fn->retr, +1, fn);
-			blit(fn->retr, 0, r0, aret.size, fn);
-			ca = 1;
-		} else {
-			ca = retr(reg, &aret);
-			if (aret.size > 8) {
-				r = newtmp("abi", Kl, fn);
-				emit(OLoad, Kl, reg[1], r, R);
-				emit(OAdd, Kl, r, r0, getcon(8, fn));
-				chuse(r0, +1, fn);
-			}
-			emit(OLoad, Kl, reg[0], r0, R);
-		}
-	} else {
-		k = j - JRetw;
-		if (KBASE(k) == 0) {
-			emit(OCopy, k, TMP(RAX), r0, R);
-			ca = 1;
-		} else {
-			emit(OCopy, k, TMP(XMM0), r0, R);
-			ca = 1 << 2;
-		}
-	}
-
-	b->jmp.arg = CALL(ca);
-}
-
-static void
-seljmp(Blk *b, Fn *fn)
-{
-	Ref r;
-	int c, k;
-	Ins *fi;
-
-	if (b->jmp.type == JRet0 || b->jmp.type == JJmp)
-		return;
-	assert(b->jmp.type == JJnz);
-	r = b->jmp.arg;
-	b->jmp.arg = R;
-	assert(!req(r, R));
-	if (rtype(r) == RCon) {
-		b->jmp.type = JJmp;
-		if (req(r, CON_Z))
-			b->s1 = b->s2;
-		b->s2 = 0;
-		return;
-	}
-	fi = flagi(b->ins, &b->ins[b->nins]);
-	if (fi && req(fi->to, r)) {
-		if (iscmp(fi->op, &k, &c)) {
-			if (rtype(fi->arg[0]) == RCon)
-				c = icmpop(c);
-			b->jmp.type = JXJc + c;
-			if (fn->tmp[r.val].nuse == 1) {
-				assert(fn->tmp[r.val].ndef == 1);
-				selcmp(fi->arg, k, fn);
-				*fi = (Ins){.op = ONop};
-			}
-			return;
-		}
-		if (fi->op == OAnd && fn->tmp[r.val].nuse == 1
-		&& (rtype(fi->arg[0]) == RTmp ||
-		    rtype(fi->arg[1]) == RTmp)) {
-			fi->op = OXTest;
-			fi->to = R;
-			b->jmp.type = JXJc + ICne;
-			if (rtype(fi->arg[1]) == RCon) {
-				r = fi->arg[1];
-				fi->arg[1] = fi->arg[0];
-				fi->arg[0] = r;
-			}
-			return;
-		}
-		/* since flags are not tracked in liveness,
-		 * the result of the flag-setting instruction
-		 * has to be marked as live
-		 */
-		if (fn->tmp[r.val].nuse == 1)
-			emit(OCopy, Kw, R, r, R);
-		b->jmp.type = JXJc + ICne;
-		return;
-	}
-	selcmp((Ref[2]){r, CON_Z}, Kw, fn); /* todo, add long branch if non-zero */
-	b->jmp.type = JXJc + ICne;
-}
-
-static int
-classify(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret)
-{
-	int nint, ni, nsse, ns, n, *pn;
-	AClass *a;
-	Ins *i;
-
-	if (aret && aret->inmem)
-		nint = 5; /* hidden argument */
-	else
-		nint = 6;
-	nsse = 8;
-	for (i=i0, a=ac; i<i1; i++, a++) {
-		if (i->op == op) {
-			if (KBASE(i->cls) == 0)
-				pn = &nint;
-			else
-				pn = &nsse;
-			if (*pn > 0) {
-				--*pn;
-				a->inmem = 0;
-			} else
-				a->inmem = 2;
-			a->align = 3;
-			a->size = 8;
-			a->cls[0] = i->cls;
-		} else {
-			n = i->arg[0].val & AMask;
-			aclass(a, &typ[n]);
-			if (a->inmem)
-				continue;
-			ni = ns = 0;
-			for (n=0; n<2; n++)
-				if (KBASE(a->cls[n]) == 0)
-					ni++;
-				else
-					ns++;
-			if (nint >= ni && nsse >= ns) {
-				nint -= ni;
-				nsse -= ns;
-			} else
-				a->inmem = 1;
-		}
-	}
-
-	return ((6-nint) << 4) | ((8-nsse) << 8);
-}
-
-int rsave[] = {
-	RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX,
-	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
-	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14
-};
-int rclob[] = {RBX, R12, R13, R14, R15};
-
-MAKESURE(rsave_has_correct_size, sizeof rsave == NRSave * sizeof(int));
-MAKESURE(rclob_has_correct_size, sizeof rclob == NRClob * sizeof(int));
-
-bits
-retregs(Ref r, int p[2])
-{
-	bits b;
-	int ni, nf;
-
-	assert(rtype(r) == RACall);
-	b = 0;
-	ni = r.val & 3;
-	nf = (r.val >> 2) & 3;
-	if (ni >= 1)
-		b |= BIT(RAX);
-	if (ni >= 2)
-		b |= BIT(RDX);
-	if (nf >= 1)
-		b |= BIT(XMM0);
-	if (nf >= 2)
-		b |= BIT(XMM1);
-	if (p) {
-		p[0] = ni;
-		p[1] = nf;
-	}
-	return b;
-}
-
-bits
-argregs(Ref r, int p[2])
-{
-	bits b;
-	int j, ni, nf;
-
-	assert(rtype(r) == RACall);
-	b = 0;
-	ni = (r.val >> 4) & 15;
-	nf = (r.val >> 8) & 15;
-	for (j=0; j<ni; j++)
-		b |= BIT(rsave[j]);
-	for (j=0; j<nf; j++)
-		b |= BIT(XMM0+j);
-	if (p) {
-		p[0] = ni + 1;
-		p[1] = nf;
-	}
-	return b | BIT(RAX);
-}
-
-static Ref
-rarg(int ty, int *ni, int *ns)
-{
-	if (KBASE(ty) == 0)
-		return TMP(rsave[(*ni)++]);
-	else
-		return TMP(XMM0 + (*ns)++);
-}
-
-struct RAlloc {
-	Ins i;
-	RAlloc *link;
-};
-
-static void
-selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
-{
-	Ins *i;
-	AClass *ac, *a, aret;
-	int ca, ni, ns;
-	uint stk, off;
-	Ref r, r1, r2, reg[2], regcp[2];
-	RAlloc *ra;
-
-	ac = alloc((i1-i0) * sizeof ac[0]);
-	if (!req(i1->arg[1], R)) {
-		assert(rtype(i1->arg[1]) == RAType);
-		aclass(&aret, &typ[i1->arg[1].val & AMask]);
-		ca = classify(i0, i1, ac, OArg, &aret);
-	} else
-		ca = classify(i0, i1, ac, OArg, 0);
-
-	for (stk=0, a=&ac[i1-i0]; a>ac;)
-		if ((--a)->inmem) {
-			assert(a->align <= 4);
-			stk += a->size;
-			if (a->align == 4)
-				stk += stk & 15;
-		}
-	stk += stk & 15;
-	if (stk) {
-		r = getcon(-(int64_t)stk, fn);
-		emit(OSAlloc, Kl, R, r, R);
-	}
-
-	if (!req(i1->arg[1], R)) {
-		if (aret.inmem) {
-			/* get the return location from eax
-			 * it saves one callee-save reg */
-			r1 = newtmp("abi", Kl, fn);
-			emit(OCopy, Kl, i1->to, TMP(RAX), R);
-			ca += 1;
-		} else {
-			if (aret.size > 8) {
-				r = newtmp("abi", Kl, fn);
-				regcp[1] = newtmp("abi", aret.cls[1], fn);
-				emit(OStorel, 0, R, regcp[1], r);
-				emit(OAdd, Kl, r, i1->to, getcon(8, fn));
-				chuse(i1->to, +1, fn);
-				ca += 1 << (2 * KBASE(aret.cls[1]));
-			}
-			regcp[0] = newtmp("abi", aret.cls[0], fn);
-			emit(OStorel, 0, R, regcp[0], i1->to);
-			ca += 1 << (2 * KBASE(aret.cls[0]));
-			retr(reg, &aret);
-			if (aret.size > 8)
-				emit(OCopy, aret.cls[1], regcp[1], reg[1], R);
-			emit(OCopy, aret.cls[0], regcp[0], reg[0], R);
-			r1 = i1->to;
-		}
-		/* allocate return pad */
-		ra = alloc(sizeof *ra);
-		assert(NAlign == 3);
-		aret.align -= 2;
-		if (aret.align < 0)
-			aret.align = 0;
-		ra->i.op = OAlloc + aret.align;
-		ra->i.cls = Kl;
-		ra->i.to = r1;
-		ra->i.arg[0] = getcon(aret.size, fn);
-		ra->link = (*rap);
-		*rap = ra;
-	} else {
-		ra = 0;
-		if (KBASE(i1->cls) == 0) {
-			emit(OCopy, i1->cls, i1->to, TMP(RAX), R);
-			ca += 1;
-		} else {
-			emit(OCopy, i1->cls, i1->to, TMP(XMM0), R);
-			ca += 1 << 2;
-		}
-	}
-	emit(OCall, i1->cls, R, i1->arg[0], CALL(ca));
-	emit(OCopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);
-
-	ni = ns = 0;
-	if (ra && aret.inmem)
-		emit(OCopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */
-	for (i=i0, a=ac; i<i1; i++, a++) {
-		if (a->inmem)
-			continue;
-		r1 = rarg(a->cls[0], &ni, &ns);
-		if (i->op == OArgc) {
-			if (a->size > 8) {
-				r2 = rarg(a->cls[1], &ni, &ns);
-				r = newtmp("abi", Kl, fn);
-				emit(OLoad, a->cls[1], r2, r, R);
-				emit(OAdd, Kl, r, i->arg[1], getcon(8, fn));
-				chuse(i->arg[1], +1, fn);
-			}
-			emit(OLoad, a->cls[0], r1, i->arg[1], R);
-		} else
-			emit(OCopy, i->cls, r1, i->arg[0], R);
-	}
-
-	if (!stk)
-		return;
-
-	r = newtmp("abi", Kl, fn);
-	chuse(r, -1, fn);
-	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
-		if (!a->inmem)
-			continue;
-		if (i->op == OArgc) {
-			if (a->align == 4)
-				off += off & 15;
-			blit(r, off, i->arg[1], a->size, fn);
-		} else {
-			r1 = newtmp("abi", Kl, fn);
-			emit(OStorel, 0, R, i->arg[0], r1);
-			emit(OAdd, Kl, r1, r, getcon(off, fn));
-			chuse(r, +1, fn);
-		}
-		off += a->size;
-	}
-	emit(OSAlloc, Kl, r, getcon(stk, fn), R);
-}
-
-static void
-selpar(Fn *fn, Ins *i0, Ins *i1)
-{
-	AClass *ac, *a, aret;
-	Ins *i;
-	int ni, ns, s, al;
-	Ref r, r1;
-
-	ac = alloc((i1-i0) * sizeof ac[0]);
-	curi = insb;
-	ni = ns = 0;
-
-	if (fn->retty >= 0) {
-		aclass(&aret, &typ[fn->retty]);
-		if (aret.inmem) {
-			r = newtmp("abi", Kl, fn);
-			*curi++ = (Ins){OCopy, r, {rarg(Kl, &ni, &ns)}, Kl};
-			fn->retr = r;
-		}
-		classify(i0, i1, ac, OPar, &aret);
-	} else
-		classify(i0, i1, ac, OPar, 0);
-
-	assert(NAlign == 3);
-
-	s = 4;
-	for (i=i0, a=ac; i<i1; i++, a++) {
-		switch (a->inmem) {
-		case 1:
-			assert(a->align <= 4);
-			if (a->align == 4)
-				s = (s+3) & -4;
-			fn->tmp[i->to.val].slot = -s; /* HACK! */
-			s += a->size / 4;
-			continue;
-		case 2:
-			*curi++ = (Ins){OLoad, i->to, {SLOT(-s)}, i->cls};
-			s += 2;
-			continue;
-		}
-		r1 = rarg(a->cls[0], &ni, &ns);
-		if (i->op == OParc) {
-			r = newtmp("abi", Kl, fn);
-			*curi++ = (Ins){OCopy, r, {r1}, Kl};
-			a->cls[0] = r.val;
-			if (a->size > 8) {
-				r1 = rarg(a->cls[1], &ni, &ns);
-				r = newtmp("abi", Kl, fn);
-				*curi++ = (Ins){OCopy, r, {r1}, Kl};
-				a->cls[1] = r.val;
-			}
-		} else
-			*curi++ = (Ins){OCopy, i->to, {r1}, i->cls};
-	}
-	for (i=i0, a=ac; i<i1; i++, a++) {
-		if (i->op != OParc || a->inmem)
-			continue;
-		assert(NAlign == 3);
-		for (al=0; a->align >> (al+2); al++)
-			;
-		r = TMP(a->cls[0]);
-		r1 = i->to;
-		*curi++ = (Ins){OAlloc+al, r1, {getcon(a->size, fn)}, Kl};
-		*curi++ = (Ins){OStorel, R, {r, r1}, 0};
-		if (a->size > 8) {
-			r = newtmp("abi", Kl, fn);
-			*curi++ = (Ins){OAdd, r, {r1, getcon(8, fn)}, Kl};
-			r1 = TMP(a->cls[1]);
-			*curi++ = (Ins){OStorel, R, {r1, r}, 0};
-		}
-	}
-}
-
-static int
-aref(Ref r, ANum *ai)
-{
-	switch (rtype(r)) {
-	default:
-		diag("isel: aref defaulted");
-	case RCon:
-		return 2;
-	case RTmp:
-		return ai[r.val].n;
-	}
-}
-
-static int
-ascale(Ref r, Con *con)
-{
-	int64_t n;
-
-	if (rtype(r) != RCon)
-		return 0;
-	if (con[r.val].type != CBits)
-		return 0;
-	n = con[r.val].bits.i;
-	return n == 1 || n == 2 || n == 4 || n == 8;
-}
-
-static void
-anumber(ANum *ai, Blk *b, Con *con)
-{
-	/* This should be made obsolete by a proper
-	 * reassoc pass.
-	 *
-	 * Rules:
-	 *
-	 *   RTmp(_) -> 0    tmp
-	 *   ( RTmp(_) -> 1    slot )
-	 *   RCon(_) -> 2    con
-	 *   0 * 2   -> 3    s * i (when constant is 1,2,4,8)
-	 */
-	static char add[10][10] = {
-		[2] [2] = 2,              /* folding */
-		[2] [5] = 5, [5] [2] = 5,
-		[2] [6] = 6, [6] [2] = 6,
-		[2] [7] = 7, [7] [2] = 7,
-		[0] [0] = 4,              /* 4: b + s * i */
-		[0] [3] = 4, [3] [0] = 4,
-		[2] [3] = 5, [3] [2] = 5, /* 5: o + s * i */
-		[0] [2] = 6, [2] [0] = 6, /* 6: o + b */
-		[2] [4] = 7, [4] [2] = 7, /* 7: o + b + s * i */
-		[0] [5] = 7, [5] [0] = 7,
-		[6] [3] = 7, [3] [6] = 7,
-
-	};
-	int a, a1, a2, n1, n2, t1, t2;
-	Ins *i;
-
-	for (i=b->ins; i-b->ins < b->nins; i++) {
-		if (rtype(i->to) == RTmp)
-			ai[i->to.val].i = i;
-		if (i->op != OAdd && i->op != OMul)
-			continue;
-		a1 = aref(i->arg[0], ai);
-		a2 = aref(i->arg[1], ai);
-		t1 = a1 != 1 && a1 != 2;
-		t2 = a2 != 1 && a2 != 2;
-		if (i->op == OAdd) {
-			a = add[n1 = a1][n2 = a2];
-			if (t1 && a < add[0][a2])
-				a = add[n1 = 0][n2 = a2];
-			if (t2 && a < add[a1][0])
-				a = add[n1 = a1][n2 = 0];
-			if (t1 && t2 && a < add[0][0])
-				a = add[n1 = 0][n2 = 0];
-		} else {
-			n1 = n2 = a = 0;
-			if (ascale(i->arg[0], con) && t2)
-				a = 3, n1 = 2, n2 = 0;
-			if (t1 && ascale(i->arg[1], con))
-				a = 3, n1 = 0, n2 = 2;
-		}
-		ai[i->to.val].n = a;
-		ai[i->to.val].l = n1;
-		ai[i->to.val].r = n2;
-	}
-}
-
-static void
-amatch(Addr *a, Ref r, ANum *ai, Fn *fn, int top)
-{
-	Ins *i;
-	int nl, nr, t, s;
-	Ref al, ar;
-
-	if (top)
-		memset(a, 0, sizeof *a);
-	if (rtype(r) == RCon) {
-		addcon(&a->offset, &fn->con[r.val]);
-		return;
-	}
-	assert(rtype(r) == RTmp);
-	i = ai[r.val].i;
-	nl = ai[r.val].l;
-	nr = ai[r.val].r;
-	if (i) {
-		if (nl > nr) {
-			al = i->arg[1];
-			ar = i->arg[0];
-			t = nl, nl = nr, nr = t;
-		} else {
-			al = i->arg[0];
-			ar = i->arg[1];
-		}
-	}
-	switch (ai[r.val].n) {
-	default:
-		diag("isel: amatch defaulted");
-	case 3: /* s * i */
-		if (!top) {
-			a->index = al;
-			a->scale = fn->con[ar.val].bits.i;
-		} else
-			a->base = r;
-		break;
-	case 4: /* b + s * i */
-		switch (nr) {
-		case 0:
-			if (fn->tmp[ar.val].slot != -1) {
-				al = i->arg[1];
-				ar = i->arg[0];
-			}
-			a->index = ar;
-			a->scale = 1;
-			break;
-		case 3:
-			amatch(a, ar, ai, fn, 0);
-			break;
-		}
-		r = al;
-	case 0:
-		s = fn->tmp[r.val].slot;
-		if (s != -1)
-			r = SLOT(s);
-		a->base = r;
-		break;
-	case 2: /* constants */
-	case 5: /* o + s * i */
-	case 6: /* o + b */
-	case 7: /* o + b + s * i */
-		amatch(a, ar, ai, fn, 0);
-		amatch(a, al, ai, fn, 0);
-		break;
-	}
-}
-
-/* instruction selection
- * requires use counts (as given by parsing)
- */
-void
-isel(Fn *fn)
-{
-	Blk *b, **sb;
-	Ins *i, *i0, *ip;
-	Phi *p;
-	uint a;
-	int n, al;
-	int64_t sz;
-	ANum *ainfo;
-	RAlloc *ral;
-
-	for (n=0; n<fn->ntmp; n++)
-		fn->tmp[n].slot = -1;
-	fn->slot = 0;
-
-	/* lower arguments */
-	for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
-		if (i->op != OPar && i->op != OParc)
-			break;
-	selpar(fn, b->ins, i);
-	n = b->nins - (i - b->ins) + (curi - insb);
-	i0 = alloc(n * sizeof(Ins));
-	ip = icpy(ip = i0, insb, curi - insb);
-	ip = icpy(ip, i, &b->ins[b->nins] - i);
-	b->nins = n;
-	b->ins = i0;
-
-	/* lower function calls and returns */
-	ral = 0;
-	b = fn->start;
-	do {
-		if (!(b = b->link))
-			b = fn->start; /* do it last */
-		curi = &insb[NIns];
-		selret(b, fn);
-		for (i=&b->ins[b->nins]; i!=b->ins;) {
-			if ((--i)->op == OCall) {
-				for (i0=i; i0>b->ins; i0--)
-					if ((i0-1)->op != OArg)
-					if ((i0-1)->op != OArgc)
-						break;
-				selcall(fn, i0, i, &ral);
-				i = i0;
-				continue;
-			}
-			assert(i->op != OArg && i->op != OArgc);
-			emiti(*i);
-		}
-		if (b == fn->start)
-			for (; ral; ral=ral->link)
-				emiti(ral->i);
-		b->nins = &insb[NIns] - curi;
-		idup(&b->ins, curi, b->nins);
-	} while (b != fn->start);
-
-	if (debug['A']) {
-		fprintf(stderr, "\n> After call lowering:\n");
-		printfn(fn, stderr);
-	}
-
-	/* assign slots to fast allocs */
-	b = fn->start;
-	assert(NAlign == 3 && "change n=4 and sz /= 4 below");
-	for (al=OAlloc, n=4; al<=OAlloc1; al++, n*=2)
-		for (i=b->ins; i-b->ins < b->nins; i++)
-			if (i->op == al) {
-				if (rtype(i->arg[0]) != RCon)
-					break;
-				sz = fn->con[i->arg[0].val].bits.i;
-				if (sz < 0 || sz >= INT_MAX-3)
-					diag("isel: invalid alloc size");
-				sz = (sz + n-1) & -n;
-				sz /= 4;
-				fn->tmp[i->to.val].slot = fn->slot;
-				fn->slot += sz;
-				*i = (Ins){.op = ONop};
-			}
-
-	/* process basic blocks */
-	n = fn->ntmp;
-	ainfo = emalloc(n * sizeof ainfo[0]);
-	for (b=fn->start; b; b=b->link) {
-		curi = &insb[NIns];
-		for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
-			for (p=(*sb)->phi; p; p=p->link) {
-				for (a=0; p->blk[a] != b; a++)
-					assert(a+1 < p->narg);
-				fixarg(&p->arg[a], p->cls, 1, fn);
-			}
-		memset(ainfo, 0, n * sizeof ainfo[0]);
-		anumber(ainfo, b, fn->con);
-		seljmp(b, fn);
-		for (i=&b->ins[b->nins]; i!=b->ins;)
-			sel(*--i, ainfo, fn);
-		b->nins = &insb[NIns] - curi;
-		idup(&b->ins, curi, b->nins);
-	}
-	free(ainfo);
-
-	if (debug['I']) {
-		fprintf(stderr, "\n> After instruction selection:\n");
-		printfn(fn, stderr);
-	}
-}
diff --git a/lisc/live.c b/lisc/live.c
@@ -1,174 +0,0 @@
-#include "lisc.h"
-
-void
-liveon(BSet *v, Blk *b, Blk *s)
-{
-	Phi *p;
-	uint a;
-
-	bscopy(v, s->in);
-	for (p=s->phi; p; p=p->link) {
-		bsclr(v, p->to.val);
-		for (a=0; a<p->narg; a++)
-			if (p->blk[a] == b)
-			if (rtype(p->arg[a]) == RTmp)
-				bsset(v, p->arg[a].val);
-	}
-}
-
-static int
-phitmp(int t, Tmp *tmp)
-{
-	int tp;
-
-	tp = tmp[t].phi;
-	return tp ? tp : t;
-}
-
-static void
-phifix(int t1, short *phi, Tmp *tmp)
-{
-	int t, t2;
-
-	/* detect temporaries arguments
-	 * of the same phi node that
-	 * interfere and separate them
-	 */
-	t = phitmp(t1, tmp);
-	t2 = phi[t];
-	if (t2 && t2 != t1) {
-		if (t != t1) {
-			tmp[t1].phi = t1;
-			t = t1;
-		} else {
-			tmp[t2].phi = t2;
-			phi[t2] = t2;
-		}
-	}
-	phi[t] = t1;
-}
-
-static void
-bset(Ref r, Blk *b, int *nlv, short *phi, Tmp *tmp)
-{
-
-	if (rtype(r) != RTmp)
-		return;
-	bsset(b->gen, r.val);
-	phifix(r.val, phi, tmp);
-	if (!bshas(b->in, r.val)) {
-		nlv[KBASE(tmp[r.val].cls)]++;
-		bsset(b->in, r.val);
-	}
-}
-
-/* liveness analysis
- * requires rpo computation
- */
-void
-filllive(Fn *f)
-{
-	Blk *b;
-	Ins *i;
-	int k, t, m[2], n, chg, nlv[2];
-	short *phi;
-	BSet u[1], v[1];
-	Mem *ma;
-
-	bsinit(u, f->ntmp);
-	bsinit(v, f->ntmp);
-	phi = emalloc(f->ntmp * sizeof phi[0]);
-	for (b=f->start; b; b=b->link) {
-		bsinit(b->in, f->ntmp);
-		bsinit(b->out, f->ntmp);
-		bsinit(b->gen, f->ntmp);
-	}
-	chg = 1;
-Again:
-	for (n=f->nblk-1; n>=0; n--) {
-		b = f->rpo[n];
-
-		bscopy(u, b->out);
-		if (b->s1) {
-			liveon(v, b, b->s1);
-			bsunion(b->out, v);
-		}
-		if (b->s2) {
-			liveon(v, b, b->s2);
-			bsunion(b->out, v);
-		}
-		chg |= !bsequal(b->out, u);
-
-		memset(phi, 0, f->ntmp * sizeof phi[0]);
-		memset(nlv, 0, sizeof nlv);
-		bscopy(b->in, b->out);
-		for (t=0; t<f->ntmp; t++)
-			if (bshas(b->in, t)) {
-				phifix(t, phi, f->tmp);
-				nlv[KBASE(f->tmp[t].cls)]++;
-			}
-		if (rtype(b->jmp.arg) == RACall) {
-			assert(bscount(b->in) == 0 && nlv[0] == 0 && nlv[1] == 0);
-			b->in->t[0] |= retregs(b->jmp.arg, nlv);
-		} else
-			bset(b->jmp.arg, b, nlv, phi, f->tmp);
-		for (k=0; k<2; k++)
-			b->nlive[k] = nlv[k];
-		for (i=&b->ins[b->nins]; i!=b->ins;) {
-			if ((--i)->op == OCall && rtype(i->arg[1]) == RACall) {
-				b->in->t[0] &= ~retregs(i->arg[1], m);
-				for (k=0; k<2; k++)
-					nlv[k] -= m[k];
-				if (nlv[0] + NISave > b->nlive[0])
-					b->nlive[0] = nlv[0] + NISave;
-				if (nlv[1] + NFSave > b->nlive[1])
-					b->nlive[1] = nlv[1] + NFSave;
-				b->in->t[0] |= argregs(i->arg[1], m);
-				for (k=0; k<2; k++)
-					nlv[k] += m[k];
-			}
-			if (!req(i->to, R)) {
-				assert(rtype(i->to) == RTmp);
-				t = i->to.val;
-				if (bshas(b->in, i->to.val))
-					nlv[KBASE(f->tmp[t].cls)]--;
-				bsset(b->gen, t);
-				bsclr(b->in, t);
-				phi[phitmp(t, f->tmp)] = 0;
-			}
-			for (k=0; k<2; k++)
-				switch (rtype(i->arg[k])) {
-				case RAMem:
-					ma = &f->mem[i->arg[k].val & AMask];
-					bset(ma->base, b, nlv, phi, f->tmp);
-					bset(ma->index, b, nlv, phi, f->tmp);
-					break;
-				default:
-					bset(i->arg[k], b, nlv, phi, f->tmp);
-					break;
-				}
-			for (k=0; k<2; k++)
-				if (nlv[k] > b->nlive[k])
-					b->nlive[k] = nlv[k];
-		}
-	}
-	if (chg) {
-		chg = 0;
-		goto Again;
-	}
-	free(phi);
-
-	if (debug['L']) {
-		fprintf(stderr, "\n> Liveness analysis:\n");
-		for (b=f->start; b; b=b->link) {
-			fprintf(stderr, "\t%-10sin:   ", b->name);
-			dumpts(b->in, f->tmp, stderr);
-			fprintf(stderr, "\t          out:  ");
-			dumpts(b->out, f->tmp, stderr);
-			fprintf(stderr, "\t          gen:  ");
-			dumpts(b->gen, f->tmp, stderr);
-			fprintf(stderr, "\t          live: ");
-			fprintf(stderr, "%d %d\n", b->nlive[0], b->nlive[1]);
-		}
-	}
-}
diff --git a/lisc/main.c b/lisc/main.c
@@ -1,117 +0,0 @@
-#include "lisc.h"
-#include <ctype.h>
-#include <getopt.h>
-
-char debug['Z'+1] = {
-	['P'] = 0, /* parsing */
-	['A'] = 0, /* abi lowering */
-	['I'] = 0, /* instruction selection */
-	['L'] = 0, /* liveness */
-	['M'] = 0, /* memory optimization */
-	['N'] = 0, /* ssa construction */
-	['C'] = 0, /* copy elimination */
-	['S'] = 0, /* spilling */
-	['R'] = 0, /* reg. allocation */
-};
-
-static FILE *outf;
-static int dbg;
-
-static void
-data(Dat *d)
-{
-	if (dbg)
-		return;
-	if (d->type == DEnd) {
-		fputs("/* end data */\n\n", outf);
-		freeall();
-	}
-	emitdat(d, outf);
-}
-
-static void
-func(Fn *fn)
-{
-	int n;
-
-	if (dbg)
-		fprintf(stderr, "**** Function %s ****", fn->name);
-	if (debug['P']) {
-		fprintf(stderr, "\n> After parsing:\n");
-		printfn(fn, stderr);
-	}
-	fillrpo(fn);
-	fillpreds(fn);
-	filluse(fn);
-	memopt(fn);
-	ssa(fn);
-	filluse(fn);
-	copy(fn);
-	filluse(fn);
-	isel(fn);
-	filllive(fn);
-	fillcost(fn);
-	spill(fn);
-	rega(fn);
-	fillrpo(fn);
-	assert(fn->rpo[0] == fn->start);
-	for (n=0;; n++)
-		if (n == fn->nblk-1) {
-			fn->rpo[n]->link = 0;
-			break;
-		} else
-			fn->rpo[n]->link = fn->rpo[n+1];
-	if (!dbg) {
-		emitfn(fn, outf);
-		fprintf(outf, "/* end function %s */\n\n", fn->name);
-	} else
-		fprintf(stderr, "\n");
-	freeall();
-}
-
-int
-main(int ac, char *av[])
-{
-	FILE *inf;
-	char *f;
-	int c;
-
-	outf = stdout;
-	while ((c = getopt(ac, av, "d:o:")) != -1)
-		switch (c) {
-		case 'd':
-			for (; *optarg; optarg++)
-				if (isalpha(*optarg)) {
-					debug[toupper(*optarg)] = 1;
-					dbg = 1;
-				}
-			break;
-		case 'o':
-			if (strcmp(optarg, "-") != 0)
-				outf = fopen(optarg, "w");
-			break;
-		default:
-			fprintf(stderr, "usage: %s [-d <flags>] [-o out] {file.ssa, -}\n", av[0]);
-			exit(1);
-		}
-
-	do {
-		f = av[optind];
-		if (!f || strcmp(f, "-") == 0) {
-			inf = stdin;
-			f = "-";
-		} else {
-			inf = fopen(f, "r");
-			if (!inf) {
-				fprintf(stderr, "cannot open '%s'\n", f);
-				exit(1);
-			}
-		}
-		parse(inf, f, data, func);
-	} while (++optind < ac);
-
-	if (!dbg)
-		emitfin(outf);
-
-	exit(0);
-}
diff --git a/lisc/mem.c b/lisc/mem.c
@@ -1,81 +0,0 @@
-#include "lisc.h"
-
-/* Memory optimization:
- *
- * - replace alloced slots used only in
- *   load/store operations
- *   Assumption: all the accesses have the
- *   same size (this could be wrong...)
- */
-
-/* require use, maintains use counts */
-void
-memopt(Fn *fn)
-{
-	Blk *b;
-	Ins *i, *l;
-	Tmp *t;
-	Use *u, *ue;
-	int a;
-
-	b = fn->start;
-	for (i=b->ins; i-b->ins < b->nins; i++) {
-		if (OAlloc > i->op || i->op > OAlloc1)
-			continue;
-		assert(NAlign == 3);
-		assert(rtype(i->to) == RTmp);
-		t = &fn->tmp[i->to.val];
-		for (u=t->use; u != &t->use[t->nuse]; u++) {
-			if (u->type != UIns)
-				goto NextIns;
-			l = u->u.ins;
-			if (!isload(l->op)
-			&& (!isstore(l->op) || req(i->to, l->arg[0])))
-				goto NextIns;
-		}
-		/* get rid of the alloc and replace uses */
-		*i = (Ins){.op = ONop};
-		t->ndef--;
-		ue = &t->use[t->nuse];
-		for (u=t->use; u!=ue; u++) {
-			l = u->u.ins;
-			if (isstore(l->op)) {
-				if (l->op == OStores)
-					l->cls = Kd;
-				else if (l->op == OStored)
-					l->cls = Kd;
-				else if (l->op == OStorel)
-					l->cls = Kl;
-				else
-					l->cls = Kw;
-				l->op = OCopy;
-				l->to = l->arg[1];
-				l->arg[1] = R;
-				t->nuse--;
-				t->ndef++;
-			} else
-				/* try to turn loads into copies so we
-				 * can eliminate them later */
-				switch(l->op) {
-				case OLoad:
-					l->op = OCopy;
-					break;
-				case OLoadsw:
-				case OLoaduw:
-					l->cls = Kw;
-					l->op = OCopy;
-					break;
-				default:
-					/* keep l->cls */
-					a = l->op - OLoadsw;
-					l->op = OExtsw + a;
-					break;
-				}
-		}
-	NextIns:;
-	}
-	if (debug['M']) {
-		fprintf(stderr, "\n> After memory optimization:\n");
-		printfn(fn, stderr);
-	}
-}
diff --git a/lisc/parse.c b/lisc/parse.c
@@ -1,1081 +0,0 @@
-#include "lisc.h"
-#include <ctype.h>
-#include <stdarg.h>
-
-enum {
-	Kx = -1, /* Invalid operand */
-	Km = Kl, /* Memory pointer (for x64) */
-};
-
-OpDesc opdesc[NOp] = {
-#define A(a,b,c,d) {[Kw]=K##a, [Kl]=K##b, [Ks]=K##c, [Kd]=K##d}
-
-	/*            NAME       NM      ARGCLS0     ARGCLS1  SF LF */
-	[OAdd]    = { "add",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
-	[OSub]    = { "sub",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
-	[ODiv]    = { "div",      2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
-	[ORem]    = { "rem",      2, {A(w,l,x,x), A(w,l,x,x)}, 0, 0 },
-	[OUDiv]   = { "udiv",     2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
-	[OURem]   = { "urem",     2, {A(w,l,x,x), A(w,l,x,x)}, 0, 0 },
-	[OMul]    = { "mul",      2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
-	[OAnd]    = { "and",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
-	[OOr]     = { "or",       2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
-	[OXor]    = { "xor",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
-	[OSar]    = { "sar",      1, {A(w,l,x,x), A(w,w,x,x)}, 1, 0 },
-	[OShr]    = { "shr",      1, {A(w,l,x,x), A(w,w,x,x)}, 1, 0 },
-	[OShl]    = { "shl",      1, {A(w,l,x,x), A(w,w,x,x)}, 1, 0 },
-	[OStored] = { "stored",   0, {A(d,d,d,d), A(m,m,m,m)}, 0, 1 },
-	[OStores] = { "stores",   0, {A(s,s,s,s), A(m,m,m,m)}, 0, 1 },
-	[OStorel] = { "storel",   0, {A(l,l,l,l), A(m,m,m,m)}, 0, 1 },
-	[OStorew] = { "storew",   0, {A(w,w,w,w), A(m,m,m,m)}, 0, 1 },
-	[OStoreh] = { "storeh",   0, {A(w,w,w,w), A(m,m,m,m)}, 0, 1 },
-	[OStoreb] = { "storeb",   0, {A(w,w,w,w), A(m,m,m,m)}, 0, 1 },
-	[OLoad]   = { "load",     0, {A(m,m,m,m), A(x,x,x,x)}, 0, 1 },
-	[OLoadsw] = { "loadsw",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
-	[OLoaduw] = { "loaduw",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
-	[OLoadsh] = { "loadsh",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
-	[OLoaduh] = { "loaduh",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
-	[OLoadsb] = { "loadsb",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
-	[OLoadub] = { "loadub",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
-	[OExtsw]  = { "extsw",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
-	[OExtuw]  = { "extuw",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
-	[OExtsh]  = { "extsh",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
-	[OExtuh]  = { "extuh",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
-	[OExtsb]  = { "extsb",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
-	[OExtub]  = { "extub",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
-	[OExts]   = { "exts",     0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
-	[OTruncd] = { "truncd",   0, {A(d,d,d,d), A(x,x,x,x)}, 0, 1 },
-	[OFtosi]  = { "ftosi",    0, {A(s,d,x,x), A(x,x,x,x)}, 0, 1 },
-	[OSitof]  = { "sitof",    0, {A(x,x,w,l), A(x,x,x,x)}, 0, 1 },
-	[OCast]   = { "cast",     0, {A(s,d,w,l), A(x,x,x,x)}, 0, 1 },
-	[OCopy]   = { "copy",     1, {A(w,l,s,d), A(x,x,x,x)}, 0, 1 },
-	[ONop]    = { "nop",      0, {A(x,x,x,x), A(x,x,x,x)}, 0, 1 },
-	[OSwap]   = { "swap",     2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
-	[OSign]   = { "sign",     0, {A(w,l,x,x), A(x,x,x,x)}, 0, 0 },
-	[OSAlloc] = { "salloc",   0, {A(x,l,x,x), A(x,x,x,x)}, 0, 0 },
-	[OXDiv]   = { "xdiv",     1, {A(w,l,x,x), A(x,x,x,x)}, 0, 0 },
-	[OXCmp]   = { "xcmp",     1, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
-	[OXTest]  = { "xtest",    1, {A(w,l,x,x), A(w,l,x,x)}, 1, 0 },
-	[OAddr]   = { "addr",     0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
-	[OPar]    = { "parn",     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
-	[OParc]   = { "parc",     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
-	[OArg]    = { "arg",      0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
-	[OArgc]   = { "argc",     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
-	[OCall]   = { "call",     0, {A(m,m,m,m), A(x,x,x,x)}, 0, 0 },
-	[OXSetnp] = { "xsetnp",   0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
-	[OXSetp]  = { "xsetp",    0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
-	[OAlloc]   = { "alloc4",  1, {A(l,l,l,l), A(x,x,x,x)}, 0, 0 },
-	[OAlloc+1] = { "alloc8",  1, {A(l,l,l,l), A(x,x,x,x)}, 0, 0 },
-	[OAlloc+2] = { "alloc16", 1, {A(l,l,l,l), A(x,x,x,x)}, 0, 0 },
-#define X(c) \
-	[OCmpw+IC##c] = { "c"    #c "w", 0, {A(w,w,x,x), A(w,w,x,x)}, 1, 0 }, \
-	[OCmpl+IC##c] = { "c"    #c "l", 0, {A(l,l,x,x), A(l,l,x,x)}, 1, 0 }, \
-	[OXSet+IC##c] = { "xset" #c,     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 1 },
-	ICMPS(X)
-#undef X
-#define X(c) \
-	[OCmps+FC##c] = { "c"    #c "s", 0, {A(s,s,x,x), A(s,s,x,x)}, 1, 0 }, \
-	[OCmpd+FC##c] = { "c"    #c "d", 0, {A(d,d,x,x), A(d,d,x,x)}, 1, 0 },
-	FCMPS(X)
-#undef X
-
-};
-#undef A
-
-typedef enum {
-	PXXX,
-	PLbl,
-	PPhi,
-	PIns,
-	PEnd,
-} PState;
-
-enum {
-	TXXX = NPubOp,
-	TCall,
-	TPhi,
-	TJmp,
-	TJnz,
-	TRet,
-	TFunc,
-	TType,
-	TData,
-	TAlign,
-	TL,
-	TW,
-	TH,
-	TB,
-	TD,
-	TS,
-	TZ,
-
-	TInt,
-	TFlts,
-	TFltd,
-	TTmp,
-	TLbl,
-	TGlo,
-	TTyp,
-	TStr,
-
-	TPlus,
-	TEq,
-	TComma,
-	TLParen,
-	TRParen,
-	TLBrace,
-	TRBrace,
-	TNL,
-	TEOF,
-};
-
-
-static FILE *inf;
-static char *inpath;
-static int thead;
-static struct {
-	char chr;
-	double fltd;
-	float flts;
-	int64_t num;
-	char *str;
-} tokval;
-static int lnum;
-
-static Tmp *tmp;
-static Con *con;
-static int ntmp;
-static int ncon;
-static Phi **plink;
-static Blk **bmap;
-static Blk *curb;
-static Blk **blink;
-static int nblk;
-static int rcls;
-static int ntyp;
-
-
-void
-err(char *s, ...)
-{
-	char buf[100], *p, *end;
-	va_list ap;
-
-	p = buf;
-	end = buf + sizeof(buf);
-
-	va_start(ap, s);
-	p += snprintf(p, end - p, "%s:%d: ", inpath, lnum);
-	p += vsnprintf(p, end - p, s, ap);
-	va_end(ap);
-
-	diag(buf);
-}
-
-static int
-lex()
-{
-	static struct {
-		char *str;
-		int tok;
-	} tmap[] = {
-		{ "call", TCall },
-		{ "phi", TPhi },
-		{ "jmp", TJmp },
-		{ "jnz", TJnz },
-		{ "ret", TRet },
-		{ "function", TFunc },
-		{ "type", TType },
-		{ "data", TData },
-		{ "align", TAlign },
-		{ "l", TL },
-		{ "w", TW },
-		{ "h", TH },
-		{ "b", TB },
-		{ "d", TD },
-		{ "s", TS },
-		{ "z", TZ },
-		{ "loadw", OLoad }, /* for convenience */
-		{ "loadl", OLoad },
-		{ "loads", OLoad },
-		{ "loadd", OLoad },
-		{ "alloc1", OAlloc },
-		{ "alloc2", OAlloc },
-		{ 0, TXXX }
-	};
-	static char tok[NString];
-	int c, i;
-	int t;
-
-	do
-		c = fgetc(inf);
-	while (isblank(c));
-	t = TXXX;
-	tokval.chr = c;
-	switch (c) {
-	case EOF:
-		return TEOF;
-	case ',':
-		return TComma;
-	case '(':
-		return TLParen;
-	case ')':
-		return TRParen;
-	case '{':
-		return TLBrace;
-	case '}':
-		return TRBrace;
-	case '=':
-		return TEq;
-	case '+':
-		return TPlus;
-	case 's':
-		if (fscanf(inf, "_%f", &tokval.flts) != 1)
-			break;
-		return TFlts;
-	case 'd':
-		if (fscanf(inf, "_%lf", &tokval.fltd) != 1)
-			break;
-		return TFltd;
-	case '%':
-		t = TTmp;
-		goto Alpha;
-	case '@':
-		t = TLbl;
-		goto Alpha;
-	case '$':
-		t = TGlo;
-		goto Alpha;
-	case ':':
-		t = TTyp;
-		goto Alpha;
-	case '#':
-		while (fgetc(inf) != '\n')
-			;
-	case '\n':
-		lnum++;
-		return TNL;
-	}
-	if (isdigit(c) || c == '-' || c == '+') {
-		ungetc(c, inf);
-		if (fscanf(inf, "%"SCNd64, &tokval.num) != 1)
-			err("invalid integer literal");
-		return TInt;
-	}
-	if (c == '"') {
-		tokval.str = vnew(0, 1);
-		for (i=0;; i++) {
-			c = fgetc(inf);
-			vgrow(&tokval.str, i+1);
-			if (c == '"')
-			if (!i || tokval.str[i-1] != '\\') {
-				tokval.str[i] = 0;
-				return TStr;
-			}
-			tokval.str[i] = c;
-		}
-	}
-	if (0)
-Alpha:		c = fgetc(inf);
-	if (!isalpha(c) && c != '.' && c != '_')
-		err("lexing failure: invalid character %c (%d)", c, c);
-	i = 0;
-	do {
-		if (i >= NString-1)
-			err("identifier too long");
-		tok[i++] = c;
-		c = fgetc(inf);
-	} while (isalpha(c) || c == '$' || c == '.' || c == '_' || isdigit(c));
-	tok[i] = 0;
-	ungetc(c, inf);
-	tokval.str = tok;
-	if (t != TXXX) {
-		return t;
-	}
-	for (i=0; i<NPubOp; i++)
-		if (opdesc[i].name)
-		if (strcmp(tok, opdesc[i].name) == 0)
-			return i;
-	for (i=0; tmap[i].str; i++)
-		if (strcmp(tok, tmap[i].str) == 0)
-			return tmap[i].tok;
-	err("unknown keyword %s", tokval.str);
-	return TXXX;
-}
-
-static int
-peek()
-{
-	if (thead == TXXX)
-		thead = lex();
-	return thead;
-}
-
-static int
-next()
-{
-	int t;
-
-	t = peek();
-	thead = TXXX;
-	return t;
-}
-
-static int
-nextnl()
-{
-	int t;
-
-	while ((t = next()) == TNL)
-		;
-	return t;
-}
-
-static void
-expect(int t)
-{
-	static char *ttoa[] = {
-		[TLbl] = "label",
-		[TComma] = ",",
-		[TEq] = "=",
-		[TNL] = "newline",
-		[TLParen] = "(",
-		[TRParen] = ")",
-		[TLBrace] = "{",
-		[TRBrace] = "}",
-		[TEOF] = 0,
-	};
-	char buf[128], *s1, *s2;
-	int t1;
-
-	t1 = next();
-	if (t == t1)
-		return;
-	s1 = ttoa[t] ? ttoa[t] : "??";
-	s2 = ttoa[t1] ? ttoa[t1] : "??";
-	sprintf(buf, "%s expected, got %s instead", s1, s2);
-	err(buf);
-}
-
-static Ref
-tmpref(char *v)
-{
-	int t;
-
-	for (t=Tmp0; t<ntmp; t++)
-		if (strcmp(v, tmp[t].name) == 0)
-			return TMP(t);
-	vgrow(&tmp, ++ntmp);
-	strcpy(tmp[t].name, v);
-	return TMP(t);
-}
-
-static Ref
-parseref()
-{
-	Con c;
-	int i;
-
-	memset(&c, 0, sizeof c);
-	switch (next()) {
-	case TTmp:
-		return tmpref(tokval.str);
-	case TInt:
-		c.type = CBits;
-		c.bits.i = tokval.num;
-		goto Look;
-	case TFlts:
-		c.type = CBits;
-		c.bits.s = tokval.flts;
-		c.flt = 1;
-		goto Look;
-	case TFltd:
-		c.type = CBits;
-		c.bits.d = tokval.fltd;
-		c.flt = 2;
-		goto Look;
-	case TGlo:
-		c.type = CAddr;
-		strcpy(c.label, tokval.str);
-	Look:
-		for (i=0; i<ncon; i++)
-			if (con[i].type == c.type
-			&& con[i].bits.i == c.bits.i
-			&& strcmp(con[i].label, c.label) == 0)
-				return CON(i);
-		vgrow(&con, ++ncon);
-		con[i] = c;
-		return CON(i);
-	default:
-		return R;
-	}
-}
-
-static int
-parsecls(int *tyn)
-{
-	int i;
-
-	switch (next()) {
-	default:
-		err("invalid class specifier");
-	case TTyp:
-		for (i=0; i<ntyp; i++)
-			if (strcmp(tokval.str, typ[i].name) == 0) {
-				*tyn = i;
-				return 4;
-			}
-		err("undefined type");
-	case TW:
-		return Kw;
-	case TL:
-		return Kl;
-	case TS:
-		return Ks;
-	case TD:
-		return Kd;
-	}
-}
-
-static void
-parserefl(int arg)
-{
-	int k, t, ty;
-	Ref r;
-
-	expect(TLParen);
-	if (peek() == TRParen) {
-		next();
-		return;
-	}
-	for (;;) {
-		if (curi - insb >= NIns)
-			err("too many instructions (1)");
-		k = parsecls(&ty);
-		r = parseref();
-		if (req(r, R))
-			err("invalid reference argument");
-		if (!arg && rtype(r) != RTmp)
-			err("invalid function parameter");
-		if (k == 4)
-			if (arg)
-				*curi = (Ins){OArgc, R, {TYPE(ty), r}, Kl};
-			else
-				*curi = (Ins){OParc, r, {TYPE(ty)}, Kl};
-		else
-			if (arg)
-				*curi = (Ins){OArg, R, {r}, k};
-			else
-				*curi = (Ins){OPar, r, {R}, k};
-		curi++;
-		t = next();
-		if (t == TRParen)
-			break;
-		if (t != TComma)
-			err(", or ) expected");
-	}
-}
-
-static Blk *
-findblk(char *name)
-{
-	int i;
-
-	for (i=0; i<nblk; i++)
-		if (strcmp(bmap[i]->name, name) == 0)
-			return bmap[i];
-	vgrow(&bmap, ++nblk);
-	bmap[i] = blknew();
-	strcpy(bmap[i]->name, name);
-	return bmap[i];
-}
-
-static void
-closeblk()
-{
-	curb->nins = curi - insb;
-	idup(&curb->ins, insb, curb->nins);
-	blink = &curb->link;
-	curi = insb;
-}
-
-static PState
-parseline(PState ps)
-{
-	Ref arg[NPred] = {R};
-	Blk *blk[NPred];
-	Phi *phi;
-	Ref r;
-	Blk *b;
-	int t, op, i, k, ty;
-
-	t = nextnl();
-	if (ps == PLbl && t != TLbl && t != TRBrace)
-		err("label or } expected");
-	switch (t) {
-	default:
-		if (isstore(t)) {
-			/* operations without result */
-			r = R;
-			k = 0;
-			op = t;
-			goto DoOp;
-		}
-		err("label, instruction or jump expected");
-	case TRBrace:
-		return PEnd;
-	case TTmp:
-		break;
-	case TLbl:
-		b = findblk(tokval.str);
-		if (b->jmp.type != JXXX)
-			err("multiple definitions of block");
-		if (curb && curb->jmp.type == JXXX) {
-			closeblk();
-			curb->jmp.type = JJmp;
-			curb->s1 = b;
-		}
-		*blink = b;
-		curb = b;
-		plink = &curb->phi;
-		expect(TNL);
-		return PPhi;
-	case TRet:
-		curb->jmp.type = (int[]){
-			JRetw, JRetl,
-			JRets, JRetd,
-			JRetc, JRet0
-		}[rcls];
-		if (rcls < 5) {
-			r = parseref();
-			if (req(r, R))
-				err("return value expected");
-			curb->jmp.arg = r;
-		}
-		goto Close;
-	case TJmp:
-		curb->jmp.type = JJmp;
-		goto Jump;
-	case TJnz:
-		curb->jmp.type = JJnz;
-		r = parseref();
-		if (req(r, R))
-			err("invalid argument for jnz jump");
-		curb->jmp.arg = r;
-		expect(TComma);
-	Jump:
-		expect(TLbl);
-		curb->s1 = findblk(tokval.str);
-		if (curb->jmp.type != JJmp) {
-			expect(TComma);
-			expect(TLbl);
-			curb->s2 = findblk(tokval.str);
-		}
-	Close:
-		expect(TNL);
-		closeblk();
-		return PLbl;
-	}
-	r = tmpref(tokval.str);
-	expect(TEq);
-	k = parsecls(&ty);
-	op = next();
-DoOp:
-	if (op == TPhi) {
-		if (ps != PPhi)
-			err("unexpected phi instruction");
-		op = -1;
-	}
-	if (op == TCall) {
-		arg[0] = parseref();
-		parserefl(1);
-		expect(TNL);
-		op = OCall;
-		if (k == 4) {
-			k = Kl;
-			arg[1] = TYPE(ty);
-		} else
-			arg[1] = R;
-		goto Ins;
-	}
-	if (k == 4)
-		err("size class must be w, l, s, or d");
-	if (op >= NPubOp)
-		err("invalid instruction");
-	i = 0;
-	if (peek() != TNL)
-		for (;;) {
-			if (i == NPred)
-				err("too many arguments");
-			if (op == -1) {
-				expect(TLbl);
-				blk[i] = findblk(tokval.str);
-			}
-			arg[i] = parseref();
-			if (req(arg[i], R))
-				err("invalid instruction argument");
-			i++;
-			t = peek();
-			if (t == TNL)
-				break;
-			if (t != TComma)
-				err(", or end of line expected");
-			next();
-		}
-	next();
-	if (op != -1) {
-	Ins:
-		if (curi - insb >= NIns)
-			err("too many instructions (2)");
-		curi->op = op;
-		curi->cls = k;
-		curi->to = r;
-		curi->arg[0] = arg[0];
-		curi->arg[1] = arg[1];
-		curi++;
-		return PIns;
-	} else {
-		phi = alloc(sizeof *phi);
-		phi->to = r;
-		phi->cls = k;
-		memcpy(phi->arg, arg, i * sizeof arg[0]);
-		memcpy(phi->blk, blk, i * sizeof blk[0]);
-		phi->narg = i;
-		*plink = phi;
-		plink = &phi->link;
-		return PPhi;
-	}
-}
-
-static Fn *
-parsefn()
-{
-	PState ps;
-	Fn *fn;
-
-	ntmp = Tmp0;
-	ncon = 1; /* first constant must be 0 */
-	curb = 0;
-	nblk = 0;
-	curi = insb;
-	tmp = vnew(ntmp, sizeof tmp[0]);
-	con = vnew(ncon, sizeof con[0]);
-	bmap = vnew(nblk, sizeof bmap[0]);
-	con[0].type = CBits;
-	fn = alloc(sizeof *fn);
-	blink = &fn->start;
-	fn->retty = -1;
-	if (peek() != TGlo)
-		rcls = parsecls(&fn->retty);
-	else
-		rcls = 5;
-	if (next() != TGlo)
-		err("function name expected");
-	strcpy(fn->name, tokval.str);
-	parserefl(0);
-	if (nextnl() != TLBrace)
-		err("function body must start with {");
-	ps = PLbl;
-	do
-		ps = parseline(ps);
-	while (ps != PEnd);
-	if (!curb)
-		err("empty file");
-	if (curb->jmp.type == JXXX)
-		err("last block misses jump");
-	fn->tmp = tmp;
-	fn->con = con;
-	fn->mem = vnew(0, sizeof fn->mem[0]);
-	fn->ntmp = ntmp;
-	fn->ncon = ncon;
-	fn->nmem = 0;
-	fn->nblk = nblk;
-	fn->rpo = 0;
-	return fn;
-}
-
-static void
-parsetyp()
-{
-	Typ *ty;
-	int t, n, sz, al, s, a, c, flt;
-
-	if (ntyp >= NTyp)
-		err("too many type definitions");
-	ty = &typ[ntyp++];
-	ty->align = -1;
-	if (nextnl() != TTyp ||  nextnl() != TEq)
-		err("type name, then = expected");
-	strcpy(ty->name, tokval.str);
-	t = nextnl();
-	if (t == TAlign) {
-		if (nextnl() != TInt)
-			err("alignment expected");
-		for (al=0; tokval.num /= 2; al++)
-			;
-		ty->align = al;
-		t = nextnl();
-	}
-	if (t != TLBrace)
-		err("type body must start with {");
-	t = nextnl();
-	if (t == TInt) {
-		ty->dark = 1;
-		ty->size = tokval.num;
-		if (ty->align == -1)
-			err("dark types need alignment");
-		t = nextnl();
-	} else {
-		ty->dark = 0;
-		n = -1;
-		sz = 0;
-		al = 0;
-		for (;;) {
-			flt = 0;
-			switch (t) {
-			default: err("invalid size specifier %c", tokval.chr);
-			case TD: flt = 1;
-			case TL: s = 8; a = 3; break;
-			case TS: flt = 1;
-			case TW: s = 4; a = 2; break;
-			case TH: s = 2; a = 1; break;
-			case TB: s = 1; a = 0; break;
-			}
-			if (a > al)
-				al = a;
-			if ((a = sz & (s-1))) {
-				a = s - a;
-				if (++n < NSeg) {
-					/* padding segment */
-					ty->seg[n].ispad = 1;
-					ty->seg[n].len = a;
-				}
-			}
-			t = nextnl();
-			if (t == TInt) {
-				c = tokval.num;
-				t = nextnl();
-			} else
-				c = 1;
-			while (c-- > 0) {
-				if (++n < NSeg) {
-					ty->seg[n].isflt = flt;
-					ty->seg[n].ispad = 0;
-					ty->seg[n].len = s;
-				}
-				sz += a + s;
-			}
-			if (t != TComma)
-				break;
-			t = nextnl();
-		}
-		if (++n >= NSeg)
-			ty->dark = 1;
-		else
-			ty->seg[n].len = 0;
-		if (ty->align == -1)
-			ty->align = al;
-		else
-			al = ty->align;
-		a = (1 << al) - 1;
-		ty->size = (sz + a) & ~a;
-	}
-	if (t != TRBrace)
-		err("expected closing }");
-}
-
-static void
-parsedatref(Dat *d)
-{
-	int t;
-
-	d->isref = 1;
-	d->u.ref.nam = tokval.str;
-	d->u.ref.off = 0;
-	t = peek();
-	if (t == TPlus) {
-		next();
-		if (next() != TInt)
-			err("invalid token after offset in ref");
-		d->u.ref.off = tokval.num;
-	}
-}
-
-static void
-parsedatstr(Dat *d)
-{
-	d->isstr = 1;
-	d->u.str = tokval.str;
-}
-
-static void
-parsedat(void cb(Dat *))
-{
-	char s[NString];
-	int t;
-	Dat d;
-
-	d.type = DStart;
-	d.isstr = 0;
-	d.isref = 0;
-	cb(&d);
-	if (nextnl() != TGlo || nextnl() != TEq)
-		err("data name, then = expected");
-	strcpy(s, tokval.str);
-	t = nextnl();
-	if (t == TAlign) {
-		if (nextnl() != TInt)
-			err("alignment expected");
-		d.type = DAlign;
-		d.u.num = tokval.num;
-		cb(&d);
-		t = nextnl();
-	}
-	d.type = DName;
-	d.u.str = s;
-	cb(&d);
-
-	if (t != TLBrace)
-		err("expected data contents in { .. }");
-	for (;;) {
-		switch (nextnl()) {
-		default: err("invalid size specifier %c in data", tokval.chr);
-		case TRBrace: goto Done;
-		case TL: d.type = DL; break;
-		case TW: d.type = DW; break;
-		case TH: d.type = DH; break;
-		case TB: d.type = DB; break;
-		case TS: d.type = DW; break;
-		case TD: d.type = DL; break;
-		case TZ: d.type = DZ; break;
-		}
-		t = nextnl();
-		do {
-			d.isref = 0;
-			d.isstr = 0;
-			memset(&d.u, 0, sizeof d.u);
-			if (t == TFlts)
-				d.u.flts = tokval.flts;
-			else if (t == TFltd)
-				d.u.fltd = tokval.fltd;
-			else if (t == TInt)
-				d.u.num = tokval.num;
-			else if (t == TGlo)
-				parsedatref(&d);
-			else if (t == TStr)
-				parsedatstr(&d);
-			else
-				err("constant literal expected");
-			cb(&d);
-			t = nextnl();
-		} while (t == TInt || t == TFlts || t == TFltd);
-		if (t == TRBrace)
-			break;
-		if (t != TComma)
-			err(", or } expected");
-	}
-Done:
-	d.type = DEnd;
-	cb(&d);
-}
-
-void
-parse(FILE *f, char *path, void data(Dat *), void func(Fn *))
-{
-	inf = f;
-	inpath = path;
-	lnum = 1;
-	thead = TXXX;
-	ntyp = 0;
-	for (;;)
-		switch (nextnl()) {
-		case TFunc:
-			func(parsefn());
-			break;
-		case TType:
-			parsetyp();
-			break;
-		case TData:
-			parsedat(data);
-			break;
-		case TEOF:
-			return;
-		default:
-			err("top-level definition expected");
-			break;
-		}
-}
-
-static void
-printcon(Con *c, FILE *f)
-{
-	switch (c->type) {
-	case CUndef:
-		break;
-	case CAddr:
-		fprintf(f, "$%s", c->label);
-		if (c->bits.i)
-			fprintf(f, "%+"PRIi64, c->bits.i);
-		break;
-	case CBits:
-		if (c->flt == 1)
-			fprintf(f, "s_%f", c->bits.s);
-		else if (c->flt == 2)
-			fprintf(f, "d_%lf", c->bits.d);
-		else
-			fprintf(f, "%"PRIi64, c->bits.i);
-		break;
-	}
-}
-
-void
-printref(Ref r, Fn *fn, FILE *f)
-{
-	int i;
-	Mem *m;
-
-	switch (rtype(r)) {
-	case RTmp:
-		if (r.val < Tmp0)
-			fprintf(f, "R%d", r.val);
-		else
-			fprintf(f, "%%%s", fn->tmp[r.val].name);
-		break;
-	case RCon:
-		printcon(&fn->con[r.val], f);
-		break;
-	case RSlot:
-		fprintf(f, "S%d", r.val);
-		break;
-	case RACall:
-		fprintf(f, "%03x", r.val & AMask);
-		break;
-	case RAType:
-		fprintf(f, ":%s", typ[r.val & AMask].name);
-		break;
-	case RAMem:
-		i = 0;
-		m = &fn->mem[r.val & AMask];
-		fputc('[', f);
-		if (m->offset.type != CUndef) {
-			printcon(&m->offset, f);
-			i = 1;
-		}
-		if (!req(m->base, R)) {
-			if (i)
-				fprintf(f, " + ");
-			printref(m->base, fn, f);
-			i = 1;
-		}
-		if (!req(m->index, R)) {
-			if (i)
-				fprintf(f, " + ");
-			fprintf(f, "%d * ", m->scale);
-			printref(m->index, fn, f);
-		}
-		fputc(']', f);
-		break;
-	}
-}
-
-void
-printfn(Fn *fn, FILE *f)
-{
-	static char *jtoa[NJmp] = {
-		[JRet0]     = "ret",
-		[JRetw]     = "retw",
-		[JRetl]     = "retl",
-		[JRetc]     = "retc",
-		[JRets]     = "rets",
-		[JRetd]     = "retd",
-		[JJnz]      = "jnz",
-		[JXJnp]     = "xjnp",
-		[JXJp]      = "xjp",
-	#define X(c) [JXJc+IC##c] = "xj" #c,
-		ICMPS(X)
-	#undef X
-	};
-	static char prcls[NOp] = {
-		[OArg] = 1,
-		[OSwap] = 1,
-		[OXCmp] = 1,
-		[OXTest] = 1,
-		[OXDiv] = 1,
-		[OXIDiv] = 1,
-	};
-	static char ktoc[] = "wlsd";
-	Blk *b;
-	Phi *p;
-	Ins *i;
-	uint n;
-
-	fprintf(f, "function $%s() {\n", fn->name);
-	for (b=fn->start; b; b=b->link) {
-		fprintf(f, "@%s\n", b->name);
-		for (p=b->phi; p; p=p->link) {
-			fprintf(f, "\t");
-			printref(p->to, fn, f);
-			fprintf(f, " =%c phi ", ktoc[p->cls]);
-			assert(p->narg);
-			for (n=0;; n++) {
-				fprintf(f, "@%s ", p->blk[n]->name);
-				printref(p->arg[n], fn, f);
-				if (n == p->narg-1) {
-					fprintf(f, "\n");
-					break;
-				} else
-					fprintf(f, ", ");
-			}
-		}
-		for (i=b->ins; i-b->ins < b->nins; i++) {
-			fprintf(f, "\t");
-			if (!req(i->to, R)) {
-				printref(i->to, fn, f);
-				fprintf(f, " =%c ", ktoc[i->cls]);
-			}
-			assert(opdesc[i->op].name);
-			fprintf(f, "%s", opdesc[i->op].name);
-			if (req(i->to, R) && prcls[i->op])
-				fputc(ktoc[i->cls], f);
-			if (!req(i->arg[0], R)) {
-				fprintf(f, " ");
-				printref(i->arg[0], fn, f);
-			}
-			if (!req(i->arg[1], R)) {
-				fprintf(f, ", ");
-				printref(i->arg[1], fn, f);
-			}
-			fprintf(f, "\n");
-		}
-		switch (b->jmp.type) {
-		case JRet0:
-		case JRetw:
-		case JRetl:
-		case JRets:
-		case JRetd:
-		case JRetc:
-			fprintf(f, "\t%s", jtoa[b->jmp.type]);
-			if (b->jmp.type != JRet0 || !req(b->jmp.arg, R)) {
-				fprintf(f, " ");
-				printref(b->jmp.arg, fn, f);
-			}
-			if (b->jmp.type == JRetc)
-				fprintf(f, ", :%s", typ[fn->retty].name);
-			fprintf(f, "\n");
-			break;
-		case JJmp:
-			if (b->s1 != b->link)
-				fprintf(f, "\tjmp @%s\n", b->s1->name);
-			break;
-		default:
-			fprintf(f, "\t%s ", jtoa[b->jmp.type]);
-			if (b->jmp.type == JJnz) {
-				printref(b->jmp.arg, fn, f);
-				fprintf(f, ", ");
-			}
-			fprintf(f, "@%s, @%s\n", b->s1->name, b->s2->name);
-			break;
-		}
-	}
-	fprintf(f, "}\n");
-}
diff --git a/lisc/rega.c b/lisc/rega.c
@@ -1,597 +0,0 @@
-#include "lisc.h"
-#ifdef TEST_PMOV
-	#undef assert
-	#define assert(x) assert_test(#x, x)
-#endif
-
-typedef struct RMap RMap;
-
-struct RMap {
-	int t[NIReg+NFReg];
-	int r[NIReg+NFReg];
-	BSet b[1];
-	int n;
-};
-
-static bits regu;      /* registers used */
-static Tmp *tmp;       /* function temporaries */
-static Mem *mem;       /* function mem references */
-static struct {
-	Ref src, dst;
-	int cls;
-} *pm;                 /* parallel move constructed */
-static int cpm, npm;   /* capacity and size of pm */
-
-static int *
-hint(int t)
-{
-	return &tmp[phicls(t, tmp)].hint.r;
-}
-
-static void
-sethint(int t, int r)
-{
-	bits m;
-
-	m = tmp[phicls(t, tmp)].hint.m;
-	if (*hint(t) == -1)
-	if (!(BIT(r) & m))
-		*hint(t) = r;
-}
-
-static void
-rcopy(RMap *ma, RMap *mb)
-{
-	memcpy(ma->t, mb->t, sizeof ma->t);
-	memcpy(ma->r, mb->r, sizeof ma->r);
-	bscopy(ma->b, mb->b);
-	ma->n = mb->n;
-}
-
-static int
-rfind(RMap *m, int t)
-{
-	int i;
-
-	for (i=0; i<m->n; i++)
-		if (m->t[i] == t)
-			return m->r[i];
-	return -1;
-}
-
-static Ref
-rref(RMap *m, int t)
-{
-	int r, s;
-
-	r = rfind(m, t);
-	if (r == -1) {
-		s = tmp[t].slot;
-		assert(s != -1 && "should have spilled");
-		return SLOT(s);
-	} else
-		return TMP(r);
-}
-
-static void
-radd(RMap *m, int t, int r)
-{
-	assert((t >= Tmp0 || t == r) && "invalid temporary");
-	assert(((RAX <= r && r < RAX + NIReg) || (XMM0 <= r && r < XMM0 + NFReg)) && "invalid register");
-	assert(!bshas(m->b, t) && "temporary has mapping");
-	assert(!bshas(m->b, r) && "register already allocated");
-	assert(m->n <= NIReg+NFReg && "too many mappings");
-	bsset(m->b, t);
-	bsset(m->b, r);
-	m->t[m->n] = t;
-	m->r[m->n] = r;
-	m->n++;
-	regu |= BIT(r);
-}
-
-static Ref
-ralloc(RMap *m, int t)
-{
-	bits regs;
-	int r, r0, r1;
-
-	if (t < Tmp0) {
-		assert(bshas(m->b, t));
-		return TMP(t);
-	}
-	if (bshas(m->b, t)) {
-		r = rfind(m, t);
-		assert(r != -1);
-		return TMP(r);
-	}
-	r = *hint(t);
-	if (r == -1 || bshas(m->b, r)) {
-		regs = tmp[phicls(t, tmp)].hint.m;
-		regs |= m->b->t[0];
-		switch (KBASE(tmp[t].cls)) {
-		case 0:
-			r0 = RAX;
-			r1 = RAX + NIReg;
-			break;
-		case 1:
-			r0 = XMM0;
-			r1 = XMM0 + NFReg;
-			break;
-		}
-		for (r=r0; r<r1; r++)
-			if (!(regs & BIT(r)))
-				goto Found;
-		for (r=r0; r<r1; r++)
-			if (!bshas(m->b, r))
-				goto Found;
-		diag("rega: no more regs");
-	}
-Found:
-	radd(m, t, r);
-	sethint(t, r);
-	return TMP(r);
-}
-
-static int
-rfree(RMap *m, int t)
-{
-	int i, r;
-
-	if (!bshas(m->b, t))
-		return -1;
-	for (i=0; m->t[i] != t; i++)
-		assert(i+1 < m->n);
-	r = m->r[i];
-	bsclr(m->b, t);
-	bsclr(m->b, r);
-	m->n--;
-	memmove(&m->t[i], &m->t[i+1], (m->n-i) * sizeof m->t[0]);
-	memmove(&m->r[i], &m->r[i+1], (m->n-i) * sizeof m->r[0]);
-	return r;
-}
-
-static void
-mdump(RMap *m)
-{
-	int i;
-
-	for (i=0; i<m->n; i++)
-		fprintf(stderr, " (%s, R%d)",
-			tmp[m->t[i]].name,
-			m->r[i]);
-	fprintf(stderr, "\n");
-}
-
-static void
-pmadd(Ref src, Ref dst, int k)
-{
-	if (npm == cpm) {
-		cpm = cpm * 2 + 16;
-		pm = realloc(pm, cpm * sizeof pm[0]);
-		if (!pm)
-			diag("pmadd: out of memory");
-	}
-	pm[npm].src = src;
-	pm[npm].dst = dst;
-	pm[npm].cls = k;
-	npm++;
-}
-
-enum PMStat { ToMove, Moving, Moved };
-
-static Ref
-pmrec(enum PMStat *status, int i, int *k)
-{
-	Ref swp, swp1;
-	int j, k1;
-
-	/* note, this routine might emit
-	 * too many large instructions:
-	 *
-	 *                  , x -- x
-	 *      x -- x -- x        |
-	 *                  ` x -- x
-	 *
-	 * if only the first move is wide
-	 * the whole cycle will be wide,
-	 * this is safe but not necessary
-	 */
-
-	if (req(pm[i].src, pm[i].dst))
-		return R;
-	status[i] = Moving;
-	assert(KBASE(*k) == KBASE(pm[i].cls));
-	assert((Kw|1) == Kl && (Ks|1) == Kd);
-	*k |= KWIDE(pm[i].cls); /* see above */
-	swp = R;
-	for (j=0; j<npm; j++) {
-		if (req(pm[j].src, pm[i].dst))
-			switch (status[j]) {
-			case ToMove:
-				k1 = *k;
-				swp1 = pmrec(status, j, &k1);
-				if (!req(swp1, R)) {
-					assert(req(swp, R));
-					swp = swp1;
-					*k = k1;
-				}
-				break;
-			case Moving:
-				assert(req(swp, R));
-				swp = pm[i].dst;
-				break;
-			case Moved:
-				break;
-			}
-	}
-	status[i] = Moved;
-	if (req(swp, R)) {
-		*curi++ = (Ins){OCopy, pm[i].dst, {pm[i].src}, pm[i].cls};
-		return R;
-	} else if (!req(swp, pm[i].src)) {
-		*curi++ = (Ins){OSwap, R, {pm[i].src, pm[i].dst}, *k};
-		return swp;
-	} else
-		return R;
-
-}
-
-static void
-pmgen()
-{
-	int i, k;
-	enum PMStat *status;
-
-	status = alloc(npm * sizeof status[0]);
-	assert(!npm || status[npm-1] == ToMove);
-	curi = insb;
-	for (i=0; i<npm; i++)
-		if (status[i] == ToMove) {
-			k = pm[i].cls;
-			pmrec(status, i, &k);
-		}
-}
-
-static void
-move(int r, Ref to, RMap *m)
-{
-	int n, t, r1;
-
-	r1 = req(to, R) ? -1 : rfree(m, to.val);
-	if (bshas(m->b, r) && r1 != r) {
-		/* r is used and not by to */
-		for (n=0; m->r[n] != r; n++)
-			assert(n+1 < m->n);
-		t = m->t[n];
-		rfree(m, t);
-		bsset(m->b, r);
-		ralloc(m, t);
-		bsclr(m->b, r);
-	}
-	t = req(to, R) ? r : to.val;
-	radd(m, t, r);
-}
-
-static int
-regcpy(Ins *i)
-{
-	return i->op == OCopy && isreg(i->arg[0]);
-}
-
-static Ins *
-dopm(Blk *b, Ins *i, RMap *m)
-{
-	RMap m0;
-	int n, r, r1, t, s;
-	Ins *i0, *i1, *ip, *ir;
-	bits def;
-
-	m0 = *m;
-	i1 = ++i;
-	do {
-		i--;
-		move(i->arg[0].val, i->to, m);
-	} while (i != b->ins && regcpy(i-1));
-	assert(m0.n <= m->n);
-	if (i != b->ins && (i-1)->op == OCall) {
-		def = retregs((i-1)->arg[1], 0);
-		for (r=0; r<NRSave; r++)
-			if (!(BIT(rsave[r]) & def))
-				move(rsave[r], R, m);
-	}
-	for (npm=0, n=0; n<m->n; n++) {
-		t = m->t[n];
-		s = tmp[t].slot;
-		r1 = m->r[n];
-		r = rfind(&m0, t);
-		if (r != -1)
-			pmadd(TMP(r1), TMP(r), tmp[t].cls);
-		else if (s != -1)
-			pmadd(TMP(r1), SLOT(s), tmp[t].cls);
-	}
-	for (ip=i; ip<i1; ip++) {
-		if (!req(ip->to, R))
-			rfree(m, ip->to.val);
-		r = ip->arg[0].val;
-		if (rfind(m, r) == -1)
-			radd(m, r, r);
-	}
-	pmgen();
-#ifdef TEST_PMOV
-	return 0;
-#endif
-	n = b->nins - (i1 - i) + (curi - insb);
-	i0 = alloc(n * sizeof(Ins));
-	ip = icpy(ip = i0, b->ins, i - b->ins);
-	ip = icpy(ir = ip, insb, curi - insb);
-	ip = icpy(ip, i1, &b->ins[b->nins] - i1);
-	b->nins = n;
-	b->ins = i0;
-	return ir;
-}
-
-static int
-prio(Ref r1, Ref r2)
-{
-	/* trivial heuristic to begin with,
-	 * later we can use the distance to
-	 * the definition instruction
-	 */
-	(void) r2;
-	return *hint(r1.val) != -1;
-}
-
-static void
-insert(Ref *r, Ref **rs, int p)
-{
-	int i;
-
-	rs[i = p] = r;
-	while (i-- > 0 && prio(*r, *rs[i])) {
-		rs[i+1] = rs[i];
-		rs[i] = r;
-	}
-}
-
-static void
-doblk(Blk *b, RMap *cur)
-{
-	int x, r, nr;
-	bits rs;
-	Ins *i;
-	Mem *m;
-	Ref *ra[4];
-
-	if (rtype(b->jmp.arg) == RTmp)
-		b->jmp.arg = ralloc(cur, b->jmp.arg.val);
-	else if (rtype(b->jmp.arg) == RACall) {
-		/* add return registers */
-		rs = retregs(b->jmp.arg, 0);
-		for (r=0; rs; rs/=2, r++)
-			if (rs & 1)
-				radd(cur, r, r);
-	}
-	for (i=&b->ins[b->nins]; i!=b->ins;) {
-		switch ((--i)->op) {
-		case OCall:
-			rs = argregs(i->arg[1], 0);
-			for (r=0; r<NRSave; r++)
-				if (!(BIT(rsave[r]) & rs))
-					rfree(cur, rsave[r]);
-			break;
-		case OCopy:
-			if (isreg(i->arg[0])) {
-				i = dopm(b, i, cur);
-				continue;
-			}
-			if (isreg(i->to))
-			if (rtype(i->arg[0]) == RTmp)
-				sethint(i->arg[0].val, i->to.val);
-			/* fall through */
-		default:
-			if (!req(i->to, R)) {
-				assert(rtype(i->to) == RTmp);
-				r = rfree(cur, i->to.val);
-				if (r == -1 && !isreg(i->to)) {
-					*i = (Ins){.op = ONop};
-					continue;
-				}
-				if (i->to.val >= Tmp0)
-					i->to = TMP(r);
-			}
-			break;
-		}
-		for (x=0, nr=0; x<2; x++)
-			switch (rtype(i->arg[x])) {
-			case RAMem:
-				m = &mem[i->arg[x].val & AMask];
-				if (rtype(m->base) == RTmp)
-					insert(&m->base, ra, nr++);
-				if (rtype(m->index) == RTmp)
-					insert(&m->index, ra, nr++);
-				break;
-			case RTmp:
-				insert(&i->arg[x], ra, nr++);
-				break;
-			}
-		for (r=0; r<nr; r++)
-			*ra[r] = ralloc(cur, ra[r]->val);
-	}
-}
-
-/* register allocation
- * depends on rpo, phi, cost, (and obviously spill)
- */
-void
-rega(Fn *fn)
-{
-	int j, n, t, r, r1, x, rl[Tmp0];
-	Blk *b, *b1, *s, ***ps, *blist;
-	RMap *end, *beg, cur, old;
-	Ins *i;
-	Phi *p;
-	uint u;
-	Ref src, dst;
-
-	/* 1. setup */
-	regu = 0;
-	tmp = fn->tmp;
-	mem = fn->mem;
-	end = alloc(fn->nblk * sizeof end[0]);
-	beg = alloc(fn->nblk * sizeof beg[0]);
-	for (n=0; n<fn->nblk; n++) {
-		bsinit(end[n].b, fn->ntmp);
-		bsinit(beg[n].b, fn->ntmp);
-	}
-	bsinit(cur.b, fn->ntmp);
-	bsinit(old.b, fn->ntmp);
-
-	for (t=Tmp0; t<fn->ntmp; t++)
-		*hint(t) = -1;
-	for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
-		if (i->op != OCopy || !isreg(i->arg[0]))
-			break;
-		else {
-			assert(rtype(i->to) == RTmp);
-			sethint(i->to.val, i->arg[0].val);
-		}
-
-	/* 2. assign registers following post-order */
-	for (n=fn->nblk-1; n>=0; n--) {
-		b = fn->rpo[n];
-		cur.n = 0;
-		bszero(cur.b);
-		for (x=0; x<2; x++)
-			for (t=Tmp0; t<fn->ntmp; t++) {
-				assert(bshas(b->out, t) ||
-					!bshas(cur.b, t));
-				if (bshas(b->out, t))
-				if (!bshas(cur.b, t))
-				if (x || (r=*hint(t)) != -1)
-				if (x || !bshas(cur.b, r))
-					ralloc(&cur, t);
-			}
-		rcopy(&end[n], &cur);
-		doblk(b, &cur);
-		bscopy(b->in, cur.b);
-		for (p=b->phi; p; p=p->link)
-			if (rtype(p->to) == RTmp) {
-				bsclr(b->in, p->to.val);
-				/* heuristic 0:
-				 * if the phi destination has an
-				 * argument from a frequent block
-				 * that was already allocated to
-				 * 'r', use 'r' as the new hint
-				 */
-				memset(rl, 0, sizeof rl);
-				for (u=0; u<p->narg; u++) {
-					t = p->arg[u].val;
-					b1 = p->blk[u];
-					if (rtype(p->arg[u]) == RTmp)
-					if ((r=rfind(&end[b1->id], t)) != -1)
-						rl[r] += b1->loop;
-				}
-				for (x=0, j=0; j<Tmp0; j++)
-					if (rl[j] > rl[x])
-						x = j;
-				if (rl[x] >= b->loop)
-					*hint(p->to.val) = x;
-			}
-		if (b->npred > 1) {
-			/* heuristic 1:
-			 * attempt to satisfy hints
-			 * when it's simple and we have
-			 * multiple predecessors
-			 */
-			rcopy(&old, &cur);
-			curi = &insb[NIns];
-			for (j=0; j<old.n; j++) {
-				t = old.t[j];
-				r = *hint(t);
-				r1 = rfind(&cur, t);
-				if (r != -1 && r != r1)
-				if (!bshas(cur.b, r)) {
-					rfree(&cur, t);
-					radd(&cur, t, r);
-					x = tmp[t].cls;
-					emit(OCopy, x, TMP(r1), TMP(r), R);
-				}
-			}
-			if ((j = &insb[NIns] - curi)) {
-				b->nins += j;
-				i = alloc(b->nins * sizeof(Ins));
-				icpy(icpy(i, curi, j), b->ins, b->nins-j);
-				b->ins = i;
-			}
-		}
-		rcopy(&beg[n], &cur);
-	}
-	if (debug['R'])  {
-		fprintf(stderr, "\n> Register mappings:\n");
-		for (n=0; n<fn->nblk; n++) {
-			b = fn->rpo[n];
-			fprintf(stderr, "\t%-10s beg", b->name);
-			mdump(&beg[n]);
-			fprintf(stderr, "\t           end");
-			mdump(&end[n]);
-		}
-		fprintf(stderr, "\n");
-	}
-
-	/* 3. compose glue code */
-	blist = 0;
-	for (b=fn->start;; b=b->link) {
-		ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}};
-		for (; (s=**ps); ps++) {
-			npm = 0;
-			for (p=s->phi; p; p=p->link) {
-				dst = p->to;
-				assert(rtype(dst)==RSlot || rtype(dst)==RTmp);
-				if (rtype(dst) == RTmp) {
-					r = rfind(&beg[s->id], dst.val);
-					if (r == -1)
-						continue;
-					dst = TMP(r);
-				}
-				for (u=0; p->blk[u]!=b; u++)
-					assert(u+1 < p->narg);
-				src = p->arg[u];
-				if (rtype(src) == RTmp)
-					src = rref(&end[b->id], src.val);
-				pmadd(src, dst, p->cls);
-			}
-			for (t=Tmp0; t<fn->ntmp; t++)
-				if (bshas(s->in, t)) {
-					src = rref(&end[b->id], t);
-					dst = rref(&beg[s->id], t);
-					pmadd(src, dst, tmp[t].cls);
-				}
-			pmgen();
-			if (curi == insb)
-				continue;
-			b1 = blknew();
-			b1->loop = (b->loop+s->loop) / 2;
-			b1->link = blist;
-			blist = b1;
-			fn->nblk++;
-			sprintf(b1->name, "%s_%s", b->name, s->name);
-			b1->nins = curi - insb;
-			idup(&b1->ins, insb, b1->nins);
-			b1->jmp.type = JJmp;
-			b1->s1 = s;
-			**ps = b1;
-		}
-		if (!b->link) {
-			b->link = blist;
-			break;
-		}
-	}
-	for (b=fn->start; b; b=b->link)
-		b->phi = 0;
-	fn->reg = regu;
-
-	if (debug['R']) {
-		fprintf(stderr, "\n> After register allocation:\n");
-		printfn(fn, stderr);
-	}
-}
diff --git a/lisc/spill.c b/lisc/spill.c
@@ -1,507 +0,0 @@
-#include "lisc.h"
-
-static void
-loopmark(Blk *hd, Blk *b, Phi *p)
-{
-	int k, head;
-	uint n, a;
-
-	head = hd->id;
-	if (b->id < head)
-		return;
-	for (; p; p=p->link)
-		for (a=0; a<p->narg; a++)
-			if (p->blk[a] == b)
-			if (rtype(p->arg[a]) == RTmp)
-				bsset(hd->gen, p->arg[a].val);
-	if (b->visit == head)
-		return;
-	b->visit = head;
-	b->loop *= 10;
-	/* aggregate looping information at
-	 * loop headers */
-	bsunion(hd->gen, b->gen);
-	for (k=0; k<2; k++)
-		if (b->nlive[k] > hd->nlive[k])
-			hd->nlive[k] = b->nlive[k];
-	for (n=0; n<b->npred; n++)
-		loopmark(hd, b->pred[n], b->phi);
-}
-
-static void
-tmpuse(Ref r, int use, int loop, Fn *fn)
-{
-	Mem *m;
-	Tmp *t;
-
-	if (rtype(r) == RAMem) {
-		m = &fn->mem[r.val & AMask];
-		tmpuse(m->base, 1, loop, fn);
-		tmpuse(m->index, 1, loop, fn);
-	}
-	else if (rtype(r) == RTmp && r.val >= Tmp0) {
-		t = &fn->tmp[r.val];
-		t->nuse += use;
-		t->ndef += !use;
-		t->cost += loop;
-	}
-}
-
-/* evaluate spill costs of temporaries,
- * this also fills usage information
- * requires rpo, preds
- */
-void
-fillcost(Fn *fn)
-{
-	int n, hd;
-	uint a;
-	Blk *b;
-	Ins *i;
-	Tmp *t;
-	Phi *p;
-
-	for (b=fn->start; b; b=b->link) {
-		b->loop = 1;
-		b->visit = -1;
-	}
-	if (debug['S'])
-		fprintf(stderr, "\n> Loop information:\n");
-	for (n=0; n<fn->nblk; n++) {
-		b = fn->rpo[n];
-		hd = 0;
-		for (a=0; a<b->npred; a++)
-			if (b->pred[a]->id >= n) {
-				loopmark(b, b->pred[a], b->phi);
-				hd = 1;
-			}
-		if (hd && debug['S']) {
-			fprintf(stderr, "\t%-10s", b->name);
-			fprintf(stderr, " (% 3d ", b->nlive[0]);
-			fprintf(stderr, "% 3d) ", b->nlive[1]);
-			dumpts(b->gen, fn->tmp, stderr);
-		}
-	}
-	for (t=fn->tmp; t-fn->tmp < fn->ntmp; t++) {
-		t->cost = t-fn->tmp < Tmp0 ? 1e6 : 0;
-		t->nuse = 0;
-		t->ndef = 0;
-	}
-	for (b=fn->start; b; b=b->link) {
-		for (p=b->phi; p; p=p->link) {
-			/* todo, the cost computation
-			 * for p->to is not great... */
-			tmpuse(p->to, 0, 0, fn);
-			for (a=0; a<p->narg; a++) {
-				n = p->blk[a]->loop;
-				assert(b->npred==p->narg &&
-					"wrong cfg");
-				n /= b->npred;
-				tmpuse(p->arg[a], 1, n, fn);
-			}
-		}
-		n = b->loop;
-		for (i=b->ins; i-b->ins < b->nins; i++) {
-			tmpuse(i->to, 0, n, fn);
-			tmpuse(i->arg[0], 1, n, fn);
-			tmpuse(i->arg[1], 1, n, fn);
-		}
-		tmpuse(b->jmp.arg, 1, n, fn);
-	}
-	if (debug['S']) {
-		fprintf(stderr, "\n> Spill costs:\n");
-		for (n=Tmp0; n<fn->ntmp; n++)
-			fprintf(stderr, "\t%-10s %d\n",
-				fn->tmp[n].name,
-				fn->tmp[n].cost);
-		fprintf(stderr, "\n");
-	}
-}
-
-static BSet *fst; /* temps to prioritize in registers (for tcmp1) */
-static Tmp *tmp;  /* current temporaries (for tcmpX) */
-static int ntmp;  /* current # of temps (for limit) */
-static int locs;  /* stack size used by locals */
-static int slot4; /* next slot of 4 bytes */
-static int slot8; /* ditto, 8 bytes */
-static BSet mask[2][1]; /* class masks */
-
-static int
-tcmp0(const void *pa, const void *pb)
-{
-	return tmp[*(int *)pb].cost - tmp[*(int *)pa].cost;
-}
-
-static int
-tcmp1(const void *pa, const void *pb)
-{
-	int c;
-
-	c = bshas(fst, *(int *)pb) - bshas(fst, *(int *)pa);
-	return c ? c : tcmp0(pa, pb);
-}
-
-static Ref
-slot(int t)
-{
-	int s;
-
-	if (t < Tmp0)
-		diag("spill: cannot spill register");
-	s = tmp[t].slot;
-	if (s == -1) {
-		assert(NAlign == 3);
-		/* nice logic to pack stack slots
-		 * on demand, there can be only
-		 * one hole and slot4 points to it
-		 *
-		 * invariant: slot4 <= slot8
-		 */
-		if (KWIDE(tmp[t].cls)) {
-			s = slot8;
-			if (slot4 == slot8)
-				slot4 += 2;
-			slot8 += 2;
-		} else {
-			s = slot4;
-			if (slot4 == slot8) {
-				slot8 += 2;
-				slot4 += 1;
-			} else
-				slot4 = slot8;
-		}
-		s += locs;
-		tmp[t].slot = s;
-	}
-	return SLOT(s);
-}
-
-static void
-limit(BSet *b, int k, BSet *f)
-{
-	static int *tarr, maxt;
-	int i, nt;
-	uint t;
-
-	nt = bscount(b);
-	if (nt <= k)
-		return;
-	if (nt > maxt) {
-		free(tarr);
-		tarr = emalloc(nt * sizeof tarr[0]);
-		maxt = nt;
-	}
-	for (i=0, t=0; bsiter(b, &t); t++) {
-		bsclr(b, t);
-		tarr[i++] = t;
-	}
-	if (!f)
-		qsort(tarr, nt, sizeof tarr[0], tcmp0);
-	else {
-		fst = f;
-		qsort(tarr, nt, sizeof tarr[0], tcmp1);
-	}
-	for (i=0; i<k && i<nt; i++)
-		bsset(b, tarr[i]);
-	for (; i<nt; i++)
-		slot(tarr[i]);
-}
-
-static void
-limit2(BSet *b1, int k1, int k2, BSet *fst)
-{
-	BSet b2[1];
-
-	bsinit(b2, ntmp); /* todo, free those */
-	bscopy(b2, b1);
-	bsinter(b1, mask[0]);
-	bsinter(b2, mask[1]);
-	limit(b1, NIReg - k1, fst);
-	limit(b2, NFReg - k2, fst);
-	bsunion(b1, b2);
-}
-
-static void
-sethint(BSet *u, bits r)
-{
-	uint t;
-
-	for (t=Tmp0; bsiter(u, &t); t++)
-		tmp[phicls(t, tmp)].hint.m |= r;
-}
-
-static void
-reloads(BSet *u, BSet *v)
-{
-	uint t;
-
-	for (t=Tmp0; bsiter(u, &t); t++)
-		if (!bshas(v, t))
-			emit(OLoad, tmp[t].cls, TMP(t), slot(t), R);
-}
-
-static void
-store(Ref r, int s)
-{
-	static int kstore[] = {
-		[Kw] = OStorew, [Kl] = OStorel,
-		[Ks] = OStores, [Kd] = OStored,
-	};
-
-	if (s != -1)
-		emit(kstore[tmp[r.val].cls], 0, R, r, SLOT(s));
-}
-
-static int
-regcpy(Ins *i)
-{
-	return i->op == OCopy && isreg(i->arg[0]);
-}
-
-/* process the run of register copies that
- * ends at instruction i of block b as a
- * single unit; v is the set of temporaries
- * wanted in registers below the run and is
- * updated in place to the set wanted above
- * it; the instructions of the run are emitted
- * and the first instruction of the run is
- * returned
- */
-static Ins *
-dopm(Blk *b, Ins *i, BSet *v)
-{
-	int n, t;
-	BSet u[1];
-	Ins *i1;
-	bits r;
-
-	bsinit(u, ntmp); /* todo, free those */
-	/* consecutive copies from
-	 * registers need to be handled
-	 * as one large instruction
-	 *
-	 * fixme: there is an assumption
-	 * that calls are always followed
-	 * by copy instructions here, this
-	 * might not be true if previous
-	 * passes change
-	 */
-	i1 = ++i; /* one past the last copy of the run */
-	do {
-		i--;
-		t = i->to.val;
-		/* a result wanted in a register is
-		 * stored to its slot and dropped from v */
-		if (!req(i->to, R))
-		if (bshas(v, t)) {
-			bsclr(v, t);
-			store(i->to, tmp[t].slot);
-		}
-		bsset(v, i->arg[0].val);
-	} while (i != b->ins && regcpy(i-1));
-	bscopy(u, v); /* u: what is wanted before limiting */
-	if (i != b->ins && (i-1)->op == OCall) {
-		/* the run directly follows a call */
-		v->t[0] &= ~retregs((i-1)->arg[1], 0);
-		limit2(v, NISave, NFSave, 0);
-		for (r=0, n=0; n<NRSave; n++)
-			r |= BIT(rsave[n]);
-		v->t[0] |= argregs((i-1)->arg[1], 0);
-	} else {
-		limit2(v, 0, 0, 0);
-		r = v->t[0];
-	}
-	sethint(v, r);
-	reloads(u, v);
-	/* emit the run, last instruction first */
-	do
-		emiti(*--i1);
-	while (i1 != i);
-	return i;
-}
-
-/* spill code insertion
- * requires spill costs, rpo, liveness
- *
- * Note: this will replace liveness
- * information (in, out) with temporaries
- * that must be in registers at block
- * borders
- *
- * Be careful with:
- * - OCopy instructions to ensure register
- *   constraints
- *
- * Blocks are visited from the end of fn->rpo
- * to its start and each block is rebuilt
- * backwards in insb; on return fn->slot has
- * been advanced by the slots handed out here
- */
-void
-spill(Fn *fn)
-{
-	Blk *b, *s1, *s2, *hd, **bp;
-	int j, n, l, t, k, lvarg[2];
-	BSet u[1], v[1], w[1];
-	Ins *i;
-	Phi *p;
-	Mem *m;
-	bits r;
-
-	tmp = fn->tmp;
-	ntmp = fn->ntmp;
-	bsinit(u, ntmp);
-	bsinit(v, ntmp);
-	bsinit(w, ntmp);
-	bsinit(mask[0], ntmp);
-	bsinit(mask[1], ntmp);
-	locs = fn->slot;
-	slot4 = 0;
-	slot8 = 0;
-	/* sort all temporaries in mask[0] or mask[1] */
-	for (t=0; t<ntmp; t++) {
-		k = 0;
-		if (t >= XMM0 && t < XMM0 + NFReg)
-			k = 1;
-		else if (t >= Tmp0)
-			k = KBASE(tmp[t].cls);
-		bsset(mask[k], t);
-	}
-
-	for (bp=&fn->rpo[fn->nblk]; bp!=fn->rpo;) {
-		b = *--bp;
-		/* invariant: all blocks with bigger rpo got
-		 * their in,out updated. */
-
-		/* 1. find temporaries in registers at
-		 * the end of the block (put them in v) */
-		curi = 0;
-		s1 = b->s1;
-		s2 = b->s2;
-		hd = 0;
-		/* a successor whose rpo number is not
-		 * bigger than the one of b is reached by
-		 * a back-edge; the bound must be b->id,
-		 * n holds no block number in this loop */
-		if (s1 && s1->id <= b->id)
-			hd = s1;
-		if (s2 && s2->id <= b->id)
-		if (!hd || s2->id >= hd->id)
-			hd = s2;
-		r = 0;
-		bszero(v);
-		if (hd) {
-			/* back-edge */
-			for (k=0; k<2; k++) {
-				n = k == 0 ? NIReg : NFReg;
-				bscopy(u, b->out);
-				bsinter(u, mask[k]);
-				bscopy(w, u);
-				bsinter(u, hd->gen);
-				bsdiff(w, hd->gen);
-				if ((int)bscount(u) < n) { /* fixme */
-					j = bscount(w);   /* live through */
-					l = hd->nlive[k];
-					limit(w, n - (l - j), 0);
-					bsunion(u, w);
-				} else
-					limit(u, n, 0);
-				bsunion(v, u);
-			}
-		} else if (s1) {
-			liveon(v, b, s1);
-			if (s2) {
-				liveon(u, b, s2);
-				bscopy(w, u);
-				bsinter(w, v);
-				bsunion(v, u);
-			}
-			limit2(v, 0, 0, w);
-		} else if (rtype(b->jmp.arg) == RACall) {
-			/* return */
-			r = retregs(b->jmp.arg, 0);
-			v->t[0] |= r;
-		}
-		bscopy(b->out, v);
-
-		/* 2. process the block instructions */
-		curi = &insb[NIns];
-		for (i=&b->ins[b->nins]; i!=b->ins;) {
-			i--;
-			if (regcpy(i)) {
-				i = dopm(b, i, v);
-				continue;
-			}
-			bszero(w);
-			if (!req(i->to, R)) {
-				assert(rtype(i->to) == RTmp);
-				t = i->to.val;
-				if (bshas(v, t))
-					bsclr(v, t);
-				else {
-					/* make sure we have a reg
-					 * for the result */
-					bsset(v, t);
-					bsset(w, t);
-				}
-			}
-			/* j: number of arguments that may
-			 * still be left in memory */
-			j = opdesc[i->op].nmem;
-			for (n=0; n<2; n++)
-				if (rtype(i->arg[n]) == RAMem)
-					j--;
-			for (n=0; n<2; n++)
-				switch (rtype(i->arg[n])) {
-				case RAMem:
-					t = i->arg[n].val;
-					m = &fn->mem[t & AMask];
-					if (rtype(m->base) == RTmp) {
-						bsset(v, m->base.val);
-						bsset(w, m->base.val);
-					}
-					if (rtype(m->index) == RTmp) {
-						bsset(v, m->index.val);
-						bsset(w, m->index.val);
-					}
-					break;
-				case RTmp:
-					t = i->arg[n].val;
-					lvarg[n] = bshas(v, t);
-					bsset(v, t);
-					if (j-- <= 0)
-						bsset(w, t);
-					break;
-				}
-			bscopy(u, v);
-			limit2(v, 0, 0, w);
-			for (n=0; n<2; n++)
-				if (rtype(i->arg[n]) == RTmp) {
-					t = i->arg[n].val;
-					if (!bshas(v, t)) {
-						/* do not reload if the
-						 * the temporary was dead
-						 */
-						if (!lvarg[n])
-							bsclr(u, t);
-						i->arg[n] = slot(t);
-					}
-				}
-			reloads(u, v);
-			if (!req(i->to, R)) {
-				t = i->to.val;
-				store(i->to, tmp[t].slot);
-				bsclr(v, t);
-			}
-			emiti(*i);
-			r = v->t[0] & (BIT(Tmp0)-1);
-			if (r)
-				sethint(v, r);
-		}
-		assert(!r || b==fn->start);
-
-		for (p=b->phi; p; p=p->link) {
-			assert(rtype(p->to) == RTmp);
-			t = p->to.val;
-			if (bshas(v, t)) {
-				bsclr(v, t);
-				store(p->to, tmp[t].slot);
-			} else if (bshas(b->in, t))
-				/* only if the phi is live */
-				p->to = slot(p->to.val);
-		}
-		bscopy(b->in, v);
-		b->nins = &insb[NIns] - curi;
-		idup(&b->ins, curi, b->nins);
-	}
-
-	/* align the locals to a 16 byte boundary */
-	assert(NAlign == 3);
-	slot8 += slot8 & 3;
-	fn->slot += slot8;
-
-	if (debug['S']) {
-		/* the whole dump goes to stderr */
-		fprintf(stderr, "\n> Block information:\n");
-		for (b=fn->start; b; b=b->link) {
-			fprintf(stderr, "\t%-10s (% 5d) ", b->name, b->loop);
-			dumpts(b->out, fn->tmp, stderr);
-		}
-		fprintf(stderr, "\n> After spilling:\n");
-		printfn(fn, stderr);
-	}
-}
diff --git a/lisc/ssa.c b/lisc/ssa.c
@@ -1,516 +0,0 @@
-#include "lisc.h"
-#include <stdarg.h>
-
-/* append a use record to tmp->use; ty is the
- * kind of use and b the block it sits in; for
- * UPhi and UIns one extra argument is read
- * (a Phi * or an Ins *), UJmp reads none
- */
-static void
-adduse(Tmp *tmp, int ty, Blk *b, ...)
-{
-	Use *u;
-	int n;
-	va_list ap;
-
-	va_start(ap, b);
-	n = tmp->nuse;
-	vgrow(&tmp->use, ++tmp->nuse);
-	u = &tmp->use[n];
-	u->type = ty;
-	u->bid = b->id;
-	switch (ty) {
-	default:
-		/* diag() aborts, so the case below
-		 * is not entered from here */
-		diag("ssa: adduse defaulted");
-	case UPhi:
-		u->u.phi = va_arg(ap, Phi *);
-		break;
-	case UIns:
-		u->u.ins = va_arg(ap, Ins *);
-		break;
-	case UJmp:
-		break;
-	}
-	va_end(ap);
-}
-
-/* fill usage, phi, and class information
- *
- * ndef, nuse, phi and cls are reset for all
- * temporaries, then phis, instructions and
- * jumps of every block are walked to count
- * definitions, record uses and set classes;
- * the phi field of a phi result is the result
- * itself, the phi field of a phi argument is
- * the result of the first phi seen using it
- */
-void
-filluse(Fn *fn)
-{
-	Blk *b;
-	Phi *p;
-	Ins *i;
-	int m, t;
-	uint a;
-	Tmp *tmp;
-
-	/* todo, is this the correct file? */
-	tmp = fn->tmp;
-	for (t=0; t<fn->ntmp; t++) {
-		tmp[t].ndef = 0;
-		tmp[t].nuse = 0;
-		tmp[t].phi = 0;
-		tmp[t].cls = 0;
-		/* use vectors are created once and reused */
-		if (tmp[t].use == 0)
-			tmp[t].use = vnew(0, sizeof(Use));
-	}
-	for (b=fn->start; b; b=b->link) {
-		for (p=b->phi; p; p=p->link) {
-			assert(rtype(p->to) == RTmp);
-			t = p->to.val;
-			tmp[t].ndef++;
-			tmp[t].cls = p->cls;
-			tmp[t].phi = p->to.val;
-			for (a=0; a<p->narg; a++)
-				if (rtype(p->arg[a]) == RTmp) {
-					t = p->arg[a].val;
-					adduse(&tmp[t], UPhi, b, p);
-					if (!tmp[t].phi)
-						tmp[t].phi = p->to.val;
-				}
-		}
-		for (i=b->ins; i-b->ins < b->nins; i++) {
-			if (!req(i->to, R)) {
-				assert(rtype(i->to) == RTmp);
-				t = i->to.val;
-				tmp[t].ndef++;
-				tmp[t].cls = i->cls;
-			}
-			for (m=0; m<2; m++)
-				if (rtype(i->arg[m]) == RTmp) {
-					t = i->arg[m].val;
-					adduse(&tmp[t], UIns, b, i);
-				}
-		}
-		if (rtype(b->jmp.arg) == RTmp)
-			adduse(&tmp[b->jmp.arg.val], UJmp, b);
-	}
-}
-
-/* record bp as a predecessor of bc; the pred
- * array is allocated on first use with
- * bc->npred entries (npred must be final at
- * that point) and bp goes in the first entry
- * that is still null
- */
-static void
-addpred(Blk *bp, Blk *bc)
-{
-	uint i;
-
-	if (!bc->pred) {
-		bc->pred = alloc(bc->npred * sizeof bc->pred[0]);
-		for (i=0; i<bc->npred; i++)
-			bc->pred[i] = 0;
-	}
-	for (i=0; bc->pred[i]; i++)
-		;
-	bc->pred[i] = bp;
-}
-
-/* fill predecessors information in blocks
- *
- * three passes over the block list: reset
- * npred and pred, count the predecessors of
- * each block, then fill the pred arrays
- */
-void
-fillpreds(Fn *f)
-{
-	Blk *b;
-
-	for (b=f->start; b; b=b->link) {
-		b->npred = 0;
-		b->pred = 0;
-	}
-	for (b=f->start; b; b=b->link) {
-		if (b->s1)
-			b->s1->npred++;
-		if (b->s2)
-			b->s2->npred++;
-	}
-	for (b=f->start; b; b=b->link) {
-		if (b->s1)
-			addpred(b, b->s1);
-		if (b->s2)
-			addpred(b, b->s2);
-	}
-}
-
-/* depth-first walk used by fillrpo; a null
- * block or one with id >= 0 is skipped; x is
- * the next number to hand out, a block gets
- * its number after both successors have been
- * walked and numbers go downwards; when both
- * successors exist and s1 has the bigger loop
- * value, s2 is walked first; the next free
- * number is returned
- */
-static int
-rporec(Blk *b, int x)
-{
-	Blk *s1, *s2;
-
-	if (!b || b->id >= 0)
-		return x;
-	b->id = 1; /* any id >= 0 marks b as seen */
-	s1 = b->s1;
-	s2 = b->s2;
-	if (s1 && s2 && s1->loop > s2->loop) {
-		s1 = b->s2;
-		s2 = b->s1;
-	}
-	x = rporec(s1, x);
-	x = rporec(s2, x);
-	b->id = x;
-	assert(x >= 0);
-	return x - 1;
-}
-
-/* fill the rpo information in blocks
- *
- * blocks that rporec does not reach keep
- * id == -1 and are unlinked from the block
- * list; nblk is lowered by their count and
- * the ids of the other blocks are shifted
- * down by the same amount before the blocks
- * are stored in f->rpo at index id
- */
-void
-fillrpo(Fn *f)
-{
-	int n;
-	Blk *b, **p;
-
-	for (b=f->start; b; b=b->link)
-		b->id = -1;
-	/* n: how many numbers were left unused */
-	n = 1 + rporec(f->start, f->nblk-1);
-	f->nblk -= n;
-	f->rpo = alloc(f->nblk * sizeof f->rpo[0]);
-	for (p=&f->start; *p;) {
-		b = *p;
-		if (b->id == -1) {
-			*p = b->link;
-			/* todo, free block */
-		} else {
-			b->id -= n;
-			f->rpo[b->id] = b;
-			p=&(*p)->link;
-		}
-	}
-}
-
-/* for dominators computation, read
- * "A Simple, Fast Dominance Algorithm"
- * by K. Cooper, T. Harvey, and K. Kennedy.
- */
-
-/* follow the idom links from b1 and b2
- * until both walks meet and return the
- * meeting block; when b1 is null, b2 is
- * returned as is
- */
-static Blk *
-inter(Blk *b1, Blk *b2)
-{
-	Blk *bt;
-
-	if (b1 == 0)
-		return b2;
-	while (b1 != b2) {
-		/* keep the block with the bigger id in b1 */
-		if (b1->id < b2->id) {
-			bt = b1;
-			b1 = b2;
-			b2 = bt;
-		}
-		while (b1->id > b2->id) {
-			b1 = b1->idom;
-			assert(b1);
-		}
-	}
-	return b1;
-}
-
-/* compute the idom field of all blocks by
- * iterating over fn->rpo (rpo[0] excepted)
- * until no idom changes, then chain every
- * block in the dom/dlink list of its idom
- */
-static void
-filldom(Fn *fn)
-{
-	Blk *b, *d;
-	int ch, n;
-	uint p;
-
-	for (b=fn->start; b; b=b->link) {
-		b->idom = 0;
-		b->dom = 0;
-		b->dlink = 0;
-	}
-	do {
-		ch = 0;
-		for (n=1; n<fn->nblk; n++) {
-			b = fn->rpo[n];
-			d = 0;
-			/* only predecessors that already
-			 * have an idom, or the start block,
-			 * take part in the intersection */
-			for (p=0; p<b->npred; p++)
-				if (b->pred[p]->idom
-				||  b->pred[p] == fn->start)
-					d = inter(d, b->pred[p]);
-			if (d != b->idom) {
-				ch++;
-				b->idom = d;
-			}
-		}
-	} while (ch);
-	for (b=fn->start; b; b=b->link)
-		if ((d=b->idom)) {
-			assert(d != b);
-			b->dlink = d->dom;
-			d->dom = b;
-		}
-}
-
-/* returns 1 if b1 is different from b2 and
- * is found by following the idom links from
- * b2 while ids are bigger than the id of b1,
- * 0 otherwise
- */
-static int
-sdom(Blk *b1, Blk *b2)
-{
-	assert(b1 && b2);
-	if (b1 == b2)
-		return 0;
-	while (b2->id > b1->id)
-		b2 = b2->idom;
-	return b1 == b2;
-}
-
-/* like sdom, but a block also
- * dominates itself
- */
-static int
-dom(Blk *b1, Blk *b2)
-{
-	if (b1 == b2)
-		return 1;
-	return sdom(b1, b2);
-}
-
-/* add b to the fron vector of a unless it
- * is already in there; the vector is created
- * on the first insertion
- */
-static void
-addfron(Blk *a, Blk *b)
-{
-	int n;
-
-	for (n=0; n<a->nfron; n++)
-		if (a->fron[n] == b)
-			return;
-	if (!a->nfron)
-		a->fron = vnew(++a->nfron, sizeof a->fron[0]);
-	else
-		vgrow(&a->fron, ++a->nfron);
-	a->fron[a->nfron-1] = b;
-}
-
-/* fill the fron vectors: for each edge from
- * b to a successor s, s is added to the fron
- * of b and of the blocks up its idom chain,
- * stopping at the first block for which
- * sdom(block, s) holds
- */
-static void
-fillfron(Fn *fn)
-{
-	Blk *a, *b;
-
-	for (b=fn->start; b; b=b->link) {
-		if (b->s1)
-			for (a=b; !sdom(a, b->s1); a=a->idom)
-				addfron(a, b->s1);
-		if (b->s2)
-			for (a=b; !sdom(a, b->s2); a=a->idom)
-				addfron(a, b->s2);
-	}
-}
-
-/* make a new temporary with newtmp(), using
- * the name of temporary t as prefix and its
- * class as class
- */
-static Ref
-refindex(int t, Fn *fn)
-{
-	return newtmp(fn->tmp[t].name, fn->tmp[t].cls, fn);
-}
-
-/* phi insertion; for every temporary t (from
- * Tmp0 up) whose phi field is 0:
- * - definitions of t that are not in the out
- *   set of their block are renamed on the spot
- *   together with the uses that follow them in
- *   the block
- * - blocks with other definitions of t go on a
- *   work list, and a phi for t is added to each
- *   block of their fron vectors that has t in
- *   its in set; those blocks are queued in turn
- * tmp[t].visit is set to t when the work list
- * was not empty
- */
-static void
-phiins(Fn *fn)
-{
-	BSet u[1], defs[1];
-	Blk *a, *b, **blist, **be, **bp;
-	Ins *i;
-	Phi *p;
-	Ref r;
-	int t, n, k, nt;
-
-	bsinit(u, fn->nblk);
-	bsinit(defs, fn->nblk);
-	/* blist is used as a stack growing down from be */
-	blist = emalloc(fn->nblk * sizeof blist[0]);
-	be = &blist[fn->nblk];
-	nt = fn->ntmp;
-	for (t=Tmp0; t<nt; t++) {
-		fn->tmp[t].visit = 0;
-		if (fn->tmp[t].phi != 0)
-			continue;
-		bszero(u);
-		k = -1;
-		bp = be;
-		for (b=fn->start; b; b=b->link) {
-			b->visit = 0;
-			/* r: current block-local name of t, if any */
-			r = R;
-			for (i=b->ins; i-b->ins < b->nins; i++) {
-				if (!req(r, R)) {
-					if (req(i->arg[0], TMP(t)))
-						i->arg[0] = r;
-					if (req(i->arg[1], TMP(t)))
-						i->arg[1] = r;
-				}
-				if (req(i->to, TMP(t))) {
-					if (!bshas(b->out, t)) {
-						/* a single definition can
-						 * keep its name */
-						if (fn->tmp[t].ndef == 1)
-							r = TMP(t);
-						else
-							r = refindex(t, fn);
-						i->to = r;
-					} else {
-						if (!bshas(u, b->id)) {
-							bsset(u, b->id);
-							*--bp = b;
-						}
-						/* all the definitions must
-						 * agree on the class */
-						if (k == -1)
-							k = i->cls;
-						assert(k == i->cls);
-					}
-				}
-			}
-			if (!req(r, R) && req(b->jmp.arg, TMP(t)))
-				b->jmp.arg = r;
-		}
-		bscopy(defs, u);
-		while (bp != be) {
-			fn->tmp[t].visit = t;
-			b = *bp++;
-			bsclr(u, b->id);
-			for (n=0; n<b->nfron; n++) {
-				a = b->fron[n];
-				/* at most one phi per block */
-				if (a->visit++ == 0)
-				if (bshas(a->in, t)) {
-					p = alloc(sizeof *p);
-					p->cls = k;
-					p->to = TMP(t);
-					p->link = a->phi;
-					a->phi = p;
-					if (!bshas(defs, a->id))
-					if (!bshas(u, a->id)) {
-						bsset(u, a->id);
-						*--bp = a;
-					}
-				}
-			}
-		}
-	}
-	free(blist);
-}
-
-/* entry of a stack of names: r is the name,
- * b the block it was pushed in and up the
- * entry below; up also chains the unused
- * entries kept in namel
- */
-typedef struct Name Name;
-struct Name {
-	Ref r;
-	Blk *b;
-	Name *up;
-};
-
-/* list of entries available for reuse */
-static Name *namel;
-
-/* get a Name entry, from namel when it is
- * not empty and from emalloc() otherwise,
- * and fill it with r, b and up
- */
-static Name *
-nnew(Ref r, Blk *b, Name *up)
-{
-	Name *n;
-
-	if (namel) {
-		n = namel;
-		namel = n->up;
-	} else
-		/* could use alloc, here
-		 * but namel should be reset
-		 */
-		n = emalloc(sizeof *n);
-	n->r = r;
-	n->b = b;
-	n->up = up;
-	return n;
-}
-
-/* put n at the head of namel so that nnew()
- * can hand it out again; no memory is
- * released
- */
-static void
-nfree(Name *n)
-{
-	n->up = namel;
-	namel = n;
-}
-
-/* rename a definition: when *r is not R and
- * its temporary has a non-zero visit field,
- * *r is replaced by a new temporary that is
- * pushed on stk[t] with b as block; the visit
- * field of the new temporary is set to t
- */
-static void
-rendef(Ref *r, Blk *b, Name **stk, Fn *fn)
-{
-	Ref r1;
-	int t;
-
-	t = r->val;
-	if (req(*r, R) || !fn->tmp[t].visit)
-		return;
-	r1 = refindex(t, fn);
-	fn->tmp[r1.val].visit = t;
-	stk[t] = nnew(r1, b, stk[t]);
-	*r = r1;
-}
-
-/* return the name of t to use in block b:
- * entries of stk[t] whose block does not
- * satisfy dom(block, b) are popped and
- * recycled, the name on top of what is left
- * is returned, or CON_Z if nothing is left
- */
-static Ref
-getstk(int t, Blk *b, Name **stk)
-{
-	Name *n, *n1;
-
-	n = stk[t];
-	while (n && !dom(n->b, b)) {
-		n1 = n;
-		n = n->up;
-		nfree(n1);
-	}
-	stk[t] = n;
-	if (!n) {
-		/* uh, oh, warn */
-		return CON_Z;
-	} else
-		return n->r;
-}
-
-/* rename the temporaries of block b, then
- * recurse on the blocks of its dom/dlink
- * list; phi results and instruction results
- * go through rendef(), arguments and the jump
- * argument through getstk(); one argument is
- * appended to the phis of the successors
- * whose result has a non-zero visit field
- */
-static void
-renblk(Blk *b, Name **stk, Fn *fn)
-{
-	Phi *p;
-	Ins *i;
-	Blk *s, **ps, *succ[3];
-	int t, m;
-
-	for (p=b->phi; p; p=p->link)
-		rendef(&p->to, b, stk, fn);
-	for (i=b->ins; i-b->ins < b->nins; i++) {
-		/* arguments are renamed before the result */
-		for (m=0; m<2; m++) {
-			t = i->arg[m].val;
-			if (rtype(i->arg[m]) == RTmp)
-			if (fn->tmp[t].visit)
-				i->arg[m] = getstk(t, b, stk);
-		}
-		rendef(&i->to, b, stk, fn);
-	}
-	t = b->jmp.arg.val;
-	if (rtype(b->jmp.arg) == RTmp)
-	if (fn->tmp[t].visit)
-		b->jmp.arg = getstk(t, b, stk);
-	/* null terminated list of successors */
-	succ[0] = b->s1;
-	succ[1] = b->s2;
-	succ[2] = 0;
-	for (ps=succ; (s=*ps); ps++)
-		for (p=s->phi; p; p=p->link) {
-			t = p->to.val;
-			/* visit gives the temporary the
-			 * phi result stands for */
-			if ((t=fn->tmp[t].visit)) {
-				m = p->narg++;
-				if (m == NPred)
-					diag("ssa: too many phi arguments");
-				p->arg[m] = getstk(t, b, stk);
-				p->blk[m] = b;
-			}
-		}
-	for (s=b->dom; s; s=s->dlink)
-		renblk(s, stk, fn);
-}
-
-/* require ndef
- *
- * runs filldom, fillfron, filllive, phiins
- * and renblk in that order; debug['L'] is
- * cleared for the duration of the pass and
- * restored after; debug['N'] prints the
- * dominators and the function after the
- * pass on stderr
- */
-void
-ssa(Fn *fn)
-{
-	Name **stk, *n;
-	int d, nt;
-	Blk *b, *b1;
-
-	/* one stack of names per temporary
-	 * existing at this point */
-	nt = fn->ntmp;
-	stk = emalloc(nt * sizeof stk[0]);
-	d = debug['L'];
-	debug['L'] = 0;
-	filldom(fn);
-	if (debug['N']) {
-		fprintf(stderr, "\n> Dominators:\n");
-		for (b1=fn->start; b1; b1=b1->link) {
-			if (!b1->dom)
-				continue;
-			fprintf(stderr, "%10s:", b1->name);
-			for (b=b1->dom; b; b=b->dlink)
-				fprintf(stderr, " %s", b->name);
-			fprintf(stderr, "\n");
-		}
-	}
-	fillfron(fn);
-	filllive(fn);
-	phiins(fn);
-	renblk(fn->start, stk, fn);
-	/* hand the remaining stack entries
-	 * back to namel */
-	while (nt--)
-		while ((n=stk[nt])) {
-			stk[nt] = n->up;
-			nfree(n);
-		}
-	debug['L'] = d;
-	free(stk);
-	if (debug['N']) {
-		fprintf(stderr, "\n> After SSA construction:\n");
-		printfn(fn, stderr);
-	}
-}
diff --git a/lisc/test/go.sh b/lisc/test/go.sh
@@ -1,116 +0,0 @@
-#!/bin/sh
-
-# Test driver: compiles a test file with ./lisc, links the
-# assembly with the optional C driver embedded in the test
-# file, runs the binary and checks either its output or its
-# exit status.
-
-TMP=/tmp/qbe.zzzz
-
-DRV=$TMP.c
-ASM=$TMP.s
-BIN=$TMP.bin
-OUT=$TMP.out
-
-# remove the temporary files (only used by the trap below,
-# which is commented out)
-cleanup() {
-	rm -f $DRV $ASM $BIN $OUT
-}
-
-# extract WHAT FILE
-# print the lines of FILE between "# >>> WHAT" and the next
-# "# <<<", with the first "# " and a trailing "#" removed
-extract() {
-	WHAT="$1"
-	FILE="$2"
-
-	awk "
-		/^# >>> $WHAT/ {
-			p = 1
-			next
-		}
-		/^# <<</ {
-			if (p)
-				p = 0
-		}
-		p
-	" $FILE \
-	| sed -e 's/# //' \
-	| sed -e 's/#$//'
-}
-
-# once FILE
-# run one test; returns 1 when a stage fails and exits when
-# FILE does not exist
-once() {
-	T="$1"
-
-	if ! test -f $T
-	then
-		echo "invalid test file $T" >&2
-		exit 1
-	fi
-
-	echo "$T... "
-
-	if ! ./lisc $T -o $ASM
-	then
-		echo "[qbe fail]"
-		return 1
-	fi
-
-	extract driver $T > $DRV
-	extract output $T > $OUT
-
-	# link with the driver only when the test has one
-	if test -s $DRV
-	then
-		LNK="$DRV $ASM"
-	else
-		LNK="$ASM"
-	fi
-
-	if ! cc -g -o $BIN $LNK
-	then
-		echo "[cc fail]"
-		return 1
-	fi
-
-	# with an expected output the output is compared,
-	# otherwise the exit status decides
-	if test -s $OUT
-	then
-		$BIN a b c | diff - $OUT
-		RET=$?
-		REASON="output"
-	else
-		$BIN a b c
-		RET=$?
-		REASON="returned $RET"
-	fi
-
-	if test $RET -ne 0
-	then
-		echo "[$REASON fail]"
-		return 1
-	fi
-
-	# go back one line and print the mark at column 45
-	printf "\033[1A\033[45C[ok]\n"
-}
-
-
-#trap cleanup TERM QUIT
-
-if test -z "$1"
-then
-	# the usage message belongs on stderr
-	echo "usage: test/go.sh {all, SSAFILE}" >&2
-	exit 1
-fi
-
-case $1 in
-	"all")
-		# F counts the failed tests
-		F=0
-		for T in test/[!_]*.ssa
-		do
-			once $T
-			F=`expr $F + $?`
-		done
-		if test $F -ge 1
-		then
-			echo
-			echo "$F test(s) failed!"
-		else
-			echo
-			echo "All is fine!"
-		fi
-		;;
-	*)
-		once $1
-		exit $?
-		;;
-esac
diff --git a/lisc/tools/abitest.sh b/lisc/tools/abitest.sh
@@ -1,104 +0,0 @@
-#!/bin/sh
-
-# Builds the test generator from tools/abi.ml in a temporary
-# directory, then generates, builds and runs caller/callee
-# pairs where one side is compiled from C and the other from
-# an .ssa file.
-
-OCAMLC=/usr/bin/ocamlc
-QBE=`pwd`/lisc
-
-# failure STAGE: report the failing stage and exit
-failure() {
-	echo "Failure at stage:" $1 >&2
-	exit 1
-}
-
-cleanup() {
-	rm -fr $TMP
-}
-
-# set up $TMP: copy the generator source, write the Makefile
-# used by once() and compile the generator
-# NOTE(review): pushd/popd are not POSIX sh builtins, this
-# relies on /bin/sh providing them
-init() {
-	cp tools/abi.ml $TMP
-	pushd $TMP > /dev/null
-
-	cat > Makefile << EOM
-
-.PHONY: test
-test: caller.o callee.o
-	c99 -o \$@ caller.o callee.o
-%.o: %.c
-	c99 -c -o \$@ \$<
-%.o: %.ssa
-	$QBE -o \$*.s \$<
-	c99 -c -o \$@ \$*.s
-
-EOM
-
-	if ! $OCAMLC abi.ml -o gentest
-	then
-		popd > /dev/null
-		cleanup
-		failure "abifuzz compilation"
-	fi
-	popd > /dev/null
-}
-
-# once CALLER CALLEE [SEED]
-# generate one test (with the seed when given), build it
-# and run it
-once() {
-	if test -z "$3"
-	then
-		$TMP/gentest $TMP $1 $2
-	else
-		$TMP/gentest -s $3 $TMP $1 $2
-	fi
-	make -C $TMP test > /dev/null || failure "building"
-	$TMP/test || failure "runtime"
-}
-
-usage() {
-	echo "usage: abitest.sh [-callssa] [-callc] [-s SEED] [-n ITERATIONS]" >&2
-	exit 1
-}
-
-N=1
-CALLER=c
-CALLEE=ssa
-
-while test -n "$1"
-do
-	case "$1" in
-	"-callssa")
-		;;
-	"-callc")
-		CALLER=ssa
-		CALLEE=c
-		;;
-	"-s")
-		test -n "$2" || usage
-		shift
-		SEED="$1"
-		;;
-	"-n")
-		test -n "$2" || usage
-		shift
-		N="$1"
-		;;
-	*)
-		usage
-		;;
-	esac
-	shift
-done
-
-TMP=`mktemp -d abifuzz.XXXXXX`
-
-init
-
-# the -s option stores its argument in SEED; with a seed
-# a single test is run
-if test -n "$SEED"
-then
-	once $CALLER $CALLEE $SEED
-else
-	for n in `seq $N`
-	do
-		once $CALLER $CALLEE
-		# print the count every hundred iterations
-		echo "$n" | grep "00$"
-	done
-fi
-
-echo "All done."
-
-cleanup
diff --git a/lisc/tools/regress.sh b/lisc/tools/regress.sh
@@ -1,17 +0,0 @@
-#!/bin/sh
-
-# Runs ./lisc and ./lisc.1 on every file of test/ and stops
-# at the first file for which their outputs differ.
-
-for t in test/*
-do
-	# the file name is an argument, not part of the format
-	printf "Test %s ... " "$t"
-
-	./lisc   $t >/tmp/out.0 2>&1
-	./lisc.1 $t >/tmp/out.1 2>&1
-
-	if diff /tmp/out.0 /tmp/out.1 > /dev/null
-	then
-		echo "OK"
-	else
-		echo "KO"
-		break
-	fi
-done
diff --git a/lisc/util.c b/lisc/util.c
@@ -1,329 +0,0 @@
-#include "lisc.h"
-
-typedef struct Bitset Bitset;
-typedef struct Vec Vec;
-
-/* header placed in front of the elements of
- * a vector made by vnew(); users hold a
- * pointer to the first element, just after
- * the header
- */
-struct Vec {
-	ulong mag;   /* set to VMag, checked by vgrow */
-	size_t esz;  /* size of one element */
-	ulong cap;   /* capacity, in elements */
-	union {
-		long long ll;
-		long double ld;
-		void *ptr;
-	} align[];   /* the elements start here */
-};
-
-enum {
-	VMin = 2,        /* smallest capacity given by vnew */
-	VMag = 0xcabba9e, /* value of the mag field of a Vec */
-	NPtr = 256,      /* number of entries of a pool array */
-};
-
-Typ typ[NTyp];
-/* instruction buffer and its cursor, see emit() */
-Ins insb[NIns], *curi;
-
-/* state of alloc(): pool is the current array
- * of pointers and nptr the index of its next
- * free entry; entry 0 links to the previous
- * array
- */
-static void *ptr[NPtr];
-static void **pool = ptr;
-static int nptr = 1;
-
-/* print s and a newline on stderr,
- * then abort
- */
-void
-diag(char *s)
-{
-	fputs(s, stderr);
-	fputc('\n', stderr);
-	abort();
-}
-
-/* allocate n zeroed bytes with calloc();
- * a null result is reported through diag()
- */
-void *
-emalloc(size_t n)
-{
-	void *p;
-
-	p = calloc(1, n);
-	if (!p)
-		diag("emalloc: out of memory");
-	return p;
-}
-
-/* allocate n zeroed bytes and record the
- * pointer in the pool so that freeall() can
- * release it; returns 0 when n is 0; when the
- * current pool array is full a new one is
- * chained in front of it
- */
-void *
-alloc(size_t n)
-{
-	void **pp;
-
-	if (n == 0)
-		return 0;
-	if (nptr >= NPtr) {
-		pp = emalloc(NPtr * sizeof(void *));
-		pp[0] = pool; /* link to the full array */
-		pool = pp;
-		nptr = 1;
-	}
-	return pool[nptr++] = emalloc(n);
-}
-
-/* free everything recorded by alloc(); the
- * chained pool arrays are freed as well,
- * except the last one of the chain, which is
- * kept as the current (empty) pool
- */
-void
-freeall()
-{
-	void **pp;
-
-	for (;;) {
-		for (pp = &pool[1]; pp < &pool[nptr]; pp++)
-			free(*pp);
-		pp = pool[0];
-		if (!pp)
-			break;
-		free(pool);
-		pool = pp;
-		/* older arrays are always full */
-		nptr = NPtr;
-	}
-	nptr = 1;
-}
-
-/* get a block from alloc() and initialize it
- * by copying a static (hence zeroed) Blk
- */
-Blk *
-blknew()
-{
-	static Blk z;
-	Blk *b;
-
-	b = alloc(sizeof *b);
-	*b = z;
-	return b;
-}
-
-/* store an instruction just below curi and
- * move curi onto it: the buffer is filled
- * from high to low addresses; reaching the
- * start of insb is reported through diag()
- */
-void
-emit(int op, int k, Ref to, Ref arg0, Ref arg1)
-{
-	if (curi == insb)
-		diag("emit: too many instructions");
-	*--curi = (Ins){
-		.op = op, .cls = k,
-		.to = to, .arg = {arg0, arg1}
-	};
-}
-
-/* emit() for an instruction that
- * is already assembled
- */
-void
-emiti(Ins i)
-{
-	Ref a0, a1;
-
-	a0 = i.arg[0];
-	a1 = i.arg[1];
-	emit(i.op, i.cls, i.to, a0, a1);
-}
-
-/* store in *pd a copy, obtained from alloc(),
- * of the n instructions starting at s; *pd
- * is 0 when n is 0
- */
-void
-idup(Ins **pd, Ins *s, ulong n)
-{
-	*pd = alloc(n * sizeof(Ins));
-	/* alloc() gives a null pointer for an
-	 * empty copy and memcpy() must not be
-	 * handed one, even with a size of 0 */
-	if (n)
-		memcpy(*pd, s, n * sizeof(Ins));
-}
-
-/* copy n instructions from s to d, the
- * result points right after the copy
- */
-Ins *
-icpy(Ins *d, Ins *s, ulong n)
-{
-	Ins *end;
-
-	end = d + n;
-	memcpy(d, s, n * sizeof(Ins));
-	return end;
-}
-
-/* make a vector able to hold len elements
- * of esz bytes; the capacity is VMin doubled
- * until it reaches len; memory comes from
- * alloc() and the result points just after
- * the Vec header
- */
-void *
-vnew(ulong len, size_t esz)
-{
-	ulong cap;
-	Vec *v;
-
-	for (cap=VMin; cap<len; cap*=2)
-		;
-	v = alloc(cap * esz + sizeof(Vec));
-	v->mag = VMag;
-	v->cap = cap;
-	v->esz = esz;
-	return v + 1;
-}
-
-/* make sure the vector *vp can hold len
- * elements; vp is the address of a pointer
- * returned by vnew(); when the capacity is
- * too small a new vector is made, the old
- * elements are copied into it and *vp is
- * updated; the old vector is not released
- * here
- */
-void
-vgrow(void *vp, ulong len)
-{
-	Vec *v;
-	void *v1;
-
-	v = *(Vec **)vp - 1;
-	assert(v+1 && v->mag == VMag);
-	if (v->cap >= len)
-		return;
-	v1 = vnew(len, v->esz);
-	memcpy(v1, v+1, v->cap * v->esz);
-	*(Vec **)vp = v1;
-}
-
-/* returns tmp[t].phi when it is not 0 and
- * t otherwise; the code under #if 0 is not
- * compiled
- */
-int
-phicls(int t, Tmp *tmp /*, int c*/)
-{
-	if (tmp[t].phi)
-		return tmp[t].phi;
-	return t;
-#if 0
-	int t1;
-
-	t1 = tmp[t].phi;
-	if (!t1)
-		t1 = t;
-	if (t != t1) {
-		t1 = phitmp(t1, tmp, c);
-		if (c)
-			tmp[t].phi = t1;
-	}
-	return t1;
-#endif
-}
-
-/* append a temporary of class k to fn->tmp
- * and return a reference to it; its name is
- * prfx followed by a number taken from a
- * counter shared by all calls; slot is set
- * to -1, nuse and ndef to 1
- */
-Ref
-newtmp(char *prfx, int k,  Fn *fn)
-{
-	static int n;
-	int t;
-
-	t = fn->ntmp++;
-	vgrow(&fn->tmp, fn->ntmp);
-	/* NOTE(review): the length of the name is
-	 * not checked against the size of the name
-	 * field, confirm that callers keep it short */
-	sprintf(fn->tmp[t].name, "%s%d", prfx, ++n);
-	fn->tmp[t].cls = k;
-	fn->tmp[t].slot = -1;
-	fn->tmp[t].nuse = +1;
-	fn->tmp[t].ndef = +1;
-	return TMP(t);
-}
-
-/* return a reference to a CBits constant
- * of value val; an existing constant of
- * fn->con is reused when there is one,
- * otherwise a new one is appended
- */
-Ref
-getcon(int64_t val, Fn *fn)
-{
-	int c;
-
-	for (c=0; c<fn->ncon; c++)
-		if (fn->con[c].type == CBits && fn->con[c].bits.i == val)
-			return CON(c);
-	/* here c is the index of the new constant */
-	fn->ncon++;
-	vgrow(&fn->con, fn->ncon);
-	fn->con[c] = (Con){.type = CBits, .bits.i = val};
-	return CON(c);
-}
-
-/* add c1 to c0: a c0 of type CUndef becomes
- * a copy of c1; otherwise the bits are summed
- * and, when c1 is a CAddr, c0 becomes a CAddr
- * with the label of c1; adding two CAddr is
- * reported through diag()
- */
-void
-addcon(Con *c0, Con *c1)
-{
-	if (c0->type == CUndef)
-		*c0 = *c1;
-	else {
-		if (c1->type == CAddr) {
-			if (c0->type == CAddr)
-				diag("addcon: adding two addresses");
-			c0->type = CAddr;
-			strcpy(c0->label, c1->label);
-		}
-		c0->bits.i += c1->bits.i;
-	}
-}
-
-/* set up bs to hold at least n elements:
- * n is rounded up to a whole number of words
- * of NBit bits, allocated with alloc()
- */
-void
-bsinit(BSet *bs, uint n)
-{
-	n = (n + NBit-1) / NBit;
-	bs->nt = n;
-	bs->t = alloc(n * sizeof bs->t[0]);
-}
-
-/* number of elements in bs, found by
- * testing every bit of every word
- */
-uint
-bscount(BSet *bs)
-{
-	uint w, bit, total;
-
-	total = 0;
-	w = 0;
-	while (w < bs->nt) {
-		bit = 0;
-		while (bit < NBit) {
-			if (bs->t[w] & BIT(bit))
-				total++;
-			bit++;
-		}
-		w++;
-	}
-	return total;
-}
-
-/* one more than the biggest element
- * that bs can hold
- */
-static inline uint
-bsmax(BSet *bs)
-{
-	uint words;
-
-	words = bs->nt;
-	return words * NBit;
-}
-
-/* add elt to bs; elt must be
- * smaller than bsmax(bs)
- */
-void
-bsset(BSet *bs, uint elt)
-{
-	assert(elt < bsmax(bs));
-	bs->t[elt/NBit] |= BIT(elt%NBit);
-}
-
-/* remove elt from bs; elt must be
- * smaller than bsmax(bs)
- */
-void
-bsclr(BSet *bs, uint elt)
-{
-	assert(elt < bsmax(bs));
-	bs->t[elt/NBit] &= ~BIT(elt%NBit);
-}
-
-/* defines the function f(a, b) applying
- * "a->t[i] op b->t[i]" to every word; both
- * sets must have the same number of words
- */
-#define BSOP(f, op)                           \
-	void                                  \
-	f(BSet *a, BSet *b)                   \
-	{                                     \
-		uint i;                       \
-		                              \
-		assert(a->nt == b->nt);       \
-		for (i=0; i<a->nt; i++)       \
-			a->t[i] op b->t[i];   \
-	}
-
-/* a = b, a |= b, a &= b and a &= ~b */
-BSOP(bscopy, =)
-BSOP(bsunion, |=)
-BSOP(bsinter, &=)
-BSOP(bsdiff, &= ~)
-
-/* 1 when a and b hold the same words,
- * 0 otherwise; both sets must have the
- * same number of words
- */
-int
-bsequal(BSet *a, BSet *b)
-{
-	uint w;
-
-	assert(a->nt == b->nt);
-	w = 0;
-	while (w < a->nt) {
-		if (a->t[w] != b->t[w])
-			return 0;
-		w++;
-	}
-	return 1;
-}
-
-/* empty bs by clearing all its words */
-void
-bszero(BSet *bs)
-{
-	memset(bs->t, 0, bs->nt * sizeof bs->t[0]);
-}
-
-/* iterates on a bitset, use as follows
- *
- * 	for (i=0; bsiter(set, &i); i++)
- * 		use(i);
- *
- * the search starts at *elt; when an element
- * is found it is stored in *elt and 1 is
- * returned, otherwise 0 is returned and *elt
- * is left untouched
- */
-int
-bsiter(BSet *bs, uint *elt)
-{
-	uint i;
-
-	for (i=*elt;; i++) {
-		/* jump over the words that are all zero */
-		while (i < bsmax(bs) && !bs->t[i/NBit])
-			i = (i + NBit) & -NBit;
-		if (i >= bsmax(bs))
-			return 0;
-		if (bshas(bs, i)) {
-			*elt = i;
-			return 1;
-		}
-	}
-}
-
-/* print on f, between brackets and followed
- * by a newline, the names of the temporaries
- * of bs; elements below Tmp0 are not printed
- */
-void
-dumpts(BSet *bs, Tmp *tmp, FILE *f)
-{
-	uint t;
-
-	fprintf(f, "[");
-	for (t=Tmp0; bsiter(bs, &t); t++)
-		fprintf(f, " %s", tmp[t].name);
-	fprintf(f, " ]\n");
-}
diff --git a/minic/mcc b/minic/mcc
@@ -29,7 +29,7 @@ fi
 
 
 ../minic/minic < $file        > /tmp/minic.ssa &&
-../lisc/lisc < /tmp/minic.ssa > /tmp/minic.s   &&
+../src/qbe < /tmp/minic.ssa   > /tmp/minic.s   &&
 cc $flags /tmp/minic.s
 
 if test $? -ne 0
diff --git a/src/.gitignore b/src/.gitignore
@@ -0,0 +1,5 @@
+qbe
+doc
+.comfile
+*.o
+*.out
diff --git a/lisc/.tag b/src/.tag
diff --git a/src/Makefile b/src/Makefile
@@ -0,0 +1,17 @@
+# name of the binary and the objects it is linked from
+BIN = qbe
+OBJ = main.o util.o parse.o mem.o ssa.o copy.o live.o isel.o spill.o rega.o emit.o
+
+CFLAGS = -Wall -Wextra -std=c99 -g -pedantic
+
+$(BIN): $(OBJ)
+	$(CC) $(LDFLAGS) $(OBJ) -o $@
+
+# every object is rebuilt when the common header changes
+$(OBJ): all.h
+
+.PHONY: clean check syndoc
+clean:
+	rm -f $(BIN) $(OBJ)
+# run the whole test suite with the freshly built binary
+check: $(BIN)
+	test/go.sh all
+# synchronize the doc directory with a remote copy
+syndoc:
+	unison -auto doc ssh://qcar@h/data/d/ssa-doc
diff --git a/lisc/lisc.h b/src/all.h
diff --git a/src/copy.c b/src/copy.c
@@ -0,0 +1,159 @@
+#include "all.h"
+
+/* work list of temporaries: t is a
+ * temporary number and l the next entry
+ */
+typedef struct RList RList;
+struct RList {
+	int t;
+	RList *l;
+};
+
+/* what r is known to be a copy of: the
+ * entry of cp for a temporary, r itself
+ * for anything else
+ */
+static Ref
+copyof(Ref r, Ref *cp)
+{
+	if (rtype(r) != RTmp)
+		return r;
+	return cp[r.val];
+}
+
+/* record that r is a copy of rcp; when this
+ * changes cp[r.val], r is pushed on the work
+ * list *w so that its uses get visited again
+ */
+static void
+update(Ref r, Ref rcp, Ref *cp, RList **w)
+{
+	RList *l;
+
+	if (!req(cp[r.val], rcp)) {
+		cp[r.val] = rcp;
+		l = emalloc(sizeof *l);
+		l->t = r.val;
+		l->l = *w;
+		*w = l;
+	}
+}
+
+/* compute what the result of the phi p is
+ * a copy of: arguments for which copyof()
+ * gives R are ignored; if all the others
+ * agree the result is a copy of their common
+ * value, otherwise it is a copy of itself
+ */
+static void
+visitphi(Phi *p, Ref *cp, RList **w)
+{
+	uint a;
+	Ref r, r1;
+
+	r = R;
+	for (a=0; a<p->narg; a++) {
+		r1 = copyof(p->arg[a], cp);
+		if (req(r1, R))
+			continue;
+		if (req(r, R) || req(r, r1))
+			r = r1;
+		else {
+			/* two arguments disagree */
+			r = p->to;
+			break;
+		}
+	}
+	assert(!req(r, R));
+	update(p->to, r, cp, w);
+}
+
+/* the result of an OCopy is a copy of what
+ * its argument is a copy of; the result of
+ * any other instruction is a copy of itself;
+ * instructions without a result are ignored
+ */
+static void
+visitins(Ins *i, Ref *cp, RList **w)
+{
+	Ref r;
+
+	if (i->op == OCopy) {
+		r = copyof(i->arg[0], cp);
+		update(i->to, r, cp, w);
+	} else if (!req(i->to, R)) {
+		assert(rtype(i->to) == RTmp);
+		update(i->to, i->to, cp, w);
+	}
+}
+
+/* copy elimination
+ *
+ * cp[t] is what temporary t is a copy of;
+ * it is computed by visiting all phis and
+ * instructions once, then revisiting the
+ * uses of the temporaries whose entry changed
+ * until the work list is empty; after that,
+ * phis and instructions whose result is a
+ * copy of something else are removed (phis
+ * are unlinked, instructions become ONop) and
+ * the arguments left are replaced by what
+ * they are copies of; reads the use
+ * information of fn->tmp
+ */
+void
+copy(Fn *fn)
+{
+	Blk *b;
+	Ref *cp, r;
+	RList *w, *w1;
+	Use *u, *u1;
+	Ins *i;
+	Phi *p, **pp;
+	uint a;
+	int t;
+
+	w = 0;
+	cp = emalloc(fn->ntmp * sizeof cp[0]);
+	/* 1. first visit of everything */
+	for (b=fn->start; b; b=b->link) {
+		for (p=b->phi; p; p=p->link)
+			visitphi(p, cp, &w);
+		for (i=b->ins; i-b->ins < b->nins; i++)
+			visitins(i, cp, &w);
+	}
+	/* 2. revisit the uses of what changed */
+	while ((w1=w)) {
+		t = w->t;
+		w = w->l;
+		free(w1);
+		u = fn->tmp[t].use;
+		u1 = u + fn->tmp[t].nuse;
+		for (; u<u1; u++)
+			switch (u->type) {
+			default:
+				diag("copy: invalid use");
+			case UPhi:
+				visitphi(u->u.phi, cp, &w);
+				break;
+			case UIns:
+				visitins(u->u.ins, cp, &w);
+				break;
+			case UJmp:
+				break;
+			}
+	}
+	/* 3. rewrite the function */
+	for (b=fn->start; b; b=b->link) {
+		for (pp=&b->phi; (p=*pp);) {
+			r = cp[p->to.val];
+			if (!req(r, p->to)) {
+				/* unlink the phi */
+				*pp = p->link;
+				continue;
+			}
+			for (a=0; a<p->narg; a++)
+				if (rtype(p->arg[a]) == RTmp) {
+					r = cp[p->arg[a].val];
+					assert(!req(r, R));
+					p->arg[a] = r;
+				}
+			pp=&p->link;
+		}
+		for (i=b->ins; i-b->ins < b->nins; i++) {
+			r = cp[i->to.val];
+			if (!req(r, i->to)) {
+				*i = (Ins){.op = ONop};
+				continue;
+			}
+			for (a=0; a<2; a++)
+				if (rtype(i->arg[a]) == RTmp) {
+					r = cp[i->arg[a].val];
+					assert(!req(r, R));
+					i->arg[a] = r;
+				}
+		}
+		if (rtype(b->jmp.arg) == RTmp) {
+			r = cp[b->jmp.arg.val];
+			assert(!req(r, R));
+			b->jmp.arg = r;
+		}
+	}
+	if (debug['C']) {
+		fprintf(stderr, "\n> Copy information:");
+		for (t=Tmp0; t<fn->ntmp; t++) {
+			if (req(cp[t], R)) {
+				fprintf(stderr, "\n%10s not seen!",
+					fn->tmp[t].name);
+			}
+			else if (!req(cp[t], TMP(t))) {
+				fprintf(stderr, "\n%10s copy of ",
+					fn->tmp[t].name);
+				printref(cp[t], fn, stderr);
+			}
+		}
+		fprintf(stderr, "\n\n> After copy elimination:\n");
+		printfn(fn, stderr);
+	}
+	free(cp);
+}
diff --git a/src/emit.c b/src/emit.c
@@ -0,0 +1,666 @@
+#include "all.h"
+
+enum {
+	SLong = 0,
+	SWord = 1,
+	SShort = 2,
+	SByte = 3,
+
+	Ki = -1, /* matches Kw and Kl */
+	Ka = -2, /* matches all classes */
+};
+
+/* Instruction format strings:
+ *
+ * if the format string starts with -, the instruction
+ * is assumed to be 3-address and is put in 2-address
+ * mode using an extra mov if necessary
+ *
+ * if the format string starts with +, the same as the
+ * above applies, but commutativity is also assumed
+ *
+ * %k  is used to set the class of the instruction,
+ *     it'll expand to "l", "q", "ss", "sd", depending
+ *     on the instruction class
+ * %0  designates the first argument
+ * %1  designates the second argument
+ * %=  designates the result
+ *
+ * if %k is not used, a prefix to 0, 1, or = must be
+ * added, it can be:
+ *   M - memory reference
+ *   L - long  (64 bits)
+ *   W - word  (32 bits)
+ *   H - short (16 bits)
+ *   B - byte  (8 bits)
+ *   S - single precision float
+ *   D - double precision float
+ */
+static struct {
+	short op;
+	short cls;
+	char *asm;
+} omap[] = {
+	{ OAdd,    Ka, "+add%k %1, %=" },
+	{ OSub,    Ka, "-sub%k %1, %=" },
+	{ OAnd,    Ki, "+and%k %1, %=" },
+	{ OOr,     Ki, "+or%k %1, %=" },
+	{ OXor,    Ki, "+xor%k %1, %=" },
+	{ OSar,    Ki, "-sar%k %B1, %=" },
+	{ OShr,    Ki, "-shr%k %B1, %=" },
+	{ OShl,    Ki, "-shl%k %B1, %=" },
+	{ OMul,    Ki, "+imul%k %1, %=" },
+	{ OMul,    Ks, "+mulss %1, %=" }, /* fixme */
+	{ OMul,    Kd, "+mulsd %1, %=" },
+	{ ODiv,    Ka, "-div%k %1, %=" },
+	{ OStorel, Ka, "movq %L0, %M1" },
+	{ OStorew, Ka, "movl %W0, %M1" },
+	{ OStoreh, Ka, "movw %H0, %M1" },
+	{ OStoreb, Ka, "movb %B0, %M1" },
+	{ OStores, Ka, "movss %S0, %M1" },
+	{ OStored, Ka, "movsd %D0, %M1" },
+	{ OLoad,   Ka, "mov%k %M0, %=" },
+	{ OLoadsw, Kl, "movslq %M0, %L=" },
+	{ OLoadsw, Kw, "movl %M0, %W=" },
+	{ OLoaduw, Ki, "movl %M0, %W=" },
+	{ OLoadsh, Ki, "movsw%k %M0, %=" },
+	{ OLoaduh, Ki, "movzw%k %M0, %=" },
+	{ OLoadsb, Ki, "movsb%k %M0, %=" },
+	{ OLoadub, Ki, "movzb%k %M0, %=" },
+	{ OExtsw,  Kl, "movslq %W0, %L=" },
+	{ OExtuw,  Kl, "movl %W0, %W=" },
+	{ OExtsh,  Ki, "movsw%k %H0, %=" },
+	{ OExtuh,  Ki, "movzw%k %H0, %=" },
+	{ OExtsb,  Ki, "movsb%k %B0, %=" },
+	{ OExtub,  Ki, "movzb%k %B0, %=" },
+
+	{ OExts,   Kd, "cvtss2sd %0, %=" },  /* see if factorization is possible */
+	{ OTruncd, Ks, "cvttsd2ss %0, %=" },
+	{ OFtosi,  Kw, "cvttss2si %0, %=" },
+	{ OFtosi,  Kl, "cvttsd2si %0, %=" },
+	{ OSitof,  Ks, "cvtsi2ss %W0, %=" },
+	{ OSitof,  Kd, "cvtsi2sd %L0, %=" },
+	{ OCast,   Ki, "movq %D0, %L=" },
+	{ OCast,   Ka, "movq %L0, %D=" },
+
+	{ OAddr,   Ki, "lea%k %M0, %=" },
+	{ OSwap,   Ki, "xchg%k %0, %1" },
+	{ OSign,   Kl, "cqto" },
+	{ OSign,   Kw, "cltd" },
+	{ OXDiv,   Ki, "div%k %0" },
+	{ OXIDiv,  Ki, "idiv%k %0" },
+	{ OXCmp,   Ks, "comiss %S0, %S1" },  /* fixme, Kf */
+	{ OXCmp,   Kd, "comisd %D0, %D1" },
+	{ OXCmp,   Ki, "cmp%k %0, %1" },
+	{ OXTest,  Ki, "test%k %0, %1" },
+	{ OXSet+ICeq,  Ki, "setz %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICsle, Ki, "setle %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICslt, Ki, "setl %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICsgt, Ki, "setg %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICsge, Ki, "setge %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICne,  Ki, "setnz %B=\n\tmovzb%k %B=, %=" },
+	{ OXSet+ICXnp, Ki, "setnp %B=\n\tmovsb%k %B=, %=" },
+	{ OXSet+ICXp,  Ki, "setp %B=\n\tmovsb%k %B=, %=" },
+	{ NOp, 0, 0 }
+};
+
+static char *rname[][4] = {
+	[RAX] = {"rax", "eax", "ax", "al"},
+	[RBX] = {"rbx", "ebx", "bx", "bl"},
+	[RCX] = {"rcx", "ecx", "cx", "cl"},
+	[RDX] = {"rdx", "edx", "dx", "dl"},
+	[RSI] = {"rsi", "esi", "si", "sil"},
+	[RDI] = {"rdi", "edi", "di", "dil"},
+	[RBP] = {"rbp", "ebp", "bp", "bpl"},
+	[RSP] = {"rsp", "esp", "sp", "spl"},
+	[R8 ] = {"r8" , "r8d", "r8w", "r8b"},
+	[R9 ] = {"r9" , "r9d", "r9w", "r9b"},
+	[R10] = {"r10", "r10d", "r10w", "r10b"},
+	[R11] = {"r11", "r11d", "r11w", "r11b"},
+	[R12] = {"r12", "r12d", "r12w", "r12b"},
+	[R13] = {"r13", "r13d", "r13w", "r13b"},
+	[R14] = {"r14", "r14d", "r14w", "r14b"},
+	[R15] = {"r15", "r15d", "r15w", "r15b"},
+};
+
+
+/* Convert the slot number s into a byte offset relative
+ * to %rbp (the callers print it as "N(%rbp)").
+ *
+ * s is a 14-bit two's complement value: negative numbers
+ * map to positive offsets (-4 * s), non-negative numbers
+ * are counted down from fn->slot, in units of 4 bytes.
+ */
+static int
+slot(int s, Fn *fn)
+{
+	/* the bit-field must be explicitly signed: whether a
+	 * plain 'int' bit-field is signed is implementation
+	 * defined, and the sign extension below relies on it */
+	struct { signed int i:14; } x;
+
+	/* sign extend s using a bitfield */
+	x.i = s;
+	assert(NAlign == 3);
+	if (x.i < 0)
+		return -4 * x.i;
+	else {
+		assert(fn->slot >= x.i);
+		return -4 * (fn->slot - x.i);
+	}
+}
+
+/* Print the constant con on f: a CAddr is written as its
+ * label followed by a signed offset when the offset is
+ * non-zero, a CBits as a signed decimal number.  Any other
+ * constant type is reported through diag().
+ */
+static void
+emitcon(Con *con, FILE *f)
+{
+	switch (con->type) {
+	default:
+		diag("emit: invalid constant");
+	case CAddr:
+		fputs(con->label, f);
+		if (con->bits.i)
+			fprintf(f, "%+"PRId64, con->bits.i);
+		break;
+	case CBits:
+		fprintf(f, "%"PRId64, con->bits.i);
+		break;
+	}
+}
+
+/* Return the assembly name of register reg.  For the
+ * registers below XMM0, sz (SLong, SWord, SShort, SByte)
+ * selects the column of rname[]; for the others the name
+ * is "xmmN" and sz is ignored.
+ *
+ * The xmm name lives in a static buffer that is overwritten
+ * by the next call, so use the result before calling again.
+ */
+static char *
+regtoa(int reg, int sz)
+{
+	static char buf[6];
+
+	if (reg >= XMM0) {
+		/* bounded write: a bogus register number
+		 * must not overflow buf */
+		snprintf(buf, sizeof buf, "xmm%d", reg-XMM0);
+		return buf;
+	} else
+		return rname[reg][sz];
+}
+
+/* Return the operand of i selected by the format character
+ * c: '0' is the first argument, '1' the second and '=' the
+ * result.  Any other character is reported through diag().
+ */
+static Ref
+getarg(char c, Ins *i)
+{
+	switch (c) {
+	default:
+		diag("emit: 0, 1, = expected in format");
+	case '0':
+		return i->arg[0];
+	case '1':
+		return i->arg[1];
+	case '=':
+		return i->to;
+	}
+}
+
+static void emitins(Ins, Fn *, FILE *);
+
+/* Emit a copy of class k from r2 into r1 by building an
+ * OCopy instruction and handing it to emitins().
+ */
+static void
+emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f)
+{
+	Ins cpy = {
+		.op = OCopy,
+		.to = r1,
+		.arg = {r2},
+		.cls = k,
+	};
+
+	emitins(cpy, fn, f);
+}
+
+/* Print the instruction i on f following the format string
+ * s (the syntax is described above omap[]).
+ *
+ * A leading '+' or '-' first puts the instruction in
+ * 2-address form: with '+' the arguments are swapped when
+ * the second one is the result, then the first argument is
+ * copied into the result.  This modifies *i.
+ *
+ * Memory references whose base is a slot are rewritten in
+ * fn->mem to be relative to %rbp the first time they are
+ * printed.
+ */
+static void
+emitf(char *s, Ins *i, Fn *fn, FILE *f)
+{
+	static char clstoa[][3] = {"l", "q", "ss", "sd"};
+	char c;
+	int sz;
+	Ref ref;
+	Mem *m;
+	Con off;
+
+	switch (*s) {
+	case '+':
+		if (req(i->arg[1], i->to)) {
+			ref = i->arg[0];
+			i->arg[0] = i->arg[1];
+			i->arg[1] = ref;
+		}
+		/* fall through */
+	case '-':
+		if (req(i->arg[1], i->to) && !req(i->arg[0], i->to))
+			diag("emit: cannot convert to 2-address");
+		emitcopy(i->to, i->arg[0], i->cls, fn, f);
+		s++;
+		break;
+	}
+
+	fputc('\t', f);
+Next:
+	/* copy plain characters until the next escape */
+	while ((c = *s++) != '%')
+		if (!c) {
+			fputc('\n', f);
+			return;
+		} else
+			fputc(c, f);
+	switch ((c = *s++)) {
+	default:
+		diag("emit: invalid escape");
+	case '%':
+		fputc('%', f);
+		break;
+	case 'k':
+		fputs(clstoa[i->cls], f);
+		break;
+	case '0':
+	case '1':
+	case '=':
+		/* no size prefix: use the instruction class,
+		 * and re-read the operand character at Ref */
+		sz = KWIDE(i->cls) ? SLong : SWord;
+		s--;
+		goto Ref;
+	case 'D':
+	case 'S':
+		/* regtoa() ignores the size for xmm registers,
+		 * but sz must not be read uninitialized */
+		sz = SLong;
+	Ref:
+		c = *s++;
+		ref = getarg(c, i);
+		switch (rtype(ref)) {
+		default:
+			diag("emit: invalid reference");
+		case RTmp:
+			assert(isreg(ref));
+			fprintf(f, "%%%s", regtoa(ref.val, sz));
+			break;
+		case RSlot:
+			fprintf(f, "%d(%%rbp)", slot(ref.val, fn));
+			break;
+		case RAMem:
+		Mem:
+			m = &fn->mem[ref.val & AMask];
+			if (rtype(m->base) == RSlot) {
+				/* fold the slot into the offset and
+				 * make the reference %rbp-relative */
+				off.type = CBits;
+				off.bits.i = slot(m->base.val, fn);
+				addcon(&m->offset, &off);
+				m->base = TMP(RBP);
+			}
+			if (m->offset.type != CUndef)
+				emitcon(&m->offset, f);
+			if (req(m->base, R) && req(m->index, R))
+				break;
+			fputc('(', f);
+			if (!req(m->base, R))
+				fprintf(f, "%%%s", regtoa(m->base.val, SLong));
+			if (!req(m->index, R))
+				fprintf(f, ", %%%s, %d",
+					regtoa(m->index.val, SLong),
+					m->scale
+				);
+			fputc(')', f);
+			break;
+		case RCon:
+			fputc('$', f);
+			emitcon(&fn->con[ref.val], f);
+			break;
+		}
+		break;
+	case 'L':
+		sz = SLong;
+		goto Ref;
+	case 'W':
+		sz = SWord;
+		goto Ref;
+	case 'H':
+		sz = SShort;
+		goto Ref;
+	case 'B':
+		sz = SByte;
+		goto Ref;
+	case 'M':
+		c = *s++;
+		ref = getarg(c, i);
+		switch (rtype(ref)) {
+		default:
+			diag("emit: invalid memory reference");
+		case RAMem:
+			goto Mem;
+		case RSlot:
+			fprintf(f, "%d(%%rbp)", slot(ref.val, fn));
+			break;
+		case RCon:
+			emitcon(&fn->con[ref.val], f);
+			fprintf(f, "(%%rip)");
+			break;
+		case RTmp:
+			assert(isreg(ref));
+			fprintf(f, "(%%%s)", regtoa(ref.val, SLong));
+			break;
+		}
+		break;
+	}
+	goto Next;
+}
+
+/* Emit the assembly for one instruction on f.
+ *
+ * i is received by value, so the operand swaps done here
+ * (and by emitf() for 2-address conversion) never touch the
+ * instruction stored in the function.
+ */
+static void
+emitins(Ins i, Fn *fn, FILE *f)
+{
+	Ref r;
+	int64_t val;
+	int o;
+
+	switch (i.op) {
+	default:
+	Table:
+		/* most instructions are just pulled out of
+		 * the table omap[], some special cases are
+		 * detailed below */
+		for (o=0;; o++) {
+			/* this linear search should really be a binary
+			 * search */
+			if (omap[o].op == NOp)
+				diag("emit: no entry found for instruction");
+			if (omap[o].op == i.op)
+			if (omap[o].cls == i.cls
+			|| (omap[o].cls == Ki && KBASE(i.cls) == 0)
+			|| (omap[o].cls == Ka))
+				break;
+		}
+		emitf(omap[o].asm, &i, fn, f);
+		break;
+	case ONop:
+		/* just do nothing for nops, they are inserted
+		 * by some passes */
+		break;
+	case OMul:
+		/* here, we try to use the 3-address form
+		 * of multiplication when possible */
+		if (rtype(i.arg[1]) == RCon) {
+			r = i.arg[0];
+			i.arg[0] = i.arg[1];
+			i.arg[1] = r;
+		}
+		if (KBASE(i.cls) == 0 /* only available for ints */
+		&& rtype(i.arg[0]) == RCon
+		&& rtype(i.arg[1]) == RTmp) {
+			emitf("imul%k %0, %1, %=", &i, fn, f);
+			break;
+		}
+		goto Table;
+	case OSub:
+		/* we have to use the negation trick to handle
+		 * some 3-address subtractions */
+		if (req(i.to, i.arg[1])) {
+			emitf("neg%k %=", &i, fn, f);
+			emitf("add%k %0, %=", &i, fn, f);
+			break;
+		}
+		goto Table;
+	case OCopy:
+		/* make sure we don't emit useless copies,
+		 * also, we can use a trick to load 64-bits
+		 * registers, it's detailed in my note below
+		 * http://c9x.me/art/notes.html?09/19/2015 */
+		if (req(i.to, R) || req(i.arg[0], R))
+			break;
+		if (isreg(i.to)
+		&& rtype(i.arg[0]) == RCon
+		&& i.cls == Kl
+		&& fn->con[i.arg[0].val].type == CBits
+		&& (val = fn->con[i.arg[0].val].bits.i) >= 0
+		&& val <= UINT32_MAX) {
+			emitf("movl %W0, %W=", &i, fn, f);
+		} else if (!req(i.arg[0], i.to))
+			emitf("mov%k %0, %=", &i, fn, f);
+		break;
+	case OCall:
+		/* calls simply have a weird syntax in AT&T
+		 * assembly... */
+		switch (rtype(i.arg[0])) {
+		default:
+			diag("emit: invalid call instruction");
+		case RCon:
+			fprintf(f, "\tcallq ");
+			emitcon(&fn->con[i.arg[0].val], f);
+			fprintf(f, "\n");
+			break;
+		case RTmp:
+			emitf("callq *%L0", &i, fn, f);
+			break;
+		}
+		break;
+	case OSAlloc:
+		/* there is no good reason why this is here
+		 * maybe we should split OSAlloc in 2 different
+		 * instructions depending on the result
+		 */
+		emitf("subq %L0, %%rsp", &i, fn, f);
+		if (!req(i.to, R))
+			emitcopy(i.to, TMP(RSP), Kl, fn, f);
+		break;
+	case OSwap:
+		if (KBASE(i.cls) == 0)
+			goto Table;
+		/* for floats, there is no swap instruction
+		 * so we use xmm15 as a temporary
+		 */
+		emitcopy(TMP(XMM0+15), i.arg[0], i.cls, fn, f);
+		emitcopy(i.arg[0], i.arg[1], i.cls, fn, f);
+		emitcopy(i.arg[1], TMP(XMM0+15), i.cls, fn, f);
+		break;
+	}
+}
+
+/* Return the negation of the comparison code cmp (for
+ * instance ICsle gives ICsgt and ICeq gives ICne).  It is
+ * used by emitfn() to invert a conditional jump.  Unknown
+ * codes are reported through diag().
+ */
+static int
+cneg(int cmp)
+{
+	switch (cmp) {
+	default:   diag("emit: cneg() unhandled comparison");
+	case ICule: return ICugt;
+	case ICult: return ICuge;
+	case ICsle: return ICsgt;
+	case ICslt: return ICsge;
+	case ICsgt: return ICsle;
+	case ICsge: return ICslt;
+	case ICugt: return ICule;
+	case ICuge: return ICult;
+	case ICeq:  return ICne;
+	case ICne:  return ICeq;
+	case ICXnp: return ICXp;
+	case ICXp:  return ICXnp;
+	}
+}
+
+/* Compute the number of bytes emitfn() subtracts from %rsp
+ * in the prologue: fn->slot rounded up to a multiple of 4
+ * slots (4 bytes each), plus 8 bytes when an odd number of
+ * the rclob[] registers is set in fn->reg.
+ */
+static int
+framesz(Fn *fn)
+{
+	int n, odd, nslot;
+
+	assert(NAlign == 3);
+	odd = 0;
+	n = 0;
+	while (n < NRClob) {
+		odd ^= 1 & (fn->reg >> rclob[n]);
+		n++;
+	}
+	nslot = (fn->slot + 3) & -4;
+	return 4*nslot + 8*odd;
+}
+
+/* Emit the assembly of the function fn on f: prologue,
+ * pushes of the rclob[] registers set in fn->reg, then each
+ * block (label, instructions, jump) in link order.
+ *
+ * A jump to the block that follows in link order is left
+ * out; a conditional jump is negated with cneg() when its
+ * first successor is the block that follows.
+ */
+void
+emitfn(Fn *fn, FILE *f)
+{
+	static char *ctoa[] = {
+		[ICeq]  = "z",
+		[ICule] = "be",
+		[ICult] = "b",
+		[ICsle] = "le",
+		[ICslt] = "l",
+		[ICsgt] = "g",
+		[ICsge] = "ge",
+		[ICugt] = "a",
+		[ICuge] = "ae",
+		[ICne]  = "nz",
+		[ICXnp] = "np",
+		[ICXp]  = "p"
+	};
+	Blk *b, *s;
+	Ins *i, itmp;
+	int *r, c, fs;
+
+	fprintf(f,
+		".text\n"
+		".globl %s\n"
+		".type %s, @function\n"
+		"%s:\n"
+		"\tpush %%rbp\n"
+		"\tmov %%rsp, %%rbp\n",
+		fn->name, fn->name, fn->name
+	);
+	fs = framesz(fn);
+	if (fs)
+		fprintf(f, "\tsub $%d, %%rsp\n", fs);
+	/* only arg[0] of itmp is read by the formats below */
+	for (r=rclob; r-rclob < NRClob; r++)
+		if (fn->reg & BIT(*r)) {
+			itmp.arg[0] = TMP(*r);
+			emitf("pushq %L0", &itmp, fn, f);
+		}
+
+	for (b=fn->start; b; b=b->link) {
+		fprintf(f, ".L%s:\n", b->name);
+		for (i=b->ins; i!=&b->ins[b->nins]; i++)
+			emitins(*i, fn, f);
+		switch (b->jmp.type) {
+		case JRet0:
+			/* pop in the reverse order of the pushes */
+			for (r=&rclob[NRClob]; r>rclob;)
+				if (fn->reg & BIT(*--r)) {
+					itmp.arg[0] = TMP(*r);
+					emitf("popq %L0", &itmp, fn, f);
+				}
+			fprintf(f,
+				"\tleave\n"
+				"\tret\n"
+			);
+			break;
+		case JJmp:
+			if (b->s1 != b->link)
+				fprintf(f, "\tjmp .L%s\n", b->s1->name);
+			break;
+		default:
+			c = b->jmp.type - JXJc;
+			/* the extra bound keeps c inside ctoa[]
+			 * even if NXICmp is a count and not the
+			 * last valid comparison code */
+			if (0 <= c && c <= NXICmp
+			&& (size_t)c < sizeof ctoa / sizeof ctoa[0]) {
+				if (b->link == b->s2) {
+					s = b->s1;
+				} else if (b->link == b->s1) {
+					c = cneg(c);
+					s = b->s2;
+				} else
+					diag("emit: unhandled jump (1)");
+				fprintf(f, "\tj%s .L%s\n", ctoa[c], s->name);
+				break;
+			}
+			diag("emit: unhandled jump (2)");
+		}
+	}
+
+}
+
+/* Emit one element d of a data definition on f.
+ *
+ * The static flag align remembers whether a DAlign element
+ * was seen since the last DStart; when it was not, DName
+ * emits a default ".align 8" before the symbol.
+ */
+void
+emitdat(Dat *d, FILE *f)
+{
+	static int align;
+	static char *dtoa[] = {
+		[DAlign] = ".align",
+		[DB] = "\t.byte",
+		[DH] = "\t.value",
+		[DW] = "\t.long",
+		[DL] = "\t.quad"
+	};
+
+	switch (d->type) {
+	case DStart:
+		align = 0;
+		fprintf(f, ".data\n");
+		break;
+	case DEnd:
+		break;
+	case DName:
+		if (!align)
+			fprintf(f, ".align 8\n");
+		fprintf(f,
+			".globl %s\n"
+			".type %s, @object\n"
+			"%s:\n",
+			d->u.str, d->u.str, d->u.str
+		);
+		break;
+	case DZ:
+		/* d->u.num zero bytes */
+		fprintf(f, "\t.fill %"PRId64",1,0\n", d->u.num);
+		break;
+	default:
+		/* DAlign, DB, DH, DW and DL all print through dtoa[] */
+		if (d->type == DAlign)
+			align = 1;
+
+		if (d->isstr) {
+			if (d->type != DB)
+				err("strings only supported for 'b' currently");
+			fprintf(f, "\t.ascii \"%s\"\n", d->u.str);
+		}
+		else if (d->isref) {
+			fprintf(f, "%s %s%+"PRId64"\n",
+				dtoa[d->type], d->u.ref.nam,
+				d->u.ref.off);
+		}
+		else {
+			fprintf(f, "%s %"PRId64"\n",
+				dtoa[d->type], d->u.num);
+		}
+		break;
+	}
+}
+
+typedef struct FBits FBits;
+
+/* One floating point constant recorded by stashfp() */
+struct FBits {
+	int64_t bits;	/* bit pattern of the constant */
+	int wide;	/* non-zero: emitted as .quad, zero: as .long */
+	FBits *link;	/* next constant, in insertion order */
+};
+
+/* list of the stashed constants, emitted and freed by emitfin() */
+static FBits *stash;
+
+/* Record the floating point constant with bit pattern n and
+ * width flag w, and return its position in the stash list.
+ * emitfin() uses the same position to name it ".Lfp<pos>".
+ * An identical (n, w) pair is only stored once.
+ */
+int
+stashfp(int64_t n, int w)
+{
+	FBits **tail, *fb;
+	int pos;
+
+	/* does a dumb de-dup of fp constants
+	 * this should be the linker's job */
+	pos = 0;
+	tail = &stash;
+	while ((fb = *tail)) {
+		if (fb->bits == n && fb->wide == w)
+			return pos;
+		tail = &fb->link;
+		pos++;
+	}
+	/* not found: append a new cell at the end */
+	fb = emalloc(sizeof *fb);
+	fb->link = 0;
+	fb->wide = w;
+	fb->bits = n;
+	*tail = fb;
+	return pos;
+}
+
+/* Emit on f the floating point constants recorded by
+ * stashfp(), then free the list.  Wide constants are written
+ * first as .quad, the others follow as .long; each one is
+ * labelled ".Lfp<pos>" with its position in the list and
+ * annotated with its decimal value in a comment.
+ */
+void
+emitfin(FILE *f)
+{
+	FBits *b;
+	double d;
+	float s;
+	uint32_t lo;
+	int i;
+
+	if (!stash)
+		return;
+	fprintf(f, "/* floating point constants */\n");
+	fprintf(f, ".data\n.align 8\n");
+	for (b=stash, i=0; b; b=b->link, i++)
+		if (b->wide) {
+			/* reinterpret the bits with memcpy, a
+			 * pointer cast breaks strict aliasing */
+			memcpy(&d, &b->bits, sizeof d);
+			fprintf(f,
+				".Lfp%d:\n"
+				"\t.quad %"PRId64
+				" /* %f */\n",
+				i, b->bits, d
+			);
+		}
+	for (b=stash, i=0; b; b=b->link, i++)
+		if (!b->wide) {
+			/* take the low 32 bits explicitly so the
+			 * value does not depend on byte order */
+			lo = b->bits & 0xffffffff;
+			memcpy(&s, &lo, sizeof s);
+			fprintf(f,
+				".Lfp%d:\n"
+				"\t.long %"PRId64
+				" /* %lf */\n",
+				i, b->bits & 0xffffffff, s
+			);
+		}
+	while ((b=stash)) {
+		stash = b->link;
+		free(b);
+	}
+}
diff --git a/src/isel.c b/src/isel.c
@@ -0,0 +1,1135 @@
+#include "all.h"
+#include <limits.h>
+
+/* For x86_64, do the following:
+ *
+ * - lower calls
+ * - check that constants are used only in
+ *   places allowed
+ * - ensure immediates always fit in 32b
+ * - explicit machine register constraints
+ *   on instructions like division.
+ * - implement fast locals (the streak of
+ *   constant allocX in the first basic block)
+ * - recognize complex addressing modes
+ *
+ * Invariant: the use counts that are used
+ *            in sel() must be sound.  This
+ *            is not so trivial, maybe the
+ *            dce should be moved out...
+ */
+
+typedef struct ANum ANum;
+typedef struct AClass AClass;
+typedef struct RAlloc RAlloc;
+
+struct ANum {
+	char n, l, r;
+	Ins *i;
+	Ref mem;
+};
+
+static void amatch(Addr *, Ref, ANum *, Fn *, int);
+
+/* Map the floating point comparison fc to the integer
+ * comparison code used after selection: the ordered
+ * relations map to the unsigned codes, FCo and FCuo map to
+ * the parity codes ICXnp and ICXp.
+ * NOTE(review): presumably this mirrors the flags set by
+ * comiss/comisd -- confirm.
+ */
+static int
+fcmptoi(int fc)
+{
+	switch (fc) {
+	default:   diag("isel: fcmptoi defaulted");
+	case FCle: return ICule;
+	case FClt: return ICult;
+	case FCgt: return ICugt;
+	case FCge: return ICuge;
+	case FCne: return ICne;
+	case FCeq: return ICeq;
+	case FCo:  return ICXnp;
+	case FCuo: return ICXp;
+	}
+}
+
+/* Tell if op is a comparison.  When it is, return 1 and
+ * store the class of the compared operands in *pk and the
+ * integer comparison code in *pc (either pointer may be
+ * null); otherwise return 0 and leave both untouched.
+ */
+static int
+iscmp(int op, int *pk, int *pc)
+{
+	int cls, cmp;
+
+	if (op >= OCmpw && op <= OCmpw1) {
+		cls = Kw;
+		cmp = op - OCmpw;
+	} else if (op >= OCmpl && op <= OCmpl1) {
+		cls = Kl;
+		cmp = op - OCmpl;
+	} else if (op >= OCmps && op <= OCmps1) {
+		cls = Ks;
+		cmp = fcmptoi(op - OCmps);
+	} else if (op >= OCmpd && op <= OCmpd1) {
+		cls = Kd;
+		cmp = fcmptoi(op - OCmpd);
+	} else
+		return 0;
+
+	if (pk)
+		*pk = cls;
+	if (pc)
+		*pc = cmp;
+	return 1;
+}
+
+/* Return 1 when r is a CBits constant that does not fit in
+ * a signed 32-bit immediate, 0 for any other reference.
+ */
+static int
+noimm(Ref r, Fn *fn)
+{
+	Con *c;
+	int64_t v;
+
+	if (rtype(r) != RCon)
+		return 0;
+	c = &fn->con[r.val];
+	switch (c->type) {
+	default:
+		diag("isel: invalid constant");
+	case CAddr:
+		/* we only support the 'small'
+		 * code model of the ABI, this
+		 * means that we can always
+		 * address data with 32bits
+		 */
+		return 0;
+	case CBits:
+		v = c->bits.i;
+		return !(INT32_MIN <= v && v <= INT32_MAX);
+	}
+}
+
+/* Return the slot recorded for the temporary r, or -1 when
+ * r is not a temporary.
+ */
+static int
+rslot(Ref r, Fn *fn)
+{
+	return rtype(r) == RTmp ? fn->tmp[r.val].slot : -1;
+}
+
+/* Look up in opdesc[] the class of argument n of the
+ * instruction i, given the class of i itself.
+ */
+static int
+argcls(Ins *i, int n)
+{
+	return opdesc[i->op].argcls[n][i->cls];
+}
+
+/* Rewrite the argument *r, used with class k, when it
+ * cannot appear as is in an instruction.  The three cases
+ * are detailed below; phi is non-zero when the argument
+ * belongs to a phi, which disables the large constant case.
+ * Extra instructions are added with emit().
+ */
+static void
+fixarg(Ref *r, int k, int phi, Fn *fn)
+{
+	Addr a;
+	Ref r0, r1;
+	int s, n;
+
+	r1 = r0 = *r;
+	s = rslot(r0, fn);
+	if (KBASE(k) == 1 && rtype(r0) == RCon) {
+		/* load floating points from memory
+		 * slots, they can't be used as
+		 * immediates
+		 */
+		r1 = MEM(fn->nmem);
+		vgrow(&fn->mem, ++fn->nmem);
+		memset(&a, 0, sizeof a);
+		a.offset.type = CAddr;
+		/* the label matches the one printed by emitfin() */
+		n = stashfp(fn->con[r0.val].bits.i, KWIDE(k));
+		sprintf(a.offset.label, ".Lfp%d", n);
+		fn->mem[fn->nmem-1] = a;
+	}
+	else if (!phi && k == Kl && noimm(r0, fn)) {
+		/* load constants that do not fit in
+		 * a 32bit signed integer into a
+		 * long temporary
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(OCopy, Kl, r1, r0, R);
+	}
+	else if (s != -1) {
+		/* load fast locals' addresses into
+		 * temporaries right before the
+		 * instruction
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(OAddr, Kl, r1, SLOT(s), R);
+	}
+	*r = r1;
+}
+
+/* Add du to the use count of r when r is a temporary;
+ * other kinds of references are ignored.
+ */
+static void
+chuse(Ref r, int du, Fn *fn)
+{
+	if (rtype(r) != RTmp)
+		return;
+	fn->tmp[r.val].nuse += du;
+}
+
+/* Replace the temporary *r, used as an address, by a memory
+ * reference built with amatch().  Use counts are kept in
+ * sync: the temporary loses one use, the base and index of
+ * the new reference gain one each.  Non-temporaries are
+ * left alone.
+ */
+static void
+seladdr(Ref *r, ANum *an, Fn *fn)
+{
+	Addr a;
+	Ref r0, r1;
+
+	r0 = *r;
+	if (rtype(r0) == RTmp) {
+		chuse(r0, -1, fn);
+		r1 = an[r0.val].mem;
+		if (req(r1, R)) {
+			amatch(&a, r0, an, fn, 1);
+			vgrow(&fn->mem, ++fn->nmem);
+			fn->mem[fn->nmem-1] = a;
+			r1 = MEM(fn->nmem-1);
+			chuse(a.base, +1, fn);
+			chuse(a.index, +1, fn);
+			/* the reference is only cached for later
+			 * uses when it involves no temporary */
+			if (rtype(a.base) != RTmp)
+			if (rtype(a.index) != RTmp)
+				an[r0.val].mem = r1;
+		}
+		*r = r1;
+	}
+}
+
+/* Emit the OXCmp of class k for the two arguments in arg[].
+ * A constant first argument is swapped with the second one
+ * (this modifies arg[]); the callers adjust the comparison
+ * code with icmpop() in that case.  Note that OXCmp takes
+ * the arguments in reverse order.
+ */
+static void
+selcmp(Ref arg[2], int k, Fn *fn)
+{
+	Ref r;
+
+	if (rtype(arg[0]) == RCon) {
+		r = arg[1];
+		arg[1] = arg[0];
+		arg[0] = r;
+	}
+	assert(rtype(arg[0]) != RCon);
+	emit(OXCmp, k, R, arg[1], arg[0]);
+	fixarg(&curi->arg[0], k, 0, fn);
+}
+
+/* Select machine instructions for i.
+ *
+ * The emitted code is read bottom-up: curi moves downwards
+ * as instructions are added (see the final loop, which walks
+ * from the old curi down to the new one), so within a case
+ * the last emit() is the first instruction to execute.
+ *
+ * An instruction whose result is a temporary with no use
+ * and which involves no machine register is dropped, and
+ * the use counts of its arguments are decremented.
+ */
+static void
+sel(Ins i, ANum *an, Fn *fn)
+{
+	Ref r0, r1;
+	int x, k, kc;
+	int64_t val;
+	Ins *i0;
+
+	if (rtype(i.to) == RTmp)
+	if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
+	if (fn->tmp[i.to.val].nuse == 0) {
+		chuse(i.arg[0], -1, fn);
+		chuse(i.arg[1], -1, fn);
+		return;
+	}
+	i0 = curi;
+	k = i.cls;
+	switch (i.op) {
+	case ODiv:
+	case ORem:
+	case OUDiv:
+	case OURem:
+		/* r0 is the register holding the result, r1
+		 * the other register written by the division */
+		if (i.op == ODiv || i.op == OUDiv)
+			r0 = TMP(RAX), r1 = TMP(RDX);
+		else
+			r0 = TMP(RDX), r1 = TMP(RAX);
+		emit(OCopy, k, i.to, r0, R);
+		emit(OCopy, k, R, r1, R);
+		if (rtype(i.arg[1]) == RCon) {
+			/* immediates not allowed for
+			 * divisions in x86
+			 */
+			r0 = newtmp("isel", k, fn);
+		} else
+			r0 = i.arg[1];
+		if (i.op == ODiv || i.op == ORem) {
+			emit(OXIDiv, k, R, r0, R);
+			emit(OSign, k, TMP(RDX), TMP(RAX), R);
+		} else {
+			emit(OXDiv, k, R, r0, R);
+			emit(OCopy, k, TMP(RDX), CON_Z, R);
+		}
+		emit(OCopy, k, TMP(RAX), i.arg[0], R);
+		if (rtype(i.arg[1]) == RCon)
+			emit(OCopy, k, r0, i.arg[1], R);
+		break;
+	case OSar:
+	case OShr:
+	case OShl:
+		/* a non-constant shift amount goes through RCX */
+		if (rtype(i.arg[1]) == RCon)
+			goto Emit;
+		r0 = i.arg[1];
+		i.arg[1] = TMP(RCX);
+		emit(OCopy, Kw, R, TMP(RCX), R);
+		emiti(i);
+		emit(OCopy, Kw, TMP(RCX), r0, R);
+		break;
+	case ONop:
+		break;
+	case OStored:
+	case OStores:
+	case OStorel:
+	case OStorew:
+	case OStoreh:
+	case OStoreb:
+		/* a constant float is stored with the integer
+		 * store of the same width */
+		if (rtype(i.arg[0]) == RCon) {
+			if (i.op == OStored)
+				i.op = OStorel;
+			if (i.op == OStores)
+				i.op = OStorew;
+		}
+		seladdr(&i.arg[1], an, fn);
+		goto Emit;
+	case_OLoad:
+		seladdr(&i.arg[0], an, fn);
+		goto Emit;
+	case OCall:
+	case OSAlloc:
+	case OCopy:
+	case OAdd:
+	case OSub:
+	case OMul:
+	case OAnd:
+	case OOr:
+	case OXor:
+	case OXTest:
+	case OFtosi:
+	case OSitof:
+	case OExts:
+	case OTruncd:
+	case OCast:
+	case_OExt:
+Emit:
+		/* keep the instruction and fix its arguments */
+		emiti(i);
+		fixarg(&curi->arg[0], argcls(curi, 0), 0, fn);
+		fixarg(&curi->arg[1], argcls(curi, 1), 0, fn);
+		break;
+	case OAlloc:
+	case OAlloc+1:
+	case OAlloc+2: /* == OAlloc1 */
+		/* we need to make sure
+		 * the stack remains aligned
+		 * (rsp = 0) mod 16
+		 */
+		if (rtype(i.arg[0]) == RCon) {
+			assert(fn->con[i.arg[0].val].type == CBits);
+			val = fn->con[i.arg[0].val].bits.i;
+			val = (val + 15)  & ~INT64_C(15);
+			if (val < 0 || val > INT32_MAX)
+				diag("isel: alloc too large");
+			emit(OSAlloc, Kl, i.to, getcon(val, fn), R);
+		} else {
+			/* r0 = (i.arg[0] + 15) & -16 */
+			r0 = newtmp("isel", Kl, fn);
+			r1 = newtmp("isel", Kl, fn);
+			emit(OSAlloc, Kl, i.to, r0, R);
+			emit(OAnd, Kl, r0, r1, getcon(-16, fn));
+			emit(OAdd, Kl, r1, i.arg[0], getcon(15, fn));
+		}
+		break;
+	default:
+		if (isext(i.op))
+			goto case_OExt;
+		if (isload(i.op))
+			goto case_OLoad;
+		if (iscmp(i.op, &kc, &x)) {
+			/* selcmp() swaps a constant first argument,
+			 * so the comparison is adjusted to match */
+			if (rtype(i.arg[0]) == RCon)
+				x = icmpop(x);
+			emit(OXSet+x, k, i.to, R, R);
+			selcmp(i.arg, kc, fn);
+			break;
+		}
+		diag("isel: non-exhaustive implementation");
+	}
+
+	/* sanity check: none of the instructions just emitted
+	 * may still take a fast local as argument */
+	while (i0 > curi && --i0)
+		if (rslot(i0->arg[0], fn) != -1
+		||  rslot(i0->arg[1], fn) != -1)
+			diag("isel: usupported address argument");
+}
+
+/* Walk backwards from i (excluded) down to i0 (included)
+ * and return the first instruction whose opdesc[] entry has
+ * sflag set.  Instructions with lflag set are skipped; any
+ * other instruction stops the search and 0 is returned.
+ */
+static Ins *
+flagi(Ins *i0, Ins *i)
+{
+	Ins *cur;
+
+	for (cur=i; cur>i0;) {
+		cur--;
+		if (opdesc[cur->op].sflag)
+			return cur;
+		if (!opdesc[cur->op].lflag)
+			break;
+	}
+	return 0;
+}
+
+/* Classification of one argument or return value for the
+ * calling convention, filled by aclass() and classify(). */
+struct AClass {
+	int inmem;	/* 0: in registers, 1: aggregate in memory, 2: scalar on the stack */
+	int align;	/* log2 of the alignment */
+	uint size;	/* size in bytes, a multiple of 8 */
+	int cls[2];	/* class of each eightbyte, -1 when aclass() found no data in it */
+};
+
+/* Classify the aggregate type t into a: rounded size,
+ * alignment, whether it is passed in memory and, when it is
+ * not, the register class of each of its two eightbytes.
+ * a->cls[] is left untouched for in-memory aggregates.
+ */
+static void
+aclass(AClass *a, Typ *t)
+{
+	int e, s, n, cls;
+	uint sz, al;
+
+	sz = t->size;
+	al = 1u << t->align;
+
+	/* the ABI requires sizes to be rounded
+	 * up to the nearest multiple of 8, moreover
+	 * it makes it easy to load and store structures
+	 * in registers
+	 */
+	if (al < 8)
+		al = 8;
+	sz = (sz + al-1) & -al;
+
+	a->size = sz;
+	a->align = t->align;
+
+	if (t->dark || sz > 16) {
+		/* large or unaligned structures are
+		 * required to be passed in memory
+		 */
+		a->inmem = 1;
+		return;
+	}
+
+	a->inmem = 0;
+	/* e walks the eightbytes, s the segments of t and
+	 * n counts the bytes of the current eightbyte */
+	for (e=0, s=0; e<2; e++) {
+		cls = -1;
+		for (n=0; n<8 && t->seg[s].len; s++) {
+			if (t->seg[s].ispad) {
+				/* don't change anything */
+			}
+			else if (t->seg[s].isflt) {
+				/* floats only decide the class
+				 * when nothing else did yet */
+				if (cls == -1)
+					cls = Kd;
+			}
+			else
+				cls = Kl;
+			n += t->seg[s].len;
+		}
+		assert(n <= 8);
+		a->cls[e] = cls;
+	}
+}
+
+/* Emit the code copying sz bytes from the address rsrc to
+ * the address rstk + soff, 8 bytes at a time.  The use
+ * counts of rsrc and rstk are incremented for each chunk.
+ *
+ * sz must be a multiple of 8: it is unsigned and decreases
+ * by 8 until it reaches 0.
+ */
+static void
+blit(Ref rstk, uint soff, Ref rsrc, uint sz, Fn *fn)
+{
+	Ref r, r1;
+	uint boff;
+
+	/* it's an impolite blit, we might go across the end
+	 * of the source object a little bit... */
+	for (boff=0; sz>0; sz-=8, soff+=8, boff+=8) {
+		/* emitted in reverse: address + load of the
+		 * source, then address + store of the target */
+		r = newtmp("abi", Kl, fn);
+		r1 = newtmp("abi", Kl, fn);
+		emit(OStorel, 0, R, r, r1);
+		emit(OAdd, Kl, r1, rstk, getcon(soff, fn));
+		r1 = newtmp("abi", Kl, fn);
+		emit(OLoad, Kl, r, r1, R);
+		emit(OAdd, Kl, r1, rsrc, getcon(boff, fn));
+		chuse(rsrc, +1, fn);
+		chuse(rstk, +1, fn);
+	}
+}
+
+/* Assign return registers to the eightbytes of aret.
+ *
+ * reg[n] receives the register of the n-th eightbyte:
+ * integer eightbytes take RAX then RDX, floating point ones
+ * XMM0 then XMM1.  The scan stops at the first eightbyte
+ * whose class is negative.  The returned value counts the
+ * integer registers in bits 0-1 and the sse registers in
+ * bits 2-3, as decoded by retregs().
+ */
+static int
+retr(Ref reg[2], AClass *aret)
+{
+	static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}};
+	int n, k, ca, nr[2];
+
+	nr[0] = nr[1] = 0;
+	ca = 0;
+	/* test the bound first, cls[] only has 2 entries */
+	for (n=0; n<2 && aret->cls[n]>=0; n++) {
+		k = KBASE(aret->cls[n]);
+		reg[n] = TMP(retreg[k][nr[k]++]);
+		ca += 1 << (2 * k);
+	}
+	return ca;
+}
+
+/* Lower the return jump of b: the returned value is moved
+ * to the return registers (or copied to the area pointed to
+ * by fn->retr for in-memory aggregates), the jump becomes a
+ * JRet0 and its argument a CALL() reference describing the
+ * registers used.  Blocks not ending with a value-returning
+ * jump are left untouched.
+ */
+static void
+selret(Blk *b, Fn *fn)
+{
+	int j, k, ca;
+	Ref r, r0, reg[2];
+	AClass aret;
+
+	j = b->jmp.type;
+
+	if (!isret(j) || j == JRet0)
+		return;
+
+	r0 = b->jmp.arg;
+	b->jmp.type = JRet0;
+
+	if (j == JRetc) {
+		/* aggregate return, r0 is its address */
+		aclass(&aret, &typ[fn->retty]);
+		if (aret.inmem) {
+			assert(rtype(fn->retr) == RTmp);
+			emit(OCopy, Kl, TMP(RAX), fn->retr, R);
+			chuse(fn->retr, +1, fn);
+			blit(fn->retr, 0, r0, aret.size, fn);
+			ca = 1;
+		} else {
+			ca = retr(reg, &aret);
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				emit(OLoad, Kl, reg[1], r, R);
+				emit(OAdd, Kl, r, r0, getcon(8, fn));
+				chuse(r0, +1, fn);
+			}
+			emit(OLoad, Kl, reg[0], r0, R);
+		}
+	} else {
+		/* scalar return, the class is given by
+		 * the jump type */
+		k = j - JRetw;
+		if (KBASE(k) == 0) {
+			emit(OCopy, k, TMP(RAX), r0, R);
+			ca = 1;
+		} else {
+			emit(OCopy, k, TMP(XMM0), r0, R);
+			ca = 1 << 2;
+		}
+	}
+
+	b->jmp.arg = CALL(ca);
+}
+
+/* Lower the conditional jump ending b into a flag-based
+ * jump (JXJc + comparison code).
+ *
+ * A constant condition turns the jump into a JJmp.  When
+ * the condition is defined by the flag-setting instruction
+ * found by flagi(), its flags are reused; otherwise the
+ * condition is compared against zero.
+ */
+static void
+seljmp(Blk *b, Fn *fn)
+{
+	Ref r;
+	int c, k;
+	Ins *fi;
+
+	if (b->jmp.type == JRet0 || b->jmp.type == JJmp)
+		return;
+	assert(b->jmp.type == JJnz);
+	r = b->jmp.arg;
+	b->jmp.arg = R;
+	assert(!req(r, R));
+	if (rtype(r) == RCon) {
+		/* zero jumps to s2, anything else to s1 */
+		b->jmp.type = JJmp;
+		if (req(r, CON_Z))
+			b->s1 = b->s2;
+		b->s2 = 0;
+		return;
+	}
+	fi = flagi(b->ins, &b->ins[b->nins]);
+	if (fi && req(fi->to, r)) {
+		if (iscmp(fi->op, &k, &c)) {
+			if (rtype(fi->arg[0]) == RCon)
+				c = icmpop(c);
+			b->jmp.type = JXJc + c;
+			/* the comparison is only replaced when
+			 * the jump is its single use */
+			if (fn->tmp[r.val].nuse == 1) {
+				assert(fn->tmp[r.val].ndef == 1);
+				selcmp(fi->arg, k, fn);
+				*fi = (Ins){.op = ONop};
+			}
+			return;
+		}
+		if (fi->op == OAnd && fn->tmp[r.val].nuse == 1
+		&& (rtype(fi->arg[0]) == RTmp ||
+		    rtype(fi->arg[1]) == RTmp)) {
+			/* an and only used by the jump becomes
+			 * a test, with the constant first */
+			fi->op = OXTest;
+			fi->to = R;
+			b->jmp.type = JXJc + ICne;
+			if (rtype(fi->arg[1]) == RCon) {
+				r = fi->arg[1];
+				fi->arg[1] = fi->arg[0];
+				fi->arg[0] = r;
+			}
+			return;
+		}
+		/* since flags are not tracked in liveness,
+		 * the result of the flag-setting instruction
+		 * has to be marked as live
+		 */
+		if (fn->tmp[r.val].nuse == 1)
+			emit(OCopy, Kw, R, r, R);
+		b->jmp.type = JXJc + ICne;
+		return;
+	}
+	selcmp((Ref[2]){r, CON_Z}, Kw, fn); /* todo, add long branch if non-zero */
+	b->jmp.type = JXJc + ICne;
+}
+
+/* Classify the arguments (op == OArg) or parameters
+ * (op == OPar) in [i0, i1) into ac[], one entry per
+ * instruction, handing out the 6 integer and 8 sse
+ * registers in order.  aret, when non-null, is the class of
+ * the aggregate return value: if it lives in memory, one
+ * integer register is reserved for the hidden argument.
+ *
+ * Scalars that do not get a register are marked inmem = 2,
+ * aggregates inmem = 1.  The returned value holds the number
+ * of integer registers used in bits 4-7 and the number of
+ * sse registers used in bits 8-11, as decoded by argregs().
+ */
+static int
+classify(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret)
+{
+	int nint, ni, nsse, ns, n, *pn;
+	AClass *a;
+	Ins *i;
+
+	if (aret && aret->inmem)
+		nint = 5; /* hidden argument */
+	else
+		nint = 6;
+	nsse = 8;
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (i->op == op) {
+			/* scalar */
+			if (KBASE(i->cls) == 0)
+				pn = &nint;
+			else
+				pn = &nsse;
+			if (*pn > 0) {
+				--*pn;
+				a->inmem = 0;
+			} else
+				a->inmem = 2;
+			a->align = 3;
+			a->size = 8;
+			a->cls[0] = i->cls;
+		} else {
+			/* aggregate, its type is in arg[0] */
+			n = i->arg[0].val & AMask;
+			aclass(a, &typ[n]);
+			if (a->inmem)
+				continue;
+			/* only count the eightbytes that selcall()
+			 * and selpar() put in a register: the second
+			 * one is used when the size exceeds 8 bytes;
+			 * counting it for smaller aggregates would
+			 * charge an sse register they never use */
+			ni = ns = 0;
+			for (n=0; n < (a->size > 8 ? 2 : 1); n++)
+				if (KBASE(a->cls[n]) == 0)
+					ni++;
+				else
+					ns++;
+			/* all or nothing: an aggregate is never
+			 * split between registers and memory */
+			if (nint >= ni && nsse >= ns) {
+				nint -= ni;
+				nsse -= ns;
+			} else
+				a->inmem = 1;
+		}
+	}
+
+	return ((6-nint) << 4) | ((8-nsse) << 8);
+}
+
+/* rsave starts with the integer registers handed out to
+ * arguments, in order (rarg() and argregs() index it from
+ * 0).  NOTE(review): presumably the caller-save registers,
+ * confirm against the register allocator.
+ */
+int rsave[] = {
+	RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX,
+	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14
+};
+/* registers pushed in the prologue and popped before ret by
+ * emitfn() when they are set in fn->reg */
+int rclob[] = {RBX, R12, R13, R14, R15};
+
+/* compile-time checks that the arrays match NRSave and NRClob */
+MAKESURE(rsave_has_correct_size, sizeof rsave == NRSave * sizeof(int));
+MAKESURE(rclob_has_correct_size, sizeof rclob == NRClob * sizeof(int));
+
+/* Decode the return registers described by the RACall
+ * reference r: bits 0-1 of r.val count the integer
+ * registers (RAX, RDX), bits 2-3 the sse registers (XMM0,
+ * XMM1).  The result is the set of those registers; when p
+ * is non-null, p[0] and p[1] receive the two counts.
+ */
+bits
+retregs(Ref r, int p[2])
+{
+	bits set;
+	int nint, nsse;
+
+	assert(rtype(r) == RACall);
+	nint = r.val & 3;
+	nsse = (r.val >> 2) & 3;
+	set = 0;
+	if (nint >= 1) {
+		set |= BIT(RAX);
+		if (nint >= 2)
+			set |= BIT(RDX);
+	}
+	if (nsse >= 1) {
+		set |= BIT(XMM0);
+		if (nsse >= 2)
+			set |= BIT(XMM1);
+	}
+	if (p) {
+		p[0] = nint;
+		p[1] = nsse;
+	}
+	return set;
+}
+
+/* Decode the argument registers described by the RACall
+ * reference r: bits 4-7 of r.val count the integer
+ * registers (the first entries of rsave[]), bits 8-11 the
+ * sse registers (XMM0 and up).  RAX is always part of the
+ * result and is counted in p[0]; p[1] gets the sse count.
+ */
+bits
+argregs(Ref r, int p[2])
+{
+	bits set;
+	int n, nint, nsse;
+
+	assert(rtype(r) == RACall);
+	nint = (r.val >> 4) & 15;
+	nsse = (r.val >> 8) & 15;
+	set = 0;
+	n = 0;
+	while (n < nint)
+		set |= BIT(rsave[n++]);
+	n = 0;
+	while (n < nsse)
+		set |= BIT(XMM0 + n++);
+	if (p) {
+		p[0] = nint + 1;
+		p[1] = nsse;
+	}
+	return set | BIT(RAX);
+}
+
+/* Return the next argument register for the class ty and
+ * advance the matching counter: *ni indexes rsave[] for
+ * integer classes, *ns counts from XMM0 for the others.
+ */
+static Ref
+rarg(int ty, int *ni, int *ns)
+{
+	if (KBASE(ty) != 0)
+		return TMP(XMM0 + (*ns)++);
+	return TMP(rsave[(*ni)++]);
+}
+
+/* List of the alloc instructions created by selcall() for
+ * the areas receiving aggregate return values. */
+struct RAlloc {
+	Ins i;		/* the alloc instruction */
+	RAlloc *link;	/* previously created element */
+};
+
+/* Lower a call: [i0, i1) are its argument instructions and
+ * i1 is the call itself (its arg[1], when not R, is the type
+ * of the aggregate it returns).  An alloc instruction for
+ * the returned aggregate is pushed on *rap.
+ *
+ * Instructions are emitted in reverse, so the function reads
+ * bottom-up with respect to the generated code: stack
+ * release, copy of the result, call, register arguments,
+ * stack arguments and finally the stack allocation.
+ */
+static void
+selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
+{
+	Ins *i;
+	AClass *ac, *a, aret;
+	int ca, ni, ns;
+	uint stk, off;
+	Ref r, r1, r2, reg[2], regcp[2];
+	RAlloc *ra;
+
+	ac = alloc((i1-i0) * sizeof ac[0]);
+	if (!req(i1->arg[1], R)) {
+		assert(rtype(i1->arg[1]) == RAType);
+		aclass(&aret, &typ[i1->arg[1].val & AMask]);
+		ca = classify(i0, i1, ac, OArg, &aret);
+	} else
+		ca = classify(i0, i1, ac, OArg, 0);
+
+	/* stk is the size of the stack arguments, with
+	 * padding for the 16-byte aligned ones */
+	for (stk=0, a=&ac[i1-i0]; a>ac;)
+		if ((--a)->inmem) {
+			assert(a->align <= 4);
+			stk += a->size;
+			if (a->align == 4)
+				stk += stk & 15;
+		}
+	stk += stk & 15;
+	if (stk) {
+		/* release the argument area after the call */
+		r = getcon(-(int64_t)stk, fn);
+		emit(OSAlloc, Kl, R, r, R);
+	}
+
+	if (!req(i1->arg[1], R)) {
+		if (aret.inmem) {
+			/* get the return location from eax
+			 * it saves one callee-save reg */
+			r1 = newtmp("abi", Kl, fn);
+			emit(OCopy, Kl, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			/* store the return registers in the
+			 * area allocated for the result */
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				regcp[1] = newtmp("abi", aret.cls[1], fn);
+				emit(OStorel, 0, R, regcp[1], r);
+				emit(OAdd, Kl, r, i1->to, getcon(8, fn));
+				chuse(i1->to, +1, fn);
+				ca += 1 << (2 * KBASE(aret.cls[1]));
+			}
+			regcp[0] = newtmp("abi", aret.cls[0], fn);
+			emit(OStorel, 0, R, regcp[0], i1->to);
+			ca += 1 << (2 * KBASE(aret.cls[0]));
+			retr(reg, &aret);
+			if (aret.size > 8)
+				emit(OCopy, aret.cls[1], regcp[1], reg[1], R);
+			emit(OCopy, aret.cls[0], regcp[0], reg[0], R);
+			r1 = i1->to;
+		}
+		/* allocate return pad */
+		ra = alloc(sizeof *ra);
+		assert(NAlign == 3);
+		aret.align -= 2;
+		if (aret.align < 0)
+			aret.align = 0;
+		ra->i.op = OAlloc + aret.align;
+		ra->i.cls = Kl;
+		ra->i.to = r1;
+		ra->i.arg[0] = getcon(aret.size, fn);
+		ra->link = (*rap);
+		*rap = ra;
+	} else {
+		/* scalar result */
+		ra = 0;
+		if (KBASE(i1->cls) == 0) {
+			emit(OCopy, i1->cls, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			emit(OCopy, i1->cls, i1->to, TMP(XMM0), R);
+			ca += 1 << 2;
+		}
+	}
+	emit(OCall, i1->cls, R, i1->arg[0], CALL(ca));
+	/* RAX receives the number of sse registers used */
+	emit(OCopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);
+
+	ni = ns = 0;
+	if (ra && aret.inmem)
+		emit(OCopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */
+	/* arguments passed in registers */
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (a->inmem)
+			continue;
+		r1 = rarg(a->cls[0], &ni, &ns);
+		if (i->op == OArgc) {
+			if (a->size > 8) {
+				r2 = rarg(a->cls[1], &ni, &ns);
+				r = newtmp("abi", Kl, fn);
+				emit(OLoad, a->cls[1], r2, r, R);
+				emit(OAdd, Kl, r, i->arg[1], getcon(8, fn));
+				chuse(i->arg[1], +1, fn);
+			}
+			emit(OLoad, a->cls[0], r1, i->arg[1], R);
+		} else
+			emit(OCopy, i->cls, r1, i->arg[0], R);
+	}
+
+	if (!stk)
+		return;
+
+	/* arguments passed on the stack, r is the base
+	 * of the area allocated by the last emit() below */
+	r = newtmp("abi", Kl, fn);
+	chuse(r, -1, fn);
+	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
+		if (!a->inmem)
+			continue;
+		if (i->op == OArgc) {
+			if (a->align == 4)
+				off += off & 15;
+			blit(r, off, i->arg[1], a->size, fn);
+		} else {
+			r1 = newtmp("abi", Kl, fn);
+			emit(OStorel, 0, R, i->arg[0], r1);
+			emit(OAdd, Kl, r1, r, getcon(off, fn));
+			chuse(r, +1, fn);
+		}
+		off += a->size;
+	}
+	emit(OSAlloc, Kl, r, getcon(stk, fn), R);
+}
+
+/* Lower the parameter instructions [i0, i1) of fn.
+ *
+ * Unlike the rest of the selection, the instructions are
+ * written forwards from insb through curi.  The first loop
+ * fetches every parameter from its register or stack slot;
+ * the second one allocates a local area for each aggregate
+ * passed in registers and stores the registers in it.
+ */
+static void
+selpar(Fn *fn, Ins *i0, Ins *i1)
+{
+	AClass *ac, *a, aret;
+	Ins *i;
+	int ni, ns, s, al;
+	Ref r, r1;
+
+	ac = alloc((i1-i0) * sizeof ac[0]);
+	curi = insb;
+	ni = ns = 0;
+
+	if (fn->retty >= 0) {
+		aclass(&aret, &typ[fn->retty]);
+		if (aret.inmem) {
+			/* save the hidden argument for selret() */
+			r = newtmp("abi", Kl, fn);
+			*curi++ = (Ins){OCopy, r, {rarg(Kl, &ni, &ns)}, Kl};
+			fn->retr = r;
+		}
+		classify(i0, i1, ac, OPar, &aret);
+	} else
+		classify(i0, i1, ac, OPar, 0);
+
+	assert(NAlign == 3);
+
+	/* s is the slot of the next stack parameter,
+	 * in units of 4 bytes */
+	s = 4;
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		switch (a->inmem) {
+		case 1:
+			/* aggregate on the stack, used in place */
+			assert(a->align <= 4);
+			if (a->align == 4)
+				s = (s+3) & -4;
+			fn->tmp[i->to.val].slot = -s; /* HACK! */
+			s += a->size / 4;
+			continue;
+		case 2:
+			/* scalar on the stack */
+			*curi++ = (Ins){OLoad, i->to, {SLOT(-s)}, i->cls};
+			s += 2;
+			continue;
+		}
+		r1 = rarg(a->cls[0], &ni, &ns);
+		if (i->op == OParc) {
+			/* a->cls[] is reused to remember the
+			 * temporaries holding the eightbytes */
+			r = newtmp("abi", Kl, fn);
+			*curi++ = (Ins){OCopy, r, {r1}, Kl};
+			a->cls[0] = r.val;
+			if (a->size > 8) {
+				r1 = rarg(a->cls[1], &ni, &ns);
+				r = newtmp("abi", Kl, fn);
+				*curi++ = (Ins){OCopy, r, {r1}, Kl};
+				a->cls[1] = r.val;
+			}
+		} else
+			*curi++ = (Ins){OCopy, i->to, {r1}, i->cls};
+	}
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (i->op != OParc || a->inmem)
+			continue;
+		assert(NAlign == 3);
+		/* al selects the alloc variant for a->align */
+		for (al=0; a->align >> (al+2); al++)
+			;
+		r = TMP(a->cls[0]);
+		r1 = i->to;
+		*curi++ = (Ins){OAlloc+al, r1, {getcon(a->size, fn)}, Kl};
+		*curi++ = (Ins){OStorel, R, {r, r1}, 0};
+		if (a->size > 8) {
+			r = newtmp("abi", Kl, fn);
+			*curi++ = (Ins){OAdd, r, {r1, getcon(8, fn)}, Kl};
+			r1 = TMP(a->cls[1]);
+			*curi++ = (Ins){OStorel, R, {r1, r}, 0};
+		}
+	}
+}
+
+/* Address number of the reference r: 2 for a constant,
+ * the number stored in ai for a temporary.  Any other
+ * kind of reference is reported through diag().
+ */
+static int
+aref(Ref r, ANum *ai)
+{
+	int t;
+
+	t = rtype(r);
+	if (t == RTmp)
+		return ai[r.val].n;
+	if (t != RCon)
+		diag("isel: aref defaulted");
+	return 2;
+}
+
+/* Returns 1 when r is a CBits constant whose integer
+ * value is 1, 2, 4 or 8 (the constants accepted as the
+ * scale of an s * i product), 0 otherwise.
+ */
+static int
+ascale(Ref r, Con *con)
+{
+	Con *c;
+
+	if (rtype(r) != RCon)
+		return 0;
+	c = &con[r.val];
+	if (c->type != CBits)
+		return 0;
+	switch (c->bits.i) {
+	case 1:
+	case 2:
+	case 4:
+	case 8:
+		return 1;
+	default:
+		return 0;
+	}
+}
+/* Number the values computed in block b for address
+ * matching.  For every instruction defining a temporary
+ * t, ai[t].i is set to that instruction; for add and mul
+ * ai[t].n receives the number of the result (rules and
+ * table below) and ai[t].l, ai[t].r the numbers that
+ * were used for the first and second argument.
+ */
+static void
+anumber(ANum *ai, Blk *b, Con *con)
+{
+	/* This should be made obsolete by a proper
+	 * reassoc pass.
+	 *
+	 * Rules:
+	 *
+	 *   RTmp(_) -> 0    tmp
+	 *   ( RTmp(_) -> 1    slot )
+	 *   RCon(_) -> 2    con
+	 *   0 * 2   -> 3    s * i (when constant is 1,2,4,8)
+	 */
+	static char add[10][10] = {
+		[2] [2] = 2,              /* folding */
+		[2] [5] = 5, [5] [2] = 5,
+		[2] [6] = 6, [6] [2] = 6,
+		[2] [7] = 7, [7] [2] = 7,
+		[0] [0] = 4,              /* 4: b + s * i */
+		[0] [3] = 4, [3] [0] = 4,
+		[2] [3] = 5, [3] [2] = 5, /* 5: o + s * i */
+		[0] [2] = 6, [2] [0] = 6, /* 6: o + b */
+		[2] [4] = 7, [4] [2] = 7, /* 7: o + b + s * i */
+		[0] [5] = 7, [5] [0] = 7,
+		[6] [3] = 7, [3] [6] = 7,
+
+	};
+	int a, a1, a2, n1, n2, t1, t2;
+	Ins *i;
+
+	for (i=b->ins; i-b->ins < b->nins; i++) {
+		if (rtype(i->to) == RTmp)
+			ai[i->to.val].i = i;
+		if (i->op != OAdd && i->op != OMul)
+			continue;
+		a1 = aref(i->arg[0], ai);
+		a2 = aref(i->arg[1], ai);
+		t1 = a1 != 1 && a1 != 2; /* t1, t2: the argument may also be used with number 0 */
+		t2 = a2 != 1 && a2 != 2;
+		if (i->op == OAdd) {
+			a = add[n1 = a1][n2 = a2]; /* keep the combination with the largest number */
+			if (t1 && a < add[0][a2])
+				a = add[n1 = 0][n2 = a2];
+			if (t2 && a < add[a1][0])
+				a = add[n1 = a1][n2 = 0];
+			if (t1 && t2 && a < add[0][0])
+				a = add[n1 = 0][n2 = 0];
+		} else {
+			n1 = n2 = a = 0;
+			if (ascale(i->arg[0], con) && t2)
+				a = 3, n1 = 2, n2 = 0;
+			if (t1 && ascale(i->arg[1], con))
+				a = 3, n1 = 0, n2 = 2;
+		}
+		ai[i->to.val].n = a;
+		ai[i->to.val].l = n1;
+		ai[i->to.val].r = n2;
+	}
+}
+/* Fill the address a from the reference r, following
+ * the numbers stored in ai.  When top is non-zero a is
+ * cleared first; the recursive calls pass 0 and add to
+ * what is already there.  Constants are accumulated in
+ * a->offset with addcon().
+ * NOTE(review): al and ar are only assigned when ai has
+ * a defining instruction for r; case 0 does not read
+ * them, the other cases presumably always have one.
+ */
+static void
+amatch(Addr *a, Ref r, ANum *ai, Fn *fn, int top)
+{
+	Ins *i;
+	int nl, nr, t, s;
+	Ref al, ar;
+
+	if (top)
+		memset(a, 0, sizeof *a);
+	if (rtype(r) == RCon) {
+		addcon(&a->offset, &fn->con[r.val]);
+		return;
+	}
+	assert(rtype(r) == RTmp);
+	i = ai[r.val].i;
+	nl = ai[r.val].l;
+	nr = ai[r.val].r;
+	if (i) { /* order the arguments so that nl <= nr */
+		if (nl > nr) {
+			al = i->arg[1];
+			ar = i->arg[0];
+			t = nl, nl = nr, nr = t;
+		} else {
+			al = i->arg[0];
+			ar = i->arg[1];
+		}
+	}
+	switch (ai[r.val].n) {
+	default:
+		diag("isel: amatch defaulted");
+	case 3: /* s * i */
+		if (!top) {
+			a->index = al;
+			a->scale = fn->con[ar.val].bits.i;
+		} else
+			a->base = r;
+		break;
+	case 4: /* b + s * i */
+		switch (nr) {
+		case 0:
+			if (fn->tmp[ar.val].slot != -1) { /* a temporary with a slot is used as the base */
+				al = i->arg[1];
+				ar = i->arg[0];
+			}
+			a->index = ar;
+			a->scale = 1;
+			break;
+		case 3:
+			amatch(a, ar, ai, fn, 0);
+			break;
+		}
+		r = al; /* no break: al is handled as the base by case 0 */
+	case 0:
+		s = fn->tmp[r.val].slot;
+		if (s != -1)
+			r = SLOT(s);
+		a->base = r;
+		break;
+	case 2: /* constants */
+	case 5: /* o + s * i */
+	case 6: /* o + b */
+	case 7: /* o + b + s * i */
+		amatch(a, ar, ai, fn, 0);
+		amatch(a, al, ai, fn, 0);
+		break;
+	}
+}
+
+/* instruction selection
+ * requires use counts (as given by parsing)
+ *
+ * Steps, in order: reset the slot of every temporary,
+ * lower the parameters at the head of the start block
+ * (selpar), lower calls and returns in every block
+ * (selcall, selret; the start block is done last), give
+ * slots to the constant-sized allocs of the start block,
+ * then run seljmp and sel on every block.  Blocks are
+ * rebuilt backwards in insb, from &insb[NIns] down.
+ */
+void
+isel(Fn *fn)
+{
+	Blk *b, **sb;
+	Ins *i, *i0, *ip;
+	Phi *p;
+	uint a;
+	int n, al;
+	int64_t sz;
+	ANum *ainfo;
+	RAlloc *ral;
+
+	for (n=0; n<fn->ntmp; n++)
+		fn->tmp[n].slot = -1;
+	fn->slot = 0;
+
+	/* lower arguments */
+	for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
+		if (i->op != OPar && i->op != OParc)
+			break;
+	selpar(fn, b->ins, i);
+	n = b->nins - (i - b->ins) + (curi - insb); /* rest of the block + what selpar wrote */
+	i0 = alloc(n * sizeof(Ins));
+	ip = icpy(ip = i0, insb, curi - insb);
+	ip = icpy(ip, i, &b->ins[b->nins] - i);
+	b->nins = n;
+	b->ins = i0;
+
+	/* lower function calls and returns */
+	ral = 0;
+	b = fn->start;
+	do {
+		if (!(b = b->link))
+			b = fn->start; /* do it last */
+		curi = &insb[NIns];
+		selret(b, fn);
+		for (i=&b->ins[b->nins]; i!=b->ins;) {
+			if ((--i)->op == OCall) {
+				for (i0=i; i0>b->ins; i0--) /* i0: first arg/argc of this call */
+					if ((i0-1)->op != OArg)
+					if ((i0-1)->op != OArgc)
+						break;
+				selcall(fn, i0, i, &ral);
+				i = i0;
+				continue;
+			}
+			assert(i->op != OArg && i->op != OArgc);
+			emiti(*i);
+		}
+		if (b == fn->start)
+			for (; ral; ral=ral->link) /* instructions queued by selcall go in the start block */
+				emiti(ral->i);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	} while (b != fn->start);
+
+	if (debug['A']) {
+		fprintf(stderr, "\n> After call lowering:\n");
+		printfn(fn, stderr);
+	}
+
+	/* assign slots to fast allocs */
+	b = fn->start;
+	assert(NAlign == 3 && "change n=4 and sz /= 4 below");
+	for (al=OAlloc, n=4; al<=OAlloc1; al++, n*=2)
+		for (i=b->ins; i-b->ins < b->nins; i++)
+			if (i->op == al) {
+				if (rtype(i->arg[0]) != RCon) /* stop at the first non-constant size for this alignment */
+					break;
+				sz = fn->con[i->arg[0].val].bits.i;
+				if (sz < 0 || sz >= INT_MAX-3)
+					diag("isel: invalid alloc size");
+				sz = (sz + n-1) & -n;
+				sz /= 4;
+				fn->tmp[i->to.val].slot = fn->slot;
+				fn->slot += sz;
+				*i = (Ins){.op = ONop};
+			}
+
+	/* process basic blocks */
+	n = fn->ntmp;
+	ainfo = emalloc(n * sizeof ainfo[0]);
+	for (b=fn->start; b; b=b->link) {
+		curi = &insb[NIns];
+		for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
+			for (p=(*sb)->phi; p; p=p->link) {
+				for (a=0; p->blk[a] != b; a++) /* find the phi argument coming from b */
+					assert(a+1 < p->narg);
+				fixarg(&p->arg[a], p->cls, 1, fn);
+			}
+		memset(ainfo, 0, n * sizeof ainfo[0]);
+		anumber(ainfo, b, fn->con);
+		seljmp(b, fn);
+		for (i=&b->ins[b->nins]; i!=b->ins;)
+			sel(*--i, ainfo, fn);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	}
+	free(ainfo);
+
+	if (debug['I']) {
+		fprintf(stderr, "\n> After instruction selection:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/src/live.c b/src/live.c
@@ -0,0 +1,174 @@
+#include "all.h"
+
+/* Compute in v the set of temporaries live on the edge
+ * from b to s: start from s->in and, for each phi of s
+ * in order, remove its result and add the temporaries
+ * that b supplies as arguments.
+ */
+void
+liveon(BSet *v, Blk *b, Blk *s)
+{
+	Phi *p;
+	uint n;
+
+	bscopy(v, s->in);
+	for (p=s->phi; p; p=p->link) {
+		bsclr(v, p->to.val);
+		for (n=0; n<p->narg; n++) {
+			if (p->blk[n] != b)
+				continue;
+			if (rtype(p->arg[n]) != RTmp)
+				continue;
+			bsset(v, p->arg[n].val);
+		}
+	}
+}
+
+/* Returns the phi field recorded for temporary t, or t
+ * itself when that field is 0.
+ */
+static int
+phitmp(int t, Tmp *tmp)
+{
+	if (tmp[t].phi)
+		return tmp[t].phi;
+	return t;
+}
+/* Record in phi[] that temporary t1 is live.  phi[] is
+ * indexed by the value of phitmp() and holds the
+ * temporary last recorded for that index, 0 when there
+ * is none (filllive clears entries at definitions).
+ */
+static void
+phifix(int t1, short *phi, Tmp *tmp)
+{
+	int t, t2;
+
+	/* detect temporaries arguments
+	 * of the same phi node that
+	 * interfere and separate them
+	 */
+	t = phitmp(t1, tmp);
+	t2 = phi[t];
+	if (t2 && t2 != t1) { /* another temporary is already recorded for t */
+		if (t != t1) {
+			tmp[t1].phi = t1; /* t1 leaves the group and stands for itself */
+			t = t1;
+		} else {
+			tmp[t2].phi = t2; /* t2 leaves the group instead */
+			phi[t2] = t2;
+		}
+	}
+	phi[t] = t1;
+}
+
+/* Mark the use of r in block b.  References that are not
+ * temporaries are ignored.  The temporary is added to
+ * b->gen and recorded with phifix(); when it was not in
+ * b->in yet it is added there and the counter of its
+ * register class in nlv is incremented.
+ */
+static void
+bset(Ref r, Blk *b, int *nlv, short *phi, Tmp *tmp)
+{
+	int t;
+
+	if (rtype(r) != RTmp)
+		return;
+	t = r.val;
+	bsset(b->gen, t);
+	phifix(t, phi, tmp);
+	if (bshas(b->in, t))
+		return;
+	nlv[KBASE(tmp[t].cls)]++;
+	bsset(b->in, t);
+}
+
+/* liveness analysis
+ * requires rpo computation
+ *
+ * Fills the in, out and gen sets and the nlive counters
+ * of every block.  The blocks are visited from the last
+ * to the first entry of f->rpo, and the whole sweep is
+ * repeated until no out set changes.
+ */
+void
+filllive(Fn *f)
+{
+	Blk *b;
+	Ins *i;
+	int k, t, m[2], n, chg, nlv[2];
+	short *phi;
+	BSet u[1], v[1];
+	Mem *ma;
+
+	bsinit(u, f->ntmp);
+	bsinit(v, f->ntmp);
+	phi = emalloc(f->ntmp * sizeof phi[0]);
+	for (b=f->start; b; b=b->link) {
+		bsinit(b->in, f->ntmp);
+		bsinit(b->out, f->ntmp);
+		bsinit(b->gen, f->ntmp);
+	}
+	chg = 1;
+Again:
+	for (n=f->nblk-1; n>=0; n--) {
+		b = f->rpo[n];
+
+		bscopy(u, b->out); /* u: previous out set, to detect changes */
+		if (b->s1) {
+			liveon(v, b, b->s1);
+			bsunion(b->out, v);
+		}
+		if (b->s2) {
+			liveon(v, b, b->s2);
+			bsunion(b->out, v);
+		}
+		chg |= !bsequal(b->out, u);
+
+		memset(phi, 0, f->ntmp * sizeof phi[0]);
+		memset(nlv, 0, sizeof nlv);
+		bscopy(b->in, b->out); /* b->in is the running live set while walking backwards */
+		for (t=0; t<f->ntmp; t++)
+			if (bshas(b->in, t)) {
+				phifix(t, phi, f->tmp);
+				nlv[KBASE(f->tmp[t].cls)]++;
+			}
+		if (rtype(b->jmp.arg) == RACall) {
+			assert(bscount(b->in) == 0 && nlv[0] == 0 && nlv[1] == 0);
+			b->in->t[0] |= retregs(b->jmp.arg, nlv);
+		} else
+			bset(b->jmp.arg, b, nlv, phi, f->tmp);
+		for (k=0; k<2; k++)
+			b->nlive[k] = nlv[k];
+		for (i=&b->ins[b->nins]; i!=b->ins;) {
+			if ((--i)->op == OCall && rtype(i->arg[1]) == RACall) {
+				b->in->t[0] &= ~retregs(i->arg[1], m); /* return registers are defined by the call */
+				for (k=0; k<2; k++)
+					nlv[k] -= m[k];
+				if (nlv[0] + NISave > b->nlive[0])
+					b->nlive[0] = nlv[0] + NISave;
+				if (nlv[1] + NFSave > b->nlive[1])
+					b->nlive[1] = nlv[1] + NFSave;
+				b->in->t[0] |= argregs(i->arg[1], m); /* argument registers are used by the call */
+				for (k=0; k<2; k++)
+					nlv[k] += m[k];
+			}
+			if (!req(i->to, R)) {
+				assert(rtype(i->to) == RTmp);
+				t = i->to.val;
+				if (bshas(b->in, i->to.val))
+					nlv[KBASE(f->tmp[t].cls)]--;
+				bsset(b->gen, t);
+				bsclr(b->in, t);
+				phi[phitmp(t, f->tmp)] = 0;
+			}
+			for (k=0; k<2; k++)
+				switch (rtype(i->arg[k])) {
+				case RAMem:
+					ma = &f->mem[i->arg[k].val & AMask];
+					bset(ma->base, b, nlv, phi, f->tmp);
+					bset(ma->index, b, nlv, phi, f->tmp);
+					break;
+				default:
+					bset(i->arg[k], b, nlv, phi, f->tmp);
+					break;
+				}
+			for (k=0; k<2; k++) /* nlive keeps the maximum seen in the block */
+				if (nlv[k] > b->nlive[k])
+					b->nlive[k] = nlv[k];
+		}
+	}
+	if (chg) {
+		chg = 0;
+		goto Again;
+	}
+	free(phi);
+
+	if (debug['L']) {
+		fprintf(stderr, "\n> Liveness analysis:\n");
+		for (b=f->start; b; b=b->link) {
+			fprintf(stderr, "\t%-10sin:   ", b->name);
+			dumpts(b->in, f->tmp, stderr);
+			fprintf(stderr, "\t          out:  ");
+			dumpts(b->out, f->tmp, stderr);
+			fprintf(stderr, "\t          gen:  ");
+			dumpts(b->gen, f->tmp, stderr);
+			fprintf(stderr, "\t          live: ");
+			fprintf(stderr, "%d %d\n", b->nlive[0], b->nlive[1]);
+		}
+	}
+}
diff --git a/src/main.c b/src/main.c
@@ -0,0 +1,117 @@
+#include "all.h"
+#include <ctype.h>
+#include <getopt.h>
+/* Debug switches, indexed by upper-case letter.  main()
+ * sets an entry to 1 for each letter given to -d; the
+ * passes test their own entry before dumping their state.
+ */
+char debug['Z'+1] = {
+	['P'] = 0, /* parsing */
+	['A'] = 0, /* abi lowering */
+	['I'] = 0, /* instruction selection */
+	['L'] = 0, /* liveness */
+	['M'] = 0, /* memory optimization */
+	['N'] = 0, /* ssa construction */
+	['C'] = 0, /* copy elimination */
+	['S'] = 0, /* spilling */
+	['R'] = 0, /* reg. allocation */
+};
+
+static FILE *outf; /* output stream: stdout unless -o names a file */
+static int dbg; /* set to 1 as soon as one -d letter was given */
+
+/* Data callback handed to parse(): forwards each item to
+ * emitdat() on outf.  A DEnd item first writes the end
+ * marker and calls freeall().  Nothing is done when a
+ * debug flag is set.
+ */
+static void
+data(Dat *d)
+{
+	if (!dbg) {
+		if (d->type == DEnd) {
+			fputs("/* end data */\n\n", outf);
+			freeall();
+		}
+		emitdat(d, outf);
+	}
+}
+
+/* Function callback handed to parse(): runs the passes
+ * on fn in order, relinks the blocks following the final
+ * rpo order and, unless a debug flag is set, emits the
+ * function on outf.  freeall() is called at the end.
+ */
+static void
+func(Fn *fn)
+{
+	int n;
+
+	if (dbg)
+		fprintf(stderr, "**** Function %s ****", fn->name);
+	if (debug['P']) {
+		fprintf(stderr, "\n> After parsing:\n");
+		printfn(fn, stderr);
+	}
+
+	/* analyses and optimizations */
+	fillrpo(fn);
+	fillpreds(fn);
+	filluse(fn);
+	memopt(fn);
+	ssa(fn);
+	filluse(fn);
+	copy(fn);
+	filluse(fn);
+
+	/* code generation */
+	isel(fn);
+	filllive(fn);
+	fillcost(fn);
+	spill(fn);
+	rega(fn);
+
+	/* chain the blocks in rpo order, last one ends the list */
+	fillrpo(fn);
+	assert(fn->rpo[0] == fn->start);
+	for (n=0; n != fn->nblk-1; n++)
+		fn->rpo[n]->link = fn->rpo[n+1];
+	fn->rpo[n]->link = 0;
+
+	if (dbg)
+		fprintf(stderr, "\n");
+	else {
+		emitfn(fn, outf);
+		fprintf(outf, "/* end function %s */\n\n", fn->name);
+	}
+	freeall();
+}
+
+/* Entry point.  Options:
+ *
+ *   -d <flags>  set the debug switch of each letter in
+ *               <flags>; no code is emitted in that case
+ *   -o <out>    write the output to <out> ("-": stdout)
+ *
+ * Every remaining argument is parsed as an input file;
+ * "-" or no argument at all means standard input.
+ * Exits with status 1 when a file cannot be opened or
+ * an option is invalid, 0 otherwise.
+ */
+int
+main(int ac, char *av[])
+{
+	FILE *inf;
+	char *f;
+	int c, fl;
+
+	outf = stdout;
+	while ((c = getopt(ac, av, "d:o:")) != -1)
+		switch (c) {
+		case 'd':
+			for (; *optarg; optarg++) {
+				/* ctype functions need an unsigned char
+				 * value, and the index must stay inside
+				 * debug[]
+				 */
+				fl = toupper((unsigned char)*optarg);
+				if (fl >= 'A' && fl <= 'Z') {
+					debug[fl] = 1;
+					dbg = 1;
+				}
+			}
+			break;
+		case 'o':
+			if (strcmp(optarg, "-") != 0) {
+				outf = fopen(optarg, "w");
+				/* fail early instead of writing to a
+				 * null stream later on
+				 */
+				if (!outf) {
+					fprintf(stderr, "cannot open '%s'\n", optarg);
+					exit(1);
+				}
+			}
+			break;
+		default:
+			fprintf(stderr, "usage: %s [-d <flags>] [-o out] {file.ssa, -}\n", av[0]);
+			exit(1);
+		}
+
+	do {
+		f = av[optind];
+		if (!f || strcmp(f, "-") == 0) {
+			inf = stdin;
+			f = "-";
+		} else {
+			inf = fopen(f, "r");
+			if (!inf) {
+				fprintf(stderr, "cannot open '%s'\n", f);
+				exit(1);
+			}
+		}
+		parse(inf, f, data, func);
+	} while (++optind < ac);
+
+	if (!dbg)
+		emitfin(outf);
+
+	exit(0);
+}
diff --git a/src/mem.c b/src/mem.c
@@ -0,0 +1,81 @@
+#include "all.h"
+
+/* Memory optimization:
+ *
+ * - replace alloced slots used only in
+ *   load/store operations
+ *   Assumption: all the accesses have the
+ *   same size (this could be wrong...)
+ */
+
+/* require use, maintains use counts
+ *
+ * For each alloc of the start block whose result is only
+ * used as the address of loads and stores, the alloc is
+ * turned into a nop, the stores into copies to the former
+ * address temporary and the loads into copies or
+ * extensions from it.
+ */
+void
+memopt(Fn *fn)
+{
+	Blk *b;
+	Ins *i, *l;
+	Tmp *t;
+	Use *u, *ue;
+	int a;
+
+	b = fn->start;
+	for (i=b->ins; i-b->ins < b->nins; i++) {
+		if (OAlloc > i->op || i->op > OAlloc1)
+			continue;
+		assert(NAlign == 3);
+		assert(rtype(i->to) == RTmp);
+		t = &fn->tmp[i->to.val];
+		for (u=t->use; u != &t->use[t->nuse]; u++) { /* give up on any other kind of use */
+			if (u->type != UIns)
+				goto NextIns;
+			l = u->u.ins;
+			if (!isload(l->op)
+			&& (!isstore(l->op) || req(i->to, l->arg[0]))) /* storing the address itself also disqualifies */
+				goto NextIns;
+		}
+		/* get rid of the alloc and replace uses */
+		*i = (Ins){.op = ONop};
+		t->ndef--;
+		ue = &t->use[t->nuse]; /* fixed now: t->nuse is decremented inside the loop */
+		for (u=t->use; u!=ue; u++) {
+			l = u->u.ins;
+			if (isstore(l->op)) {
+				if (l->op == OStores)
+					l->cls = Kd; /* NOTE(review): stores also gets Kd; confirm Ks was not intended */
+				else if (l->op == OStored)
+					l->cls = Kd;
+				else if (l->op == OStorel)
+					l->cls = Kl;
+				else
+					l->cls = Kw;
+				l->op = OCopy;
+				l->to = l->arg[1];
+				l->arg[1] = R;
+				t->nuse--; /* the use became a definition */
+				t->ndef++;
+			} else
+				/* try to turn loads into copies so we
+				 * can eliminate them later */
+				switch(l->op) {
+				case OLoad:
+					l->op = OCopy;
+					break;
+				case OLoadsw:
+				case OLoaduw:
+					l->cls = Kw;
+					l->op = OCopy;
+					break;
+				default:
+					/* keep l->cls */
+					a = l->op - OLoadsw; /* same offset in the ext opcodes as in the load opcodes */
+					l->op = OExtsw + a;
+					break;
+				}
+		}
+	NextIns:;
+	}
+	if (debug['M']) {
+		fprintf(stderr, "\n> After memory optimization:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/src/parse.c b/src/parse.c
@@ -0,0 +1,1081 @@
+#include "all.h"
+#include <ctype.h>
+#include <stdarg.h>
+/* Extra argument classes, used through the A() macro
+ * in the operation table below.
+ */
+enum {
+	Kx = -1, /* Invalid operand */
+	Km = Kl, /* Memory pointer (for x64) */
+};
+/* Operation table, indexed by opcode.  A(a,b,c,d) builds
+ * the class array of one argument: its entries are
+ * indexed by Kw, Kl, Ks, Kd (presumably the class of the
+ * instruction) and give the class required for that
+ * argument (x: invalid, m: memory pointer).
+ * NOTE(review): the meaning of the NM, SF and LF columns
+ * comes with the OpDesc declaration, not visible here.
+ */
+OpDesc opdesc[NOp] = {
+#define A(a,b,c,d) {[Kw]=K##a, [Kl]=K##b, [Ks]=K##c, [Kd]=K##d}
+
+	/*            NAME       NM      ARGCLS0     ARGCLS1  SF LF */
+	[OAdd]    = { "add",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[OSub]    = { "sub",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[ODiv]    = { "div",      2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
+	[ORem]    = { "rem",      2, {A(w,l,x,x), A(w,l,x,x)}, 0, 0 },
+	[OUDiv]   = { "udiv",     2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
+	[OURem]   = { "urem",     2, {A(w,l,x,x), A(w,l,x,x)}, 0, 0 },
+	[OMul]    = { "mul",      2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
+	[OAnd]    = { "and",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[OOr]     = { "or",       2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[OXor]    = { "xor",      2, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[OSar]    = { "sar",      1, {A(w,l,x,x), A(w,w,x,x)}, 1, 0 },
+	[OShr]    = { "shr",      1, {A(w,l,x,x), A(w,w,x,x)}, 1, 0 },
+	[OShl]    = { "shl",      1, {A(w,l,x,x), A(w,w,x,x)}, 1, 0 },
+	[OStored] = { "stored",   0, {A(d,d,d,d), A(m,m,m,m)}, 0, 1 },
+	[OStores] = { "stores",   0, {A(s,s,s,s), A(m,m,m,m)}, 0, 1 },
+	[OStorel] = { "storel",   0, {A(l,l,l,l), A(m,m,m,m)}, 0, 1 },
+	[OStorew] = { "storew",   0, {A(w,w,w,w), A(m,m,m,m)}, 0, 1 },
+	[OStoreh] = { "storeh",   0, {A(w,w,w,w), A(m,m,m,m)}, 0, 1 },
+	[OStoreb] = { "storeb",   0, {A(w,w,w,w), A(m,m,m,m)}, 0, 1 },
+	[OLoad]   = { "load",     0, {A(m,m,m,m), A(x,x,x,x)}, 0, 1 },
+	[OLoadsw] = { "loadsw",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OLoaduw] = { "loaduw",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OLoadsh] = { "loadsh",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OLoaduh] = { "loaduh",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OLoadsb] = { "loadsb",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OLoadub] = { "loadub",   0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OExtsw]  = { "extsw",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExtuw]  = { "extuw",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExtsh]  = { "extsh",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExtuh]  = { "extuh",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExtsb]  = { "extsb",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExtub]  = { "extub",    0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OExts]   = { "exts",     0, {A(w,w,w,w), A(x,x,x,x)}, 0, 1 },
+	[OTruncd] = { "truncd",   0, {A(d,d,d,d), A(x,x,x,x)}, 0, 1 },
+	[OFtosi]  = { "ftosi",    0, {A(s,d,x,x), A(x,x,x,x)}, 0, 1 },
+	[OSitof]  = { "sitof",    0, {A(x,x,w,l), A(x,x,x,x)}, 0, 1 },
+	[OCast]   = { "cast",     0, {A(s,d,w,l), A(x,x,x,x)}, 0, 1 },
+	[OCopy]   = { "copy",     1, {A(w,l,s,d), A(x,x,x,x)}, 0, 1 },
+	[ONop]    = { "nop",      0, {A(x,x,x,x), A(x,x,x,x)}, 0, 1 },
+	[OSwap]   = { "swap",     2, {A(w,l,s,d), A(w,l,s,d)}, 0, 0 },
+	[OSign]   = { "sign",     0, {A(w,l,x,x), A(x,x,x,x)}, 0, 0 },
+	[OSAlloc] = { "salloc",   0, {A(x,l,x,x), A(x,x,x,x)}, 0, 0 },
+	[OXDiv]   = { "xdiv",     1, {A(w,l,x,x), A(x,x,x,x)}, 0, 0 },
+	[OXCmp]   = { "xcmp",     1, {A(w,l,s,d), A(w,l,s,d)}, 1, 0 },
+	[OXTest]  = { "xtest",    1, {A(w,l,x,x), A(w,l,x,x)}, 1, 0 },
+	[OAddr]   = { "addr",     0, {A(m,m,x,x), A(x,x,x,x)}, 0, 1 },
+	[OPar]    = { "parn",     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OParc]   = { "parc",     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OArg]    = { "arg",      0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OArgc]   = { "argc",     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OCall]   = { "call",     0, {A(m,m,m,m), A(x,x,x,x)}, 0, 0 },
+	[OXSetnp] = { "xsetnp",   0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OXSetp]  = { "xsetp",    0, {A(x,x,x,x), A(x,x,x,x)}, 0, 0 },
+	[OAlloc]   = { "alloc4",  1, {A(l,l,l,l), A(x,x,x,x)}, 0, 0 },
+	[OAlloc+1] = { "alloc8",  1, {A(l,l,l,l), A(x,x,x,x)}, 0, 0 },
+	[OAlloc+2] = { "alloc16", 1, {A(l,l,l,l), A(x,x,x,x)}, 0, 0 },
+#define X(c) \
+	[OCmpw+IC##c] = { "c"    #c "w", 0, {A(w,w,x,x), A(w,w,x,x)}, 1, 0 }, \
+	[OCmpl+IC##c] = { "c"    #c "l", 0, {A(l,l,x,x), A(l,l,x,x)}, 1, 0 }, \
+	[OXSet+IC##c] = { "xset" #c,     0, {A(x,x,x,x), A(x,x,x,x)}, 0, 1 },
+	ICMPS(X)
+#undef X
+#define X(c) \
+	[OCmps+FC##c] = { "c"    #c "s", 0, {A(s,s,x,x), A(s,s,x,x)}, 1, 0 }, \
+	[OCmpd+FC##c] = { "c"    #c "d", 0, {A(d,d,x,x), A(d,d,x,x)}, 1, 0 },
+	FCMPS(X)
+#undef X
+
+};
+#undef A
+/* Parser state inside a function body: the value given
+ * to and returned by parseline().
+ */
+typedef enum {
+	PXXX,
+	PLbl, /* at the start and after a jump: only a label or } may follow */
+	PPhi, /* after a label or a phi: phis are still accepted */
+	PIns, /* after a regular instruction */
+	PEnd, /* the closing } was read */
+} PState;
+/* Token codes.  They start at NPubOp because lex()
+ * returns the opcode itself, a value below NPubOp, for
+ * the name of an operation.  TXXX doubles as "no token".
+ */
+enum {
+	TXXX = NPubOp,
+	TCall, /* keywords, see tmap in lex() */
+	TPhi,
+	TJmp,
+	TJnz,
+	TRet,
+	TFunc,
+	TType,
+	TData,
+	TAlign,
+	TL,
+	TW,
+	TH,
+	TB,
+	TD,
+	TS,
+	TZ,
+
+	TInt, /* tokens carrying a value in tokval */
+	TFlts,
+	TFltd,
+	TTmp,
+	TLbl,
+	TGlo,
+	TTyp,
+	TStr,
+
+	TPlus, /* punctuation */
+	TEq,
+	TComma,
+	TLParen,
+	TRParen,
+	TLBrace,
+	TRBrace,
+	TNL,
+	TEOF,
+};
+
+/* Lexer state, set up by parse(). */
+static FILE *inf; /* stream being read */
+static char *inpath; /* its name, for the messages of err() */
+static int thead; /* token read ahead by peek(), TXXX when there is none */
+static struct {
+	char chr; /* first character of the last token */
+	double fltd; /* value of a TFltd token */
+	float flts; /* value of a TFlts token */
+	int64_t num; /* value of a TInt token */
+	char *str; /* text of a name or string token */
+} tokval;
+static int lnum; /* current line number, for the messages of err() */
+
+static Tmp *tmp; /* tables of the function being parsed, reset by parsefn() */
+static Con *con;
+static int ntmp;
+static int ncon;
+static Phi **plink; /* where the next phi of the current block gets linked */
+static Blk **bmap; /* blocks created by findblk() so far */
+static Blk *curb; /* block being filled, 0 before the first label */
+static Blk **blink; /* where the next block gets linked */
+static int nblk;
+static int rcls; /* return class of the function: a K* class, 4 for a type, 5 for none */
+static int ntyp; /* number of entries used in typ[] */
+
+
+void
+err(char *s, ...)
+{
+	char buf[100], *p, *end;
+	va_list ap;
+
+	p = buf;
+	end = buf + sizeof(buf);
+
+	va_start(ap, s);
+	p += snprintf(p, end - p, "%s:%d: ", inpath, lnum);
+	p += vsnprintf(p, end - p, s, ap);
+	va_end(ap);
+
+	diag(buf);
+}
+
+/* Read the next token from inf and return its code: a
+ * T* constant, or the opcode itself for the name of a
+ * public operation.  The value of the token, if any, is
+ * left in tokval; for names tokval.str points to a static
+ * buffer overwritten by the next name.  A comment (from #
+ * to the end of the line) is returned as TNL.
+ * Invalid input is reported with err().
+ */
+static int
+lex()
+{
+	static struct {
+		char *str;
+		int tok;
+	} tmap[] = {
+		{ "call", TCall },
+		{ "phi", TPhi },
+		{ "jmp", TJmp },
+		{ "jnz", TJnz },
+		{ "ret", TRet },
+		{ "function", TFunc },
+		{ "type", TType },
+		{ "data", TData },
+		{ "align", TAlign },
+		{ "l", TL },
+		{ "w", TW },
+		{ "h", TH },
+		{ "b", TB },
+		{ "d", TD },
+		{ "s", TS },
+		{ "z", TZ },
+		{ "loadw", OLoad }, /* for convenience */
+		{ "loadl", OLoad },
+		{ "loads", OLoad },
+		{ "loadd", OLoad },
+		{ "alloc1", OAlloc },
+		{ "alloc2", OAlloc },
+		{ 0, TXXX }
+	};
+	static char tok[NString];
+	int c, i;
+	int t;
+
+	do
+		c = fgetc(inf);
+	while (isblank(c));
+	t = TXXX;
+	tokval.chr = c;
+	switch (c) {
+	case EOF:
+		return TEOF;
+	case ',':
+		return TComma;
+	case '(':
+		return TLParen;
+	case ')':
+		return TRParen;
+	case '{':
+		return TLBrace;
+	case '}':
+		return TRBrace;
+	case '=':
+		return TEq;
+	case '+':
+		return TPlus;
+	case 's':
+		/* s_<float>; anything else is lexed as a name below */
+		if (fscanf(inf, "_%f", &tokval.flts) != 1)
+			break;
+		return TFlts;
+	case 'd':
+		if (fscanf(inf, "_%lf", &tokval.fltd) != 1)
+			break;
+		return TFltd;
+	case '%':
+		t = TTmp;
+		goto Alpha;
+	case '@':
+		t = TLbl;
+		goto Alpha;
+	case '$':
+		t = TGlo;
+		goto Alpha;
+	case ':':
+		t = TTyp;
+		goto Alpha;
+	case '#':
+		/* skip the comment; also stop at the end of the
+		 * file so that a last line without newline does
+		 * not loop forever
+		 */
+		do
+			c = fgetc(inf);
+		while (c != '\n' && c != EOF);
+		/* fall through */
+	case '\n':
+		lnum++;
+		return TNL;
+	}
+	if (isdigit(c) || c == '-' || c == '+') {
+		ungetc(c, inf);
+		if (fscanf(inf, "%"SCNd64, &tokval.num) != 1)
+			err("invalid integer literal");
+		return TInt;
+	}
+	if (c == '"') {
+		tokval.str = vnew(0, 1);
+		for (i=0;; i++) {
+			c = fgetc(inf);
+			if (c == EOF)
+				err("unterminated string");
+			vgrow(&tokval.str, i+1);
+			/* a quote ends the string unless it follows a backslash */
+			if (c == '"')
+			if (!i || tokval.str[i-1] != '\\') {
+				tokval.str[i] = 0;
+				return TStr;
+			}
+			tokval.str[i] = c;
+		}
+	}
+	/* names: after a sigil the first character is still
+	 * to be read, otherwise it is already in c
+	 */
+	if (0)
+Alpha:		c = fgetc(inf);
+	if (!isalpha(c) && c != '.' && c != '_')
+		err("lexing failure: invalid character %c (%d)", c, c);
+	i = 0;
+	do {
+		if (i >= NString-1)
+			err("identifier too long");
+		tok[i++] = c;
+		c = fgetc(inf);
+	} while (isalpha(c) || c == '$' || c == '.' || c == '_' || isdigit(c));
+	tok[i] = 0;
+	ungetc(c, inf);
+	tokval.str = tok;
+	if (t != TXXX) {
+		return t;
+	}
+	/* bare word: operation name first, then keyword */
+	for (i=0; i<NPubOp; i++)
+		if (opdesc[i].name)
+		if (strcmp(tok, opdesc[i].name) == 0)
+			return i;
+	for (i=0; tmap[i].str; i++)
+		if (strcmp(tok, tmap[i].str) == 0)
+			return tmap[i].tok;
+	err("unknown keyword %s", tokval.str);
+	return TXXX;
+}
+
+/* Returns the next token without consuming it; the token
+ * is read with lex() when none is pending in thead.
+ */
+static int
+peek()
+{
+	if (thead != TXXX)
+		return thead;
+	return thead = lex();
+}
+
+/* Consumes and returns the next token: the pending one
+ * when there is one, a freshly lexed one otherwise.
+ */
+static int
+next()
+{
+	int tok;
+
+	tok = thead;
+	if (tok == TXXX)
+		tok = lex();
+	thead = TXXX;
+	return tok;
+}
+
+/* Consumes tokens up to and including the first one that
+ * is not a newline, and returns it.
+ */
+static int
+nextnl()
+{
+	int tok;
+
+	do
+		tok = next();
+	while (tok == TNL);
+	return tok;
+}
+
+static void
+expect(int t)
+{
+	static char *ttoa[] = {
+		[TLbl] = "label",
+		[TComma] = ",",
+		[TEq] = "=",
+		[TNL] = "newline",
+		[TLParen] = "(",
+		[TRParen] = ")",
+		[TLBrace] = "{",
+		[TRBrace] = "}",
+		[TEOF] = 0,
+	};
+	char buf[128], *s1, *s2;
+	int t1;
+
+	t1 = next();
+	if (t == t1)
+		return;
+	s1 = ttoa[t] ? ttoa[t] : "??";
+	s2 = ttoa[t1] ? ttoa[t1] : "??";
+	sprintf(buf, "%s expected, got %s instead", s1, s2);
+	err(buf);
+}
+
+/* Returns the reference of the temporary named v,
+ * appending a new entry to tmp[] when the name is not
+ * found among the entries from Tmp0 on.
+ */
+static Ref
+tmpref(char *v)
+{
+	int t;
+
+	t = Tmp0;
+	while (t < ntmp) {
+		if (strcmp(v, tmp[t].name) == 0)
+			return TMP(t);
+		t++;
+	}
+	/* unknown name: t is the index of the new entry */
+	ntmp++;
+	vgrow(&tmp, ntmp);
+	strcpy(tmp[t].name, v);
+	return TMP(t);
+}
+/* Parse one operand and return its reference: a
+ * temporary, or a constant (integer, float or global
+ * address) that is looked up in con[] and appended when
+ * missing.  Returns R, without raising an error, when
+ * the next token is none of these; the token is consumed
+ * in every case.
+ */
+static Ref
+parseref()
+{
+	Con c;
+	int i;
+
+	memset(&c, 0, sizeof c); /* the lookup below compares fields not set in every case */
+	switch (next()) {
+	case TTmp:
+		return tmpref(tokval.str);
+	case TInt:
+		c.type = CBits;
+		c.bits.i = tokval.num;
+		goto Look;
+	case TFlts:
+		c.type = CBits;
+		c.bits.s = tokval.flts;
+		c.flt = 1;
+		goto Look;
+	case TFltd:
+		c.type = CBits;
+		c.bits.d = tokval.fltd;
+		c.flt = 2;
+		goto Look;
+	case TGlo:
+		c.type = CAddr;
+		strcpy(c.label, tokval.str);
+	Look:
+		for (i=0; i<ncon; i++) /* NOTE(review): flt is not compared, constants with equal bits are shared */
+			if (con[i].type == c.type
+			&& con[i].bits.i == c.bits.i
+			&& strcmp(con[i].label, c.label) == 0)
+				return CON(i);
+		vgrow(&con, ++ncon);
+		con[i] = c;
+		return CON(i);
+	default:
+		return R;
+	}
+}
+
+/* Parse a class specifier.  Returns Kw, Kl, Ks or Kd for
+ * the letters w, l, s, d; for the name of a defined type
+ * returns 4 and stores the index of the type in *tyn.
+ * Anything else is reported with err().
+ */
+static int
+parsecls(int *tyn)
+{
+	int i, t;
+
+	t = next();
+	if (t == TW)
+		return Kw;
+	if (t == TL)
+		return Kl;
+	if (t == TS)
+		return Ks;
+	if (t == TD)
+		return Kd;
+	if (t != TTyp)
+		err("invalid class specifier");
+	for (i=0; i<ntyp; i++)
+		if (strcmp(tokval.str, typ[i].name) == 0) {
+			*tyn = i;
+			return 4;
+		}
+	err("undefined type");
+	return Kw;
+}
+
+/* Parse a parenthesized, comma separated list of
+ * "class reference" items and append one instruction per
+ * item at curi: arg/argc when arg is non-zero (call
+ * arguments), par/parc otherwise (function parameters,
+ * which must be temporaries).  Items whose class is a
+ * type name give the argc/parc forms.
+ */
+static void
+parserefl(int arg)
+{
+	int cls, tok, tyn;
+	Ref r;
+
+	expect(TLParen);
+	if (peek() == TRParen) {
+		/* empty list */
+		next();
+		return;
+	}
+	do {
+		if (curi - insb >= NIns)
+			err("too many instructions (1)");
+		cls = parsecls(&tyn);
+		r = parseref();
+		if (req(r, R))
+			err("invalid reference argument");
+		if (!arg && rtype(r) != RTmp)
+			err("invalid function parameter");
+		if (cls == 4 && arg)
+			*curi = (Ins){OArgc, R, {TYPE(tyn), r}, Kl};
+		else if (cls == 4)
+			*curi = (Ins){OParc, r, {TYPE(tyn)}, Kl};
+		else if (arg)
+			*curi = (Ins){OArg, R, {r}, cls};
+		else
+			*curi = (Ins){OPar, r, {R}, cls};
+		curi++;
+		tok = next();
+		if (tok != TRParen && tok != TComma)
+			err(", or ) expected");
+	} while (tok != TRParen);
+}
+
+/* Returns the block called name, creating it with
+ * blknew() and recording it in bmap[] on first use.
+ */
+static Blk *
+findblk(char *name)
+{
+	Blk *b;
+	int n;
+
+	for (n=0; n<nblk; n++) {
+		b = bmap[n];
+		if (strcmp(b->name, name) == 0)
+			return b;
+	}
+	/* not found: n is the index of the new entry */
+	nblk++;
+	vgrow(&bmap, nblk);
+	b = blknew();
+	bmap[n] = b;
+	strcpy(b->name, name);
+	return b;
+}
+
+/* Finish the current block: the instructions gathered in
+ * insb are copied into it with idup(), the buffer is
+ * emptied and blink is pointed at the block's link field
+ * so that the next block gets chained after it.
+ */
+static void
+closeblk()
+{
+	Blk *b;
+
+	b = curb;
+	b->nins = curi - insb;
+	idup(&b->ins, insb, b->nins);
+	curi = insb;
+	blink = &b->link;
+}
+/* Parse one line of a function body: a label, a phi, an
+ * instruction, a jump or the closing brace.  ps is the
+ * state left by the previous line; the return value is
+ * the state for the next one (PEnd after the brace).
+ * Instructions are appended at curi, phis are linked
+ * through plink, jumps close the current block.
+ */
+static PState
+parseline(PState ps)
+{
+	Ref arg[NPred] = {R};
+	Blk *blk[NPred];
+	Phi *phi;
+	Ref r;
+	Blk *b;
+	int t, op, i, k, ty;
+
+	t = nextnl();
+	if (ps == PLbl && t != TLbl && t != TRBrace)
+		err("label or } expected");
+	switch (t) {
+	default:
+		if (isstore(t)) {
+			/* operations without result */
+			r = R;
+			k = 0;
+			op = t;
+			goto DoOp;
+		}
+		err("label, instruction or jump expected");
+	case TRBrace:
+		return PEnd;
+	case TTmp: /* "%tmp =class op ...", handled after the switch */
+		break;
+	case TLbl:
+		b = findblk(tokval.str);
+		if (b->jmp.type != JXXX)
+			err("multiple definitions of block");
+		if (curb && curb->jmp.type == JXXX) { /* previous block has no jump: it continues into b */
+			closeblk();
+			curb->jmp.type = JJmp;
+			curb->s1 = b;
+		}
+		*blink = b;
+		curb = b;
+		plink = &curb->phi;
+		expect(TNL);
+		return PPhi;
+	case TRet:
+		curb->jmp.type = (int[]){
+			JRetw, JRetl,
+			JRets, JRetd,
+			JRetc, JRet0
+		}[rcls];
+		if (rcls < 5) { /* rcls 5: the function returns nothing */
+			r = parseref();
+			if (req(r, R))
+				err("return value expected");
+			curb->jmp.arg = r;
+		}
+		goto Close;
+	case TJmp:
+		curb->jmp.type = JJmp;
+		goto Jump;
+	case TJnz:
+		curb->jmp.type = JJnz;
+		r = parseref();
+		if (req(r, R))
+			err("invalid argument for jnz jump");
+		curb->jmp.arg = r;
+		expect(TComma);
+	Jump:
+		expect(TLbl);
+		curb->s1 = findblk(tokval.str);
+		if (curb->jmp.type != JJmp) {
+			expect(TComma);
+			expect(TLbl);
+			curb->s2 = findblk(tokval.str);
+		}
+	Close:
+		expect(TNL);
+		closeblk();
+		return PLbl;
+	}
+	r = tmpref(tokval.str);
+	expect(TEq);
+	k = parsecls(&ty);
+	op = next();
+DoOp:
+	if (op == TPhi) {
+		if (ps != PPhi)
+			err("unexpected phi instruction");
+		op = -1; /* -1 marks a phi from here on */
+	}
+	if (op == TCall) {
+		arg[0] = parseref();
+		parserefl(1);
+		expect(TNL);
+		op = OCall;
+		if (k == 4) { /* type returned by the call goes in the second argument */
+			k = Kl;
+			arg[1] = TYPE(ty);
+		} else
+			arg[1] = R;
+		goto Ins;
+	}
+	if (k == 4)
+		err("size class must be w, l, s, or d");
+	if (op >= NPubOp)
+		err("invalid instruction");
+	i = 0;
+	if (peek() != TNL)
+		for (;;) {
+			if (i == NPred)
+				err("too many arguments");
+			if (op == -1) { /* phi arguments are "@label value" pairs */
+				expect(TLbl);
+				blk[i] = findblk(tokval.str);
+			}
+			arg[i] = parseref();
+			if (req(arg[i], R))
+				err("invalid instruction argument");
+			i++;
+			t = peek();
+			if (t == TNL)
+				break;
+			if (t != TComma)
+				err(", or end of line expected");
+			next();
+		}
+	next();
+	if (op != -1) {
+	Ins:
+		if (curi - insb >= NIns)
+			err("too many instructions (2)");
+		curi->op = op;
+		curi->cls = k;
+		curi->to = r;
+		curi->arg[0] = arg[0];
+		curi->arg[1] = arg[1];
+		curi++;
+		return PIns;
+	} else {
+		phi = alloc(sizeof *phi);
+		phi->to = r;
+		phi->cls = k;
+		memcpy(phi->arg, arg, i * sizeof arg[0]);
+		memcpy(phi->blk, blk, i * sizeof blk[0]);
+		phi->narg = i;
+		*plink = phi;
+		plink = &phi->link;
+		return PPhi;
+	}
+}
+
+/* Parse a function definition, after the "function"
+ * keyword: optional return class, name, parameter list
+ * and body.  The per-function parser state is reset
+ * first; the tables built while parsing are handed over
+ * to the returned Fn.
+ */
+static Fn *
+parsefn()
+{
+	PState ps;
+	Fn *fn;
+
+	/* fresh tables for this function */
+	ntmp = Tmp0;
+	ncon = 1; /* first constant must be 0 */
+	curb = 0;
+	nblk = 0;
+	curi = insb;
+	tmp = vnew(ntmp, sizeof tmp[0]);
+	con = vnew(ncon, sizeof con[0]);
+	bmap = vnew(nblk, sizeof bmap[0]);
+	con[0].type = CBits;
+	fn = alloc(sizeof *fn);
+	blink = &fn->start;
+
+	/* header: [class] $name (parameters) */
+	fn->retty = -1;
+	rcls = 5;
+	if (peek() != TGlo)
+		rcls = parsecls(&fn->retty);
+	if (next() != TGlo)
+		err("function name expected");
+	strcpy(fn->name, tokval.str);
+	parserefl(0);
+
+	/* body, one line at a time */
+	if (nextnl() != TLBrace)
+		err("function body must start with {");
+	for (ps=parseline(PLbl); ps!=PEnd; ps=parseline(ps))
+		;
+	if (!curb)
+		err("empty file");
+	if (curb->jmp.type == JXXX)
+		err("last block misses jump");
+
+	fn->tmp = tmp;
+	fn->ntmp = ntmp;
+	fn->con = con;
+	fn->ncon = ncon;
+	fn->mem = vnew(0, sizeof fn->mem[0]);
+	fn->nmem = 0;
+	fn->nblk = nblk;
+	fn->rpo = 0;
+	return fn;
+}
+
+/* Parse a type definition, after the "type" keyword, and
+ * fill the next free entry of typ[]:
+ *
+ *   :name = [align N] { N }
+ *   :name = [align N] { f [count], f [count], ... }
+ *
+ * The first form gives only a size and requires an
+ * alignment (dark type).  In the second form each f is
+ * one of d, l, s, w, h, b; the fields are laid out in
+ * order with padding segments inserted to align them.
+ * The alignment is stored as a power of two exponent and
+ * defaults to the one of the largest field; the size is
+ * rounded up to it.  A type with NSeg segments or more is
+ * marked dark.
+ */
+static void
+parsetyp()
+{
+	Typ *ty;
+	int t, n, sz, al, s, a, c, flt;
+
+	if (ntyp >= NTyp)
+		err("too many type definitions");
+	ty = &typ[ntyp++];
+	ty->align = -1;
+	if (nextnl() != TTyp ||  nextnl() != TEq)
+		err("type name, then = expected");
+	strcpy(ty->name, tokval.str);
+	t = nextnl();
+	if (t == TAlign) {
+		if (nextnl() != TInt)
+			err("alignment expected");
+		/* al = log2 of the alignment given */
+		for (al=0; tokval.num /= 2; al++)
+			;
+		ty->align = al;
+		t = nextnl();
+	}
+	if (t != TLBrace)
+		err("type body must start with {");
+	t = nextnl();
+	if (t == TInt) {
+		ty->dark = 1;
+		ty->size = tokval.num;
+		if (ty->align == -1)
+			err("dark types need alignment");
+		t = nextnl();
+	} else {
+		ty->dark = 0;
+		n = -1;
+		sz = 0;
+		al = 0;
+		for (;;) {
+			flt = 0;
+			switch (t) {
+			default: err("invalid size specifier %c", tokval.chr);
+			case TD: flt = 1;
+			case TL: s = 8; a = 3; break;
+			case TS: flt = 1;
+			case TW: s = 4; a = 2; break;
+			case TH: s = 2; a = 1; break;
+			case TB: s = 1; a = 0; break;
+			}
+			if (a > al)
+				al = a;
+			/* a becomes the padding needed before the field */
+			if ((a = sz & (s-1))) {
+				a = s - a;
+				if (++n < NSeg) {
+					/* padding segment */
+					ty->seg[n].ispad = 1;
+					ty->seg[n].len = a;
+				}
+			}
+			t = nextnl();
+			if (t == TInt) {
+				c = tokval.num;
+				t = nextnl();
+			} else
+				c = 1;
+			while (c-- > 0) {
+				if (++n < NSeg) {
+					ty->seg[n].isflt = flt;
+					ty->seg[n].ispad = 0;
+					ty->seg[n].len = s;
+				}
+				sz += a + s;
+				/* the padding precedes the first copy
+				 * only: the following ones are already
+				 * aligned and must not count it again
+				 */
+				a = 0;
+			}
+			if (t != TComma)
+				break;
+			t = nextnl();
+		}
+		if (++n >= NSeg)
+			ty->dark = 1;
+		else
+			ty->seg[n].len = 0;
+		if (ty->align == -1)
+			ty->align = al;
+		else
+			al = ty->align;
+		a = (1 << al) - 1;
+		ty->size = (sz + a) & ~a;
+	}
+	if (t != TRBrace)
+		err("expected closing }");
+}
+
+/* Fill d with a reference to the global whose name was
+ * just lexed, followed by an optional "+ offset".
+ */
+static void
+parsedatref(Dat *d)
+{
+	d->isref = 1;
+	d->u.ref.nam = tokval.str;
+	d->u.ref.off = 0;
+	if (peek() != TPlus)
+		return;
+	next();
+	if (next() != TInt)
+		err("invalid token after offset in ref");
+	d->u.ref.off = tokval.num;
+}
+
+/* Fill d with the string literal that was just lexed. */
+static void
+parsedatstr(Dat *d)
+{
+	d->u.str = tokval.str;
+	d->isstr = 1;
+}
+/* Parse a data definition, after the "data" keyword, and
+ * report it piece by piece through cb: DStart, DAlign
+ * when an alignment is given, DName, one item per value
+ * (its type taken from the size letter in front) and
+ * finally DEnd.  The Dat passed to cb is a local that is
+ * reused for every call.
+ */
+static void
+parsedat(void cb(Dat *))
+{
+	char s[NString];
+	int t;
+	Dat d;
+
+	d.type = DStart;
+	d.isstr = 0;
+	d.isref = 0;
+	cb(&d);
+	if (nextnl() != TGlo || nextnl() != TEq)
+		err("data name, then = expected");
+	strcpy(s, tokval.str); /* saved: tokval.str is overwritten by later names */
+	t = nextnl();
+	if (t == TAlign) {
+		if (nextnl() != TInt)
+			err("alignment expected");
+		d.type = DAlign;
+		d.u.num = tokval.num;
+		cb(&d);
+		t = nextnl();
+	}
+	d.type = DName;
+	d.u.str = s;
+	cb(&d);
+
+	if (t != TLBrace)
+		err("expected data contents in { .. }");
+	for (;;) {
+		switch (nextnl()) {
+		default: err("invalid size specifier %c in data", tokval.chr);
+		case TRBrace: goto Done;
+		case TL: d.type = DL; break;
+		case TW: d.type = DW; break;
+		case TH: d.type = DH; break;
+		case TB: d.type = DB; break;
+		case TS: d.type = DW; break;
+		case TD: d.type = DL; break;
+		case TZ: d.type = DZ; break;
+		}
+		t = nextnl();
+		do { /* one cb call per value following the size letter */
+			d.isref = 0;
+			d.isstr = 0;
+			memset(&d.u, 0, sizeof d.u);
+			if (t == TFlts)
+				d.u.flts = tokval.flts;
+			else if (t == TFltd)
+				d.u.fltd = tokval.fltd;
+			else if (t == TInt)
+				d.u.num = tokval.num;
+			else if (t == TGlo)
+				parsedatref(&d);
+			else if (t == TStr)
+				parsedatstr(&d);
+			else
+				err("constant literal expected");
+			cb(&d);
+			t = nextnl();
+		} while (t == TInt || t == TFlts || t == TFltd); /* only numbers can be repeated without a comma */
+		if (t == TRBrace)
+			break;
+		if (t != TComma)
+			err(", or } expected");
+	}
+Done:
+	d.type = DEnd;
+	cb(&d);
+}
+
+/* Parse every top-level definition read from f. The stream and
+ * path are stored in the file-level parser state; each parsed
+ * function is handed to func, data definitions are reported to
+ * data (through parsedat), and types are handled by parsetyp.
+ * Returns when TEOF is read.
+ */
+void
+parse(FILE *f, char *path, void data(Dat *), void func(Fn *))
+{
+	int tk;
+
+	inf = f;
+	inpath = path;
+	lnum = 1;
+	thead = TXXX;
+	ntyp = 0;
+	while ((tk = nextnl()) != TEOF) {
+		if (tk == TFunc)
+			func(parsefn());
+		else if (tk == TType)
+			parsetyp();
+		else if (tk == TData)
+			parsedat(data);
+		else
+			err("top-level definition expected");
+	}
+}
+
+/* Print constant c on f: "$label" with an optional signed offset
+ * for CAddr, a float ("s_"/"d_" prefix) or integer literal for
+ * CBits, and nothing for any other type.
+ */
+static void
+printcon(Con *c, FILE *f)
+{
+	if (c->type == CAddr) {
+		fprintf(f, "$%s", c->label);
+		if (c->bits.i)
+			fprintf(f, "%+"PRIi64, c->bits.i);
+		return;
+	}
+	if (c->type != CBits)
+		return;
+	switch (c->flt) {
+	case 1:
+		fprintf(f, "s_%f", c->bits.s);
+		break;
+	case 2:
+		fprintf(f, "d_%lf", c->bits.d);
+		break;
+	default:
+		fprintf(f, "%"PRIi64, c->bits.i);
+		break;
+	}
+}
+
+/* Print reference r of function fn on f, using a notation that
+ * depends on the reference type (see the cases below).
+ */
+void
+printref(Ref r, Fn *fn, FILE *f)
+{
+	int i;
+	Mem *m;
+
+	switch (rtype(r)) {
+	case RTmp:
+		/* ids below Tmp0 print as R<n>, others by name */
+		if (r.val < Tmp0)
+			fprintf(f, "R%d", r.val);
+		else
+			fprintf(f, "%%%s", fn->tmp[r.val].name);
+		break;
+	case RCon:
+		printcon(&fn->con[r.val], f);
+		break;
+	case RSlot:
+		fprintf(f, "S%d", r.val);
+		break;
+	case RACall:
+		fprintf(f, "%03x", r.val & AMask);
+		break;
+	case RAType:
+		fprintf(f, ":%s", typ[r.val & AMask].name);
+		break;
+	case RAMem:
+		/* [offset + base + scale * index], absent parts are
+		 * skipped; i records whether a " + " is needed */
+		i = 0;
+		m = &fn->mem[r.val & AMask];
+		fputc('[', f);
+		if (m->offset.type != CUndef) {
+			printcon(&m->offset, f);
+			i = 1;
+		}
+		if (!req(m->base, R)) {
+			if (i)
+				fprintf(f, " + ");
+			printref(m->base, fn, f);
+			i = 1;
+		}
+		if (!req(m->index, R)) {
+			if (i)
+				fprintf(f, " + ");
+			fprintf(f, "%d * ", m->scale);
+			printref(m->index, fn, f);
+		}
+		fputc(']', f);
+		break;
+	}
+}
+
+/* Print function fn on f in textual form: a header line, then
+ * for every block (in link order) its label, phis, instructions
+ * and jump, and a closing brace.
+ */
+void
+printfn(Fn *fn, FILE *f)
+{
+	/* mnemonic of each jump type */
+	static char *jtoa[NJmp] = {
+		[JRet0]     = "ret",
+		[JRetw]     = "retw",
+		[JRetl]     = "retl",
+		[JRetc]     = "retc",
+		[JRets]     = "rets",
+		[JRetd]     = "retd",
+		[JJnz]      = "jnz",
+		[JXJnp]     = "xjnp",
+		[JXJp]      = "xjp",
+	#define X(c) [JXJc+IC##c] = "xj" #c,
+		ICMPS(X)
+	#undef X
+	};
+	/* ops whose class letter is appended to the op name when
+	 * the instruction has no destination */
+	static char prcls[NOp] = {
+		[OArg] = 1,
+		[OSwap] = 1,
+		[OXCmp] = 1,
+		[OXTest] = 1,
+		[OXDiv] = 1,
+		[OXIDiv] = 1,
+	};
+	/* class index to class letter */
+	static char ktoc[] = "wlsd";
+	Blk *b;
+	Phi *p;
+	Ins *i;
+	uint n;
+
+	fprintf(f, "function $%s() {\n", fn->name);
+	for (b=fn->start; b; b=b->link) {
+		fprintf(f, "@%s\n", b->name);
+		for (p=b->phi; p; p=p->link) {
+			fprintf(f, "\t");
+			printref(p->to, fn, f);
+			fprintf(f, " =%c phi ", ktoc[p->cls]);
+			assert(p->narg);
+			/* arguments are comma separated, the last
+			 * one ends the line */
+			for (n=0;; n++) {
+				fprintf(f, "@%s ", p->blk[n]->name);
+				printref(p->arg[n], fn, f);
+				if (n == p->narg-1) {
+					fprintf(f, "\n");
+					break;
+				} else
+					fprintf(f, ", ");
+			}
+		}
+		for (i=b->ins; i-b->ins < b->nins; i++) {
+			fprintf(f, "\t");
+			if (!req(i->to, R)) {
+				printref(i->to, fn, f);
+				fprintf(f, " =%c ", ktoc[i->cls]);
+			}
+			assert(opdesc[i->op].name);
+			fprintf(f, "%s", opdesc[i->op].name);
+			if (req(i->to, R) && prcls[i->op])
+				fputc(ktoc[i->cls], f);
+			if (!req(i->arg[0], R)) {
+				fprintf(f, " ");
+				printref(i->arg[0], fn, f);
+			}
+			if (!req(i->arg[1], R)) {
+				fprintf(f, ", ");
+				printref(i->arg[1], fn, f);
+			}
+			fprintf(f, "\n");
+		}
+		switch (b->jmp.type) {
+		case JRet0:
+		case JRetw:
+		case JRetl:
+		case JRets:
+		case JRetd:
+		case JRetc:
+			fprintf(f, "\t%s", jtoa[b->jmp.type]);
+			if (b->jmp.type != JRet0 || !req(b->jmp.arg, R)) {
+				fprintf(f, " ");
+				printref(b->jmp.arg, fn, f);
+			}
+			if (b->jmp.type == JRetc)
+				fprintf(f, ", :%s", typ[fn->retty].name);
+			fprintf(f, "\n");
+			break;
+		case JJmp:
+			/* a jump to the next block in link order
+			 * is not printed */
+			if (b->s1 != b->link)
+				fprintf(f, "\tjmp @%s\n", b->s1->name);
+			break;
+		default:
+			/* two-way jumps; only jnz prints its argument */
+			fprintf(f, "\t%s ", jtoa[b->jmp.type]);
+			if (b->jmp.type == JJnz) {
+				printref(b->jmp.arg, fn, f);
+				fprintf(f, ", ");
+			}
+			fprintf(f, "@%s, @%s\n", b->s1->name, b->s2->name);
+			break;
+		}
+	}
+	fprintf(f, "}\n");
+}
diff --git a/src/rega.c b/src/rega.c
@@ -0,0 +1,598 @@
+#include "all.h"
+
+#ifdef TEST_PMOV
+	#undef assert
+	#define assert(x) assert_test(#x, x)
+#endif
+
+typedef struct RMap RMap;
+
+/* temporary -> register map: t[i] is held in r[i] for i < n */
+struct RMap {
+	int t[NIReg+NFReg]; /* mapped temporaries */
+	int r[NIReg+NFReg]; /* register of the matching entry of t */
+	BSet b[1];          /* ids that appear in t or in r (see radd) */
+	int n;              /* number of entries in use */
+};
+
+static bits regu;      /* registers used */
+static Tmp *tmp;       /* function temporaries */
+static Mem *mem;       /* function mem references */
+static struct {
+	Ref src, dst;
+	int cls;
+} *pm;                 /* parallel move constructed */
+static int cpm, npm;   /* capacity and size of pm */
+
+/* Address of the register hint stored for the phi class of t. */
+static int *
+hint(int t)
+{
+	Tmp *rep;
+
+	rep = &tmp[phicls(t, tmp)];
+	return &rep->hint.r;
+}
+
+/* Record r as the hint of t's phi class, unless a hint is already
+ * set or r belongs to the class's hint mask (hint.m).
+ */
+static void
+sethint(int t, int r)
+{
+	bits mask;
+	int *h;
+
+	mask = tmp[phicls(t, tmp)].hint.m;
+	h = hint(t);
+	if (*h != -1)
+		return;
+	if (BIT(r) & mask)
+		return;
+	*h = r;
+}
+
+/* Copy the whole contents of map mb into map ma. */
+static void
+rcopy(RMap *ma, RMap *mb)
+{
+	ma->n = mb->n;
+	bscopy(ma->b, mb->b);
+	memcpy(ma->r, mb->r, sizeof ma->r);
+	memcpy(ma->t, mb->t, sizeof ma->t);
+}
+
+/* Register mapped to t in m, or -1 when t has no entry. */
+static int
+rfind(RMap *m, int t)
+{
+	int k;
+
+	k = 0;
+	while (k < m->n) {
+		if (m->t[k] == t)
+			return m->r[k];
+		k++;
+	}
+	return -1;
+}
+
+/* Location of t according to m: its register when it has one,
+ * its stack slot otherwise (the slot must then exist).
+ */
+static Ref
+rref(RMap *m, int t)
+{
+	int r, s;
+
+	r = rfind(m, t);
+	if (r != -1)
+		return TMP(r);
+	s = tmp[t].slot;
+	assert(s != -1 && "should have spilled");
+	return SLOT(s);
+}
+
+/* Record in m that t is held in register r and mark r as used
+ * by the function (regu).
+ * t must be a temporary (id >= Tmp0) or r itself, r must be an
+ * integer or floating point register, and neither may already
+ * be part of the map.
+ */
+static void
+radd(RMap *m, int t, int r)
+{
+	assert((t >= Tmp0 || t == r) && "invalid temporary");
+	assert(((RAX <= r && r < RAX + NIReg) || (XMM0 <= r && r < XMM0 + NFReg)) && "invalid register");
+	assert(!bshas(m->b, t) && "temporary has mapping");
+	assert(!bshas(m->b, r) && "register already allocated");
+	/* m->n is the index written below, so it has to be
+	 * strictly smaller than the capacity of t[] and r[] */
+	assert(m->n < NIReg+NFReg && "too many mappings");
+	bsset(m->b, t);
+	bsset(m->b, r);
+	m->t[m->n] = t;
+	m->r[m->n] = r;
+	m->n++;
+	regu |= BIT(r);
+}
+
+/* Return the register reference that holds t in m, giving t a
+ * register first when it has none.
+ * Ids below Tmp0 must already be in the map and are returned
+ * unchanged.
+ */
+static Ref
+ralloc(RMap *m, int t)
+{
+	bits regs;
+	int r, r0, r1;
+
+	if (t < Tmp0) {
+		assert(bshas(m->b, t));
+		return TMP(t);
+	}
+	if (bshas(m->b, t)) {
+		r = rfind(m, t);
+		assert(r != -1);
+		return TMP(r);
+	}
+	/* try the hint first */
+	r = *hint(t);
+	if (r == -1 || bshas(m->b, r)) {
+		/* regs: registers to avoid in the first pass, the
+		 * hint mask of the phi class plus the first word
+		 * of the map's bitset
+		 * NOTE(review): presumably all register ids fit in
+		 * that first word -- confirm */
+		regs = tmp[phicls(t, tmp)].hint.m;
+		regs |= m->b->t[0];
+		/* [r0, r1) is the register range of t's class */
+		switch (KBASE(tmp[t].cls)) {
+		case 0:
+			r0 = RAX;
+			r1 = RAX + NIReg;
+			break;
+		case 1:
+			r0 = XMM0;
+			r1 = XMM0 + NFReg;
+			break;
+		}
+		for (r=r0; r<r1; r++)
+			if (!(regs & BIT(r)))
+				goto Found;
+		/* second pass: any register not in the map */
+		for (r=r0; r<r1; r++)
+			if (!bshas(m->b, r))
+				goto Found;
+		diag("rega: no more regs");
+	}
+Found:
+	radd(m, t, r);
+	sethint(t, r);
+	return TMP(r);
+}
+
+/* Remove the entry of t from m and return the register it held,
+ * or -1 when t is not in the map. The remaining entries keep
+ * their relative order.
+ */
+static int
+rfree(RMap *m, int t)
+{
+	int i, r;
+
+	if (!bshas(m->b, t))
+		return -1;
+	i = 0;
+	while (m->t[i] != t) {
+		assert(i+1 < m->n);
+		i++;
+	}
+	r = m->r[i];
+	m->n--;
+	/* close the gap left by entry i in both arrays */
+	memmove(&m->t[i], &m->t[i+1], (m->n-i) * sizeof m->t[0]);
+	memmove(&m->r[i], &m->r[i+1], (m->n-i) * sizeof m->r[0]);
+	bsclr(m->b, t);
+	bsclr(m->b, r);
+	return r;
+}
+
+/* Print the entries of m on stderr as " (name, R<reg>)" pairs
+ * followed by a newline.
+ */
+static void
+mdump(RMap *m)
+{
+	int k, t;
+
+	for (k=0; k<m->n; k++) {
+		t = m->t[k];
+		fprintf(stderr, " (%s, R%d)", tmp[t].name, m->r[k]);
+	}
+	fprintf(stderr, "\n");
+}
+
+/* Append the move src -> dst of class k to the parallel move pm,
+ * growing the array when it is full.
+ */
+static void
+pmadd(Ref src, Ref dst, int k)
+{
+	if (npm == cpm) {
+		/* new capacity: twice the old one plus 16 */
+		cpm = 16 + 2 * cpm;
+		pm = realloc(pm, cpm * sizeof pm[0]);
+		if (!pm)
+			diag("pmadd: out of memory");
+	}
+	pm[npm].cls = k;
+	pm[npm].dst = dst;
+	pm[npm].src = src;
+	npm++;
+}
+
+/* state of each move of pm while pmgen/pmrec run */
+enum PMStat { ToMove, Moving, Moved };
+
+/* Emit move i of the parallel move. Pending moves that read
+ * pm[i].dst are emitted first (recursively); meeting a move in
+ * the Moving state means a cycle was found, and the moves of a
+ * cycle are emitted as OSwap instead of OCopy.
+ * *k accumulates the class used for the swaps (see the note
+ * below). The result is R unless a swap was emitted, in which
+ * case the reference that closes the cycle is returned so the
+ * caller can tell whether it has to swap as well.
+ */
+static Ref
+pmrec(enum PMStat *status, int i, int *k)
+{
+	Ref swp, swp1;
+	int j, k1;
+
+	/* note, this routine might emit
+	 * too many large instructions:
+	 *
+	 *                  , x -- x
+	 *      x -- x -- x        |
+	 *                  ` x -- x
+	 *
+	 * if only the first move is wide
+	 * the whole cycle will be wide,
+	 * this is safe but not necessary
+	 */
+
+	/* moving a location to itself needs no code */
+	if (req(pm[i].src, pm[i].dst))
+		return R;
+	status[i] = Moving;
+	assert(KBASE(*k) == KBASE(pm[i].cls));
+	assert((Kw|1) == Kl && (Ks|1) == Kd);
+	*k |= KWIDE(pm[i].cls); /* see above */
+	swp = R;
+	for (j=0; j<npm; j++) {
+		if (req(pm[j].src, pm[i].dst))
+			switch (status[j]) {
+			case ToMove:
+				/* k1 is only kept when the
+				 * recursion found a cycle */
+				k1 = *k;
+				swp1 = pmrec(status, j, &k1);
+				if (!req(swp1, R)) {
+					assert(req(swp, R));
+					swp = swp1;
+					*k = k1;
+				}
+				break;
+			case Moving:
+				/* move j is on the current path */
+				assert(req(swp, R));
+				swp = pm[i].dst;
+				break;
+			case Moved:
+				break;
+			}
+	}
+	status[i] = Moved;
+	if (req(swp, R)) {
+		*curi++ = (Ins){OCopy, pm[i].dst, {pm[i].src}, pm[i].cls};
+		return R;
+	} else if (!req(swp, pm[i].src)) {
+		*curi++ = (Ins){OSwap, R, {pm[i].src, pm[i].dst}, *k};
+		return swp;
+	} else
+		return R;
+
+}
+
+/* Emit into insb the instructions that implement the parallel
+ * move stored in pm[0..npm); on return curi points one past the
+ * last instruction written.
+ */
+static void
+pmgen(void)
+{
+	int i, k;
+	enum PMStat *status;
+
+	status = alloc(npm * sizeof status[0]);
+	/* every entry has to start as ToMove; the check on the
+	 * last one guards the expectation that alloc returns
+	 * zeroed memory */
+	assert(!npm || status[npm-1] == ToMove);
+	curi = insb;
+	for (i=0; i<npm; i++)
+		if (status[i] == ToMove) {
+			k = pm[i].cls;
+			pmrec(status, i, &k);
+		}
+}
+
+/* Make register r hold `to` in map m, or hold itself when `to`
+ * is R. When r is currently mapped to another temporary, that
+ * temporary is given a different register first.
+ */
+static void
+move(int r, Ref to, RMap *m)
+{
+	int n, t, r1;
+
+	r1 = req(to, R) ? -1 : rfree(m, to.val);
+	if (bshas(m->b, r) && r1 != r) {
+		/* r is used and not by to */
+		for (n=0; m->r[n] != r; n++)
+			assert(n+1 < m->n);
+		t = m->t[n];
+		rfree(m, t);
+		/* keep r marked while re-allocating t so that
+		 * ralloc cannot pick r again */
+		bsset(m->b, r);
+		ralloc(m, t);
+		bsclr(m->b, r);
+	}
+	t = req(to, R) ? r : to.val;
+	radd(m, t, r);
+}
+
+/* 1 when i is a copy whose source is a register, 0 otherwise. */
+static int
+regcpy(Ins *i)
+{
+	if (i->op != OCopy)
+		return 0;
+	return isreg(i->arg[0]) != 0;
+}
+
+/* Handle the run of consecutive copies from registers that ends
+ * at instruction i of block b as one parallel move.
+ * m is the register map after the run on entry and is updated
+ * to the map before the run. The run is replaced, in a freshly
+ * allocated instruction array, by the code generated by pmgen;
+ * the result points to the first generated instruction in the
+ * new array.
+ */
+static Ins *
+dopm(Blk *b, Ins *i, RMap *m)
+{
+	RMap m0;
+	int n, r, r1, t, s;
+	Ins *i0, *i1, *ip, *ir;
+	bits def;
+
+	m0 = *m; /* map after the run */
+	i1 = ++i;
+	/* walk back over the run, binding each destination to
+	 * the register it is copied from; [i, i1) is the run */
+	do {
+		i--;
+		move(i->arg[0].val, i->to, m);
+	} while (i != b->ins && regcpy(i-1));
+	assert(m0.n <= m->n);
+	if (i != b->ins && (i-1)->op == OCall) {
+		/* the run follows a call: also take every register
+		 * of rsave that is not among the call's return
+		 * registers */
+		def = retregs((i-1)->arg[1], 0);
+		for (r=0; r<NRSave; r++)
+			if (!(BIT(rsave[r]) & def))
+				move(rsave[r], R, m);
+	}
+	/* build the parallel move from the locations in m to the
+	 * locations expected after the run (m0, or a stack slot) */
+	for (npm=0, n=0; n<m->n; n++) {
+		t = m->t[n];
+		s = tmp[t].slot;
+		r1 = m->r[n];
+		r = rfind(&m0, t);
+		if (r != -1)
+			pmadd(TMP(r1), TMP(r), tmp[t].cls);
+		else if (s != -1)
+			pmadd(TMP(r1), SLOT(s), tmp[t].cls);
+	}
+	/* before the run the destinations are gone and the source
+	 * registers are mapped to themselves */
+	for (ip=i; ip<i1; ip++) {
+		if (!req(ip->to, R))
+			rfree(m, ip->to.val);
+		r = ip->arg[0].val;
+		if (rfind(m, r) == -1)
+			radd(m, r, r);
+	}
+	pmgen();
+#ifdef TEST_PMOV
+	return 0;
+#endif
+	/* new array: instructions before the run, generated code,
+	 * instructions after the run */
+	n = b->nins - (i1 - i) + (curi - insb);
+	i0 = alloc(n * sizeof(Ins));
+	ip = icpy(ip = i0, b->ins, i - b->ins);
+	ip = icpy(ir = ip, insb, curi - insb);
+	ip = icpy(ip, i1, &b->ins[b->nins] - i1);
+	b->nins = n;
+	b->ins = i0;
+	return ir;
+}
+
+/* Priority test used by insert(): true when r1 has a register
+ * hint. r2 is not looked at for now; this trivial heuristic
+ * could later use the distance to the defining instruction.
+ */
+static int
+prio(Ref r1, Ref r2)
+{
+	int *h;
+
+	(void) r2;
+	h = hint(r1.val);
+	return *h != -1;
+}
+
+/* Store r at position p of rs, then move it toward the front,
+ * shifting earlier entries back by one, for as long as prio()
+ * holds against the entry before it.
+ */
+static void
+insert(Ref *r, Ref **rs, int p)
+{
+	int i;
+
+	rs[i = p] = r;
+	while (i-- > 0 && prio(*r, *rs[i])) {
+		rs[i+1] = rs[i];
+		rs[i] = r;
+	}
+}
+
+/* Assign registers in block b, walking its instructions from the
+ * last one to the first. cur is the register map at the end of
+ * the block on entry and is updated as instructions are
+ * processed.
+ */
+static void
+doblk(Blk *b, RMap *cur)
+{
+	int x, r, nr;
+	bits rs;
+	Ins *i;
+	Mem *m;
+	Ref *ra[4];
+
+	if (rtype(b->jmp.arg) == RTmp)
+		b->jmp.arg = ralloc(cur, b->jmp.arg.val);
+	else if (rtype(b->jmp.arg) == RACall) {
+		/* add return registers */
+		rs = retregs(b->jmp.arg, 0);
+		for (r=0; rs; rs/=2, r++)
+			if (rs & 1)
+				radd(cur, r, r);
+	}
+	for (i=&b->ins[b->nins]; i!=b->ins;) {
+		switch ((--i)->op) {
+		case OCall:
+			/* release the registers of rsave that are
+			 * not argument registers of the call */
+			rs = argregs(i->arg[1], 0);
+			for (r=0; r<NRSave; r++)
+				if (!(BIT(rsave[r]) & rs))
+					rfree(cur, rsave[r]);
+			break;
+		case OCopy:
+			/* copies from registers are handled in
+			 * bulk by dopm, which may replace b->ins */
+			if (isreg(i->arg[0])) {
+				i = dopm(b, i, cur);
+				continue;
+			}
+			/* copy into a register: hint the source */
+			if (isreg(i->to))
+			if (rtype(i->arg[0]) == RTmp)
+				sethint(i->arg[0].val, i->to.val);
+			/* fall through */
+		default:
+			if (!req(i->to, R)) {
+				assert(rtype(i->to) == RTmp);
+				r = rfree(cur, i->to.val);
+				/* the result has no entry in the
+				 * map and is not a register: the
+				 * instruction becomes a nop */
+				if (r == -1 && !isreg(i->to)) {
+					*i = (Ins){.op = ONop};
+					continue;
+				}
+				if (i->to.val >= Tmp0)
+					i->to = TMP(r);
+			}
+			break;
+		}
+		/* gather the temporaries used by the arguments
+		 * (including base and index of memory references),
+		 * ordered by prio, then give each a register */
+		for (x=0, nr=0; x<2; x++)
+			switch (rtype(i->arg[x])) {
+			case RAMem:
+				m = &mem[i->arg[x].val & AMask];
+				if (rtype(m->base) == RTmp)
+					insert(&m->base, ra, nr++);
+				if (rtype(m->index) == RTmp)
+					insert(&m->index, ra, nr++);
+				break;
+			case RTmp:
+				insert(&i->arg[x], ra, nr++);
+				break;
+			}
+		for (r=0; r<nr; r++)
+			*ra[r] = ralloc(cur, ra[r]->val);
+	}
+}
+
+/* register allocation
+ * depends on rpo, phi, cost, (and obviously spill)
+ *
+ * Works in three steps: set up the per-block maps and initial
+ * hints, assign registers block by block from the last block
+ * of fn->rpo to the first, then insert on the edges the moves
+ * that reconcile the map at the end of a block with the map at
+ * the start of its successor. The phis of every block are
+ * dropped and fn->reg receives the set of registers used.
+ */
+void
+rega(Fn *fn)
+{
+	int j, n, t, r, r1, x, rl[Tmp0];
+	Blk *b, *b1, *s, ***ps, *blist;
+	RMap *end, *beg, cur, old;
+	Ins *i;
+	Phi *p;
+	uint u;
+	Ref src, dst;
+
+	/* 1. setup */
+	regu = 0;
+	tmp = fn->tmp;
+	mem = fn->mem;
+	end = alloc(fn->nblk * sizeof end[0]);
+	beg = alloc(fn->nblk * sizeof beg[0]);
+	for (n=0; n<fn->nblk; n++) {
+		bsinit(end[n].b, fn->ntmp);
+		bsinit(beg[n].b, fn->ntmp);
+	}
+	bsinit(cur.b, fn->ntmp);
+	bsinit(old.b, fn->ntmp);
+
+	for (t=Tmp0; t<fn->ntmp; t++)
+		*hint(t) = -1;
+	/* the leading copies from registers of the start block
+	 * provide the first hints */
+	for (b=fn->start, i=b->ins; i-b->ins < b->nins; i++)
+		if (i->op != OCopy || !isreg(i->arg[0]))
+			break;
+		else {
+			assert(rtype(i->to) == RTmp);
+			sethint(i->to.val, i->arg[0].val);
+		}
+
+	/* 2. assign registers following post-order */
+	for (n=fn->nblk-1; n>=0; n--) {
+		b = fn->rpo[n];
+		cur.n = 0;
+		bszero(cur.b);
+		/* two passes over the temporaries of b->out: the
+		 * first (x == 0) only serves those whose hinted
+		 * register is free, the second serves the rest */
+		for (x=0; x<2; x++)
+			for (t=Tmp0; t<fn->ntmp; t++) {
+				assert(bshas(b->out, t) ||
+					!bshas(cur.b, t));
+				if (bshas(b->out, t))
+				if (!bshas(cur.b, t))
+				if (x || (r=*hint(t)) != -1)
+				if (x || !bshas(cur.b, r))
+					ralloc(&cur, t);
+			}
+		rcopy(&end[n], &cur);
+		doblk(b, &cur);
+		bscopy(b->in, cur.b);
+		for (p=b->phi; p; p=p->link)
+			if (rtype(p->to) == RTmp) {
+				bsclr(b->in, p->to.val);
+				/* heuristic 0:
+				 * if the phi destination has an
+				 * argument from a frequent block
+				 * that was already allocated to
+				 * 'r', use 'r' as the new hint
+				 */
+				memset(rl, 0, sizeof rl);
+				for (u=0; u<p->narg; u++) {
+					t = p->arg[u].val;
+					b1 = p->blk[u];
+					if (rtype(p->arg[u]) == RTmp)
+					if ((r=rfind(&end[b1->id], t)) != -1)
+						rl[r] += b1->loop;
+				}
+				for (x=0, j=0; j<Tmp0; j++)
+					if (rl[j] > rl[x])
+						x = j;
+				if (rl[x] >= b->loop)
+					*hint(p->to.val) = x;
+			}
+		if (b->npred > 1) {
+			/* heuristic 1:
+			 * attempt to satisfy hints
+			 * when it's simple and we have
+			 * multiple predecessors
+			 */
+			rcopy(&old, &cur);
+			curi = &insb[NIns];
+			for (j=0; j<old.n; j++) {
+				t = old.t[j];
+				r = *hint(t);
+				r1 = rfind(&cur, t);
+				if (r != -1 && r != r1)
+				if (!bshas(cur.b, r)) {
+					rfree(&cur, t);
+					radd(&cur, t, r);
+					x = tmp[t].cls;
+					emit(OCopy, x, TMP(r1), TMP(r), R);
+				}
+			}
+			/* prepend the emitted copies to the block */
+			if ((j = &insb[NIns] - curi)) {
+				b->nins += j;
+				i = alloc(b->nins * sizeof(Ins));
+				icpy(icpy(i, curi, j), b->ins, b->nins-j);
+				b->ins = i;
+			}
+		}
+		rcopy(&beg[n], &cur);
+	}
+	if (debug['R'])  {
+		fprintf(stderr, "\n> Register mappings:\n");
+		for (n=0; n<fn->nblk; n++) {
+			b = fn->rpo[n];
+			fprintf(stderr, "\t%-10s beg", b->name);
+			mdump(&beg[n]);
+			fprintf(stderr, "\t           end");
+			mdump(&end[n]);
+		}
+		fprintf(stderr, "\n");
+	}
+
+	/* 3. compose glue code */
+	blist = 0;
+	for (b=fn->start;; b=b->link) {
+		/* visit both successor links; the list ends with
+		 * a pointer to a null block */
+		ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}};
+		for (; (s=**ps); ps++) {
+			npm = 0;
+			/* moves required by the phis of s */
+			for (p=s->phi; p; p=p->link) {
+				dst = p->to;
+				assert(rtype(dst)==RSlot || rtype(dst)==RTmp);
+				if (rtype(dst) == RTmp) {
+					r = rfind(&beg[s->id], dst.val);
+					if (r == -1)
+						continue;
+					dst = TMP(r);
+				}
+				for (u=0; p->blk[u]!=b; u++)
+					assert(u+1 < p->narg);
+				src = p->arg[u];
+				if (rtype(src) == RTmp)
+					src = rref(&end[b->id], src.val);
+				pmadd(src, dst, p->cls);
+			}
+			/* moves for the temporaries in s->in */
+			for (t=Tmp0; t<fn->ntmp; t++)
+				if (bshas(s->in, t)) {
+					src = rref(&end[b->id], t);
+					dst = rref(&beg[s->id], t);
+					pmadd(src, dst, tmp[t].cls);
+				}
+			pmgen();
+			if (curi == insb)
+				continue;
+			/* the moves go in a new block placed on the
+			 * edge from b to s */
+			b1 = blknew();
+			b1->loop = (b->loop+s->loop) / 2;
+			b1->link = blist;
+			blist = b1;
+			fn->nblk++;
+			/* bounded write: the two names joined by '_'
+			 * may be longer than the destination; assumes
+			 * Blk.name is a char array (declared in
+			 * all.h) -- TODO confirm */
+			snprintf(b1->name, sizeof b1->name, "%s_%s", b->name, s->name);
+			b1->nins = curi - insb;
+			idup(&b1->ins, insb, b1->nins);
+			b1->jmp.type = JJmp;
+			b1->s1 = s;
+			**ps = b1;
+		}
+		/* the new blocks are linked after the last block */
+		if (!b->link) {
+			b->link = blist;
+			break;
+		}
+	}
+	for (b=fn->start; b; b=b->link)
+		b->phi = 0;
+	fn->reg = regu;
+
+	if (debug['R']) {
+		fprintf(stderr, "\n> After register allocation:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/src/spill.c b/src/spill.c
@@ -0,0 +1,507 @@
+#include "all.h"
+
+/* Mark b as a member of the loop headed by hd, then recurse into
+ * the predecessors of b. Blocks with an id smaller than hd's are
+ * ignored. p is the phi list of the block b was reached from;
+ * the temporaries it receives from b are added to hd->gen.
+ * Each block is visited at most once per header (b->visit).
+ */
+static void
+loopmark(Blk *hd, Blk *b, Phi *p)
+{
+	int k, head;
+	uint n, a;
+
+	head = hd->id;
+	if (b->id < head)
+		return;
+	for (; p; p=p->link)
+		for (a=0; a<p->narg; a++)
+			if (p->blk[a] == b)
+			if (rtype(p->arg[a]) == RTmp)
+				bsset(hd->gen, p->arg[a].val);
+	if (b->visit == head)
+		return;
+	b->visit = head;
+	/* one more level of loop nesting for b */
+	b->loop *= 10;
+	/* aggregate looping information at
+	 * loop headers */
+	bsunion(hd->gen, b->gen);
+	for (k=0; k<2; k++)
+		if (b->nlive[k] > hd->nlive[k])
+			hd->nlive[k] = b->nlive[k];
+	for (n=0; n<b->npred; n++)
+		loopmark(hd, b->pred[n], b->phi);
+}
+
+/* Account for one occurrence of r in fn: add use to the use
+ * counter of the temporary, !use to its definition counter and
+ * loop to its cost. A memory reference counts as a use of its
+ * base and of its index; ids below Tmp0 are ignored.
+ */
+static void
+tmpuse(Ref r, int use, int loop, Fn *fn)
+{
+	Mem *m;
+	Tmp *t;
+
+	switch (rtype(r)) {
+	case RAMem:
+		m = &fn->mem[r.val & AMask];
+		tmpuse(m->base, 1, loop, fn);
+		tmpuse(m->index, 1, loop, fn);
+		break;
+	case RTmp:
+		if (r.val < Tmp0)
+			break;
+		t = &fn->tmp[r.val];
+		t->cost += loop;
+		t->ndef += !use;
+		t->nuse += use;
+		break;
+	default:
+		break;
+	}
+}
+
+/* evaluate spill costs of temporaries,
+ * this also fills usage information
+ * requires rpo, preds
+ *
+ * Sets b->loop of every block (1, times 10 for each loop the
+ * block was marked in by loopmark), then recomputes cost, nuse
+ * and ndef of every temporary from the phis, instructions and
+ * jumps of fn.
+ */
+void
+fillcost(Fn *fn)
+{
+	int n, hd;
+	uint a;
+	Blk *b;
+	Ins *i;
+	Tmp *t;
+	Phi *p;
+
+	for (b=fn->start; b; b=b->link) {
+		b->loop = 1;
+		b->visit = -1;
+	}
+	if (debug['S'])
+		fprintf(stderr, "\n> Loop information:\n");
+	for (n=0; n<fn->nblk; n++) {
+		b = fn->rpo[n];
+		hd = 0;
+		/* a predecessor that is not earlier in rpo makes
+		 * b a loop header */
+		for (a=0; a<b->npred; a++)
+			if (b->pred[a]->id >= n) {
+				loopmark(b, b->pred[a], b->phi);
+				hd = 1;
+			}
+		if (hd && debug['S']) {
+			fprintf(stderr, "\t%-10s", b->name);
+			fprintf(stderr, " (% 3d ", b->nlive[0]);
+			fprintf(stderr, "% 3d) ", b->nlive[1]);
+			dumpts(b->gen, fn->tmp, stderr);
+		}
+	}
+	/* ids below Tmp0 start with a very large cost */
+	for (t=fn->tmp; t-fn->tmp < fn->ntmp; t++) {
+		t->cost = t-fn->tmp < Tmp0 ? 1e6 : 0;
+		t->nuse = 0;
+		t->ndef = 0;
+	}
+	for (b=fn->start; b; b=b->link) {
+		for (p=b->phi; p; p=p->link) {
+			/* todo, the cost computation
+			 * for p->to is not great... */
+			tmpuse(p->to, 0, 0, fn);
+			/* a phi argument costs the loop weight of
+			 * the block it comes from, divided by the
+			 * number of predecessors */
+			for (a=0; a<p->narg; a++) {
+				n = p->blk[a]->loop;
+				assert(b->npred==p->narg &&
+					"wrong cfg");
+				n /= b->npred;
+				tmpuse(p->arg[a], 1, n, fn);
+			}
+		}
+		n = b->loop;
+		for (i=b->ins; i-b->ins < b->nins; i++) {
+			tmpuse(i->to, 0, n, fn);
+			tmpuse(i->arg[0], 1, n, fn);
+			tmpuse(i->arg[1], 1, n, fn);
+		}
+		tmpuse(b->jmp.arg, 1, n, fn);
+	}
+	if (debug['S']) {
+		fprintf(stderr, "\n> Spill costs:\n");
+		for (n=Tmp0; n<fn->ntmp; n++)
+			fprintf(stderr, "\t%-10s %d\n",
+				fn->tmp[n].name,
+				fn->tmp[n].cost);
+		fprintf(stderr, "\n");
+	}
+}
+
+static BSet *fst; /* temps to prioritize in registers (for tcmp1) */
+static Tmp *tmp;  /* current temporaries (for tcmpX) */
+static int ntmp;  /* current # of temps (for limit) */
+static int locs;  /* stack size used by locals */
+static int slot4; /* next slot of 4 bytes */
+static int slot8; /* ditto, 8 bytes */
+static BSet mask[2][1]; /* class masks */
+
+/* qsort comparison on temporary ids: the result is the cost of
+ * *pb minus the cost of *pa, so higher costs sort first.
+ */
+static int
+tcmp0(const void *pa, const void *pb)
+{
+	int ta, tb;
+
+	ta = *(int *)pa;
+	tb = *(int *)pb;
+	return tmp[tb].cost - tmp[ta].cost;
+}
+
+/* qsort comparison on temporary ids: members of fst sort before
+ * non-members, ties are decided by tcmp0.
+ */
+static int
+tcmp1(const void *pa, const void *pb)
+{
+	int c;
+
+	c = bshas(fst, *(int *)pb) - bshas(fst, *(int *)pa);
+	if (c)
+		return c;
+	return tcmp0(pa, pb);
+}
+
+/* Return the stack slot of temporary t, assigning a new one
+ * (offset by locs) the first time t is spilled. Ids below Tmp0
+ * cannot be spilled.
+ */
+static Ref
+slot(int t)
+{
+	int s;
+
+	if (t < Tmp0)
+		diag("spill: cannot spill register");
+	s = tmp[t].slot;
+	if (s == -1) {
+		assert(NAlign == 3);
+		/* nice logic to pack stack slots
+		 * on demand, there can be only
+		 * one hole and slot4 points to it
+		 *
+		 * invariant: slot4 <= slot8
+		 */
+		if (KWIDE(tmp[t].cls)) {
+			/* wide: takes two units at slot8 */
+			s = slot8;
+			if (slot4 == slot8)
+				slot4 += 2;
+			slot8 += 2;
+		} else {
+			/* narrow: takes the hole if there is one,
+			 * otherwise opens a new pair and leaves
+			 * its second unit as the hole */
+			s = slot4;
+			if (slot4 == slot8) {
+				slot8 += 2;
+				slot4 += 1;
+			} else
+				slot4 = slot8;
+		}
+		s += locs;
+		tmp[t].slot = s;
+	}
+	return SLOT(s);
+}
+
+/* Reduce set b to at most k members. The members kept are the
+ * first k in the order given by tcmp0 (highest cost first) or,
+ * when f is not null, by tcmp1 (members of f first). Every
+ * member that is dropped gets a stack slot.
+ * Nothing happens when b already has k members or fewer.
+ */
+static void
+limit(BSet *b, int k, BSet *f)
+{
+	/* scratch array kept between calls, grown on demand */
+	static int *tarr, maxt;
+	int i, nt;
+	uint t;
+
+	nt = bscount(b);
+	if (nt <= k)
+		return;
+	if (nt > maxt) {
+		free(tarr);
+		tarr = emalloc(nt * sizeof tarr[0]);
+		maxt = nt;
+	}
+	/* empty b into tarr */
+	for (i=0, t=0; bsiter(b, &t); t++) {
+		bsclr(b, t);
+		tarr[i++] = t;
+	}
+	if (!f)
+		qsort(tarr, nt, sizeof tarr[0], tcmp0);
+	else {
+		fst = f;
+		qsort(tarr, nt, sizeof tarr[0], tcmp1);
+	}
+	for (i=0; i<k && i<nt; i++)
+		bsset(b, tarr[i]);
+	for (; i<nt; i++)
+		slot(tarr[i]);
+}
+
+/* Apply limit() separately to the members of b1 that are in
+ * mask[0] and to those in mask[1], with budgets NIReg - k1 and
+ * NFReg - k2; b1 receives the union of both results.
+ * fst is passed on to limit() as the set to prioritize.
+ */
+static void
+limit2(BSet *b1, int k1, int k2, BSet *fst)
+{
+	BSet b2[1];
+
+	bsinit(b2, ntmp); /* todo, free those */
+	bscopy(b2, b1);
+	bsinter(b1, mask[0]);
+	bsinter(b2, mask[1]);
+	limit(b1, NIReg - k1, fst);
+	limit(b2, NFReg - k2, fst);
+	bsunion(b1, b2);
+}
+
+/* Or r into the hint mask of the phi class of every member of u
+ * that bsiter yields, starting the iteration at Tmp0.
+ */
+static void
+sethint(BSet *u, bits r)
+{
+	uint t;
+
+	t = Tmp0;
+	while (bsiter(u, &t)) {
+		tmp[phicls(t, tmp)].hint.m |= r;
+		t++;
+	}
+}
+
+/* Emit a load from its stack slot for every member of u (the
+ * iteration starts at Tmp0) that is not a member of v.
+ */
+static void
+reloads(BSet *u, BSet *v)
+{
+	uint t;
+
+	t = Tmp0;
+	while (bsiter(u, &t)) {
+		if (!bshas(v, t))
+			emit(OLoad, tmp[t].cls, TMP(t), slot(t), R);
+		t++;
+	}
+}
+
+/* Emit a store of r into stack slot s, using the store op that
+ * matches the class of r; nothing is emitted when s is -1.
+ */
+static void
+store(Ref r, int s)
+{
+	static int kstore[] = {
+		[Kw] = OStorew, [Kl] = OStorel,
+		[Ks] = OStores, [Kd] = OStored,
+	};
+	int op;
+
+	if (s == -1)
+		return;
+	op = kstore[tmp[r.val].cls];
+	emit(op, 0, R, r, SLOT(s));
+}
+
+/* 1 when i is a copy whose source is a register, 0 otherwise. */
+static int
+regcpy(Ins *i)
+{
+	if (i->op != OCopy)
+		return 0;
+	return isreg(i->arg[0]) != 0;
+}
+
+/* Process the run of consecutive copies from registers that ends
+ * at instruction i of block b. v is the set of temporaries in
+ * registers after the run and is updated to the set before it.
+ * Stores, reloads and the copies themselves are emitted; the
+ * result points to the first instruction of the run.
+ */
+static Ins *
+dopm(Blk *b, Ins *i, BSet *v)
+{
+	int n, t;
+	BSet u[1];
+	Ins *i1;
+	bits r;
+
+	bsinit(u, ntmp); /* todo, free those */
+	/* consecutive copies from
+	 * registers need to be handled
+	 * as one large instruction
+	 *
+	 * fixme: there is an assumption
+	 * that calls are always followed
+	 * by copy instructions here, this
+	 * might not be true if previous
+	 * passes change
+	 */
+	i1 = ++i;
+	do {
+		i--;
+		t = i->to.val;
+		/* a destination that is in a register later on
+		 * is stored to its slot (if it has one) */
+		if (!req(i->to, R))
+		if (bshas(v, t)) {
+			bsclr(v, t);
+			store(i->to, tmp[t].slot);
+		}
+		bsset(v, i->arg[0].val);
+	} while (i != b->ins && regcpy(i-1));
+	bscopy(u, v);
+	if (i != b->ins && (i-1)->op == OCall) {
+		/* the run follows a call: drop the return
+		 * registers, limit with NISave/NFSave as extra
+		 * pressure, then add the argument registers;
+		 * r collects the registers of rsave */
+		v->t[0] &= ~retregs((i-1)->arg[1], 0);
+		limit2(v, NISave, NFSave, 0);
+		for (r=0, n=0; n<NRSave; n++)
+			r |= BIT(rsave[n]);
+		v->t[0] |= argregs((i-1)->arg[1], 0);
+	} else {
+		limit2(v, 0, 0, 0);
+		r = v->t[0];
+	}
+	sethint(v, r);
+	reloads(u, v);
+	/* emit the copies of the run, last one first */
+	do
+		emiti(*--i1);
+	while (i1 != i);
+	return i;
+}
+
+/* spill code insertion
+ * requires spill costs, rpo, liveness
+ *
+ * Note: this will replace liveness
+ * information (in, out) with temporaries
+ * that must be in registers at block
+ * borders
+ *
+ * Be careful with:
+ * - OCopy instructions to ensure register
+ *   constraints
+ *
+ * Blocks are processed from the last one of fn->rpo to the
+ * first, and the instructions of each block from last to
+ * first. The stack space used by the new slots is added to
+ * fn->slot.
+ */
+void
+spill(Fn *fn)
+{
+	Blk *b, *s1, *s2, *hd, **bp;
+	int j, n, l, t, k, lvarg[2];
+	BSet u[1], v[1], w[1];
+	Ins *i;
+	Phi *p;
+	Mem *m;
+	bits r;
+
+	tmp = fn->tmp;
+	ntmp = fn->ntmp;
+	bsinit(u, ntmp);
+	bsinit(v, ntmp);
+	bsinit(w, ntmp);
+	bsinit(mask[0], ntmp);
+	bsinit(mask[1], ntmp);
+	locs = fn->slot;
+	slot4 = 0;
+	slot8 = 0;
+	/* mask[1] gets the ids of class base 1 (the XMM
+	 * registers and the temporaries with KBASE 1), mask[0]
+	 * gets everything else */
+	for (t=0; t<ntmp; t++) {
+		k = 0;
+		if (t >= XMM0 && t < XMM0 + NFReg)
+			k = 1;
+		else if (t >= Tmp0)
+			k = KBASE(tmp[t].cls);
+		bsset(mask[k], t);
+	}
+
+	for (bp=&fn->rpo[fn->nblk]; bp!=fn->rpo;) {
+		b = *--bp;
+		/* invariant: all blocks with bigger rpo got
+		 * their in,out updated. */
+
+		/* 1. find temporaries in registers at
+		 * the end of the block (put them in v) */
+		curi = 0;
+		s1 = b->s1;
+		s2 = b->s2;
+		hd = 0;
+		/* a successor whose id is not bigger than the id
+		 * of b is reached by a back-edge; with two of
+		 * them the one with the bigger id is kept */
+		if (s1 && s1->id <= b->id)
+			hd = s1;
+		if (s2 && s2->id <= b->id)
+		if (!hd || s2->id >= hd->id)
+			hd = s2;
+		r = 0;
+		bszero(v);
+		if (hd) {
+			/* back-edge */
+			for (k=0; k<2; k++) {
+				n = k == 0 ? NIReg : NFReg;
+				/* u: live out and in hd->gen,
+				 * w: live out and not in hd->gen */
+				bscopy(u, b->out);
+				bsinter(u, mask[k]);
+				bscopy(w, u);
+				bsinter(u, hd->gen);
+				bsdiff(w, hd->gen);
+				if ((int)bscount(u) < n) { /* fixme */
+					j = bscount(w);   /* live through */
+					l = hd->nlive[k];
+					limit(w, n - (l - j), 0);
+					bsunion(u, w);
+				} else
+					limit(u, n, 0);
+				bsunion(v, u);
+			}
+		} else if (s1) {
+			/* w holds what both successors want in
+			 * registers, it is prioritized by limit2 */
+			liveon(v, b, s1);
+			if (s2) {
+				liveon(u, b, s2);
+				bscopy(w, u);
+				bsinter(w, v);
+				bsunion(v, u);
+			}
+			limit2(v, 0, 0, w);
+		} else if (rtype(b->jmp.arg) == RACall) {
+			/* return */
+			r = retregs(b->jmp.arg, 0);
+			v->t[0] |= r;
+		}
+		bscopy(b->out, v);
+
+		/* 2. process the block instructions */
+		curi = &insb[NIns];
+		for (i=&b->ins[b->nins]; i!=b->ins;) {
+			i--;
+			if (regcpy(i)) {
+				i = dopm(b, i, v);
+				continue;
+			}
+			/* w: temporaries that must get a register
+			 * for this instruction */
+			bszero(w);
+			if (!req(i->to, R)) {
+				assert(rtype(i->to) == RTmp);
+				t = i->to.val;
+				if (bshas(v, t))
+					bsclr(v, t);
+				else {
+					/* make sure we have a reg
+					 * for the result */
+					bsset(v, t);
+					bsset(w, t);
+				}
+			}
+			/* j: how many more arguments may be
+			 * replaced by a memory operand */
+			j = opdesc[i->op].nmem;
+			for (n=0; n<2; n++)
+				if (rtype(i->arg[n]) == RAMem)
+					j--;
+			for (n=0; n<2; n++)
+				switch (rtype(i->arg[n])) {
+				case RAMem:
+					t = i->arg[n].val;
+					m = &fn->mem[t & AMask];
+					if (rtype(m->base) == RTmp) {
+						bsset(v, m->base.val);
+						bsset(w, m->base.val);
+					}
+					if (rtype(m->index) == RTmp) {
+						bsset(v, m->index.val);
+						bsset(w, m->index.val);
+					}
+					break;
+				case RTmp:
+					t = i->arg[n].val;
+					/* remember if t was already
+					 * in a register */
+					lvarg[n] = bshas(v, t);
+					bsset(v, t);
+					if (j-- <= 0)
+						bsset(w, t);
+					break;
+				}
+			bscopy(u, v);
+			limit2(v, 0, 0, w);
+			for (n=0; n<2; n++)
+				if (rtype(i->arg[n]) == RTmp) {
+					t = i->arg[n].val;
+					if (!bshas(v, t)) {
+						/* do not reload if the
+						 * the temporary was dead
+						 */
+						if (!lvarg[n])
+							bsclr(u, t);
+						i->arg[n] = slot(t);
+					}
+				}
+			reloads(u, v);
+			if (!req(i->to, R)) {
+				t = i->to.val;
+				store(i->to, tmp[t].slot);
+				bsclr(v, t);
+			}
+			emiti(*i);
+			r = v->t[0] & (BIT(Tmp0)-1);
+			if (r)
+				sethint(v, r);
+		}
+		assert(!r || b==fn->start);
+
+		for (p=b->phi; p; p=p->link) {
+			assert(rtype(p->to) == RTmp);
+			t = p->to.val;
+			if (bshas(v, t)) {
+				bsclr(v, t);
+				store(p->to, tmp[t].slot);
+			} else if (bshas(b->in, t))
+				/* only if the phi is live */
+				p->to = slot(p->to.val);
+		}
+		bscopy(b->in, v);
+		b->nins = &insb[NIns] - curi;
+		idup(&b->ins, curi, b->nins);
+	}
+
+	/* align the locals to a 16 byte boundary */
+	assert(NAlign == 3);
+	slot8 += slot8 & 3;
+	fn->slot += slot8;
+
+	if (debug['S']) {
+		/* all debug output goes to stderr */
+		fprintf(stderr, "\n> Block information:\n");
+		for (b=fn->start; b; b=b->link) {
+			fprintf(stderr, "\t%-10s (% 5d) ", b->name, b->loop);
+			dumpts(b->out, fn->tmp, stderr);
+		}
+		fprintf(stderr, "\n> After spilling:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/src/ssa.c b/src/ssa.c
@@ -0,0 +1,516 @@
+#include "all.h"
+#include <stdarg.h>
+
+/* record a new use of tmp located in block b; ty
+ * is the kind of use and, for UPhi and UIns, also
+ * tells the type of the one extra argument read
+ * from the variadic list (Phi * or Ins *)
+ */
+static void
+adduse(Tmp *tmp, int ty, Blk *b, ...)
+{
+	va_list args;
+	Use *use;
+	int idx;
+
+	va_start(args, b);
+	idx = tmp->nuse++;
+	vgrow(&tmp->use, tmp->nuse);
+	use = &tmp->use[idx];
+	use->type = ty;
+	use->bid = b->id;
+	if (ty == UIns)
+		use->u.ins = va_arg(args, Ins *);
+	else if (ty != UJmp) {
+		/* unknown kinds are reported through diag */
+		if (ty != UPhi)
+			diag("ssa: adduse defaulted");
+		use->u.phi = va_arg(args, Phi *);
+	}
+	va_end(args);
+}
+
+/* fill usage, phi, and class information
+ *
+ * resets ndef, nuse, phi and cls of every
+ * temporary of fn, then walks all the blocks to
+ * count definitions, record the class of each
+ * defined temporary, and register every use found
+ * in phis, instructions and jumps with adduse
+ */
+void
+filluse(Fn *fn)
+{
+	Blk *b;
+	Phi *p;
+	Ins *i;
+	int m, t;
+	uint a;
+	Tmp *tmp;
+
+	/* todo, is this the correct file? */
+	tmp = fn->tmp;
+	for (t=0; t<fn->ntmp; t++) {
+		tmp[t].ndef = 0;
+		tmp[t].nuse = 0;
+		tmp[t].phi = 0;
+		tmp[t].cls = 0;
+		/* the use vector is created only once
+		 * and reused on later calls */
+		if (tmp[t].use == 0)
+			tmp[t].use = vnew(0, sizeof(Use));
+	}
+	for (b=fn->start; b; b=b->link) {
+		for (p=b->phi; p; p=p->link) {
+			assert(rtype(p->to) == RTmp);
+			t = p->to.val;
+			tmp[t].ndef++;
+			tmp[t].cls = p->cls;
+			/* a phi result is tagged with itself */
+			tmp[t].phi = p->to.val;
+			for (a=0; a<p->narg; a++)
+				if (rtype(p->arg[a]) == RTmp) {
+					t = p->arg[a].val;
+					adduse(&tmp[t], UPhi, b, p);
+					/* tag the argument with this
+					 * phi result unless it already
+					 * carries a tag */
+					if (!tmp[t].phi)
+						tmp[t].phi = p->to.val;
+				}
+		}
+		for (i=b->ins; i-b->ins < b->nins; i++) {
+			if (!req(i->to, R)) {
+				assert(rtype(i->to) == RTmp);
+				t = i->to.val;
+				tmp[t].ndef++;
+				tmp[t].cls = i->cls;
+			}
+			for (m=0; m<2; m++)
+				if (rtype(i->arg[m]) == RTmp) {
+					t = i->arg[m].val;
+					adduse(&tmp[t], UIns, b, i);
+				}
+		}
+		/* a jump use carries no extra argument */
+		if (rtype(b->jmp.arg) == RTmp)
+			adduse(&tmp[b->jmp.arg.val], UJmp, b);
+	}
+}
+
+/* store bp in the first free slot of the
+ * predecessor array of bc; the array is allocated
+ * and cleared on first use, with bc->npred slots
+ */
+static void
+addpred(Blk *bp, Blk *bc)
+{
+	uint n;
+
+	if (bc->pred == 0) {
+		bc->pred = alloc(bc->npred * sizeof bc->pred[0]);
+		n = 0;
+		while (n < bc->npred)
+			bc->pred[n++] = 0;
+	}
+	n = 0;
+	while (bc->pred[n] != 0)
+		n++;
+	bc->pred[n] = bp;
+}
+
+/* compute the predecessor arrays of all blocks
+ * in three passes: reset, count the incoming
+ * edges, then record the edge sources
+ */
+void
+fillpreds(Fn *f)
+{
+	Blk *b, *succ[2];
+	int k;
+
+	for (b=f->start; b; b=b->link) {
+		b->pred = 0;
+		b->npred = 0;
+	}
+	for (b=f->start; b; b=b->link) {
+		succ[0] = b->s1;
+		succ[1] = b->s2;
+		for (k=0; k<2; k++)
+			if (succ[k])
+				succ[k]->npred++;
+	}
+	for (b=f->start; b; b=b->link) {
+		succ[0] = b->s1;
+		succ[1] = b->s2;
+		for (k=0; k<2; k++)
+			if (succ[k])
+				addpred(b, succ[k]);
+	}
+}
+
+/* recursive helper of fillrpo; a block with
+ * b->id >= 0 has already been reached, x is the
+ * next number to hand out; a block receives its
+ * number after both of its successors have been
+ * processed, and numbers are given in decreasing
+ * order; returns the next number to hand out
+ */
+static int
+rporec(Blk *b, int x)
+{
+	Blk *s1, *s2;
+
+	if (!b || b->id >= 0)
+		return x;
+	/* temporary mark, replaced by the final
+	 * number below */
+	b->id = 1;
+	s1 = b->s1;
+	s2 = b->s2;
+	/* process the successor with the smaller
+	 * loop value first */
+	if (s1 && s2 && s1->loop > s2->loop) {
+		s1 = b->s2;
+		s2 = b->s1;
+	}
+	x = rporec(s1, x);
+	x = rporec(s2, x);
+	b->id = x;
+	assert(x >= 0);
+	return x - 1;
+}
+
+/* fill the rpo information in blocks
+ *
+ * numbers the blocks reachable from f->start
+ * with rporec, unlinks the blocks that were not
+ * reached from the block list, and stores the
+ * remaining ones in f->rpo indexed by their id
+ */
+void
+fillrpo(Fn *f)
+{
+	int n;
+	Blk *b, **p;
+
+	for (b=f->start; b; b=b->link)
+		b->id = -1;
+	/* n is the count of numbers rporec did not
+	 * hand out, i.e. of blocks never reached */
+	n = 1 + rporec(f->start, f->nblk-1);
+	f->nblk -= n;
+	f->rpo = alloc(f->nblk * sizeof f->rpo[0]);
+	for (p=&f->start; *p;) {
+		b = *p;
+		if (b->id == -1) {
+			/* never reached, drop it from the list */
+			*p = b->link;
+			/* todo, free block */
+		} else {
+			/* shift ids so that they start at 0 */
+			b->id -= n;
+			f->rpo[b->id] = b;
+			p=&(*p)->link;
+		}
+	}
+}
+
+/* for dominators computation, read
+ * "A Simple, Fast Dominance Algorithm"
+ * by K. Cooper, T. Harvey, and K. Kennedy.
+ */
+
+/* walk b1 and b2 up their idom links until they
+ * meet and return the meeting block; when b1 is
+ * null, b2 is returned unchanged
+ */
+static Blk *
+inter(Blk *b1, Blk *b2)
+{
+	Blk *bt;
+
+	if (b1 == 0)
+		return b2;
+	while (b1 != b2) {
+		/* keep the block with the larger id in b1 */
+		if (b1->id < b2->id) {
+			bt = b1;
+			b1 = b2;
+			b2 = bt;
+		}
+		/* climb from b1 until its id is no
+		 * larger than the one of b2 */
+		while (b1->id > b2->id) {
+			b1 = b1->idom;
+			assert(b1);
+		}
+	}
+	return b1;
+}
+
+/* compute the idom link of every block by
+ * iterating over fn->rpo until nothing changes,
+ * then build, for each block, the list of the
+ * blocks it immediately dominates (head in dom,
+ * chained through dlink)
+ */
+static void
+filldom(Fn *fn)
+{
+	Blk *b, *d;
+	int ch, n;
+	uint p;
+
+	for (b=fn->start; b; b=b->link) {
+		b->idom = 0;
+		b->dom = 0;
+		b->dlink = 0;
+	}
+	do {
+		ch = 0;
+		/* index 0 of rpo is skipped */
+		for (n=1; n<fn->nblk; n++) {
+			b = fn->rpo[n];
+			d = 0;
+			/* only predecessors that already have
+			 * an idom, or the start block itself,
+			 * take part in the intersection */
+			for (p=0; p<b->npred; p++)
+				if (b->pred[p]->idom
+				||  b->pred[p] == fn->start)
+					d = inter(d, b->pred[p]);
+			if (d != b->idom) {
+				ch++;
+				b->idom = d;
+			}
+		}
+	} while (ch);
+	for (b=fn->start; b; b=b->link)
+		if ((d=b->idom)) {
+			assert(d != b);
+			/* push b in front of the list of d */
+			b->dlink = d->dom;
+			d->dom = b;
+		}
+}
+
+/* return 1 when b1 is reached by following the
+ * idom links of b2 (b2 itself excluded), and 0
+ * otherwise
+ */
+static int
+sdom(Blk *b1, Blk *b2)
+{
+	Blk *b;
+
+	assert(b1 && b2);
+	if (b1 == b2)
+		return 0;
+	for (b=b2; b->id > b1->id; b=b->idom)
+		;
+	return b == b1;
+}
+
+/* like sdom, but a block also dominates itself */
+static int
+dom(Blk *b1, Blk *b2)
+{
+	if (b1 == b2)
+		return 1;
+	return sdom(b1, b2);
+}
+
+/* add b to the frontier vector of a unless it
+ * is already there; the vector is created when
+ * the first element is added
+ */
+static void
+addfron(Blk *a, Blk *b)
+{
+	int k;
+
+	k = 0;
+	while (k < a->nfron)
+		if (a->fron[k++] == b)
+			return;
+	a->nfron++;
+	if (a->nfron == 1)
+		a->fron = vnew(a->nfron, sizeof a->fron[0]);
+	else
+		vgrow(&a->fron, a->nfron);
+	a->fron[a->nfron-1] = b;
+}
+
+/* fill the frontier vectors: for every edge
+ * b -> s, s is added to the frontier of b and of
+ * the blocks on its idom chain, stopping at the
+ * first one for which sdom(a, s) holds
+ */
+static void
+fillfron(Fn *fn)
+{
+	Blk *b, *a, *s, *succ[2];
+	int k;
+
+	for (b=fn->start; b; b=b->link) {
+		succ[0] = b->s1;
+		succ[1] = b->s2;
+		for (k=0; k<2; k++) {
+			s = succ[k];
+			if (!s)
+				continue;
+			for (a=b; !sdom(a, s); a=a->idom)
+				addfron(a, s);
+		}
+	}
+}
+
+/* make a new temporary that takes the name (as
+ * prefix) and the class of temporary t
+ */
+static Ref
+refindex(int t, Fn *fn)
+{
+	Tmp *old;
+
+	old = &fn->tmp[t];
+	return newtmp(old->name, old->cls, fn);
+}
+
+/* insert phi nodes for the temporaries of fn
+ *
+ * for each temporary t (from Tmp0 up to the count
+ * at entry) whose phi field is zero, definitions
+ * that are not live at the end of their block are
+ * renamed on the spot; blocks holding the other
+ * definitions seed a work list, and a phi for t is
+ * added to every frontier block reached from the
+ * work list where t is in the block's in set
+ */
+static void
+phiins(Fn *fn)
+{
+	BSet u[1], defs[1];
+	Blk *a, *b, **blist, **be, **bp;
+	Ins *i;
+	Phi *p;
+	Ref r;
+	int t, n, k, nt;
+
+	bsinit(u, fn->nblk);
+	bsinit(defs, fn->nblk);
+	/* blist is used as a stack growing downward
+	 * from be; bp is its top */
+	blist = emalloc(fn->nblk * sizeof blist[0]);
+	be = &blist[fn->nblk];
+	nt = fn->ntmp;
+	for (t=Tmp0; t<nt; t++) {
+		fn->tmp[t].visit = 0;
+		if (fn->tmp[t].phi != 0)
+			continue;
+		bszero(u);
+		k = -1;
+		bp = be;
+		for (b=fn->start; b; b=b->link) {
+			b->visit = 0;
+			/* r is the block-local replacement
+			 * for t, R while there is none */
+			r = R;
+			for (i=b->ins; i-b->ins < b->nins; i++) {
+				if (!req(r, R)) {
+					if (req(i->arg[0], TMP(t)))
+						i->arg[0] = r;
+					if (req(i->arg[1], TMP(t)))
+						i->arg[1] = r;
+				}
+				if (req(i->to, TMP(t))) {
+					if (!bshas(b->out, t)) {
+						/* t is not in the out set
+						 * of b: give this definition
+						 * its own name, or keep t when
+						 * it is the only definition */
+						if (fn->tmp[t].ndef == 1)
+							r = TMP(t);
+						else
+							r = refindex(t, fn);
+						i->to = r;
+					} else {
+						/* queue b once, and check that
+						 * all such definitions agree
+						 * on the class */
+						if (!bshas(u, b->id)) {
+							bsset(u, b->id);
+							*--bp = b;
+						}
+						if (k == -1)
+							k = i->cls;
+						assert(k == i->cls);
+					}
+				}
+			}
+			if (!req(r, R) && req(b->jmp.arg, TMP(t)))
+				b->jmp.arg = r;
+		}
+		/* defs remembers the blocks queued because
+		 * of a definition; u tracks what is
+		 * currently in the work list */
+		bscopy(defs, u);
+		while (bp != be) {
+			/* a non-zero visit makes the renaming
+			 * pass handle t */
+			fn->tmp[t].visit = t;
+			b = *bp++;
+			bsclr(u, b->id);
+			for (n=0; n<b->nfron; n++) {
+				a = b->fron[n];
+				/* consider each frontier block
+				 * only the first time it is seen */
+				if (a->visit++ == 0)
+				if (bshas(a->in, t)) {
+					p = alloc(sizeof *p);
+					p->cls = k;
+					p->to = TMP(t);
+					p->link = a->phi;
+					a->phi = p;
+					/* the new phi is a definition
+					 * too, so a goes to the work
+					 * list unless already known */
+					if (!bshas(defs, a->id))
+					if (!bshas(u, a->id)) {
+						bsset(u, a->id);
+						*--bp = a;
+					}
+				}
+			}
+		}
+	}
+	free(blist);
+}
+
+/* entry of a per-temporary name stack used
+ * during renaming
+ */
+typedef struct Name Name;
+struct Name {
+	Ref r;		/* the name itself */
+	Blk *b;		/* block recorded with the name */
+	Name *up;	/* next entry, below this one */
+};
+
+/* free list of Name entries, chained through up;
+ * fed by nfree and drained by nnew
+ */
+static Name *namel;
+
+/* build a Name entry holding r and b on top of
+ * up; an entry of the free list is reused when
+ * there is one
+ */
+static Name *
+nnew(Ref r, Blk *b, Name *up)
+{
+	Name *n;
+
+	n = namel;
+	if (n)
+		namel = n->up;
+	else
+		/* could use alloc, here
+		 * but namel should be reset
+		 */
+		n = emalloc(sizeof *n);
+	n->up = up;
+	n->b = b;
+	n->r = r;
+	return n;
+}
+
+/* give n back by pushing it on the free list */
+static void
+nfree(Name *n)
+{
+	Name *head;
+
+	head = namel;
+	namel = n;
+	n->up = head;
+}
+
+/* rename the definition *r made in block b: when
+ * the temporary has a non-zero visit field, a new
+ * temporary is created, tagged with the original
+ * index, pushed on the name stack of the original
+ * and stored in *r
+ */
+static void
+rendef(Ref *r, Blk *b, Name **stk, Fn *fn)
+{
+	Ref fresh;
+	int t;
+
+	if (req(*r, R))
+		return;
+	t = r->val;
+	if (!fn->tmp[t].visit)
+		return;
+	fresh = refindex(t, fn);
+	fn->tmp[fresh.val].visit = t;
+	stk[t] = nnew(fresh, b, stk[t]);
+	*r = fresh;
+}
+
+/* return the current name of temporary t as
+ * seen from block b: entries whose block does not
+ * dominate b are popped and released first; an
+ * empty stack yields CON_Z
+ */
+static Ref
+getstk(int t, Blk *b, Name **stk)
+{
+	Name *n, *dead;
+
+	n = stk[t];
+	while (n != 0) {
+		if (dom(n->b, b))
+			break;
+		dead = n;
+		n = dead->up;
+		nfree(dead);
+	}
+	stk[t] = n;
+	if (n)
+		return n->r;
+	/* uh, oh, warn */
+	return CON_Z;
+}
+
+/* rename the temporaries of block b, then recurse
+ * on the blocks listed in b->dom; definitions go
+ * through rendef, uses of temporaries with a
+ * non-zero visit field are replaced by the name
+ * returned by getstk, and one argument is appended
+ * to the relevant phis of each successor of b
+ */
+static void
+renblk(Blk *b, Name **stk, Fn *fn)
+{
+	Phi *p;
+	Ins *i;
+	Blk *s, **ps, *succ[3];
+	int t, m;
+
+	for (p=b->phi; p; p=p->link)
+		rendef(&p->to, b, stk, fn);
+	for (i=b->ins; i-b->ins < b->nins; i++) {
+		/* arguments are renamed before the result
+		 * of the same instruction is */
+		for (m=0; m<2; m++) {
+			t = i->arg[m].val;
+			if (rtype(i->arg[m]) == RTmp)
+			if (fn->tmp[t].visit)
+				i->arg[m] = getstk(t, b, stk);
+		}
+		rendef(&i->to, b, stk, fn);
+	}
+	t = b->jmp.arg.val;
+	if (rtype(b->jmp.arg) == RTmp)
+	if (fn->tmp[t].visit)
+		b->jmp.arg = getstk(t, b, stk);
+	/* null-terminated list of the successors */
+	succ[0] = b->s1;
+	succ[1] = b->s2;
+	succ[2] = 0;
+	for (ps=succ; (s=*ps); ps++)
+		for (p=s->phi; p; p=p->link) {
+			t = p->to.val;
+			/* visit holds the index of the
+			 * temporary whose stack is used */
+			if ((t=fn->tmp[t].visit)) {
+				m = p->narg++;
+				if (m == NPred)
+					diag("ssa: too many phi arguments");
+				p->arg[m] = getstk(t, b, stk);
+				p->blk[m] = b;
+			}
+		}
+	for (s=b->dom; s; s=s->dlink)
+		renblk(s, stk, fn);
+}
+
+/* require ndef
+ *
+ * runs, in order, filldom, fillfron, filllive,
+ * phiins and the renaming pass (renblk) on fn;
+ * with debug['N'] set, the dominators and the
+ * resulting function are printed on stderr
+ */
+void
+ssa(Fn *fn)
+{
+	Name **stk, *n;
+	int d, nt;
+	Blk *b, *b1;
+
+	/* one name stack per temporary existing at
+	 * entry */
+	nt = fn->ntmp;
+	stk = emalloc(nt * sizeof stk[0]);
+	/* debug['L'] is cleared for the duration of
+	 * this function and restored at the end */
+	d = debug['L'];
+	debug['L'] = 0;
+	filldom(fn);
+	if (debug['N']) {
+		fprintf(stderr, "\n> Dominators:\n");
+		for (b1=fn->start; b1; b1=b1->link) {
+			if (!b1->dom)
+				continue;
+			fprintf(stderr, "%10s:", b1->name);
+			for (b=b1->dom; b; b=b->dlink)
+				fprintf(stderr, " %s", b->name);
+			fprintf(stderr, "\n");
+		}
+	}
+	fillfron(fn);
+	filllive(fn);
+	phiins(fn);
+	renblk(fn->start, stk, fn);
+	/* hand the remaining stack entries back to
+	 * the free list */
+	while (nt--)
+		while ((n=stk[nt])) {
+			stk[nt] = n->up;
+			nfree(n);
+		}
+	debug['L'] = d;
+	free(stk);
+	if (debug['N']) {
+		fprintf(stderr, "\n> After SSA construction:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/lisc/test/_alt.ssa b/src/test/_alt.ssa
diff --git a/lisc/test/_dragon.ssa b/src/test/_dragon.ssa
diff --git a/lisc/test/_fix1.ssa b/src/test/_fix1.ssa
diff --git a/lisc/test/_fix2.ssa b/src/test/_fix2.ssa
diff --git a/lisc/test/_fix3.ssa b/src/test/_fix3.ssa
diff --git a/lisc/test/_fix4.ssa b/src/test/_fix4.ssa
diff --git a/lisc/test/_live.ssa b/src/test/_live.ssa
diff --git a/lisc/test/_rpo.ssa b/src/test/_rpo.ssa
diff --git a/lisc/test/_spill1.ssa b/src/test/_spill1.ssa
diff --git a/lisc/test/_spill2.ssa b/src/test/_spill2.ssa
diff --git a/lisc/test/_spill3.ssa b/src/test/_spill3.ssa
diff --git a/lisc/test/abi1.ssa b/src/test/abi1.ssa
diff --git a/lisc/test/abi2.ssa b/src/test/abi2.ssa
diff --git a/lisc/test/abi3.ssa b/src/test/abi3.ssa
diff --git a/lisc/test/abi4.ssa b/src/test/abi4.ssa
diff --git a/lisc/test/abi5.ssa b/src/test/abi5.ssa
diff --git a/lisc/test/align.ssa b/src/test/align.ssa
diff --git a/lisc/test/collatz.ssa b/src/test/collatz.ssa
diff --git a/lisc/test/cprime.ssa b/src/test/cprime.ssa
diff --git a/lisc/test/cup.ssa b/src/test/cup.ssa
diff --git a/lisc/test/dark.ssa b/src/test/dark.ssa
diff --git a/lisc/test/double.ssa b/src/test/double.ssa
diff --git a/lisc/test/echo.ssa b/src/test/echo.ssa
diff --git a/lisc/test/eucl.ssa b/src/test/eucl.ssa
diff --git a/lisc/test/euclc.ssa b/src/test/euclc.ssa
diff --git a/lisc/test/fpcnv.ssa b/src/test/fpcnv.ssa
diff --git a/src/test/go.sh b/src/test/go.sh
@@ -0,0 +1,116 @@
+#!/bin/sh
+
+TMP=/tmp/qbe.zzzz
+
+DRV=$TMP.c
+ASM=$TMP.s
+BIN=$TMP.bin
+OUT=$TMP.out
+
+# remove the driver, assembly, binary and
+# expected-output temporary files
+cleanup() {
+	rm -f \
+		$DRV $ASM \
+		$BIN $OUT
+}
+
+# print the lines of FILE ($2) located between a
+# line starting with "# >>> WHAT" ($1) and the
+# next line starting with "# <<<"; the marker
+# lines are not printed, and the first "# " as
+# well as a trailing "#" are removed from each
+# printed line
+extract() {
+	WHAT="$1"
+	FILE="$2"
+
+	awk "
+		/^# >>> $WHAT/ {
+			p = 1
+			next
+		}
+		/^# <<</ {
+			if (p)
+				p = 0
+		}
+		p
+	" $FILE \
+	| sed -e 's/# //' \
+	| sed -e 's/#$//'
+}
+
+# run one test file ($1): compile it with ./qbe,
+# link the result with the embedded driver (if
+# any) using cc, run the binary and check either
+# its output against the embedded expected output
+# or, when there is none, its exit status;
+# returns 1 on failure and exits when the file
+# does not exist
+once() {
+	T="$1"
+
+	if ! test -f $T
+	then
+		echo "invalid test file $T" >&2
+		exit 1
+	fi
+
+	echo "$T... "
+
+	if ! ./qbe $T -o $ASM
+	then
+		echo "[qbe fail]"
+		return 1
+	fi
+
+	# both sections are optional; an empty file
+	# means the section is absent
+	extract driver $T > $DRV
+	extract output $T > $OUT
+
+	if test -s $DRV
+	then
+		LNK="$DRV $ASM"
+	else
+		LNK="$ASM"
+	fi
+
+	if ! cc -g -o $BIN $LNK
+	then
+		echo "[cc fail]"
+		return 1
+	fi
+
+	if test -s $OUT
+	then
+		# RET is the exit status of diff
+		$BIN a b c | diff - $OUT
+		RET=$?
+		REASON="output"
+	else
+		$BIN a b c
+		RET=$?
+		REASON="returned $RET"
+	fi
+
+	if test $RET -ne 0
+	then
+		echo "[$REASON fail]"
+		return 1
+	fi
+
+	# move the cursor up one line and to the
+	# right, to print the status next to the name
+	printf "\033[1A\033[45C[ok]\n"
+}
+
+
+#trap cleanup TERM QUIT
+
+# entry point: "all" runs every test/*.ssa file
+# whose name does not start with an underscore and
+# reports the number of failures; any other
+# argument is run as a single test file
+if test -z "$1"
+then
+	# the usage message is a diagnostic, send it
+	# to stderr
+	echo "usage: test/go.sh {all, SSAFILE}" >&2
+	exit 1
+fi
+
+case $1 in
+	"all")
+		F=0
+		for T in test/[!_]*.ssa
+		do
+			once $T
+			F=`expr $F + $?`
+		done
+		if test $F -ge 1
+		then
+			echo
+			echo "$F test(s) failed!"
+		else
+			echo
+			echo "All is fine!"
+		fi
+		;;
+	*)
+		once $1
+		exit $?
+		;;
+esac
diff --git a/lisc/test/loop.ssa b/src/test/loop.ssa
diff --git a/lisc/test/mandel.ssa b/src/test/mandel.ssa
diff --git a/lisc/test/max.ssa b/src/test/max.ssa
diff --git a/lisc/test/prime.ssa b/src/test/prime.ssa
diff --git a/lisc/test/puts10.ssa b/src/test/puts10.ssa
diff --git a/lisc/test/sum.ssa b/src/test/sum.ssa
diff --git a/lisc/tools/abi.ml b/src/tools/abi.ml
diff --git a/src/tools/abitest.sh b/src/tools/abitest.sh
@@ -0,0 +1,104 @@
+#!/bin/sh
+
+OCAMLC=/usr/bin/ocamlc
+QBE=`pwd`/qbe
+
+# report the failing stage ($1) on stderr and
+# exit with status 1
+failure() {
+	echo "Failure at stage:" $1 1>&2
+	exit 1
+}
+
+# delete the temporary directory and its content
+cleanup() {
+	rm -r -f $TMP
+}
+
+# populate $TMP with the generator source and a
+# Makefile, then compile the generator there; the
+# directory change is confined to a subshell
+# because pushd and popd are not available in a
+# POSIX sh, in which case the Makefile would have
+# been written in the current directory
+init() {
+	cp tools/abi.ml $TMP
+
+	cat > $TMP/Makefile << EOM
+
+.PHONY: test
+test: caller.o callee.o
+	c99 -o \$@ caller.o callee.o
+%.o: %.c
+	c99 -c -o \$@ \$<
+%.o: %.ssa
+	$QBE -o \$*.s \$<
+	c99 -c -o \$@ \$*.s
+
+EOM
+
+	if ! (cd $TMP && $OCAMLC abi.ml -o gentest)
+	then
+		cleanup
+		failure "abifuzz compilation"
+	fi
+}
+
+# generate one test with caller $1 and callee $2
+# (and seed $3 when given), build it and run it;
+# any failing step ends the script
+once() {
+	if test -n "$3"
+	then
+		$TMP/gentest -s $3 $TMP $1 $2
+	else
+		$TMP/gentest $TMP $1 $2
+	fi
+	if ! make -C $TMP test > /dev/null
+	then
+		failure "building"
+	fi
+	if ! $TMP/test
+	then
+		failure "runtime"
+	fi
+}
+
+# print the command line synopsis on stderr and
+# exit with status 1
+usage() {
+	echo "usage: abitest.sh [-callssa] [-callc] [-s SEED] [-n ITERATIONS]" 1>&2
+	exit 1
+}
+
+# defaults: one iteration, caller written in C
+# and callee compiled from ssa
+N=1
+CALLER=c
+CALLEE=ssa
+
+while test -n "$1"
+do
+	case "$1" in
+	"-callssa")
+		;;
+	"-callc")
+		CALLER=ssa
+		CALLEE=c
+		;;
+	"-s")
+		test -n "$2" || usage
+		shift
+		SEED="$1"
+		;;
+	"-n")
+		test -n "$2" || usage
+		shift
+		N="$1"
+		;;
+	*)
+		usage
+		;;
+	esac
+	shift
+done
+
+TMP=`mktemp -d abifuzz.XXXXXX`
+
+init
+
+# the -s option stores its argument in SEED; when
+# it was given, run a single test with that seed
+if test -n "$SEED"
+then
+	once $CALLER $CALLEE $SEED
+else
+	for n in `seq $N`
+	do
+		once $CALLER $CALLEE
+		# show progress every 100 iterations
+		echo "$n" | grep "00$"
+	done
+fi
+
+echo "All done."
+
+cleanup
diff --git a/lisc/tools/fptox.c b/src/tools/fptox.c
diff --git a/lisc/tools/pmov.c b/src/tools/pmov.c
diff --git a/src/tools/regress.sh b/src/tools/regress.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+# run ./qbe and ./qbe.1 on every entry of test/
+# and compare their combined output; stops at the
+# first difference
+for t in test/*
+do
+	# the name is passed as an argument so that
+	# a '%' or '\' in it is printed verbatim
+	printf "Test %s ... " "$t"
+
+	./qbe   $t >/tmp/out.0 2>&1
+	./qbe.1 $t >/tmp/out.1 2>&1
+
+	if diff /tmp/out.0 /tmp/out.1 > /dev/null
+	then
+		echo "OK"
+	else
+		echo "KO"
+		break
+	fi
+done
diff --git a/src/util.c b/src/util.c
@@ -0,0 +1,329 @@
+#include "all.h"
+
+typedef struct Bitset Bitset;
+typedef struct Vec Vec;
+
+/* header placed in front of the storage of every
+ * vector created by vnew; users receive a pointer
+ * to the memory just after it
+ */
+struct Vec {
+	ulong mag;	/* set to VMag by vnew, checked by vgrow */
+	size_t esz;	/* size of one element, in bytes */
+	ulong cap;	/* capacity, in elements */
+	union {
+		long long ll;
+		long double ld;
+		void *ptr;
+	} align[];	/* start of the element storage; the
+			 * union presumably forces a suitable
+			 * alignment -- confirm */
+};
+
+enum {
+	VMin = 2,		/* smallest capacity vnew allocates */
+	VMag = 0xcabba9e,	/* value stored in Vec.mag */
+	NPtr = 256,		/* slots per chunk of the alloc pool */
+};
+
+Typ typ[NTyp];
+/* instruction buffer; emit stores instructions
+ * at --curi and fails once curi reaches insb
+ */
+Ins insb[NIns], *curi;
+
+/* state of alloc/freeall: pool is the current
+ * chunk of pointers (initially the static array
+ * ptr), slot 0 links to the previous chunk and
+ * nptr is the next free slot in the chunk
+ */
+static void *ptr[NPtr];
+static void **pool = ptr;
+static int nptr = 1;
+
+/* print s and a newline on stderr, then abort */
+void
+diag(char *s)
+{
+	fprintf(stderr, "%s\n", s);
+	abort();
+}
+
+/* allocate n zeroed bytes with calloc; running
+ * out of memory is reported through diag
+ */
+void *
+emalloc(size_t n)
+{
+	void *mem;
+
+	mem = calloc(1, n);
+	if (mem)
+		return mem;
+	diag("emalloc: out of memory");
+	return mem;
+}
+
+/* allocate n zeroed bytes and remember the
+ * pointer in the pool so that freeall can release
+ * it; returns 0 when n is 0; a new chunk of NPtr
+ * slots is chained in when the current one is
+ * full
+ */
+void *
+alloc(size_t n)
+{
+	void **chunk;
+	void *mem;
+
+	if (n == 0)
+		return 0;
+	if (nptr >= NPtr) {
+		chunk = emalloc(NPtr * sizeof(void *));
+		chunk[0] = pool;
+		pool = chunk;
+		nptr = 1;
+	}
+	mem = emalloc(n);
+	pool[nptr] = mem;
+	nptr++;
+	return mem;
+}
+
+/* free every pointer recorded by alloc, together
+ * with the chunks that alloc chained in, and
+ * leave the pool empty
+ */
+void
+freeall()
+{
+	void **pp;
+
+	for (;;) {
+		/* slot 0 is the link, the recorded
+		 * pointers start at slot 1 */
+		for (pp = &pool[1]; pp < &pool[nptr]; pp++)
+			free(*pp);
+		pp = pool[0];
+		/* a null link marks the last chunk of
+		 * the chain, which is not freed */
+		if (!pp)
+			break;
+		free(pool);
+		pool = pp;
+		/* alloc only chains a new chunk when
+		 * the current one is full */
+		nptr = NPtr;
+	}
+	nptr = 1;
+}
+
+/* allocate a block with alloc and initialize it
+ * from a zero-initialized static Blk
+ */
+Blk *
+blknew()
+{
+	static Blk zero;
+	Blk *blk;
+
+	blk = alloc(sizeof *blk);
+	*blk = zero;
+	return blk;
+}
+
+/* store a new instruction just below curi in the
+ * instruction buffer; fails through diag when the
+ * start of the buffer has been reached
+ */
+void
+emit(int op, int k, Ref to, Ref arg0, Ref arg1)
+{
+	Ins ins = {
+		.op = op, .cls = k,
+		.to = to, .arg = {arg0, arg1}
+	};
+
+	if (curi == insb)
+		diag("emit: too many instructions");
+	curi--;
+	*curi = ins;
+}
+
+/* emit an instruction built from the op, cls, to
+ * and arg fields of i
+ */
+void
+emiti(Ins i)
+{
+	Ref a0, a1;
+
+	a0 = i.arg[0];
+	a1 = i.arg[1];
+	emit(i.op, i.cls, i.to, a0, a1);
+}
+
+/* store in *pd a copy, made with alloc, of the n
+ * instructions starting at s; when n is 0, *pd is
+ * set to 0 and nothing is copied
+ */
+void
+idup(Ins **pd, Ins *s, ulong n)
+{
+	*pd = alloc(n * sizeof(Ins));
+	/* alloc returns 0 for an empty request and
+	 * memcpy must not be given a null pointer,
+	 * even with a size of 0 */
+	if (n)
+		memcpy(*pd, s, n * sizeof(Ins));
+}
+
+/* copy n instructions from s to d and return the
+ * position just after the last one written
+ */
+Ins *
+icpy(Ins *d, Ins *s, ulong n)
+{
+	size_t bytes;
+
+	bytes = n * sizeof(Ins);
+	memcpy(d, s, bytes);
+	return &d[n];
+}
+
+/* create a vector able to hold len elements of
+ * esz bytes each; the capacity is VMin doubled
+ * until it is at least len; returns a pointer to
+ * the storage that follows the Vec header
+ */
+void *
+vnew(ulong len, size_t esz)
+{
+	Vec *hdr;
+	ulong cap;
+
+	cap = VMin;
+	while (cap < len)
+		cap *= 2;
+	hdr = alloc(sizeof(Vec) + cap * esz);
+	hdr->esz = esz;
+	hdr->cap = cap;
+	hdr->mag = VMag;
+	return hdr + 1;
+}
+
+/* make sure the vector whose address is stored
+ * at vp can hold len elements; when it is too
+ * small, a larger vector is created, the old
+ * content is copied over and *vp is updated
+ */
+void
+vgrow(void *vp, ulong len)
+{
+	Vec **slot, *v;
+	void *grown;
+
+	slot = vp;
+	v = *slot - 1;
+	assert(v+1 && v->mag == VMag);
+	if (len <= v->cap)
+		return;
+	grown = vnew(len, v->esz);
+	memcpy(grown, v+1, v->cap * v->esz);
+	*slot = grown;
+}
+
+/* return the phi field of temporary t when it is
+ * set, and t itself otherwise; the section under
+ * #if 0 is never compiled
+ */
+int
+phicls(int t, Tmp *tmp /*, int c*/)
+{
+	if (tmp[t].phi)
+		return tmp[t].phi;
+	return t;
+#if 0
+	int t1;
+
+	t1 = tmp[t].phi;
+	if (!t1)
+		t1 = t;
+	if (t != t1) {
+		t1 = phitmp(t1, tmp, c);
+		if (c)
+			tmp[t].phi = t1;
+	}
+	return t1;
+#endif
+}
+
+/* add a temporary of class k to fn and return a
+ * reference to it; its name is prfx followed by
+ * the value of a counter shared by all calls, its
+ * slot is -1 and its nuse and ndef are 1
+ */
+Ref
+newtmp(char *prfx, int k,  Fn *fn)
+{
+	static int n;
+	int t;
+
+	t = fn->ntmp++;
+	vgrow(&fn->tmp, fn->ntmp);
+	/* NOTE(review): sprintf is unbounded; this
+	 * assumes the name field is large enough for
+	 * prfx plus the counter -- confirm */
+	sprintf(fn->tmp[t].name, "%s%d", prfx, ++n);
+	fn->tmp[t].cls = k;
+	fn->tmp[t].slot = -1;
+	fn->tmp[t].nuse = +1;
+	fn->tmp[t].ndef = +1;
+	return TMP(t);
+}
+
+/* return a reference to a CBits constant of fn
+ * equal to val, appending a new constant when
+ * none exists yet
+ */
+Ref
+getcon(int64_t val, Fn *fn)
+{
+	Con *k;
+	int c;
+
+	for (c=0; c<fn->ncon; c++) {
+		k = &fn->con[c];
+		if (k->type == CBits && k->bits.i == val)
+			return CON(c);
+	}
+	vgrow(&fn->con, ++fn->ncon);
+	fn->con[c] = (Con){.type = CBits, .bits.i = val};
+	return CON(c);
+}
+
+/* add the constant c1 to c0, in place; an
+ * undefined c0 simply becomes a copy of c1, and
+ * at most one of the two may be an address
+ */
+void
+addcon(Con *c0, Con *c1)
+{
+	if (c0->type == CUndef) {
+		*c0 = *c1;
+		return;
+	}
+	if (c1->type == CAddr) {
+		if (c0->type == CAddr)
+			diag("addcon: adding two addresses");
+		c0->type = CAddr;
+		strcpy(c0->label, c1->label);
+	}
+	c0->bits.i += c1->bits.i;
+}
+
+/* set up bs to hold at least n elements; the
+ * storage comes from alloc and is rounded up to
+ * whole words of NBit bits
+ */
+void
+bsinit(BSet *bs, uint n)
+{
+	uint words;
+
+	words = (n + NBit-1) / NBit;
+	bs->t = alloc(words * sizeof bs->t[0]);
+	bs->nt = words;
+}
+
+/* return the number of elements in bs */
+uint
+bscount(BSet *bs)
+{
+	uint w, k, total;
+
+	total = 0;
+	for (w=0; w<bs->nt; w++) {
+		for (k=0; k<NBit; k++) {
+			if ((bs->t[w] & BIT(k)) != 0)
+				total += 1;
+		}
+	}
+	return total;
+}
+
+/* number of elements bs has room for */
+static inline uint
+bsmax(BSet *bs)
+{
+	return NBit * bs->nt;
+}
+
+/* add elt to bs; elt must be below bsmax(bs) */
+void
+bsset(BSet *bs, uint elt)
+{
+	uint word, bit;
+
+	assert(elt < bsmax(bs));
+	word = elt / NBit;
+	bit = elt % NBit;
+	bs->t[word] |= BIT(bit);
+}
+
+/* remove elt from bs; elt must be below
+ * bsmax(bs)
+ */
+void
+bsclr(BSet *bs, uint elt)
+{
+	uint word, bit;
+
+	assert(elt < bsmax(bs));
+	word = elt / NBit;
+	bit = elt % NBit;
+	bs->t[word] &= ~BIT(bit);
+}
+
+/* BSOP(f, op) defines a function f(a, b) that
+ * applies "a->t[i] op b->t[i]" to every word of
+ * two bitsets with the same number of words; it
+ * is used below for copy (a = b), union (a |= b),
+ * intersection (a &= b) and difference
+ * (a &= ~b)
+ */
+#define BSOP(f, op)                           \
+	void                                  \
+	f(BSet *a, BSet *b)                   \
+	{                                     \
+		uint i;                       \
+		                              \
+		assert(a->nt == b->nt);       \
+		for (i=0; i<a->nt; i++)       \
+			a->t[i] op b->t[i];   \
+	}
+
+BSOP(bscopy, =)
+BSOP(bsunion, |=)
+BSOP(bsinter, &=)
+BSOP(bsdiff, &= ~)
+
+/* return 1 when a and b hold the same elements,
+ * 0 otherwise; both must have the same number of
+ * words
+ */
+int
+bsequal(BSet *a, BSet *b)
+{
+	uint w;
+
+	assert(a->nt == b->nt);
+	w = a->nt;
+	while (w-- > 0)
+		if (a->t[w] != b->t[w])
+			return 0;
+	return 1;
+}
+
+/* remove every element from bs */
+void
+bszero(BSet *bs)
+{
+	size_t bytes;
+
+	bytes = bs->nt * sizeof bs->t[0];
+	memset(bs->t, 0, bytes);
+}
+
+/* iterates on a bitset, use as follows
+ *
+ * 	for (i=0; bsiter(set, &i); i++)
+ * 		use(i);
+ *
+ * the search starts at *elt; when an element is
+ * found it is stored in *elt and 1 is returned,
+ * otherwise 0 is returned and *elt is left
+ * untouched
+ */
+int
+bsiter(BSet *bs, uint *elt)
+{
+	uint i;
+
+	for (i=*elt;; i++) {
+		/* jump over words that are entirely
+		 * empty, landing on the first bit of the
+		 * following word */
+		while (i < bsmax(bs) && !bs->t[i/NBit])
+			i = (i + NBit) & -NBit;
+		if (i >= bsmax(bs))
+			return 0;
+		if (bshas(bs, i)) {
+			*elt = i;
+			return 1;
+		}
+	}
+}
+
+/* print on f, between brackets, the names of the
+ * temporaries of bs whose index is Tmp0 or more
+ */
+void
+dumpts(BSet *bs, Tmp *tmp, FILE *f)
+{
+	uint t;
+
+	fputs("[", f);
+	t = Tmp0;
+	while (bsiter(bs, &t)) {
+		fprintf(f, " %s", tmp[t].name);
+		t++;
+	}
+	fputs(" ]\n", f);
+}

	qbe Internal scc patchset buffer for QBE
	Log \| Files \| Refs \| README \| LICENSE

D	lisc/.gitignore	\|	5	-----
D	lisc/Makefile	\|	17	-----------------
D	lisc/copy.c	\|	159	-------------------------------------------------------------------------------
D	lisc/emit.c	\|	666	-------------------------------------------------------------------------------
D	lisc/isel.c	\|	1135	-------------------------------------------------------------------------------
D	lisc/live.c	\|	174	-------------------------------------------------------------------------------
D	lisc/main.c	\|	117	-------------------------------------------------------------------------------
D	lisc/mem.c	\|	81	-------------------------------------------------------------------------------
D	lisc/parse.c	\|	1081	-------------------------------------------------------------------------------
D	lisc/rega.c	\|	597	-------------------------------------------------------------------------------
D	lisc/spill.c	\|	507	-------------------------------------------------------------------------------
D	lisc/ssa.c	\|	516	-------------------------------------------------------------------------------
D	lisc/test/go.sh	\|	116	-------------------------------------------------------------------------------
D	lisc/tools/abitest.sh	\|	104	-------------------------------------------------------------------------------
D	lisc/tools/regress.sh	\|	17	-----------------
D	lisc/util.c	\|	329	-------------------------------------------------------------------------------
M	minic/mcc	\|	2	+-
A	src/.gitignore	\|	5	+++++
R	lisc/.tag -> src/.tag	\|	0
A	src/Makefile	\|	17	+++++++++++++++++
R	lisc/lisc.h -> src/all.h	\|	0
A	src/copy.c	\|	159	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/emit.c	\|	666	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/isel.c	\|	1135	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/live.c	\|	174	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/main.c	\|	117	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/mem.c	\|	81	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/parse.c	\|	1081	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/rega.c	\|	598	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/spill.c	\|	507	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/ssa.c	\|	516	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R	lisc/test/_alt.ssa -> src/test/_alt.ssa	\|	0
R	lisc/test/_dragon.ssa -> src/test/_dragon.ssa	\|	0
R	lisc/test/_fix1.ssa -> src/test/_fix1.ssa	\|	0
R	lisc/test/_fix2.ssa -> src/test/_fix2.ssa	\|	0
R	lisc/test/_fix3.ssa -> src/test/_fix3.ssa	\|	0
R	lisc/test/_fix4.ssa -> src/test/_fix4.ssa	\|	0
R	lisc/test/_live.ssa -> src/test/_live.ssa	\|	0
R	lisc/test/_rpo.ssa -> src/test/_rpo.ssa	\|	0
R	lisc/test/_spill1.ssa -> src/test/_spill1.ssa	\|	0
R	lisc/test/_spill2.ssa -> src/test/_spill2.ssa	\|	0
R	lisc/test/_spill3.ssa -> src/test/_spill3.ssa	\|	0
R	lisc/test/abi1.ssa -> src/test/abi1.ssa	\|	0
R	lisc/test/abi2.ssa -> src/test/abi2.ssa	\|	0
R	lisc/test/abi3.ssa -> src/test/abi3.ssa	\|	0
R	lisc/test/abi4.ssa -> src/test/abi4.ssa	\|	0
R	lisc/test/abi5.ssa -> src/test/abi5.ssa	\|	0
R	lisc/test/align.ssa -> src/test/align.ssa	\|	0
R	lisc/test/collatz.ssa -> src/test/collatz.ssa	\|	0
R	lisc/test/cprime.ssa -> src/test/cprime.ssa	\|	0
R	lisc/test/cup.ssa -> src/test/cup.ssa	\|	0
R	lisc/test/dark.ssa -> src/test/dark.ssa	\|	0
R	lisc/test/double.ssa -> src/test/double.ssa	\|	0
R	lisc/test/echo.ssa -> src/test/echo.ssa	\|	0
R	lisc/test/eucl.ssa -> src/test/eucl.ssa	\|	0
R	lisc/test/euclc.ssa -> src/test/euclc.ssa	\|	0
R	lisc/test/fpcnv.ssa -> src/test/fpcnv.ssa	\|	0
A	src/test/go.sh	\|	116	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R	lisc/test/loop.ssa -> src/test/loop.ssa	\|	0
R	lisc/test/mandel.ssa -> src/test/mandel.ssa	\|	0
R	lisc/test/max.ssa -> src/test/max.ssa	\|	0
R	lisc/test/prime.ssa -> src/test/prime.ssa	\|	0
R	lisc/test/puts10.ssa -> src/test/puts10.ssa	\|	0
R	lisc/test/sum.ssa -> src/test/sum.ssa	\|	0
R	lisc/tools/abi.ml -> src/tools/abi.ml	\|	0
A	src/tools/abitest.sh	\|	104	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R	lisc/tools/fptox.c -> src/tools/fptox.c	\|	0
R	lisc/tools/pmov.c -> src/tools/pmov.c	\|	0
A	src/tools/regress.sh	\|	17	+++++++++++++++++
A	src/util.c	\|	329	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++