scc

simple c99 compiler
git clone git://git.simple-cc.org/scc
Log | Files | Refs | Submodules | README | LICENSE

commit abd03eecf666bf864a81f920e0c1f625dbf69f37
parent fa17e40f3662b219eed94f29eb4b1c36e31bf493
Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date:   Thu, 21 Sep 2023 09:41:27 +0200

as: Use a perfect hash for instructions

The parser of as was using a dicotomic search to locate the
Ins structure with all the information about the instruction
and possible operands. As the instructions are known at
compile time we can train a perfect hash to have a O(1)
time.

Diffstat:
Asrc/cmd/as/.gitignore | 1+
Msrc/cmd/as/Makefile | 11++++++++++-
Msrc/cmd/as/as.h | 3++-
Asrc/cmd/as/lexh.c | 153+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/cmd/as/main.c | 31+++++++++++++++++++++++++------
Msrc/cmd/as/mktbl.awk | 8+++++---
Msrc/cmd/as/target/powerpc/powerpc.mk | 2+-
Msrc/cmd/as/target/powerpc/powerpc64.mk | 2+-
Msrc/cmd/as/target/x80/z80.mk | 2+-
Msrc/cmd/as/target/x86/amd64.mk | 2+-
Msrc/cmd/as/target/x86/i286.mk | 2+-
Msrc/cmd/as/target/x86/i386.mk | 2+-
12 files changed, 202 insertions(+), 17 deletions(-)

diff --git a/src/cmd/as/.gitignore b/src/cmd/as/.gitignore @@ -0,0 +1 @@ +lexh diff --git a/src/cmd/as/Makefile b/src/cmd/as/Makefile @@ -25,7 +25,16 @@ all: $(TARGET) $(TARGET): $(LIBSCC) clean: - rm -f target/*/*.o target/*/*tbl.c + rm -f target/*/*.o target/*/*tbl.c lexh + +genhash.o: ../../libscc/genhash.c + $(HOSTCC) -c ../../libscc/genhash.c + +lexh.o: lexh.c + $(HOSTCC) -c lexh.c + +lexh: lexh.o genhash.o + $(HOSTCC) -o $@ lexh.o genhash.o include target/powerpc/powerpc64.mk include target/powerpc/powerpc.mk diff --git a/src/cmd/as/as.h b/src/cmd/as/as.h @@ -201,8 +201,9 @@ extern char *tobytes(TUINT v, int n, int inc); /* * Definition of global variables */ +extern unsigned long M, S, K; +extern short hashmap[]; extern Section *cursec, *seclist; -extern int nr_ins; extern Ins instab[]; extern Op optab[]; extern int pass; diff --git a/src/cmd/as/lexh.c b/src/cmd/as/lexh.c @@ -0,0 +1,153 @@ +#include <signal.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <limits.h> + +#ifndef SIGHUP +#define SIGHUP 0 +#endif + +extern unsigned genhash(char *); + +static sig_atomic_t signalled; +static unsigned long M, K, S; + +static unsigned +hash(char *s) +{ + unsigned h; + + h = genhash(s); + + return (unsigned short) h; +} + +static int +foundmap(unsigned *th, int ntok) +{ + int i; + unsigned h; + char bmap[USHRT_MAX]; + + for (K = 1; (unsigned short) (K+1) != 0; K += 2) { + if (signalled) { + signalled = 0; + fprintf(stderr, "M=%lu,K=%lu,S=%lu\n", M, K, S); + } + memset(bmap, 0, sizeof(bmap)); + + for (i = 0; i < ntok; i++) { + h = (th[i]*K >> S) & M; + if (bmap[h]) + break; + bmap[h] = 1; + } + + if (i == ntok) + return 1; + } + + return 0; +} + +static void +phash(char *toks[], int ntok) +{ + int i, j, nbits; + unsigned *th, h; + + if ((th = calloc(ntok, sizeof(*th))) == NULL) { + perror("hash"); + exit(EXIT_FAILURE); + } + + for (i = 0; i < ntok; i++) { + h = hash(toks[i]); + for (j = 0; j < i && th[j] != h; j++) + ; + + if (i != j) { + fprintf(stderr, + "hash: collision %s-%s\n", + toks[i], + toks[j]); + exit(EXIT_FAILURE); + } + th[i] = h; + } + + for (nbits = 1; 1<<nbits < ntok; ++nbits) + ; + + for (i = nbits+1; i < 16; i++) { + M = ~((unsigned) -1 << i); + + for (j = nbits; j < 32; j++) { + S = 32 - j; + + if (foundmap(th, ntok)) + goto found; + } + } + + fputs("hash: not possible perfect hash\n", stderr); + exit(EXIT_FAILURE); + +found: + printf("unsigned long K=%lu, M=%lu, S=%lu;\n", K, M, S); + + printf("short hashmap[%ld] = {\n", 1<<i); + for (i = 0; i < ntok; i++) + printf("\t[%d] = %d,\n", (th[i]*K >> S) & M, i+1); + puts("};"); +} + +static void +sighandler(int num) +{ + signalled = 1; + signal(SIGHUP, sighandler); +} + +int +main() +{ + int nr, n; + char line[BUFSIZ], **tokens, *tok; + + nr = 0; + tokens = NULL; + for (;;) { + if (!fgets(line, sizeof(line), stdin)) + break; + n = strlen(line); + if (n == 0) + continue; + if (line[n-1] != '\n') { + fputs("error: line truncated\n", stderr); + exit(EXIT_FAILURE); + } + line[n-1] = '\0'; + + tok = malloc(n); + tokens = realloc(tokens, (nr+1) * sizeof(*tokens)); + if (!tokens || !tok) { + perror("hash"); + exit(EXIT_FAILURE); + } + tokens[nr++] = memcpy(tok, line, n); + } + + if (ferror(stdin)) { + perror("hash"); + exit(EXIT_FAILURE); + } + + signal(SIGHUP, sighandler); + + if (nr != 0) + phash(tokens, nr); + + return 0; +} diff --git a/src/cmd/as/main.c b/src/cmd/as/main.c @@ -59,19 +59,38 @@ cmp(const void *f1, const void *f2) return strcmp(s, ins->str); } -static void -translate(char *text, char *xargs) +static Ins * +decode(char *s) { int c; char *p; + unsigned h, id; Ins *ins; - Op *op, *lim; - Node **args; - for (p = text; c = *p; ++p) + for (p = s; c = *p; ++p) *p = toupper(c); - ins = bsearch(text, instab, nr_ins, sizeof(Ins), cmp); + h = (unsigned short) genhash(s); + h = (h*K >> S) & M; + id = hashmap[h]; + if (id == 0) + return NULL; + + ins = &instab[id-1]; + if (strcmp(ins->str, s) != 0) + return NULL; + + return ins; +} + +static void +translate(char *text, char *xargs) +{ + Ins *ins; + Op *op, *lim; + Node **args; + + ins = decode(text); if (!ins) { error("invalid instruction '%s'", text); return; diff --git a/src/cmd/as/mktbl.awk b/src/cmd/as/mktbl.awk @@ -36,18 +36,20 @@ END { for (i in formats) printf "Format %s;\n", i - printf "int nr_ins = %d;\n\n", nop print "struct ins instab[] = {" for (i = 0; i < nop; i++) { n = opnames[i] start = opstart[n] end = start + opcount[n] printf "\t{.str = \"%s\", .begin = %d, .end = %d},\n", - n, start, end | "sort" + n, start, end } - close("sort") printf "};\n\n" + for (i = 0; i < nop; i++) + print opnames[i] | "./lexh" + close("./lexh") + print "struct op optab[] = {" for (i = 0; i < nvar; i++) { printf "\t/* %d */\n", i diff --git a/src/cmd/as/target/powerpc/powerpc.mk b/src/cmd/as/target/powerpc/powerpc.mk @@ -5,7 +5,7 @@ POWERPC_OBJ =\ $(POWERPC)/powerpc.o\ $(POWERPC)/ins.o\ -$(POWERPC)/powerpctbl.c: $(POWERPC)/ops.dat $(POWERPC)/opers.dat +$(POWERPC)/powerpctbl.c: $(POWERPC)/ops.dat $(POWERPC)/opers.dat lexh ./mktbl -f powerpc -c powerpc $(LIBEXEC)/scc/as-powerpc: $(POWERPC_OBJ) diff --git a/src/cmd/as/target/powerpc/powerpc64.mk b/src/cmd/as/target/powerpc/powerpc64.mk @@ -5,7 +5,7 @@ POWERPC64_OBJ =\ $(POWERPC)/powerpc64.o\ $(POWERPC)/ins.o\ -$(POWERPC)/powerpc64tbl.c: $(POWERPC)/ops.dat $(POWERPC)/opers.dat +$(POWERPC)/powerpc64tbl.c: $(POWERPC)/ops.dat $(POWERPC)/opers.dat lexh ./mktbl -f powerpc -c powerpc64 $(LIBEXEC)/scc/as-powerpc64: $(POWERPC64_OBJ) diff --git a/src/cmd/as/target/x80/z80.mk b/src/cmd/as/target/x80/z80.mk @@ -4,7 +4,7 @@ Z80_OBJ =\ target/x80/z80.o\ target/x80/ins.o\ -target/x80/z80tbl.c: target/x80/ops.dat target/x80/opers.dat +target/x80/z80tbl.c: target/x80/ops.dat target/x80/opers.dat lexh ./mktbl -f x80 -c z80 $(LIBEXEC)/scc/as-z80: $(OBJ) $(Z80_OBJ) diff --git a/src/cmd/as/target/x86/amd64.mk b/src/cmd/as/target/x86/amd64.mk @@ -4,7 +4,7 @@ AMD64_OBJ =\ target/x86/amd64.o\ target/x86/ins.o\ -target/x86/amd64tbl.c: target/x86/ops.dat target/x86/opers.dat +target/x86/amd64tbl.c: target/x86/ops.dat target/x86/opers.dat lexh ./mktbl -f x86 -c amd64 $(LIBEXEC)/scc/as-amd64: $(AMD64_OBJ) diff --git a/src/cmd/as/target/x86/i286.mk b/src/cmd/as/target/x86/i286.mk @@ -4,7 +4,7 @@ I286_OBJ =\ target/x86/i286.o\ target/x86/ins.o\ -target/x86/i286tbl.c: target/x86/ops.dat target/x86/opers.dat +target/x86/i286tbl.c: target/x86/ops.dat target/x86/opers.dat lexh ./mktbl -f x86 -c i286 $(LIBEXEC)/scc/as-i286: $(I286_OBJ) diff --git a/src/cmd/as/target/x86/i386.mk b/src/cmd/as/target/x86/i386.mk @@ -4,7 +4,7 @@ I386_OBJ =\ target/x86/i386.o\ target/x86/ins.o\ -target/x86/i386tbl.c: target/x86/ops.dat target/x86/opers.dat +target/x86/i386tbl.c: target/x86/ops.dat target/x86/opers.dat lexh ./mktbl -f x86 -c i386 $(LIBEXEC)/scc/as-i386: $(I386_OBJ)