commit abd03eecf666bf864a81f920e0c1f625dbf69f37
parent fa17e40f3662b219eed94f29eb4b1c36e31bf493
Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date: Thu, 21 Sep 2023 09:41:27 +0200
as: Use a perfect hash for instructions
The parser of as was using a dicotomic search to locate the
Ins structure with all the information about the instruction
and possible operands. As the instructions are known at
compile time we can train a perfect hash to have a O(1)
time.
Diffstat:
12 files changed, 202 insertions(+), 17 deletions(-)
diff --git a/src/cmd/as/.gitignore b/src/cmd/as/.gitignore
@@ -0,0 +1 @@
+lexh
diff --git a/src/cmd/as/Makefile b/src/cmd/as/Makefile
@@ -25,7 +25,16 @@ all: $(TARGET)
$(TARGET): $(LIBSCC)
clean:
- rm -f target/*/*.o target/*/*tbl.c
+ rm -f target/*/*.o target/*/*tbl.c lexh
+
+genhash.o: ../../libscc/genhash.c
+ $(HOSTCC) -c ../../libscc/genhash.c
+
+lexh.o: lexh.c
+ $(HOSTCC) -c lexh.c
+
+lexh: lexh.o genhash.o
+ $(HOSTCC) -o $@ lexh.o genhash.o
include target/powerpc/powerpc64.mk
include target/powerpc/powerpc.mk
diff --git a/src/cmd/as/as.h b/src/cmd/as/as.h
@@ -201,8 +201,9 @@ extern char *tobytes(TUINT v, int n, int inc);
/*
* Definition of global variables
*/
+extern unsigned long M, S, K;
+extern short hashmap[];
extern Section *cursec, *seclist;
-extern int nr_ins;
extern Ins instab[];
extern Op optab[];
extern int pass;
diff --git a/src/cmd/as/lexh.c b/src/cmd/as/lexh.c
@@ -0,0 +1,153 @@
+#include <signal.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+
+#ifndef SIGHUP
+#define SIGHUP 0
+#endif
+
+extern unsigned genhash(char *);
+
+static sig_atomic_t signalled;
+static unsigned long M, K, S;
+
+static unsigned
+hash(char *s)
+{
+ unsigned h;
+
+ h = genhash(s);
+
+ return (unsigned short) h;
+}
+
+static int
+foundmap(unsigned *th, int ntok)
+{
+ int i;
+ unsigned h;
+ char bmap[USHRT_MAX];
+
+ for (K = 1; (unsigned short) (K+1) != 0; K += 2) {
+ if (signalled) {
+ signalled = 0;
+ fprintf(stderr, "M=%lu,K=%lu,S=%lu\n", M, K, S);
+ }
+ memset(bmap, 0, sizeof(bmap));
+
+ for (i = 0; i < ntok; i++) {
+ h = (th[i]*K >> S) & M;
+ if (bmap[h])
+ break;
+ bmap[h] = 1;
+ }
+
+ if (i == ntok)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void
+phash(char *toks[], int ntok)
+{
+ int i, j, nbits;
+ unsigned *th, h;
+
+ if ((th = calloc(ntok, sizeof(*th))) == NULL) {
+ perror("hash");
+ exit(EXIT_FAILURE);
+ }
+
+ for (i = 0; i < ntok; i++) {
+ h = hash(toks[i]);
+ for (j = 0; j < i && th[j] != h; j++)
+ ;
+
+ if (i != j) {
+ fprintf(stderr,
+ "hash: collision %s-%s\n",
+ toks[i],
+ toks[j]);
+ exit(EXIT_FAILURE);
+ }
+ th[i] = h;
+ }
+
+ for (nbits = 1; 1<<nbits < ntok; ++nbits)
+ ;
+
+ for (i = nbits+1; i < 16; i++) {
+ M = ~((unsigned) -1 << i);
+
+ for (j = nbits; j < 32; j++) {
+ S = 32 - j;
+
+ if (foundmap(th, ntok))
+ goto found;
+ }
+ }
+
+ fputs("hash: not possible perfect hash\n", stderr);
+ exit(EXIT_FAILURE);
+
+found:
+ printf("unsigned long K=%lu, M=%lu, S=%lu;\n", K, M, S);
+
+ printf("short hashmap[%ld] = {\n", 1<<i);
+ for (i = 0; i < ntok; i++)
+ printf("\t[%d] = %d,\n", (th[i]*K >> S) & M, i+1);
+ puts("};");
+}
+
+static void
+sighandler(int num)
+{
+ signalled = 1;
+ signal(SIGHUP, sighandler);
+}
+
+int
+main()
+{
+ int nr, n;
+ char line[BUFSIZ], **tokens, *tok;
+
+ nr = 0;
+ tokens = NULL;
+ for (;;) {
+ if (!fgets(line, sizeof(line), stdin))
+ break;
+ n = strlen(line);
+ if (n == 0)
+ continue;
+ if (line[n-1] != '\n') {
+ fputs("error: line truncated\n", stderr);
+ exit(EXIT_FAILURE);
+ }
+ line[n-1] = '\0';
+
+ tok = malloc(n);
+ tokens = realloc(tokens, (nr+1) * sizeof(*tokens));
+ if (!tokens || !tok) {
+ perror("hash");
+ exit(EXIT_FAILURE);
+ }
+ tokens[nr++] = memcpy(tok, line, n);
+ }
+
+ if (ferror(stdin)) {
+ perror("hash");
+ exit(EXIT_FAILURE);
+ }
+
+ signal(SIGHUP, sighandler);
+
+ if (nr != 0)
+ phash(tokens, nr);
+
+ return 0;
+}
diff --git a/src/cmd/as/main.c b/src/cmd/as/main.c
@@ -59,19 +59,38 @@ cmp(const void *f1, const void *f2)
return strcmp(s, ins->str);
}
-static void
-translate(char *text, char *xargs)
+static Ins *
+decode(char *s)
{
int c;
char *p;
+ unsigned h, id;
Ins *ins;
- Op *op, *lim;
- Node **args;
- for (p = text; c = *p; ++p)
+ for (p = s; c = *p; ++p)
*p = toupper(c);
- ins = bsearch(text, instab, nr_ins, sizeof(Ins), cmp);
+ h = (unsigned short) genhash(s);
+ h = (h*K >> S) & M;
+ id = hashmap[h];
+ if (id == 0)
+ return NULL;
+
+ ins = &instab[id-1];
+ if (strcmp(ins->str, s) != 0)
+ return NULL;
+
+ return ins;
+}
+
+static void
+translate(char *text, char *xargs)
+{
+ Ins *ins;
+ Op *op, *lim;
+ Node **args;
+
+ ins = decode(text);
if (!ins) {
error("invalid instruction '%s'", text);
return;
diff --git a/src/cmd/as/mktbl.awk b/src/cmd/as/mktbl.awk
@@ -36,18 +36,20 @@ END {
for (i in formats)
printf "Format %s;\n", i
- printf "int nr_ins = %d;\n\n", nop
print "struct ins instab[] = {"
for (i = 0; i < nop; i++) {
n = opnames[i]
start = opstart[n]
end = start + opcount[n]
printf "\t{.str = \"%s\", .begin = %d, .end = %d},\n",
- n, start, end | "sort"
+ n, start, end
}
- close("sort")
printf "};\n\n"
+ for (i = 0; i < nop; i++)
+ print opnames[i] | "./lexh"
+ close("./lexh")
+
print "struct op optab[] = {"
for (i = 0; i < nvar; i++) {
printf "\t/* %d */\n", i
diff --git a/src/cmd/as/target/powerpc/powerpc.mk b/src/cmd/as/target/powerpc/powerpc.mk
@@ -5,7 +5,7 @@ POWERPC_OBJ =\
$(POWERPC)/powerpc.o\
$(POWERPC)/ins.o\
-$(POWERPC)/powerpctbl.c: $(POWERPC)/ops.dat $(POWERPC)/opers.dat
+$(POWERPC)/powerpctbl.c: $(POWERPC)/ops.dat $(POWERPC)/opers.dat lexh
./mktbl -f powerpc -c powerpc
$(LIBEXEC)/scc/as-powerpc: $(POWERPC_OBJ)
diff --git a/src/cmd/as/target/powerpc/powerpc64.mk b/src/cmd/as/target/powerpc/powerpc64.mk
@@ -5,7 +5,7 @@ POWERPC64_OBJ =\
$(POWERPC)/powerpc64.o\
$(POWERPC)/ins.o\
-$(POWERPC)/powerpc64tbl.c: $(POWERPC)/ops.dat $(POWERPC)/opers.dat
+$(POWERPC)/powerpc64tbl.c: $(POWERPC)/ops.dat $(POWERPC)/opers.dat lexh
./mktbl -f powerpc -c powerpc64
$(LIBEXEC)/scc/as-powerpc64: $(POWERPC64_OBJ)
diff --git a/src/cmd/as/target/x80/z80.mk b/src/cmd/as/target/x80/z80.mk
@@ -4,7 +4,7 @@ Z80_OBJ =\
target/x80/z80.o\
target/x80/ins.o\
-target/x80/z80tbl.c: target/x80/ops.dat target/x80/opers.dat
+target/x80/z80tbl.c: target/x80/ops.dat target/x80/opers.dat lexh
./mktbl -f x80 -c z80
$(LIBEXEC)/scc/as-z80: $(OBJ) $(Z80_OBJ)
diff --git a/src/cmd/as/target/x86/amd64.mk b/src/cmd/as/target/x86/amd64.mk
@@ -4,7 +4,7 @@ AMD64_OBJ =\
target/x86/amd64.o\
target/x86/ins.o\
-target/x86/amd64tbl.c: target/x86/ops.dat target/x86/opers.dat
+target/x86/amd64tbl.c: target/x86/ops.dat target/x86/opers.dat lexh
./mktbl -f x86 -c amd64
$(LIBEXEC)/scc/as-amd64: $(AMD64_OBJ)
diff --git a/src/cmd/as/target/x86/i286.mk b/src/cmd/as/target/x86/i286.mk
@@ -4,7 +4,7 @@ I286_OBJ =\
target/x86/i286.o\
target/x86/ins.o\
-target/x86/i286tbl.c: target/x86/ops.dat target/x86/opers.dat
+target/x86/i286tbl.c: target/x86/ops.dat target/x86/opers.dat lexh
./mktbl -f x86 -c i286
$(LIBEXEC)/scc/as-i286: $(I286_OBJ)
diff --git a/src/cmd/as/target/x86/i386.mk b/src/cmd/as/target/x86/i386.mk
@@ -4,7 +4,7 @@ I386_OBJ =\
target/x86/i386.o\
target/x86/ins.o\
-target/x86/i386tbl.c: target/x86/ops.dat target/x86/opers.dat
+target/x86/i386tbl.c: target/x86/ops.dat target/x86/opers.dat lexh
./mktbl -f x86 -c i386
$(LIBEXEC)/scc/as-i386: $(I386_OBJ)