qbe

Internal scc patchset buffer for QBE
Log | Files | Refs | README | LICENSE

commit 577e93fe6d729b63447faad471fd0f5f2296f667
parent b03a8970d7b73959397f0ca5c8f2a532c1905e5d
Author: Quentin Carbonneaux <quentin@c9x.me>
Date:   Mon,  3 Oct 2022 10:40:39 +0200

thread-local storage for arm64_apple

It is documented nowhere how this is
supposed to work. It is also quite easy
to have assertion failures pop up in the
linker when generating asm slightly
different from clang's!

The best source of information is found
in LLVM's source code (AArch64ISelLowering.cpp).
I paste it here for future reference:

/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i64] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first xword, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "x0".
///
/// Since this descriptor may be in a different unit, in general even the
/// descriptor must be accessed via an indirect load. The "ideal" code sequence
/// is:
///     adrp x0, _var@TLVPPAGE
///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
///                                      ; the function pointer
///     blr x1                           ; Uses descriptor address in x0
///     ; Address of _var is now in x0.
///
/// If the address of _var's descriptor *is* known to the linker, then it can
/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
/// a slight efficiency gain.

The call 'blr x1' above is actually
special in that it trashes fewer registers
than what the ABI would normally permit.
In qbe, I don't take advantage of this
and lower the call like a regular call.
We can revise this later on. Again, the
source for this information is LLVM's
source code:

// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).

Diffstat:
M arm64/emit.c | 65 +++++++++++++++++++++++++++++++++++++++++------------------------
M arm64/isel.c | 31 ++++++++++++++++++++++++++++---
M emit.c | 28 +++++++++++++++++++++++-----
3 files changed, 92 insertions(+), 32 deletions(-)

diff --git a/arm64/emit.c b/arm64/emit.c @@ -245,33 +245,50 @@ emitf(char *s, Ins *i, E *e) static void loadaddr(Con *c, char *rn, E *e) { - static char *ldsym[][2] = { - /* arm64 */ - [0][0] = "\tadrp\t%s, %s%s%s\n", - [0][1] = "\tadd\t%s, %s, #:lo12:%s%s%s\n", - /* apple */ - [1][0] = "\tadrp\t%s, %s%s@page%s\n", - [1][1] = "\tadd\t%s, %s, %s%s@pageoff%s\n", - }; - char *p, *l, off[32]; + char *p, *l, *s; + + switch (c->reloc) { + default: + die("unreachable"); + case RelDef: + if (T.apple) + s = "\tadrp\tR, S@pageO\n" + "\tadd\tR, R, S@pageoffO\n"; + else + s = "\tadrp\tR, SO\n" + "\tadd\tR, R, #:lo12:SO\n"; + break; + case RelThr: + if (T.apple) + s = "\tadrp\tR, S@tlvppage\n" + "\tldr\tR, [R, S@tlvppageoff]\n"; + else + s = "\tmrs\tR, tpidr_el0\n" + "\tadd\tR, R, #:tprel_hi12:SO, lsl #12\n" + "\tadd\tR, R, #:tprel_lo12_nc:SO\n"; + break; + } - if (c->bits.i) - /* todo, handle large offsets */ - sprintf(off, "+%"PRIi64, c->bits.i); - else - off[0] = 0; l = str(c->label); p = l[0] == '"' ? 
"" : T.assym; - if (c->reloc == RelThr) { - fprintf(e->f, "\tmrs\t%s, tpidr_el0\n", rn); - fprintf(e->f, "\tadd\t%s, %s, #:tprel_hi12:%s%s%s, lsl #12\n", - rn, rn, p, l, off); - fprintf(e->f, "\tadd\t%s, %s, #:tprel_lo12_nc:%s%s%s\n", - rn, rn, p, l, off); - } else { - fprintf(e->f, ldsym[T.apple != 0][0], rn, p, l, off); - fprintf(e->f, ldsym[T.apple != 0][1], rn, rn, p, l, off); - } + for (; *s; s++) + switch (*s) { + default: + fputc(*s, e->f); + break; + case 'R': + fputs(rn, e->f); + break; + case 'S': + fputs(p, e->f); + fputs(l, e->f); + break; + case 'O': + if (c->bits.i) + /* todo, handle large offsets */ + fprintf(e->f, "+%"PRIi64, c->bits.i); + break; + } } static void diff --git a/arm64/isel.c b/arm64/isel.c @@ -70,20 +70,45 @@ static void fixarg(Ref *pr, int k, int phi, Fn *fn) { char buf[32]; - Ref r0, r1, r2; + Ref r0, r1, r2, r3; int s, n; - Con *c; + Con *c, cc; r0 = *pr; switch (rtype(r0)) { case RCon: + c = &fn->con[r0.val]; + if (T.apple + && c->type == CAddr + && c->reloc == RelThr) { + r1 = newtmp("isel", Kl, fn); + *pr = r1; + if (c->bits.i) { + r2 = newtmp("isel", Kl, fn); + cc = (Con){.type = CBits}; + cc.bits.i = c->bits.i; + r3 = newcon(&cc, fn); + emit(Oadd, Kl, r1, r2, r3); + r1 = r2; + } + emit(Ocopy, Kl, r1, TMP(R0), R); + r1 = newtmp("isel", Kl, fn); + r2 = newtmp("isel", Kl, fn); + emit(Ocall, 0, R, r1, CALL(33)); + emit(Ocopy, Kl, TMP(R0), r2, R); + emit(Oload, Kl, r1, r2, R); + cc = *c; + cc.bits.i = 0; + r3 = newcon(&cc, fn); + emit(Ocopy, Kl, r2, r3, R); + break; + } if (KBASE(k) == 0 && phi) return; r1 = newtmp("isel", k, fn); if (KBASE(k) == 0) { emit(Ocopy, k, r1, r0, R); } else { - c = &fn->con[r0.val]; n = stashbits(&c->bits, KWIDE(k) ? 8 : 4); vgrow(&fn->con, ++fn->ncon); c = &fn->con[fn->ncon-1]; diff --git a/emit.c b/emit.c @@ -17,8 +17,27 @@ emitlnk(char *n, Lnk *l, int s, FILE *f) [1][SecData] = ".section .tdata,\"awT\"", [1][SecBss] = ".section .tbss,\"awT\"", }; - char *p; - + char *pfx, *sfx; + + pfx = n[0] == '"' ? 
"" : T.assym; + sfx = ""; + if (T.apple && l->thread) { + l->sec = "__DATA"; + l->secf = "__thread_data,thread_local_regular"; + sfx = "$tlv$init"; + fputs( + ".section __DATA,__thread_vars," + "thread_local_variables\n", + f + ); + fprintf(f, "%s%s:\n", pfx, n); + fprintf(f, + "\t.quad __tlv_bootstrap\n" + "\t.quad 0\n" + "\t.quad %s%s%s\n\n", + pfx, n, sfx + ); + } if (l->sec) { fprintf(f, ".section %s", l->sec); if (l->secf) @@ -28,10 +47,9 @@ emitlnk(char *n, Lnk *l, int s, FILE *f) fputc('\n', f); if (l->align) fprintf(f, ".balign %d\n", l->align); - p = n[0] == '"' ? "" : T.assym; if (l->export) - fprintf(f, ".globl %s%s\n", p, n); - fprintf(f, "%s%s:\n", p, n); + fprintf(f, ".globl %s%s\n", pfx, n); + fprintf(f, "%s%s%s:\n", pfx, n, sfx); } void