scc

simple c99 compiler
git clone git://git.simple-cc.org/scc
Log | Files | Refs | Submodules | README | LICENSE

commit dee0c6f0c90f7d64fd45cb6e3c48321f4beaf81d
parent 2eaef0900f5ebd9f00bebd8ce899423a0b37b4bb
Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date:   Wed, 26 Feb 2025 10:31:51 +0100

libc/wchar: Fix Unicode handling

* mbrtowc: validate input, handle 4-byte UTF-8 code points, set errno
* wcrtomb: the leading byte of an N-byte UTF-8 sequence (N >= 2) has its top
  N bits set, not the top N-1 bits; ASCII characters are a single-byte special case
* _validutf8: negate condition

Diffstat:
M src/libc/wchar/_validutf8.c | 2 +-
M src/libc/wchar/mbrtowc.c | 26 +++++++++++++++++---------
M src/libc/wchar/wcrtomb.c | 9 +++++++--
3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/src/libc/wchar/_validutf8.c b/src/libc/wchar/_validutf8.c @@ -23,7 +23,7 @@ _validutf8(wchar_t wc, int *nbytes) }; struct range *bp; - for (bp = ranges; bp->begin <= wc && bp->end > wc; ++bp) + for (bp = ranges; bp->begin > wc || bp->end <= wc; ++bp) ; *nbytes = bp->nbytes; diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c @@ -1,3 +1,5 @@ +#include <errno.h> +#include <stdlib.h> #include <wchar.h> #include "../libc.h" @@ -8,37 +10,43 @@ size_t mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n, mbstate_t *restrict ps) { - unsigned char *t = (unsigned char *) s; + const unsigned char *t = (const unsigned char *) s; unsigned long wc; unsigned c; int i, len, maxlen; - if (s == NULL) + if (t == NULL) return 0; + if ((wc = *t) == 0) + goto return_code; - wc = c = *t++; + c = *t++; for (len = 0; n > 0 && c & 0x80; --n, ++len) c <<= 1; - if (n == 0 || len == 1 || len == 8) - return -1; + if (n == 0 && c & 0x80) + return -2; + if (len == 1 || len > MB_CUR_MAX) + goto return_error; if (len == 0) goto return_code; wc = (c & 0xFF) >> len; for (i = 0; i < len-1; i++) { if (((c = *t++) & 0xC0) != 0x80) - return -1; + goto return_error; wc <<= 6; wc |= c & 0x3F; } if (!_validutf8(wc, &maxlen) || len != maxlen) - return -1; + goto return_error; return_code: if (pwc) *pwc = wc; - if (*s == '\0') - return 0; return t - (unsigned char *) s; + +return_error: + errno = EILSEQ; + return -1; } diff --git a/src/libc/wchar/wcrtomb.c b/src/libc/wchar/wcrtomb.c @@ -14,13 +14,18 @@ wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict ps) if (!s) return 1; + if (c < 0x80) { + *s = wc; + return 1; + } + if (!_validutf8(wc, &n)) { errno = EILSEQ; return -1; } - n--; - *s = 0; + + *s = 0x80; for (i = 0; i < n; i++) { *s >>= 1; *s |= 0x80;