commit dee0c6f0c90f7d64fd45cb6e3c48321f4beaf81d
parent 2eaef0900f5ebd9f00bebd8ce899423a0b37b4bb
Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date: Wed, 26 Feb 2025 10:31:51 +0100
libc/wchar: Fix Unicode handling
* mbrtowc: validate input, handle 4-byte UTF-8 code points, set errno
* wcrtomb: if a UTF-8 sequence has N bytes, the leading byte has the first
N bits set (with ASCII characters a special case), not the first N-1 bits
* _validutf8: negate the loop condition so the search stops at the range containing wc
Diffstat:
3 files changed, 25 insertions(+), 12 deletions(-)
diff --git a/src/libc/wchar/_validutf8.c b/src/libc/wchar/_validutf8.c
@@ -23,7 +23,7 @@ _validutf8(wchar_t wc, int *nbytes)
};
struct range *bp;
- for (bp = ranges; bp->begin <= wc && bp->end > wc; ++bp)
+ for (bp = ranges; bp->begin > wc || bp->end <= wc; ++bp)
;
*nbytes = bp->nbytes;
diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c
@@ -1,3 +1,5 @@
+#include <errno.h>
+#include <stdlib.h>
#include <wchar.h>
#include "../libc.h"
@@ -8,37 +10,43 @@ size_t
mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n,
mbstate_t *restrict ps)
{
- unsigned char *t = (unsigned char *) s;
+ const unsigned char *t = (const unsigned char *) s;
unsigned long wc;
unsigned c;
int i, len, maxlen;
- if (s == NULL)
+ if (t == NULL)
return 0;
+ if ((wc = *t) == 0)
+ goto return_code;
- wc = c = *t++;
+ c = *t++;
for (len = 0; n > 0 && c & 0x80; --n, ++len)
c <<= 1;
- if (n == 0 || len == 1 || len == 8)
- return -1;
+ if (n == 0 && c & 0x80)
+ return -2;
+ if (len == 1 || len > MB_CUR_MAX)
+ goto return_error;
if (len == 0)
goto return_code;
wc = (c & 0xFF) >> len;
for (i = 0; i < len-1; i++) {
if (((c = *t++) & 0xC0) != 0x80)
- return -1;
+ goto return_error;
wc <<= 6;
wc |= c & 0x3F;
}
if (!_validutf8(wc, &maxlen) || len != maxlen)
- return -1;
+ goto return_error;
return_code:
if (pwc)
*pwc = wc;
- if (*s == '\0')
- return 0;
return t - (unsigned char *) s;
+
+return_error:
+ errno = EILSEQ;
+ return -1;
}
diff --git a/src/libc/wchar/wcrtomb.c b/src/libc/wchar/wcrtomb.c
@@ -14,13 +14,18 @@ wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict ps)
if (!s)
return 1;
+ if (c < 0x80) {
+ *s = wc;
+ return 1;
+ }
+
if (!_validutf8(wc, &n)) {
errno = EILSEQ;
return -1;
}
-
n--;
- *s = 0;
+
+ *s = 0x80;
for (i = 0; i < n; i++) {
*s >>= 1;
*s |= 0x80;