commit b4b507633186e7f223231a60447d82a6e3ab92af parent f507bae3a0e45b6f9eeadecb3065afaea1a6d6bc Author: Roberto E. Vargas Caballero <k0ga@shike2.com> Date: Mon, 10 Mar 2025 21:22:52 +0100 libc/wchar: Rewrite mbtowc() There are many different corner cases in the implementation of the underlying call to mbrtowc(). When mbrtowc() returns -2, it should be ready to receive new bytes in subsequent calls, and this requires updating the conversion state. Also, the standard specifies: "The implementation shall behave as if no library function calls the mbtowc function.", and for that reason we cannot pass a NULL pointer as state to mbrtowc(), because that would imply using the hidden internal state of mbrtowc(). We considered keeping ABI compatibility with the definitions of mbstate_t in the different systems, but it created too many problems. There are no guarantees that code compiled with different libc implementations would work, and for that reason we dropped the ABI compatibility. Diffstat:
23 files changed, 92 insertions(+), 147 deletions(-)
diff --git a/include/bits/darwin/sys/cdefs.h b/include/bits/darwin/sys/cdefs.h @@ -1,9 +1 @@ -#ifdef _NEED_MBSTATE_T -#ifndef _MBSTATE_T -typedef struct { - unsigned char state[4]; - size_t count; -} mbstate_t; -#define _MBSTATE_T -#endif -#endif +/* nothing fpr darwin */ diff --git a/include/bits/dragonfly/sys/cdefs.h b/include/bits/dragonfly/sys/cdefs.h @@ -1,8 +1 @@ -#ifdef _NEED_MBSTATE_T -#ifndef _MBSTATE_T -typedef union { - char __mbstate8[128]; -} mbstate_t; -#define _MBSTATE_T -#endif -#endif +/* nothing for dragonfly */ diff --git a/include/bits/freebsd/sys/cdefs.h b/include/bits/freebsd/sys/cdefs.h @@ -1,8 +1 @@ -#ifdef _NEED_MBSTATE_T -#ifndef _MBSTATE_T -typedef union { - char __mbstate8[128]; -} mbstate_t; -#define _MBSTATE_T -#endif -#endif +/* nothing for Openbsd */ diff --git a/include/bits/linux/sys/cdefs.h b/include/bits/linux/sys/cdefs.h @@ -1,8 +1 @@ -#ifdef _NEED_MBSTATE_T -#ifndef _MBSTATE_T -typedef struct __mbstate_t { - unsigned __opaque1, __opaque2; -} mbstate_t; -#define _MBSTATE_T -#endif -#endif +/* nothing for Linux */ diff --git a/include/bits/netbsd/sys/cdefs.h b/include/bits/netbsd/sys/cdefs.h @@ -1,8 +1 @@ -#ifdef _NEED_MBSTATE_T -#ifndef _MBSTATE_T -typedef union { - char __mbstate8[128]; -} mbstate_t; -#define _MBSTATE_T -#endif -#endif +/* nothing for netbsd */ diff --git a/include/bits/openbsd/sys/cdefs.h b/include/bits/openbsd/sys/cdefs.h @@ -1,8 +1 @@ -#ifdef _NEED_MBSTATE_T -#ifndef _MBSTATE_T -typedef union { - char __mbstate8[128]; -} mbstate_t; -#define _MBSTATE_T -#endif -#endif +/* nothing for Openbsd */ diff --git a/include/wchar.h b/include/wchar.h @@ -8,10 +8,15 @@ #define _NEED_WCHARLIM #define _NEED_WINT #define _NEED_VA_LIST -#define _NEED_MBSTATE_T #include <arch/cdefs.h> #include <sys/cdefs.h> +typedef struct { + unsigned char oc; + unsigned char sh; + wchar_t wc; +} mbstate_t; + struct tm; struct _FILE; diff --git a/src/libc/arch/bsd/Makefile b/src/libc/arch/bsd/Makefile @@ -4,8 +4,6 @@ include 
$(PROJECTDIR)/scripts/rules.mk include ../../rules.mk OBJS=\ - _mbsget.$O\ - _mbsset.$O\ - _waitpid.$O\ + _waitpid.$O\ all: $(OBJS) diff --git a/src/libc/arch/bsd/_mbsget.c b/src/libc/arch/bsd/_mbsget.c @@ -1,9 +0,0 @@ -#include <wchar.h> - -#include "../../libc.h" - -int -_mbsget(mbstate_t *ps) -{ - return ps->__mbstate8[0]; -} diff --git a/src/libc/arch/bsd/_mbsset.c b/src/libc/arch/bsd/_mbsset.c @@ -1,9 +0,0 @@ -#include <wchar.h> - -#include "../../libc.h" - -int -_mbsset(mbstate_t *ps, int ch) -{ - return ps->__mbstate8[0] = ch; -} diff --git a/src/libc/arch/darwin/Makefile b/src/libc/arch/darwin/Makefile @@ -5,7 +5,5 @@ include ../../rules.mk OBJS=\ _getheap.$O\ - _mbsget.$O\ - _mbsset.$O\ all: $(OBJS) diff --git a/src/libc/arch/darwin/_mbsget.c b/src/libc/arch/darwin/_mbsget.c @@ -1,9 +0,0 @@ -#include <wchar.h> - -#include "../../libc.h" - -int -_mbsget(mbstate_t *ps) -{ - return ps->state[0]; -} diff --git a/src/libc/arch/darwin/_mbsset.c b/src/libc/arch/darwin/_mbsset.c @@ -1,10 +0,0 @@ -#include <wchar.h> - -#include "../../libc.h" - -int -_mbsset(mbstate_t *ps, int ch) -{ - ps-count = 1; - return ps->state[0] = ch; -} diff --git a/src/libc/arch/linux/Makefile b/src/libc/arch/linux/Makefile @@ -6,8 +6,6 @@ include ../../rules.mk OBJS=\ _brk.$O\ _getheap.$O\ - _mbsget.$O\ - _mbsset.$O\ _sigaction.$O\ _waitpid.$O\ diff --git a/src/libc/arch/linux/_mbsget.c b/src/libc/arch/linux/_mbsget.c @@ -1,9 +0,0 @@ -#include <wchar.h> - -#include "../../libc.h" - -int -_mbsget(mbstate_t *ps) -{ - return ps->__opaque1; -} diff --git a/src/libc/arch/linux/_mbsset.c b/src/libc/arch/linux/_mbsset.c @@ -1,9 +0,0 @@ -#include <wchar.h> - -#include "../../libc.h" - -int -_mbsset(mbstate_t *ps, int ch) -{ - return ps->__opaque1 = ch; -} diff --git a/src/libc/libc.h b/src/libc/libc.h @@ -61,8 +61,6 @@ extern void (*_atexithdl)(void); #ifdef _WCHAR_H extern int _validutf8(wchar_t, int *); -extern int _mbsset(mbstate_t *, int); -extern int _mbsget(mbstate_t *); #ifdef _STDIO_H 
extern wint_t _fputwc(wchar_t, FILE *, int *); #endif diff --git a/src/libc/objs/amd64-linux.mk b/src/libc/objs/amd64-linux.mk @@ -36,8 +36,6 @@ OBJS =\ arch/amd64/strcpy.$O\ arch/linux/_brk.$O\ arch/linux/_getheap.$O\ - arch/linux/_mbsget.$O\ - arch/linux/_mbsset.$O\ arch/linux/_sigaction.$O\ arch/linux/_waitpid.$O\ arch/posix/_open.$O\ diff --git a/src/libc/objs/amd64-netbsd.mk b/src/libc/objs/amd64-netbsd.mk @@ -28,8 +28,6 @@ OBJS =\ arch/amd64/strcmp.$O\ arch/amd64/strcpy.$O\ arch/bsd/_waitpid.$O\ - arch/bsd/_mbsget.$O\ - arch/bsd/_mbsset.$O\ arch/netbsd/_sigaction.$O\ arch/posix/_getheap.$O\ arch/posix/_open.$O\ diff --git a/src/libc/objs/amd64-openbsd.mk b/src/libc/objs/amd64-openbsd.mk @@ -33,8 +33,6 @@ OBJS =\ arch/amd64/strcmp.$O\ arch/amd64/strcpy.$O\ arch/bsd/_waitpid.$O\ - arch/bsd/_mbsget.$O\ - arch/bsd/_mbsset.$O\ arch/posix/_getheap.$O\ arch/posix/_open.$O\ arch/posix/_systime.$O\ diff --git a/src/libc/stdlib/mbtowc.c b/src/libc/stdlib/mbtowc.c @@ -1,4 +1,5 @@ #include <stdlib.h> +#include <string.h> #include <wchar.h> #undef mbtowc @@ -6,5 +7,12 @@ int mbtowc(wchar_t *restrict pwc, const char *restrict s, size_t n) { - return mbrtowc(pwc, s, n, NULL); + static mbstate_t st; + int ret; + + ret = mbrtowc(pwc, s, n, &st); + if (ret < 0) + ret = -1; + + return ret; } diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c @@ -1,5 +1,6 @@ #include <errno.h> #include <stdlib.h> +#include <string.h> #include <wchar.h> #include "../libc.h" @@ -10,43 +11,88 @@ size_t mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n, mbstate_t *restrict ps) { + static mbstate_t state; const unsigned char *t = (const unsigned char *) s; + wchar_t dummy; unsigned long wc; - unsigned c; - int i, len, maxlen; - - if (t == NULL) - return 0; - if ((wc = *t) == 0) - goto return_code; - - c = *t++; - for (len = 0; n > 0 && c & 0x80; --n, ++len) - c <<= 1; - if (n == 0 && c & 0x80) + unsigned c, oc; + int sh, max; + + if (!ps) + ps = &state; + + if (t == NULL) 
{ + if (ps->sh != 0) + goto return_error; + pwc = &dummy; + goto return_code_set; + } + if (n == 0) return -2; - if (len == 1 || len > MB_CUR_MAX) - goto return_error; - if (len == 0) - goto return_code; - - wc = (c & 0xFF) >> len; - for (i = 0; i < len-1; i++) { - if (((c = *t++) & 0xC0) != 0x80) + + oc = ps->oc; + wc = ps->wc; + sh = ps->sh; + + /* initial state? */ + if (sh == 0) { + /* NUL character? */ + if ((c = wc = *t) == 0) + goto return_code; + t++; + n--; + + /* fast track for ascii? */ + if (c < 0x80) + goto return_code; + + /* out of sequence multibyte? */ + if ((c & 0xc0) != 0xc0) goto return_error; + + /* in sequence multibyte! */ + oc = c << 1; + wc = 0; + sh = 1; + } + + for ( ; n > 0; --n) { + if (sh > MB_CUR_MAX) + goto return_error; + + c = *t++; + if ((c & 0xc0) != 0x80) + goto return_error; + wc <<= 6; - wc |= c & 0x3F; + wc |= c & 0x3f; + oc <<= 1; + sh++; + + if ((oc & 0x80) == 0) { + oc = (oc & 0xff) >> sh; + wc |= oc << (sh-1) * 6; + + if (!_validutf8(wc, &max) || sh != max) + goto return_error; + goto return_code_set; + } } - if (!_validutf8(wc, &maxlen) || len != maxlen) - goto return_error; + ps->sh = sh; + ps->oc = oc; + ps->wc = wc; + return -2; +return_code_set: + memset(ps, 0, sizeof(*ps)); return_code: if (pwc) *pwc = wc; return t - (unsigned char *) s; return_error: + memset(ps, 0, sizeof(*ps)); errno = EILSEQ; return -1; } diff --git a/src/libc/wchar/mbsinit.c b/src/libc/wchar/mbsinit.c @@ -1,7 +1,5 @@ #include <wchar.h> -#include "../libc.h" - #undef mbsinit int @@ -9,5 +7,5 @@ mbsinit(const mbstate_t *ps) { if (!ps) return 1; - return _mbsget(ps) == 0; + return ps->oc == 0; }