scc

simple c99 compiler
git clone git://git.simple-cc.org/scc
Log | Files | Refs | Submodules | README | LICENSE

commit 2ec31560621723e31191aa91d93f9ffa9e199d28
parent 4e9c575865375b57c0764a75775a84fb6c075550
Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date:   Mon, 14 Nov 2022 18:48:09 +0100

libc/wchar: Check length of utf8 characters

UTF-8 characters must always be encoded using the shortest
possible representation; otherwise they are considered invalid.

Diffstat:
M src/libc/libc.h | 2 +-
M src/libc/wchar/_validutf8.c | 32 ++++++++++++++++++++++++--------
M src/libc/wchar/mbrtowc.c | 4 ++--
M src/libc/wchar/wcrtomb.c | 9 ++-------
4 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/src/libc/libc.h b/src/libc/libc.h @@ -60,7 +60,7 @@ extern void (*_flushall)(void); extern void (*_atexithdl)(void); #ifdef _WCHAR_H -extern int _validutf8(wchar_t); +extern int _validutf8(wchar_t, int *); #ifdef _STDIO_H extern wint_t _fputwc(wchar_t, FILE *, int *); #endif diff --git a/src/libc/wchar/_validutf8.c b/src/libc/wchar/_validutf8.c @@ -2,14 +2,30 @@ #include "../libc.h" +struct range { + unsigned long begin, end; + int valid; + int nbytes; +}; + int -_validutf8(wchar_t wc) +_validutf8(wchar_t wc, int *nbytes) { - if (wc >= 0xD800 && wc < 0xDBFF) - return 0; - if (wc >= 0xDC00 && wc < 0xDCFF) - return 0; - if (wc > 0x10FFFF) - return 0; - return 1; + + static struct range ranges[] = { + {0, 0x80, 1, 1}, + {0x80, 0x800, 1, 2}, + {0x800, 0xD800, 1, 3}, + {0xD800, 0xDD00, 0, 3}, + {0xDD00, 0x10000, 1, 3}, + {0x10000, 0x110000, 1, 4}, + {0x11000, -1ul, 0, 0}, + }; + struct range *bp; + + for (bp = ranges; bp->begin <= wc && bp->end > wc; ++bp) + ; + *nbytes = bp->nbytes; + + return bp->valid; } diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c @@ -11,7 +11,7 @@ mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n, unsigned char *t = (unsigned char *) s; unsigned long wc; unsigned c; - size_t i, len; + int i, len, maxlen; if (s == NULL) return 0; @@ -32,7 +32,7 @@ mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n, wc |= c & 0x3F; } - if (!_validutf8(wc)) + if (!_validutf8(wc, &maxlen) || len != maxlen) return -1; return_code: diff --git a/src/libc/wchar/wcrtomb.c b/src/libc/wchar/wcrtomb.c @@ -10,21 +10,16 @@ wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict ps) { int i, n; unsigned long c = wc; - static unsigned long limits[] = { - 0x80, 0x800, 0x10000, 0x200000, 0x4000000 - }; if (!s) return 1; - if (_validutf8(wc)) { + if (!_validutf8(wc, &n)) { errno = EILSEQ; return -1; } - for (n = 0; n < 5 && c >= limits[n]; ++n) - ; - + n--; *s = 0; for (i = 0; i < n; i++) { *s >>= 1;