commit 2ec31560621723e31191aa91d93f9ffa9e199d28
parent 4e9c575865375b57c0764a75775a84fb6c075550
Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date: Mon, 14 Nov 2022 18:48:09 +0100
libc/wchar: Check length of utf8 characters
UTF-8 characters must always be represented with the shortest
possible encoding; otherwise they are considered invalid.
Diffstat:
4 files changed, 29 insertions(+), 18 deletions(-)
diff --git a/src/libc/libc.h b/src/libc/libc.h
@@ -60,7 +60,7 @@ extern void (*_flushall)(void);
extern void (*_atexithdl)(void);
#ifdef _WCHAR_H
-extern int _validutf8(wchar_t);
+extern int _validutf8(wchar_t, int *);
#ifdef _STDIO_H
extern wint_t _fputwc(wchar_t, FILE *, int *);
#endif
diff --git a/src/libc/wchar/_validutf8.c b/src/libc/wchar/_validutf8.c
@@ -2,14 +2,30 @@
#include "../libc.h"
+/*
+ * One half-open interval [begin, end) of code points, whether its
+ * members are valid, and the number of bytes of their shortest
+ * UTF-8 encoding.
+ */
+struct range {
+	unsigned long begin, end;
+	int valid;
+	int nbytes;
+};
+
+/*
+ * Classify the code point wc.
+ *
+ * Stores in *nbytes the length in bytes of the shortest UTF-8
+ * encoding of wc (0 when wc lies beyond 0x10FFFF) and returns
+ * non-zero when wc is valid, or 0 when it is a UTF-16 surrogate
+ * (0xD800-0xDFFF) or lies beyond 0x10FFFF.
+ */
int
-_validutf8(wchar_t wc)
+_validutf8(wchar_t wc, int *nbytes)
{
- if (wc >= 0xD800 && wc < 0xDBFF)
- return 0;
- if (wc >= 0xDC00 && wc < 0xDCFF)
- return 0;
- if (wc > 0x10FFFF)
- return 0;
- return 1;
+	static const struct range ranges[] = {
+		{0,        0x80,     1, 1},
+		{0x80,     0x800,    1, 2},
+		{0x800,    0xD800,   1, 3},
+		{0xD800,   0xE000,   0, 3},	/* UTF-16 surrogates */
+		{0xE000,   0x10000,  1, 3},
+		{0x10000,  0x110000, 1, 4},
+		{0x110000, -1ul,     0, 0},	/* catch-all, must stay last */
+	};
+	const struct range *last = &ranges[sizeof(ranges) / sizeof(ranges[0]) - 1];
+	const struct range *bp;
+	unsigned long c = wc;
+
+	/*
+	 * Stop at the first range that contains c.  The loop never
+	 * advances past the last entry, so that entry also catches
+	 * values no half-open range can hold (e.g. a negative wc that
+	 * converts to ULONG_MAX) instead of running off the table.
+	 */
+	for (bp = ranges; bp < last; ++bp) {
+		if (c >= bp->begin && c < bp->end)
+			break;
+	}
+	*nbytes = bp->nbytes;
+
+	return bp->valid;
}
diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c
@@ -11,7 +11,7 @@ mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n,
unsigned char *t = (unsigned char *) s;
unsigned long wc;
unsigned c;
- size_t i, len;
+ int i, len, maxlen;
if (s == NULL)
return 0;
@@ -32,7 +32,7 @@ mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n,
wc |= c & 0x3F;
}
- if (!_validutf8(wc))
+ if (!_validutf8(wc, &maxlen) || len != maxlen)
return -1;
return_code:
diff --git a/src/libc/wchar/wcrtomb.c b/src/libc/wchar/wcrtomb.c
@@ -10,21 +10,16 @@ wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict ps)
{
int i, n;
unsigned long c = wc;
- static unsigned long limits[] = {
- 0x80, 0x800, 0x10000, 0x200000, 0x4000000
- };
if (!s)
return 1;
- if (_validutf8(wc)) {
+ if (!_validutf8(wc, &n)) {
errno = EILSEQ;
return -1;
}
- for (n = 0; n < 5 && c >= limits[n]; ++n)
- ;
-
+ n--;
*s = 0;
for (i = 0; i < n; i++) {
*s >>= 1;