commit 3cb21279e81fde0e4dc9b10061b9aa2ecd7225b5
parent 2fa9d9d8e2580b0d8081184db9da3339a291b912
Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date: Sun, 6 Nov 2022 20:07:46 +0100
libc: Discard invalid utf8 ranges
RFC 2279 defines UTF-8 as having a maximum of 6
bytes but RFC 3629 marks as invalid several ranges
to match UTF-16 restrictions and it limits the encoded
code points to U+10FFFF.
Diffstat:
7 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/include/limits.h b/include/limits.h
@@ -3,6 +3,6 @@
#include <arch/limits.h>
-#define MB_LEN_MAX 1
+#define MB_LEN_MAX 4
#endif
diff --git a/include/stdlib.h b/include/stdlib.h
@@ -11,7 +11,7 @@
#define _ATEXIT_MAX 32
-#define MB_CUR_MAX 6
+#define MB_CUR_MAX 4
#define RAND_MAX 32767
typedef struct {
diff --git a/src/libc/libc.h b/src/libc/libc.h
@@ -58,3 +58,7 @@ extern void (*_exitf[])(void);
extern unsigned _exitn;
extern void (*_flushall)(void);
extern void (*_atexithdl)(void);
+
+#ifdef _WCHAR_H
+extern int _validutf8(wchar_t);
+#endif
diff --git a/src/libc/objs/common-objs.mk b/src/libc/objs/common-objs.mk
@@ -121,3 +121,4 @@ COMMON_OBJS =\
wchar/mbrtowc.$O\
wchar/wcrtomb.$O\
wchar/wcwidth.$O\
+ wchar/_validutf8.$O\
diff --git a/src/libc/wchar/Makefile b/src/libc/wchar/Makefile
@@ -8,6 +8,7 @@ OBJS =\
mbrtowc.$O\
wcrtomb.$O\
wcwidth.$O\
+ _validutf8.$O\
all: $(OBJS)
diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c
@@ -1,5 +1,7 @@
#include <wchar.h>
+#include "../libc.h"
+
#undef mbrtowc
size_t
@@ -30,6 +32,9 @@ mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n,
wc |= c & 0x3F;
}
+ if (!_validutf8(wc))
+ return -1;
+
return_code:
if (pwc)
*pwc = wc;
diff --git a/src/libc/wchar/wcrtomb.c b/src/libc/wchar/wcrtomb.c
@@ -1,5 +1,8 @@
+#include <errno.h>
#include <wchar.h>
+#include "../libc.h"
+
#undef wcrtomb
size_t
@@ -14,6 +17,11 @@ wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict ps)
if (!s)
return 1;
+ if (!_validutf8(wc)) {
+ errno = EILSEQ;
+ return -1;
+ }
+
for (n = 0; n < 5 && c >= limits[n]; ++n)
;