commit 67eb502477c70b9d84d0615c1a4ea169fb1c6b84
parent 43e7e49b2686b39cac4695343ae4ee8452645f93
Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date:   Sat,  5 Nov 2022 10:47:23 +0100
cc1: Add support for long character constants
Diffstat:
6 files changed, 123 insertions(+), 16 deletions(-)
diff --git a/src/cmd/cc/cc1/cc1.h b/src/cmd/cc/cc1/cc1.h
@@ -1,3 +1,5 @@
+#include <stdint.h>
+
 #define INPUTSIZ LINESIZ
 
 #define GLOBALCTX 0
@@ -290,6 +292,7 @@ typedef struct node Node;
 typedef struct macro Macro;
 typedef struct input Input;
 typedef struct arch Arch;
+typedef uint32_t Rune;
 
 struct limits {
 	union {
diff --git a/src/cmd/cc/cc1/lex.c b/src/cmd/cc/cc1/lex.c
@@ -196,10 +196,6 @@ repeat:
 		break;
 	case EOF:
 		break;
-	default:
-		if (!isprint(c) && !ispunct(c) && !isspace(c))
-			warn("invalid input character. The shame of UB is yours");
-		break;
 	}
 
 	return c;
@@ -545,25 +541,83 @@ escape(void)
 	return c;
 }
 
+/* Decode the UTF-8 sequence at input->p; input->p is left on the last byte consumed. */
+static Rune
+utf8rune(void)
+{
+	Rune wc;
+	unsigned c;
+	size_t i, len;
+
+	c = *input->p;
+	for (len = 0; c & 0x80; len++)	/* len = number of leading 1 bits of the first byte */
+		c <<= 1;
+	if (len == 0)	/* plain 7-bit character */
+		return c;
+	if (len == 1 || len > 4)	/* stray continuation byte, or a lead byte RFC 3629 forbids */
+		goto invalid;
+
+	wc = (c & 0xFF) >> len;	/* payload bits of the lead byte */
+	for (i = 0; i < len-1; i++) {
+		c = input->p[1];	/* peek: a bad byte is not consumed */
+		if ((c & 0xC0) != 0x80)
+			goto invalid;
+		input->p++;
+		wc <<= 6;
+		wc |= c & 0x3F;
+	}
+	return wc;
+invalid:
+	errorp("invalid multibyte sequence");
+	return 0xFFFD;	/* U+FFFD REPLACEMENT CHARACTER */
+}
+
+/* Value of the character at input->p: escape sequence, UTF-8 rune (multi) or single byte. */
+static Rune
+decode(int multi)
+{
+	if (*input->p != '\\') {
+		if (multi)
+			return utf8rune();
+		return *input->p;
+	}
+
+	return escape();
+}
+
 static int
 character(void)
 {
-	int c;
+	int i, multi = 0;
+	Rune r, d;
+	Type *tp = inttype;
 	Symbol *sym;
 
-	if ((c = *++input->p) == '\\')
-		c = escape();
-	else
-		c = *input->p;
-	++input->p;
-	if (*input->p != '\'')
-		errorp("invalid character constant");
-	else
-		++input->p;
+	if (*input->p == 'L') {
+		multi = 1;
+		tp = wchartype;
+		input->p++;
+	}
+
+	d = 0;
+	input->p++;
+	for (i = 0; *input->p != '\''; i++) {
+		r = decode(multi);
+		if (r > getlimits(tp)->max.i)
+			warn("character too large for enclosing character literal type");
+		d |= r;
+		input->p++;
+	}
+	input->p++;
+
+	if (i == 0)
+		errorp("empty character constant");
+	if (i > 1)
+		warn("multi-character character constant");
 
 	sym = newsym(NS_IDEN, NULL);
-	sym->u.i = c;
-	sym->type = inttype;
+	sym->u.i = d;
+	sym->type = tp;
 	yylval.sym = sym;
 	tok2str();
 	return CONSTANT;
@@ -635,6 +689,9 @@ iden(void)
 	Symbol *sym;
 	char *p, *begin;
 
+	if (input->p[0] == 'L' && input->p[1] == '\'')
+		return character();
+
 	begin = input->p;
 	for (p = begin; isalnum(*p) || *p == '_'; ++p)
 		;
diff --git a/tests/cc/error/0033-character.c b/tests/cc/error/0033-character.c
@@ -0,0 +1,32 @@
+/*
+PATTERN:
+0033-character.c:23: error: empty character constant
+0033-character.c:24: warning: multi-character character constant
+0033-character.c:25: error: empty character constant
+0033-character.c:26: warning: multi-character character constant
+0033-character.c:27: warning: multi-character character constant
+0033-character.c:28: error: invalid multibyte sequence
+0033-character.c:28: warning: multi-character character constant
+0033-character.c:29: error: invalid multibyte sequence
+0033-character.c:29: error: invalid multibyte sequence
+0033-character.c:29: warning: multi-character character constant
+.
+*/
+#include <wchar.h>
+
+int
+main()
+{
+	int i;
+	wchar_t w;
+	/* One constant per line: PATTERN matches diagnostics by line number, so lines 23-29 must not move. */
+	i = '';
+	i = 'ab';
+	w = L'';
+	w = L'ab';
+	w = L'áá';
+	w = L'€ ';
+	w = L'€À';
+
+	return 0;
+}
diff --git a/tests/cc/error/scc-tests.lst b/tests/cc/error/scc-tests.lst
@@ -30,3 +30,4 @@
 0030-krtypes.c
 0031-krtypes.c
 0032-krtypes.c
+0033-character.c
diff --git a/tests/cc/execute/0217-lchar.c b/tests/cc/execute/0217-lchar.c
@@ -0,0 +1,13 @@
+#include <wchar.h>
+
+int
+main()
+{
+	wchar_t c;
+	/* U+00E1 is 225 decimal: the wide constant is expected to evaluate to that code point. */
+	c = L'á';
+	/* A non-zero exit status reports that the constant had a different value. */
+	if (c != 225)
+		return 1;
+	return 0;
+}
diff --git a/tests/cc/execute/scc-tests.lst b/tests/cc/execute/scc-tests.lst
@@ -207,3 +207,4 @@
 0214-va_copy.c
 0215-ret_struct.c
 0216-initialize.c [TODO]
+0217-lchar.c