commit 67eb502477c70b9d84d0615c1a4ea169fb1c6b84
parent 43e7e49b2686b39cac4695343ae4ee8452645f93
Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date: Sat, 5 Nov 2022 10:47:23 +0100
cc1: Add support for long character constants
Diffstat:
6 files changed, 123 insertions(+), 16 deletions(-)
diff --git a/src/cmd/cc/cc1/cc1.h b/src/cmd/cc/cc1/cc1.h
@@ -1,3 +1,5 @@
+#include <stdint.h>
+
#define INPUTSIZ LINESIZ
#define GLOBALCTX 0
@@ -290,6 +292,7 @@ typedef struct node Node;
typedef struct macro Macro;
typedef struct input Input;
typedef struct arch Arch;
+/* Rune: 32-bit unsigned value holding one decoded character of a character constant */
+typedef uint32_t Rune;
struct limits {
union {
diff --git a/src/cmd/cc/cc1/lex.c b/src/cmd/cc/cc1/lex.c
@@ -196,10 +196,6 @@ repeat:
break;
case EOF:
break;
- default:
- if (!isprint(c) && !ispunct(c) && !isspace(c))
- warn("invalid input character. The shame of UB is yours");
- break;
}
return c;
@@ -545,25 +541,83 @@ escape(void)
return c;
}
+/*
+ * Decode the UTF-8 sequence that starts at input->p and return its
+ * code point.
+ *
+ * On return input->p is left on the last byte that was consumed, so
+ * the caller is expected to step over it. A plain ASCII byte is
+ * returned unchanged without moving input->p.
+ *
+ * Ill-formed input is reported once with errorp() and 0xFFFD
+ * (REPLACEMENT CHARACTER) is returned. The following cases are
+ * treated as ill-formed:
+ *	- a continuation byte (10xxxxxx) or 0xFF used as lead byte
+ *	- a lead byte not followed by enough continuation bytes
+ *	- sequences longer than 4 bytes (they cannot be represented in
+ *	  a well-formed UTF-8 stream and the 7 byte form would
+ *	  overflow the 32 bits of Rune)
+ *	- overlong encodings, UTF-16 surrogates and values above
+ *	  0x10FFFF
+ */
+static Rune
+utf8rune(void)
+{
+	/* smallest code point that requires a sequence of each length */
+	static const Rune minrune[] = {0, 0, 0x80, 0x800, 0x10000};
+	Rune wc;
+	unsigned c;
+	size_t i, len;
+
+	/* keep only the byte value, whatever the signedness of char is */
+	c = *input->p & 0xFF;
+
+	/* the number of leading 1 bits is the length of the sequence */
+	for (len = 0; c & 0x80; len++)
+		c <<= 1;
+	if (len == 0)
+		return c;
+	if (len == 1 || len == 8)
+		goto invalid;
+
+	/* payload bits of the lead byte, moved back to their position */
+	wc = (c & 0xFF) >> len;
+	for (i = 0; i < len-1; i++) {
+		c = input->p[1];
+		if ((c & 0xC0) != 0x80)
+			goto invalid;
+		input->p++;
+		wc <<= 6;
+		wc |= c & 0x3F;
+	}
+
+	/*
+	 * Too long sequences are rejected only after their continuation
+	 * bytes were consumed, so a single diagnostic is emitted for the
+	 * whole sequence instead of one per byte.
+	 */
+	if (len > 4 || wc < minrune[len] || wc > 0x10FFFF)
+		goto invalid;
+	if (wc >= 0xD800 && wc <= 0xDFFF)
+		goto invalid;
+	return wc;
+
+invalid:
+	errorp("invalid multibyte sequence");
+	return 0xFFFD;
+}
+
+/*
+ * Read one element of a character constant located at input->p.
+ * An element starting with a backslash is handled by escape();
+ * otherwise it is decoded with utf8rune() when multi is not zero,
+ * or taken as the single byte at input->p when multi is zero.
+ */
+static Rune
+decode(int multi)
+{
+	if (*input->p != '\\') {
+		if (multi)
+			return utf8rune();
+		return *input->p;
+	}
+
+	return escape();
+}
+
static int
character(void)
{
- int c;
+ int i, multi = 0;
+ Rune r, d;
+ Type *tp = inttype;
Symbol *sym;
- if ((c = *++input->p) == '\\')
- c = escape();
- else
- c = *input->p;
- ++input->p;
- if (*input->p != '\'')
- errorp("invalid character constant");
- else
- ++input->p;
+ if (*input->p == 'L') {
+ multi = 1;
+ tp = wchartype;
+ input->p++;
+ }
+
+ d = 0;
+ input->p++;
+ for (i = 0; *input->p != '\''; i++) {
+ r = decode(multi);
+ if (r > getlimits(tp)->max.i)
+ warn("character too large for enclosing character literal type");
+ d |= r;
+ input->p++;
+ }
+ input->p++;
+
+ if (i == 0)
+ errorp("empty character constant");
+ if (i > 1)
+ warn("multi-character character constant");
sym = newsym(NS_IDEN, NULL);
- sym->u.i = c;
- sym->type = inttype;
+ sym->u.i = d;
+ sym->type = tp;
yylval.sym = sym;
tok2str();
return CONSTANT;
@@ -635,6 +689,9 @@ iden(void)
Symbol *sym;
char *p, *begin;
+ if (input->p[0] == 'L' && input->p[1] == '\'')
+ return character();
+
begin = input->p;
for (p = begin; isalnum(*p) || *p == '_'; ++p)
;
diff --git a/tests/cc/error/0033-character.c b/tests/cc/error/0033-character.c
@@ -0,0 +1,32 @@
+/*
+PATTERN:
+0033-character.c:23: error: empty character constant
+0033-character.c:24: warning: multi-character character constant
+0033-character.c:25: error: empty character constant
+0033-character.c:26: warning: multi-character character constant
+0033-character.c:27: warning: multi-character character constant
+0033-character.c:28: error: invalid multibyte sequence
+0033-character.c:28: warning: multi-character character constant
+0033-character.c:29: error: invalid multibyte sequence
+0033-character.c:29: error: invalid multibyte sequence
+0033-character.c:29: warning: multi-character character constant
+.
+*/
+#include <wchar.h>
+
+int
+main() /* comments are trailing only: PATTERN refers to fixed line numbers */
+{
+ int i;
+ wchar_t w;
+
+ i = ''; /* line 23: empty constant, error expected */
+ i = 'ab'; /* line 24: two characters, warning expected */
+ w = L''; /* line 25: empty wide constant, error expected */
+ w = L'ab'; /* line 26: two characters, warning expected */
+ w = L'áá'; /* line 27: two multibyte characters, warning expected */
+ w = L'€ '; /* line 28: one invalid multibyte sequence error plus the multi-character warning */
+ w = L'€À'; /* line 29: two invalid multibyte sequence errors plus the multi-character warning */
+
+ return 0;
+}
diff --git a/tests/cc/error/scc-tests.lst b/tests/cc/error/scc-tests.lst
@@ -30,3 +30,4 @@
0030-krtypes.c
0031-krtypes.c
0032-krtypes.c
+0033-character.c
diff --git a/tests/cc/execute/0217-lchar.c b/tests/cc/execute/0217-lchar.c
@@ -0,0 +1,13 @@
+#include <wchar.h>
+
+/* Check the value of a wide character constant holding a non ASCII character. */
+int
+main()
+{
+ wchar_t c;
+
+ c = L'á';
+
+ /* 225 is 0xE1, the code point of U+00E1; exit status 1 signals a wrong value */
+ if (c != 225)
+ return 1;
+ return 0;
+}
diff --git a/tests/cc/execute/scc-tests.lst b/tests/cc/execute/scc-tests.lst
@@ -207,3 +207,4 @@
0214-va_copy.c
0215-ret_struct.c
0216-initialize.c [TODO]
+0217-lchar.c