commit 328388439b2db162f7dedb75db497509131cc745
parent 693af8c7f3cef75f855092950a8601e25714a23d
Author: Roberto E. Vargas Caballero <k0ga@shike2.net>
Date: Fri, 22 May 2026 11:26:14 +0200
cc1: Add cppspace()
The rules about how the preprocessor must handle spaces are somewhat
underspecified in the standard. Paragraph 6.4.3 says:
Preprocessing tokens can be separated by white space; this
consists of comments (described later), or white-space characters
(space, horizontal tab, new-line, vertical tab, and form-feed),
or both
but 6.10.5 says:
The only white-space characters that shall appear between
preprocessing tokens within a preprocessing directive (from just
after the introducing # preprocessing token through just before
the terminating new-line character) are space and horizontal-tab
(including spaces that have replaced comments or possibly other
white-space characters in translation phase 3).
so this paragraph seems to limit the white space in preprocessing
directives to only ' ' and '\t', but 5.1.1.2.1 says about
translation phase 3:
Each comment is replaced by one space character. New-line
characters are retained. Whether each nonempty sequence of
white-space characters other than new-line is retained or replaced
by one space character is implementation-defined.
so, as 6.10.5 accepts the changes possibly made in translation phase 3,
we can decide to accept all white-space characters in preprocessor directives.
To make all this simpler we treat all white-space characters in the same
way, using the freedom provided by 5.1.1.2.1, and we add a new cppspace()
function to avoid problems with ctype isspace(), because there are cases where
valid programs can contain characters over 127 that end up being processed by
these calls, for example:
#define a(x) #x
a(word)
`word` can contain any character, including utf8 characters over 127, and still
be a valid C program, so using isspace() there can be problematic (a plain char
over 127 may be negative, and passing a negative value other than EOF to the
ctype functions is undefined behaviour). While we don't care so much about
having problems with invalid inputs (as in these cases the result is
undefined), it is different in the case of valid inputs.
Diffstat:
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/src/cmd/scc-cc/cc1/cpp.c b/src/cmd/scc-cc/cc1/cpp.c
@@ -142,6 +142,12 @@ unterminated:
mp->sym->name);
}
+static int
+cppspace(int c)
+{
+ return c == ' ' || c == '\t' || c == '\v' || c == '\n' || c == '\f';
+}
+
static char *
parameter(Macro *mp, int n)
{
@@ -170,9 +176,9 @@ parameter(Macro *mp, int n)
begin = mp->arg;
end = begin + mp->argsiz;
- while (begin < end && isspace(*begin))
+ while (begin < end && cppspace(*begin))
begin++;
- while (end > begin && isspace(end[-1]))
+ while (end > begin && cppspace(end[-1]))
end--;
siz = end - begin;
@@ -249,12 +255,12 @@ concatoper(char *def, char *cur)
{
char *s;
- for (s = cur + 4; isspace(*s); ++s)
+ for (s = cur + 4; cppspace(*s); ++s)
;
if (*s == CONCAT)
return 1;
- for (s = cur; s > def && isspace(s[-1]); --s)
+ for (s = cur; s > def && cppspace(s[-1]); --s)
;
if (s > def && s[-1] == CONCAT)
return 1;
@@ -314,8 +320,8 @@ stringoper(char **bpp, int *sizep, char *arg)
}
break;
default:
- if (!delim && isspace(c)) {
- while (isspace(*arg))
+ if (!delim && cppspace(c)) {
+ while (cppspace(*arg))
++arg;
c = ' ';
}
@@ -414,9 +420,9 @@ copymacro(Macro *mp)
case CONCAT:
/* token concatenation operator */
DBG("MACRO concat");
- while (isspace(bp[-1]))
+ while (cppspace(bp[-1]))
--bp, ++bufsiz;
- while (isspace(s[1]))
+ while (cppspace(s[1]))
++s;
break;
case STRINGIZE:
@@ -624,7 +630,7 @@ getdefs(Symbol *args[NR_MACROARG], int nargs, char *buffer, size_t bufsiz)
char c, *bp, *s, *p;
int len, id, token, prevc, ispar;
- while (isspace(*input->p))
+ while (cppspace(*input->p))
++input->p;
bp = buffer;
@@ -640,7 +646,7 @@ getdefs(Symbol *args[NR_MACROARG], int nargs, char *buffer, size_t bufsiz)
++input->p;
} else {
c = token = STRINGIZE;
- while (isspace(input->p[1]))
+ while (cppspace(input->p[1]))
++input->p;
}
} else if (c == '"' || c == '\'' || c == '_' || isalpha(c)) {
@@ -691,7 +697,7 @@ end_loop:
yytoken = EOFTOK;
if (prevc == CONCAT)
goto wrong_concat;
- for ( ; bp > buffer && isspace(bp[-1]); --bp);
+ for ( ; bp > buffer && cppspace(bp[-1]); --bp);
;
*bp = '\0';
return 1;
@@ -1134,7 +1140,7 @@ cpp(void)
int ns;
char *p;
- for (p = input->p; isspace(*p); ++p)
+ for (p = input->p; cppspace(*p); ++p)
;
if (*p != '#') {