scc

simple c99 compiler
git clone git://git.simple-cc.org/scc
Log | Files | Refs | Submodules | README | LICENSE

commit 21863f4266b66c813746b9380b19d27c7c5d9c68
parent f29cc7e4e4ff9756684411fadfca18f3ff6556d2
Author: Roberto E. Vargas Caballero <k0ga@shike2.net>
Date:   Wed,  8 Apr 2026 14:50:25 +0200

cc1: Preserve string token input

The parser was using yytext to store the internal representation of the
token, creating many problems, mainly with the preprocessor. The correct
solution is to use different buffers for the text of the token and for
the internal representation of the token.

Diffstat:
Msrc/cmd/scc-cc/cc1/cc1.h | 2+-
Msrc/cmd/scc-cc/cc1/cpp.c | 57+--------------------------------------------------------
Msrc/cmd/scc-cc/cc1/lex.c | 36+++++++++++-------------------------
3 files changed, 13 insertions(+), 82 deletions(-)

diff --git a/src/cmd/scc-cc/cc1/cc1.h b/src/cmd/scc-cc/cc1/cc1.h @@ -546,7 +546,7 @@ extern struct yystype yylval; extern char yytext[]; extern int yytoken; extern unsigned short yylen; -extern int disexpand, disescape, disstring; +extern int disexpand, disstring; extern unsigned cppctx; extern Input *input; extern int lexmode, namespace; diff --git a/src/cmd/scc-cc/cc1/cpp.c b/src/cmd/scc-cc/cc1/cpp.c @@ -24,7 +24,6 @@ static struct items dirinclude; unsigned cppctx; int disexpand; -int disescape; void defdefine(char *name, char *val, char *source) @@ -672,7 +671,6 @@ define(void) if (cppoff) return; - disescape = 1; namespace = NS_CPP; next(); @@ -1145,7 +1143,6 @@ cpp(void) cpperror("trailing characters after preprocessor directive"); ret: - disescape = 0; disexpand = 0; lexmode = CCMODE; namespace = ns; @@ -1191,59 +1188,7 @@ outcpp(void) for (next(); yytoken != EOFTOK; next()) { if (onlyheader) continue; - if (yytoken != STRING) { - printf("%s ", yytext); - continue; - } - for (s = yytext; (c = *s) != '\0'; ++s) { - switch (c) { - case '\n': - t = "\\n"; - goto print_str; - case '\v': - t = "\\v"; - goto print_str; - case '\b': - t = "\\b"; - goto print_str; - case '\t': - t = "\\t"; - goto print_str; - case '\a': - t = "\\a"; - goto print_str; - case '\f': - t = "\\f"; - goto print_str; - case '\r': - t = "\\r"; - goto print_str; - case '"': - if (s == yytext || s[1] == '\0') - goto print_chr; - t = "\\\""; - goto print_str; - case '\'': - t = "\\'"; - goto print_str; - case '\?': - t = "\\\?"; - goto print_str; - case '\\': - putchar('\\'); - default: - print_chr: - if (!isprint(c)) - printf("\\x%x", c); - else - putchar(c); - break; - print_str: - fputs(t, stdout); - break; - } - } - putchar(' '); + printf("%s ", yytext); } putchar('\n'); } diff --git a/src/cmd/scc-cc/cc1/lex.c b/src/cmd/scc-cc/cc1/lex.c @@ -342,7 +342,7 @@ repeat: static void tok2str(void) { - if ((yylen = input->p - input->begin) > INTIDENTSIZ) + if ((yylen = input->p - input->begin) >= 
sizeof(yytext)) error("token too big"); memcpy(yytext, input->begin, yylen); yytext[yylen] = '\0'; @@ -711,26 +711,16 @@ character(void) return CONSTANT; } -/* - * string() parses a constant string, and convert all the - * escape sequences into single characters. This behaviour - * is correct except when we parse a #define, where we want - * to preserve the literal content of the string. In that - * case cpp.c:/^define( sets the variable disescape to - * disable converting escape sequences into characters. - */ static int string(void) { - char *bp = yytext; - int c, esc; + int c, multi = 0; + char buff[STRINGSIZ+1], *bp = buff, *beg = input->p, *end; - *bp++ = '"'; - esc = 0; for (++input->p; ; ++input->p) { c = *input->p; - if (c == '"' && !esc) + if (c == '"') break; if (c == '\0') { @@ -738,12 +728,10 @@ string(void) break; } - esc = (c == '\\' && !esc && disescape); + if (c == '\\') + c = escape(multi); - if (c == '\\' && !esc) - c = escape(0); - - if (bp == &yytext[STRINGSIZ+1]) { + if (input->p - beg == STRINGSIZ + 1) { /* too long, ignore everything until next quote */ for (++input->p; *input->p != '"'; ++input->p) { if (*input->p == '\\') @@ -757,14 +745,12 @@ string(void) } *bp++ = c; } + *bp++ = '\0'; + input->p++; - input->begin = ++input->p; - *bp = '\0'; + yylval.sym = newstring(buff, bp - buff); + tok2str(); - yylen = bp - yytext + 1; - yylval.sym = newstring(yytext+1, yylen-1); - *bp++ = '"'; - *bp = '\0'; return STRING; }