scc

simple c99 compiler
git clone git://git.simple-cc.org/scc
Log | Files | Refs | README | LICENSE

lex.c (14172B)


      1 #include <assert.h>
      2 #include <ctype.h>
      3 #include <errno.h>
      4 #include <limits.h>
      5 #include <setjmp.h>
      6 #include <stdio.h>
      7 #include <stdlib.h>
      8 #include <string.h>
      9 
     10 #include <scc/cstd.h>
     11 #include <scc/scc.h>
     12 #include "cc1.h"
     13 
/* code of the token most recently produced by next() */
int yytoken;
/* semantic value (symbol or token code) attached to the current token */
struct yystype yylval;
/*
 * spelling of the current token; string() stores the two quotes and
 * the terminating NUL around at most STRINGSIZ characters, hence +3
 */
char yytext[STRINGSIZ+3];
/* length of the token held in yytext, set by tok2str() and string() */
unsigned short yylen;
/* CCMODE or CPPMODE; in CPPMODE skipspaces() stops at '\n' */
int lexmode = CCMODE;
/* current line number, incremented by newline() */
unsigned lineno;
/* name of the file being read, updated by setloc() and delinput() */
char filenam[FILENAME_MAX];

/* namespace iden() passes to lookup() for identifiers */
int namespace = NS_IDEN;
/* recovery point stored by setsafe() and read by discard() */
static int safe;
/* top of the stack of inputs pushed by addinput() */
Input *input;
     25 
     26 void
     27 ilex(void)
     28 {
     29 	static struct keyword keys[] = {
     30 		{"auto", SCLASS, AUTO},
     31 		{"break", BREAK, BREAK},
     32 		{"_Bool", TYPE, BOOL},
     33 		{"__builtin_va_list", TYPE, VA_LIST},
     34 		{"case", CASE, CASE},
     35 		{"char", TYPE, CHAR},
     36 		{"const", TQUALIFIER, CONST},
     37 		{"continue", CONTINUE, CONTINUE},
     38 		{"default", DEFAULT, DEFAULT},
     39 		{"do", DO, DO},
     40 		{"double", TYPE, DOUBLE},
     41 		{"else", ELSE, ELSE},
     42 		{"enum", TYPE, ENUM},
     43 		{"extern", SCLASS, EXTERN},
     44 		{"float", TYPE, FLOAT},
     45 		{"for", FOR, FOR},
     46 		{"goto", GOTO, GOTO},
     47 		{"if", IF, IF},
     48 		{"inline", TQUALIFIER, INLINE},
     49 		{"int", TYPE, INT},
     50 		{"long", TYPE, LONG},
     51 		{"register", SCLASS, REGISTER},
     52 		{"restrict", TQUALIFIER, RESTRICT},
     53 		{"return", RETURN, RETURN},
     54 		{"short", TYPE, SHORT},
     55 		{"signed", TYPE, SIGNED},
     56 		{"sizeof", SIZEOF, SIZEOF},
     57 		{"static", SCLASS, STATIC},
     58 		{"struct", TYPE, STRUCT},
     59 		{"switch", SWITCH, SWITCH},
     60 		{"typedef", SCLASS, TYPEDEF},
     61 		{"union", TYPE, UNION},
     62 		{"unsigned", TYPE, UNSIGNED},
     63 		{"void", TYPE, VOID},
     64 		{"volatile", TQUALIFIER, VOLATILE},
     65 		{"while", WHILE, WHILE},
     66 		{NULL, 0, 0},
     67 	};
     68 	keywords(keys, NS_KEYWORD);
     69 }
     70 
     71 void
     72 setloc(char *fname, unsigned line)
     73 {
     74 	size_t len;
     75 
     76 	if ((len = strlen(fname)) >= FILENAME_MAX)
     77 		die("cc1: %s: file name too long", fname);
     78 	memmove(filenam, fname, len);
     79 	filenam[len] = '\0';
     80 
     81 	free(input->filenam);
     82 	input->filenam = xstrdup(fname);
     83 	lineno = input->lineno = line;
     84 }
     85 
     86 int
     87 addinput(char *fname, Symbol *hide, char *buffer, int fail)
     88 {
     89 	FILE *fp;
     90 	char *extp;
     91 	unsigned flags;
     92 	int infileln;
     93 	Input *newip, *curip = input;
     94 
     95 	if (hide) {
     96 		/* this is a macro expansion */
     97 		fp = NULL;
     98 		if (hide->hide == UCHAR_MAX)
     99 			die("cc1: too many macro expansions");
    100 		++hide->hide;
    101 		flags = IMACRO;
    102 	} else  if (fname) {
    103 		/* a new file */
    104 		if ((fp = fopen(fname, "r")) == NULL) {
    105 			if (!fail)
    106 				return 0;
    107 			die("cc1: %s: %s", fname, strerror(errno));
    108 		}
    109 		flags = IFILE;
    110 		if (curip && onlyheader) {
    111 			infileln = strlen(infile);
    112 			if (extp = strrchr(infile, '.'))
    113 				infileln -= strlen(extp);
    114 			printf("%.*s.o: %s %s\n",
    115 			       infileln, infile, infile, fname);
    116 		}
    117 	} else {
    118 		/* reading from stdin */
    119 		fp = stdin;
    120 		fname = "<stdin>";
    121 		flags = ISTDIN;
    122 	}
    123 
    124 	newip = xmalloc(sizeof(*newip));
    125 
    126 	if (!buffer) {
    127 		buffer = xmalloc(INPUTSIZ);
    128 		buffer[0] = '\0';
    129 	}
    130 
    131 	if (curip)
    132 		curip->lineno = lineno;
    133 
    134 	newip->p = newip->begin = newip->line = buffer;
    135 	newip->filenam = NULL;
    136 	newip->lineno = 0;
    137 	newip->next = curip;
    138 	newip->fp = fp;
    139 	newip->hide = hide;
    140 	newip->flags = flags;
    141 	input = newip;
    142 
    143 	setloc(fname, (curip) ? curip->lineno : newip->lineno);
    144 	return 1;
    145 }
    146 
    147 void
    148 delinput(void)
    149 {
    150 	Input *ip = input;
    151 	Symbol *hide = ip->hide;
    152 
    153 	switch (ip->flags & ITYPE) {
    154 	case IFILE:
    155 		if (fclose(ip->fp))
    156 			die("cc1: %s: %s", ip->filenam, strerror(errno));
    157 		break;
    158 	case IMACRO:
    159 		assert(hide->hide == 1);
    160 		--hide->hide;
    161 		break;
    162 	}
    163 	input = ip->next;
    164 	free(ip->filenam);
    165 	free(ip->line);
    166 	if (input) {
    167 		lineno = input->lineno;
    168 		strcpy(filenam, input->filenam);
    169 	}
    170 }
    171 
    172 static void
    173 newline(void)
    174 {
    175 	if (++lineno == 0)
    176 		die("cc1: %s: file too long", filenam);
    177 }
    178 
    179 /*
    180  * Read the next character from the input file, counting number of lines
    181  * and joining lines escaped with \
    182  */
    183 static int
    184 readchar(void)
    185 {
    186 	FILE *fp = input->fp;
    187 	int c;
    188 
    189 repeat:
    190 	switch (c = getc(fp)) {
    191 	case '\\':
    192 		if ((c = getc(fp)) == '\n') {
    193 			newline();
    194 			goto repeat;
    195 		}
    196 		ungetc(c, fp);
    197 		c = '\\';
    198 		break;
    199 	case '\n':
    200 		newline();
    201 		break;
    202 	default:
    203 		if (!isprint(c) && !ispunct(c) && !isspace(c))
    204 			warn("invalid input character. The shame of UB is yours");
    205 		break;
    206 	}
    207 
    208 	return c;
    209 }
    210 
    211 /*
    212  * discard a C comment. This function is only called from readline
    213  * because it is impossible to have a comment in a macro, because
    214  * comments are always discarded before processing any cpp directive
    215  */
    216 static void
    217 comment(int type)
    218 {
    219 	int c;
    220 
    221 repeat:
    222 	while ((c = readchar()) != EOF && c != type)
    223 		;
    224 
    225 	if (c == EOF) {
    226 		errorp("unterminated comment");
    227 		return;
    228 	}
    229 
    230 	if (type == '*' && (c = readchar()) != '/')
    231 		goto repeat;
    232 }
    233 
    234 /*
    235  * readline is used to read a full logic line from a file.
    236  * It discards comments and check that the line fits in
    237  * the input buffer
    238  */
    239 static int
    240 readline(void)
    241 {
    242 	char *bp, *lim;
    243 	int c, peekc = 0;
    244 
    245 	if (feof(input->fp)) {
    246 		input->flags |= IEOF;
    247 		return 0;
    248 	}
    249 
    250 	*input->line = '\0';
    251 	lim = &input->line[INPUTSIZ-1];
    252 	for (bp = input->line; bp < lim-1; *bp++ = c) {
    253 		c = (peekc) ? peekc : readchar();
    254 		peekc = 0;
    255 		if (c == '\n' || c == EOF)
    256 			break;
    257 		if (c != '/')
    258 			continue;
    259 
    260 		/* check for /* or // */
    261 		peekc = readchar();
    262 		if (peekc != '*' && peekc != '/')
    263 			continue;
    264 		comment((peekc == '/') ? '\n' : '*');
    265 		peekc = 0;
    266 		c = ' ';
    267 	}
    268 
    269 	input->begin = input->p = input->line;
    270 	if (bp == lim-1) {
    271 		errorp("line too long");
    272 		--bp;
    273 	}
    274 	*bp++ = '\n';
    275 	*bp = '\0';
    276 
    277 	return 1;
    278 }
    279 
    280 /*
    281  * moreinput gets more bytes to be passed to the lexer.
    282  * It can take more bytes from macro expansions or
    283  * directly reading from files. When a cpp directive
    284  * is processed the line is discarded because it must not
    285  * be passed to the lexer
    286  */
    287 static int
    288 moreinput(void)
    289 {
    290 	int wasexpand = 0;
    291 
    292 repeat:
    293 	if (!input)
    294 		return 0;
    295 
    296 	if (*input->p == '\0') {
    297 		if ((input->flags&ITYPE) == IMACRO) {
    298 			wasexpand = 1;
    299 			input->flags |= IEOF;
    300 		}
    301 		if (input->flags & IEOF) {
    302 			delinput();
    303 			goto repeat;
    304 		}
    305 		if (!readline() || cpp()) {
    306 			*input->p = '\0';
    307 			goto repeat;
    308 		}
    309 	}
    310 
    311 	if (onlycpp && !wasexpand)
    312 		ppragmaln();
    313 	return 1;
    314 }
    315 
    316 static void
    317 tok2str(void)
    318 {
    319 	if ((yylen = input->p - input->begin) > INTIDENTSIZ)
    320 		error("token too big");
    321 	memcpy(yytext, input->begin, yylen);
    322 	yytext[yylen] = '\0';
    323 	input->begin = input->p;
    324 }
    325 
    326 static Symbol *
    327 readint(char *s, int base, int sign, Symbol *sym)
    328 {
    329 	Type *tp = sym->type;
    330 	struct limits *lim;
    331 	TUINT u, val, max;
    332 	int c;
    333 
    334 	lim = getlimits(tp);
    335 	max = lim->max.i;
    336 	if (*s == '0')
    337 		++s;
    338 	if (toupper(*s) == 'X')
    339 		++s;
    340 
    341 	for (u = 0; isxdigit(c = *s++); u = u*base + val) {
    342 		static char letters[] = "0123456789ABCDEF";
    343 		val = strchr(letters, toupper(c)) - letters;
    344 	repeat:
    345 		if (u <= max/base && u*base <= max - val)
    346 			continue;
    347 		if (tp->prop & TSIGNED) {
    348 			if (tp == inttype)
    349 				tp = (base==10) ? longtype : uinttype;
    350 			else if (tp == longtype)
    351 				tp = (base==10) ? llongtype : ulongtype;
    352 			else
    353 				goto overflow;
    354 		} else {
    355 			if (tp == uinttype)
    356 				tp = (sign==UNSIGNED) ? ulongtype : longtype;
    357 			else if (tp == ulongtype)
    358 				tp = (sign==UNSIGNED) ? ullongtype : llongtype;
    359 			else
    360 				goto overflow;
    361 		}
    362 		sym->type = tp;
    363 		lim = getlimits(tp);
    364 		max = lim->max.i;
    365 		goto repeat;
    366 	}
    367 
    368 	if (tp->prop & TSIGNED)
    369 		sym->u.i = u;
    370 	else
    371 		sym->u.u = u;
    372 
    373 	return sym;
    374 
    375 overflow:
    376 	errorp("overflow in integer constant");
    377 	return sym;
    378 }
    379 
    380 static int
    381 integer(char *s, int base)
    382 {
    383 	Type *tp;
    384 	Symbol *sym;
    385 	unsigned size, sign;
    386 
    387 	for (size = sign = 0; ; ++input->p) {
    388 		switch (toupper(*input->p)) {
    389 		case 'L':
    390 			if (size == LLONG)
    391 				goto wrong_type;
    392 			size = (size == LONG) ? LLONG : LONG;
    393 			continue;
    394 		case 'U':
    395 			if (sign == UNSIGNED)
    396 				goto wrong_type;
    397 			sign = UNSIGNED;
    398 			continue;
    399 		default:
    400 			goto convert;
    401 		wrong_type:
    402 			error("invalid suffix in integer constant");
    403 		}
    404 	}
    405 
    406 convert:
    407 	tp = ctype(INT, sign, size);
    408 	sym = newsym(NS_IDEN, NULL);
    409 	sym->type = tp;
    410 	sym->flags |= SCONSTANT;
    411 	yylval.sym = readint(s, base, sign, sym);
    412 	return CONSTANT;
    413 }
    414 
    415 static char *
    416 digits(int base)
    417 {
    418 	char *p;
    419 	int c;
    420 
    421 	for (p = input->p; c = *p; ++p) {
    422 		switch (base) {
    423 		case 8:
    424 			if (!strchr("01234567", c))
    425 				goto end;
    426 			break;
    427 		case 10:
    428 			if (!isdigit(c))
    429 				goto end;
    430 			break;
    431 		case 16:
    432 			if (!isxdigit(c))
    433 				goto end;
    434 			break;
    435 		}
    436 	}
    437 end:
    438 	input->p = p;
    439 	tok2str();
    440 	return yytext;
    441 }
    442 
    443 static int
    444 number(void)
    445 {
    446 	int base;
    447 
    448 	if (*input->p != '0') {
    449 		base = 10;
    450 	} else {
    451 		if (toupper(*++input->p) == 'X') {
    452 			++input->p;
    453 			base = 16;
    454 		} else {
    455 			base = 8;
    456 		}
    457 	}
    458 
    459 	return integer(digits(base), base);
    460 }
    461 
    462 static int
    463 escape(void)
    464 {
    465 	int c, base;
    466 
    467 	switch (*++input->p) {
    468 	case 'a':
    469 		return '\a';
    470 	case 'f':
    471 		return '\f';
    472 	case 'n':
    473 		return '\n';
    474 	case 'r':
    475 		return '\r';
    476 	case 't':
    477 		return '\t';
    478 	case 'v':
    479 		return '\v';
    480 	case '"':
    481 		return '"';
    482 	case '\'':
    483 		return '\'';
    484 	case '\\':
    485 		return '\\';
    486 	case '\?':
    487 		return '\?';
    488 	case 'u':
    489 		/*
    490 		 * FIXME: universal constants are not correctly handled
    491 		 */
    492 		if (!isdigit(*++input->p))
    493 			warn("incorrect digit for numerical character constant");
    494 		base = 10;
    495 		break;
    496 	case 'x':
    497 		if (!isxdigit(*++input->p))
    498 			warn("\\x used with no following hex digits");
    499 		base = 16;
    500 		break;
    501 	case '0':
    502 		if (!strchr("01234567", *++input->p))
    503 			warn("\\0 used with no following octal digits");
    504 		base = 8;
    505 		break;
    506 	default:
    507 		warn("unknown escape sequence");
    508 		return ' ';
    509 	}
    510 	errno = 0;
    511 	c = strtoul(input->p, &input->p, base);
    512 	if (errno || c > 255)
    513 		warn("character constant out of range");
    514 	--input->p;
    515 	return c;
    516 }
    517 
    518 static int
    519 character(void)
    520 {
    521 	int c;
    522 	Symbol *sym;
    523 
    524 	if ((c = *++input->p) == '\\')
    525 		c = escape();
    526 	else
    527 		c = *input->p;
    528 	++input->p;
    529 	if (*input->p != '\'')
    530 		errorp("invalid character constant");
    531 	else
    532 		++input->p;
    533 
    534 	sym = newsym(NS_IDEN, NULL);
    535 	sym->u.i = c;
    536 	sym->type = inttype;
    537 	yylval.sym = sym;
    538 	tok2str();
    539 	return CONSTANT;
    540 }
    541 
    542 static int
    543 string(void)
    544 {
    545 	char *bp = yytext;
    546 	int c;
    547 
    548 	*bp++ = '"';
    549 	for (++input->p; (c = *input->p) != '"'; ++input->p) {
    550 		if (c == '\0') {
    551 			errorp("missing terminating '\"' character");
    552 			break;
    553 		}
    554 		if (c == '\\')
    555 			c = escape();
    556 		if (bp == &yytext[STRINGSIZ+1]) {
    557 			/* TODO: proper error handling here */
    558 			error("string too long");
    559 		}
    560 		*bp++ = c;
    561 	}
    562 
    563 	input->begin = ++input->p;
    564 	*bp = '\0';
    565 
    566 	yylen = bp - yytext + 1;
    567 	yylval.sym = newstring(yytext+1, yylen-1);
    568 	*bp++ = '"';
    569 	*bp = '\0';
    570 	return STRING;
    571 }
    572 
    573 static int
    574 iden(void)
    575 {
    576 	Symbol *sym;
    577 	char *p, *begin;
    578 
    579 	begin = input->p;
    580 	for (p = begin; isalnum(*p) || *p == '_'; ++p)
    581 		;
    582 	input->p = p;
    583 	tok2str();
    584 	if ((sym = lookup(NS_CPP, yytext, NOALLOC)) != NULL) {
    585 		if (!disexpand && !sym->hide && expand(begin, sym))
    586 			return next();
    587 	}
    588 	sym = lookup(namespace, yytext, ALLOC);
    589 	yylval.sym = sym;
    590 	if (sym->flags & SCONSTANT)
    591 		return CONSTANT;
    592 	if (sym->token != IDEN)
    593 		yylval.token = sym->u.token;
    594 	return sym->token;
    595 }
    596 
    597 static int
    598 follow(int expect, int ifyes, int ifno)
    599 {
    600 	if (*input->p++ == expect)
    601 		return ifyes;
    602 	--input->p;
    603 	return ifno;
    604 }
    605 
    606 static int
    607 minus(void)
    608 {
    609 	switch (*input->p++) {
    610 	case '-':
    611 		return DEC;
    612 	case '>':
    613 		return INDIR;
    614 	case '=':
    615 		return SUB_EQ;
    616 	default:
    617 		--input->p;
    618 		return '-';
    619 	}
    620 }
    621 
    622 static int
    623 plus(void)
    624 {
    625 	switch (*input->p++) {
    626 	case '+':
    627 		return INC;
    628 	case '=':
    629 		return ADD_EQ;
    630 	default:
    631 		--input->p;
    632 		return '+';
    633 	}
    634 }
    635 
    636 static int
    637 relational(int op, int equal, int shift, int assig)
    638 {
    639 	int c;
    640 
    641 	if ((c = *input->p++) == '=')
    642 		return equal;
    643 	if (c == op)
    644 		return follow('=', assig, shift);
    645 	--input->p;
    646 	return op;
    647 }
    648 
    649 static int
    650 logic(int op, int equal, int logic)
    651 {
    652 	int c;
    653 
    654 	if ((c = *input->p++) == '=')
    655 		return equal;
    656 	if (c == op)
    657 		return logic;
    658 	--input->p;
    659 	return op;
    660 }
    661 
    662 static int
    663 dot(void)
    664 {
    665 	int c;
    666 
    667 	if ((c = *input->p) != '.')
    668 		return '.';
    669 	if ((c = *++input->p) != '.')
    670 		error("incorrect token '..'");
    671 	++input->p;
    672 	return ELLIPSIS;
    673 }
    674 
    675 static int
    676 operator(void)
    677 {
    678 	int t;
    679 
    680 	switch (t = *input->p++) {
    681 	case '<':
    682 		t = relational('<', LE, SHL, SHL_EQ);
    683 		break;
    684 	case '>':
    685 		t = relational('>', GE, SHR, SHR_EQ);
    686 		break;
    687 	case '&':
    688 		t = logic('&', AND_EQ, AND);
    689 		break;
    690 	case '|':
    691 		t = logic('|', OR_EQ, OR);
    692 		break;
    693 	case '=':
    694 		t = follow('=', EQ, '=');
    695 		break;
    696 	case '^':
    697 		t = follow('=', XOR_EQ, '^');
    698 		break;
    699 	case '*':
    700 		t = follow('=', MUL_EQ, '*');
    701 		break;
    702 	case '/':
    703 		t = follow('=', DIV_EQ, '/');
    704 		break;
    705 	case '!':
    706 		t = follow('=', NE, '!');
    707 		break;
    708 	case '#':
    709 		t = follow('#', '$', '#');
    710 		break;
    711 	case '-':
    712 		t = minus();
    713 		break;
    714 	case '+':
    715 		t = plus();
    716 		break;
    717 	case '.':
    718 		t = dot();
    719 		break;
    720 	}
    721 	tok2str();
    722 	return t;
    723 }
    724 
    725 /* TODO: Ensure that namespace is NS_IDEN after a recovery */
    726 
    727 /*
    728  * skip all the spaces until the next token. When we are in
    729  * CPPMODE \n is not considered a whitespace
    730  */
    731 static int
    732 skipspaces(void)
    733 {
    734 	int c;
    735 
    736 	for (;;) {
    737 		switch (c = *input->p) {
    738 		case '\n':
    739 			if (lexmode == CPPMODE)
    740 				goto return_byte;
    741 			++input->p;
    742 		case '\0':
    743 			if (!moreinput())
    744 				return EOF;
    745 			break;
    746 		case ' ':
    747 		case '\t':
    748 		case '\v':
    749 		case '\r':
    750 		case '\f':
    751 			++input->p;
    752 			break;
    753 		default:
    754 			goto return_byte;
    755 		}
    756 	}
    757 
    758 return_byte:
    759 	input->begin = input->p;
    760 	return c;
    761 }
    762 
    763 int
    764 next(void)
    765 {
    766 	int c;
    767 
    768 	if ((c = skipspaces()) == EOF)
    769 		yytoken = EOFTOK;
    770 	else if (isalpha(c) || c == '_')
    771 		yytoken = iden();
    772 	else if (isdigit(c))
    773 		yytoken = number();
    774 	else if (c == '"')
    775 		yytoken = string();
    776 	else if (c == '\'')
    777 		yytoken = character();
    778 	else
    779 		yytoken = operator();
    780 
    781 	if (yytoken == EOF) {
    782 		strcpy(yytext, "<EOF>");
    783 		if (cppctx)
    784 			errorp("#endif expected");
    785 	}
    786 
    787 	DBG("TOKEN %s", yytext);
    788 	return yytoken;
    789 }
    790 
    791 void
    792 expect(int tok)
    793 {
    794 	if (yytoken != tok) {
    795 		if (isgraph(tok))
    796 			errorp("expected '%c' before '%s'", tok, yytext);
    797 		else
    798 			errorp("unexpected '%s'", yytext);
    799 	} else {
    800 		next();
    801 	}
    802 }
    803 
    804 int
    805 ahead(void)
    806 {
    807 	skipspaces();
    808 	return *input->begin;
    809 }
    810 
/*
 * Select the recovery point used by discard(): type is one of the
 * END_* codes that discard() switches on.
 */
void
setsafe(int type)
{
	safe = type;
}
    816 
    817 void
    818 discard(void)
    819 {
    820 	extern jmp_buf recover;
    821 	int c;
    822 
    823 	input->begin = input->p;
    824 	for (c = yytoken; ; c = *input->begin++) {
    825 		switch (safe) {
    826 		case END_COMP:
    827 			if (c == '}')
    828 				goto jump;
    829 			goto semicolon;
    830 		case END_COND:
    831 			if (c == ')')
    832 				goto jump;
    833 			break;
    834 		case END_LDECL:
    835 			if (c == ',')
    836 				goto jump;
    837 		case END_DECL:
    838 		semicolon:
    839 			if (c == ';')
    840 				goto jump;
    841 			break;
    842 		}
    843 		if (c == '\0' && !moreinput())
    844 			exit(1);
    845 	}
    846 jump:
    847 	yytoken = c;
    848 	longjmp(recover, 1);
    849 }