scc

simple c99 compiler
git clone git://git.simple-cc.org/scc
Log | Files | Refs | README | LICENSE

lex.c (14035B)


      1 #include <assert.h>
      2 #include <ctype.h>
      3 #include <errno.h>
      4 #include <limits.h>
      5 #include <setjmp.h>
      6 #include <stdio.h>
      7 #include <stdlib.h>
      8 #include <string.h>
      9 
     10 #include <scc/cstd.h>
     11 #include <scc/scc.h>
     12 #include "cc1.h"
     13 
/* state of the lexer shared with the rest of cc1 */
int yytoken;                 /* token returned by the last call to next() */
struct yystype yylval;       /* value attached to the current token */
/*
 * text of the current token; string() needs STRINGSIZ characters
 * plus the two quotes and the terminating NUL
 */
char yytext[STRINGSIZ+3];
unsigned short yylen;        /* length stored by tok2str() and string() */
int lexmode = CCMODE;        /* in CPPMODE skipspaces() stops at '\n' */
unsigned lineno;             /* current line, incremented by newline() */
char filenam[FILENAME_MAX];  /* current file name, copied by setloc() */

int namespace = NS_IDEN;     /* namespace used by iden() in its lookup */
static int safe;             /* recovery point selected with setsafe() */
Input *input;                /* top of the stack of inputs */
     25 
/*
 * Initialize the lexer: hand the table of C keywords to keywords()
 * so they are registered in the NS_KEYWORD namespace. The table is
 * terminated by an entry whose name is NULL.
 * NOTE(review): the three fields look like spelling, token returned
 * by the lexer and value stored in yylval.token -- confirm against
 * the definition of struct keyword.
 */
void
ilex(void)
{
	static struct keyword keys[] = {
		{"auto", SCLASS, AUTO},
		{"break", BREAK, BREAK},
		{"_Bool", TYPE, BOOL},
		{"__builtin_va_list", TYPE, VA_LIST},
		{"case", CASE, CASE},
		{"char", TYPE, CHAR},
		{"const", TQUALIFIER, CONST},
		{"continue", CONTINUE, CONTINUE},
		{"default", DEFAULT, DEFAULT},
		{"do", DO, DO},
		{"double", TYPE, DOUBLE},
		{"else", ELSE, ELSE},
		{"enum", TYPE, ENUM},
		{"extern", SCLASS, EXTERN},
		{"float", TYPE, FLOAT},
		{"for", FOR, FOR},
		{"goto", GOTO, GOTO},
		{"if", IF, IF},
		{"inline", TQUALIFIER, INLINE},
		{"int", TYPE, INT},
		{"long", TYPE, LONG},
		{"register", SCLASS, REGISTER},
		{"restrict", TQUALIFIER, RESTRICT},
		{"return", RETURN, RETURN},
		{"short", TYPE, SHORT},
		{"signed", TYPE, SIGNED},
		{"sizeof", SIZEOF, SIZEOF},
		{"static", SCLASS, STATIC},
		{"struct", TYPE, STRUCT},
		{"switch", SWITCH, SWITCH},
		{"typedef", SCLASS, TYPEDEF},
		{"union", TYPE, UNION},
		{"unsigned", TYPE, UNSIGNED},
		{"void", TYPE, VOID},
		{"volatile", TQUALIFIER, VOLATILE},
		{"while", WHILE, WHILE},
		{NULL, 0, 0},
	};
	keywords(keys, NS_KEYWORD);
}
     70 
     71 void
     72 setloc(char *fname, unsigned line)
     73 {
     74 	size_t len;
     75 
     76 	if ((len = strlen(fname)) >= FILENAME_MAX)
     77 		die("cc1: %s: file name too long", fname);
     78 	memmove(filenam, fname, len);
     79 	filenam[len] = '\0';
     80 
     81 	free(input->filenam);
     82 	input->filenam = xstrdup(fname);
     83 	lineno = input->lineno = line;
     84 }
     85 
     86 void
     87 addinput(char *fname, Symbol *hide, char *buffer)
     88 {
     89 	FILE *fp;
     90 	char *extp;
     91 	unsigned flags;
     92 	int infileln;
     93 	Input *newip, *curip = input;
     94 
     95 	if (hide) {
     96 		/* this is a macro expansion */
     97 		fp = NULL;
     98 		if (hide->hide == UCHAR_MAX)
     99 			die("cc1: too many macro expansions");
    100 		++hide->hide;
    101 		flags = IMACRO;
    102 	} else  if (fname) {
    103 		/* a new file */
    104 		if ((fp = fopen(fname, "r")) == NULL)
    105 			die("cc1: %s: %s", fname, strerror(errno));
    106 		flags = IFILE;
    107 		if (curip && onlyheader) {
    108 			infileln = strlen(infile);
    109 			if (extp = strrchr(infile, '.'))
    110 				infileln -= strlen(extp);
    111 			printf("%.*s.o: %s %s\n",
    112 			       infileln, infile, infile, fname);
    113 		}
    114 	} else {
    115 		/* reading from stdin */
    116 		fp = stdin;
    117 		fname = "<stdin>";
    118 		flags = ISTDIN;
    119 	}
    120 
    121 	newip = xmalloc(sizeof(*newip));
    122 
    123 	if (!buffer) {
    124 		buffer = xmalloc(INPUTSIZ);
    125 		buffer[0] = '\0';
    126 	}
    127 
    128 	if (curip)
    129 		curip->lineno = lineno;
    130 
    131 	newip->p = newip->begin = newip->line = buffer;
    132 	newip->filenam = NULL;
    133 	newip->lineno = 0;
    134 	newip->next = curip;
    135 	newip->fp = fp;
    136 	newip->hide = hide;
    137 	newip->flags = flags;
    138 	input = newip;
    139 
    140 	setloc(fname, (curip) ? curip->lineno : newip->lineno);
    141 }
    142 
    143 void
    144 delinput(void)
    145 {
    146 	Input *ip = input;
    147 	Symbol *hide = ip->hide;
    148 
    149 	switch (ip->flags & ITYPE) {
    150 	case IFILE:
    151 		if (fclose(ip->fp))
    152 			die("cc1: %s: %s", ip->filenam, strerror(errno));
    153 		break;
    154 	case IMACRO:
    155 		assert(hide->hide == 1);
    156 		--hide->hide;
    157 		break;
    158 	}
    159 	input = ip->next;
    160 	free(ip->filenam);
    161 	free(ip->line);
    162 	if (input) {
    163 		lineno = input->lineno;
    164 		strcpy(filenam, input->filenam);
    165 	}
    166 }
    167 
    168 static void
    169 newline(void)
    170 {
    171 	if (++lineno == 0)
    172 		die("cc1: %s: file too long", filenam);
    173 }
    174 
    175 /*
    176  * Read the next character from the input file, counting number of lines
    177  * and joining lines escaped with \
    178  */
    179 static int
    180 readchar(void)
    181 {
    182 	FILE *fp = input->fp;
    183 	int c;
    184 
    185 repeat:
    186 	switch (c = getc(fp)) {
    187 	case '\\':
    188 		if ((c = getc(fp)) == '\n') {
    189 			newline();
    190 			goto repeat;
    191 		}
    192 		ungetc(c, fp);
    193 		c = '\\';
    194 		break;
    195 	case '\n':
    196 		newline();
    197 		break;
    198 	default:
    199 		if (!isprint(c) && !ispunct(c) && !isspace(c))
    200 			warn("invalid input character. The shame of UB is yours");
    201 		break;
    202 	}
    203 
    204 	return c;
    205 }
    206 
    207 /*
    208  * discard a C comment. This function is only called from readline
    209  * because it is impossible to have a comment in a macro, because
    210  * comments are always discarded before processing any cpp directive
    211  */
    212 static void
    213 comment(int type)
    214 {
    215 	int c;
    216 
    217 repeat:
    218 	while ((c = readchar()) != EOF && c != type)
    219 		;
    220 
    221 	if (c == EOF) {
    222 		errorp("unterminated comment");
    223 		return;
    224 	}
    225 
    226 	if (type == '*' && (c = readchar()) != '/')
    227 		goto repeat;
    228 }
    229 
    230 /*
    231  * readline is used to read a full logic line from a file.
    232  * It discards comments and check that the line fits in
    233  * the input buffer
    234  */
    235 static int
    236 readline(void)
    237 {
    238 	char *bp, *lim;
    239 	int c, peekc = 0;
    240 
    241 	if (feof(input->fp)) {
    242 		input->flags |= IEOF;
    243 		return 0;
    244 	}
    245 
    246 	*input->line = '\0';
    247 	lim = &input->line[INPUTSIZ-1];
    248 	for (bp = input->line; bp < lim-1; *bp++ = c) {
    249 		c = (peekc) ? peekc : readchar();
    250 		peekc = 0;
    251 		if (c == '\n' || c == EOF)
    252 			break;
    253 		if (c != '/')
    254 			continue;
    255 
    256 		/* check for /* or // */
    257 		peekc = readchar();
    258 		if (peekc != '*' && peekc != '/')
    259 			continue;
    260 		comment((peekc == '/') ? '\n' : '*');
    261 		peekc = 0;
    262 		c = ' ';
    263 	}
    264 
    265 	input->begin = input->p = input->line;
    266 	if (bp == lim-1) {
    267 		errorp("line too long");
    268 		--bp;
    269 	}
    270 	*bp++ = '\n';
    271 	*bp = '\0';
    272 
    273 	return 1;
    274 }
    275 
    276 /*
    277  * moreinput gets more bytes to be passed to the lexer.
    278  * It can take more bytes from macro expansions or
    279  * directly reading from files. When a cpp directive
    280  * is processed the line is discarded because it must not
    281  * be passed to the lexer
    282  */
    283 static int
    284 moreinput(void)
    285 {
    286 	int wasexpand = 0;
    287 
    288 repeat:
    289 	if (!input)
    290 		return 0;
    291 
    292 	if (*input->p == '\0') {
    293 		if ((input->flags&ITYPE) == IMACRO) {
    294 			wasexpand = 1;
    295 			input->flags |= IEOF;
    296 		}
    297 		if (input->flags & IEOF) {
    298 			delinput();
    299 			goto repeat;
    300 		}
    301 		if (!readline() || cpp()) {
    302 			*input->p = '\0';
    303 			goto repeat;
    304 		}
    305 	}
    306 
    307 	if (onlycpp && !wasexpand)
    308 		ppragmaln();
    309 	return 1;
    310 }
    311 
    312 static void
    313 tok2str(void)
    314 {
    315 	if ((yylen = input->p - input->begin) > INTIDENTSIZ)
    316 		error("token too big");
    317 	memcpy(yytext, input->begin, yylen);
    318 	yytext[yylen] = '\0';
    319 	input->begin = input->p;
    320 }
    321 
/*
 * Convert the digits in s to an integer in the given base and store
 * it in sym. The initial type is the one already set in sym; when
 * the value does not fit in it the type is widened step by step and
 * sym->type is updated. sign is UNSIGNED when the constant had a U
 * suffix. If no type can hold the value an error is reported and
 * the value of sym is not set. sym is always returned.
 */
static Symbol *
readint(char *s, int base, int sign, Symbol *sym)
{
	Type *tp = sym->type;
	struct limits *lim;
	TUINT u, val, max;
	int c;

	lim = getlimits(tp);
	max = lim->max.i;
	/* skip the 0 or 0x prefix, if any */
	if (*s == '0')
		++s;
	if (toupper(*s) == 'X')
		++s;

	for (u = 0; isxdigit(c = *s++); u = u*base + val) {
		static char letters[] = "0123456789ABCDEF";
		/* the value of a digit is its position in letters */
		val = strchr(letters, toupper(c)) - letters;
	repeat:
		/* does u*base + val still fit in the current type? */
		if (u <= max/base && u*base <= max - val)
			continue;
		/* it does not: try with the next bigger type */
		if (tp->prop & TSIGNED) {
			/* only decimal constants stay signed */
			if (tp == inttype)
				tp = (base==10) ? longtype : uinttype;
			else if (tp == longtype)
				tp = (base==10) ? llongtype : ulongtype;
			else
				goto overflow;
		} else {
			/* without the U suffix try first a signed type */
			if (tp == uinttype)
				tp = (sign==UNSIGNED) ? ulongtype : longtype;
			else if (tp == ulongtype)
				tp = (sign==UNSIGNED) ? ullongtype : llongtype;
			else
				goto overflow;
		}
		sym->type = tp;
		lim = getlimits(tp);
		max = lim->max.i;
		/* check the same digit again with the new limit */
		goto repeat;
	}

	if (tp->prop & TSIGNED)
		sym->u.i = u;
	else
		sym->u.u = u;

	return sym;

overflow:
	errorp("overflow in integer constant");
	return sym;
}
    375 
    376 static int
    377 integer(char *s, int base)
    378 {
    379 	Type *tp;
    380 	Symbol *sym;
    381 	unsigned size, sign;
    382 
    383 	for (size = sign = 0; ; ++input->p) {
    384 		switch (toupper(*input->p)) {
    385 		case 'L':
    386 			if (size == LLONG)
    387 				goto wrong_type;
    388 			size = (size == LONG) ? LLONG : LONG;
    389 			continue;
    390 		case 'U':
    391 			if (sign == UNSIGNED)
    392 				goto wrong_type;
    393 			sign = UNSIGNED;
    394 			continue;
    395 		default:
    396 			goto convert;
    397 		wrong_type:
    398 			error("invalid suffix in integer constant");
    399 		}
    400 	}
    401 
    402 convert:
    403 	tp = ctype(INT, sign, size);
    404 	sym = newsym(NS_IDEN, NULL);
    405 	sym->type = tp;
    406 	sym->flags |= SCONSTANT;
    407 	yylval.sym = readint(s, base, sign, sym);
    408 	return CONSTANT;
    409 }
    410 
    411 static char *
    412 digits(int base)
    413 {
    414 	char *p;
    415 	int c;
    416 
    417 	for (p = input->p; c = *p; ++p) {
    418 		switch (base) {
    419 		case 8:
    420 			if (!strchr("01234567", c))
    421 				goto end;
    422 			break;
    423 		case 10:
    424 			if (!isdigit(c))
    425 				goto end;
    426 			break;
    427 		case 16:
    428 			if (!isxdigit(c))
    429 				goto end;
    430 			break;
    431 		}
    432 	}
    433 end:
    434 	input->p = p;
    435 	tok2str();
    436 	return yytext;
    437 }
    438 
    439 static int
    440 number(void)
    441 {
    442 	int base;
    443 
    444 	if (*input->p != '0') {
    445 		base = 10;
    446 	} else {
    447 		if (toupper(*++input->p) == 'X') {
    448 			++input->p;
    449 			base = 16;
    450 		} else {
    451 			base = 8;
    452 		}
    453 	}
    454 
    455 	return integer(digits(base), base);
    456 }
    457 
    458 static int
    459 escape(void)
    460 {
    461 	int c, base;
    462 
    463 	switch (*++input->p) {
    464 	case 'a':  return '\a';
    465 	case 'f':  return '\f';
    466 	case 'n':  return '\n';
    467 	case 'r':  return '\r';
    468 	case 't':  return '\t';
    469 	case 'v':  return '\v';
    470 	case '"':  return '"';
    471 	case '\'': return '\'';
    472 	case '\\': return '\\';
    473 	case '\?': return '\?';
    474 	case 'u':
    475 		/*
    476 		 * FIXME: universal constants are not correctly handled
    477 		 */
    478 		if (!isdigit(*++input->p))
    479 			warn("incorrect digit for numerical character constant");
    480 		base = 10;
    481 		break;
    482 	case 'x':
    483 		if (!isxdigit(*++input->p))
    484 			warn("\\x used with no following hex digits");
    485 		base = 16;
    486 		break;
    487 	case '0':
    488 		if (!strchr("01234567", *++input->p))
    489 			warn("\\0 used with no following octal digits");
    490 		base = 8;
    491 		break;
    492 	default:
    493 		warn("unknown escape sequence");
    494 		return ' ';
    495 	}
    496 	errno = 0;
    497 	c = strtoul(input->p, &input->p, base);
    498 	if (errno || c > 255)
    499 		warn("character constant out of range");
    500 	--input->p;
    501 	return c;
    502 }
    503 
    504 static int
    505 character(void)
    506 {
    507 	int c;
    508 	Symbol *sym;
    509 
    510 	if ((c = *++input->p) == '\\')
    511 		c = escape();
    512 	else
    513 		c = *input->p;
    514 	++input->p;
    515 	if (*input->p != '\'')
    516 		errorp("invalid character constant");
    517 	else
    518 		++input->p;
    519 
    520 	sym = newsym(NS_IDEN, NULL);
    521 	sym->u.i = c;
    522 	sym->type = inttype;
    523 	yylval.sym = sym;
    524 	tok2str();
    525 	return CONSTANT;
    526 }
    527 
    528 static int
    529 string(void)
    530 {
    531 	char *bp = yytext;
    532 	int c;
    533 
    534 	*bp++ = '"';
    535 	for (++input->p; (c = *input->p) != '"'; ++input->p) {
    536 		if (c == '\0') {
    537 			errorp("missing terminating '\"' character");
    538 			break;
    539 		}
    540 		if (c == '\\')
    541 			c = escape();
    542 		if (bp == &yytext[STRINGSIZ+1]) {
    543 			/* TODO: proper error handling here */
    544 			error("string too long");
    545 		}
    546 		*bp++ = c;
    547 	}
    548 
    549 	input->begin = ++input->p;
    550 	*bp = '\0';
    551 
    552 	yylen = bp - yytext + 1;
    553 	yylval.sym = newstring(yytext+1, yylen-1);
    554 	*bp++ = '"';
    555 	*bp = '\0';
    556 	return STRING;
    557 }
    558 
    559 static int
    560 iden(void)
    561 {
    562 	Symbol *sym;
    563 	char *p, *begin;
    564 
    565 	begin = input->p;
    566 	for (p = begin; isalnum(*p) || *p == '_'; ++p)
    567 		;
    568 	input->p = p;
    569 	tok2str();
    570 	if ((sym = lookup(NS_CPP, yytext, NOALLOC)) != NULL) {
    571 		if (!disexpand && !sym->hide && expand(begin, sym))
    572 			return next();
    573 	}
    574 	sym = lookup(namespace, yytext, ALLOC);
    575 	yylval.sym = sym;
    576 	if (sym->flags & SCONSTANT)
    577 		return CONSTANT;
    578 	if (sym->token != IDEN)
    579 		yylval.token = sym->u.token;
    580 	return sym->token;
    581 }
    582 
    583 static int
    584 follow(int expect, int ifyes, int ifno)
    585 {
    586 	if (*input->p++ == expect)
    587 		return ifyes;
    588 	--input->p;
    589 	return ifno;
    590 }
    591 
    592 static int
    593 minus(void)
    594 {
    595 	switch (*input->p++) {
    596 	case '-': return DEC;
    597 	case '>': return INDIR;
    598 	case '=': return SUB_EQ;
    599 	default: --input->p; return '-';
    600 	}
    601 }
    602 
    603 static int
    604 plus(void)
    605 {
    606 	switch (*input->p++) {
    607 	case '+': return INC;
    608 	case '=': return ADD_EQ;
    609 	default: --input->p; return '+';
    610 	}
    611 }
    612 
    613 static int
    614 relational(int op, int equal, int shift, int assig)
    615 {
    616 	int c;
    617 
    618 	if ((c = *input->p++) == '=')
    619 		return equal;
    620 	if (c == op)
    621 		return follow('=', assig, shift);
    622 	--input->p;
    623 	return op;
    624 }
    625 
    626 static int
    627 logic(int op, int equal, int logic)
    628 {
    629 	int c;
    630 
    631 	if ((c = *input->p++) == '=')
    632 		return equal;
    633 	if (c == op)
    634 		return logic;
    635 	--input->p;
    636 	return op;
    637 }
    638 
    639 static int
    640 dot(void)
    641 {
    642 	int c;
    643 
    644 	if ((c = *input->p) != '.')
    645 		return '.';
    646 	if ((c = *++input->p) != '.')
    647 		error("incorrect token '..'");
    648 	++input->p;
    649 	return ELLIPSIS;
    650 }
    651 
    652 static int
    653 operator(void)
    654 {
    655 	int t;
    656 
    657 	switch (t = *input->p++) {
    658 	case '<': t = relational('<', LE, SHL, SHL_EQ); break;
    659 	case '>': t = relational('>', GE, SHR, SHR_EQ); break;
    660 	case '&': t = logic('&', AND_EQ, AND); break;
    661 	case '|': t = logic('|', OR_EQ, OR); break;
    662 	case '=': t = follow('=', EQ, '='); break;
    663 	case '^': t = follow('=', XOR_EQ, '^'); break;
    664 	case '*': t = follow('=', MUL_EQ, '*'); break;
    665 	case '/': t = follow('=', DIV_EQ, '/'); break;
    666 	case '!': t = follow('=', NE, '!'); break;
    667 	case '#': t = follow('#', '$', '#'); break;
    668 	case '-': t = minus(); break;
    669 	case '+': t = plus(); break;
    670 	case '.': t = dot(); break;
    671 	}
    672 	tok2str();
    673 	return t;
    674 }
    675 
    676 /* TODO: Ensure that namespace is NS_IDEN after a recovery */
    677 
    678 /*
    679  * skip all the spaces until the next token. When we are in
    680  * CPPMODE \n is not considered a whitespace
    681  */
    682 static int
    683 skipspaces(void)
    684 {
    685 	int c;
    686 
    687 	for (;;) {
    688 		switch (c = *input->p) {
    689 		case '\n':
    690 			if (lexmode == CPPMODE)
    691 				goto return_byte;
    692 			++input->p;
    693 		case '\0':
    694 			if (!moreinput())
    695 				return EOF;
    696 			break;
    697 		case ' ':
    698 		case '\t':
    699 		case '\v':
    700 		case '\r':
    701 		case '\f':
    702 			++input->p;
    703 			break;
    704 		default:
    705 			goto return_byte;
    706 		}
    707 	}
    708 
    709 return_byte:
    710 	input->begin = input->p;
    711 	return c;
    712 }
    713 
    714 int
    715 next(void)
    716 {
    717 	int c;
    718 
    719 	if ((c = skipspaces()) == EOF)
    720 		yytoken = EOFTOK;
    721 	else if (isalpha(c) || c == '_')
    722 		yytoken = iden();
    723 	else if (isdigit(c))
    724 		yytoken = number();
    725 	else if (c == '"')
    726 		yytoken = string();
    727 	else if (c == '\'')
    728 		yytoken = character();
    729 	else
    730 		yytoken = operator();
    731 
    732 	if (yytoken == EOF) {
    733 		strcpy(yytext, "<EOF>");
    734 		if (cppctx)
    735 			errorp("#endif expected");
    736 	}
    737 
    738 	DBG("TOKEN %s", yytext);
    739 	return yytoken;
    740 }
    741 
    742 void
    743 expect(int tok)
    744 {
    745 	if (yytoken != tok) {
    746 		if (isgraph(tok))
    747 			errorp("expected '%c' before '%s'", tok, yytext);
    748 		else
    749 			errorp("unexpected '%s'", yytext);
    750 	} else {
    751 		next();
    752 	}
    753 }
    754 
    755 int
    756 ahead(void)
    757 {
    758 	skipspaces();
    759 	return *input->begin;
    760 }
    761 
/*
 * Select the point where discard() stops skipping input. type is
 * one of the END_* values checked in discard().
 */
void
setsafe(int type)
{
	safe = type;
}
    767 
    768 void
    769 discard(void)
    770 {
    771 	extern jmp_buf recover;
    772 	int c;
    773 
    774 	input->begin = input->p;
    775 	for (c = yytoken; ; c = *input->begin++) {
    776 		switch (safe) {
    777 		case END_COMP:
    778 			if (c == '}')
    779 				goto jump;
    780 			goto semicolon;
    781 		case END_COND:
    782 			if (c == ')')
    783 				goto jump;
    784 			break;
    785 		case END_LDECL:
    786 			if (c == ',')
    787 				goto jump;
    788 		case END_DECL:
    789 		semicolon:
    790 			if (c == ';')
    791 				goto jump;
    792 			break;
    793 		}
    794 		if (c == '\0' && !moreinput())
    795 			exit(1);
    796 	}
    797 jump:
    798 	yytoken = c;
    799 	longjmp(recover, 1);
    800 }