lex.c (16273B)
1 #include <assert.h> 2 #include <ctype.h> 3 #include <errno.h> 4 #include <limits.h> 5 #include <setjmp.h> 6 #include <stdio.h> 7 #include <stdlib.h> 8 #include <string.h> 9 10 #include <scc/cstd.h> 11 #include <scc/scc.h> 12 #include "cc1.h" 13 14 int yytoken; 15 struct yystype yylval; 16 char yytext[STRINGSIZ+3]; 17 unsigned short yylen; 18 int lexmode = CCMODE; 19 unsigned lineno; 20 char filenam[FILENAME_MAX]; 21 22 int namespace = NS_IDEN; 23 static int safe; 24 Input *input; 25 26 void 27 setloc(char *fname, unsigned line) 28 { 29 size_t len; 30 31 if (fname) { 32 if ((len = strlen(fname)) >= FILENAME_MAX) 33 die("cc1: %s: file name too long", fname); 34 memmove(filenam, fname, len); 35 filenam[len] = '\0'; 36 37 /* 38 * There are cases where we want to call setloc() 39 * with the data in input, and then we have t be 40 * careful about freeing input->filenam 41 */ 42 if (fname != input->filenam) { 43 free(input->filenam); 44 input->filenam = xstrdup(fname); 45 } 46 } 47 48 lineno = input->lineno = line; 49 } 50 51 int 52 addinput(int type, void *arg, int fail) 53 { 54 FILE *fp; 55 char *extp, *fname, *buffer, *infile; 56 int infileln; 57 Macro *mp; 58 Symbol *sym; 59 Input *newip, *curip = input; 60 61 if (curip) 62 curip->lineno = lineno; 63 64 switch (type) { 65 case IMACRO: 66 fp = NULL; 67 mp = arg; 68 sym = mp->sym; 69 fname = mp->fname; 70 buffer = mp->buffer; 71 DBG("INPUT: expanding macro %s", sym->name); 72 break; 73 case IPARAM: 74 fp = NULL; 75 mp = NULL; 76 buffer = arg; 77 fname = filenam; 78 DBG("INPUT: macro parameter '%s'", buffer); 79 break; 80 case IFILE: 81 fname = arg; 82 mp = NULL; 83 buffer = NULL; 84 85 if ((fp = fopen(fname, "r")) == NULL) { 86 if (!fail) 87 return 0; 88 die("cc1: %s: %s", fname, strerror(errno)); 89 } 90 if (curip && onlyheader) { 91 infile = curip->filenam; 92 infileln = strlen(infile); 93 if (extp = strrchr(infile, '.')) 94 infileln -= strlen(extp); 95 printf("%.*s.o: %s %s\n", 96 infileln, infile, infile, fname); 97 } 98 lineno = 0; 99 DBG("INPUT: file input '%s'", fname); 100 break; 101 case ISTDIN: 102 fp = stdin; 103 mp = NULL; 104 fname = "<stdin>"; 105 buffer = NULL; 106 lineno = 0; 107 DBG("INPUT: file input 'stdin'"); 108 break; 109 default: 110 abort(); 111 } 112 113 if (!buffer) { 114 buffer = xmalloc(INPUTSIZ); 115 buffer[0] = '\0'; 116 } else { 117 buffer = xstrdup(buffer); 118 } 119 120 newip = xmalloc(sizeof(*newip)); 121 newip->next = curip; 122 newip->macro = mp; 123 newip->p = newip->begin = newip->line = buffer; 124 newip->filenam = NULL; 125 newip->lineno = 0; 126 newip->fp = fp; 127 newip->flags = type; 128 input = newip; 129 130 setloc(fname, lineno); 131 return 1; 132 } 133 134 void 135 delinput(void) 136 { 137 Input *ip = input; 138 139 switch (ip->flags & ITYPE) { 140 case IFILE: 141 DBG("INPUT: file finished '%s'", ip->filenam); 142 if (fclose(ip->fp)) 143 die("cc1: %s: %s", ip->filenam, strerror(errno)); 144 break; 145 case IMACRO: 146 DBG("INPUT: macro %s finished", ip->macro->sym->name); 147 delmacro(ip->macro); 148 break; 149 case IPARAM: 150 DBG("INPUT: macro param finished"); 151 break; 152 case ISTDIN: 153 DBG("INPUT: stdin finished"); 154 break; 155 default: 156 abort(); 157 } 158 159 input = ip->next; 160 free(ip->filenam); 161 free(ip->line); 162 free(ip); 163 if (input) 164 setloc(input->filenam, input->lineno); 165 } 166 167 static void 168 newline(void) 169 { 170 if (++lineno == 0) 171 die("cc1: %s: file too long", filenam); 172 } 173 174 /* 175 * Read the next character from the input file, counting number of lines 176 * and joining lines escaped with \ 177 */ 178 static int 179 readchar(void) 180 { 181 FILE *fp = input->fp; 182 int c; 183 184 repeat: 185 switch (c = getc(fp)) { 186 case '\\': 187 if ((c = getc(fp)) == '\n') { 188 newline(); 189 goto repeat; 190 } 191 ungetc(c, fp); 192 c = '\\'; 193 break; 194 case '\n': 195 newline(); 196 break; 197 case EOF: 198 break; 199 } 200 201 return c; 202 } 203 204 /* 205 * discard a C comment. This function is only called from readline 206 * because it is impossible to have a comment in a macro, because 207 * comments are always discarded before processing any cpp directive 208 */ 209 static void 210 comment(int type) 211 { 212 int c; 213 214 repeat: 215 while ((c = readchar()) != EOF && c != type) 216 ; 217 218 if (c == EOF) { 219 errorp("unterminated comment"); 220 return; 221 } 222 223 if (type == '*' && (c = readchar()) != '/') 224 goto repeat; 225 } 226 227 /* 228 * readline is used to read a full logic line from a file. 229 * It discards comments and check that the line fits in 230 * the input buffer 231 */ 232 static int 233 readline(void) 234 { 235 char *bp, *lim; 236 int c, peekc = 0, delim = 0; 237 238 if (feof(input->fp)) { 239 input->flags |= IEOF; 240 *input->p = '\0'; 241 return 0; 242 } 243 244 *input->line = '\0'; 245 lim = &input->line[INPUTSIZ-1]; 246 for (bp = input->line; bp < lim-1; *bp++ = c) { 247 c = (peekc) ? peekc : readchar(); 248 peekc = 0; 249 if (c == '\n' || c == EOF) 250 break; 251 if (c == '\\') { 252 peekc = readchar(); 253 if (peekc == '\n' || peekc == EOF) 254 continue; 255 if (bp == lim-2) 256 break; 257 *bp++ = c; 258 c = peekc; 259 peekc = 0; 260 continue; 261 } 262 263 if (delim && c == delim) 264 delim = 0; 265 else if (!delim && (c == '"' || c == '\'')) 266 delim = c; 267 if (c != '/' || delim) 268 continue; 269 270 /* check for /* or // */ 271 peekc = readchar(); 272 if (peekc != '*' && peekc != '/') 273 continue; 274 275 if (peekc == '/') { 276 comment('\n'); 277 break; 278 } else { 279 comment('*'); 280 c = ' '; 281 } 282 peekc = 0; 283 } 284 285 input->begin = input->p = input->line; 286 if (bp == lim-1) { 287 errorp("line too long"); 288 --bp; 289 } 290 *bp++ = '\n'; 291 *bp = '\0'; 292 293 return 1; 294 } 295 296 /* 297 * moreinput gets more bytes to be passed to the lexer. 298 * It can take more bytes from macro expansions or 299 * directly reading from files. When a cpp directive 300 * is processed the line is discarded because it must not 301 * be passed to the lexer 302 */ 303 static int 304 moreinput(void) 305 { 306 int wasexpand = 0; 307 308 repeat: 309 if (!input) 310 return 0; 311 312 if (*input->p == '\0') { 313 int t = input->flags & ITYPE; 314 if (t == IPARAM) { 315 input->flags |= IEOF; 316 return 0; 317 } 318 if (t == IMACRO) { 319 wasexpand = 1; 320 input->flags |= IEOF; 321 } 322 if (input->flags & IEOF) { 323 delinput(); 324 goto repeat; 325 } 326 if (!readline()) { 327 *input->p = '\0'; 328 goto repeat; 329 } 330 if (cpp()) 331 goto repeat; 332 } 333 334 if (onlycpp && !wasexpand) 335 ppragmaln(); 336 return 1; 337 } 338 339 static void 340 tok2str(void) 341 { 342 if ((yylen = input->p - input->begin) > INTIDENTSIZ) 343 error("token too big"); 344 memcpy(yytext, input->begin, yylen); 345 yytext[yylen] = '\0'; 346 input->begin = input->p; 347 } 348 349 static Symbol * 350 readint(char *s, int base, int sign, Symbol *sym) 351 { 352 Type *tp = sym->type; 353 struct limits *lim; 354 TUINT u, val, max; 355 int c; 356 357 lim = getlimits(tp); 358 max = lim->max.i; 359 if (*s == '0') 360 ++s; 361 if (toupper(*s) == 'X') 362 ++s; 363 364 for (u = 0; isxdigit(c = *s++); u = u*base + val) { 365 static char letters[] = "0123456789ABCDEF"; 366 val = strchr(letters, toupper(c)) - letters; 367 repeat: 368 if (u <= max/base && u*base <= max - val) 369 continue; 370 if (tp->prop & TSIGNED) { 371 if (tp == inttype) 372 tp = (base==10) ? longtype : uinttype; 373 else if (tp == longtype) 374 tp = (base==10) ? llongtype : ulongtype; 375 else 376 goto overflow; 377 } else { 378 if (tp == uinttype) 379 tp = (sign==UNSIGNED) ? ulongtype : longtype; 380 else if (tp == ulongtype) 381 tp = (sign==UNSIGNED) ? ullongtype : llongtype; 382 else 383 goto overflow; 384 } 385 sym->type = tp; 386 lim = getlimits(tp); 387 max = lim->max.i; 388 goto repeat; 389 } 390 391 if (tp->prop & TSIGNED) 392 sym->u.i = u; 393 else 394 sym->u.u = u; 395 396 return sym; 397 398 overflow: 399 errorp("overflow in integer constant"); 400 return sym; 401 } 402 403 static int 404 integer(char *s, int base) 405 { 406 Type *tp; 407 Symbol *sym; 408 unsigned size, sign; 409 410 for (size = sign = 0; ; ++input->p) { 411 switch (toupper(*input->p)) { 412 case 'L': 413 if (size == LLONG) 414 goto wrong_type; 415 size = (size == LONG) ? LLONG : LONG; 416 continue; 417 case 'U': 418 if (sign == UNSIGNED) 419 goto wrong_type; 420 sign = UNSIGNED; 421 continue; 422 default: 423 goto convert; 424 wrong_type: 425 error("invalid suffix in integer constant"); 426 } 427 } 428 429 convert: 430 tok2str(); 431 tp = ctype(INT, sign, size); 432 sym = newsym(NS_IDEN, NULL); 433 sym->type = tp; 434 sym->flags |= SCONSTANT; 435 yylval.sym = readint(s, base, sign, sym); 436 return CONSTANT; 437 } 438 439 static char * 440 digits(int base) 441 { 442 char *p; 443 int c; 444 445 for (p = input->p; c = *p; ++p) { 446 switch (base) { 447 case 8: 448 if (!strchr("01234567", c)) 449 goto end; 450 break; 451 case 10: 452 if (!isdigit(c)) 453 goto end; 454 break; 455 case 16: 456 if (!isxdigit(c)) 457 goto end; 458 break; 459 } 460 } 461 end: 462 input->p = p; 463 return yytext; 464 } 465 466 static int 467 number(void) 468 { 469 int base; 470 471 if (*input->p != '0') { 472 base = 10; 473 } else { 474 if (toupper(*++input->p) == 'X') { 475 ++input->p; 476 base = 16; 477 } else { 478 base = 8; 479 } 480 } 481 482 return integer(digits(base), base); 483 } 484 485 static int 486 escape(void) 487 { 488 int c, d, i, cnt, base; 489 490 switch (*++input->p) { 491 case 'a': 492 return '\a'; 493 case 'b': 494 return '\b'; 495 case 'f': 496 return '\f'; 497 case 'n': 498 return '\n'; 499 case 'r': 500 return '\r'; 501 case 't': 502 return '\t'; 503 case 'v': 504 return '\v'; 505 case '"': 506 return '"'; 507 case '\'': 508 return '\''; 509 case '\\': 510 return '\\'; 511 case '\?': 512 return '\?'; 513 case 'u': 514 /* 515 * FIXME: universal constants are not correctly handled 516 */ 517 if (!isdigit(*++input->p)) 518 warn("incorrect digit for numerical character constant"); 519 base = 10; 520 break; 521 case 'x': 522 if (!isxdigit(*++input->p)) 523 warn("\\x used with no following hex digits"); 524 cnt = 2; 525 base = 16; 526 break; 527 case '0': 528 case '1': 529 case '2': 530 case '3': 531 case '4': 532 case '5': 533 case '6': 534 case '7': 535 cnt = 3; 536 base = 8; 537 break; 538 default: 539 warn("unknown escape sequence"); 540 return ' '; 541 } 542 543 for (c = i = 0; i < cnt; ++i) { 544 static char digits[] = "0123456789ABCDEF"; 545 char *p = strchr(digits, toupper(*input->p)); 546 547 if (!p || (d = p - digits) > base) 548 break; 549 c *= base; 550 c += d; 551 ++input->p; 552 } 553 --input->p; 554 555 return c; 556 } 557 558 static Rune 559 utf8rune(void) 560 { 561 Rune wc; 562 unsigned c; 563 size_t i, len; 564 565 c = *input->p; 566 for (len = 0; c & 0x80; len++) 567 c <<= 1; 568 if (len == 0) 569 return c; 570 if (len == 1 || len == 8) 571 goto invalid; 572 573 wc = (c & 0xFF) >> len; 574 for (i = 0; i < len-1; i++) { 575 c = input->p[1]; 576 if ((c & 0xC0) != 0x80) 577 goto invalid; 578 input->p++; 579 wc <<= 6; 580 wc |= c & 0x3F; 581 } 582 return wc; 583 584 invalid: 585 errorp("invalid multibyte sequence"); 586 return 0xFFFD; 587 } 588 589 static Rune 590 decode(int multi) 591 { 592 Rune r; 593 594 if (*input->p == '\\') { 595 r = escape(); 596 return r; 597 } 598 599 return multi ? utf8rune() : *input->p; 600 } 601 602 static int 603 character(void) 604 { 605 int i, multi = 0; 606 Rune r, d; 607 Type *tp = inttype; 608 Symbol *sym; 609 610 if (*input->p == 'L') { 611 multi = 1; 612 tp = wchartype; 613 input->p++; 614 } 615 616 d = 0; 617 input->p++; 618 for (i = 0; *input->p != '\''; i++) { 619 r = decode(multi); 620 if (r > getlimits(tp)->max.i) 621 warn("character too large for enclosing character literal type"); 622 d |= r; 623 input->p++; 624 } 625 input->p++; 626 627 if (i == 0) 628 errorp("empty character constant"); 629 if (i > 1) 630 warn("multi-character character constant"); 631 632 sym = newsym(NS_IDEN, NULL); 633 sym->u.i = d; 634 sym->type = tp; 635 yylval.sym = sym; 636 tok2str(); 637 return CONSTANT; 638 } 639 640 /* 641 * string() parses a constant string, and convert all the 642 * escape sequences into single characters. This behaviour 643 * is correct except when we parse a #define, where we want 644 * to preserve the literal content of the string. In that 645 * case cpp.c:/^define( sets the variable disescape to 646 * disable converting escape sequences into characters. 647 */ 648 static int 649 string(void) 650 { 651 char *bp = yytext; 652 int c, esc; 653 654 *bp++ = '"'; 655 esc = 0; 656 for (++input->p; ; ++input->p) { 657 c = *input->p; 658 659 if (c == '"' && !esc) 660 break; 661 662 if (c == '\0') { 663 errorp("missing terminating '\"' character"); 664 break; 665 } 666 667 esc = (c == '\\' && !esc && disescape); 668 669 if (c == '\\' && !esc) 670 c = escape(); 671 672 if (bp == &yytext[STRINGSIZ+1]) { 673 /* too long, ignore everything until next quote */ 674 for (++input->p; *input->p != '"'; ++input->p) { 675 if (*input->p == '\\') 676 ++input->p; 677 if (*input->p == '\0') 678 break; 679 } 680 --bp; 681 errorp("string too long"); 682 break; 683 } 684 *bp++ = c; 685 } 686 687 input->begin = ++input->p; 688 *bp = '\0'; 689 690 yylen = bp - yytext + 1; 691 yylval.sym = newstring(yytext+1, yylen-1); 692 *bp++ = '"'; 693 *bp = '\0'; 694 return STRING; 695 } 696 697 static int 698 iden(void) 699 { 700 Symbol *sym; 701 char *p, *begin; 702 703 if (input->p[0] == 'L' && input->p[1] == '\'') 704 return character(); 705 706 begin = input->p; 707 for (p = begin; isalnum(*p) || *p == '_'; ++p) 708 ; 709 input->p = p; 710 tok2str(); 711 if ((sym = lookup(NS_CPP, yytext, NOALLOC)) != NULL) { 712 if (expand(sym)) 713 return next(); 714 } 715 sym = lookup(namespace, yytext, ALLOC); 716 yylval.sym = sym; 717 if (sym->flags & SCONSTANT) 718 return CONSTANT; 719 if (sym->token != IDEN) 720 yylval.token = sym->u.token; 721 return sym->token; 722 } 723 724 static int 725 follow(int expect, int ifyes, int ifno) 726 { 727 if (*input->p++ == expect) 728 return ifyes; 729 --input->p; 730 return ifno; 731 } 732 733 static int 734 minus(void) 735 { 736 switch (*input->p++) { 737 case '-': 738 return DEC; 739 case '>': 740 return INDIR; 741 case '=': 742 return SUB_EQ; 743 default: 744 --input->p; 745 return '-'; 746 } 747 } 748 749 static int 750 plus(void) 751 { 752 switch (*input->p++) { 753 case '+': 754 return INC; 755 case '=': 756 return ADD_EQ; 757 default: 758 --input->p; 759 return '+'; 760 } 761 } 762 763 static int 764 relational(int op, int equal, int shift, int assig) 765 { 766 int c; 767 768 if ((c = *input->p++) == '=') 769 return equal; 770 if (c == op) 771 return follow('=', assig, shift); 772 --input->p; 773 return op; 774 } 775 776 static int 777 logic(int op, int equal, int logic) 778 { 779 int c; 780 781 if ((c = *input->p++) == '=') 782 return equal; 783 if (c == op) 784 return logic; 785 --input->p; 786 return op; 787 } 788 789 static int 790 dot(void) 791 { 792 int c; 793 794 if ((c = *input->p) != '.') 795 return '.'; 796 if ((c = *++input->p) != '.') 797 error("incorrect token '..'"); 798 ++input->p; 799 return ELLIPSIS; 800 } 801 802 static int 803 operator(void) 804 { 805 int t; 806 807 switch (t = *input->p++) { 808 case '<': 809 t = relational('<', LE, SHL, SHL_EQ); 810 break; 811 case '>': 812 t = relational('>', GE, SHR, SHR_EQ); 813 break; 814 case '&': 815 t = logic('&', AND_EQ, AND); 816 break; 817 case '|': 818 t = logic('|', OR_EQ, OR); 819 break; 820 case '=': 821 t = follow('=', EQ, '='); 822 break; 823 case '^': 824 t = follow('=', XOR_EQ, '^'); 825 break; 826 case '*': 827 t = follow('=', MUL_EQ, '*'); 828 break; 829 case '/': 830 t = follow('=', DIV_EQ, '/'); 831 break; 832 case '%': 833 t = follow('=', MOD_EQ, '%'); 834 break; 835 case '!': 836 t = follow('=', NE, '!'); 837 break; 838 case '#': 839 t = follow('#', CONCAT, STRINGIZE); 840 break; 841 case '-': 842 t = minus(); 843 break; 844 case '+': 845 t = plus(); 846 break; 847 case '.': 848 t = dot(); 849 break; 850 } 851 tok2str(); 852 return t; 853 } 854 855 /* TODO: Ensure that namespace is NS_IDEN after a recovery */ 856 857 /* 858 * skip all the spaces until the next token. When we are in 859 * CPPMODE \n is not considered a whitespace 860 */ 861 static int 862 skipspaces(void) 863 { 864 int c; 865 866 if (!input) 867 return EOF; 868 869 for (;;) { 870 switch (c = *input->p) { 871 case '\n': 872 if (lexmode == CPPMODE) 873 goto return_byte; 874 ++input->p; 875 case '\0': 876 if (!moreinput()) 877 return EOF; 878 break; 879 case ' ': 880 case '\t': 881 case '\v': 882 case '\r': 883 case '\f': 884 ++input->p; 885 break; 886 default: 887 goto return_byte; 888 } 889 } 890 891 return_byte: 892 input->begin = input->p; 893 return c; 894 } 895 896 int 897 next(void) 898 { 899 int c; 900 901 if ((c = skipspaces()) == EOF) 902 yytoken = EOFTOK; 903 else if (isalpha(c) || c == '_') 904 yytoken = iden(); 905 else if (isdigit(c)) 906 yytoken = number(); 907 else if (c == '"') 908 yytoken = string(); 909 else if (c == '\'') 910 yytoken = character(); 911 else 912 yytoken = operator(); 913 914 if (yytoken == EOFTOK) { 915 strcpy(yytext, "<EOF>"); 916 if (cppctx && !input) 917 errorp("#endif expected"); 918 } 919 920 DBG("TOKEN %s", yytext); 921 return yytoken; 922 } 923 924 void 925 expect(int tok) 926 { 927 if (yytoken != tok) { 928 if (isgraph(tok)) 929 errorp("expected '%c' before '%s'", tok, yytext); 930 else 931 errorp("unexpected '%s'", yytext); 932 } else { 933 next(); 934 } 935 } 936 937 int 938 ahead(void) 939 { 940 skipspaces(); 941 return *input->begin; 942 } 943 944 void 945 setsafe(int type) 946 { 947 safe = type; 948 } 949 950 void 951 discard(void) 952 { 953 extern jmp_buf recover; 954 int c; 955 956 for (c = yytoken; ; c = *input->p++) { 957 switch (safe) { 958 case END_COMP: 959 if (c == '}') 960 goto jump; 961 goto semicolon; 962 case END_COND: 963 if (c == ')') 964 goto jump; 965 break; 966 case END_LDECL: 967 if (c == ',') 968 goto jump; 969 case END_DECL: 970 semicolon: 971 if (c == ';') 972 goto jump; 973 break; 974 } 975 if ((c == '\0' || c == EOFTOK) && !moreinput()) 976 exit(EXIT_FAILURE); 977 } 978 jump: 979 input->begin = input->p; 980 yytoken = c; 981 yytext[0] = c; 982 yytext[1] = '\0'; 983 exit(EXIT_FAILURE); 984 985 /* 986 * FIXME: We don't have a proper recover mechanism at this moment 987 * and we don't set the recover point ever, so executing this 988 * longjmp will generate surely a segmentation fault, so it does 989 * not make sense to do it. We just exit until we can find time 990 * to solve this problem. 991 */ 992 longjmp(recover, 1); 993 }