lex.c (16737B)
1 #include <assert.h> 2 #include <ctype.h> 3 #include <errno.h> 4 #include <limits.h> 5 #include <setjmp.h> 6 #include <stdio.h> 7 #include <stdlib.h> 8 #include <string.h> 9 10 #include <scc/cstd.h> 11 #include <scc/scc.h> 12 #include "cc1.h" 13 14 int yytoken; 15 struct yystype yylval; 16 char yytext[STRINGSIZ+3]; 17 unsigned short yylen; 18 int lexmode = CCMODE; 19 unsigned lineno; 20 char filenam[FILENAME_MAX]; 21 22 int namespace = NS_IDEN; 23 static int safe; 24 Input *input; 25 26 void 27 setloc(char *fname, unsigned line) 28 { 29 size_t len; 30 31 if (fname) { 32 if ((len = strlen(fname)) >= FILENAME_MAX) 33 die("cc1: %s: file name too long", fname); 34 memmove(filenam, fname, len); 35 filenam[len] = '\0'; 36 37 /* 38 * There are cases where we want to call setloc() 39 * with the data in input, and then we have t be 40 * careful about freeing input->filenam 41 */ 42 if (fname != input->filenam) { 43 free(input->filenam); 44 input->filenam = xstrdup(fname); 45 } 46 } 47 48 lineno = input->lineno = line; 49 } 50 51 int 52 addinput(int type, void *arg, int fail) 53 { 54 FILE *fp; 55 char *extp, *fname, *buffer, *infile; 56 int infileln; 57 Macro *mp; 58 Symbol *sym; 59 Input *newip, *curip = input; 60 61 if (curip) 62 curip->lineno = lineno; 63 64 switch (type) { 65 case IMACRO: 66 fp = NULL; 67 mp = arg; 68 sym = mp->sym; 69 fname = mp->fname; 70 buffer = mp->buffer; 71 DBG("INPUT: expanding macro %s", sym->name); 72 break; 73 case IPARAM: 74 fp = NULL; 75 mp = NULL; 76 buffer = arg; 77 fname = filenam; 78 DBG("INPUT: macro parameter '%s'", buffer); 79 break; 80 case IFILE: 81 fname = arg; 82 mp = NULL; 83 buffer = NULL; 84 85 if ((fp = fopen(fname, "r")) == NULL) { 86 if (!fail) 87 return 0; 88 die("cc1: %s: %s", fname, strerror(errno)); 89 } 90 if (curip && onlyheader) { 91 infile = curip->filenam; 92 infileln = strlen(infile); 93 if (extp = strrchr(infile, '.')) 94 infileln -= strlen(extp); 95 printf("%.*s.o: %s %s\n", 96 infileln, infile, infile, fname); 97 } 98 lineno = 0; 99 DBG("INPUT: file input '%s'", fname); 100 break; 101 case ISTDIN: 102 fp = stdin; 103 mp = NULL; 104 fname = "<stdin>"; 105 buffer = NULL; 106 lineno = 0; 107 DBG("INPUT: file input 'stdin'"); 108 break; 109 default: 110 abort(); 111 } 112 113 if (!buffer) { 114 buffer = xmalloc(INPUTSIZ); 115 buffer[0] = '\0'; 116 } else { 117 buffer = xstrdup(buffer); 118 } 119 120 newip = xmalloc(sizeof(*newip)); 121 newip->next = curip; 122 newip->macro = mp; 123 newip->p = newip->begin = newip->line = buffer; 124 newip->filenam = NULL; 125 newip->lineno = 0; 126 newip->fp = fp; 127 newip->flags = type; 128 input = newip; 129 130 setloc(fname, lineno); 131 return 1; 132 } 133 134 void 135 delinput(void) 136 { 137 Input *ip = input; 138 139 switch (ip->flags & ITYPE) { 140 case IFILE: 141 DBG("INPUT: file finished '%s'", ip->filenam); 142 if (fclose(ip->fp)) 143 die("cc1: %s: %s", ip->filenam, strerror(errno)); 144 break; 145 case IMACRO: 146 DBG("INPUT: macro %s finished", ip->macro->sym->name); 147 delmacro(ip->macro); 148 break; 149 case IPARAM: 150 DBG("INPUT: macro param finished"); 151 break; 152 case ISTDIN: 153 DBG("INPUT: stdin finished"); 154 break; 155 default: 156 abort(); 157 } 158 159 input = ip->next; 160 free(ip->filenam); 161 free(ip->line); 162 free(ip); 163 if (input) 164 setloc(input->filenam, input->lineno); 165 } 166 167 static void 168 newline(void) 169 { 170 if (++lineno == 0) 171 die("cc1: %s: file too long", filenam); 172 } 173 174 /* 175 * Read the next character from the input file, counting number of lines 176 * and joining lines escaped with \ 177 */ 178 static int 179 readchar(void) 180 { 181 FILE *fp = input->fp; 182 int c; 183 184 repeat: 185 switch (c = getc(fp)) { 186 case '\\': 187 if ((c = getc(fp)) == '\n') { 188 newline(); 189 goto repeat; 190 } 191 ungetc(c, fp); 192 c = '\\'; 193 break; 194 case '\n': 195 newline(); 196 break; 197 case EOF: 198 break; 199 } 200 201 return c; 202 } 203 204 /* 205 * discard a C comment. This function is only called from readline 206 * because it is impossible to have a comment in a macro, because 207 * comments are always discarded before processing any cpp directive 208 */ 209 static void 210 comment(int type) 211 { 212 int c; 213 214 c = readchar(); 215 repeat: 216 for ( ; c != EOF && c != type; c = readchar()) 217 ; 218 219 if (c == EOF) { 220 errorp("unterminated comment"); 221 return; 222 } 223 224 if (type == '*' && (c = readchar()) != '/') 225 goto repeat; 226 } 227 228 /* 229 * readline is used to read a full logic line from a file. 230 * It discards comments and check that the line fits in 231 * the input buffer 232 */ 233 static int 234 readline(void) 235 { 236 char *bp, *lim; 237 int c, peekc = 0, delim = 0; 238 239 if (feof(input->fp)) { 240 input->flags |= IEOF; 241 *input->p = '\0'; 242 return 0; 243 } 244 245 *input->line = '\0'; 246 lim = &input->line[INPUTSIZ-1]; 247 for (bp = input->line; bp < lim-1; *bp++ = c) { 248 c = (peekc) ? peekc : readchar(); 249 peekc = 0; 250 if (c == '\n' || c == EOF) 251 break; 252 if (c == '\\') { 253 peekc = readchar(); 254 if (peekc == '\n' || peekc == EOF) 255 continue; 256 if (bp == lim-2) 257 break; 258 *bp++ = c; 259 c = peekc; 260 peekc = 0; 261 continue; 262 } 263 264 if (delim && c == delim) 265 delim = 0; 266 else if (!delim && (c == '"' || c == '\'')) 267 delim = c; 268 if (c != '/' || delim) 269 continue; 270 271 /* check for /* or // */ 272 peekc = readchar(); 273 if (peekc != '*' && peekc != '/') 274 continue; 275 276 if (peekc == '/') { 277 comment('\n'); 278 break; 279 } else { 280 comment('*'); 281 c = ' '; 282 } 283 peekc = 0; 284 } 285 286 input->begin = input->p = input->line; 287 if (bp == lim-1) { 288 errorp("line too long"); 289 --bp; 290 } 291 *bp++ = '\n'; 292 *bp = '\0'; 293 294 return 1; 295 } 296 297 /* 298 * moreinput gets more bytes to be passed to the lexer. 299 * It can take more bytes from macro expansions or 300 * directly reading from files. When a cpp directive 301 * is processed the line is discarded because it must not 302 * be passed to the lexer 303 */ 304 static int 305 moreinput(void) 306 { 307 int wasexpand = 0; 308 309 repeat: 310 if (!input) 311 return 0; 312 313 if (*input->p == '\0') { 314 int t = input->flags & ITYPE; 315 if (t == IPARAM) { 316 input->flags |= IEOF; 317 return 0; 318 } 319 if (t == IMACRO) { 320 wasexpand = 1; 321 input->flags |= IEOF; 322 } 323 if (input->flags & IEOF) { 324 delinput(); 325 goto repeat; 326 } 327 if (!readline()) { 328 *input->p = '\0'; 329 goto repeat; 330 } 331 if (cpp()) 332 goto repeat; 333 } 334 335 if (onlycpp && !wasexpand) 336 ppragmaln(); 337 return 1; 338 } 339 340 static void 341 tok2str(void) 342 { 343 if ((yylen = input->p - input->begin) > INTIDENTSIZ) 344 error("token too big"); 345 memcpy(yytext, input->begin, yylen); 346 yytext[yylen] = '\0'; 347 input->begin = input->p; 348 } 349 350 static Symbol * 351 readint(int base, int sign, Symbol *sym) 352 { 353 char *s = yytext; 354 Type *tp = sym->type; 355 struct limits *lim; 356 unsigned long long u, val, max; 357 int c; 358 359 lim = getlimits(tp); 360 max = lim->max.i; 361 if (*s == '0') 362 ++s; 363 if (toupper(*s) == 'X') 364 ++s; 365 366 for (u = 0; isxdigit(c = *s++); u = u*base + val) { 367 static char letters[] = "0123456789ABCDEF"; 368 val = strchr(letters, toupper(c)) - letters; 369 repeat: 370 if (u <= max/base && u*base <= max - val) 371 continue; 372 if (tp->prop & TSIGNED) { 373 if (tp == inttype) 374 tp = (base==10) ? longtype : uinttype; 375 else if (tp == longtype) 376 tp = (base==10) ? llongtype : ulongtype; 377 else 378 goto overflow; 379 } else { 380 if (tp == uinttype) 381 tp = (sign==UNSIGNED) ? ulongtype : longtype; 382 else if (tp == ulongtype) 383 tp = (sign==UNSIGNED) ? ullongtype : llongtype; 384 else 385 goto overflow; 386 } 387 sym->type = tp; 388 lim = getlimits(tp); 389 max = lim->max.i; 390 goto repeat; 391 } 392 393 if (tp->prop & TSIGNED) 394 sym->u.i = u; 395 else 396 sym->u.u = u; 397 398 return sym; 399 400 overflow: 401 errorp("overflow in integer constant"); 402 return sym; 403 } 404 405 static int 406 integer(int base) 407 { 408 Type *tp; 409 Symbol *sym; 410 unsigned size, sign; 411 412 for (size = sign = 0; ; ++input->p) { 413 switch (toupper(*input->p)) { 414 case 'L': 415 if (size == LLONG) 416 goto wrong_type; 417 size = (size == LONG) ? LLONG : LONG; 418 continue; 419 case 'U': 420 if (sign == UNSIGNED) 421 goto wrong_type; 422 sign = UNSIGNED; 423 continue; 424 default: 425 goto convert; 426 wrong_type: 427 error("invalid suffix in integer constant"); 428 } 429 } 430 431 convert: 432 tok2str(); 433 tp = ctype(INT, sign, size); 434 sym = newsym(NS_IDEN, NULL); 435 sym->type = tp; 436 sym->flags |= SCONSTANT; 437 yylval.sym = readint(base, sign, sym); 438 return CONSTANT; 439 } 440 441 static void 442 digits(int base) 443 { 444 char *p; 445 int c; 446 447 for (p = input->p; c = *p; ++p) { 448 switch (base) { 449 case 8: 450 if (!strchr("01234567", c)) 451 goto end; 452 break; 453 case 10: 454 if (!isdigit(c)) 455 goto end; 456 break; 457 case 16: 458 if (!isxdigit(c)) 459 goto end; 460 break; 461 } 462 } 463 end: 464 input->p = p; 465 } 466 467 static int 468 number(void) 469 { 470 Type *tp; 471 Symbol *sym; 472 int ch, base; 473 long double ld; 474 475 if (*input->p != '0') { 476 base = 10; 477 } else { 478 if (toupper(*++input->p) == 'X') { 479 ++input->p; 480 base = 16; 481 } else { 482 base = 8; 483 } 484 } 485 digits(base); 486 487 if (*input->p != '.') 488 return integer(base); 489 490 sym = newsym(NS_IDEN, NULL); 491 492 ld = strtold(input->begin, &input->p); 493 switch (toupper(*input->p)) { 494 case 'F': 495 ++input->p; 496 tp = floattype; 497 sym->u.f = ld; 498 break; 499 case 'L': 500 ++input->p; 501 tp = ldoubletype; 502 sym->u.ld = ld; 503 break; 504 default: 505 tp = doubletype; 506 sym->u.d = ld; 507 break; 508 } 509 510 tok2str(); 511 sym->type = tp; 512 sym->flags |= SCONSTANT; 513 yylval.sym = sym; 514 return CONSTANT; 515 } 516 517 static int 518 escape(void) 519 { 520 int c, d, i, cnt, base; 521 522 switch (*++input->p) { 523 case 'a': 524 return '\a'; 525 case 'b': 526 return '\b'; 527 case 'f': 528 return '\f'; 529 case 'n': 530 return '\n'; 531 case 'r': 532 return '\r'; 533 case 't': 534 return '\t'; 535 case 'v': 536 return '\v'; 537 case '"': 538 return '"'; 539 case '\'': 540 return '\''; 541 case '\\': 542 return '\\'; 543 case '\?': 544 return '\?'; 545 case 'u': 546 /* 547 * FIXME: universal constants are not correctly handled 548 */ 549 if (!isdigit(*++input->p)) 550 warn("incorrect digit for numerical character constant"); 551 base = 10; 552 break; 553 case 'x': 554 if (!isxdigit(*++input->p)) 555 warn("\\x used with no following hex digits"); 556 cnt = 2; 557 base = 16; 558 break; 559 case '0': 560 case '1': 561 case '2': 562 case '3': 563 case '4': 564 case '5': 565 case '6': 566 case '7': 567 cnt = 3; 568 base = 8; 569 break; 570 default: 571 warn("unknown escape sequence"); 572 return ' '; 573 } 574 575 for (c = i = 0; i < cnt; ++i) { 576 static char digits[] = "0123456789ABCDEF"; 577 char *p = strchr(digits, toupper(*input->p)); 578 579 if (!p || (d = p - digits) > base) 580 break; 581 c *= base; 582 c += d; 583 ++input->p; 584 } 585 --input->p; 586 587 return c; 588 } 589 590 static Rune 591 utf8rune(void) 592 { 593 Rune wc; 594 unsigned c; 595 size_t i, len; 596 597 c = *input->p; 598 for (len = 0; c & 0x80; len++) 599 c <<= 1; 600 if (len == 0) 601 return c; 602 if (len == 1 || len == 8) 603 goto invalid; 604 605 wc = (c & 0xFF) >> len; 606 for (i = 0; i < len-1; i++) { 607 c = input->p[1]; 608 if ((c & 0xC0) != 0x80) 609 goto invalid; 610 input->p++; 611 wc <<= 6; 612 wc |= c & 0x3F; 613 } 614 return wc; 615 616 invalid: 617 errorp("invalid multibyte sequence"); 618 return 0xFFFD; 619 } 620 621 static Rune 622 decode(int multi) 623 { 624 Rune r; 625 626 if (*input->p == '\\') { 627 r = escape(); 628 return r; 629 } 630 631 return multi ? utf8rune() : *input->p; 632 } 633 634 static int 635 character(void) 636 { 637 int i, multi = 0; 638 Rune r, d; 639 Type *tp = inttype; 640 Symbol *sym; 641 642 if (*input->p == 'L') { 643 multi = 1; 644 tp = wchartype; 645 input->p++; 646 } 647 648 d = 0; 649 input->p++; 650 for (i = 0; *input->p != '\''; i++) { 651 r = decode(multi); 652 if (r > getlimits(tp)->max.i) 653 warn("character too large for enclosing character literal type"); 654 d |= r; 655 input->p++; 656 } 657 input->p++; 658 659 if (i == 0) 660 errorp("empty character constant"); 661 if (i > 1) 662 warn("multi-character character constant"); 663 664 sym = newsym(NS_IDEN, NULL); 665 sym->u.i = d; 666 sym->type = tp; 667 yylval.sym = sym; 668 tok2str(); 669 return CONSTANT; 670 } 671 672 /* 673 * string() parses a constant string, and convert all the 674 * escape sequences into single characters. This behaviour 675 * is correct except when we parse a #define, where we want 676 * to preserve the literal content of the string. In that 677 * case cpp.c:/^define( sets the variable disescape to 678 * disable converting escape sequences into characters. 679 */ 680 static int 681 string(void) 682 { 683 char *bp = yytext; 684 int c, esc; 685 686 *bp++ = '"'; 687 esc = 0; 688 for (++input->p; ; ++input->p) { 689 c = *input->p; 690 691 if (c == '"' && !esc) 692 break; 693 694 if (c == '\0') { 695 errorp("missing terminating '\"' character"); 696 break; 697 } 698 699 esc = (c == '\\' && !esc && disescape); 700 701 if (c == '\\' && !esc) 702 c = escape(); 703 704 if (bp == &yytext[STRINGSIZ+1]) { 705 /* too long, ignore everything until next quote */ 706 for (++input->p; *input->p != '"'; ++input->p) { 707 if (*input->p == '\\') 708 ++input->p; 709 if (*input->p == '\0') 710 break; 711 } 712 --bp; 713 errorp("string too long"); 714 break; 715 } 716 *bp++ = c; 717 } 718 719 input->begin = ++input->p; 720 *bp = '\0'; 721 722 yylen = bp - yytext + 1; 723 yylval.sym = newstring(yytext+1, yylen-1); 724 *bp++ = '"'; 725 *bp = '\0'; 726 return STRING; 727 } 728 729 static int 730 iden(void) 731 { 732 Symbol *sym; 733 char *p, *begin; 734 735 if (input->p[0] == 'L' && input->p[1] == '\'') 736 return character(); 737 738 begin = input->p; 739 for (p = begin; isalnum(*p) || *p == '_'; ++p) 740 ; 741 input->p = p; 742 tok2str(); 743 if ((sym = lookup(NS_CPP, yytext, NOALLOC)) != NULL) { 744 if (expand(sym)) 745 return next(); 746 } 747 sym = lookup(namespace, yytext, ALLOC); 748 yylval.sym = sym; 749 if (sym->flags & SCONSTANT) 750 return CONSTANT; 751 if (sym->token != IDEN) 752 yylval.token = sym->u.token; 753 return sym->token; 754 } 755 756 static int 757 follow(int expect, int ifyes, int ifno) 758 { 759 if (*input->p++ == expect) 760 return ifyes; 761 --input->p; 762 return ifno; 763 } 764 765 static int 766 minus(void) 767 { 768 switch (*input->p++) { 769 case '-': 770 return DEC; 771 case '>': 772 return INDIR; 773 case '=': 774 return SUB_EQ; 775 default: 776 --input->p; 777 return '-'; 778 } 779 } 780 781 static int 782 plus(void) 783 { 784 switch (*input->p++) { 785 case '+': 786 return INC; 787 case '=': 788 return ADD_EQ; 789 default: 790 --input->p; 791 return '+'; 792 } 793 } 794 795 static int 796 relational(int op, int equal, int shift, int assig) 797 { 798 int c; 799 800 if ((c = *input->p++) == '=') 801 return equal; 802 if (c == op) 803 return follow('=', assig, shift); 804 --input->p; 805 return op; 806 } 807 808 static int 809 logic(int op, int equal, int logic) 810 { 811 int c; 812 813 if ((c = *input->p++) == '=') 814 return equal; 815 if (c == op) 816 return logic; 817 --input->p; 818 return op; 819 } 820 821 static int 822 dot(void) 823 { 824 int c; 825 826 if (isdigit(*input->p)) 827 return number(); 828 if ((c = *input->p) != '.') 829 return '.'; 830 if ((c = *++input->p) != '.') 831 error("incorrect token '..'"); 832 ++input->p; 833 return ELLIPSIS; 834 } 835 836 static int 837 operator(void) 838 { 839 int t; 840 841 switch (t = *input->p++) { 842 case '<': 843 t = relational('<', LE, SHL, SHL_EQ); 844 break; 845 case '>': 846 t = relational('>', GE, SHR, SHR_EQ); 847 break; 848 case '&': 849 t = logic('&', AND_EQ, AND); 850 break; 851 case '|': 852 t = logic('|', OR_EQ, OR); 853 break; 854 case '=': 855 t = follow('=', EQ, '='); 856 break; 857 case '^': 858 t = follow('=', XOR_EQ, '^'); 859 break; 860 case '*': 861 t = follow('=', MUL_EQ, '*'); 862 break; 863 case '/': 864 t = follow('=', DIV_EQ, '/'); 865 break; 866 case '%': 867 t = follow('=', MOD_EQ, '%'); 868 break; 869 case '!': 870 t = follow('=', NE, '!'); 871 break; 872 case '-': 873 t = minus(); 874 break; 875 case '+': 876 t = plus(); 877 break; 878 case '.': 879 t = dot(); 880 break; 881 } 882 tok2str(); 883 return t; 884 } 885 886 /* TODO: Ensure that namespace is NS_IDEN after a recovery */ 887 888 /* 889 * skip all the spaces until the next token. When we are in 890 * CPPMODE \n is not considered a whitespace 891 */ 892 static int 893 skipspaces(void) 894 { 895 int c; 896 897 if (!input) 898 return EOF; 899 900 for (;;) { 901 switch (c = *input->p) { 902 case '\n': 903 if (lexmode == CPPMODE) 904 goto return_byte; 905 ++input->p; 906 case '\0': 907 if (!moreinput()) 908 return EOF; 909 break; 910 case ' ': 911 case '\t': 912 case '\v': 913 case '\r': 914 case '\f': 915 ++input->p; 916 break; 917 default: 918 goto return_byte; 919 } 920 } 921 922 return_byte: 923 input->begin = input->p; 924 return c; 925 } 926 927 int 928 next(void) 929 { 930 int c; 931 932 if ((c = skipspaces()) == EOF) 933 yytoken = EOFTOK; 934 else if (isalpha(c) || c == '_') 935 yytoken = iden(); 936 else if (isdigit(c)) 937 yytoken = number(); 938 else if (c == '"') 939 yytoken = string(); 940 else if (c == '\'') 941 yytoken = character(); 942 else 943 yytoken = operator(); 944 945 if (yytoken == EOFTOK) { 946 strcpy(yytext, "<EOF>"); 947 if (cppctx && !input) 948 errorp("#endif expected"); 949 } 950 951 DBG("TOKEN %s", yytext); 952 return yytoken; 953 } 954 955 void 956 expect(int tok) 957 { 958 if (yytoken != tok) { 959 if (isgraph(tok)) 960 errorp("expected '%c' before '%s'", tok, yytext); 961 else 962 errorp("unexpected '%s'", yytext); 963 } else { 964 next(); 965 } 966 } 967 968 int 969 ahead(void) 970 { 971 skipspaces(); 972 return *input->begin; 973 } 974 975 void 976 setsafe(int type) 977 { 978 safe = type; 979 } 980 981 void 982 discard(void) 983 { 984 extern jmp_buf recover; 985 int c; 986 987 for (c = yytoken; ; c = *input->p++) { 988 switch (safe) { 989 case END_COMP: 990 if (c == '}') 991 goto jump; 992 goto semicolon; 993 case END_COND: 994 if (c == ')') 995 goto jump; 996 break; 997 case END_LDECL: 998 if (c == ',') 999 goto jump; 1000 case END_DECL: 1001 semicolon: 1002 if (c == ';') 1003 goto jump; 1004 break; 1005 } 1006 if ((c == '\0' || c == EOFTOK) && !moreinput()) 1007 exit(EXIT_FAILURE); 1008 } 1009 jump: 1010 input->begin = input->p; 1011 yytoken = c; 1012 yytext[0] = c; 1013 yytext[1] = '\0'; 1014 exit(EXIT_FAILURE); 1015 1016 /* 1017 * FIXME: We don't have a proper recover mechanism at this moment 1018 * and we don't set the recover point ever, so executing this 1019 * longjmp will generate surely a segmentation fault, so it does 1020 * not make sense to do it. We just exit until we can find time 1021 * to solve this problem. 1022 */ 1023 longjmp(recover, 1); 1024 }