lex.c (16245B)
1 #include <assert.h> 2 #include <ctype.h> 3 #include <errno.h> 4 #include <limits.h> 5 #include <setjmp.h> 6 #include <stdio.h> 7 #include <stdlib.h> 8 #include <string.h> 9 10 #include <scc/cstd.h> 11 #include <scc/scc.h> 12 #include "cc1.h" 13 14 int yytoken; 15 struct yystype yylval; 16 char yytext[STRINGSIZ+3]; 17 unsigned short yylen; 18 int lexmode = CCMODE; 19 unsigned lineno; 20 char filenam[FILENAME_MAX]; 21 22 int namespace = NS_IDEN; 23 static int safe; 24 Input *input; 25 26 void 27 setloc(char *fname, unsigned line) 28 { 29 size_t len; 30 31 if (fname) { 32 if ((len = strlen(fname)) >= FILENAME_MAX) 33 die("cc1: %s: file name too long", fname); 34 memmove(filenam, fname, len); 35 filenam[len] = '\0'; 36 37 /* 38 * There are cases where we want to call setloc() 39 * with the data in input, and then we have t be 40 * careful about freeing input->filenam 41 */ 42 if (fname != input->filenam) { 43 free(input->filenam); 44 input->filenam = xstrdup(fname); 45 } 46 } 47 48 lineno = input->lineno = line; 49 } 50 51 int 52 addinput(int type, void *arg, int fail) 53 { 54 FILE *fp; 55 char *extp, *fname, *buffer, *infile; 56 int infileln; 57 Macro *mp; 58 Symbol *sym; 59 Input *newip, *curip = input; 60 61 if (curip) 62 curip->lineno = lineno; 63 64 switch (type) { 65 case IMACRO: 66 fp = NULL; 67 mp = arg; 68 sym = mp->sym; 69 fname = mp->fname; 70 buffer = mp->buffer; 71 DBG("INPUT: expanding macro %s", sym->name); 72 break; 73 case IPARAM: 74 fp = NULL; 75 mp = NULL; 76 buffer = arg; 77 fname = filenam; 78 DBG("INPUT: macro parameter '%s'", buffer); 79 break; 80 case IFILE: 81 fname = arg; 82 mp = NULL; 83 buffer = NULL; 84 85 if ((fp = fopen(fname, "r")) == NULL) { 86 if (!fail) 87 return 0; 88 die("cc1: %s: %s", fname, strerror(errno)); 89 } 90 if (curip && onlyheader) { 91 infile = curip->filenam; 92 infileln = strlen(infile); 93 if (extp = strrchr(infile, '.')) 94 infileln -= strlen(extp); 95 printf("%.*s.o: %s %s\n", 96 infileln, infile, infile, fname); 97 } 98 lineno = 0; 99 DBG("INPUT: file input '%s'", fname); 100 break; 101 case ISTDIN: 102 fp = stdin; 103 mp = NULL; 104 fname = "<stdin>"; 105 buffer = NULL; 106 lineno = 0; 107 DBG("INPUT: file input 'stdin'"); 108 break; 109 default: 110 abort(); 111 } 112 113 if (!buffer) { 114 buffer = xmalloc(INPUTSIZ); 115 buffer[0] = '\0'; 116 } else { 117 buffer = xstrdup(buffer); 118 } 119 120 newip = xmalloc(sizeof(*newip)); 121 newip->next = curip; 122 newip->macro = mp; 123 newip->p = newip->begin = newip->line = buffer; 124 newip->filenam = NULL; 125 newip->lineno = 0; 126 newip->fp = fp; 127 newip->flags = type; 128 input = newip; 129 130 setloc(fname, lineno); 131 return 1; 132 } 133 134 void 135 delinput(void) 136 { 137 Input *ip = input; 138 139 switch (ip->flags & ITYPE) { 140 case IFILE: 141 DBG("INPUT: file finished '%s'", ip->filenam); 142 if (fclose(ip->fp)) 143 die("cc1: %s: %s", ip->filenam, strerror(errno)); 144 break; 145 case IMACRO: 146 DBG("INPUT: macro %s finished", ip->macro->sym->name); 147 delmacro(ip->macro); 148 break; 149 case IPARAM: 150 DBG("INPUT: macro param finished"); 151 break; 152 case ISTDIN: 153 DBG("INPUT: stdin finished"); 154 break; 155 default: 156 abort(); 157 } 158 159 input = ip->next; 160 free(ip->filenam); 161 free(ip->line); 162 free(ip); 163 if (input) 164 setloc(input->filenam, input->lineno); 165 } 166 167 static void 168 newline(void) 169 { 170 if (++lineno == 0) 171 die("cc1: %s: file too long", filenam); 172 } 173 174 /* 175 * Read the next character from the input file, counting number of lines 176 * and joining lines escaped with \ 177 */ 178 static int 179 readchar(void) 180 { 181 FILE *fp = input->fp; 182 int c; 183 184 repeat: 185 switch (c = getc(fp)) { 186 case '\\': 187 if ((c = getc(fp)) == '\n') { 188 newline(); 189 goto repeat; 190 } 191 ungetc(c, fp); 192 c = '\\'; 193 break; 194 case '\n': 195 newline(); 196 break; 197 case EOF: 198 break; 199 } 200 201 return c; 202 } 203 204 /* 205 * discard a C comment. This function is only called from readline 206 * because it is impossible to have a comment in a macro, because 207 * comments are always discarded before processing any cpp directive 208 */ 209 static void 210 comment(int type) 211 { 212 int c; 213 214 repeat: 215 while ((c = readchar()) != EOF && c != type) 216 ; 217 218 if (c == EOF) { 219 errorp("unterminated comment"); 220 return; 221 } 222 223 if (type == '*' && (c = readchar()) != '/') 224 goto repeat; 225 } 226 227 /* 228 * readline is used to read a full logic line from a file. 229 * It discards comments and check that the line fits in 230 * the input buffer 231 */ 232 static int 233 readline(void) 234 { 235 char *bp, *lim; 236 int c, peekc = 0, delim = 0; 237 238 if (feof(input->fp)) { 239 input->flags |= IEOF; 240 *input->p = '\0'; 241 return 0; 242 } 243 244 *input->line = '\0'; 245 lim = &input->line[INPUTSIZ-1]; 246 for (bp = input->line; bp < lim-1; *bp++ = c) { 247 c = (peekc) ? peekc : readchar(); 248 peekc = 0; 249 if (c == '\n' || c == EOF) 250 break; 251 if (c == '\\') { 252 peekc = readchar(); 253 if (peekc == '\n' || peekc == EOF) 254 continue; 255 if (bp == lim-2) 256 break; 257 *bp++ = c; 258 c = peekc; 259 peekc = 0; 260 continue; 261 } 262 263 if (delim && c == delim) 264 delim = 0; 265 else if (!delim && (c == '"' || c == '\'')) 266 delim = c; 267 if (c != '/' || delim) 268 continue; 269 270 /* check for /* or // */ 271 peekc = readchar(); 272 if (peekc != '*' && peekc != '/') 273 continue; 274 275 if (peekc == '/') { 276 comment('\n'); 277 break; 278 } else { 279 comment('*'); 280 c = ' '; 281 } 282 peekc = 0; 283 } 284 285 input->begin = input->p = input->line; 286 if (bp == lim-1) { 287 errorp("line too long"); 288 --bp; 289 } 290 *bp++ = '\n'; 291 *bp = '\0'; 292 293 return 1; 294 } 295 296 /* 297 * moreinput gets more bytes to be passed to the lexer. 298 * It can take more bytes from macro expansions or 299 * directly reading from files. When a cpp directive 300 * is processed the line is discarded because it must not 301 * be passed to the lexer 302 */ 303 static int 304 moreinput(void) 305 { 306 int wasexpand = 0; 307 308 repeat: 309 if (!input) 310 return 0; 311 312 if (*input->p == '\0') { 313 int t = input->flags & ITYPE; 314 if (t == IPARAM) { 315 input->flags |= IEOF; 316 return 0; 317 } 318 if (t == IMACRO) { 319 wasexpand = 1; 320 input->flags |= IEOF; 321 } 322 if (input->flags & IEOF) { 323 delinput(); 324 goto repeat; 325 } 326 if (!readline()) { 327 *input->p = '\0'; 328 goto repeat; 329 } 330 if (cpp()) 331 goto repeat; 332 } 333 334 if (onlycpp && !wasexpand) 335 ppragmaln(); 336 return 1; 337 } 338 339 static void 340 tok2str(void) 341 { 342 if ((yylen = input->p - input->begin) > INTIDENTSIZ) 343 error("token too big"); 344 memcpy(yytext, input->begin, yylen); 345 yytext[yylen] = '\0'; 346 input->begin = input->p; 347 } 348 349 static Symbol * 350 readint(char *s, int base, int sign, Symbol *sym) 351 { 352 Type *tp = sym->type; 353 struct limits *lim; 354 TUINT u, val, max; 355 int c; 356 357 lim = getlimits(tp); 358 max = lim->max.i; 359 if (*s == '0') 360 ++s; 361 if (toupper(*s) == 'X') 362 ++s; 363 364 for (u = 0; isxdigit(c = *s++); u = u*base + val) { 365 static char letters[] = "0123456789ABCDEF"; 366 val = strchr(letters, toupper(c)) - letters; 367 repeat: 368 if (u <= max/base && u*base <= max - val) 369 continue; 370 if (tp->prop & TSIGNED) { 371 if (tp == inttype) 372 tp = (base==10) ? longtype : uinttype; 373 else if (tp == longtype) 374 tp = (base==10) ? llongtype : ulongtype; 375 else 376 goto overflow; 377 } else { 378 if (tp == uinttype) 379 tp = (sign==UNSIGNED) ? ulongtype : longtype; 380 else if (tp == ulongtype) 381 tp = (sign==UNSIGNED) ? ullongtype : llongtype; 382 else 383 goto overflow; 384 } 385 sym->type = tp; 386 lim = getlimits(tp); 387 max = lim->max.i; 388 goto repeat; 389 } 390 391 if (tp->prop & TSIGNED) 392 sym->u.i = u; 393 else 394 sym->u.u = u; 395 396 return sym; 397 398 overflow: 399 errorp("overflow in integer constant"); 400 return sym; 401 } 402 403 static int 404 integer(char *s, int base) 405 { 406 Type *tp; 407 Symbol *sym; 408 unsigned size, sign; 409 410 for (size = sign = 0; ; ++input->p) { 411 switch (toupper(*input->p)) { 412 case 'L': 413 if (size == LLONG) 414 goto wrong_type; 415 size = (size == LONG) ? LLONG : LONG; 416 continue; 417 case 'U': 418 if (sign == UNSIGNED) 419 goto wrong_type; 420 sign = UNSIGNED; 421 continue; 422 default: 423 goto convert; 424 wrong_type: 425 error("invalid suffix in integer constant"); 426 } 427 } 428 429 convert: 430 tok2str(); 431 tp = ctype(INT, sign, size); 432 sym = newsym(NS_IDEN, NULL); 433 sym->type = tp; 434 sym->flags |= SCONSTANT; 435 yylval.sym = readint(s, base, sign, sym); 436 return CONSTANT; 437 } 438 439 static char * 440 digits(int base) 441 { 442 char *p; 443 int c; 444 445 for (p = input->p; c = *p; ++p) { 446 switch (base) { 447 case 8: 448 if (!strchr("01234567", c)) 449 goto end; 450 break; 451 case 10: 452 if (!isdigit(c)) 453 goto end; 454 break; 455 case 16: 456 if (!isxdigit(c)) 457 goto end; 458 break; 459 } 460 } 461 end: 462 input->p = p; 463 return yytext; 464 } 465 466 static int 467 number(void) 468 { 469 int base; 470 471 if (*input->p != '0') { 472 base = 10; 473 } else { 474 if (toupper(*++input->p) == 'X') { 475 ++input->p; 476 base = 16; 477 } else { 478 base = 8; 479 } 480 } 481 482 return integer(digits(base), base); 483 } 484 485 static int 486 escape(void) 487 { 488 int c, base; 489 490 switch (*++input->p) { 491 case 'a': 492 return '\a'; 493 case 'b': 494 return '\b'; 495 case 'f': 496 return '\f'; 497 case 'n': 498 return '\n'; 499 case 'r': 500 return '\r'; 501 case 't': 502 return '\t'; 503 case 'v': 504 return '\v'; 505 case '"': 506 return '"'; 507 case '\'': 508 return '\''; 509 case '\\': 510 return '\\'; 511 case '\?': 512 return '\?'; 513 case 'u': 514 /* 515 * FIXME: universal constants are not correctly handled 516 */ 517 if (!isdigit(*++input->p)) 518 warn("incorrect digit for numerical character constant"); 519 base = 10; 520 break; 521 case 'x': 522 if (!isxdigit(*++input->p)) 523 warn("\\x used with no following hex digits"); 524 base = 16; 525 break; 526 case '0': 527 if (!strchr("01234567", *++input->p)) 528 warn("\\0 used with no following octal digits"); 529 base = 8; 530 break; 531 default: 532 warn("unknown escape sequence"); 533 return ' '; 534 } 535 errno = 0; 536 /* FIXME: We don't check that there is an actual number */ 537 c = strtoul(input->p, &input->p, base); 538 if (errno || c > 255) 539 warn("character constant out of range"); 540 --input->p; 541 return c; 542 } 543 544 static Rune 545 utf8rune(void) 546 { 547 Rune wc; 548 unsigned c; 549 size_t i, len; 550 551 c = *input->p; 552 for (len = 0; c & 0x80; len++) 553 c <<= 1; 554 if (len == 0) 555 return c; 556 if (len == 1 || len == 8) 557 goto invalid; 558 559 wc = (c & 0xFF) >> len; 560 for (i = 0; i < len-1; i++) { 561 c = input->p[1]; 562 if ((c & 0xC0) != 0x80) 563 goto invalid; 564 input->p++; 565 wc <<= 6; 566 wc |= c & 0x3F; 567 } 568 return wc; 569 570 invalid: 571 errorp("invalid multibyte sequence"); 572 return 0xFFFD; 573 } 574 575 static Rune 576 decode(int multi) 577 { 578 Rune r; 579 580 if (*input->p == '\\') { 581 r = escape(); 582 return r; 583 } 584 585 return multi ? utf8rune() : *input->p; 586 } 587 588 static int 589 character(void) 590 { 591 int i, multi = 0; 592 Rune r, d; 593 Type *tp = inttype; 594 Symbol *sym; 595 596 if (*input->p == 'L') { 597 multi = 1; 598 tp = wchartype; 599 input->p++; 600 } 601 602 d = 0; 603 input->p++; 604 for (i = 0; *input->p != '\''; i++) { 605 r = decode(multi); 606 if (r > getlimits(tp)->max.i) 607 warn("character too large for enclosing character literal type"); 608 d |= r; 609 input->p++; 610 } 611 input->p++; 612 613 if (i == 0) 614 errorp("empty character constant"); 615 if (i > 1) 616 warn("multi-character character constant"); 617 618 sym = newsym(NS_IDEN, NULL); 619 sym->u.i = d; 620 sym->type = tp; 621 yylval.sym = sym; 622 tok2str(); 623 return CONSTANT; 624 } 625 626 /* 627 * string() parses a constant string, and convert all the 628 * escape sequences into single characters. This behaviour 629 * is correct except when we parse a #define, where we want 630 * to preserve the literal content of the string. In that 631 * case cpp.c:/^define( sets the variable disescape to 632 * disable converting escape sequences into characters. 633 */ 634 static int 635 string(void) 636 { 637 char *bp = yytext; 638 int c, esc; 639 640 *bp++ = '"'; 641 esc = 0; 642 for (++input->p; ; ++input->p) { 643 c = *input->p; 644 645 if (c == '"' && !esc) 646 break; 647 648 if (c == '\0') { 649 errorp("missing terminating '\"' character"); 650 break; 651 } 652 653 if (c == '\\' && !esc && disescape) 654 esc = 1; 655 else 656 esc = 0; 657 658 if (c == '\\' && !esc) 659 c = escape(); 660 661 if (bp == &yytext[STRINGSIZ+1]) { 662 /* too long, ignore everything until next quote */ 663 for (++input->p; *input->p != '"'; ++input->p) { 664 if (*input->p == '\\') 665 ++input->p; 666 if (*input->p == '\0') 667 break; 668 } 669 --bp; 670 errorp("string too long"); 671 break; 672 } 673 *bp++ = c; 674 } 675 676 input->begin = ++input->p; 677 *bp = '\0'; 678 679 yylen = bp - yytext + 1; 680 yylval.sym = newstring(yytext+1, yylen-1); 681 *bp++ = '"'; 682 *bp = '\0'; 683 return STRING; 684 } 685 686 static int 687 iden(void) 688 { 689 Symbol *sym; 690 char *p, *begin; 691 692 if (input->p[0] == 'L' && input->p[1] == '\'') 693 return character(); 694 695 begin = input->p; 696 for (p = begin; isalnum(*p) || *p == '_'; ++p) 697 ; 698 input->p = p; 699 tok2str(); 700 if ((sym = lookup(NS_CPP, yytext, NOALLOC)) != NULL) { 701 if (expand(sym)) 702 return next(); 703 } 704 sym = lookup(namespace, yytext, ALLOC); 705 yylval.sym = sym; 706 if (sym->flags & SCONSTANT) 707 return CONSTANT; 708 if (sym->token != IDEN) 709 yylval.token = sym->u.token; 710 return sym->token; 711 } 712 713 static int 714 follow(int expect, int ifyes, int ifno) 715 { 716 if (*input->p++ == expect) 717 return ifyes; 718 --input->p; 719 return ifno; 720 } 721 722 static int 723 minus(void) 724 { 725 switch (*input->p++) { 726 case '-': 727 return DEC; 728 case '>': 729 return INDIR; 730 case '=': 731 return SUB_EQ; 732 default: 733 --input->p; 734 return '-'; 735 } 736 } 737 738 static int 739 plus(void) 740 { 741 switch (*input->p++) { 742 case '+': 743 return INC; 744 case '=': 745 return ADD_EQ; 746 default: 747 --input->p; 748 return '+'; 749 } 750 } 751 752 static int 753 relational(int op, int equal, int shift, int assig) 754 { 755 int c; 756 757 if ((c = *input->p++) == '=') 758 return equal; 759 if (c == op) 760 return follow('=', assig, shift); 761 --input->p; 762 return op; 763 } 764 765 static int 766 logic(int op, int equal, int logic) 767 { 768 int c; 769 770 if ((c = *input->p++) == '=') 771 return equal; 772 if (c == op) 773 return logic; 774 --input->p; 775 return op; 776 } 777 778 static int 779 dot(void) 780 { 781 int c; 782 783 if ((c = *input->p) != '.') 784 return '.'; 785 if ((c = *++input->p) != '.') 786 error("incorrect token '..'"); 787 ++input->p; 788 return ELLIPSIS; 789 } 790 791 static int 792 operator(void) 793 { 794 int t; 795 796 switch (t = *input->p++) { 797 case '<': 798 t = relational('<', LE, SHL, SHL_EQ); 799 break; 800 case '>': 801 t = relational('>', GE, SHR, SHR_EQ); 802 break; 803 case '&': 804 t = logic('&', AND_EQ, AND); 805 break; 806 case '|': 807 t = logic('|', OR_EQ, OR); 808 break; 809 case '=': 810 t = follow('=', EQ, '='); 811 break; 812 case '^': 813 t = follow('=', XOR_EQ, '^'); 814 break; 815 case '*': 816 t = follow('=', MUL_EQ, '*'); 817 break; 818 case '/': 819 t = follow('=', DIV_EQ, '/'); 820 break; 821 case '%': 822 t = follow('=', MOD_EQ, '%'); 823 break; 824 case '!': 825 t = follow('=', NE, '!'); 826 break; 827 case '#': 828 t = follow('#', CONCAT, STRINGIZE); 829 break; 830 case '-': 831 t = minus(); 832 break; 833 case '+': 834 t = plus(); 835 break; 836 case '.': 837 t = dot(); 838 break; 839 } 840 tok2str(); 841 return t; 842 } 843 844 /* TODO: Ensure that namespace is NS_IDEN after a recovery */ 845 846 /* 847 * skip all the spaces until the next token. When we are in 848 * CPPMODE \n is not considered a whitespace 849 */ 850 static int 851 skipspaces(void) 852 { 853 int c; 854 855 if (!input) 856 return EOF; 857 858 for (;;) { 859 switch (c = *input->p) { 860 case '\n': 861 if (lexmode == CPPMODE) 862 goto return_byte; 863 ++input->p; 864 case '\0': 865 if (!moreinput()) 866 return EOF; 867 break; 868 case ' ': 869 case '\t': 870 case '\v': 871 case '\r': 872 case '\f': 873 ++input->p; 874 break; 875 default: 876 goto return_byte; 877 } 878 } 879 880 return_byte: 881 input->begin = input->p; 882 return c; 883 } 884 885 int 886 next(void) 887 { 888 int c; 889 890 if ((c = skipspaces()) == EOF) 891 yytoken = EOFTOK; 892 else if (isalpha(c) || c == '_') 893 yytoken = iden(); 894 else if (isdigit(c)) 895 yytoken = number(); 896 else if (c == '"') 897 yytoken = string(); 898 else if (c == '\'') 899 yytoken = character(); 900 else 901 yytoken = operator(); 902 903 if (yytoken == EOFTOK) { 904 strcpy(yytext, "<EOF>"); 905 if (cppctx && !input) 906 errorp("#endif expected"); 907 } 908 909 DBG("TOKEN %s", yytext); 910 return yytoken; 911 } 912 913 void 914 expect(int tok) 915 { 916 if (yytoken != tok) { 917 if (isgraph(tok)) 918 errorp("expected '%c' before '%s'", tok, yytext); 919 else 920 errorp("unexpected '%s'", yytext); 921 } else { 922 next(); 923 } 924 } 925 926 int 927 ahead(void) 928 { 929 skipspaces(); 930 return *input->begin; 931 } 932 933 void 934 setsafe(int type) 935 { 936 safe = type; 937 } 938 939 void 940 discard(void) 941 { 942 extern jmp_buf recover; 943 int c; 944 945 for (c = yytoken; ; c = *input->p++) { 946 switch (safe) { 947 case END_COMP: 948 if (c == '}') 949 goto jump; 950 goto semicolon; 951 case END_COND: 952 if (c == ')') 953 goto jump; 954 break; 955 case END_LDECL: 956 if (c == ',') 957 goto jump; 958 case END_DECL: 959 semicolon: 960 if (c == ';') 961 goto jump; 962 break; 963 } 964 if ((c == '\0' || c == EOFTOK) && !moreinput()) 965 exit(EXIT_FAILURE); 966 } 967 jump: 968 input->begin = input->p; 969 yytoken = c; 970 yytext[0] = c; 971 yytext[1] = '\0'; 972 exit(EXIT_FAILURE); 973 974 /* 975 * FIXME: We don't have a proper recover mechanism at this moment 976 * and we don't set the recover point ever, so executing this 977 * longjmp will generate surely a segmentation fault, so it does 978 * not make sense to do it. We just exit until we can find time 979 * to solve this problem. 980 */ 981 longjmp(recover, 1); 982 }