00001
00002
00003
00004
00005
00006
00007
00008
00009 #include "comma/parser/Lexer.h"
00010 #include <cstring>
00011
00012 using namespace comma;
00013
00014 Lexer::Lexer(TextProvider &txtProvider, Diagnostic &diag)
00015 : txtProvider(txtProvider),
00016 diagnostic(diag),
00017 currentIter(txtProvider.begin()),
00018 errorCount(0),
00019 scanningAborted(false),
00020 index(0)
00021 { }
00022
00023 std::string Lexer::Token::getString() const
00024 {
00025 return Lexer::tokenString(*this);
00026 }
00027
00028 const char *Lexer::tokenString(const Code code)
00029 {
00030 const char *result;
00031
00032 switch (code) {
00033 default:
00034 result = 0;
00035 break;
00036
00037 #define RESERVED(NAME, STRING) case TKN_ ## NAME: result = STRING; break;
00038 #define GLYPH(NAME, STRING) case TKN_ ## NAME: result = STRING; break;
00039 #include "comma/parser/Tokens.def"
00040 #undef RESERVED
00041 #undef GLYPH
00042 }
00043
00044 return result;
00045 }
00046
00047 std::string Lexer::tokenString(const Token &token)
00048 {
00049 Code code = token.getCode();
00050
00051 switch (code) {
00052 default:
00053 return std::string(tokenString(code));
00054 break;
00055
00056 case TKN_IDENTIFIER:
00057 case TKN_INTEGER:
00058 case TKN_STRING:
00059 case TKN_CHARACTER:
00060 return std::string(token.getRep(), token.getLength());
00061 }
00062 }
00063
00064 bool Lexer::isDecimalDigit(unsigned c)
00065 {
00066 return ('0' <= c && c <= '9');
00067 }
00068
00069 bool Lexer::isInitialIdentifierChar(unsigned c)
00070 {
00071 if (('a' <= c && c <= 'z') ||
00072 ('A' <= c && c <= 'Z') ||
00073 (c == '%') || (c == '_'))
00074 return true;
00075
00076 return false;
00077 }
00078
00079 bool Lexer::isInnerIdentifierChar(unsigned c)
00080 {
00081 return isInitialIdentifierChar(c) || isDecimalDigit(c) || c == '?';
00082 }
00083
00084 bool Lexer::isWhitespace(unsigned c)
00085 {
00086 return (c == ' ') || (c == '\t') || (c == '\n');
00087 }
00088
00089 Location Lexer::currentLocation() const
00090 {
00091 return txtProvider.getLocation(currentIter);
00092 }
00093
00094
00095
00096 unsigned Lexer::readStream()
00097 {
00098 unsigned c = *currentIter;
00099 ++currentIter;
00100
00101
00102
00103 switch (c) {
00104
00105 case '\r':
00106 if (*currentIter == '\n')
00107 ++currentIter;
00108 case '\n':
00109 return '\n';
00110 }
00111
00112 return c;
00113 }
00114
00115 unsigned Lexer::peekStream()
00116 {
00117 unsigned c = *currentIter;
00118
00119 if (c == '\r')
00120 return '\n';
00121
00122 return c;
00123 }
00124
00125 void Lexer::ungetStream()
00126 {
00127 --currentIter;
00128 }
00129
00130 void Lexer::ignoreStream()
00131 {
00132 readStream();
00133 }
00134
00135 bool Lexer::eatComment()
00136 {
00137 unsigned c = peekStream();
00138
00139 if (c == '-') {
00140 ignoreStream();
00141 if (peekStream() == '-') {
00142
00143
00144 for (;;) {
00145 c = readStream();
00146 if (c == '\n' || c == 0)
00147 return true;
00148 }
00149 }
00150 else {
00151 ungetStream();
00152 return false;
00153 }
00154 }
00155 return false;
00156 }
00157
00158 bool Lexer::eatWhitespace()
00159 {
00160 unsigned c = peekStream();
00161
00162 if (isWhitespace(c)) {
00163 do {
00164 ignoreStream();
00165 } while (isWhitespace(c = peekStream()));
00166 return true;
00167 }
00168 return false;
00169 }
00170
00171 void Lexer::emitToken(Code code,
00172 const TextIterator &start, const TextIterator &end)
00173 {
00174 Location loc = txtProvider.getLocation(start);
00175 const char *string = &start;
00176 unsigned length = &end - &start;
00177 *targetToken = Token(code, loc, string, length);
00178 }
00179
00180 void Lexer::emitToken(Code code, Location loc)
00181 {
00182 *targetToken = Token(code, loc, 0, 0);
00183 }
00184
00185 void Lexer::emitStringToken(const TextIterator &start, const TextIterator &end)
00186 {
00187 emitToken(TKN_STRING, start, end);
00188 }
00189
00190 void Lexer::emitIntegerToken(const TextIterator &start, const TextIterator &end)
00191 {
00192 emitToken(TKN_INTEGER, start, end);
00193 }
00194
00195 void Lexer::emitIdentifierToken(const TextIterator &start, const TextIterator &end)
00196 {
00197 emitToken(TKN_IDENTIFIER, start, end);
00198 }
00199
00200 void Lexer::emitCharacterToken(const TextIterator &start, const TextIterator &end)
00201 {
00202 emitToken(TKN_CHARACTER, start, end);
00203 }
00204
00205 Lexer::Code Lexer::getTokenCode(TextIterator &start, TextIterator &end) const
00206 {
00207 Code code = UNUSED_ID;
00208 const char *str = &start;
00209 unsigned length = &end - &start;
00210
00211 switch (length) {
00212 case 1:
00213 if (strncmp(str, "%", length) == 0)
00214 code = TKN_PERCENT;
00215 break;
00216
00217 case 2:
00218 if (strncmp(str, "is", length) == 0)
00219 code = TKN_IS;
00220 else if (strncmp(str, "if", length) == 0)
00221 code = TKN_IF;
00222 else if (strncmp(str, "in", length) == 0)
00223 code = TKN_IN;
00224 else if (strncmp(str, "of", length) == 0)
00225 code = TKN_OF;
00226 else if (strncmp(str, "or", length) == 0)
00227 code = TKN_OR;
00228 break;
00229
00230 case 3:
00231 if (strncmp(str, "end", length) == 0)
00232 code = TKN_END;
00233 else if (strncmp(str, "out", length) == 0)
00234 code = TKN_OUT;
00235 else if (strncmp(str, "add", length) == 0)
00236 code = TKN_ADD;
00237 else if (strncmp(str, "inj", length) == 0)
00238 code = TKN_INJ;
00239 else if (strncmp(str, "prj", length) == 0)
00240 code = TKN_PRJ;
00241 else if (strncmp(str, "and", length) == 0)
00242 code = TKN_AND;
00243 else if (strncmp(str, "mod", length) == 0)
00244 code = TKN_MOD;
00245 else if (strncmp(str, "rem", length) == 0)
00246 code = TKN_REM;
00247 else if (strncmp(str, "for", length) == 0)
00248 code = TKN_FOR;
00249 else if (strncmp(str, "not", length) == 0)
00250 code = TKN_NOT;
00251 else if (strncmp(str, "xor", length) == 0)
00252 code = TKN_XOR;
00253 else if (strncmp(str, "new", length) == 0)
00254 code = TKN_NEW;
00255 else if (strncmp(str, "all", length) == 0)
00256 code = TKN_ALL;
00257 break;
00258
00259 case 4:
00260 if (strncmp(str, "else", length) == 0)
00261 code = TKN_ELSE;
00262 else if (strncmp(str, "loop", length) == 0)
00263 code = TKN_LOOP;
00264 else if (strncmp(str, "then", length) == 0)
00265 code = TKN_THEN;
00266 else if (strncmp(str, "with", length) == 0)
00267 code = TKN_WITH;
00268 else if (strncmp(str, "type", length) == 0)
00269 code = TKN_TYPE;
00270 else if (strncmp(str, "when", length) == 0)
00271 code = TKN_WHEN;
00272 else if (strncmp(str, "null", length) == 0)
00273 code = TKN_NULL;
00274 break;
00275
00276 case 5:
00277 if (strncmp(str, "begin", length) == 0)
00278 code = TKN_BEGIN;
00279 else if (strncmp(str, "elsif", length) == 0)
00280 code = TKN_ELSIF;
00281 else if (strncmp(str, "while", length) == 0)
00282 code = TKN_WHILE;
00283 else if (strncmp(str, "range", length) == 0)
00284 code = TKN_RANGE;
00285 else if (strncmp(str, "array", length) == 0)
00286 code = TKN_ARRAY;
00287 else if (strncmp(str, "raise", length) == 0)
00288 code = TKN_RAISE;
00289 break;
00290
00291 case 6:
00292 if (strncmp(str, "domain", length) == 0)
00293 code = TKN_DOMAIN;
00294 else if (strncmp(str, "return", length) == 0)
00295 code = TKN_RETURN;
00296 else if (strncmp(str, "import", length) == 0)
00297 code = TKN_IMPORT;
00298 else if (strncmp(str, "pragma", length) == 0)
00299 code = TKN_PRAGMA;
00300 else if (strncmp(str, "others", length) == 0)
00301 code = TKN_OTHERS;
00302 else if (strncmp(str, "record", length) == 0)
00303 code = TKN_RECORD;
00304 else if (strncmp(str, "access", length) == 0)
00305 code = TKN_ACCESS;
00306 break;
00307
00308 case 7:
00309 if (strncmp(str, "carrier", length) == 0)
00310 code = TKN_CARRIER;
00311 else if (strncmp(str, "declare", length) == 0)
00312 code = TKN_DECLARE;
00313 else if (strncmp(str, "generic", length) == 0)
00314 code = TKN_GENERIC;
00315 else if (strncmp(str, "subtype", length) == 0)
00316 code = TKN_SUBTYPE;
00317 else if (strncmp(str, "reverse", length) == 0)
00318 code = TKN_REVERSE;
00319 else if (strncmp(str, "renames", length) == 0)
00320 code = TKN_RENAMES;
00321 break;
00322
00323 case 8:
00324 if (strncmp(str, "function", length) == 0)
00325 code = TKN_FUNCTION;
00326 else if (strncmp(str, "abstract", length) == 0)
00327 code = TKN_ABSTRACT;
00328 break;
00329
00330 case 9:
00331 if (strncmp(str, "procedure", length) == 0)
00332 code = TKN_PROCEDURE;
00333 else if (strncmp(str, "signature", length) == 0)
00334 code = TKN_SIGNATURE;
00335 else if (strncmp(str, "exception", length) == 0)
00336 code = TKN_EXCEPTION;
00337 break;
00338 }
00339 return code;
00340 }
00341
00342 void Lexer::diagnoseConsecutiveUnderscores(unsigned c1, unsigned c2)
00343 {
00344 if (c1 == '_' && c2 == '_') {
00345 report(diag::CONSECUTIVE_UNDERSCORE);
00346 do {
00347 ignoreStream();
00348 } while ((c2 = peekStream()) == '_');
00349 }
00350 }
00351
00352 bool Lexer::scanWord()
00353 {
00354 TextIterator start = currentIter;
00355 unsigned c1, c2;
00356
00357 if (isInitialIdentifierChar(c1 = peekStream())) {
00358 do {
00359 ignoreStream();
00360 c2 = peekStream();
00361 diagnoseConsecutiveUnderscores(c1, c2);
00362 } while (isInnerIdentifierChar(c1 = c2));
00363
00364 Code code = getTokenCode(start, currentIter);
00365
00366 if (code == UNUSED_ID)
00367 emitIdentifierToken(start, currentIter);
00368 else
00369 emitToken(code, txtProvider.getLocation(start));
00370 return true;
00371 }
00372 return false;
00373 }
00374
00375 bool Lexer::scanGlyph()
00376 {
00377 Location loc = currentLocation();
00378 unsigned c = readStream();
00379 Code code = UNUSED_ID;
00380
00381 switch (c) {
00382 case '(':
00383 code = TKN_LPAREN;
00384 break;
00385
00386 case ')':
00387 code = TKN_RPAREN;
00388 break;
00389
00390 case ';':
00391 code = TKN_SEMI;
00392 break;
00393
00394 case '.':
00395 switch (peekStream()) {
00396 case '.':
00397 ignoreStream();
00398 code = TKN_DDOT;
00399 break;
00400
00401 default:
00402 code = TKN_DOT;
00403 }
00404 break;
00405
00406 case ':':
00407 switch (peekStream()) {
00408 case '=':
00409 ignoreStream();
00410 code = TKN_ASSIGN;
00411 break;
00412
00413 default:
00414 code = TKN_COLON;
00415 }
00416 break;
00417
00418 case ',':
00419 code = TKN_COMMA;
00420 break;
00421
00422 case '=':
00423 switch (peekStream()) {
00424 default:
00425 code = TKN_EQUAL;
00426 break;
00427
00428 case '>':
00429 ignoreStream();
00430 code = TKN_RDARROW;
00431 break;
00432 }
00433 break;
00434
00435 case '<':
00436 switch (peekStream()) {
00437 default:
00438 code = TKN_LESS;
00439 break;
00440
00441 case '=':
00442 ignoreStream();
00443 code = TKN_LEQ;
00444 break;
00445
00446 case '>':
00447 ignoreStream();
00448 code = TKN_DIAMOND;
00449 }
00450 break;
00451
00452 case '>':
00453 switch (peekStream()) {
00454 default:
00455 code = TKN_GREAT;
00456 break;
00457
00458 case '=':
00459 ignoreStream();
00460 code = TKN_GEQ;
00461 break;
00462 }
00463 break;
00464
00465 case '+':
00466 code = TKN_PLUS;
00467 break;
00468
00469 case '-':
00470 code = TKN_MINUS;
00471 break;
00472
00473 case '*':
00474 switch (peekStream()) {
00475 case '*':
00476 ignoreStream();
00477 code = TKN_POW;
00478 break;
00479
00480 default:
00481 code = TKN_STAR;
00482 }
00483 break;
00484
00485 case '/':
00486 switch (peekStream()) {
00487 case '=':
00488 ignoreStream();
00489 code = TKN_NEQUAL;
00490 break;
00491
00492 default:
00493 code = TKN_FSLASH;
00494 }
00495 break;
00496
00497 case '&':
00498 code = TKN_AMPER;
00499 break;
00500
00501 case '@':
00502 code = TKN_AT;
00503 break;
00504
00505 case '|':
00506 code = TKN_BAR;
00507 break;
00508 }
00509
00510 if (code == UNUSED_ID) {
00511 ungetStream();
00512 return false;
00513 }
00514
00515 emitToken(code, loc);
00516 return true;
00517 }
00518
00519 bool Lexer::scanEscape()
00520 {
00521 Location loc = currentLocation();
00522 unsigned c;
00523
00524 switch (c = readStream()) {
00525 case '\\': break;
00526 case '"' : break;
00527 case '\'': break;
00528 case 't' : break;
00529 case 'n' : break;
00530 case 'r' : break;
00531 case 'b' : break;
00532
00533 case 0:
00534
00535
00536 ungetStream();
00537 return false;
00538
00539 default:
00540
00541 report(loc, diag::ILLEGAL_ESCAPE) << (char)c;
00542 return false;
00543 }
00544 return true;
00545 }
00546
00547 bool Lexer::scanCharacter()
00548 {
00549 TextIterator start = currentIter;
00550 Location loc = currentLocation();
00551 unsigned c;
00552
00553 if (peekStream() == '\'') {
00554 ignoreStream();
00555 c = readStream();
00556
00557 if (c == '\'') {
00558
00559
00560 report(loc, diag::EMPTY_CHARACTER_LITERAL);
00561 emitCharacterToken(start, currentIter);
00562 return true;
00563 }
00564
00565 if (peekStream() != '\'') {
00566
00567
00568 ungetStream();
00569 emitToken(TKN_QUOTE, loc);
00570 return true;
00571 }
00572
00573
00574
00575
00576
00577
00578 if (c == '(') {
00579 TextIterator cursor = currentIter;
00580 if (*++cursor && *++cursor == '\'') {
00581 ungetStream();
00582 emitToken(TKN_QUOTE, loc);
00583 return true;
00584 }
00585 }
00586
00587
00588
00589
00590 ignoreStream();
00591 emitCharacterToken(start, currentIter);
00592 return true;
00593 }
00594 return false;
00595 }
00596
00597 bool Lexer::scanString()
00598 {
00599 TextIterator start = currentIter;
00600 Location loc = currentLocation();
00601 unsigned c;
00602
00603 if (peekStream() == '"') {
00604 ignoreStream();
00605
00606 for (;;) {
00607 switch (c = readStream()) {
00608 case '\\':
00609
00610
00611 scanEscape();
00612 break;
00613
00614 case 0:
00615
00616
00617 report(loc, diag::UNTERMINATED_STRING);
00618 emitStringToken(start, currentIter);
00619 return true;
00620
00621 case '\n':
00622
00623 report(loc, diag::NEWLINE_IN_STRING_LIT);
00624 emitStringToken(start, currentIter);
00625 return true;
00626
00627 case '"':
00628
00629 emitStringToken(start, currentIter);
00630 return true;
00631 }
00632 }
00633 }
00634 return false;
00635 }
00636
00637 bool Lexer::scanNumeric()
00638 {
00639 Location loc = currentLocation();
00640 TextIterator start = currentIter;
00641 unsigned c = peekStream();
00642
00643 if (isDecimalDigit(c)) {
00644 ignoreStream();
00645
00646
00647
00648
00649 if (c == '0' && isDecimalDigit(peekStream())) {
00650 report(loc, diag::LEADING_ZERO_IN_INTEGER_LIT);
00651
00652 while (peekStream() == '0') ignoreStream();
00653
00654
00655
00656 if (!isDecimalDigit(peekStream())) {
00657 TextIterator end = start;
00658 emitIntegerToken(start, ++end);
00659 return true;
00660 }
00661 else c = readStream();
00662 }
00663
00664 for (;;) {
00665 c = readStream();
00666
00667 if (isDecimalDigit(c) || c == '_')
00668 continue;
00669 else {
00670 ungetStream();
00671 break;
00672 }
00673 }
00674 emitIntegerToken(start, currentIter);
00675 return true;
00676 }
00677 return false;
00678 }
00679
00680 void Lexer::beginExcursion()
00681 {
00682 positionStack.push_back(index);
00683 }
00684
00685 void Lexer::endExcursion()
00686 {
00687 index = positionStack.back();
00688 positionStack.pop_back();
00689 }
00690
00691 void Lexer::forgetExcursion()
00692 {
00693 unsigned saved_index = positionStack.back();
00694 positionStack.pop_back();
00695
00696 if (positionStack.empty()) {
00697 assert(saved_index == 0 && "index/position mismatch!");
00698 ((void)saved_index);
00699 tokens.clear();
00700 }
00701 }
00702
00703 void Lexer::peek(Token &tkn, unsigned n)
00704 {
00705 unsigned numTokens = tokens.size();
00706
00707 if (index + n < numTokens) {
00708 tkn = tokens[index + n];
00709 return;
00710 }
00711
00712 unsigned tokensNeeded = index + n - numTokens;
00713 targetToken = &tkn;
00714 for (unsigned i = 0; i <= tokensNeeded; ++i) {
00715 scanToken();
00716 if (targetToken->getCode() != TKN_EOT)
00717 tokens.push_back(*targetToken);
00718 }
00719 }
00720
00721 void Lexer::scan(Token &tkn)
00722 {
00723 unsigned numTokens = tokens.size();
00724
00725
00726 if (index < numTokens) {
00727 tkn = tokens[index++];
00728 return;
00729 }
00730
00731
00732 if (numTokens && positionStack.empty()) {
00733 tokens.clear();
00734 index = 0;
00735 }
00736
00737 targetToken = &tkn;
00738
00739 scanToken();
00740
00741
00742 if (!positionStack.empty() && targetToken->getCode() != TKN_EOT) {
00743 index++;
00744 tokens.push_back(*targetToken);
00745 }
00746 }
00747
00748 void Lexer::scanToken()
00749 {
00750 for (;;) {
00751 eatWhitespace();
00752 while (eatComment()) eatWhitespace();
00753
00754 if (peekStream() == 0 || scanningAborted) {
00755 emitToken(TKN_EOT, Location());
00756 return;
00757 }
00758
00759 if (scanWord()) return;
00760 if (scanGlyph()) return;
00761 if (scanString()) return;
00762 if (scanNumeric()) return;
00763 if (scanCharacter()) return;
00764
00765
00766
00767
00768
00769
00770
00771 report(diag::INVALID_CHARACTER) << static_cast<char>(peekStream());
00772 ignoreStream();
00773 abortScanning();
00774 }
00775 }
00776