source/libps/pslexer.cpp

0001 /*

0002     see copyright notice in pscript.h

0003 */
0004 #include "pspcheader.h"
0005 #include <ctype.h>
0006 #include <stdlib.h>
0007 #include "pstable.h"
0008 #include "psstring.h"
0009 #include "pscompiler.h"
0010 #include "pslexer.h"
0011
// Helper macros used throughout the lexer. Several of them expand to a braced
// block, and call sites in this file rely on that (some omit the trailing
// semicolon), so their exact shape must not change.

// The character most recently stored by Next().
0012 #define CUR_CHAR (_currdata)
// Saves the current token into _prevtoken, records t as the current token and
// returns t from the enclosing function. Note that t is evaluated twice.
0013 #define RETURN_TOKEN(t) { _prevtoken = _curtoken; _curtoken = t; return t;}
// True when the current character compares less than or equal to PSCRIPT_EOB.
0014 #define IS_EOB() (CUR_CHAR <= PSCRIPT_EOB)
// Reads the next character and advances the column counter by one.
0015 #define NEXT() {Next();_currentcolumn++;}
// Empties the scratch buffer _longstr.
0016 #define INIT_TEMP_STRING() { _longstr.resize(0);}
// Appends one character to the scratch buffer _longstr.
0017 #define APPEND_CHAR(c) { _longstr.push_back(c);}
// Appends a terminating NUL character to the scratch buffer _longstr.
0018 #define TERMINATE_BUFFER() {_longstr.push_back(_SC('\0'));}
// Adds a slot to _keywords whose key is the stringized name `key` and whose
// value is PSInteger(id). Requires a variable named `ss` to be in scope at the
// expansion site (it is passed to PSString::Create).
0019 #define ADD_KEYWORD(key,id) _keywords->NewSlot( PSString::Create(ss, _SC(#key)) ,PSInteger(id))
0020
0021 PSLexer::PSLexer(){}
0022 PSLexer::~PSLexer()
0023 {
0024     _keywords->Release();
0025 }
0026
0027 void PSLexer::Init(PSSharedState *ss, PSLEXREADFUNC rg, PSUserPointer up,CompilerErrorFunc efunc,void *ed)
0028 {
0029     _errfunc = efunc;
0030     _errtarget = ed;
0031     _sharedstate = ss;
0032     _keywords = PSTable::Create(ss, 37);
0033     ADD_KEYWORD(while, TK_WHILE);
0034     ADD_KEYWORD(do, TK_DO);
0035     ADD_KEYWORD(if, TK_IF);
0036     ADD_KEYWORD(else, TK_ELSE);
0037     ADD_KEYWORD(break, TK_BREAK);
0038     ADD_KEYWORD(continue, TK_CONTINUE);
0039     ADD_KEYWORD(return, TK_RETURN);
0040     ADD_KEYWORD(null, TK_NULL);
0041     ADD_KEYWORD(function, TK_FUNCTION);
0042     ADD_KEYWORD(local, TK_LOCAL);
0043     ADD_KEYWORD(for, TK_FOR);
0044     ADD_KEYWORD(foreach, TK_FOREACH);
0045     ADD_KEYWORD(in, TK_IN);
0046     ADD_KEYWORD(typeof, TK_TYPEOF);
0047     ADD_KEYWORD(base, TK_BASE);
0048     ADD_KEYWORD(delete, TK_DELETE);
0049     ADD_KEYWORD(try, TK_TRY);
0050     ADD_KEYWORD(catch, TK_CATCH);
0051     ADD_KEYWORD(throw, TK_THROW);
0052     ADD_KEYWORD(clone, TK_CLONE);
0053     ADD_KEYWORD(yield, TK_YIELD);
0054     ADD_KEYWORD(resume, TK_RESUME);
0055     ADD_KEYWORD(switch, TK_SWITCH);
0056     ADD_KEYWORD(case, TK_CASE);
0057     ADD_KEYWORD(default, TK_DEFAULT);
0058     ADD_KEYWORD(this, TK_THIS);
0059     ADD_KEYWORD(class,TK_CLASS);
0060     ADD_KEYWORD(extends,TK_EXTENDS);
0061     ADD_KEYWORD(constructor,TK_CONSTRUCTOR);
0062     ADD_KEYWORD(instanceof,TK_INSTANCEOF);
0063     ADD_KEYWORD(true,TK_TRUE);
0064     ADD_KEYWORD(false,TK_FALSE);
0065     ADD_KEYWORD(static,TK_STATIC);
0066     ADD_KEYWORD(enum,TK_ENUM);
0067     ADD_KEYWORD(const,TK_CONST);
0068     ADD_KEYWORD(__LINE__,TK___LINE__);
0069     ADD_KEYWORD(__FILE__,TK___FILE__);
0070
0071     _readf = rg;
0072     _up = up;
0073     _lasttokenline = _currentline = 1;
0074     _currentcolumn = 0;
0075     _prevtoken = -1;
0076     _reached_eof = PSFalse;
0077     Next();
0078 }
0079
0080 void PSLexer::Error(const PSChar *err)
0081 {
0082     _errfunc(_errtarget,err);
0083 }
0084
0085 void PSLexer::Next()
0086 {
0087     PSInteger t = _readf(_up);
0088     if(t > MAX_CHAR) Error(_SC("Invalid character"));
0089     if(t != 0) {
0090         _currdata = (LexChar)t;
0091         return;
0092     }
0093     _currdata = PSCRIPT_EOB;
0094     _reached_eof = PSTrue;
0095 }
0096
0097 const PSChar *PSLexer::Tok2Str(PSInteger tok)
0098 {
0099     PSObjectPtr itr, key, val;
0100     PSInteger nitr;
0101     while((nitr = _keywords->Next(false,itr, key, val)) != -1) {
0102         itr = (PSInteger)nitr;
0103         if(((PSInteger)_integer(val)) == tok)
0104             return _stringval(key);
0105     }
0106     return NULL;
0107 }
0108
0109 void PSLexer::LexBlockComment()
0110 {
0111     bool done = false;
0112     while(!done) {
0113         switch(CUR_CHAR) {
0114             case _SC('*'): { NEXT(); if(CUR_CHAR == _SC('/')) { done = true; NEXT(); }}; continue;
0115             case _SC('\n'): _currentline++; NEXT(); continue;
0116             case PSCRIPT_EOB: Error(_SC("missing \"*/\" in comment"));
0117             default: NEXT();
0118         }
0119     }
0120 }
0121 void PSLexer::LexLineComment()
0122 {
0123     do { NEXT(); } while (CUR_CHAR != _SC('\n') && (!IS_EOB()));
0124 }
0125
0126 PSInteger PSLexer::Lex()
0127 {
0128     _lasttokenline = _currentline;
0129     while(CUR_CHAR != PSCRIPT_EOB) {
0130         switch(CUR_CHAR){
0131         case _SC('\t'): case _SC('\r'): case _SC(' '): NEXT(); continue;
0132         case _SC('\n'):
0133             _currentline++;
0134             _prevtoken=_curtoken;
0135             _curtoken=_SC('\n');
0136             NEXT();
0137             _currentcolumn=1;
0138             continue;
0139         case _SC('#'): LexLineComment(); continue;
0140         case _SC('/'):
0141             NEXT();
0142             switch(CUR_CHAR){
0143             case _SC('*'):
0144                 NEXT();
0145                 LexBlockComment();
0146                 continue;
0147             case _SC('/'):
0148                 LexLineComment();
0149                 continue;
0150             case _SC('='):
0151                 NEXT();
0152                 RETURN_TOKEN(TK_DIVEQ);
0153                 continue;
0154             case _SC('>'):
0155                 NEXT();
0156                 RETURN_TOKEN(TK_ATTR_CLOSE);
0157                 continue;
0158             default:
0159                 RETURN_TOKEN('/');
0160             }
0161         case _SC('='):
0162             NEXT();
0163             if (CUR_CHAR != _SC('=')){ RETURN_TOKEN('=') }
0164             else { NEXT(); RETURN_TOKEN(TK_EQ); }
0165         case _SC('<'):
0166             NEXT();
0167             switch(CUR_CHAR) {
0168             case _SC('='):
0169                 NEXT();
0170                 if(CUR_CHAR == _SC('>')) {
0171                     NEXT();
0172                     RETURN_TOKEN(TK_3WAYSCMP);
0173                 }
0174                 RETURN_TOKEN(TK_LE)
0175                 break;
0176             case _SC('-'): NEXT(); RETURN_TOKEN(TK_NEWSLOT); break;
0177             case _SC('<'): NEXT(); RETURN_TOKEN(TK_SHIFTL); break;
0178             case _SC('/'): NEXT(); RETURN_TOKEN(TK_ATTR_OPEN); break;
0179             }
0180             RETURN_TOKEN('<');
0181         case _SC('>'):
0182             NEXT();
0183             if (CUR_CHAR == _SC('=')){ NEXT(); RETURN_TOKEN(TK_GE);}
0184             else if(CUR_CHAR == _SC('>')){
0185                 NEXT();
0186                 if(CUR_CHAR == _SC('>')){
0187                     NEXT();
0188                     RETURN_TOKEN(TK_USHIFTR);
0189                 }
0190                 RETURN_TOKEN(TK_SHIFTR);
0191             }
0192             else { RETURN_TOKEN('>') }
0193         case _SC('!'):
0194             NEXT();
0195             if (CUR_CHAR != _SC('=')){ RETURN_TOKEN('!')}
0196             else { NEXT(); RETURN_TOKEN(TK_NE); }
0197         case _SC('@'): {
0198             PSInteger stype;
0199             NEXT();
0200             if(CUR_CHAR != _SC('"')) {
0201                 RETURN_TOKEN('@');
0202             }
0203             if((stype=ReadString('"',true))!=-1) {
0204                 RETURN_TOKEN(stype);
0205             }
0206             Error(_SC("error parsing the string"));
0207                        }
0208         case _SC('"'):
0209         case _SC('\''): {
0210             PSInteger stype;
0211             if((stype=ReadString(CUR_CHAR,false))!=-1){
0212                 RETURN_TOKEN(stype);
0213             }
0214             Error(_SC("error parsing the string"));
0215             }
0216         case _SC('{'): case _SC('}'): case _SC('('): case _SC(')'): case _SC('['): case _SC(']'):
0217         case _SC(';'): case _SC(','): case _SC('?'): case _SC('^'): case _SC('~'):
0218             {PSInteger ret = CUR_CHAR;
0219             NEXT(); RETURN_TOKEN(ret); }
0220         case _SC('.'):
0221             NEXT();
0222             if (CUR_CHAR != _SC('.')){ RETURN_TOKEN('.') }
0223             NEXT();
0224             if (CUR_CHAR != _SC('.')){ Error(_SC("invalid token '..'")); }
0225             NEXT();
0226             RETURN_TOKEN(TK_VARPARAMS);
0227         case _SC('&'):
0228             NEXT();
0229             if (CUR_CHAR != _SC('&')){ RETURN_TOKEN('&') }
0230             else { NEXT(); RETURN_TOKEN(TK_AND); }
0231         case _SC('|'):
0232             NEXT();
0233             if (CUR_CHAR != _SC('|')){ RETURN_TOKEN('|') }
0234             else { NEXT(); RETURN_TOKEN(TK_OR); }
0235         case _SC(':'):
0236             NEXT();
0237             if (CUR_CHAR != _SC(':')){ RETURN_TOKEN(':') }
0238             else { NEXT(); RETURN_TOKEN(TK_DOUBLE_COLON); }
0239         case _SC('*'):
0240             NEXT();
0241             if (CUR_CHAR == _SC('=')){ NEXT(); RETURN_TOKEN(TK_MULEQ);}
0242             else RETURN_TOKEN('*');
0243         case _SC('%'):
0244             NEXT();
0245             if (CUR_CHAR == _SC('=')){ NEXT(); RETURN_TOKEN(TK_MODEQ);}
0246             else RETURN_TOKEN('%');
0247         case _SC('-'):
0248             NEXT();
0249             if (CUR_CHAR == _SC('=')){ NEXT(); RETURN_TOKEN(TK_MINUSEQ);}
0250             else if  (CUR_CHAR == _SC('-')){ NEXT(); RETURN_TOKEN(TK_MINUSMINUS);}
0251             else RETURN_TOKEN('-');
0252         case _SC('+'):
0253             NEXT();
0254             if (CUR_CHAR == _SC('=')){ NEXT(); RETURN_TOKEN(TK_PLUSEQ);}
0255             else if (CUR_CHAR == _SC('+')){ NEXT(); RETURN_TOKEN(TK_PLUSPLUS);}
0256             else RETURN_TOKEN('+');
0257         case PSCRIPT_EOB:
0258             return 0;
0259         default:{
0260                 if (scisdigit(CUR_CHAR)) {
0261                     PSInteger ret = ReadNumber();
0262                     RETURN_TOKEN(ret);
0263                 }
0264                 else if (scisalpha(CUR_CHAR) || CUR_CHAR == _SC('_')) {
0265                     PSInteger t = ReadID();
0266                     RETURN_TOKEN(t);
0267                 }
0268                 else {
0269                     PSInteger c = CUR_CHAR;
0270                     if (sciscntrl((int)c)) Error(_SC("unexpected character(control)"));
0271                     NEXT();
0272                     RETURN_TOKEN(c);
0273                 }
0274                 RETURN_TOKEN(0);
0275             }
0276         }
0277     }
0278     return 0;
0279 }
0280
0281 PSInteger PSLexer::GetIDType(const PSChar *s,PSInteger len)
0282 {
0283     PSObjectPtr t;
0284     if(_keywords->GetStr(s,len, t)) {
0285         return PSInteger(_integer(t));
0286     }
0287     return TK_IDENTIFIER;
0288 }
0289
0290 #ifdef PSUNICODE
0291 #if WCHAR_SIZE == 2
0292 PSInteger PSLexer::AddUTF16(PSUnsignedInteger ch)
0293 {
0294     if (ch >= 0x10000)
0295     {
0296         PSUnsignedInteger code = (ch - 0x10000);
0297         APPEND_CHAR((PSChar)(0xD800 | (code >> 10)));
0298         APPEND_CHAR((PSChar)(0xDC00 | (code & 0x3FF)));
0299         return 2;
0300     }
0301     else {
0302         APPEND_CHAR((PSChar)ch);
0303         return 1;
0304     }
0305 }
0306 #endif
0307 #else
0308 PSInteger PSLexer::AddUTF8(PSUnsignedInteger ch)
0309 {
0310     if (ch < 0x80) {
0311         APPEND_CHAR((char)ch);
0312         return 1;
0313     }
0314     if (ch < 0x800) {
0315         APPEND_CHAR((PSChar)((ch >> 6) | 0xC0));
0316         APPEND_CHAR((PSChar)((ch & 0x3F) | 0x80));
0317         return 2;
0318     }
0319     if (ch < 0x10000) {
0320         APPEND_CHAR((PSChar)((ch >> 12) | 0xE0));
0321         APPEND_CHAR((PSChar)(((ch >> 6) & 0x3F) | 0x80));
0322         APPEND_CHAR((PSChar)((ch & 0x3F) | 0x80));
0323         return 3;
0324     }
0325     if (ch < 0x110000) {
0326         APPEND_CHAR((PSChar)((ch >> 18) | 0xF0));
0327         APPEND_CHAR((PSChar)(((ch >> 12) & 0x3F) | 0x80));
0328         APPEND_CHAR((PSChar)(((ch >> 6) & 0x3F) | 0x80));
0329         APPEND_CHAR((PSChar)((ch & 0x3F) | 0x80));
0330         return 4;
0331     }
0332     return 0;
0333 }
0334 #endif
0335
0336 PSInteger PSLexer::ProcessStringHexEscape(PSChar *dest, PSInteger maxdigits)
0337 {
0338     NEXT();
0339     if (!isxdigit(CUR_CHAR)) Error(_SC("hexadecimal number expected"));
0340     PSInteger n = 0;
0341     while (isxdigit(CUR_CHAR) && n < maxdigits) {
0342         dest[n] = CUR_CHAR;
0343         n++;
0344         NEXT();
0345     }
0346     dest[n] = 0;
0347     return n;
0348 }
0349
0350 PSInteger PSLexer::ReadString(PSInteger ndelim,bool verbatim)
0351 {
0352     INIT_TEMP_STRING();
0353     NEXT();
0354     if(IS_EOB()) return -1;
0355     for(;;) {
0356         while(CUR_CHAR != ndelim) {
0357             PSInteger x = CUR_CHAR;
0358             switch (x) {
0359             case PSCRIPT_EOB:
0360                 Error(_SC("unfinished string"));
0361                 return -1;
0362             case _SC('\n'):
0363                 if(!verbatim) Error(_SC("newline in a constant"));
0364                 APPEND_CHAR(CUR_CHAR); NEXT();
0365                 _currentline++;
0366                 break;
0367             case _SC('\\'):
0368                 if(verbatim) {
0369                     APPEND_CHAR('\\'); NEXT();
0370                 }
0371                 else {
0372                     NEXT();
0373                     switch(CUR_CHAR) {
0374                     case _SC('x'):  {
0375                         const PSInteger maxdigits = sizeof(PSChar) * 2;
0376                         PSChar temp[maxdigits + 1];
0377                         ProcessStringHexEscape(temp, maxdigits);
0378                         PSChar *stemp;
0379                         APPEND_CHAR((PSChar)scstrtoul(temp, &stemp, 16));
0380                     }
0381                     break;
0382                     case _SC('U'):
0383                     case _SC('u'):  {
0384                         const PSInteger maxdigits = x == 'u' ? 4 : 8;
0385                         PSChar temp[8 + 1];
0386                         ProcessStringHexEscape(temp, maxdigits);
0387                         PSChar *stemp;
0388 #ifdef PSUNICODE
0389 #if WCHAR_SIZE == 2
0390                         AddUTF16(scstrtoul(temp, &stemp, 16));
0391 #else
0392                         ADD_CHAR((PSChar)scstrtoul(temp, &stemp, 16));
0393 #endif
0394 #else
0395                         AddUTF8(scstrtoul(temp, &stemp, 16));
0396 #endif
0397                     }
0398                     break;
0399                     case _SC('t'): APPEND_CHAR(_SC('\t')); NEXT(); break;
0400                     case _SC('a'): APPEND_CHAR(_SC('\a')); NEXT(); break;
0401                     case _SC('b'): APPEND_CHAR(_SC('\b')); NEXT(); break;
0402                     case _SC('n'): APPEND_CHAR(_SC('\n')); NEXT(); break;
0403                     case _SC('r'): APPEND_CHAR(_SC('\r')); NEXT(); break;
0404                     case _SC('v'): APPEND_CHAR(_SC('\v')); NEXT(); break;
0405                     case _SC('f'): APPEND_CHAR(_SC('\f')); NEXT(); break;
0406                     case _SC('0'): APPEND_CHAR(_SC('\0')); NEXT(); break;
0407                     case _SC('\\'): APPEND_CHAR(_SC('\\')); NEXT(); break;
0408                     case _SC('"'): APPEND_CHAR(_SC('"')); NEXT(); break;
0409                     case _SC('\''): APPEND_CHAR(_SC('\'')); NEXT(); break;
0410                     default:
0411                         Error(_SC("unrecognised escaper char"));
0412                     break;
0413                     }
0414                 }
0415                 break;
0416             default:
0417                 APPEND_CHAR(CUR_CHAR);
0418                 NEXT();
0419             }
0420         }
0421         NEXT();
0422         if(verbatim && CUR_CHAR == '"') { //double quotation

0423             APPEND_CHAR(CUR_CHAR);
0424             NEXT();
0425         }
0426         else {
0427             break;
0428         }
0429     }
0430     TERMINATE_BUFFER();
0431     PSInteger len = _longstr.size()-1;
0432     if(ndelim == _SC('\'')) {
0433         if(len == 0) Error(_SC("empty constant"));
0434         if(len > 1) Error(_SC("constant too long"));
0435         _nvalue = _longstr[0];
0436         return TK_INTEGER;
0437     }
0438     _svalue = &_longstr[0];
0439     return TK_STRING_LITERAL;
0440 }
0441
0442 void LexHexadecimal(const PSChar *s,PSUnsignedInteger *res)
0443 {
0444     *res = 0;
0445     while(*s != 0)
0446     {
0447         if(scisdigit(*s)) *res = (*res)*16+((*s++)-'0');
0448         else if(scisxdigit(*s)) *res = (*res)*16+(toupper(*s++)-'A'+10);
0449         else { assert(0); }
0450     }
0451 }
0452
0453 void LexInteger(const PSChar *s,PSUnsignedInteger *res)
0454 {
0455     *res = 0;
0456     while(*s != 0)
0457     {
0458         *res = (*res)*10+((*s++)-'0');
0459     }
0460 }
0461
0462 PSInteger scisodigit(PSInteger c) { return c >= _SC('0') && c <= _SC('7'); }
0463
0464 void LexOctal(const PSChar *s,PSUnsignedInteger *res)
0465 {
0466     *res = 0;
0467     while(*s != 0)
0468     {
0469         if(scisodigit(*s)) *res = (*res)*8+((*s++)-'0');
0470         else { assert(0); }
0471     }
0472 }
0473
0474 PSInteger isexponent(PSInteger c) { return c == 'e' || c=='E'; }
0475
0476
0477 #define MAX_HEX_DIGITS (sizeof(PSInteger)*2)
0478 PSInteger PSLexer::ReadNumber()
0479 {
0480 #define TINT 1
0481 #define TFLOAT 2
0482 #define THEX 3
0483 #define TSCIENTIFIC 4
0484 #define TOCTAL 5
0485     PSInteger type = TINT, firstchar = CUR_CHAR;
0486     PSChar *sTemp;
0487     INIT_TEMP_STRING();
0488     NEXT();
0489     if(firstchar == _SC('0') && (toupper(CUR_CHAR) == _SC('X') || scisodigit(CUR_CHAR)) ) {
0490         if(scisodigit(CUR_CHAR)) {
0491             type = TOCTAL;
0492             while(scisodigit(CUR_CHAR)) {
0493                 APPEND_CHAR(CUR_CHAR);
0494                 NEXT();
0495             }
0496             if(scisdigit(CUR_CHAR)) Error(_SC("invalid octal number"));
0497         }
0498         else {
0499             NEXT();
0500             type = THEX;
0501             while(isxdigit(CUR_CHAR)) {
0502                 APPEND_CHAR(CUR_CHAR);
0503                 NEXT();
0504             }
0505             if(_longstr.size() > MAX_HEX_DIGITS) Error(_SC("too many digits for an Hex number"));
0506         }
0507     }
0508     else {
0509         APPEND_CHAR((int)firstchar);
0510         while (CUR_CHAR == _SC('.') || scisdigit(CUR_CHAR) || isexponent(CUR_CHAR)) {
0511             if(CUR_CHAR == _SC('.') || isexponent(CUR_CHAR)) type = TFLOAT;
0512             if(isexponent(CUR_CHAR)) {
0513                 if(type != TFLOAT) Error(_SC("invalid numeric format"));
0514                 type = TSCIENTIFIC;
0515                 APPEND_CHAR(CUR_CHAR);
0516                 NEXT();
0517                 if(CUR_CHAR == '+' || CUR_CHAR == '-'){
0518                     APPEND_CHAR(CUR_CHAR);
0519                     NEXT();
0520                 }
0521                 if(!scisdigit(CUR_CHAR)) Error(_SC("exponent expected"));
0522             }
0523
0524             APPEND_CHAR(CUR_CHAR);
0525             NEXT();
0526         }
0527     }
0528     TERMINATE_BUFFER();
0529     switch(type) {
0530     case TSCIENTIFIC:
0531     case TFLOAT:
0532         _fvalue = (PSFloat)scstrtod(&_longstr[0],&sTemp);
0533         return TK_FLOAT;
0534     case TINT:
0535         LexInteger(&_longstr[0],(PSUnsignedInteger *)&_nvalue);
0536         return TK_INTEGER;
0537     case THEX:
0538         LexHexadecimal(&_longstr[0],(PSUnsignedInteger *)&_nvalue);
0539         return TK_INTEGER;
0540     case TOCTAL:
0541         LexOctal(&_longstr[0],(PSUnsignedInteger *)&_nvalue);
0542         return TK_INTEGER;
0543     }
0544     return 0;
0545 }
0546
0547 PSInteger PSLexer::ReadID()
0548 {
0549     PSInteger res;
0550     INIT_TEMP_STRING();
0551     do {
0552         APPEND_CHAR(CUR_CHAR);
0553         NEXT();
0554     } while(scisalnum(CUR_CHAR) || CUR_CHAR == _SC('_'));
0555     TERMINATE_BUFFER();
0556     res = GetIDType(&_longstr[0],_longstr.size() - 1);
0557     if(res == TK_IDENTIFIER || res == TK_CONSTRUCTOR) {
0558         _svalue = &_longstr[0];
0559     }
0560     return res;
0561 }