0001
0002
0003
0004 #include "pspcheader.h"
0005 #include <ctype.h>
0006 #include <stdlib.h>
0007 #include "pstable.h"
0008 #include "psstring.h"
0009 #include "pscompiler.h"
0010 #include "pslexer.h"
0011
// Helper macros used by the PSLexer member functions below. They expand to
// code that touches PSLexer members directly, so they are only usable inside
// PSLexer methods.

// The character most recently stored in _currdata by Next().
0012 #define CUR_CHAR (_currdata)
// Shift the current token into _prevtoken, record t as the current token and
// return it from the enclosing function. t is expanded twice, so pass a
// side-effect-free expression (store call results in a local first).
0013 #define RETURN_TOKEN(t) { _prevtoken = _curtoken; _curtoken = t; return t;}
// True when the current character is at or below the end-of-buffer marker.
0014 #define IS_EOB() (CUR_CHAR <= PSCRIPT_EOB)
// Read the next character via Next() and advance the column counter.
0015 #define NEXT() {Next();_currentcolumn++;}
// Empty the scratch buffer (_longstr) before accumulating a new lexeme.
0016 #define INIT_TEMP_STRING() { _longstr.resize(0);}
// Append one character to the scratch buffer.
0017 #define APPEND_CHAR(c) { _longstr.push_back(c);}
// Append the terminating NUL to the scratch buffer.
0018 #define TERMINATE_BUFFER() {_longstr.push_back(_SC('\0'));}
// Insert keyword `key` (stringized) into _keywords, mapped to token id `id`.
// Requires a variable named `ss` to be in scope at the expansion site.
0019 #define ADD_KEYWORD(key,id) _keywords->NewSlot( PSString::Create(ss, _SC(#key)) ,PSInteger(id))
0020
0021 PSLexer::PSLexer(){}
0022 PSLexer::~PSLexer()
0023 {
0024 _keywords->Release();
0025 }
0026
0027 void PSLexer::Init(PSSharedState *ss, PSLEXREADFUNC rg, PSUserPointer up,CompilerErrorFunc efunc,void *ed)
0028 {
0029 _errfunc = efunc;
0030 _errtarget = ed;
0031 _sharedstate = ss;
0032 _keywords = PSTable::Create(ss, 37);
0033 ADD_KEYWORD(while, TK_WHILE);
0034 ADD_KEYWORD(do, TK_DO);
0035 ADD_KEYWORD(if, TK_IF);
0036 ADD_KEYWORD(else, TK_ELSE);
0037 ADD_KEYWORD(break, TK_BREAK);
0038 ADD_KEYWORD(continue, TK_CONTINUE);
0039 ADD_KEYWORD(return, TK_RETURN);
0040 ADD_KEYWORD(null, TK_NULL);
0041 ADD_KEYWORD(function, TK_FUNCTION);
0042 ADD_KEYWORD(local, TK_LOCAL);
0043 ADD_KEYWORD(for, TK_FOR);
0044 ADD_KEYWORD(foreach, TK_FOREACH);
0045 ADD_KEYWORD(in, TK_IN);
0046 ADD_KEYWORD(typeof, TK_TYPEOF);
0047 ADD_KEYWORD(base, TK_BASE);
0048 ADD_KEYWORD(delete, TK_DELETE);
0049 ADD_KEYWORD(try, TK_TRY);
0050 ADD_KEYWORD(catch, TK_CATCH);
0051 ADD_KEYWORD(throw, TK_THROW);
0052 ADD_KEYWORD(clone, TK_CLONE);
0053 ADD_KEYWORD(yield, TK_YIELD);
0054 ADD_KEYWORD(resume, TK_RESUME);
0055 ADD_KEYWORD(switch, TK_SWITCH);
0056 ADD_KEYWORD(case, TK_CASE);
0057 ADD_KEYWORD(default, TK_DEFAULT);
0058 ADD_KEYWORD(this, TK_THIS);
0059 ADD_KEYWORD(class,TK_CLASS);
0060 ADD_KEYWORD(extends,TK_EXTENDS);
0061 ADD_KEYWORD(constructor,TK_CONSTRUCTOR);
0062 ADD_KEYWORD(instanceof,TK_INSTANCEOF);
0063 ADD_KEYWORD(true,TK_TRUE);
0064 ADD_KEYWORD(false,TK_FALSE);
0065 ADD_KEYWORD(static,TK_STATIC);
0066 ADD_KEYWORD(enum,TK_ENUM);
0067 ADD_KEYWORD(const,TK_CONST);
0068 ADD_KEYWORD(__LINE__,TK___LINE__);
0069 ADD_KEYWORD(__FILE__,TK___FILE__);
0070
0071 _readf = rg;
0072 _up = up;
0073 _lasttokenline = _currentline = 1;
0074 _currentcolumn = 0;
0075 _prevtoken = -1;
0076 _reached_eof = PSFalse;
0077 Next();
0078 }
0079
0080 void PSLexer::Error(const PSChar *err)
0081 {
0082 _errfunc(_errtarget,err);
0083 }
0084
0085 void PSLexer::Next()
0086 {
0087 PSInteger t = _readf(_up);
0088 if(t > MAX_CHAR) Error(_SC("Invalid character"));
0089 if(t != 0) {
0090 _currdata = (LexChar)t;
0091 return;
0092 }
0093 _currdata = PSCRIPT_EOB;
0094 _reached_eof = PSTrue;
0095 }
0096
0097 const PSChar *PSLexer::Tok2Str(PSInteger tok)
0098 {
0099 PSObjectPtr itr, key, val;
0100 PSInteger nitr;
0101 while((nitr = _keywords->Next(false,itr, key, val)) != -1) {
0102 itr = (PSInteger)nitr;
0103 if(((PSInteger)_integer(val)) == tok)
0104 return _stringval(key);
0105 }
0106 return NULL;
0107 }
0108
0109 void PSLexer::LexBlockComment()
0110 {
0111 bool done = false;
0112 while(!done) {
0113 switch(CUR_CHAR) {
0114 case _SC('*'): { NEXT(); if(CUR_CHAR == _SC('/')) { done = true; NEXT(); }}; continue;
0115 case _SC('\n'): _currentline++; NEXT(); continue;
0116 case PSCRIPT_EOB: Error(_SC("missing \"*/\" in comment"));
0117 default: NEXT();
0118 }
0119 }
0120 }
0121 void PSLexer::LexLineComment()
0122 {
0123 do { NEXT(); } while (CUR_CHAR != _SC('\n') && (!IS_EOB()));
0124 }
0125
0126 PSInteger PSLexer::Lex()
0127 {
0128 _lasttokenline = _currentline;
0129 while(CUR_CHAR != PSCRIPT_EOB) {
0130 switch(CUR_CHAR){
0131 case _SC('\t'): case _SC('\r'): case _SC(' '): NEXT(); continue;
0132 case _SC('\n'):
0133 _currentline++;
0134 _prevtoken=_curtoken;
0135 _curtoken=_SC('\n');
0136 NEXT();
0137 _currentcolumn=1;
0138 continue;
0139 case _SC('#'): LexLineComment(); continue;
0140 case _SC('/'):
0141 NEXT();
0142 switch(CUR_CHAR){
0143 case _SC('*'):
0144 NEXT();
0145 LexBlockComment();
0146 continue;
0147 case _SC('/'):
0148 LexLineComment();
0149 continue;
0150 case _SC('='):
0151 NEXT();
0152 RETURN_TOKEN(TK_DIVEQ);
0153 continue;
0154 case _SC('>'):
0155 NEXT();
0156 RETURN_TOKEN(TK_ATTR_CLOSE);
0157 continue;
0158 default:
0159 RETURN_TOKEN('/');
0160 }
0161 case _SC('='):
0162 NEXT();
0163 if (CUR_CHAR != _SC('=')){ RETURN_TOKEN('=') }
0164 else { NEXT(); RETURN_TOKEN(TK_EQ); }
0165 case _SC('<'):
0166 NEXT();
0167 switch(CUR_CHAR) {
0168 case _SC('='):
0169 NEXT();
0170 if(CUR_CHAR == _SC('>')) {
0171 NEXT();
0172 RETURN_TOKEN(TK_3WAYSCMP);
0173 }
0174 RETURN_TOKEN(TK_LE)
0175 break;
0176 case _SC('-'): NEXT(); RETURN_TOKEN(TK_NEWSLOT); break;
0177 case _SC('<'): NEXT(); RETURN_TOKEN(TK_SHIFTL); break;
0178 case _SC('/'): NEXT(); RETURN_TOKEN(TK_ATTR_OPEN); break;
0179 }
0180 RETURN_TOKEN('<');
0181 case _SC('>'):
0182 NEXT();
0183 if (CUR_CHAR == _SC('=')){ NEXT(); RETURN_TOKEN(TK_GE);}
0184 else if(CUR_CHAR == _SC('>')){
0185 NEXT();
0186 if(CUR_CHAR == _SC('>')){
0187 NEXT();
0188 RETURN_TOKEN(TK_USHIFTR);
0189 }
0190 RETURN_TOKEN(TK_SHIFTR);
0191 }
0192 else { RETURN_TOKEN('>') }
0193 case _SC('!'):
0194 NEXT();
0195 if (CUR_CHAR != _SC('=')){ RETURN_TOKEN('!')}
0196 else { NEXT(); RETURN_TOKEN(TK_NE); }
0197 case _SC('@'): {
0198 PSInteger stype;
0199 NEXT();
0200 if(CUR_CHAR != _SC('"')) {
0201 RETURN_TOKEN('@');
0202 }
0203 if((stype=ReadString('"',true))!=-1) {
0204 RETURN_TOKEN(stype);
0205 }
0206 Error(_SC("error parsing the string"));
0207 }
0208 case _SC('"'):
0209 case _SC('\''): {
0210 PSInteger stype;
0211 if((stype=ReadString(CUR_CHAR,false))!=-1){
0212 RETURN_TOKEN(stype);
0213 }
0214 Error(_SC("error parsing the string"));
0215 }
0216 case _SC('{'): case _SC('}'): case _SC('('): case _SC(')'): case _SC('['): case _SC(']'):
0217 case _SC(';'): case _SC(','): case _SC('?'): case _SC('^'): case _SC('~'):
0218 {PSInteger ret = CUR_CHAR;
0219 NEXT(); RETURN_TOKEN(ret); }
0220 case _SC('.'):
0221 NEXT();
0222 if (CUR_CHAR != _SC('.')){ RETURN_TOKEN('.') }
0223 NEXT();
0224 if (CUR_CHAR != _SC('.')){ Error(_SC("invalid token '..'")); }
0225 NEXT();
0226 RETURN_TOKEN(TK_VARPARAMS);
0227 case _SC('&'):
0228 NEXT();
0229 if (CUR_CHAR != _SC('&')){ RETURN_TOKEN('&') }
0230 else { NEXT(); RETURN_TOKEN(TK_AND); }
0231 case _SC('|'):
0232 NEXT();
0233 if (CUR_CHAR != _SC('|')){ RETURN_TOKEN('|') }
0234 else { NEXT(); RETURN_TOKEN(TK_OR); }
0235 case _SC(':'):
0236 NEXT();
0237 if (CUR_CHAR != _SC(':')){ RETURN_TOKEN(':') }
0238 else { NEXT(); RETURN_TOKEN(TK_DOUBLE_COLON); }
0239 case _SC('*'):
0240 NEXT();
0241 if (CUR_CHAR == _SC('=')){ NEXT(); RETURN_TOKEN(TK_MULEQ);}
0242 else RETURN_TOKEN('*');
0243 case _SC('%'):
0244 NEXT();
0245 if (CUR_CHAR == _SC('=')){ NEXT(); RETURN_TOKEN(TK_MODEQ);}
0246 else RETURN_TOKEN('%');
0247 case _SC('-'):
0248 NEXT();
0249 if (CUR_CHAR == _SC('=')){ NEXT(); RETURN_TOKEN(TK_MINUSEQ);}
0250 else if (CUR_CHAR == _SC('-')){ NEXT(); RETURN_TOKEN(TK_MINUSMINUS);}
0251 else RETURN_TOKEN('-');
0252 case _SC('+'):
0253 NEXT();
0254 if (CUR_CHAR == _SC('=')){ NEXT(); RETURN_TOKEN(TK_PLUSEQ);}
0255 else if (CUR_CHAR == _SC('+')){ NEXT(); RETURN_TOKEN(TK_PLUSPLUS);}
0256 else RETURN_TOKEN('+');
0257 case PSCRIPT_EOB:
0258 return 0;
0259 default:{
0260 if (scisdigit(CUR_CHAR)) {
0261 PSInteger ret = ReadNumber();
0262 RETURN_TOKEN(ret);
0263 }
0264 else if (scisalpha(CUR_CHAR) || CUR_CHAR == _SC('_')) {
0265 PSInteger t = ReadID();
0266 RETURN_TOKEN(t);
0267 }
0268 else {
0269 PSInteger c = CUR_CHAR;
0270 if (sciscntrl((int)c)) Error(_SC("unexpected character(control)"));
0271 NEXT();
0272 RETURN_TOKEN(c);
0273 }
0274 RETURN_TOKEN(0);
0275 }
0276 }
0277 }
0278 return 0;
0279 }
0280
0281 PSInteger PSLexer::GetIDType(const PSChar *s,PSInteger len)
0282 {
0283 PSObjectPtr t;
0284 if(_keywords->GetStr(s,len, t)) {
0285 return PSInteger(_integer(t));
0286 }
0287 return TK_IDENTIFIER;
0288 }
0289
0290 #ifdef PSUNICODE
0291 #if WCHAR_SIZE == 2
0292 PSInteger PSLexer::AddUTF16(PSUnsignedInteger ch)
0293 {
0294 if (ch >= 0x10000)
0295 {
0296 PSUnsignedInteger code = (ch - 0x10000);
0297 APPEND_CHAR((PSChar)(0xD800 | (code >> 10)));
0298 APPEND_CHAR((PSChar)(0xDC00 | (code & 0x3FF)));
0299 return 2;
0300 }
0301 else {
0302 APPEND_CHAR((PSChar)ch);
0303 return 1;
0304 }
0305 }
0306 #endif
0307 #else
0308 PSInteger PSLexer::AddUTF8(PSUnsignedInteger ch)
0309 {
0310 if (ch < 0x80) {
0311 APPEND_CHAR((char)ch);
0312 return 1;
0313 }
0314 if (ch < 0x800) {
0315 APPEND_CHAR((PSChar)((ch >> 6) | 0xC0));
0316 APPEND_CHAR((PSChar)((ch & 0x3F) | 0x80));
0317 return 2;
0318 }
0319 if (ch < 0x10000) {
0320 APPEND_CHAR((PSChar)((ch >> 12) | 0xE0));
0321 APPEND_CHAR((PSChar)(((ch >> 6) & 0x3F) | 0x80));
0322 APPEND_CHAR((PSChar)((ch & 0x3F) | 0x80));
0323 return 3;
0324 }
0325 if (ch < 0x110000) {
0326 APPEND_CHAR((PSChar)((ch >> 18) | 0xF0));
0327 APPEND_CHAR((PSChar)(((ch >> 12) & 0x3F) | 0x80));
0328 APPEND_CHAR((PSChar)(((ch >> 6) & 0x3F) | 0x80));
0329 APPEND_CHAR((PSChar)((ch & 0x3F) | 0x80));
0330 return 4;
0331 }
0332 return 0;
0333 }
0334 #endif
0335
0336 PSInteger PSLexer::ProcessStringHexEscape(PSChar *dest, PSInteger maxdigits)
0337 {
0338 NEXT();
0339 if (!isxdigit(CUR_CHAR)) Error(_SC("hexadecimal number expected"));
0340 PSInteger n = 0;
0341 while (isxdigit(CUR_CHAR) && n < maxdigits) {
0342 dest[n] = CUR_CHAR;
0343 n++;
0344 NEXT();
0345 }
0346 dest[n] = 0;
0347 return n;
0348 }
0349
0350 PSInteger PSLexer::ReadString(PSInteger ndelim,bool verbatim)
0351 {
0352 INIT_TEMP_STRING();
0353 NEXT();
0354 if(IS_EOB()) return -1;
0355 for(;;) {
0356 while(CUR_CHAR != ndelim) {
0357 PSInteger x = CUR_CHAR;
0358 switch (x) {
0359 case PSCRIPT_EOB:
0360 Error(_SC("unfinished string"));
0361 return -1;
0362 case _SC('\n'):
0363 if(!verbatim) Error(_SC("newline in a constant"));
0364 APPEND_CHAR(CUR_CHAR); NEXT();
0365 _currentline++;
0366 break;
0367 case _SC('\\'):
0368 if(verbatim) {
0369 APPEND_CHAR('\\'); NEXT();
0370 }
0371 else {
0372 NEXT();
0373 switch(CUR_CHAR) {
0374 case _SC('x'): {
0375 const PSInteger maxdigits = sizeof(PSChar) * 2;
0376 PSChar temp[maxdigits + 1];
0377 ProcessStringHexEscape(temp, maxdigits);
0378 PSChar *stemp;
0379 APPEND_CHAR((PSChar)scstrtoul(temp, &stemp, 16));
0380 }
0381 break;
0382 case _SC('U'):
0383 case _SC('u'): {
0384 const PSInteger maxdigits = x == 'u' ? 4 : 8;
0385 PSChar temp[8 + 1];
0386 ProcessStringHexEscape(temp, maxdigits);
0387 PSChar *stemp;
0388 #ifdef PSUNICODE
0389 #if WCHAR_SIZE == 2
0390 AddUTF16(scstrtoul(temp, &stemp, 16));
0391 #else
0392 ADD_CHAR((PSChar)scstrtoul(temp, &stemp, 16));
0393 #endif
0394 #else
0395 AddUTF8(scstrtoul(temp, &stemp, 16));
0396 #endif
0397 }
0398 break;
0399 case _SC('t'): APPEND_CHAR(_SC('\t')); NEXT(); break;
0400 case _SC('a'): APPEND_CHAR(_SC('\a')); NEXT(); break;
0401 case _SC('b'): APPEND_CHAR(_SC('\b')); NEXT(); break;
0402 case _SC('n'): APPEND_CHAR(_SC('\n')); NEXT(); break;
0403 case _SC('r'): APPEND_CHAR(_SC('\r')); NEXT(); break;
0404 case _SC('v'): APPEND_CHAR(_SC('\v')); NEXT(); break;
0405 case _SC('f'): APPEND_CHAR(_SC('\f')); NEXT(); break;
0406 case _SC('0'): APPEND_CHAR(_SC('\0')); NEXT(); break;
0407 case _SC('\\'): APPEND_CHAR(_SC('\\')); NEXT(); break;
0408 case _SC('"'): APPEND_CHAR(_SC('"')); NEXT(); break;
0409 case _SC('\''): APPEND_CHAR(_SC('\'')); NEXT(); break;
0410 default:
0411 Error(_SC("unrecognised escaper char"));
0412 break;
0413 }
0414 }
0415 break;
0416 default:
0417 APPEND_CHAR(CUR_CHAR);
0418 NEXT();
0419 }
0420 }
0421 NEXT();
0422 if(verbatim && CUR_CHAR == '"') {
0423 APPEND_CHAR(CUR_CHAR);
0424 NEXT();
0425 }
0426 else {
0427 break;
0428 }
0429 }
0430 TERMINATE_BUFFER();
0431 PSInteger len = _longstr.size()-1;
0432 if(ndelim == _SC('\'')) {
0433 if(len == 0) Error(_SC("empty constant"));
0434 if(len > 1) Error(_SC("constant too long"));
0435 _nvalue = _longstr[0];
0436 return TK_INTEGER;
0437 }
0438 _svalue = &_longstr[0];
0439 return TK_STRING_LITERAL;
0440 }
0441
0442 void LexHexadecimal(const PSChar *s,PSUnsignedInteger *res)
0443 {
0444 *res = 0;
0445 while(*s != 0)
0446 {
0447 if(scisdigit(*s)) *res = (*res)*16+((*s++)-'0');
0448 else if(scisxdigit(*s)) *res = (*res)*16+(toupper(*s++)-'A'+10);
0449 else { assert(0); }
0450 }
0451 }
0452
0453 void LexInteger(const PSChar *s,PSUnsignedInteger *res)
0454 {
0455 *res = 0;
0456 while(*s != 0)
0457 {
0458 *res = (*res)*10+((*s++)-'0');
0459 }
0460 }
0461
0462 PSInteger scisodigit(PSInteger c) { return c >= _SC('0') && c <= _SC('7'); }
0463
0464 void LexOctal(const PSChar *s,PSUnsignedInteger *res)
0465 {
0466 *res = 0;
0467 while(*s != 0)
0468 {
0469 if(scisodigit(*s)) *res = (*res)*8+((*s++)-'0');
0470 else { assert(0); }
0471 }
0472 }
0473
0474 PSInteger isexponent(PSInteger c) { return c == 'e' || c=='E'; }
0475
0476
0477 #define MAX_HEX_DIGITS (sizeof(PSInteger)*2)
0478 PSInteger PSLexer::ReadNumber()
0479 {
0480 #define TINT 1
0481 #define TFLOAT 2
0482 #define THEX 3
0483 #define TSCIENTIFIC 4
0484 #define TOCTAL 5
0485 PSInteger type = TINT, firstchar = CUR_CHAR;
0486 PSChar *sTemp;
0487 INIT_TEMP_STRING();
0488 NEXT();
0489 if(firstchar == _SC('0') && (toupper(CUR_CHAR) == _SC('X') || scisodigit(CUR_CHAR)) ) {
0490 if(scisodigit(CUR_CHAR)) {
0491 type = TOCTAL;
0492 while(scisodigit(CUR_CHAR)) {
0493 APPEND_CHAR(CUR_CHAR);
0494 NEXT();
0495 }
0496 if(scisdigit(CUR_CHAR)) Error(_SC("invalid octal number"));
0497 }
0498 else {
0499 NEXT();
0500 type = THEX;
0501 while(isxdigit(CUR_CHAR)) {
0502 APPEND_CHAR(CUR_CHAR);
0503 NEXT();
0504 }
0505 if(_longstr.size() > MAX_HEX_DIGITS) Error(_SC("too many digits for an Hex number"));
0506 }
0507 }
0508 else {
0509 APPEND_CHAR((int)firstchar);
0510 while (CUR_CHAR == _SC('.') || scisdigit(CUR_CHAR) || isexponent(CUR_CHAR)) {
0511 if(CUR_CHAR == _SC('.') || isexponent(CUR_CHAR)) type = TFLOAT;
0512 if(isexponent(CUR_CHAR)) {
0513 if(type != TFLOAT) Error(_SC("invalid numeric format"));
0514 type = TSCIENTIFIC;
0515 APPEND_CHAR(CUR_CHAR);
0516 NEXT();
0517 if(CUR_CHAR == '+' || CUR_CHAR == '-'){
0518 APPEND_CHAR(CUR_CHAR);
0519 NEXT();
0520 }
0521 if(!scisdigit(CUR_CHAR)) Error(_SC("exponent expected"));
0522 }
0523
0524 APPEND_CHAR(CUR_CHAR);
0525 NEXT();
0526 }
0527 }
0528 TERMINATE_BUFFER();
0529 switch(type) {
0530 case TSCIENTIFIC:
0531 case TFLOAT:
0532 _fvalue = (PSFloat)scstrtod(&_longstr[0],&sTemp);
0533 return TK_FLOAT;
0534 case TINT:
0535 LexInteger(&_longstr[0],(PSUnsignedInteger *)&_nvalue);
0536 return TK_INTEGER;
0537 case THEX:
0538 LexHexadecimal(&_longstr[0],(PSUnsignedInteger *)&_nvalue);
0539 return TK_INTEGER;
0540 case TOCTAL:
0541 LexOctal(&_longstr[0],(PSUnsignedInteger *)&_nvalue);
0542 return TK_INTEGER;
0543 }
0544 return 0;
0545 }
0546
0547 PSInteger PSLexer::ReadID()
0548 {
0549 PSInteger res;
0550 INIT_TEMP_STRING();
0551 do {
0552 APPEND_CHAR(CUR_CHAR);
0553 NEXT();
0554 } while(scisalnum(CUR_CHAR) || CUR_CHAR == _SC('_'));
0555 TERMINATE_BUFFER();
0556 res = GetIDType(&_longstr[0],_longstr.size() - 1);
0557 if(res == TK_IDENTIFIER || res == TK_CONSTRUCTOR) {
0558 _svalue = &_longstr[0];
0559 }
0560 return res;
0561 }