summaryrefslogtreecommitdiffstats
path: root/common/unqlite/jx9_lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'common/unqlite/jx9_lex.c')
-rw-r--r--common/unqlite/jx9_lex.c758
1 files changed, 758 insertions, 0 deletions
diff --git a/common/unqlite/jx9_lex.c b/common/unqlite/jx9_lex.c
new file mode 100644
index 0000000..7799950
--- /dev/null
+++ b/common/unqlite/jx9_lex.c
@@ -0,0 +1,758 @@
1/*
2 * Symisc JX9: A Highly Efficient Embeddable Scripting Engine Based on JSON.
3 * Copyright (C) 2012-2013, Symisc Systems http://jx9.symisc.net/
4 * Version 1.7.2
5 * For information on licensing, redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES
6 * please contact Symisc Systems via:
7 * legal@symisc.net
8 * licensing@symisc.net
9 * contact@symisc.net
10 * or visit:
11 * http://jx9.symisc.net/
12 */
13 /* $SymiscID: lex.c v1.0 FreeBSD 2012-12-09 00:19 stable <chm@symisc.net> $ */
14#ifndef JX9_AMALGAMATION
15#include "jx9Int.h"
16#endif
/* This file implements a thread-safe and fully reentrant lexical analyzer for the Jx9 programming language */
18/* Forward declarations */
19static sxu32 keywordCode(const char *z,int n);
20static sxi32 LexExtractNowdoc(SyStream *pStream,SyToken *pToken);
21/*
22 * Tokenize a raw jx9 input.
23 * Get a single low-level token from the input file. Update the stream pointer so that
24 * it points to the first character beyond the extracted token.
25 */
26static sxi32 jx9TokenizeInput(SyStream *pStream,SyToken *pToken,void *pUserData,void *pCtxData)
27{
28 SyString *pStr;
29 sxi32 rc;
30 /* Ignore leading white spaces */
31 while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisSpace(pStream->zText[0]) ){
32 /* Advance the stream cursor */
33 if( pStream->zText[0] == '\n' ){
34 /* Update line counter */
35 pStream->nLine++;
36 }
37 pStream->zText++;
38 }
39 if( pStream->zText >= pStream->zEnd ){
40 /* End of input reached */
41 return SXERR_EOF;
42 }
43 /* Record token starting position and line */
44 pToken->nLine = pStream->nLine;
45 pToken->pUserData = 0;
46 pStr = &pToken->sData;
47 SyStringInitFromBuf(pStr, pStream->zText, 0);
48 if( pStream->zText[0] >= 0xc0 || SyisAlpha(pStream->zText[0]) || pStream->zText[0] == '_' ){
49 /* The following code fragment is taken verbatim from the xPP source tree.
50 * xPP is a modern embeddable macro processor with advanced features useful for
51 * application seeking for a production quality, ready to use macro processor.
52 * xPP is a widely used library developed and maintened by Symisc Systems.
53 * You can reach the xPP home page by following this link:
54 * http://xpp.symisc.net/
55 */
56 const unsigned char *zIn;
57 sxu32 nKeyword;
58 /* Isolate UTF-8 or alphanumeric stream */
59 if( pStream->zText[0] < 0xc0 ){
60 pStream->zText++;
61 }
62 for(;;){
63 zIn = pStream->zText;
64 if( zIn[0] >= 0xc0 ){
65 zIn++;
66 /* UTF-8 stream */
67 while( zIn < pStream->zEnd && ((zIn[0] & 0xc0) == 0x80) ){
68 zIn++;
69 }
70 }
71 /* Skip alphanumeric stream */
72 while( zIn < pStream->zEnd && zIn[0] < 0xc0 && (SyisAlphaNum(zIn[0]) || zIn[0] == '_') ){
73 zIn++;
74 }
75 if( zIn == pStream->zText ){
76 /* Not an UTF-8 or alphanumeric stream */
77 break;
78 }
79 /* Synchronize pointers */
80 pStream->zText = zIn;
81 }
82 /* Record token length */
83 pStr->nByte = (sxu32)((const char *)pStream->zText-pStr->zString);
84 nKeyword = keywordCode(pStr->zString, (int)pStr->nByte);
85 if( nKeyword != JX9_TK_ID ){
86 /* We are dealing with a keyword [i.e: if, function, CREATE, ...], save the keyword ID */
87 pToken->nType = JX9_TK_KEYWORD;
88 pToken->pUserData = SX_INT_TO_PTR(nKeyword);
89 }else{
90 /* A simple identifier */
91 pToken->nType = JX9_TK_ID;
92 }
93 }else{
94 sxi32 c;
95 /* Non-alpha stream */
96 if( pStream->zText[0] == '#' ||
97 ( pStream->zText[0] == '/' && &pStream->zText[1] < pStream->zEnd && pStream->zText[1] == '/') ){
98 pStream->zText++;
99 /* Inline comments */
100 while( pStream->zText < pStream->zEnd && pStream->zText[0] != '\n' ){
101 pStream->zText++;
102 }
103 /* Tell the upper-layer to ignore this token */
104 return SXERR_CONTINUE;
105 }else if( pStream->zText[0] == '/' && &pStream->zText[1] < pStream->zEnd && pStream->zText[1] == '*' ){
106 pStream->zText += 2;
107 /* Block comment */
108 while( pStream->zText < pStream->zEnd ){
109 if( pStream->zText[0] == '*' ){
110 if( &pStream->zText[1] >= pStream->zEnd || pStream->zText[1] == '/' ){
111 break;
112 }
113 }
114 if( pStream->zText[0] == '\n' ){
115 pStream->nLine++;
116 }
117 pStream->zText++;
118 }
119 pStream->zText += 2;
120 /* Tell the upper-layer to ignore this token */
121 return SXERR_CONTINUE;
122 }else if( SyisDigit(pStream->zText[0]) ){
123 pStream->zText++;
124 /* Decimal digit stream */
125 while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisDigit(pStream->zText[0]) ){
126 pStream->zText++;
127 }
128 /* Mark the token as integer until we encounter a real number */
129 pToken->nType = JX9_TK_INTEGER;
130 if( pStream->zText < pStream->zEnd ){
131 c = pStream->zText[0];
132 if( c == '.' ){
133 /* Real number */
134 pStream->zText++;
135 while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisDigit(pStream->zText[0]) ){
136 pStream->zText++;
137 }
138 if( pStream->zText < pStream->zEnd ){
139 c = pStream->zText[0];
140 if( c=='e' || c=='E' ){
141 pStream->zText++;
142 if( pStream->zText < pStream->zEnd ){
143 c = pStream->zText[0];
144 if( (c =='+' || c=='-') && &pStream->zText[1] < pStream->zEnd &&
145 pStream->zText[1] < 0xc0 && SyisDigit(pStream->zText[1]) ){
146 pStream->zText++;
147 }
148 while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisDigit(pStream->zText[0]) ){
149 pStream->zText++;
150 }
151 }
152 }
153 }
154 pToken->nType = JX9_TK_REAL;
155 }else if( c=='e' || c=='E' ){
156 SXUNUSED(pUserData); /* Prevent compiler warning */
157 SXUNUSED(pCtxData);
158 pStream->zText++;
159 if( pStream->zText < pStream->zEnd ){
160 c = pStream->zText[0];
161 if( (c =='+' || c=='-') && &pStream->zText[1] < pStream->zEnd &&
162 pStream->zText[1] < 0xc0 && SyisDigit(pStream->zText[1]) ){
163 pStream->zText++;
164 }
165 while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisDigit(pStream->zText[0]) ){
166 pStream->zText++;
167 }
168 }
169 pToken->nType = JX9_TK_REAL;
170 }else if( c == 'x' || c == 'X' ){
171 /* Hex digit stream */
172 pStream->zText++;
173 while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisHex(pStream->zText[0]) ){
174 pStream->zText++;
175 }
176 }else if(c == 'b' || c == 'B' ){
177 /* Binary digit stream */
178 pStream->zText++;
179 while( pStream->zText < pStream->zEnd && (pStream->zText[0] == '0' || pStream->zText[0] == '1') ){
180 pStream->zText++;
181 }
182 }
183 }
184 /* Record token length */
185 pStr->nByte = (sxu32)((const char *)pStream->zText-pStr->zString);
186 return SXRET_OK;
187 }
188 c = pStream->zText[0];
189 pStream->zText++; /* Advance the stream cursor */
190 /* Assume we are dealing with an operator*/
191 pToken->nType = JX9_TK_OP;
192 switch(c){
193 case '$': pToken->nType = JX9_TK_DOLLAR; break;
194 case '{': pToken->nType = JX9_TK_OCB; break;
195 case '}': pToken->nType = JX9_TK_CCB; break;
196 case '(': pToken->nType = JX9_TK_LPAREN; break;
197 case '[': pToken->nType |= JX9_TK_OSB; break; /* Bitwise operation here, since the square bracket token '['
198 * is a potential operator [i.e: subscripting] */
199 case ']': pToken->nType = JX9_TK_CSB; break;
200 case ')': {
201 SySet *pTokSet = pStream->pSet;
202 /* Assemble type cast operators [i.e: (int), (float), (bool)...] */
203 if( pTokSet->nUsed >= 2 ){
204 SyToken *pTmp;
205 /* Peek the last recongnized token */
206 pTmp = (SyToken *)SySetPeek(pTokSet);
207 if( pTmp->nType & JX9_TK_KEYWORD ){
208 sxi32 nID = SX_PTR_TO_INT(pTmp->pUserData);
209 if( (sxu32)nID & (JX9_TKWRD_INT|JX9_TKWRD_FLOAT|JX9_TKWRD_STRING|JX9_TKWRD_BOOL) ){
210 pTmp = (SyToken *)SySetAt(pTokSet, pTokSet->nUsed - 2);
211 if( pTmp->nType & JX9_TK_LPAREN ){
212 /* Merge the three tokens '(' 'TYPE' ')' into a single one */
213 const char * zTypeCast = "(int)";
214 if( nID & JX9_TKWRD_FLOAT ){
215 zTypeCast = "(float)";
216 }else if( nID & JX9_TKWRD_BOOL ){
217 zTypeCast = "(bool)";
218 }else if( nID & JX9_TKWRD_STRING ){
219 zTypeCast = "(string)";
220 }
221 /* Reflect the change */
222 pToken->nType = JX9_TK_OP;
223 SyStringInitFromBuf(&pToken->sData, zTypeCast, SyStrlen(zTypeCast));
224 /* Save the instance associated with the type cast operator */
225 pToken->pUserData = (void *)jx9ExprExtractOperator(&pToken->sData, 0);
226 /* Remove the two previous tokens */
227 pTokSet->nUsed -= 2;
228 return SXRET_OK;
229 }
230 }
231 }
232 }
233 pToken->nType = JX9_TK_RPAREN;
234 break;
235 }
236 case '\'':{
237 /* Single quoted string */
238 pStr->zString++;
239 while( pStream->zText < pStream->zEnd ){
240 if( pStream->zText[0] == '\'' ){
241 if( pStream->zText[-1] != '\\' ){
242 break;
243 }else{
244 const unsigned char *zPtr = &pStream->zText[-2];
245 sxi32 i = 1;
246 while( zPtr > pStream->zInput && zPtr[0] == '\\' ){
247 zPtr--;
248 i++;
249 }
250 if((i&1)==0){
251 break;
252 }
253 }
254 }
255 if( pStream->zText[0] == '\n' ){
256 pStream->nLine++;
257 }
258 pStream->zText++;
259 }
260 /* Record token length and type */
261 pStr->nByte = (sxu32)((const char *)pStream->zText-pStr->zString);
262 pToken->nType = JX9_TK_SSTR;
263 /* Jump the trailing single quote */
264 pStream->zText++;
265 return SXRET_OK;
266 }
267 case '"':{
268 sxi32 iNest;
269 /* Double quoted string */
270 pStr->zString++;
271 while( pStream->zText < pStream->zEnd ){
272 if( pStream->zText[0] == '{' && &pStream->zText[1] < pStream->zEnd && pStream->zText[1] == '$'){
273 iNest = 1;
274 pStream->zText++;
275 /* TICKET 1433-40: Hnadle braces'{}' in double quoted string where everything is allowed */
276 while(pStream->zText < pStream->zEnd ){
277 if( pStream->zText[0] == '{' ){
278 iNest++;
279 }else if (pStream->zText[0] == '}' ){
280 iNest--;
281 if( iNest <= 0 ){
282 pStream->zText++;
283 break;
284 }
285 }else if( pStream->zText[0] == '\n' ){
286 pStream->nLine++;
287 }
288 pStream->zText++;
289 }
290 if( pStream->zText >= pStream->zEnd ){
291 break;
292 }
293 }
294 if( pStream->zText[0] == '"' ){
295 if( pStream->zText[-1] != '\\' ){
296 break;
297 }else{
298 const unsigned char *zPtr = &pStream->zText[-2];
299 sxi32 i = 1;
300 while( zPtr > pStream->zInput && zPtr[0] == '\\' ){
301 zPtr--;
302 i++;
303 }
304 if((i&1)==0){
305 break;
306 }
307 }
308 }
309 if( pStream->zText[0] == '\n' ){
310 pStream->nLine++;
311 }
312 pStream->zText++;
313 }
314 /* Record token length and type */
315 pStr->nByte = (sxu32)((const char *)pStream->zText-pStr->zString);
316 pToken->nType = JX9_TK_DSTR;
317 /* Jump the trailing quote */
318 pStream->zText++;
319 return SXRET_OK;
320 }
321 case ':':
322 pToken->nType = JX9_TK_COLON; /* Single colon */
323 break;
324 case ',': pToken->nType |= JX9_TK_COMMA; break; /* Comma is also an operator */
325 case ';': pToken->nType = JX9_TK_SEMI; break;
326 /* Handle combined operators [i.e: +=, ===, !=== ...] */
327 case '=':
328 pToken->nType |= JX9_TK_EQUAL;
329 if( pStream->zText < pStream->zEnd ){
330 if( pStream->zText[0] == '=' ){
331 pToken->nType &= ~JX9_TK_EQUAL;
332 /* Current operator: == */
333 pStream->zText++;
334 if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
335 /* Current operator: === */
336 pStream->zText++;
337 }
338 }
339 }
340 break;
341 case '!':
342 if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
343 /* Current operator: != */
344 pStream->zText++;
345 if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
346 /* Current operator: !== */
347 pStream->zText++;
348 }
349 }
350 break;
351 case '&':
352 pToken->nType |= JX9_TK_AMPER;
353 if( pStream->zText < pStream->zEnd ){
354 if( pStream->zText[0] == '&' ){
355 pToken->nType &= ~JX9_TK_AMPER;
356 /* Current operator: && */
357 pStream->zText++;
358 }else if( pStream->zText[0] == '=' ){
359 pToken->nType &= ~JX9_TK_AMPER;
360 /* Current operator: &= */
361 pStream->zText++;
362 }
363 }
364 case '.':
365 if( pStream->zText < pStream->zEnd && (pStream->zText[0] == '.' || pStream->zText[0] == '=') ){
366 /* Concatenation operator: '..' or '.=' */
367 pStream->zText++;
368 }
369 break;
370 case '|':
371 if( pStream->zText < pStream->zEnd ){
372 if( pStream->zText[0] == '|' ){
373 /* Current operator: || */
374 pStream->zText++;
375 }else if( pStream->zText[0] == '=' ){
376 /* Current operator: |= */
377 pStream->zText++;
378 }
379 }
380 break;
381 case '+':
382 if( pStream->zText < pStream->zEnd ){
383 if( pStream->zText[0] == '+' ){
384 /* Current operator: ++ */
385 pStream->zText++;
386 }else if( pStream->zText[0] == '=' ){
387 /* Current operator: += */
388 pStream->zText++;
389 }
390 }
391 break;
392 case '-':
393 if( pStream->zText < pStream->zEnd ){
394 if( pStream->zText[0] == '-' ){
395 /* Current operator: -- */
396 pStream->zText++;
397 }else if( pStream->zText[0] == '=' ){
398 /* Current operator: -= */
399 pStream->zText++;
400 }else if( pStream->zText[0] == '>' ){
401 /* Current operator: -> */
402 pStream->zText++;
403 }
404 }
405 break;
406 case '*':
407 if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
408 /* Current operator: *= */
409 pStream->zText++;
410 }
411 break;
412 case '/':
413 if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
414 /* Current operator: /= */
415 pStream->zText++;
416 }
417 break;
418 case '%':
419 if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
420 /* Current operator: %= */
421 pStream->zText++;
422 }
423 break;
424 case '^':
425 if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
426 /* Current operator: ^= */
427 pStream->zText++;
428 }
429 break;
430 case '<':
431 if( pStream->zText < pStream->zEnd ){
432 if( pStream->zText[0] == '<' ){
433 /* Current operator: << */
434 pStream->zText++;
435 if( pStream->zText < pStream->zEnd ){
436 if( pStream->zText[0] == '=' ){
437 /* Current operator: <<= */
438 pStream->zText++;
439 }else if( pStream->zText[0] == '<' ){
440 /* Current Token: <<< */
441 pStream->zText++;
442 /* This may be the beginning of a Heredoc/Nowdoc string, try to delimit it */
443 rc = LexExtractNowdoc(&(*pStream), &(*pToken));
444 if( rc == SXRET_OK ){
445 /* Here/Now doc successfuly extracted */
446 return SXRET_OK;
447 }
448 }
449 }
450 }else if( pStream->zText[0] == '>' ){
451 /* Current operator: <> */
452 pStream->zText++;
453 }else if( pStream->zText[0] == '=' ){
454 /* Current operator: <= */
455 pStream->zText++;
456 }
457 }
458 break;
459 case '>':
460 if( pStream->zText < pStream->zEnd ){
461 if( pStream->zText[0] == '>' ){
462 /* Current operator: >> */
463 pStream->zText++;
464 if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
465 /* Current operator: >>= */
466 pStream->zText++;
467 }
468 }else if( pStream->zText[0] == '=' ){
469 /* Current operator: >= */
470 pStream->zText++;
471 }
472 }
473 break;
474 default:
475 break;
476 }
477 if( pStr->nByte <= 0 ){
478 /* Record token length */
479 pStr->nByte = (sxu32)((const char *)pStream->zText-pStr->zString);
480 }
481 if( pToken->nType & JX9_TK_OP ){
482 const jx9_expr_op *pOp;
483 /* Check if the extracted token is an operator */
484 pOp = jx9ExprExtractOperator(pStr, (SyToken *)SySetPeek(pStream->pSet));
485 if( pOp == 0 ){
486 /* Not an operator */
487 pToken->nType &= ~JX9_TK_OP;
488 if( pToken->nType <= 0 ){
489 pToken->nType = JX9_TK_OTHER;
490 }
491 }else{
492 /* Save the instance associated with this operator for later processing */
493 pToken->pUserData = (void *)pOp;
494 }
495 }
496 }
497 /* Tell the upper-layer to save the extracted token for later processing */
498 return SXRET_OK;
499}
500/***** This file contains automatically generated code ******
501**
502** The code in this file has been automatically generated by
503**
504** $Header: /sqlite/sqlite/tool/mkkeywordhash.c,v 1.38 2011/12/21 01:00:46 <chm@symisc.net> $
505**
506** The code in this file implements a function that determines whether
507** or not a given identifier is really a JX9 keyword. The same thing
508** might be implemented more directly using a hand-written hash table.
509** But by using this automatically generated code, the size of the code
510** is substantially reduced. This is important for embedded applications
511** on platforms with limited memory.
512*/
513/* Hash score: 35 */
514static sxu32 keywordCode(const char *z, int n)
515{
516 /* zText[] encodes 188 bytes of keywords in 128 bytes */
517 /* printegereturnconstaticaselseifloatincludefaultDIEXITcontinue */
518 /* diewhileASPRINTbooleanbreakforeachfunctionimportstringswitch */
519 /* uplink */
520 static const char zText[127] = {
521 'p','r','i','n','t','e','g','e','r','e','t','u','r','n','c','o','n','s',
522 't','a','t','i','c','a','s','e','l','s','e','i','f','l','o','a','t','i',
523 'n','c','l','u','d','e','f','a','u','l','t','D','I','E','X','I','T','c',
524 'o','n','t','i','n','u','e','d','i','e','w','h','i','l','e','A','S','P',
525 'R','I','N','T','b','o','o','l','e','a','n','b','r','e','a','k','f','o',
526 'r','e','a','c','h','f','u','n','c','t','i','o','n','i','m','p','o','r',
527 't','s','t','r','i','n','g','s','w','i','t','c','h','u','p','l','i','n',
528 'k',
529 };
530 static const unsigned char aHash[59] = {
531 0, 0, 0, 0, 15, 0, 30, 0, 0, 2, 19, 18, 0,
532 0, 10, 3, 12, 0, 28, 29, 23, 0, 13, 22, 0, 0,
533 14, 24, 25, 31, 11, 0, 0, 0, 0, 1, 5, 0, 0,
534 20, 0, 27, 9, 0, 0, 0, 8, 0, 0, 26, 6, 0,
535 0, 17, 0, 0, 0, 0, 0,
536 };
537 static const unsigned char aNext[31] = {
538 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
539 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 21, 7,
540 0, 0, 0, 0, 0,
541 };
542 static const unsigned char aLen[31] = {
543 5, 7, 3, 6, 5, 6, 4, 2, 6, 4, 2, 5, 7,
544 7, 3, 4, 8, 3, 5, 2, 5, 4, 7, 5, 3, 7,
545 8, 6, 6, 6, 6,
546 };
547 static const sxu16 aOffset[31] = {
548 0, 2, 2, 8, 14, 17, 22, 23, 25, 25, 29, 30, 35,
549 40, 47, 49, 53, 61, 64, 69, 71, 76, 76, 83, 88, 88,
550 95, 103, 109, 115, 121,
551 };
552 static const sxu32 aCode[31] = {
553 JX9_TKWRD_PRINT, JX9_TKWRD_INT, JX9_TKWRD_INT, JX9_TKWRD_RETURN, JX9_TKWRD_CONST,
554 JX9_TKWRD_STATIC, JX9_TKWRD_CASE, JX9_TKWRD_AS, JX9_TKWRD_ELIF, JX9_TKWRD_ELSE,
555 JX9_TKWRD_IF, JX9_TKWRD_FLOAT, JX9_TKWRD_INCLUDE, JX9_TKWRD_DEFAULT, JX9_TKWRD_DIE,
556 JX9_TKWRD_EXIT, JX9_TKWRD_CONTINUE, JX9_TKWRD_DIE, JX9_TKWRD_WHILE, JX9_TKWRD_AS,
557 JX9_TKWRD_PRINT, JX9_TKWRD_BOOL, JX9_TKWRD_BOOL, JX9_TKWRD_BREAK, JX9_TKWRD_FOR,
558 JX9_TKWRD_FOREACH, JX9_TKWRD_FUNCTION, JX9_TKWRD_IMPORT, JX9_TKWRD_STRING, JX9_TKWRD_SWITCH,
559 JX9_TKWRD_UPLINK,
560 };
561 int h, i;
562 if( n<2 ) return JX9_TK_ID;
563 h = (((int)z[0]*4) ^ ((int)z[n-1]*3) ^ n) % 59;
564 for(i=((int)aHash[h])-1; i>=0; i=((int)aNext[i])-1){
565 if( (int)aLen[i]==n && SyMemcmp(&zText[aOffset[i]],z,n)==0 ){
566 /* JX9_TKWRD_PRINT */
567 /* JX9_TKWRD_INT */
568 /* JX9_TKWRD_INT */
569 /* JX9_TKWRD_RETURN */
570 /* JX9_TKWRD_CONST */
571 /* JX9_TKWRD_STATIC */
572 /* JX9_TKWRD_CASE */
573 /* JX9_TKWRD_AS */
574 /* JX9_TKWRD_ELIF */
575 /* JX9_TKWRD_ELSE */
576 /* JX9_TKWRD_IF */
577 /* JX9_TKWRD_FLOAT */
578 /* JX9_TKWRD_INCLUDE */
579 /* JX9_TKWRD_DEFAULT */
580 /* JX9_TKWRD_DIE */
581 /* JX9_TKWRD_EXIT */
582 /* JX9_TKWRD_CONTINUE */
583 /* JX9_TKWRD_DIE */
584 /* JX9_TKWRD_WHILE */
585 /* JX9_TKWRD_AS */
586 /* JX9_TKWRD_PRINT */
587 /* JX9_TKWRD_BOOL */
588 /* JX9_TKWRD_BOOL */
589 /* JX9_TKWRD_BREAK */
590 /* JX9_TKWRD_FOR */
591 /* JX9_TKWRD_FOREACH */
592 /* JX9_TKWRD_FUNCTION */
593 /* JX9_TKWRD_IMPORT */
594 /* JX9_TKWRD_STRING */
595 /* JX9_TKWRD_SWITCH */
596 /* JX9_TKWRD_UPLINK */
597 return aCode[i];
598 }
599 }
600 return JX9_TK_ID;
601}
602/*
603 * Extract a heredoc/nowdoc text from a raw JX9 input.
604 * According to the JX9 language reference manual:
605 * A third way to delimit strings is the heredoc syntax: <<<. After this operator, an identifier
606 * is provided, then a newline. The string itself follows, and then the same identifier again
607 * to close the quotation.
608 * The closing identifier must begin in the first column of the line. Also, the identifier must
609 * follow the same naming rules as any other label in JX9: it must contain only alphanumeric
610 * characters and underscores, and must start with a non-digit character or underscore.
611 * Heredoc text behaves just like a double-quoted string, without the double quotes.
612 * This means that quotes in a heredoc do not need to be escaped, but the escape codes listed
613 * above can still be used. Variables are expanded, but the same care must be taken when expressing
614 * complex variables inside a heredoc as with strings.
615 * Nowdocs are to single-quoted strings what heredocs are to double-quoted strings.
616 * A nowdoc is specified similarly to a heredoc, but no parsing is done inside a nowdoc.
617 * The construct is ideal for embedding JX9 code or other large blocks of text without the need
618 * for escaping. It shares some features in common with the SGML <![CDATA[ ]]> construct, in that
619 * it declares a block of text which is not for parsing.
620 * A nowdoc is identified with the same <<< sequence used for heredocs, but the identifier which follows
621 * is enclosed in single quotes, e.g. <<<'EOT'. All the rules for heredoc identifiers also apply to nowdoc
622 * identifiers, especially those regarding the appearance of the closing identifier.
623 */
624static sxi32 LexExtractNowdoc(SyStream *pStream, SyToken *pToken)
625{
626 const unsigned char *zIn = pStream->zText;
627 const unsigned char *zEnd = pStream->zEnd;
628 const unsigned char *zPtr;
629 SyString sDelim;
630 SyString sStr;
631 /* Jump leading white spaces */
632 while( zIn < zEnd && zIn[0] < 0xc0 && SyisSpace(zIn[0]) && zIn[0] != '\n' ){
633 zIn++;
634 }
635 if( zIn >= zEnd ){
636 /* A simple symbol, return immediately */
637 return SXERR_CONTINUE;
638 }
639 if( zIn[0] == '\'' || zIn[0] == '"' ){
640 zIn++;
641 }
642 if( zIn[0] < 0xc0 && !SyisAlphaNum(zIn[0]) && zIn[0] != '_' ){
643 /* Invalid delimiter, return immediately */
644 return SXERR_CONTINUE;
645 }
646 /* Isolate the identifier */
647 sDelim.zString = (const char *)zIn;
648 for(;;){
649 zPtr = zIn;
650 /* Skip alphanumeric stream */
651 while( zPtr < zEnd && zPtr[0] < 0xc0 && (SyisAlphaNum(zPtr[0]) || zPtr[0] == '_') ){
652 zPtr++;
653 }
654 if( zPtr < zEnd && zPtr[0] >= 0xc0 ){
655 zPtr++;
656 /* UTF-8 stream */
657 while( zPtr < zEnd && ((zPtr[0] & 0xc0) == 0x80) ){
658 zPtr++;
659 }
660 }
661 if( zPtr == zIn ){
662 /* Not an UTF-8 or alphanumeric stream */
663 break;
664 }
665 /* Synchronize pointers */
666 zIn = zPtr;
667 }
668 /* Get the identifier length */
669 sDelim.nByte = (sxu32)((const char *)zIn-sDelim.zString);
670 if( zIn[0] == '"' || zIn[0] == '\'' ){
671 /* Jump the trailing single quote */
672 zIn++;
673 }
674 /* Jump trailing white spaces */
675 while( zIn < zEnd && zIn[0] < 0xc0 && SyisSpace(zIn[0]) && zIn[0] != '\n' ){
676 zIn++;
677 }
678 if( sDelim.nByte <= 0 || zIn >= zEnd || zIn[0] != '\n' ){
679 /* Invalid syntax */
680 return SXERR_CONTINUE;
681 }
682 pStream->nLine++; /* Increment line counter */
683 zIn++;
684 /* Isolate the delimited string */
685 sStr.zString = (const char *)zIn;
686 /* Go and found the closing delimiter */
687 for(;;){
688 /* Synchronize with the next line */
689 while( zIn < zEnd && zIn[0] != '\n' ){
690 zIn++;
691 }
692 if( zIn >= zEnd ){
693 /* End of the input reached, break immediately */
694 pStream->zText = pStream->zEnd;
695 break;
696 }
697 pStream->nLine++; /* Increment line counter */
698 zIn++;
699 if( (sxu32)(zEnd - zIn) >= sDelim.nByte && SyMemcmp((const void *)sDelim.zString, (const void *)zIn, sDelim.nByte) == 0 ){
700 zPtr = &zIn[sDelim.nByte];
701 while( zPtr < zEnd && zPtr[0] < 0xc0 && SyisSpace(zPtr[0]) && zPtr[0] != '\n' ){
702 zPtr++;
703 }
704 if( zPtr >= zEnd ){
705 /* End of input */
706 pStream->zText = zPtr;
707 break;
708 }
709 if( zPtr[0] == ';' ){
710 const unsigned char *zCur = zPtr;
711 zPtr++;
712 while( zPtr < zEnd && zPtr[0] < 0xc0 && SyisSpace(zPtr[0]) && zPtr[0] != '\n' ){
713 zPtr++;
714 }
715 if( zPtr >= zEnd || zPtr[0] == '\n' ){
716 /* Closing delimiter found, break immediately */
717 pStream->zText = zCur; /* Keep the semi-colon */
718 break;
719 }
720 }else if( zPtr[0] == '\n' ){
721 /* Closing delimiter found, break immediately */
722 pStream->zText = zPtr; /* Synchronize with the stream cursor */
723 break;
724 }
725 /* Synchronize pointers and continue searching */
726 zIn = zPtr;
727 }
728 } /* For(;;) */
729 /* Get the delimited string length */
730 sStr.nByte = (sxu32)((const char *)zIn-sStr.zString);
731 /* Record token type and length */
732 pToken->nType = JX9_TK_NOWDOC;
733 SyStringDupPtr(&pToken->sData, &sStr);
734 /* Remove trailing white spaces */
735 SyStringRightTrim(&pToken->sData);
736 /* All done */
737 return SXRET_OK;
738}
739/*
740 * Tokenize a raw jx9 input.
741 * This is the public tokenizer called by most code generator routines.
742 */
743JX9_PRIVATE sxi32 jx9Tokenize(const char *zInput,sxu32 nLen,SySet *pOut)
744{
745 SyLex sLexer;
746 sxi32 rc;
747 /* Initialize the lexer */
748 rc = SyLexInit(&sLexer, &(*pOut),jx9TokenizeInput,0);
749 if( rc != SXRET_OK ){
750 return rc;
751 }
752 /* Tokenize input */
753 rc = SyLexTokenizeInput(&sLexer, zInput, nLen, 0, 0, 0);
754 /* Release the lexer */
755 SyLexRelease(&sLexer);
756 /* Tokenization result */
757 return rc;
758}