diff --git a/README.md b/README.md index 6d26a73..8732b5e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +# This is a port to LjsJIT https://github.com/mingodad/ljsjit + LPegLJ v1.0 ============= diff --git a/src/lpcap.ljs b/src/lpcap.ljs new file mode 100644 index 0000000..64d5a1c --- /dev/null +++ b/src/lpcap.ljs @@ -0,0 +1,626 @@ +/* +LPEGLJ +lpcap.lua +Capture functions +Copyright (C) 2014 Rostislav Sacek. +based on LPeg v1.0 - PEG pattern matching for Lua +Lua.org & PUC-Rio written by Roberto Ierusalimschy +http://www.inf.puc-rio.br/~roberto/lpeg/ + +** Permission is hereby granted, free of charge, to any person obtaining +** a copy of this software and associated documentation files (the +** "Software"), to deal in the Software without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Software, and to +** permit persons to whom the Software is furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be +** included in all copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+**
+** [ MIT license: http://www.opensource.org/licenses/mit-license.php ]
+--*/
+var ffi = require ("ffi");
+
+// Capture kinds: values stored in the 'kind' field of a capture entry.
+var Cclose = 0;
+var Cposition = 1;
+var Cconst = 2;
+var Cbackref = 3;
+var Carg = 4;
+var Csimple = 5;
+var Ctable = 6;
+var Cfunction = 7;
+var Cquery = 8;
+var Cstring = 9;
+var Cnum = 10;
+var Csubst = 11;
+var Cfold = 12;
+var Cruntime = 13;
+var Cgroup = 14;
+
+// Number of slots allocated by stringcap for getstrcaps (slot 1 is the
+// whole match, the following ones are the nested captures).
+var MAXSTRCAPS = 10;
+
+// Forward declarations: both functions are defined further below but are
+// called (recursively) by functions that appear before them.
+var pushcapture;
+var addonestring;
+
+
+// Goes back in a list of captures looking for an open capture
+// corresponding to a close.
+// 'index' is the position of the close entry in cs.ocap; returns the
+// index of the matching open entry (an entry that is not a close and
+// whose 'siz' is 0). Entries with siz != 0 are stepped over.
+// NOTE: there is no lower-bound check; the loop relies on a matching
+// open entry existing before 'index'.
+
+var function findopen(cs, index) {
+    var n = 0; // number of closes waiting an open
+    while( true ) {
+        --index ;
+        if( cs.ocap[index].kind == Cclose ) {
+            ++n; // one more open to skip
+        } else if( cs.ocap[index].siz == 0 ) {
+            if( n == 0 ) {
+                return index;
+            }
+            --n ;
+        }
+    }
+}
+
+
+// Bounded variant of 'nextcap' that does not modify cs.cap.
+// Starting at cs.cap, skips the current capture (including its nested
+// entries and its close) and returns true if the resulting position is
+// still <= captop. Returns nothing if 'captop' is exceeded while looking
+// for the close or after skipping it.
+
+var function checknextcap(cs, captop) {
+    var cap = cs.cap;
+    // not a single capture? ((cap)->siz != 0)
+    if( cs.ocap[cap].siz == 0 ) {
+        var n = 0; // number of opens waiting a close
+        // look for corresponding close
+        while( true ) {
+            ++cap ;
+            if( cap > captop ) { return; }
+            if( cs.ocap[cap].kind == Cclose ) {
+                --n ;
+                if( n + 1 == 0 ) { // 'n' was 0 before the decrement: this is our close
+                    break;
+                }
+            } else if( cs.ocap[cap].siz == 0 ) {
+                ++n ;
+            }
+        }
+    }
+    ++cap ; // + 1 to skip last close (or entire single capture)
+    if( cap > captop ) { return; }
+    return true;
+}
+
+
+// Go to the next capture: advances cs.cap past the current capture,
+// including all of its nested entries and its close entry.
+
+var function nextcap(cs) {
+    var cap = cs.cap;
+    // not a single capture? ((cap)->siz != 0)
+    if( cs.ocap[cap].siz == 0 ) {
+        var n = 0; // number of opens waiting a close
+        // look for corresponding close
+        while( true ) {
+            ++cap ;
+            if( cs.ocap[cap].kind == Cclose ) {
+                --n ;
+                if( n + 1 == 0 ) { // 'n' was 0 before the decrement: this is our close
+                    break;
+                }
+            } else if( cs.ocap[cap].siz == 0 ) {
+                ++n ;
+            }
+        }
+    }
+    cs.cap = cap + 1; // + 1 to skip last close (or entire single capture)
+}
+
+
+// Push on the Lua stack all values generated by nested captures inside
+// the current capture. Returns number of values pushed.  'addextra'
+// makes it push the entire match after all captured values. The
+// entire match is pushed also if there are no other nested values,
+// so the function never returns zero.
+// In this port the "stack" is the array out.out; out.outindex is the
+// index of the last value stored in it. The matched text is read with
+// cs.s->sub(...) when cs.s is set, otherwise through cs.stream(...).
+
+var function pushnestedvalues(cs, addextra, out, valuetable) {
+    var co = cs.cap;
+    cs.cap = cs.cap + 1;
+    // no nested captures?
+    if( cs.ocap[cs.cap - 1].siz != 0 ) {
+        var st = cs.ocap[co].s;
+        var l = cs.ocap[co].siz - 1;
+        out.outindex = out.outindex + 1;
+        out.out[out.outindex] = cs.s && cs.s->sub(st, st + l - 1) || cs.stream(st, st + l - 1);
+        return 1; // that is it
+    } else {
+        var n = 0;
+        while( cs.ocap[cs.cap].kind != Cclose ) { // repeat for all nested patterns
+            n += pushcapture(cs, out, valuetable);
+        }
+        // need extra?
+        if( addextra || n == 0 ) {
+            var st = cs.ocap[co].s;
+            var l = cs.ocap[cs.cap].s - cs.ocap[co].s;
+            out.outindex = out.outindex + 1;
+            out.out[out.outindex] = cs.s && cs.s->sub(st, st + l - 1) || cs.stream(st, st + l - 1);
+            ++n ;
+        }
+        cs.cap = cs.cap + 1; // skip close entry
+        return n;
+    }
+}
+
+
+// Push only the first value generated by nested captures: pushes them
+// all and then removes every value but the first from 'out'.
+
+var function pushonenestedvalue(cs, out, valuetable) {
+    var n = pushnestedvalues(cs, false, out, valuetable);
+    for( i = n, 2, -1 ) {
+        out.out[out.outindex] = null;
+        out.outindex = out.outindex - 1;
+    }
+}
+
+
+// Try to find a named group capture with the given 'name'; goes
+// backward from 'cap'. Returns the index of the group's entry, or
+// raises "back reference '<name>' not found".
+
+var function findback(cs, cap, name, valuetable) {
+    // repeat until end of list
+    while( cap > 0 ) {
+        --cap ;
+        var vcontinue;
+        if( cs.ocap[cap].kind == Cclose ) {
+            cap = findopen(cs, cap); // skip nested captures
+        } else if( cs.ocap[cap].siz == 0 ) {
+            vcontinue = true; // opening an enclosing capture: skip and get previous
+        }
+        if( ! vcontinue && cs.ocap[cap].kind == Cgroup && cs.ocap[cap].idx != 0 ) {
+            var gname = valuetable[cs.ocap[cap].idx]; // get group name
+            // right group?
+            if( name == gname ) {
+                return cap;
+            }
+        }
+    }
+    error(("back reference '%s' not found")->format(name), 0);
+}
+
+
+// Back-reference capture. Return number of values pushed.
+// cs.cap is temporarily moved to the referenced group and afterwards set
+// to the entry following the back reference.
+
+var function backrefcap(cs, out, valuetable) {
+    var curr = cs.cap;
+    var name = valuetable[cs.ocap[cs.cap].idx]; // reference name
+    cs.cap = findback(cs, curr, name, valuetable); // find corresponding group
+    var n = pushnestedvalues(cs, false, out, valuetable); // push group's values
+    cs.cap = curr + 1;
+    return n;
+}
+
+
+// Table capture: creates a new table and populates it with nested
+// captures. Values of named groups are stored under the group name
+// (first value only); all other values are stored at consecutive
+// integer keys. Always pushes exactly one value (the table).
+
+var function tablecap(cs, out, valuetable) {
+    var n = 0;
+    var t = {};
+    cs.cap = cs.cap + 1;
+    // any nested captures? (when siz != 0 the table stays empty)
+    if( cs.ocap[cs.cap - 1].siz == 0 ) {
+        while( cs.ocap[cs.cap].kind != Cclose ) {
+            var subout = { outindex = 0, out = {} };
+            // named group?
+            if( cs.ocap[cs.cap].kind == Cgroup && cs.ocap[cs.cap].idx != 0 ) {
+                var groupname = valuetable[cs.ocap[cs.cap].idx]; // get group name
+                pushonenestedvalue(cs, subout, valuetable);
+                t[groupname] = subout.out[1];
+            } else {
+                // not a named group
+                var k = pushcapture(cs, subout, valuetable);
+                // store all values into table
+                for( i = 1, subout.outindex ) {
+                    t[i + n] = subout.out[i];
+                }
+                n += k;
+            }
+        }
+        cs.cap = cs.cap + 1; // skip close entry
+    }
+    out.outindex = out.outindex + 1;
+    out.out[out.outindex] = t;
+    return 1; // number of values pushed (only the table)
+}
+
+
+// Table-query capture: uses the first nested value as a key into the
+// table stored in 'valuetable'. Pushes the value found and returns 1,
+// or returns 0 (pushing nothing) when the table has no such entry.
+
+var function querycap(cs, out, valuetable) {
+    // named 'tbl' so that the global 'table' library is not shadowed
+    var tbl = valuetable[cs.ocap[cs.cap].idx];
+    var subout = { outindex = 0, out = {} };
+    pushonenestedvalue(cs, subout, valuetable); // get nested capture
+    var value = tbl[subout.out[1]]; // query cap. value at table (single lookup)
+    if( value != null ) {
+        out.outindex = out.outindex + 1;
+        out.out[out.outindex] = value;
+        return 1;
+    }
+    return 0;
+}
+
+
+// Fold capture: the first value of the first nested capture is the
+// initial accumulator; the folding function is then called with the
+// accumulator and all values of each following nested capture.
+// Raises "no initial value for fold capture" when there is no nested
+// capture or the first one produces no value.
+
+var function foldcap(cs, out, valuetable) {
+    var fce = valuetable[cs.ocap[cs.cap].idx];
+    cs.cap = cs.cap + 1;
+    // no nested captures?
+    // or no nested captures (large subject)?
+    if( cs.ocap[cs.cap - 1].siz != 0 ||
+            cs.ocap[cs.cap].kind == Cclose ) {
+        error("no initial value for fold capture", 0);
+    }
+    var subout = { outindex = 0; out = {} };
+    var n = pushcapture(cs, subout, valuetable); // nested captures with no values?
+    if( n == 0 ) {
+        error("no initial value for fold capture", 0);
+    }
+    var accumulator = subout.out[1]; // leave only one result for accumulator
+    while( cs.ocap[cs.cap].kind != Cclose ) {
+        var xsubout = { outindex = 0; out = {} };
+        pushcapture(cs, xsubout, valuetable); // get next capture's values
+        accumulator = fce(accumulator, unpack(xsubout.out, 1, xsubout.outindex)); // call folding function
+    }
+    cs.cap = cs.cap + 1; // skip close entry
+    out.outindex = out.outindex + 1;
+    out.out[out.outindex] = accumulator;
+    return 1; // only accumulator left on the stack
+}
+
+
+// Returns the number of arguments received followed by a table holding
+// them (the count is needed because the arguments may contain nulls).
+
+var function retcount(...) {
+    return select('#', ...), { ... };
+}
+
+
+// Function capture: calls the function stored in 'valuetable' with all
+// nested values and pushes every value it returns. Returns the number
+// of values pushed.
+
+var function functioncap(cs, out, valuetable) {
+    var fce = valuetable[cs.ocap[cs.cap].idx]; // function to be called
+    var subout = { outindex = 0, out = {} };
+    var n = pushnestedvalues(cs, false, subout, valuetable); // push nested captures
+    var count, ret = retcount(fce(unpack(subout.out, 1, n))); // call function
+    for( i = 1, count ) {
+        out.outindex = out.outindex + 1;
+        out.out[out.outindex] = ret[i];
+    }
+    return count;
+}
+
+
+// Select capture: pushes only the idx-th nested value. An index of 0
+// produces no value; an index larger than the number of nested values
+// raises "no capture '<idx>'".
+
+var function numcap(cs, out, valuetable) {
+    var idx = valuetable[cs.ocap[cs.cap].idx]; // value to select
+    // no values?
+    if( idx == 0 ) {
+        nextcap(cs); // skip entire capture
+        return 0; // no value produced
+    } else {
+        var subout = { outindex = 0, out = {} };
+        var n = pushnestedvalues(cs, false, subout, valuetable);
+        // invalid index?
+        if( n < idx ) {
+            error(("no capture '%d'")->format(idx), 0);
+        } else {
+            out.outindex = out.outindex + 1;
+            out.out[out.outindex] = subout.out[idx]; // get selected capture
+            return 1;
+        }
+    }
+}
+
+
+// Calls a runtime capture. Returns number of captures removed by
+// the call, including the initial Cgroup. (Captures to be added are
+// stored in 'out'.)
+// 'close' is the index of the entry that closes the group and 's' the
+// current subject position; the function is called with the subject
+// (cs.s or cs.stream), 's' and the nested values.
+
+var function runtimecap(cs, close, s, out, valuetable) {
+    var open = findopen(cs, close);
+    assert(cs.ocap[open].kind == Cgroup);
+    cs.ocap[close].kind = Cclose; // closes the group
+    cs.ocap[close].s = s;
+    cs.cap = open;
+    var fce = valuetable[cs.ocap[cs.cap].idx]; // function to be called
+    var subout = { outindex = 0, out = {} };
+    var n = pushnestedvalues(cs, false, subout, valuetable); // push nested captures
+    var count, ret = retcount(fce(cs.s || cs.stream, s, unpack(subout.out, 1, n))); // call dynamic function
+    for( i = 1, count ) {
+        out.outindex = out.outindex + 1;
+        out.out[out.outindex] = ret[i];
+    }
+    return close - open; // number of captures of all kinds removed
+}
+
+// Collect values from current capture into array 'cps'. Current
+// capture must be Cstring (first call) or Csimple (recursive calls).
+// (In first call, fills %0 with whole match for Cstring.)
+// Returns number of elements in the array that were filled.
+// Slots are 1-based: the entry for capture number k lives in cps[k + 1].
+
+var function getstrcaps(cs, cps, n) {
+    var k = n;
+    ++n ;
+    cps[k + 1].isstring = true; // get string value
+    cps[k + 1].startstr = cs.ocap[cs.cap].s; // starts here
+    cs.cap = cs.cap + 1;
+    // nested captures?
+    if( cs.ocap[cs.cap - 1].siz == 0 ) {
+        // traverse them
+        while( cs.ocap[cs.cap].kind != Cclose ) {
+            // too many captures?
+            if( n >= MAXSTRCAPS ) {
+                nextcap(cs); // skip extra captures (will not need them)
+            } else if( cs.ocap[cs.cap].kind == Csimple ) {
+                // string?
+                n = getstrcaps(cs, cps, n); // put info. into array
+            } else {
+                cps[n + 1].isstring = false; // not a string
+                cps[n + 1].origcap = cs.cap; // keep original capture
+                nextcap(cs);
+                ++n ;
+            }
+        }
+        cs.cap = cs.cap + 1; // skip close
+    }
+    cps[k + 1].endstr = cs.ocap[cs.cap - 1].s + cs.ocap[cs.cap - 1].siz - 1; // ends here
+    return n;
+}
+
+
+// String capture: add result to buffer 'b' (instead of pushing
+// it into the stack).
+// The format string comes from 'valuetable'. '%' followed by a digit is
+// replaced by the corresponding nested capture (%0 is the whole match);
+// '%' followed by any other character adds that character literally.
+
+var function stringcap(cs, b, valuetable) {
+    var cps = {};
+    for( i = 1, MAXSTRCAPS ) {
+        cps[i] = {};
+    }
+    var fmt = valuetable[cs.ocap[cs.cap].idx];
+    var n = getstrcaps(cs, cps, 0) - 1; // collect nested captures
+    var i = 1;
+
+    // traverse the format string
+    while( i <= #fmt ) {
+        var c = fmt->sub(i, i);
+        // not an escape?
+        if( c != '%' ) {
+            b[#b + 1] = c; // add it to buffer
+        } else if( fmt->sub(i + 1, i + 1) < '0' || fmt->sub(i + 1, i + 1) > '9' ) {
+            // not followed by a digit?
+            ++i ;
+            b[#b + 1] = fmt->sub(i, i);
+        } else {
+            ++i ;
+            var l = fmt->sub(i, i) - '0'; // capture index
+            if( l > n ) {
+                error(("invalid capture index (%d)")->format(l), 0);
+            } else if( cps[l + 1].isstring ) {
+                var cp = cps[l + 1];
+                // 'endstr' is one past the last character of the capture
+                b[#b + 1] = cs.s && cs.s->sub(cp.startstr, cp.endstr - 1) ||
+                        cs.stream(cp.startstr, cp.endstr - 1);
+            } else {
+                var curr = cs.cap;
+                cs.cap = cps[l + 1].origcap; // go back to evaluate that nested capture
+                if( ! addonestring(cs, b, "capture", valuetable) ) {
+                    error(("no values in capture index %d")->format(l), 0);
+                }
+                cs.cap = curr; // continue from where it stopped
+            }
+        }
+        ++i ;
+    }
+}
+
+
+// Substitution capture: add result to buffer 'b'.
+// The matched text is copied to 'b' with every nested capture that
+// produces a value replaced by (the first of) its values.
+
+var function substcap(cs, b, valuetable) {
+    var curr = cs.ocap[cs.cap].s;
+    // no nested captures?
+    if( cs.ocap[cs.cap].siz != 0 ) {
+        // keep original text
+        var last = curr + cs.ocap[cs.cap].siz - 2;
+        b[#b + 1] = cs.s && cs.s->sub(curr, last) || cs.stream(curr, last);
+    } else {
+        cs.cap = cs.cap + 1; // skip open entry
+        // traverse nested captures
+        while( cs.ocap[cs.cap].kind != Cclose ) {
+            var next = cs.ocap[cs.cap].s;
+            b[#b + 1] = cs.s && cs.s->sub(curr, next - 1) ||
+                    cs.stream(curr, next - 1); // add text up to capture
+            if( addonestring(cs, b, "replacement", valuetable) ) {
+                curr = cs.ocap[cs.cap - 1].s + cs.ocap[cs.cap - 1].siz - 1; // continue after match
+            } else {
+                // no capture value
+                curr = next; // keep original text in final result
+            }
+        }
+        var stop = cs.ocap[cs.cap].s - 1;
+        b[#b + 1] = cs.s && cs.s->sub(curr, stop) || cs.stream(curr, stop); // add last piece of text
+    }
+    cs.cap = cs.cap + 1; // go to next capture
+}
+
+
+// Evaluates a capture and adds its first value to buffer 'b'; returns
+// whether there was a value (a number > 0, or nothing when the capture
+// produced no value). 'what' is only used in the error message raised
+// when the value is neither a string nor a number.
+
+function addonestring(cs, b, what, valuetable) {
+    var tag = cs.ocap[cs.cap].kind;
+    if( tag == Cstring ) {
+        stringcap(cs, b, valuetable); // add capture directly to buffer
+        return 1;
+    } else if( tag == Csubst ) {
+        substcap(cs, b, valuetable); // add capture directly to buffer
+        return 1;
+    } else {
+        var subout = { outindex = 0, out = {} };
+        var n = pushcapture(cs, subout, valuetable);
+        if( n > 0 ) {
+            if( type(subout.out[1]) != 'string' && type(subout.out[1]) != 'number' ) {
+                error(("invalid %s value (a %s)")->format(what, type(subout.out[1])), 0);
+            }
+            b[#b + 1] = subout.out[1];
+            return n;
+        }
+    }
+}
+
+
+// Push all values of the current capture into the stack; returns
+// number of values pushed. Dispatches on the kind of the entry at
+// cs.cap and leaves cs.cap on the entry following the capture.
+
+function pushcapture(cs, out, valuetable) {
+    // named 'kind' so that the global function 'type' is not shadowed
+    var kind = cs.ocap[cs.cap].kind;
+    if( kind == Cposition ) {
+        out.outindex = out.outindex + 1;
+        out.out[out.outindex] = cs.ocap[cs.cap].s;
+        cs.cap = cs.cap + 1;
+        return 1;
+    } else if( kind == Cconst ) {
+        out.outindex = out.outindex + 1;
+        out.out[out.outindex] = valuetable[cs.ocap[cs.cap].idx];
+        cs.cap = cs.cap + 1;
+        return 1;
+    } else if( kind == Carg ) {
+        var arg = valuetable[cs.ocap[cs.cap].idx];
+        cs.cap = cs.cap + 1;
+        if( arg > cs.ptopcount ) {
+            error(("reference to absent extra argument #%d")->format(arg), 0);
+        }
+        out.outindex = out.outindex + 1;
+        out.out[out.outindex] = cs.ptop[arg];
+        return 1;
+    } else if( kind == Csimple ) {
+        var k = pushnestedvalues(cs, true, out, valuetable);
+        var index = out.outindex;
+        // the whole match was pushed last; move it in front of the nested values
+        table.insert(out.out, index - k + 1, out.out[index]);
+        // drop the copy that the insertion shifted one slot past 'outindex'
+        // (the original cleared out[index + 1], a field of the wrapper table)
+        out.out[index + 1] = null;
+        return k;
+    } else if( kind == Cruntime ) {
+        out.outindex = out.outindex + 1;
+        out.out[out.outindex] = valuetable[cs.ocap[cs.cap].idx];
+        cs.cap = cs.cap + 1;
+        return 1;
+    } else if( kind == Cstring ) {
+        var b = {};
+        stringcap(cs, b, valuetable);
+        out.outindex = out.outindex + 1;
+        out.out[out.outindex] = table.concat(b);
+        return 1;
+    } else if( kind == Csubst ) {
+        var b = {};
+        substcap(cs, b, valuetable);
+        out.outindex = out.outindex + 1;
+        out.out[out.outindex] = table.concat(b);
+        return 1;
+    } else if( kind == Cgroup ) {
+        // anonymous group?
+        if( cs.ocap[cs.cap].idx == 0 ) {
+            return pushnestedvalues(cs, false, out, valuetable); // add all nested values
+        } else {
+            // named group: add no values
+            nextcap(cs); // skip capture
+            return 0;
+        }
+    } else if( kind == Cbackref ) {
+        return backrefcap(cs, out, valuetable);
+    } else if( kind == Ctable ) {
+        return tablecap(cs, out, valuetable);
+    } else if( kind == Cfunction ) {
+        return functioncap(cs, out, valuetable);
+    } else if( kind == Cnum ) {
+        return numcap(cs, out, valuetable);
+    } else if( kind == Cquery ) {
+        return querycap(cs, out, valuetable);
+    } else if( kind == Cfold ) {
+        return foldcap(cs, out, valuetable);
+    } else {
+        assert(false);
+    }
+}
+
+
+// Prepare a CapState structure and traverse the entire list of
+// captures pushing its results. 's' is the subject string ('stream'
+// is used instead when 's' is not set), 'r' is the final position of
+// the match, and the extra arguments (...) are the values made
+// available to argument captures.
+// Returns all captured values. (If the list produces no
+// results, returns the final position of the match.)
+
+var function getcaptures(capture, s, stream, r, valuetable, ...) {
+    var n = 0;
+    var cs = { cap = 0 };
+    var out = { outindex = 0; out = {} };
+    // is there any capture?
+    if( capture[cs.cap].kind != Cclose ) {
+        cs.ocap = capture;
+        cs.s = s;
+        cs.stream = stream;
+        cs.ptopcount, cs.ptop = retcount(...);
+        do { // collect their values
+            n += pushcapture(cs, out, valuetable);
+        } while(!( cs.ocap[cs.cap].kind == Cclose) );
+    }
+    // no capture values?
+    if( n == 0 ) {
+        if( ! r ) {
+            return;
+        } else {
+            return r;
+        }
+    }
+    assert(out.outindex < 7998, "(too many captures)");
+    return unpack(out.out, 1, out.outindex);
+}
+
+// Evaluates the captures stored in capture[min .. max) and, unless they
+// must be kept, removes the evaluated entries from the array by moving
+// the following entries (up to 'captop') down over them.
+// An entry is kept when 'notdelete' is set or when it is a named group
+// whose 'candelete' field is 0.
+// Returns the number of entries removed, the array of values and the
+// number of values in it.
+// NOTE(review): 'start' begins at 0 while cs.cap begins at 'min' -
+// confirm that callers rely on this (e.g. always pass min == 0).
+
+var function getcapturesruntime(capture, s, stream, notdelete, min, max, captop, valuetable, ...) {
+    var n = 0;
+    var cs = { cap = min };
+    var out = { outindex = 0; out = {} };
+    cs.ocap = capture;
+    cs.s = s;
+    cs.stream = stream;
+    cs.ptopcount, cs.ptop = retcount(...);
+    var start = 0;
+    do { // collect their values
+        if( ! checknextcap(cs, max) ) { break; } // capture is not complete yet
+        var xnotdelete = notdelete || capture[cs.cap].kind == Cgroup && capture[cs.cap].idx != 0 && capture[cs.cap].candelete == 0;
+        pushcapture(cs, out, valuetable);
+        if( xnotdelete ) {
+            start = cs.cap;
+        } else {
+            n = n + cs.cap - start;
+            // entries are copied one by one in ascending order, so the
+            // overlapping source and destination ranges are handled correctly
+            for( i = 0, captop - cs.cap - 1 ) {
+                ffi.copy(capture + start + i, capture + cs.cap + i, ffi.sizeof('CAPTURE'));
+            }
+            max = max - (cs.cap - start);
+            captop = captop - (cs.cap - start);
+            cs.cap = start;
+        }
+    } while(!( cs.cap == max) );
+    assert(out.outindex < 7998, "(too many captures)");
+    return n, out.out, out.outindex;
+}
+
+return {
+    getcaptures = getcaptures,
+    runtimecap = runtimecap,
+    getcapturesruntime = getcapturesruntime,
+};
+
diff --git a/src/lpcode.ljs b/src/lpcode.ljs
new file mode 100644
index 0000000..8e9333f
--- /dev/null
+++ b/src/lpcode.ljs
@@ -0,0 +1,1061 @@
+/*
+LPEGLJ
+lpcode.lua
+Generating code from tree
+Copyright (C) 2014 Rostislav Sacek.
+based on LPeg v1.0 - PEG pattern matching for Lua
+Lua.org & PUC-Rio written by Roberto Ierusalimschy
+http://www.inf.puc-rio.br/~roberto/lpeg/
+
+** Permission is hereby granted, free of charge, to any person obtaining
+** a copy of this software and associated documentation files (the
+** "Software"), to deal in the Software without restriction, including
+** without limitation the rights to use, copy, modify, merge, publish,
+** distribute, sublicense, and/or sell copies of the Software, and to
+** permit persons to whom the Software is furnished to do so, subject to
+** the following conditions:
+**
+** The above copyright notice and this permission notice shall be
+** included in all copies or substantial portions of the Software.
+**
+** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+**
+** [ MIT license: http://www.opensource.org/licenses/mit-license.php ]
+--*/
+var ffi = require ("ffi");
+require ("lpvm");
+
+// local aliases for the bit operations used throughout this file
+var band, bor, bnot, rshift, lshift = bit.band, bit.bor, bit.bnot, bit.rshift, bit.lshift;
+
+// Tree node tags (values of the 'tag' field of a pattern element)
+var TChar = 0;
+var TSet = 1;
+var TAny = 2; // standard PEG elements
+var TTrue = 3;
+var TFalse = 4;
+var TRep = 5;
+var TSeq = 6;
+var TChoice = 7;
+var TNot = 8;
+var TAnd = 9;
+var TCall = 10;
+var TOpenCall = 11;
+var TRule = 12; // sib1 is rule's pattern, sib2 is 'next' rule
+var TGrammar = 13; // sib1 is initial (and first) rule
+var TBehind = 14; // match behind
+var TCapture = 15; // regular capture
+var TRunTime = 16; // run-time capture
+
+
+// Virtual machine instructions
+var IAny = 0; // if no char, fail
+var IChar = 1; // if char != val, fail
+var ISet = 2; // if char not in val, fail
+var ITestAny = 3; // if no char, jump to 'offset'
+var ITestChar = 4; // if char != val, jump to 'offset'
+var ITestSet = 5; // if char not in val, jump to 'offset'
+var ISpan = 6; // read a span of chars in val
+var IBehind = 7; // walk back 'val' characters (fail if not possible)
+var IRet = 8; // return from a rule
+var IEnd = 9; // end of pattern
+var IChoice = 10; // stack a choice; next fail will jump to 'offset'
+var IJmp = 11; // jump to 'offset'
+var ICall = 12; // call rule at 'offset'
+var IOpenCall = 13; // call rule number 'offset' (must be closed to a ICall)
+var ICommit = 14; // pop choice and jump to 'offset'
+var IPartialCommit = 15; // update top choice to current position and jump
+var IBackCommit = 16; // "fails" but jump to its own 'offset'
+var IFailTwice = 17; // pop one choice and then fail
+var IFail = 18; // go back to saved state on choice and jump to saved offset
+var IGiveup = 19; // internal use
+var IFullCapture = 20; // complete capture of last 'off' chars
+var IOpenCapture = 21; // start a capture
+var ICloseCapture = 22;
+var ICloseRunTime = 23;
+
+
+// Capture kinds
+var Cclose = 0;
+var Cposition = 1;
+var Cconst = 2;
+var Cbackref = 3;
+var Carg = 4;
+var Csimple = 5;
+var Ctable = 6;
+var Cfunction = 7;
+var Cquery = 8;
+var Cstring = 9;
+var Cnum = 10;
+var Csubst = 11;
+var Cfold = 12;
+var Cruntime = 13;
+var Cgroup = 14;
+
+
+// Predicates accepted by 'checkaux'
+var PEnullable = 0;
+var PEnofail = 1;
+var RuleLR = 0x10000;
+var NOINST = -2;
+
+
+var MAXBEHINDPREDICATE = 255;
+var MAXRULES = 200;
+var MAXOFF = 0xF;
+
+// number of siblings for each tree
+// (the array is 1-based, so it is indexed with tag + 1)
+var numsiblings = {
+    0, 0, 0, // char, set, any
+    0, 0, // true, false
+    1, // rep
+    2, 2, // seq, choice
+    1, 1, // not, and
+    0, 0, 2, 1, // call, opencall, rule, grammar
+    1, // behind
+    1, 1 // capture, runtime capture
+};
+
+
+
+var patternelement = ffi.typeof('PATTERN_ELEMENT');
+var pattern = ffi.typeof('PATTERN');
+// a charset is stored as 8 words of 32 bits (256 bits, one per character)
+var settype = ffi.typeof('int32_t[8]');
+var fullset = settype(-1, -1, -1, -1, -1, -1, -1, -1);
+
+// {======================================================
+// Analysis and some optimizations
+// =======================================================
+
+var codegen;
+
+
+// Check whether a charset is empty (IFail), singleton (IChar),
+// full (IAny), or none of those (ISet).
+// Returns the instruction code and, for IFail/IChar/IAny, a second
+// value: the single character for IChar, 0 otherwise.
+// 'count' holds the number of characters known to be in the set so far.
+
+var function charsettype(cs) {
+    var count = 0;
+    var candidate = -1; // candidate position (word index) for a char
+    for( i = 0, 8 - 1 ) {
+        var b = cs[i];
+        if( b == 0 ) {
+            if( count > 1 ) {
+                return ISet; // else set is still empty
+            }
+        } else if( b == -1 ) { // all 32 bits of the word are set
+            if( count < (i * 32) ) {
+                return ISet;
+            } else {
+                count += 32; // set is still full
+            }
+        } else if( band(b, (b - 1)) == 0 ) { // word has only one bit?
+            if( count > 0 ) {
+                return ISet; // set is neither full nor empty
+            } else {
+                // set has only one char till now; track it
+                ++count ;
+                candidate = i;
+            }
+        } else {
+            return ISet; // word is neither empty, full, nor singleton
+        }
+    }
+    if( count == 0 ) {
+        return IFail, 0; // empty set
+    } else if( count == 1 ) {
+        // singleton; find character bit inside word
+        var b = cs[candidate];
+        var c = candidate * 32;
+        for( i = 1, 32 ) {
+            if( b == 1 ) {
+                c = c + i - 1;
+                break;
+            }
+            b = rshift(b, 1);
+        }
+        return IChar, c;
+    } else if( count == 256 ) {
+        return IAny, 0; // full set
+    } else {
+        assert(false); // should have returned by now
+    }
+}
+
+
+// A few basic operations on Charsets
+
+// Complements 'cs' in place.
+
+var function cs_complement(cs) {
+    for( i = 0, 8 - 1 ) {
+        cs[i] = bnot(cs[i]);
+    }
+}
+
+
+// Returns true if both sets hold the same characters, nothing otherwise.
+
+var function cs_equal(cs1, cs2) {
+    for( i = 0, 8 - 1 ) {
+        if( cs1[i] != cs2[i] ) {
+            return;
+        }
+    }
+    return true;
+}
+
+
+// computes whether sets st1 and st2 are disjoint
+// (returns true if they are, nothing otherwise)
+
+var function cs_disjoint(st1, st2) {
+    for( i = 0, 8 - 1 ) {
+        if( band(st1[i], st2[i]) != 0 ) {
+            return;
+        }
+    }
+    return true;
+}
+
+
+// Convert a 'char' pattern (TSet, TChar, TAny) to a charset.
+// Returns a newly created charset, or nothing when the node at 'index'
+// has any other tag.
+
+var function tocharset(tree, index, valuetable) {
+    var val = settype();
+    if( tree.p[index].tag == TSet ) {
+        ffi.copy(val, valuetable[tree.p[index].val], ffi.sizeof(val));
+        return val;
+    } else if( tree.p[index].tag == TChar ) {
+        var b = tree.p[index].val;
+        // only one char
+        // add that one (word b / 32, bit b % 32)
+        val[rshift(b, 5)] = lshift(1, band(b, 31));
+        return val;
+    } else if( tree.p[index].tag == TAny ) {
+        ffi.fill(val, ffi.sizeof(val), 0xff);
+        return val;
+    }
+}
+
+
+// checks whether a pattern has captures
+// (returns true if it has, nothing otherwise)
+
+var function hascaptures(tree, index) {
+    if( tree.p[index].tag == TCapture || tree.p[index].tag == TRunTime ) {
+        return true;
+    } else if( tree.p[index].tag == TCall ) {
+        return hascaptures(tree, index + tree.p[index].ps);
+    } else {
+        var ns = numsiblings[tree.p[index].tag + 1];
+        if( ns == 0 ) {
+            return;
+        } else if( ns == 1 ) {
+            return hascaptures(tree, index + 1);
+        } else if( ns == 2 ) {
+            if( hascaptures(tree, index + 1) ) {
+                return true;
+            } else {
+                return hascaptures(tree, index + tree.p[index].ps);
+            }
+        } else {
+            assert(false);
+        }
+    }
+}
+
+
+// Checks how a pattern behaves regarding the empty string,
+// in one of two different ways:
+// A pattern is *nullable* if it can match without consuming any character;
+// A pattern is *nofail* if it never fails for any string
+// (including the empty string).
+// The difference is only for predicates; for patterns without
+// predicates, the two properties are equivalent.
+// (With predicates, &'a' is nullable but not nofail. Of course,
+// nofail => nullable.)
+// These functions are all conservative in the following way:
+// p is nullable => nullable(p)
+// nofail(p) => p cannot fail
+// (The function assumes that TOpenCall and TRunTime are not nullable:
+// TOpenCall must be checked again when the grammar is fixed;
+// TRunTime is an arbitrary choice.)
+// 'pred' is PEnullable or PEnofail. 'lrcall' (optional) records the
+// left-recursive rules already visited so that they are followed only
+// once. Returns true when the property holds, nothing otherwise.
+
+var function checkaux(tree, pred, index, lrcall) {
+    lrcall = lrcall || {};
+    var tag = tree.p[index].tag;
+    if( tag == TChar || tag == TSet || tag == TAny ||
+            tag == TFalse || tag == TOpenCall ) {
+        return; // not nullable
+    } else if( tag == TRep || tag == TTrue ) {
+        return true; // no fail
+    } else if( tag == TNot || tag == TBehind ) {
+        // can match empty, but may fail
+        if( pred == PEnofail ) {
+            return;
+        } else {
+            return true; // PEnullable
+        }
+    } else if( tag == TAnd ) {
+        // can match empty; fail iff body does
+        if( pred == PEnullable ) {
+            return true;
+        } else {
+            return checkaux(tree, pred, index + 1, lrcall);
+        }
+    } else if( tag == TRunTime ) {
+        // can fail; match empty iff body does
+        if( pred == PEnofail ) {
+            return;
+        } else {
+            return checkaux(tree, pred, index + 1, lrcall);
+        }
+    } else if( tag == TSeq ) {
+        if( ! checkaux(tree, pred, index + 1, lrcall) ) {
+            return;
+        } else {
+            return checkaux(tree, pred, index + tree.p[index].ps, lrcall);
+        }
+    } else if( tag == TChoice ) {
+        if( checkaux(tree, pred, index + tree.p[index].ps, lrcall) ) {
+            return true;
+        } else {
+            return checkaux(tree, pred, index + 1, lrcall);
+        }
+    } else if( tag == TCapture || tag == TGrammar || tag == TRule ) {
+        return checkaux(tree, pred, index + 1, lrcall);
+    } else if( tag == TCall ) {
+        // left recursive rule
+        if( band(tree.p[index].cap, 0xffff) != 0 ) {
+            var lr = index + tree.p[index].ps;
+            if( lrcall[lr] ) {
+                return; // rule already visited: stop the recursion
+            }
+            lrcall[lr] = true;
+        }
+        return checkaux(tree, pred, index + tree.p[index].ps, lrcall);
+    } else {
+        assert(false);
+    }
+}
+
+
+// number of characters to match a pattern (or -1 if variable)
+// ('count' avoids infinite loops for grammars)
+// 'len' is the length accumulated so far; the result includes it.
+
+var function fixedlenx(tree, count, len, index) {
+    var tag = tree.p[index].tag;
+    if( tag == TChar || tag == TSet || tag == TAny ) {
+        return len + 1;
+    } else if( tag == TFalse || tag == TTrue || tag == TNot || tag == TAnd || tag == TBehind ) {
+        return len;
+    } else if( tag == TRep || tag == TRunTime || tag == TOpenCall ) {
+        return -1;
+    } else if( tag == TCapture || tag == TRule || tag == TGrammar ) {
+        return fixedlenx(tree, count, len, index + 1);
+    } else if( tag == TCall ) {
+        if( count >= MAXRULES ) {
+            return -1; // may be a loop
+        } else {
+            return fixedlenx(tree, count + 1, len, index + tree.p[index].ps);
+        }
+    } else if( tag == TSeq ) {
+        len = fixedlenx(tree, count, len, index + 1);
+        if( (len < 0) ) {
+            return -1;
+        } else {
+            return fixedlenx(tree, count, len, index + tree.p[index].ps);
+        }
+    } else if( tag == TChoice ) {
+        // both alternatives must have the same fixed length
+        var n1 = fixedlenx(tree, count, len, index + 1);
+        if( n1 < 0 ) { return -1; }
+        var n2 = fixedlenx(tree, count, len, index + tree.p[index].ps);
+        if( n1 == n2 ) {
+            return n1;
+        } else {
+            return -1;
+        }
+    } else {
+        assert(false);
+    }
+}
+
+
+// Computes the 'first set' of a pattern.
+// The result is a conservative approximation:
+// match p ax -> x' for some x ==> a in first(p).
+// match p '' -> '' ==> returns 1.
+// The set 'follow' is the first set of what follows the
+// pattern (full set if nothing follows it)
+// Returns two values: a code and the first set (a new charset).
+// Code 0 means the first set can be used as a test, 1 means the pattern
+// can accept the empty string, and 2 means the pattern has a match-time
+// capture. 'lrcall' (optional) records the left-recursive rules already
+// visited so that they are followed only once.
+
+var function getfirst(tree, follow, index, valuetable, lrcall) {
+    lrcall = lrcall || {};
+    var tag = tree.p[index].tag;
+    if( tag == TChar || tag == TSet || tag == TAny ) {
+        var firstset = tocharset(tree, index, valuetable);
+        return 0, firstset;
+    } else if( tag == TTrue ) {
+        var firstset = settype();
+        ffi.copy(firstset, follow, ffi.sizeof(firstset));
+        return 1, firstset;
+    } else if( tag == TFalse ) {
+        var firstset = settype();
+        return 0, firstset;
+    } else if( tag == TChoice ) {
+        // union of the first sets of both alternatives
+        var e1, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall);
+        var e2, csaux = getfirst(tree, follow, index + tree.p[index].ps, valuetable, lrcall);
+        for( i = 0, 8 - 1 ) {
+            firstset[i] = bor(firstset[i], csaux[i]);
+        }
+        return bor(e1, e2), firstset;
+    } else if( tag == TSeq ) {
+        if( ! checkaux(tree, PEnullable, index + 1) ) {
+            // first pattern is not nullable: the second one has nothing to contribute
+            return getfirst(tree, fullset, index + 1, valuetable, lrcall);
+        } else {
+            // FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl))
+            var e2, csaux = getfirst(tree, follow, index + tree.p[index].ps, valuetable, lrcall);
+            var e1, firstset = getfirst(tree, csaux, index + 1, valuetable, lrcall);
+            if( e1 == 0 ) { // 'e1' ensures that first can be used
+                return 0, firstset;
+            } else if( band(bor(e1, e2), 2) == 2 ) { // one of the children has a matchtime?
+                return 2, firstset; // pattern has a matchtime capture
+            } else {
+                return e2, firstset; // else depends on 'e2'
+            }
+        }
+    } else if( tag == TRep ) {
+        var _, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall);
+        for( i = 0, 8 - 1 ) {
+            firstset[i] = bor(firstset[i], follow[i]);
+        }
+        return 1, firstset; // accept the empty string
+    } else if( tag == TCapture || tag == TGrammar || tag == TRule ) {
+        return getfirst(tree, follow, index + 1, valuetable, lrcall);
+    } else if( tag == TRunTime ) {
+        // function invalidates any follow info.
+        var e, firstset = getfirst(tree, fullset, index + 1, valuetable, lrcall);
+        if( e != 0 ) {
+            return 2, firstset; // function is not "protected"?
+        } else {
+            return 0, firstset; // pattern inside capture ensures first can be used
+        }
+    } else if( tag == TCall ) {
+        // left recursive rule
+        if( band(tree.p[index].cap, 0xffff) != 0 ) {
+            var lr = index + tree.p[index].ps;
+            if( lrcall[lr] ) {
+                return 0, settype(); // rule already visited: contributes an empty set
+            } else {
+                lrcall[lr] = true;
+            }
+        }
+        return getfirst(tree, follow, index + tree.p[index].ps, valuetable, lrcall);
+    } else if( tag == TAnd ) {
+        var e, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall);
+        for( i = 0, 8 - 1 ) {
+            firstset[i] = band(firstset[i], follow[i]);
+        }
+        return e, firstset;
+    } else if( tag == TNot ) {
+        // negation of a plain charset: the first set is its complement
+        var firstset = tocharset(tree, index + 1, valuetable);
+        if( firstset ) {
+            cs_complement(firstset);
+            return 1, firstset;
+        }
+        // otherwise handled like TBehind below
+        var e;
+        e, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall);
+        ffi.copy(firstset, follow, ffi.sizeof(firstset));
+        return bor(e, 1), firstset; // always can accept the empty string
+    } else if( tag == TBehind ) {
+        // instruction gives no new information
+        // call 'getfirst' to check for match-time captures
+        var e, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall);
+        ffi.copy(firstset, follow, ffi.sizeof(firstset));
+        return bor(e, 1), firstset; // always can accept the empty string
+    } else {
+        assert(false);
+    }
+}
+
+
+// If it returns true, then pattern can fail only depending on the next
+// character of the subject
+// (returns nothing otherwise). 'lrcall' (optional) records the
+// left-recursive rules already visited so that they are followed only once.
+
+var function headfail(tree, index, lrcall) {
+    lrcall = lrcall || {};
+    var tag = tree.p[index].tag;
+    if( tag == TChar || tag == TSet || tag == TAny || tag == TFalse ) {
+        return true;
+    } else if( tag == TTrue || tag == TRep || tag == TRunTime || tag == TNot || tag == TBehind ) {
+        return;
+    } else if( tag == TCapture || tag == TGrammar || tag == TRule || tag == TAnd ) {
+        return headfail(tree, index + 1, lrcall);
+    } else if( tag == TCall ) {
+        // left recursive rule
+        if( band(tree.p[index].cap, 0xffff) != 0 ) {
+            var lr = index + tree.p[index].ps;
+            if( lrcall[lr] ) {
+                return true;
+            } else {
+                lrcall[lr] = true;
+            }
+        }
+        return headfail(tree, index + tree.p[index].ps, lrcall);
+    } else if( tag == TSeq ) {
+        // the second pattern must never fail; then only the first one decides
+        if( ! checkaux(tree, PEnofail, index + tree.p[index].ps) ) {
+            return;
+        } else {
+            return headfail(tree, index + 1, lrcall);
+        }
+    } else if( tag == TChoice ) {
+        // both alternatives must be head-fail
+        if( ! headfail(tree, index + 1, lrcall) ) {
+            return;
+        } else {
+            return headfail(tree, index + tree.p[index].ps, lrcall);
+        }
+    } else {
+        assert(false);
+    }
+}
+
+
+// Check whether the code generation for the given tree can benefit
+// from a follow set (to avoid computing the follow set when it is
+// not needed)
+// Returns true if it can, nothing otherwise.
+
+var function needfollow(tree, index) {
+    var tag = tree.p[index].tag;
+    if( tag == TChar || tag == TSet || tag == TAny || tag == TFalse || tag == TTrue || tag == TAnd || tag == TNot ||
+            tag == TRunTime || tag == TGrammar || tag == TCall || tag == TBehind ) {
+        return;
+    } else if( tag == TChoice || tag == TRep ) {
+        return true;
+    } else if( tag == TCapture ) {
+        return needfollow(tree, index + 1);
+    } else if( tag == TSeq ) {
+        return needfollow(tree, index + tree.p[index].ps);
+    } else {
+        assert(false);
+    }
+}
+
+// ======================================================
+
+
+// {======================================================
+// Code generation
+// =======================================================
+
+
+// code generation is recursive; 'opt' indicates that the code is
+// being generated under a 'IChoice' operator jumping to its end.
+// 'tt' points to a previous test protecting this code. 'fl' is
+// the follow set of the pattern.


// Append one instruction (opcode 'op', operand 'val') to 'code',
// growing the buffer first when it is full. Returns the index of the
// new instruction.

var function addinstruction(code, op, val) {
    var slot = code.size;
    if( slot >= code.allocsize ) {
        code->doublesize();
    }
    // fetched only after the buffer may have been reallocated
    var inst = code.p[slot];
    inst.code = op;
    inst.val = val;
    code.size = slot + 1;
    return slot;
}


// Store 'offset' in the instruction at index 'instruction'.

var function setoffset(code, instruction, offset) {
    code.p[instruction].offset = offset;
}


// Add a capture instruction:
// 'op' is the capture instruction; 'cap' the capture kind;
// 'key' the key into ktable; 'aux' is optional offset
// ('aux' is packed above the low 4 bits of the operand; 'key' goes
// into the instruction's offset field). Returns the instruction index.

var function addinstcap(code, op, cap, key, aux) {
    var packed = bor(cap, lshift(aux, 4));
    var pos = addinstruction(code, op, packed);
    code.p[pos].offset = key;
    return pos;
}


// Make the instruction at index 'instruction' jump to 'target' (stored
// as a relative offset). Negative indices mean "no instruction" and
// are ignored.

var function jumptothere(code, instruction, target) {
    if( instruction < 0 ) {
        return;
    }
    setoffset(code, instruction, target - instruction);
}


// Make 'instruction' jump to the position where the next instruction
// will be emitted.

var function jumptohere(code, instruction) {
    var here = code.size;
    jumptothere(code, instruction, here);
}


// Code an IChar instruction, or IAny if there is an equivalent
// test dominating it

var function codechar(code, c, tt) {
    assert(tt != -1);
    var dominated = tt >= 0 && code.p[tt].code == ITestChar && code.p[tt].val == c;
    if( ! dominated ) {
        addinstruction(code, IChar, c);
    } else {
        addinstruction(code, IAny, 0);
    }
}


// Code an ISet instruction
// (the set itself is appended to 'valuetable'; the instruction's
// operand is its position there)

var function coderealcharset(code, cs, valuetable) {
    var slot = #valuetable + 1;
    valuetable[slot] = cs;
    return addinstruction(code, ISet, slot);
}


// code a char set, optimizing unit sets for IChar, "complete"
// sets for IAny, and empty sets for IFail; also use an IAny
// when instruction is dominated by an equivalent test.

// Emit the instruction for char set 'cs'. 'charsettype' classifies the
// set: a unit set is coded through 'codechar', a real set through
// 'coderealcharset', anything else directly with the opcode returned.
// 'tt' is the index of a dominating test instruction (negative when
// there is none); when that test already checked the same set an IAny
// is enough.

var function codecharset(code, cs, tt, valuetable) {
    var op, c = charsettype(cs);
    if( op == IChar ) {
        codechar(code, c, tt);
    } else if( op == ISet ) {
        assert(tt != -1);
        if( tt >= 0 && code.p[tt].code == ITestSet &&
            cs_equal(cs, valuetable[code.p[tt].val]) ) {
            addinstruction(code, IAny, 0);
        } else {
            coderealcharset(code, cs, valuetable);
        }
    } else {
        addinstruction(code, op, c);
    }
}


// code a test set, optimizing unit sets for ITestChar, "complete"
// sets for ITestAny, and empty sets for IJmp (always fails).
// 'e' is non-zero iff test should accept the empty string. (Test
// instructions in the current VM never accept the empty string.)
// Returns the index of the test instruction, or NOINST when no test
// was generated.

var function codetestset(code, cs, e, valuetable) {
    if( e != 0 ) {
        return NOINST; // no test
    } else {
        var pos = code.size;
        // emit the plain instruction first, then turn it into its test form
        codecharset(code, cs, NOINST, valuetable);
        var inst = code.p[pos];
        // dedicated local: the 'code' parameter keeps referring to the pattern
        var op = inst.code;
        if( op == IFail ) {
            inst.code = IJmp; // always jump
        } else if( op == IAny ) {
            inst.code = ITestAny;
        } else if( op == IChar ) {
            inst.code = ITestChar;
        } else if( op == ISet ) {
            inst.code = ITestSet;
        } else {
            assert(false);
        }
        return pos;
    }
}


// Find the final destination of a sequence of jumps

var function finaltarget(code, i) {
    while( code.p[i].code == IJmp ) {
        i += code.p[i].offset;
    }
    return i;
}


// final label (after traversing any jumps)

var function finallabel(code, i) {
    return finaltarget(code, i + code.p[i].offset);
}

// <behind(p)> == behind n; <p>

//   (where n = fixedlen(p))
//
// The look-behind distance is read from the node's 'val' field; the
// IBehind instruction is omitted when it is 0.

var function codebehind(code, tree, index, valuetable) {
    if( tree.p[index].val > 0 ) {
        addinstruction(code, IBehind, tree.p[index].val);
    }
    codegen(code, tree, fullset, false, NOINST, index + 1, valuetable); // NOINST
}


// Choice; optimizations:
// - when p1 is headfail
// - when first(p1) and first(p2) are disjoint; then
// a character not in first(p1) cannot go to p1, and a character
// in first(p1) cannot go to p2 (as it is not in first(p2)).
// (The optimization is not valid if p1 accepts the empty string,
// as then there is no character at all...)
// - when p2 is empty and opt is true; a IPartialCommit can reuse
// the Choice already active in the stack.
//
// 'p1' and 'p2' are the indices in tree.p of the two alternatives;
// 'fl' is the follow set and 'opt' the flag described for 'codegen'.

var function codechoice(code, tree, fl, opt, p1, p2, valuetable) {
    var emptyp2 = tree.p[p2].tag == TTrue;
    var e1, st1 = getfirst(tree, fullset, p1, valuetable);
    var _, st2 = getfirst(tree, fl, p2, valuetable);
    if( headfail(tree, p1) || (e1 == 0 && cs_disjoint(st1, st2)) ) {
        // <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2:
        var test = codetestset(code, st1, 0, valuetable);
        var jmp = NOINST;
        codegen(code, tree, fl, false, test, p1, valuetable);
        if( ! emptyp2 ) {
            jmp = addinstruction(code, IJmp, 0);
        }
        jumptohere(code, test);
        codegen(code, tree, fl, opt, NOINST, p2, valuetable);
        jumptohere(code, jmp);
    } else if( opt && emptyp2 ) {
        // p1? == IPartialCommit; p1
        jumptohere(code, addinstruction(code, IPartialCommit, 0));
        codegen(code, tree, fullset, true, NOINST, p1, valuetable);
    } else {
        // <p1 / p2> ==
        // test(fail(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2:
        var test = codetestset(code, st1, e1, valuetable);
        var pchoice = addinstruction(code, IChoice, 0);
        codegen(code, tree, fullset, emptyp2, test, p1, valuetable);
        var pcommit = addinstruction(code, ICommit, 0);
        jumptohere(code, pchoice);
        jumptohere(code, test);
        codegen(code, tree, fl, opt, NOINST, p2, valuetable);
        jumptohere(code, pcommit);
    }
}


// And predicate
// optimization: fixedlen(p) = n ==> <&p> == <p>

//   ; behind n
// (valid only when 'p' has no captures)
//
// 'index' is the position in tree.p of the pattern under the predicate;
// 'tt' is the dominating test (or NOINST).

var function codeand(code, tree, tt, index, valuetable) {
    var n = fixedlenx(tree, 0, 0, index);
    if( n >= 0 && n <= MAXBEHINDPREDICATE && ! hascaptures(tree, index) ) {
        // match the pattern, then step back over what it consumed
        codegen(code, tree, fullset, false, tt, index, valuetable);
        if( n > 0 ) {
            addinstruction(code, IBehind, n);
        }
    } else {
        // default: Choice L1; p1; BackCommit L2; L1: Fail; L2:
        var pchoice = addinstruction(code, IChoice, 0);
        codegen(code, tree, fullset, false, tt, index, valuetable);
        var pcommit = addinstruction(code, IBackCommit, 0);
        jumptohere(code, pchoice);
        addinstruction(code, IFail, 0);
        jumptohere(code, pcommit);
    }
}


// Captures: if pattern has fixed (and not too big) length, use
// a single IFullCapture instruction after the match; otherwise,
// enclose the pattern with OpenCapture - CloseCapture.
//
// 'index' is the position of the capture node itself; its body is the
// first sibling (index + 1). The capture kind and key are read from the
// node's 'cap' and 'val' fields.

var function codecapture(code, tree, fl, tt, index, valuetable) {
    var len = fixedlenx(tree, 0, 0, index + 1);
    if( len >= 0 && len <= MAXOFF && ! hascaptures(tree, index + 1) ) {
        codegen(code, tree, fl, false, tt, index + 1, valuetable);
        addinstcap(code, IFullCapture, tree.p[index].cap, tree.p[index].val, len);
    } else {
        addinstcap(code, IOpenCapture, tree.p[index].cap, tree.p[index].val, 0);
        codegen(code, tree, fl, false, tt, index + 1, valuetable);
        addinstcap(code, ICloseCapture, Cclose, 0, 0);
    }
}


// Run-time capture: the body is enclosed between an IOpenCapture of
// kind Cgroup (keyed by the node's 'val') and an ICloseRunTime.

var function coderuntime(code, tree, tt, index, valuetable) {
    addinstcap(code, IOpenCapture, Cgroup, tree.p[index].val, 0);
    codegen(code, tree, fullset, false, tt, index + 1, valuetable);
    addinstcap(code, ICloseRunTime, Cclose, 0, 0);
}


// Repetition; optimizations:
// When pattern is a charset, can use special instruction ISpan.
// When pattern is head fail, or if it starts with characters that
// are disjoint from what follows the repetitions, a simple test
// is enough (a fail inside the repetition would backtrack to fail
// again in the following pattern, so there is no need for a choice).
// When 'opt' is true, the repetition can reuse the Choice already
// active in the stack.
//
// 'index' is the position in tree.p of the repeated pattern.

var function coderep(code, tree, opt, fl, index, valuetable) {
    var st = tocharset(tree, index, valuetable);
    if( st ) {
        // emitted as an ISet, then retagged as ISpan
        var op = coderealcharset(code, st, valuetable);
        code.p[op].code = ISpan;
    } else {
        var e1;
        e1, st = getfirst(tree, fullset, index, valuetable);
        if( headfail(tree, index) || (e1 == 0 && cs_disjoint(st, fl)) ) {
            // L1: test (fail(p1)) -> L2; <p>; jmp L1; L2:
            var test = codetestset(code, st, 0, valuetable);
            codegen(code, tree, fullset, false, test, index, valuetable);
            var jmp = addinstruction(code, IJmp, 0);
            jumptohere(code, test);
            jumptothere(code, jmp, test);
        } else {
            // test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2:
            // or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1;
            var test = codetestset(code, st, e1, valuetable);
            var pchoice = NOINST;
            if( opt ) {
                jumptohere(code, addinstruction(code, IPartialCommit, 0));
            } else {
                pchoice = addinstruction(code, IChoice, 0);
            }
            var l2 = code.size;
            codegen(code, tree, fullset, false, NOINST, index, valuetable);
            var commit = addinstruction(code, IPartialCommit, 0);
            jumptothere(code, commit, l2);
            jumptohere(code, pchoice);
            jumptohere(code, test);
        }
    }
}


// Not predicate; optimizations:
// In any case, if first test fails, 'not' succeeds, so it can jump to
// the end. If pattern is headfail, that is all (it cannot fail
// in other parts); this case includes 'not' of simple sets. Otherwise,
// use the default code (a choice plus a failtwice).
//
// 'index' is the position in tree.p of the negated pattern.

var function codenot(code, tree, index, valuetable) {
    var e, st = getfirst(tree, fullset, index, valuetable);
    var test = codetestset(code, st, e, valuetable);
    // test (fail(p1)) -> L1; fail; L1:
    if( headfail(tree, index) ) {
        addinstruction(code, IFail, 0);
    } else {
        // test(fail(p))-> L1; choice L1; <p>; failtwice; L1:
        var pchoice = addinstruction(code, IChoice, 0);
        codegen(code, tree, fullset, false, NOINST, index, valuetable);
        addinstruction(code, IFailTwice, 0);
        jumptohere(code, pchoice);
    }
    jumptohere(code, test);
}


// change open calls to calls, using list 'positions' to find
// correct offsets; also optimize tail calls
//
// Scans instructions 'from' .. 'to - 1'. An open call whose 'val' has
// its low 16 bits clear and which is followed (through jumps) by an IRet
// becomes a plain IJmp.

var function correctcalls(code, positions, from, to) {
    for( i = from, to - 1 ) {
        if( code.p[i].code == IOpenCall ) {
            var n = code.p[i].offset; // rule number
            var rule = positions[n]; // rule position
            assert(rule == from || code.p[rule - 1].code == IRet);
            // call; ret ?
            if( bit.band(code.p[i].val, 0xffff) == 0 && code.p[finaltarget(code, i + 1)].code == IRet ) {
                code.p[i].code = IJmp; // tail call
            } else {
                code.p[i].code = ICall;
            }
            jumptothere(code, i, rule); // call jumps to respective rule
        }
    }
}


// Code for a grammar:
// call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2:
//
// 'index' is the position of the grammar node; its rules follow as a
// chain of TRule nodes linked through 'ps' and terminated by a TTrue node.

var function codegrammar(code, tree, index, valuetable) {
    var positions = {};
    var rulenumber = 1;
    // tree.p[rule].tag
    var rule = index + 1;
    assert(tree.p[rule].tag == TRule);
    // operand of the initial call: 1 when the first rule carries the RuleLR flag
    var LR = 0;
    if( band(RuleLR, tree.p[rule].cap) != 0 ) { LR = 1; }
    var firstcall = addinstruction(code, ICall, LR); // call initial rule
    code.p[firstcall].aux = tree.p[rule].val;
    var jumptoend = addinstruction(code, IJmp, 0); // jump to the end
    jumptohere(code, firstcall); // here starts the initial rule
    while( tree.p[rule].tag == TRule ) {
        positions[rulenumber] = code.size; // save rule position
        ++rulenumber ;
        codegen(code, tree, fullset, false, NOINST, rule + 1, valuetable); // code rule
        addinstruction(code, IRet, 0);
        rule += tree.p[rule].ps;
    }
    assert(tree.p[rule].tag == TTrue);
    jumptohere(code, jumptoend);
    // skip the initial call and jump when fixing the open calls
    correctcalls(code, positions, firstcall + 2, code.size);
}


// Emit an IOpenCall for the call node at 'index'; 'correctcalls' later
// turns it into an ICall or IJmp. The rule number is taken from the
// low 15 bits of the target rule's 'cap' field.

var function codecall(code, tree, index, val) {
    var c = addinstruction(code, IOpenCall, tree.p[index].cap); // to be corrected later
    code.p[c].aux = val;
    assert(tree.p[index + tree.p[index].ps].tag == TRule);
    setoffset(code, c, band(tree.p[index + tree.p[index].ps].cap, 0x7fff)); // offset = rule number
}


// Code a sequence 'p1 p2' ('p1', 'p2' are indices in tree.p). The follow
// set for p1 is computed only when 'needfollow' says it is useful. The
// dominating test 'tt' stays valid for p2 only when p1 has fixed length 0.

var function codeseq(code, tree, fl, opt, tt, p1, p2, valuetable) {
    if( needfollow(tree, p1) ) {
        var _, fll = getfirst(tree, fl, p2, valuetable); // p1 follow is p2 first
        codegen(code, tree, fll, false, tt, p1, valuetable);
    } else {
        // use 'fullset' as follow
        codegen(code, tree, fullset, false, tt, p1, valuetable);
    }
    // can p1 consume anything?
    if( (fixedlenx(tree, 0, 0, p1) != 0) ) {
        tt = NOINST; // invalidate test
    }
    return codegen(code, tree, fl, opt, tt, p2, valuetable);
}


// Main code-generation function: dispatch to auxiliary functions
// according to kind of tree

// code generation is recursive; 'opt' indicates that the code is being
// generated as the last thing inside an optional pattern (so, if that
// code is optional too, it can reuse the 'IChoice' already in place for
// the outer pattern). 'tt' points to a previous test protecting this
// code (or NOINST). 'fl' is the follow set of the pattern.

// Arguments: 'code' is the pattern being filled, 'tree' the tree pattern,
// 'index' the position in tree.p of the node to compile and 'valuetable'
// the table holding the values referenced by the nodes' 'val' fields.
// Binary nodes pass both siblings: index + 1 and index + ps.

function codegen(code, tree, fl, opt, tt, index, valuetable) {
    var tag = tree.p[index].tag;
    if( tag == TChar ) {
        return codechar(code, tree.p[index].val, tt);
    } else if( tag == TAny ) {
        return addinstruction(code, IAny, 0);
    } else if( tag == TSet ) {
        return codecharset(code, valuetable[tree.p[index].val], tt, valuetable);
    } else if( tag == TTrue ) {
        // nothing to emit
    } else if( tag == TFalse ) {
        return addinstruction(code, IFail, 0);
    } else if( tag == TSeq ) {
        return codeseq(code, tree, fl, opt, tt, index + 1, index + tree.p[index].ps, valuetable);
    } else if( tag == TChoice ) {
        return codechoice(code, tree, fl, opt, index + 1, index + tree.p[index].ps, valuetable);
    } else if( tag == TRep ) {
        return coderep(code, tree, opt, fl, index + 1, valuetable);
    } else if( tag == TBehind ) {
        return codebehind(code, tree, index, valuetable);
    } else if( tag == TNot ) {
        return codenot(code, tree, index + 1, valuetable);
    } else if( tag == TAnd ) {
        return codeand(code, tree, tt, index + 1, valuetable);
    } else if( tag == TCapture ) {
        return codecapture(code, tree, fl, tt, index, valuetable);
    } else if( tag == TRunTime ) {
        return coderuntime(code, tree, tt, index, valuetable);
    } else if( tag == TGrammar ) {
        return codegrammar(code, tree, index, valuetable);
    } else if( tag == TCall ) {
        return codecall(code, tree, index, tree.p[index].val);
    } else {
        assert(false);
    }
}


// Optimize jumps and other jump-like instructions.
// * Update labels of instructions with labels to their final
// destinations (e.g., choice L1; ...
//   L1: jmp L2: becomes
// choice L2)
// * Jumps to other instructions that do jumps become those
// instructions (e.g., jump to return becomes a return; jump
// to commit becomes a commit)

var function peephole(code) {
    var i = 0;
    while( i < code.size ) {
        var tag = code.p[i].code;
        if( tag == IChoice || tag == ICall || tag == ICommit || tag == IPartialCommit ||
            tag == IBackCommit || tag == ITestChar || tag == ITestSet || tag == ITestAny ) {
            // instructions with labels
            jumptothere(code, i, finallabel(code, i)); // optimize label

        } else if( tag == IJmp ) {
            var ft = finaltarget(code, i);
            tag = code.p[ft].code; // jumping to what?
            // instructions with unconditional implicit jumps
            if( tag == IRet || tag == IFail || tag == IFailTwice || tag == IEnd ) {
                ffi.copy(code.p + i, code.p + ft, ffi.sizeof(patternelement)); // jump becomes that instruction
            } else if( tag == ICommit || tag == IPartialCommit || tag == IBackCommit ) {
                // inst. with unconditional explicit jumps
                var fft = finallabel(code, ft);
                ffi.copy(code.p + i, code.p + ft, ffi.sizeof(patternelement)); // jump becomes that instruction...
                jumptothere(code, i, fft); // but must correct its offset
                --i; // reoptimize its label (undone by the ++i below, so 'i' is visited again)
            } else {
                jumptothere(code, i, ft); // optimize label
            }
        }
        ++i ;
    }
}


// Compile a pattern
//
// Generates code for the node at 'index', appends the final IEnd, runs
// the peephole pass and installs the result as tree.code (the previous
// tree.code is released with free).

var function compile(tree, index, valuetable) {
    var code = pattern();
    codegen(code, tree, fullset, false, NOINST, index, valuetable);
    addinstruction(code, IEnd, 0);
    peephole(code);
    ffi.C.free(tree.code);
    tree.code = code;
}

// Constructor registered as '__new' for the pattern ctype: allocates the
// header and a zero-filled instruction buffer with room for at least
// 10 instructions. 'size' (default 0) becomes the initial instruction count.

var function pat_new(ct, size) {
    size = size || 0;
    var allocsize = size;
    if( allocsize < 10 ) {
        allocsize = 10;
    }
    var pat = ffi.cast('PATTERN*', ffi.C.malloc(ffi.sizeof(pattern)));
    assert(pat != null);
    pat.allocsize = allocsize;
    pat.size = size;
    pat.p = ffi.C.malloc(ffi.sizeof(patternelement) * allocsize);
    assert(pat.p != null);
    ffi.fill(pat.p, ffi.sizeof(patternelement) * allocsize);
    return pat;
}


// Double the capacity of the instruction buffer of 'ct'; the newly added
// half is zero-filled.

var function doublesize(ct) {
    var newsize = ct.allocsize * 2;
    // keep the old buffer in ct.p until realloc is known to have succeeded:
    // on failure realloc leaves the original block valid, so it must not be lost
    var newp = ffi.C.realloc(ct.p, ffi.sizeof(patternelement) * newsize);
    assert(newp != null);
    ct.p = newp;
    ffi.fill(ct.p + ct.allocsize, ffi.sizeof(patternelement) * ct.allocsize);
    ct.allocsize = newsize;
}

// methods available on pattern objects
var pattreg = {
    doublesize = doublesize,
};

var metareg = {
    ["__new"] = pat_new,
    ["__index"] = pattreg
};

ffi.metatype(pattern, metareg);

// module interface
return {
    checkaux = checkaux,
    tocharset = tocharset,
    fixedlenx = fixedlenx,
    hascaptures = hascaptures,
    compile = compile,
};
diff --git a/src/lpeglj.ljs b/src/lpeglj.ljs new file mode 100644 index 0000000..bffd705 --- /dev/null +++ b/src/lpeglj.ljs @@ -0,0 +1,1372 @@ +/* +LPEGLJ +lpeglj.lua +Main module and tree generation +Copyright (C) 2014 Rostislav Sacek. 
+based on LPeg v1.0 - PEG pattern matching for Lua +Lua.org & PUC-Rio written by Roberto Ierusalimschy +http://www.inf.puc-rio.br/~roberto/lpeg/ + +** Permission is hereby granted, free of charge, to any person obtaining +** a copy of this software and associated documentation files (the +** "Software"), to deal in the Software without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Software, and to +** permit persons to whom the Software is furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be +** included in all copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+** +** [ MIT license: http://www.opensource.org/licenses/mit-license.php ] +--*/ + +assert(jit.version_num > 20000, "Use LuaJIT v2.0.1 or higher."); + +var ffi = require ("ffi"); +var lpcode = require ("lpcode"); +var lpprint = require ("lpprint"); +var lpvm = require ("lpvm"); + +var band, bor, bnot, rshift, lshift = bit.band, bit.bor, bit.bnot, bit.rshift, bit.lshift; + +ffi.cdef ([=[ + int isalnum(int c); + int isalpha(int c); + int iscntrl(int c); + int isdigit(int c); + int isgraph(int c); + int islower(int c); + int isprint(int c); + int ispunct(int c); + int isspace(int c); + int isupper(int c); + int isxdigit(int c); +]=]); + +var MAXBEHIND = 255; +var MAXRULES = 200; +var VERSION = "1.0.0.0LJ"; + +var TChar = 0; +var TSet = 1; +var TAny = 2; // standard PEG elements +var TTrue = 3; +var TFalse = 4; +var TRep = 5; +var TSeq = 6; +var TChoice = 7; +var TNot = 8; +var TAnd = 9; +var TCall = 10; +var TOpenCall = 11; +var TRule = 12; // sib1 is rule's pattern, sib2 is 'next' rule +var TGrammar = 13; // sib1 is initial (and first) rule +var TBehind = 14; // match behind +var TCapture = 15; // regular capture +var TRunTime = 16; // run-time capture + +var IAny = 0; // if no char, fail +var IChar = 1; // if char != val, fail +var ISet = 2; // if char not in val, fail +var ITestAny = 3; // in no char, jump to 'offset' +var ITestChar = 4; // if char != val, jump to 'offset' +var ITestSet = 5; // if char not in val, jump to 'offset' +var ISpan = 6; // read a span of chars in val +var IBehind = 7; // walk back 'val' characters (fail if not possible) +var IRet = 8; // return from a rule +var IEnd = 9; // end of pattern +var IChoice = 10; // stack a choice; next fail will jump to 'offset' +var IJmp = 11; // jump to 'offset' +var ICall = 12; // call rule at 'offset' +var IOpenCall = 13; // call rule number 'offset' (must be closed to a ICall) +var ICommit = 14; // pop choice and jump to 'offset' +var IPartialCommit = 15; // update top choice to current position and jump 
var IBackCommit = 16; // "fails" but jump to its own 'offset'
var IFailTwice = 17; // pop one choice and then fail
var IFail = 18; // go back to saved state on choice and jump to saved offset
var IGiveup = 19; // internal use
var IFullCapture = 20; // complete capture of last 'off' chars
var IOpenCapture = 21; // start a capture
var ICloseCapture = 22;
var ICloseRunTime = 23;

// capture kinds
var Cclose = 0;
var Cposition = 1;
var Cconst = 2;
var Cbackref = 3;
var Carg = 4;
var Csimple = 5;
var Ctable = 6;
var Cfunction = 7;
var Cquery = 8;
var Cstring = 9;
var Cnum = 10;
var Csubst = 11;
var Cfold = 12;
var Cruntime = 13;
var Cgroup = 14;

// predicates understood by lpcode.checkaux
var PEnullable = 0;
var PEnofail = 1;
var PEleftrecursion = 2;

var newgrammar;

// flags stored in the upper bits of a node's 'cap' field
var RuleLR = 0x10000;
var Ruleused = 0x20000;
var BCapcandelete = 0x30000;

var LREnable = false;

// number of siblings for each tree
// (indexed by tag + 1)
var numsiblings = {
    0, 0, 0, // char, set, any
    0, 0, // true, false
    1, // rep
    2, 2, // seq, choice
    1, 1, // not, and
    0, 0, 2, 1, // call, opencall, rule, grammar
    1, // behind
    1, 1 // capture, runtime capture
};



var patternid = 0;
// per-pattern value tables, keyed by the pattern's 'id' field
var valuetable = {};

// optional names for match-time functions (weak keys)
var funcnames = setmetatable({}, { __mode = 'k' });

var treepatternelement = ffi.typeof('TREEPATTERN_ELEMENT');
var treepattern = ffi.typeof('TREEPATTERN');
var patternelement = ffi.typeof('PATTERN_ELEMENT');
var pattern = ffi.typeof('PATTERN');
var settype = ffi.typeof('int32_t[8]');
var uint32 = ffi.typeof('uint32_t[1]');

// Fix a TOpenCall into a TCall node, using table 'postable' to
// translate a key to its rule address in the tree. Raises an
// error if key does not exist.
// Also marks the target rule as used (Ruleused flag in its 'cap').

var function fixonecall(postable, grammar, index, valuetable) {
    var name = valuetable[grammar.p[index].val]; // get rule's name
    var n = postable[name]; // query name in position table
    // no position?
    if( ! n ) {
        error(("rule '%s' undefined in given grammar")->format(type(name) == 'table' && '(a table)' || name), 0);
    }
    grammar.p[index].tag = TCall;
    grammar.p[index].ps = n - index; // position relative to node
    grammar.p[index + grammar.p[index].ps].cap = bit.bor(grammar.p[index + grammar.p[index].ps].cap, Ruleused);
}


// Transform left associative constructions into right
// associative ones, for sequence and choice; that is:
// (t11 + t12) + t2 => t11 + (t12 + t2)
// (t11 * t12) * t2 => t11 * (t12 * t2)
// (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2))
// The rewrite is done in place and repeated while the first sibling
// still has the same tag as the node at 'index'.

var function correctassociativity(tree, index) {
    var t1 = index + 1;
    assert(tree.p[index].tag == TChoice || tree.p[index].tag == TSeq);
    while( tree.p[t1].tag == tree.p[index].tag ) {
        var n1size = tree.p[index].ps - 1; // t1 == Op t11 t12
        var n11size = tree.p[t1].ps - 1;
        var n12size = n1size - n11size - 1;
        // move t11 one slot up, over the inner operator node
        for( i = 1, n11size ) {
            ffi.copy(tree.p + index + i, tree.p + t1 + i, ffi.sizeof(treepatternelement));
        }
        tree.p[index].ps = n11size + 1;
        // the freed slot becomes the new inner operator: Op t12 t2
        tree.p[index + tree.p[index].ps].tag = tree.p[index].tag;
        tree.p[index + tree.p[index].ps].ps = n12size + 1;
    }
}


// Make final adjustments in a tree. Fix open calls in tree,
// making them refer to their respective rules or raising appropriate
// errors (if not inside a grammar). Correct associativity of associative
// constructions (making them right associative).
// 'fix' tells whether the tree is inside a grammar; the walk recurses
// into every sibling according to 'numsiblings'.

var function finalfix(fix, postable, grammar, index, valuetable) {

    var tag = grammar.p[index].tag;
    //subgrammars were already fixed
    if( tag == TGrammar ) {
        return;
    } else if( tag == TOpenCall ) {
        // inside a grammar?
        if( fix ) {
            fixonecall(postable, grammar, index, valuetable);
        } else {
            // open call outside grammar
            error(("rule '%s' used outside a grammar")->format(tostring(valuetable[grammar.p[index].val])), 0);
        }
    } else if( tag == TSeq || tag == TChoice ) {
        correctassociativity(grammar, index);
    }
    var ns = numsiblings[tag + 1];
    if( ns == 0 ) {
    } else if( ns == 1 ) {
        return finalfix(fix, postable, grammar, index + 1, valuetable);
    } else if( ns == 2 ) {
        finalfix(fix, postable, grammar, index + 1, valuetable);
        return finalfix(fix, postable, grammar, index + grammar.p[index].ps, valuetable);
    } else {
        assert(false);
    }
}


// {======================================================
// Tree generation
// =======================================================

// Create a one-node TSet tree with a fresh char set stored as value 1
// of its value table. Returns the tree and the set.

var function newcharset() {
    var tree = treepattern(1);
    valuetable[tree.id] = { settype() };
    tree.p[0].tag = TSet;
    tree.p[0].val = 1;
    return tree, valuetable[tree.id][1];
}


// add to tree a sequence where first sibling is 'sib' (with size
// 'sibsize')
// The TSeq node is written at 'start' and 'sib' is copied right after it.

var function seqaux(tree, sib, start, sibsize) {
    tree.p[start].tag = TSeq;
    tree.p[start].ps = sibsize + 1;
    ffi.copy(tree.p + start + 1, sib.p, ffi.sizeof(treepatternelement) * sibsize);
}


// Build a sequence of 'n' nodes, each with tag 'tag' and 'val' got
// from the array 's' (or 0 if array is NULL). (TSeq is binary, so it
// must build a sequence of sequence of sequence...)
// Here 's' is an optional string; the byte at position i gives the
// 'val' of the i-th node.

var function fillseq(tree, tag, start, n, s) {
    // initial n-1 copies of Seq tag; Seq ...
    for( i = 1, n - 1 ) {
        tree.p[start].tag = TSeq;
        tree.p[start].ps = 2;
        tree.p[start + 1].tag = tag;
        if( s ) {
            tree.p[start + 1].val = s->sub(i, i)->byte();
        }
        start += tree.p[start].ps;
    }
    tree.p[start].tag = tag; // last one does not need TSeq
    if( s ) {
        tree.p[start].val = s->sub(n, n)->byte();
    }
}


// Numbers as patterns:
// 0 == true (always match); n == TAny repeated 'n' times;
// -n == not (TAny repeated 'n' times)

var function numtree(n) {
    if( n == 0 ) {
        var tree = treepattern(1);
        tree.p[0].tag = TTrue;
        return tree;
    } else {
        var tree, start;
        if( n > 0 ) {
            // n leaves plus n - 1 TSeq nodes
            tree = treepattern(2 * n - 1);
            start = 0;
        // negative: code it as !(-n)
        } else {
            n = -n;
            tree = treepattern(2 * n);
            tree.p[0].tag = TNot;
            start = 1;
        }
        fillseq(tree, TAny, start, n); // sequence of 'n' any's
        return tree;
    }
}


// Convert value to a pattern
// Strings become character sequences, numbers go through 'numtree',
// booleans become TTrue/TFalse, tables are compiled as grammars and
// functions become run-time captures over TTrue (optionally registered
// under 'name' in 'funcnames'). Tree patterns are returned unchanged;
// any other value fails the final assertion.

var function getpatt(val, name) {
    var typ = type(val);
    if( typ == 'string' ) {
        // empty?
        if( #val == 0 ) {
            var pat = treepattern(1);
            pat.p[0].tag = TTrue; // always match
            return pat;
        } else {
            var tree = treepattern(2 * (#val - 1) + 1);
            fillseq(tree, TChar, 0, #val, val); // sequence of '#val' chars
            return tree;
        }
    } else if( typ == 'number' ) {
        return numtree(val);
    } else if( typ == 'boolean' ) {
        var pat = treepattern(1);
        pat.p[0].tag = val && TTrue || TFalse;
        return pat;
    } else if( typ == 'table' ) {
        return newgrammar(val);
    } else if( typ == 'function' ) {
        if( name && type(name) == 'string' ) {
            funcnames[val] = name;
        }
        var pat = treepattern(2);
        valuetable[pat.id] = { val };
        pat.p[0].tag = TRunTime;
        pat.p[0].val = 1;
        pat.p[1].tag = TTrue;
        return pat;
    } else if( ffi.istype(treepattern, val) ) {
        assert(val.treesize > 0);
        return val;
    }
    assert(false);
}

// Build a new value table holding the entries of 'ktable1' followed by
// those of 'ktable2' (either may be missing). Returns the table and the
// offset to add to keys that referred to 'ktable2' (the length of
// 'ktable1'); the offset is 0 when fewer than two tables were given.

var function copykeys(ktable1, ktable2) {
    var ktable, offset = {}, 0;
    if( ! ktable1 && ! ktable2 ) {
        return ktable, 0;
    } else if( ktable1 ) {
        for( i = 1, #ktable1 ) {
            ktable[#ktable + 1] = ktable1[i];
        }
        offset = #ktable1;
        if( ! ktable2 ) {
            return ktable, 0;
        }
    }
    if( ktable2 ) {
        for( i = 1, #ktable2 ) {
            ktable[#ktable + 1] = ktable2[i];
        }
    }
    assert(#ktable < 65536, "too many Lua values in pattern");
    return ktable, offset;
}

// Add 'offset' to every non-zero 'val' key of the nodes that reference
// the value table (sets, rules, calls, open calls, captures and
// run-time captures), walking the subtree rooted at 'index'.

var function correctkeys(tree, index, offset) {
    var tag = tree.p[index].tag;
    if( (tag == TSet || tag == TRule || tag == TCall || tag == TRunTime || tag == TOpenCall || tag == TCapture) &&
        tree.p[index].val != 0 ) {
        tree.p[index].val = tree.p[index].val + offset;
    }
    var ns = numsiblings[tag + 1];
    if( ns == 0 ) {
    } else if( ns == 1 ) {
        return correctkeys(tree, index + 1, offset);
    } else if( ns == 2 ) {
        correctkeys(tree, index + 1, offset);
        return correctkeys(tree, index + tree.p[index].ps, offset);
    } else {
        assert(false);
    }
}



// create a new tree, with a new root and one sibling.
// 'pat' is converted with 'getpatt'; its value table is copied for the
// new tree.

var function newroot1sib(tag, pat) {
    var tree1 = getpatt(pat);
    var tree = treepattern(1 + tree1.treesize); // create new tree
    valuetable[tree.id] = copykeys(valuetable[tree1.id]);
    tree.p[0].tag = tag;
    ffi.copy(tree.p + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize);
    return tree;
}


// create a new tree, with a new root and 2 siblings.
// Build a new tree whose root carries 'tag' and whose two siblings are
// copies of the trees for 'pat1' and 'pat2'. The value tables of both
// operands are merged with copykeys; when copykeys reports a non-zero
// offset, the keys of the second sibling are shifted by it.

var function newroot2sib(tag, pat1, pat2) {
    var tree1 = getpatt(pat1);
    var tree2 = getpatt(pat2);
    var tree = treepattern(1 + tree1.treesize + tree2.treesize); // create new tree
    var ktable, offset = copykeys(valuetable[tree1.id], valuetable[tree2.id]);
    valuetable[tree.id] = ktable;
    tree.p[0].tag = tag;
    tree.p[0].ps = 1 + tree1.treesize; // second sibling starts right after the first
    ffi.copy(tree.p + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize);
    ffi.copy(tree.p + 1 + tree1.treesize, tree2.p, ffi.sizeof(treepatternelement) * tree2.treesize);
    if( offset > 0 ) {
        correctkeys(tree, 1 + tree1.treesize, offset);
    }
    return tree;
}


// Convert 'val' into a pattern (delegates to getpatt); 'val' must not be null.

var function lp_P(val, name) {
    assert(type(val) != 'null');
    return getpatt(val, name);
}


// sequence operator; optimizations:
// false x => false, x true => x, true x => x
// (cannot do x . false => false because x may have runtime captures)

var function lp_seq(pat1, pat2) {
    var tree1 = getpatt(pat1);
    var tree2 = getpatt(pat2);
    // false . x == false, x . true = x
    if( tree1.p[0].tag == TFalse || tree2.p[0].tag == TTrue ) {
        return tree1;
    // true . x = x
    } else if( tree1.p[0].tag == TTrue ) {
        return tree2;
    } else {
        return newroot2sib(TSeq, tree1, tree2);
    }
}


// choice operator; optimizations:
// charset / charset => charset
// true / x => true, x / false => x, false / x => x
// (x / true is not equivalent to true)

var function lp_choice(pat1, pat2) {
    var tree1 = getpatt(pat1);
    var tree2 = getpatt(pat2);
    var charset1 = lpcode.tocharset(tree1, 0, valuetable[tree1.id]);
    var charset2 = lpcode.tocharset(tree2, 0, valuetable[tree2.id]);
    if( charset1 && charset2 ) {
        // both operands are charsets: the result is the union of their 8 words
        var t, set = newcharset();
        for( i = 0, 7 ) {
            set[i] = bor(charset1[i], charset2[i]);
        }
        return t;
    } else if( lpcode.checkaux(tree1, PEnofail, 0) || tree2.p[0].tag == TFalse ) {
        return tree1; // true / x => true, x / false => x
    } else if( tree1.p[0].tag == TFalse ) {
        return tree2; // false / x => x
    } else {
        return newroot2sib(TChoice, tree1, tree2);
    }
}


// p^n
// n >= 0: at least n repetitions of tree1; n < 0: at most -n repetitions.
// Raises "loop body may accept empty string" when n >= 0 and tree1 is nullable.

var function lp_star(tree1, n) {
    var tree;
    n = tonumber(n);
    assert(type(n) == 'number');
    // seq tree1 (seq tree1 ... (seq tree1 (rep tree1)))
    if( n >= 0 ) {
        // reject a nullable body before allocating the result tree
        if( lpcode.checkaux(tree1, PEnullable, 0) ) {
            error("loop body may accept empty string", 0);
        }
        tree = treepattern((n + 1) * (tree1.treesize + 1));
        valuetable[tree.id] = copykeys(valuetable[tree1.id]);
        var start = 0;
        // repeat 'n' times
        for( i = 1, n ) {
            seqaux(tree, tree1, start, tree1.treesize);
            start += tree.p[start].ps;
        }
        tree.p[start].tag = TRep;
        ffi.copy(tree.p + start + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize);
    // choice (seq tree1 ... choice tree1 true ...) true
    } else {
        n = -n;
        // size = (choice + seq + tree1 + true) * n, but the last has no seq
        tree = treepattern(n * (tree1.treesize + 3) - 1);
        valuetable[tree.id] = copykeys(valuetable[tree1.id]);
        var start = 0;
        // repeat (n - 1) times
        for( i = n, 2, -1 ) {
            tree.p[start].tag = TChoice;
            tree.p[start].ps = i * (tree1.treesize + 3) - 2;
            tree.p[start + tree.p[start].ps].tag = TTrue;
            ++start ;
            seqaux(tree, tree1, start, tree1.treesize);
            start += tree.p[start].ps;
        }
        tree.p[start].tag = TChoice;
        tree.p[start].ps = tree1.treesize + 1;
        tree.p[start + tree.p[start].ps].tag = TTrue;
        ffi.copy(tree.p + start + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize);
    }
    return tree;
}


// #p == &p

var function lp_and(pat) {
    return newroot1sib(TAnd, pat);
}


// -p == !p

var function lp_not(pat) {
    return newroot1sib(TNot, pat);
}


// [t1 - t2] == Seq (Not t2) t1
// If t1 and t2 are charsets, make their difference.

var function lp_sub(pat1, pat2) {
    var tree1 = getpatt(pat1);
    var tree2 = getpatt(pat2);
    var charset1 = lpcode.tocharset(tree1, 0, valuetable[tree1.id]);
    var charset2 = lpcode.tocharset(tree2, 0, valuetable[tree2.id]);
    if( charset1 && charset2 ) {
        var tree, set = newcharset();
        for( i = 0, 7 ) {
            set[i] = band(charset1[i], bnot(charset2[i]));
        }
        return tree;
    } else {
        var tree = treepattern(2 + tree1.treesize + tree2.treesize);
        // tree2 is copied first, so its keys come first in the merged table
        var ktable, offset = copykeys(valuetable[tree2.id], valuetable[tree1.id]);
        valuetable[tree.id] = ktable;
        tree.p[0].tag = TSeq; // sequence of...
        tree.p[0].ps = 2 + tree2.treesize;
        tree.p[1].tag = TNot; // ...not...
        ffi.copy(tree.p + 2, tree2.p, ffi.sizeof(treepatternelement) * tree2.treesize);
        ffi.copy(tree.p + tree2.treesize + 2, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize);
        if( offset > 0 ) {
            correctkeys(tree, 2 + tree2.treesize, offset);
        }
        return tree;
    }
}


// Charset containing every byte of the string 'val'.

var function lp_set(val) {
    assert(type(val) == 'string');
    var tree, set = newcharset();
    for( i = 1, #val ) {
        var b = val->sub(i, i)->byte();
        // word index is b / 32, bit index is b % 32
        set[rshift(b, 5)] = bor(set[rshift(b, 5)], lshift(1, band(b, 31)));
    }
    return tree;
}


// Charset built from two-character range strings ("az", "09", ...).

var function lp_range(...) {
    var args = { ... };
    var top = #args;
    var tree, set = newcharset();
    for( i = 1, top ) {
        assert(#args[i] == 2, args[i] .. " range must have two characters");
        for( b = args[i]->sub(1, 1)->byte(), args[i]->sub(2, 2)->byte() ) {
            set[rshift(b, 5)] = bor(set[rshift(b, 5)], lshift(1, band(b, 31)));
        }
    }
    return tree;
}


// Look-behind predicate
// The pattern must have no captures and a fixed length not above MAXBEHIND.

var function lp_behind(pat) {
    var tree1 = getpatt(pat);
    var n = lpcode.fixedlenx(tree1, 0, 0, 0);
    assert(! lpcode.hascaptures(tree1, 0), "pattern have captures");
    assert(n >= 0, "pattern may not have fixed length");
    assert(n <= MAXBEHIND, "pattern too long to look behind");
    var tree = newroot1sib(TBehind, pat);
    tree.p[0].val = n;
    return tree;
}


// Create a non-terminal

var function lp_V(val, p) {
    assert(val, "non-null value expected");
    var tree = treepattern(1);
    valuetable[tree.id] = { val };
    tree.p[0].tag = TOpenCall;
    tree.p[0].val = 1; // index of 'val' in the value table
    tree.p[0].cap = p || 0;
    return tree;
}


// Create a tree for a non-empty capture, with a body and
// optionally with an associated value

var function capture_aux(cap, pat, val) {
    var tree = newroot1sib(TCapture, pat);
    tree.p[0].cap = cap;
    if( val ) {
        var ind = #valuetable[tree.id] + 1;
        // same message as every other value-table check; the message is no
        // longer concatenated with 'ind' on each call
        assert(ind <= 65536, "too many Lua values in pattern");
        valuetable[tree.id][ind] = val;
        tree.p[0].val = ind;
    }
    return tree;
}


// Fill a tree with an empty capture, using an empty (TTrue) sibling.

var function auxemptycap(tree, cap, par, start) {
    tree.p[start].tag = TCapture;
    tree.p[start].cap = cap;
    if( type(par) != 'null' ) {
        var ind = #valuetable[tree.id] + 1;
        assert(ind <= 65536, "too many Lua values in pattern");
        valuetable[tree.id][ind] = par;
        tree.p[start].val = ind;
    }
    tree.p[start + 1].tag = TTrue;
}


// Create a tree for an empty capture

var function newemptycap(cap, par) {
    var tree = treepattern(2);
    // a value table is only needed when there is a value to store
    if( type(par) != 'null' ) { valuetable[tree.id] = {}; }
    auxemptycap(tree, cap, par, 0);
    return tree;
}


// Captures with syntax p / v
// (function capture, query capture, string capture, or number capture)

var function lp_divcapture(pat, par, xxx) {
    var typ = type(par);
    if( typ == "function" ) {
        return capture_aux(Cfunction, pat, par);
    } else if( typ == "table" ) {
        return capture_aux(Cquery, pat, par);
    } else if( typ == "string" ) {
        return capture_aux(Cstring, pat, par);
    } else if( typ == "number" ) {
        var tree = newroot1sib(TCapture, pat);
        assert(0 <= par && par <= 0xffff, "invalid number");
        tree.p[0].cap = Cnum;
        var ind = #valuetable[tree.id] + 1;
        assert(ind <= 65536, "too many Lua values in pattern");
        valuetable[tree.id][ind] = par;
        tree.p[0].val = ind;
        return tree;
    } else {
        error("invalid replacement value", 0);
    }
}


// Substitution capture around 'pat'.

var function lp_substcapture(pat) {
    return capture_aux(Csubst, pat);
}


// Table capture around 'pat'.

var function lp_tablecapture(pat) {
    return capture_aux(Ctable, pat, 0);
}


// Group capture around 'pat', optionally carrying the value 'val'.
// capture_aux already ignores a null/false 'val', so no branching is needed.

var function lp_groupcapture(pat, val) {
    return capture_aux(Cgroup, pat, val);
}


// Fold capture: 'fce' must be a function.

var function lp_foldcapture(pat, fce) {
    assert(type(fce) == 'function');
    return capture_aux(Cfold, pat, fce);
}


// Simple capture around 'pat'.

var function lp_simplecapture(pat) {
    return capture_aux(Csimple, pat);
}


// Position capture (an empty capture with no value).

var function lp_poscapture() {
    return newemptycap(Cposition);
}


// Argument capture: 'val' is the index of the extra match argument and
// must satisfy 0 < val <= 0xffff.

var function lp_argcapture(val) {
    assert(type(val) == 'number');
    // validate the index before any tree or value table is created
    assert(0 < val && val <= 0xffff, "invalid argument index");
    var tree = newemptycap(Carg, 0);
    var ind = #valuetable[tree.id] + 1;
    assert(ind <= 65536, "too many Lua values in pattern");
    valuetable[tree.id][ind] = val;
    tree.p[0].val = ind;
    return tree;
}


// Back-reference capture to the group named 'val'.

var function lp_backref(val) {
    return newemptycap(Cbackref, val);
}


// Constant capture

var function lp_constcapture(...) {
    var tree;
    var args = { ... };
    var n = select('#', ...); // number of values
    // no values?
    if( n == 0 ) {
        tree = treepattern(1); // no capture
        tree.p[0].tag = TTrue;
    } else if( n == 1 ) {
        tree = newemptycap(Cconst, args[1]); // single constant capture
    // create a group capture with all values
    } else {
        tree = treepattern(3 + 3 * (n - 1));
        valuetable[tree.id] = {};
        tree.p[0].tag = TCapture;
        tree.p[0].cap = Cgroup;
        var start = 1;
        // each of the first n - 1 values takes a TSeq node plus a 2-node empty capture
        for( i = 1, n - 1 ) {
            tree.p[start].tag = TSeq;
            tree.p[start].ps = 3;
            auxemptycap(tree, Cconst, args[i], start + 1);
            start += tree.p[start].ps;
        }
        auxemptycap(tree, Cconst, args[n], start);
    }
    return tree;
}


// Match-time capture: 'fce' must be a function; when 'name' is a string it
// is recorded in funcnames for that function.

var function lp_matchtime(pat, fce, name) {
    assert(type(fce) == 'function');
    if( name && type(name) == 'string' ) {
        funcnames[fce] = name;
    }
    var tree = newroot1sib(TRunTime, pat);
    var ind = #valuetable[tree.id] + 1;
    assert(ind <= 65536, "too many Lua values in pattern");
    valuetable[tree.id][ind] = fce;
    tree.p[0].val = ind;
    return tree;
}

// ======================================================



//
// ======================================================
// Grammar - Tree generation
// =======================================================


// Locate the initial rule of grammar 'pat'.
// Returns its key and its pattern, and records position 1 for that key
// in the position table 'postab'.

var function getfirstrule(pat, postab) {
    // a string in the first slot names the initial rule;
    // otherwise the first slot is the initial rule itself
    var key = 1;
    if( type(pat[1]) == 'string' ) {
        key = pat[1];
    }
    var rule = pat[key];
    if( ! rule ) {
        error("grammar has no initial rule", 0);
    }
    // initial rule not a pattern?
    if( ! ffi.istype(treepattern, rule) ) {
        error(("initial rule '%s' is not a pattern")->format(tostring(key)), 0);
    }
    postab[key] = 1;
    return key, rule;
}


// Walk the grammar table and gather its rules.
// Returns four values: the number of rules, the size needed for the new
// tree, a flat list { key1, pattern1, key2, pattern2, ... } with the
// initial rule first, and the position table mapping each key to the
// position of its rule in the final tree.

var function collectrules(pat) {
    var postab = {};
    var initkey, initrule = getfirstrule(pat, postab);
    var rules = { initkey, initrule };
    var count = 1; // the initial rule is already counted
    var total = 2 + initrule.treesize; // TGrammar + TRule + rule
    var initid = tostring(initrule);
    for( key, val in pairs(pat) ) {
        // skip slot 1 and the initial rule itself
        if( key != 1 && tostring(val) != initid ) {
            // value is not a pattern?
            if( ! ffi.istype(treepattern, val) ) {
                error(("rule '%s' is not a pattern")->format(tostring(key)), 0);
            }
            var last = #rules;
            rules[last + 1] = key;
            rules[last + 2] = val;
            postab[key] = total;
            total += 1 + val.treesize; // TRule + rule
            ++count ;
        }
    }
    // one more node: TTrue to finish list of rules
    return count, total + 1, rules, postab;
}


// Copy the 'n' rules of the flat list 'rules' into 'grammar', starting at
// node 'index', and return the merged value table. Every rule gets a TRule
// node whose 'val' indexes the rule key in that table.

var function buildgrammar(grammar, rules, n, index, valuetable) {
    var ktable, offset = {}, 0;
    for( i = 1, n ) {
        var rulekey = rules[i * 2 - 1];
        var ruletree = rules[i * 2];
        var size = ruletree.treesize;
        // the rule key goes into the table before the rule's own keys are merged
        var ind = #ktable + 1;
        ktable[ind] = rulekey;
        grammar.p[index].tag = TRule;
        grammar.p[index].cap = i; // rule number
        grammar.p[index].ps = size + 1; // point to next rule
        grammar.p[index].val = ind;
        ffi.copy(grammar.p + index + 1, ruletree.p, ffi.sizeof(treepatternelement) * size); // copy rule
        ktable, offset = copykeys(ktable, valuetable[ruletree.id]);
        if( offset > 0 ) {
            correctkeys(grammar, index + 1, offset);
        }
        index += grammar.p[index].ps; // move to next rule
    }
    grammar.p[index].tag = TTrue; // finish list of rules
    return ktable;
}


// Check whether a tree has potential infinite loops
// Returns true when a TRep node with a nullable body is reachable from
// 'index'; returns nothing otherwise.

var function checkloops(tree, index) {
    // single-sibling and second-sibling descents are done by advancing
    // 'index'; only the first of two siblings needs a recursive call
    while( true ) {
        var tag = tree.p[index].tag;
        if( tag == TRep && lpcode.checkaux(tree, PEnullable, index + 1) ) {
            return true;
        }
        if( tag == TGrammar ) {
            return; // sub-grammars already checked
        }
        var sibs = numsiblings[tag + 1];
        if( sibs == 0 ) {
            return;
        } else if( sibs == 1 ) {
            ++index ;
        } else if( sibs == 2 ) {
            if( checkloops(tree, index + 1) ) {
                return true;
            }
            index += tree.p[index].ps;
        } else {
            assert(false);
        }
    }
}

// Check whether a rule can be left recursive; returns PEleftrecursion in that
// case; otherwise return 1 iff pattern is nullable.
var function verifyrule(rulename, tree, passed, nullable, index, valuetable) {
    // Walks the tree from 'index'. Returns PEleftrecursion as soon as a call
    // to 'rulename' is reachable without consuming input; otherwise returns
    // a truthy value when the pattern is nullable ('nullable' is the value
    // handed back by nodes that cannot be passed through). 'passed' counts
    // how often each rule was entered so the walk stops after MAXRULES visits.
    var tag = tree.p[index].tag;
    if( tag == TChar || tag == TSet || tag == TAny || tag == TFalse ) {
        return nullable; // cannot pass from here
    } else if( tag == TTrue || tag == TBehind ) {
        return true;
    } else if( tag == TNot || tag == TAnd || tag == TRep ) {
        return verifyrule(rulename, tree, passed, true, index + 1, valuetable);
    } else if( tag == TCapture || tag == TRunTime ) {
        return verifyrule(rulename, tree, passed, nullable, index + 1, valuetable);
    } else if( tag == TCall ) {
        var rule = valuetable[tree.p[index].val];
        if( rule == rulename ) { return PEleftrecursion; }
        if( passed[rule] && passed[rule] > MAXRULES ) {
            return nullable;
        }
        return verifyrule(rulename, tree, passed, nullable, index + tree.p[index].ps, valuetable);
    // only check 2nd child if first is nullable
    } else if( tag == TSeq ) {
        var res = verifyrule(rulename, tree, passed, false, index + 1, valuetable);
        if( res == PEleftrecursion ) {
            return res;
        } else if( ! res ) {
            return nullable;
        } else {
            return verifyrule(rulename, tree, passed, nullable, index + tree.p[index].ps, valuetable);
        }
    // must check both children
    } else if( tag == TChoice ) {
        nullable = verifyrule(rulename, tree, passed, nullable, index + 1, valuetable);
        if( nullable == PEleftrecursion ) { return nullable; }
        return verifyrule(rulename, tree, passed, nullable, index + tree.p[index].ps, valuetable);
    } else if( tag == TRule ) {
        var rule = valuetable[tree.p[index].val];
        passed[rule] = (passed[rule] || 0) + 1;
        return verifyrule(rulename, tree, passed, nullable, index + 1, valuetable);
    } else if( tag == TGrammar ) {
        return lpcode.checkaux(tree, PEnullable, index); // sub-grammar cannot be left recursive
    } else {
        assert(false);
    }
}


// Verify the grammar rooted at 'index': detect left-recursive rules (an
// error unless LREnable is set, in which case the affected TRule/TCall
// nodes are flagged) and reject rules containing empty loops.

var function verifygrammar(rule, index, valuetable) {
    // check left-recursive rules
    var LR = {};
    var ind = index + 1;
    while( rule.p[ind].tag == TRule ) {
        var rulename = valuetable[rule.p[ind].val];
        // used rule
        if( rulename ) {
            if( verifyrule(rulename, rule, {}, false, ind + 1, valuetable) == PEleftrecursion ) {
                if( ! LREnable ) {
                    error(("rule '%s' may be left recursive")->format(rulename), 0);
                }
                LR[rulename] = true;
            }
        }
        ind += rule.p[ind].ps;
    }
    assert(rule.p[ind].tag == TTrue);

    for( i = 0, rule.treesize - 1 ) {
        if( rule.p[i].tag == TRule && LR[valuetable[rule.p[i].val]] ) {
            rule.p[i].cap = bor(rule.p[i].cap, RuleLR); //TRule can be left recursive
        }
        if( rule.p[i].tag == TCall && LR[valuetable[rule.p[i].val]] ) {
            if( rule.p[i].cap == 0 ) {
                rule.p[i].cap = 1; //TCall can be left recursive
            }
        }
    }

    // check infinite loops inside rules
    ind = index + 1;
    while( rule.p[ind].tag == TRule ) {
        // used rule
        // NOTE(review): if 'val' is a numeric field this test is always true
        // under Lua truthiness - confirm whether '!= 0' was intended
        if( rule.p[ind].val ) {
            if( checkloops(rule, ind + 1) ) {
                error(("empty loop in rule '%s'")->format(tostring(valuetable[rule.p[ind].val])), 0);
            }
        }
        ind += rule.p[ind].ps;
    }
    assert(rule.p[ind].tag == TTrue);
}


// Give a name for the initial rule if it is not referenced

var function initialrulename(grammar, val, valuetable) {
    grammar.p[1].cap = bit.bor(grammar.p[1].cap, Ruleused);
    // initial rule is not referenced?
    if( grammar.p[1].val == 0 ) {
        var ind = #valuetable + 1;
        assert(ind <= 65536, "too many Lua values in pattern");
        valuetable[ind] = val;
        grammar.p[1].val = ind;
    }
}


function newgrammar(pat) {
    // traverse grammar. Create a new table (before all pairs key-pattern) to
    // collect all keys and their associated positions in the final tree
    // (the "position table").
    // Return new tree.

    var n, size, rules, postab = collectrules(pat);
    var grammar = treepattern(size);
    var start = 0;
    grammar.p[start].tag = TGrammar;
    grammar.p[start].val = n;
    valuetable[grammar.id] = buildgrammar(grammar, rules, n, start + 1, valuetable);
    finalfix(true, postab, grammar, start + 1, valuetable[grammar.id]);
    initialrulename(grammar, rules[1], valuetable[grammar.id]);
    verifygrammar(grammar, 0, valuetable[grammar.id]);
    return grammar;
}

// ======================================================

// remove duplicity from value table
// Rebuilds the value table of 'p' so each distinct value appears once, and
// rewrites the indices stored in the tree and in the compiled code.

var function reducevaluetable(p) {
    var vtable = valuetable[p.id];
    var value = {};         // value -> its index in the new table
    var newvaluetable = {};

    // map an old index to its index in the new table (0 stays 0)
    var function check(v) {
        if( v > 0 ) {
            var ord = value[vtable[v]];
            if( ! ord ) {
                newvaluetable[#newvaluetable + 1] = vtable[v];
                ord = #newvaluetable;
                value[vtable[v]] = ord;
            }
            return ord;
        }
        return 0;
    }

    // remap 'val' of every tree node that stores a value-table index
    var function itertree(p, index) {
        var tag = p.p[index].tag;
        if( tag == TSet || tag == TCall || tag == TOpenCall ||
                tag == TRule || tag == TCapture || tag == TRunTime ) {
            p.p[index].val = check(p.p[index].val);
        }
        var ns = numsiblings[tag + 1];
        if( ns == 0 ) {
        } else if( ns == 1 ) {
            return itertree(p, index + 1);
        } else if( ns == 2 ) {
            itertree(p, index + 1);
            return itertree(p, index + p.p[index].ps);
        } else {
            assert(false);
        }
    }

    if( p.treesize > 0 ) {
        itertree(p, 0);
    }
    if( p.code != null ) {
        // each instruction kind keeps its value-table index in a different field
        for( i = 0, p.code.size - 1 ) {
            var code = p.code.p[i].code;
            if( code == ICall || code == IJmp ) {
                p.code.p[i].aux = check(p.code.p[i].aux);
            } else if( code == ISet || code == ITestSet || code == ISpan ) {
                p.code.p[i].val = check(p.code.p[i].val);
            } else if( code == IOpenCapture || code == IFullCapture ) {
                p.code.p[i].offset = check(p.code.p[i].offset);
            }
        }
    }
    valuetable[p.id] = newvaluetable;
}


// Flag group captures with BCapcandelete: those found under a choice are
// flagged immediately; those found outside a choice are remembered per rule
// and flagged afterwards unless the rule was recorded in
// 'notinalternativerules'.

var function checkalt(tree) {
    var notchecked = {};
    var notinalternativerules = {};

    var function iter(tree, index, choice, rule) {
        var tag = tree[index].tag;
        if( tag == TCapture && bit.band(tree[index].cap, 0xffff) == Cgroup ) {
            if( ! choice ) {
                if( rule ) {
                    notchecked[rule] = index;
                }
            } else {
                tree[index].cap = bit.bor(tree[index].cap, BCapcandelete);
            }
        } else if( tag == TChoice ) {
            choice = true;
        } else if( tag == TRule ) {
            rule = tree[index].val;
            if( bit.band(tree[index].cap, 0xffff) - 1 == 0 ) {
                notinalternativerules[rule] = notinalternativerules[rule] || true;
            }
        } else if( tag == TCall ) {
            var r = tree[index].val;
            if( ! choice ) {
                notinalternativerules[r] = notinalternativerules[r] || true;
            }
        }
        var sibs = numsiblings[tree[index].tag + 1] || 0;
        if( sibs >= 1 ) {
            iter(tree, index + 1, choice, rule);
            if( sibs >= 2 ) {
                return iter(tree, index + tree[index].ps, choice, rule);
            }
        }
    }

    iter(tree, 0);
    for( k, v in pairs(notchecked) ) {
        if( ! notinalternativerules[k] ) {
            tree[v].cap = bit.bor(tree[v].cap, BCapcandelete);
        }
    }
}

// Run the final tree fix-up, the alternative check and the compiler on 'p',
// then compact its value table. Returns p.code.

var function prepcompile(p, index) {
    finalfix(false, null, p, index, valuetable[p.id]);
    checkalt(p.p);
    lpcode.compile(p, index, valuetable[p.id]);
    reducevaluetable(p);
    return p.code;
}


// Print the tree of 'pat'; when 'c' is set the tree is fixed up first.

var function lp_printtree(pat, c) {
    assert(pat.treesize > 0);
    if( c ) {
        finalfix(false, null, pat, 0, valuetable[pat.id]);
    }
    lpprint.printtree(pat.p, 0, 0, valuetable[pat.id]);
}


// Print the compiled code of 'pat', compiling it first if necessary.

var function lp_printcode(pat) {
    // not compiled yet?
    if( pat.code == null ) {
        prepcompile(pat, 0);
    }
    lpprint.printpatt(pat.code, valuetable[pat.id]);
}


// Main match function

var function lp_match(pat, s, init, ...) {
    var p = ffi.istype(treepattern, pat) && pat || getpatt(pat);
    p.code = p.code != null && p.code || prepcompile(p, 0); // compile on first use
    return lpvm.match(p, s, init, valuetable[p.id], ...);
}

// Stream variant of lp_match (delegates to lpvm.streammatch).

var function lp_streammatch(pat, init, ...) {
    var p = ffi.istype(treepattern, pat) && pat || getpatt(pat);
    p.code = p.code != null && p.code || prepcompile(p, 0);
    return lpvm.streammatch(p, init, valuetable[p.id], ...);
}

// Only for testing purpose
// stream emulation (send all chars from string one char after char)
var function lp_emulatestreammatch(pat, s, init, ...) {
    var p = ffi.istype(treepattern, pat) && pat || getpatt(pat);
    p.code = p.code != null && p.code || prepcompile(p, 0);
    return lpvm.emulatestreammatch(p, s, init, valuetable[p.id], ...);
}

// {======================================================
// Library creation and functions not related to matching
// =======================================================

// Forward 'val' to lpvm.setmax.

var function lp_setmax(val) {
    lpvm.setmax(val);
}

// Forward 'val' to lpvm.setmaxbehind.

var function lp_setmaxbehind(val) {
    lpvm.setmaxbehind(val);
}

// Set the LREnable flag consulted by verifygrammar.

var function lp_enableleftrecursion(val) {
    LREnable = val;
}


// Return the library version string.

var function lp_version() {
    return VERSION;
}


// Return "pattern" when 'pat' is a tree pattern; nothing otherwise.

var function lp_type(pat) {
    if( ffi.istype(treepattern, pat) ) {
        return "pattern";
    }
}


// Store in tab[catname] a charset holding every byte 0..255 for which
// catfce returns non-zero.

var function createcat(tab, catname, catfce) {
    var t, set = newcharset();
    for( i = 0, 255 ) {
        if( catfce(i) != 0 ) {
            set[rshift(i, 5)] = bor(set[rshift(i, 5)], lshift(1, band(i, 31)));
        }
    }
    tab[catname] = t;
}


// Fill 'tab' (a new table when omitted) with one charset per C character
// class, using the ctype functions reached through ffi.C. Returns the table.

var function lp_locale(tab) {
    tab = tab || {};
    createcat(tab, "alnum", function(c) { return ffi.C.isalnum(c); });
    createcat(tab, "alpha", function(c) { return ffi.C.isalpha(c); });
    createcat(tab, "cntrl", function(c) { return ffi.C.iscntrl(c); });
    createcat(tab, "digit", function(c) { return ffi.C.isdigit(c); });
    createcat(tab, "graph", function(c) { return ffi.C.isgraph(c); });
    createcat(tab, "lower", function(c) { return ffi.C.islower(c); });
    createcat(tab, "print", function(c) { return ffi.C.isprint(c); });
    createcat(tab, "punct", function(c) { return ffi.C.ispunct(c); });
    createcat(tab, "space", function(c) { return ffi.C.isspace(c); });
    createcat(tab, "upper", function(c) { return ffi.C.isupper(c); });
    createcat(tab, "xdigit", function(c) { return ffi.C.isxdigit(c); });
    return tab;
}


// __new metamethod: allocate a pattern of 'size' nodes and give it a fresh id.

var function lp_new(ct, size) {
    var pat = ffi.new(ct, size);
    pat.treesize = size;
    ++patternid ;
    pat.id = patternid;
    return pat;
}


// __gc metamethod: drop the pattern's value table and free its compiled code.

var function lp_gc(ct) {
    valuetable[ct.id] = null;
    if( ct.code != null ) {
        ffi.C.free(ct.code.p);
        ffi.C.free(ct.code);
    }
}

// __eq metamethod: compares the tostring() forms of the two operands.

var function lp_eq(ct1, ct2) {
    return tostring(ct1) == tostring(ct2);
}

// Load a pattern from the string 'str' and register its value table.

var function lp_load(str, fcetab) {
    var pat, t = lpvm.load(str, fcetab, true);
    valuetable[pat.id] = t;
    return pat;
}

// Load a pattern from the file 'fname' and register its value table.

var function lp_loadfile(fname, fcetab) {
    var pat, t = lpvm.loadfile(fname, fcetab, true);
    valuetable[pat.id] = t;
    return pat;
}

// Serialize pattern 'ct' (compiling it first if needed) into a string:
// optional tree (when 'tree' is set), compiled code, then the value table.
// Raises an error for value types other than string, number, cdata, function.

var function lp_dump(ct, tree) {
    var funccount = 0;
    // not compiled yet?
    if( ct.code == null ) {
        prepcompile(ct, 0);
    }
    var out = {};
    if( tree ) {
        out[#out + 1] = ffi.string(uint32(ct.treesize), 4);
        out[#out + 1] = ffi.string(ct.p, ffi.sizeof(treepatternelement) * ct.treesize);
    } else {
        out[#out + 1] = ffi.string(uint32(0), 4); // tree size 0: no tree stored
    }
    out[#out + 1] = ffi.string(uint32(ct.code.size), 4);
    out[#out + 1] = ffi.string(ct.code.p, ct.code.size * ffi.sizeof(patternelement));
    var t = valuetable[ct.id];
    var len = t && #t || 0;
    out[#out + 1] = ffi.string(uint32(len), 4);
    if( len > 0 ) {
        for( _, val in ipairs(t) ) {
            var typ = type(val);
            if( typ == 'string' ) {
                out[#out + 1] = 'str';
                out[#out + 1] = ffi.string(uint32(#val), 4);
                out[#out + 1] = val;
            } else if( typ == 'number' ) {
                val = tostring(val);
                out[#out + 1] = 'num';
                out[#out + 1] = ffi.string(uint32(#val), 4);
                out[#out + 1] = val;
            } else if( typ == 'cdata' ) {
                out[#out + 1] = 'cdt';
                out[#out + 1] = ffi.string(val, ffi.sizeof(val));
            } else if( typ == 'function' ) {
                out[#out + 1] = 'fnc';
                ++funccount ;
                var name = funcnames[val] || ('FNAME%03d')->format(funccount);
                out[#out + 1] = ffi.string(uint32(#name), 4);
                out[#out + 1] = name;
                if( ! funcnames[val] && debug.getupvalue(val, 1) ) {
                    // warning only; io.write takes no 'level' argument, so the
                    // former trailing 0 was printed as a literal "0"
                    io.write(("Patterns function (%d) contains upvalue (%s) - use symbol name for function (%s).\n")->format(funccount, debug.getupvalue(val, 1), name));
                }
                var data = string.dump(val, true);
                out[#out + 1] = ffi.string(uint32(#data), 4);
                out[#out + 1] = data;
            } else {
                error(("Type '%s' NYI for dump")->format(typ), 0);
            }
        }
    }
    return table.concat(out);
}

// Write the dump of 'ct' to the file 'fname'.

var function lp_save(ct, fname, tree) {
    // build the dump first: if lp_dump raises, the target file is neither
    // truncated nor left open
    var data = lp_dump(ct, tree);
    var file = assert(io.open(fname, 'wb'));
    file->write(data);
    file->close();
}


// Public functions and operators reachable through a pattern (__index)
// and returned as the module table.
var pattreg = {
    ["ptree"] = lp_printtree,
    ["pcode"] = lp_printcode,
    ["match"] = lp_match,
    ["streammatch"] = lp_streammatch,
    ["emulatestreammatch"] = lp_emulatestreammatch,
    ["setmaxbehind"] = lp_setmaxbehind,
    ["B"] = lp_behind,
    ["V"] = lp_V,
    ["C"] = lp_simplecapture,
    ["Cc"] = lp_constcapture,
    ["Cmt"] = lp_matchtime,
    ["Cb"] = lp_backref,
    ["Carg"] = lp_argcapture,
    ["Cp"] = lp_poscapture,
    ["Cs"] = lp_substcapture,
    ["Ct"] = lp_tablecapture,
    ["Cf"] = lp_foldcapture,
    ["Cg"] = lp_groupcapture,
    ["P"] = lp_P,
    ["S"] = lp_set,
    ["R"] = lp_range,
    ["L"] = lp_and,
    ["locale"] = lp_locale,
    ["version"] = lp_version,
    ["setmaxstack"] = lp_setmax,
    ["type"] = lp_type,
    ["enableleftrecursion"] = lp_enableleftrecursion,
    ["enablememoization"] = lpvm.enablememoization,
    ["enabletracing"] = lpvm.enabletracing,
    ["save"] = lp_save,
    ["dump"] = lp_dump,
    ["load"] = lp_load,
    ["loadfile"] = lp_loadfile,
    ["__mul"] = lp_seq,
    ["__add"] = lp_choice,
    ["__pow"] = lp_star,
    ["__len"] = lp_and,
    ["__div"] = lp_divcapture,
    ["__unm"] = lp_not,
    ["__sub"] = lp_sub,
};

// Metatable installed on the treepattern ctype.
var metareg = {
    ["__gc"] = lp_gc,
    ["__new"] = lp_new,
    ["__mul"] = lp_seq,
    ["__add"] = lp_choice,
    ["__pow"] = lp_star,
    ["__len"] = lp_and,
    ["__div"] = lp_divcapture,
    ["__unm"] = lp_not,
    ["__sub"] = lp_sub,
    ["__eq"] = lp_eq,
    ["__index"] = pattreg
};

+ffi.metatype(treepattern, metareg); + +return pattreg; diff --git a/src/lpprint.ljs b/src/lpprint.ljs new file mode 100644 index 0000000..4d3afad --- /dev/null +++ b/src/lpprint.ljs @@ -0,0 +1,356 @@ +/* +LPEGLJ +lpprint.lua +Tree, code and debug print function (only for debuging) +Copyright (C) 2014 Rostislav Sacek. +based on LPeg v1.0 - PEG pattern matching for Lua +Lua.org & PUC-Rio written by Roberto Ierusalimschy +http://www.inf.puc-rio.br/~roberto/lpeg/ + +** Permission is hereby granted, free of charge, to any person obtaining +** a copy of this software and associated documentation files (the +** "Software"), to deal in the Software without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Software, and to +** permit persons to whom the Software is furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be +** included in all copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+** +** [ MIT license: http://www.opensource.org/licenses/mit-license.php ] +--*/ + +var ffi = require("ffi"); +var band, rshift, lshift = bit.band, bit.rshift, bit.lshift; + +ffi.cdef([=[ + int isprint ( int c ); +]=]); + +var RuleLR = 0x10000; +var Ruleused = 0x20000; + +// {====================================================== +// Printing patterns (for debugging) +// ======================================================= + +var TChar = 0; +var TSet = 1; +var TAny = 2; // standard PEG elements +var TTrue = 3; +var TFalse = 4; +var TRep = 5; +var TSeq = 6; +var TChoice = 7; +var TNot = 8; +var TAnd = 9; +var TCall = 10; +var TOpenCall = 11; +var TRule = 12; // sib1 is rule's pattern, sib2 is 'next' rule +var TGrammar = 13; // sib1 is initial (and first) rule +var TBehind = 14; // match behind +var TCapture = 15; // regular capture +var TRunTime = 16; // run-time capture + +var IAny = 0; // if no char, fail +var IChar = 1; // if char != aux, fail +var ISet = 2; // if char not in val, fail +var ITestAny = 3; // in no char, jump to 'offset' +var ITestChar = 4; // if char != aux, jump to 'offset' +var ITestSet = 5; // if char not in val, jump to 'offset' +var ISpan = 6; // read a span of chars in val +var IBehind = 7; // walk back 'aux' characters (fail if not possible) +var IRet = 8; // return from a rule +var IEnd = 9; // end of pattern +var IChoice = 10; // stack a choice; next fail will jump to 'offset' +var IJmp = 11; // jump to 'offset' +var ICall = 12; // call rule at 'offset' +var IOpenCall = 13; // call rule number 'offset' (must be closed to a ICall) +var ICommit = 14; // pop choice and jump to 'offset' +var IPartialCommit = 15; // update top choice to current position and jump +var IBackCommit = 16; // "fails" but jump to its own 'offset' +var IFailTwice = 17; // pop one choice and then fail +var IFail = 18; // go back to saved state on choice and jump to saved offset +var IGiveup = 19; // internal use +var IFullCapture = 20; // complete capture of last 
// 'off' chars
var IOpenCapture = 21; // start a capture
var ICloseCapture = 22;
var ICloseRunTime = 23;

// capture kinds
var Cclose = 0;
var Cposition = 1;
var Cconst = 2;
var Cbackref = 3;
var Carg = 4;
var Csimple = 5;
var Ctable = 6;
var Cfunction = 7;
var Cquery = 8;
var Cstring = 9;
var Cnum = 10;
var Csubst = 11;
var Cfold = 12;
var Cruntime = 13;
var Cgroup = 14;


// number of siblings for each tree
// (keyed by tag; tags that are absent have no siblings)
var numsiblings = {
    [TRep] = 1,
    [TSeq] = 2,
    [TChoice] = 2,
    [TNot] = 1,
    [TAnd] = 1,
    [TRule] = 2,
    [TGrammar] = 1,
    [TBehind] = 1,
    [TCapture] = 1,
    [TRunTime] = 1,
};
// printable name of each VM instruction
var names = {
    [IAny] = "any",
    [IChar] = "char",
    [ISet] = "set",
    [ITestAny] = "testany",
    [ITestChar] = "testchar",
    [ITestSet] = "testset",
    [ISpan] = "span",
    [IBehind] = "behind",
    [IRet] = "ret",
    [IEnd] = "end",
    [IChoice] = "choice",
    [IJmp] = "jmp",
    [ICall] = "call",
    [IOpenCall] = "open_call",
    [ICommit] = "commit",
    [IPartialCommit] = "partial_commit",
    [IBackCommit] = "back_commit",
    [IFailTwice] = "failtwice",
    [IFail] = "fail",
    [IGiveup] = "giveup",
    [IFullCapture] = "fullcapture",
    [IOpenCapture] = "opencapture",
    [ICloseCapture] = "closecapture",
    [ICloseRunTime] = "closeruntime"
};

// Print charset 'st' as a list of hexadecimal byte ranges, e.g. [(30-39)(5f)].
// The set is indexed as words st[0..7], one bit per byte value 0..255.

var function printcharset(st) {
    io.write("[");
    var i = 0;
    while( i <= 255 ) {
        var first = i;
        // test the bound first so that st[8] is never read once i reaches 256
        while( i <= 255 && band(st[rshift(i, 5)], lshift(1, band(i, 31))) != 0 ) {
            ++i ;
        }
        if( i - 1 == first ) { // unary range?
            io.write(("(%02x)")->format(first));
        } else if( i - 1 > first ) { // non-empty range?
            io.write(("(%02x-%02x)")->format(first, i - 1));
        }
        ++i ; // skip the byte that ended the run
    }
    io.write("]");
}

// printable name of each capture kind
var modes = {
    [Cclose] = "close",
    [Cposition] = "position",
    [Cconst] = "constant",
    [Cbackref] = "backref",
    [Carg] = "argument",
    [Csimple] = "simple",
    [Ctable] = "table",
    [Cfunction] = "function",
    [Cquery] = "query",
    [Cstring] = "string",
    [Cnum] = "num",
    [Csubst] = "substitution",
    [Cfold] = "fold",
    [Cruntime] = "runtime",
    [Cgroup] = "group"
};

// Print the name of capture kind 'kind'.

var function printcapkind(kind) {
    io.write(("%s")->format(modes[kind]));
}

// Print the absolute target of the relative jump stored at p[index].

var function printjmp(p, index) {
    io.write(("-> %d")->format(index + p[index].offset));
}

// Print the rule name recorded for the jump target of p[index], if any.

var function printrulename(p, index, rulenames) {
    if( rulenames && rulenames[index + p[index].offset] ) {
        io.write(' ', rulenames[index + p[index].offset]);
    }
}

// Print instruction p[index] on one line, preceded by the rule name that
// starts at this address when 'rulenames' has one.

var function printinst(p, index, valuetable, rulenames) {
    var code = p[index].code;
    if( rulenames && rulenames[index] ) {
        io.write(rulenames[index], '\n');
    }
    io.write(("%04d: %s ")->format(index, names[code]));
    if( code == IChar ) {
        io.write(("'%s'")->format(string.char(p[index].val)));
    } else if( code == ITestChar ) {
        io.write(("'%s'")->format(string.char(p[index].val)));
        printjmp(p, index);
        printrulename(p, index, rulenames);
    } else if( code == IFullCapture ) {
        // low 4 bits of 'val' are the capture kind, the next 4 bits the size
        printcapkind(band(p[index].val, 0x0f));
        io.write((" (size = %d)  (idx = %s)")->format(band(rshift(p[index].val, 4), 0xF), tostring(valuetable[p[index].offset])));
    } else if( code == IOpenCapture ) {
        printcapkind(band(p[index].val, 0x0f));
        io.write((" (idx = %s)")->format(tostring(valuetable[p[index].offset])));
    } else if( code == ISet ) {
        printcharset(valuetable[p[index].val]);
    } else if( code == ITestSet ) {
        printcharset(valuetable[p[index].val]);
        printjmp(p, index);
        printrulename(p, index, rulenames);
    } else if( code == ISpan ) {
        printcharset(valuetable[p[index].val]);
    } else if( code == IOpenCall ) {
        io.write(("-> %d")->format(p[index].offset));
    } else if( code == IBehind ) {
        io.write(("%d")->format(p[index].val));
    } else if( code == IJmp || code == ICall || code == ICommit || code == IChoice ||
            code == IPartialCommit || code == IBackCommit || code == ITestAny ) {
        printjmp(p, index);
        if( (code == ICall || code == IJmp) && p[index].aux > 0 ) {
            io.write(' ', valuetable[p[index].aux]);
        } else {
            printrulename(p, index, rulenames);
        }
    }
    io.write("\n");
}


// Print every instruction of compiled pattern 'p'. A first pass maps the
// target address of each named call/jump to its rule name.

var function printpatt(p, valuetable) {
    var ruleNames = {};
    for( i = 0, p.size - 1 ) {
        var code = p.p[i].code;
        if( (code == ICall || code == IJmp) && p.p[i].aux > 0 ) {
            var index = i + p.p[i].offset;
            ruleNames[index] = valuetable[p.p[i].aux];
        }
    }
    for( i = 0, p.size - 1 ) {
        printinst(p.p, i, valuetable, ruleNames);
    }
}


// Print one capture record.

var function printcap(cap, index, valuetable) {
    printcapkind(cap[index].kind);
    io.write((" (idx: %s - size: %d) -> %d\n")->format(valuetable[cap[index].idx], cap[index].siz, cap[index].s));
}


// Print the capture list up to (not including) 'limit'.

var function printcaplist(cap, limit, valuetable) {
    io.write(">======\n");
    var index = 0;
    // test the limit first so that cap[limit] is never read
    while( index < limit && cap[index].s ) {
        printcap(cap, index, valuetable);
        ++index ;
    }
    io.write("=======\n");
}

// ======================================================



// {======================================================
// Printing trees (for debugging)
// =======================================================

// printable name of each tree tag
var tagnames = {
    [TChar] = "char",
    [TSet] = "set",
    [TAny] = "any",
    [TTrue] = "true",
    [TFalse] = "false",
    [TRep] = "rep",
    [TSeq] = "seq",
    [TChoice] = "choice",
    [TNot] = "not",
    [TAnd] = "and",
    [TCall] = "call",
    [TOpenCall] = "opencall",
    [TRule] = "rule",
    [TGrammar] = "grammar",
    [TBehind] = "behind",
    [TCapture] = "capture",
    [TRunTime] = "run-time"
};


// Print the subtree rooted at tree[index], indented by 'ident' spaces;
// children are printed two columns deeper.

var function printtree(tree, ident, index, valuetable) {
    for( i = 1, ident ) {
        io.write(" ");
    }
    var tag = tree[index].tag;
    io.write(("%s")->format(tagnames[tag]));
    if( tag == TChar ) {
        var c = tree[index].val;
        if( ffi.C.isprint(c) ) {
            io.write((" '%c'\n")->format(c));
        } else {
            io.write((" (%02X)\n")->format(c));
        }
    } else if( tag == TSet ) {
        printcharset(valuetable[tree[index].val]);
        io.write("\n");
    } else if( tag == TOpenCall || tag == TCall ) {
        io.write((" key: %s\n")->format(tostring(valuetable[tree[index].val])));
    } else if( tag == TBehind ) {
        io.write((" %d\n")->format(tree[index].val));
        printtree(tree, ident + 2, index + 1, valuetable);
    } else if( tag == TCapture ) {
        io.write((" cap: %s   n: %s\n")->format(modes[bit.band(tree[index].cap, 0xffff)], valuetable[tree[index].val]));
        printtree(tree, ident + 2, index + 1, valuetable);
    } else if( tag == TRule ) {
        var extra = bit.band(tree[index].cap, RuleLR) == RuleLR && ' left recursive' || '';
        extra = extra .. (bit.band(tree[index].cap, Ruleused) != Ruleused && ' not used' || '');
        io.write((" n: %d  key: %s%s\n")->format(bit.band(tree[index].cap, 0xffff) - 1, valuetable[tree[index].val], extra));
        printtree(tree, ident + 2, index + 1, valuetable);
        // do not print next rule as a sibling
    } else if( tag == TGrammar ) {
        var ruleindex = index + 1;
        io.write((" %d\n")->format(tree[index].val)); // number of rules
        for( i = 1, tree[index].val ) {
            printtree(tree, ident + 2, ruleindex, valuetable);
            ruleindex += tree[ruleindex].ps;
        }
        assert(tree[ruleindex].tag == TTrue); // sentinel
    } else {
        var sibs = numsiblings[tree[index].tag] || 0;
        io.write("\n");
        if( sibs >= 1 ) {
            printtree(tree, ident + 2, index + 1, valuetable);
            if( sibs >= 2 ) {
                printtree(tree, ident + 2, index + tree[index].ps, valuetable);
            }
        }
    }
}

// }====================================================== */

return {
    printtree = printtree,
    printpatt = printpatt,
    printcaplist = printcaplist,
    printinst = printinst
};
diff --git a/src/lpvm.ljs b/src/lpvm.ljs new file mode 100644 index 0000000..f478c85 --- /dev/null +++ b/src/lpvm.ljs @@ -0,0
+1,1039 @@ +/* +LPEGLJ +lpvm.lua +Virtual machine +Copyright (C) 2014 Rostislav Sacek. +based on LPeg v1.0 - PEG pattern matching for Lua +Lua.org & PUC-Rio written by Roberto Ierusalimschy +http://www.inf.puc-rio.br/~roberto/lpeg/ + +** Permission is hereby granted, free of charge, to any person obtaining +** a copy of this software and associated documentation files (the +** "Software"), to deal in the Software without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Software, and to +** permit persons to whom the Software is furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be +** included in all copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +** +** [ MIT license: http://www.opensource.org/licenses/mit-license.php ] +--*/ + +var ffi = require ("ffi"); +var lpcap = require ("lpcap"); +/* Only for debug purpose +var lpprint = require("lpprint"); +--*/ + +var band, rshift, lshift = bit.band, bit.rshift, bit.lshift; + +// {====================================================== +// Virtual Machine +// ======================================================= + +// Interpret the result of a dynamic capture: false -> fail; +// true -> keep current position; number -> next position. +// Return new subject position. 
'fr' is stack index where +// is the result; 'curr' is current subject position; 'limit' +// is subject's size. +var MAXBEHINDPREDICATE = 255; // max behind for Look-behind predicate +var MAXOFF = 0xF; // maximum for full capture +var MAXBEHIND = math.max(MAXBEHINDPREDICATE, MAXOFF); // maximum before current pos +var INITBACK = 400; // default maximum size for call/backtrack stack + +var IAny = 0; // if no char, fail +var IChar = 1; // if char != val, fail +var ISet = 2; // if char not in val, fail +var ITestAny = 3; // in no char, jump to 'offset' +var ITestChar = 4; // if char != val, jump to 'offset' +var ITestSet = 5; // if char not in val, jump to 'offset' +var ISpan = 6; // read a span of chars in val +var IBehind = 7; // walk back 'val' characters (fail if not possible) +var IRet = 8; // return from a rule +var IEnd = 9; // end of pattern +var IChoice = 10; // stack a choice; next fail will jump to 'offset' +var IJmp = 11; // jump to 'offset' +var ICall = 12; // call rule at 'offset' +var IOpenCall = 13; // call rule number 'offset' (must be closed to a ICall) +var ICommit = 14; // pop choice and jump to 'offset' +var IPartialCommit = 15; // update top choice to current position and jump +var IBackCommit = 16; // "fails" but jump to its own 'offset' +var IFailTwice = 17; // pop one choice and then fail +var IFail = 18; // go back to saved state on choice and jump to saved offset +var IGiveup = 19; // internal use +var IFullCapture = 20; // complete capture of last 'off' chars +var IOpenCapture = 21; // start a capture +var ICloseCapture = 22; +var ICloseRunTime = 23; + +var Cclose = 0; +var Cposition = 1; +var Cconst = 2; +var Cbackref = 3; +var Carg = 4; +var Csimple = 5; +var Ctable = 6; +var Cfunction = 7; +var Cquery = 8; +var Cstring = 9; +var Cnum = 10; +var Csubst = 11; +var Cfold = 12; +var Cruntime = 13; +var Cgroup = 14; + +var BCapcandelete = 0x30000; +var maxstack = INITBACK; +var maxcapturedefault = 100; +var maxmemo = 1000; +var usememoization 
= false; +var trace = false; + +var FAIL = -1; +var LRFAIL = -1; +var VOID = -2; +var CHOICE = -3; +var CALL = -4; + +ffi.cdef ([=[ +typedef struct { + int code; + int val; + int offset; + int aux; + } PATTERN_ELEMENT; +typedef struct { + int allocsize; + int size; + PATTERN_ELEMENT *p; + } PATTERN; +typedef struct { + int tag; + int val; + int ps; + int cap; + } TREEPATTERN_ELEMENT; +typedef struct { + int id; + int treesize; + PATTERN *code; + TREEPATTERN_ELEMENT p[?]; + } TREEPATTERN; + +typedef struct { + double s; + double X; + double memos; + int p; + int caplevel; + int pA; + int valuetabletop; + } STACK; + +typedef struct { + double s; + int siz; + int idx; + int kind; + int candelete; + } CAPTURE; + +void *malloc( size_t size ); +void free( void *memblock ); +void *realloc( void *memblock, size_t size ); +]=]); + +var treepatternelement = ffi.typeof('TREEPATTERN_ELEMENT'); +var treepattern = ffi.typeof('TREEPATTERN'); +var patternelement = ffi.typeof('PATTERN_ELEMENT'); +var pattern = ffi.typeof('PATTERN'); +var settype = ffi.typeof('int32_t[8]'); + +var function resdyncaptures(fr, curr, limit, checkstreamlen) { + var typ = type(fr); + // false value? + if( ! fr ) { + return FAIL; // and fail + } else if( typ == 'boolean' ) { + // true? + return curr; // keep current position + } else { + var res = fr; // new position + if( res < curr || (limit && res > limit) || (! limit && checkstreamlen && ! checkstreamlen(res - 2)) ) { + error("invalid position returned by match-time capture", 0); + } + return res; + } + assert(false); +} + + +// Add capture values returned by a dynamic capture to the capture list +// 'base', nested inside a group capture. 'fd' indexes the first capture +// value, 'n' is the number of values (at least 1). 
+ +var function adddyncaptures(s, base, index, n, fd, valuetable) { + // Cgroup capture is already there + assert(base[index].kind == Cgroup && base[index].siz == 0); + base[index].idx = 0; // make it an anonymous group + base[index + 1] = {}; + // add runtime captures + for( i = 1, n ) { + base[index + i].kind = Cruntime; + base[index + i].siz = 1; // mark it as closed + var ind = #valuetable + 1; + valuetable[ind] = fd[i + 1]; + base[index + i].idx = ind; // stack index of capture value + base[index + i].s = s; + base[index + i + 1] = {}; + } + base[index + n + 1].kind = Cclose; // close group + base[index + n + 1].siz = 1; + base[index + n + 1].s = s; + base[index + n + 2] = {}; +} + + +// Opcode interpreter + +var function match(stream, last, o, s, op, valuetable, ...) { + var arg = { ... }; + var argcount = select('#', ...); + var len = #o; + var ptr = ffi.cast('const unsigned char*', o); + --s ; + var stackptr = 0; // point to first empty slot in stack + var captop = 0; // point to first empty slot in captures + var STACK = ffi.new("STACK[?]", INITBACK); + var CAPTURE = ffi.new("CAPTURE[?]", maxcapturedefault); + var CAPTURESTACK = { { capture = CAPTURE, captop = captop, maxcapture = maxcapturedefault } }; + var capturestackptr = #CAPTURESTACK; + var maxcapture = maxcapturedefault; + var stacklimit = INITBACK; + var L = {}; + var Memo1, Memo2 = {}, {}; + var memoind = 0; + var maxpointer = 2 ** math.ceil(math.log(op.size) / math.log(2)); + var nocapturereleased = true; + + var p = 0; // current instruction + var streambufsize = 2 ** 8; + var streambufsizemask = streambufsize - 1; // faster modulo + var streambufs = {}; + var streambufoffset = 0; + var streamstartbuffer = 0; + var streambufferscount = 0; + var level = -1; + + var function deletestreambuffers() { + var min = s; + for( i = stackptr - 1, 0, -1 ) { + var val = STACK[i].s; + if( val >= 0 ) { + min = math.min(val, min); + } + } + + for( i = captop - 1, 0, -1 ) { + var val = CAPTURE[i].s; + if( val 
>= 0 ) { + min = math.min(val, min); + } + } + for( i = streamstartbuffer + 1, streambufoffset - streambufsize, streambufsize ) { + // max behind for full capture and max behind for Look-behind predicate + if( i + streambufsize + MAXBEHIND < min ) { + streambufs[i] = null; + --streambufferscount ; + } else { + streamstartbuffer = i - 1; + break; + } + } + } + + var function addstreamdata(s, last) { + var len = #s; + var srcoffset = 0; + if( streambufferscount > 128 ) { + deletestreambuffers(); + } + do { + var offset = bit.band(streambufoffset, streambufsizemask); + if( offset > 0 ) { + var index = streambufoffset - offset + 1; + var count = math.min(len, streambufsize - offset); + ffi.copy(streambufs[index] + offset, s->sub(srcoffset + 1, srcoffset + 1 + count), count); + len -= count; + srcoffset += count; + streambufoffset += count; + } + if( len > 0 ) { + var index = streambufoffset - (bit.band(streambufoffset, streambufsizemask)) + 1; + var buf = ffi.new('unsigned char[?]', streambufsize); + ++streambufferscount ; + streambufs[index] = buf; + var count = math.min(len, streambufsize); + ffi.copy(buf, s->sub(srcoffset + 1, srcoffset + 1 + count), count); + len -= count; + srcoffset += count; + streambufoffset += count; + } + if( streambufoffset >= 2 ** 47 ) { + error("too big input stream", 0); + } + } while(!( len == 0) ); + } + + var function getstreamchar(s) { + var offset = bit.band(s, streambufsizemask); + var index = s - offset + 1; + return streambufs[index][offset]; + } + + var checkstreamlen; + + var function getstreamstring(st, en) { + // TODO Optimalize access + var str = {}; + var i = st >= 0 && st || 1; + var to = en >= 0 && en || math.huge; + while( true ) { + if( i > to ) { break; } + if( ! 
checkstreamlen(i - 1) ) { return; } + if( last && (st < 0 || en < 0) ) { + for( j = i, streambufoffset ) { + str[#str + 1] = string.char(getstreamchar(j - 1)); + } + en = en < 0 && streambufoffset + en + 1 || en; + en = st > 0 && en - st + 1 || en; + st = st < 0 && streambufoffset + st + 1 || 1; + return table.concat(str)->sub(st, en); + } else { + str[#str + 1] = string.char(getstreamchar(i - 1)); + ++i ; + } + } + return table.concat(str); + } + + function checkstreamlen(index) { + var str; + while( true ) { + if( index < streambufoffset ) { + return true; + } else { + if( last ) { + s = streambufoffset; + return false; + } + var max = captop; + for( i = stackptr - 1, 0, -1 ) { + var val = STACK[i].X == CHOICE && STACK[i].caplevel || -1; + if( val >= 0 ) { + max = math.min(val, max); + } + } + var n, out, outindex = lpcap.getcapturesruntime(CAPTURE, null, getstreamstring, false, 0, max, captop, valuetable, unpack(arg, 1, argcount)); + if( n > 0 ) { + for( i = stackptr - 1, 0, -1 ) { + var val = STACK[i].caplevel; + if( val > 0 ) { + STACK[i].caplevel = STACK[i].caplevel - n; + } + } + captop -= n; + } + if( outindex > 0 ) { + nocapturereleased = false; + } + str, last = coroutine.yield(1, unpack(out, 1, outindex)); + addstreamdata(str); + } + } + } + + var function doublecapture() { + maxcapture *= 2; + var NEWCAPTURE = ffi.new("CAPTURE[?]", maxcapture); + ffi.copy(NEWCAPTURE, CAPTURE, ffi.sizeof('CAPTURE') * captop); + CAPTURE = NEWCAPTURE; + CAPTURESTACK[capturestackptr].capture = CAPTURE; + CAPTURESTACK[capturestackptr].maxcapture = maxcapture; + } + + var function pushcapture() { + CAPTURE[captop].idx = op.p[p].offset; + CAPTURE[captop].kind = band(op.p[p].val, 0x0f); + CAPTURE[captop].candelete = band(op.p[p].val, BCapcandelete) != 0 && 1 || 0; + ++captop ; + ++p ; + if( captop >= maxcapture ) { + doublecapture(); + } + } + + var function traceenter(typ, par) { + level = level + (par || 0); + io.write(('%s+%s %s\n')->format((' ')->rep(level), typ, 
valuetable[op.p[p].aux])); + } + + var function traceleave(inst) { + io.write(('%s- %s\n')->format((' ')->rep(level), valuetable[op.p[inst].aux])); + --level ; + } + + var function tracematch(typ, start, par, from, to, inst, extra, ...) { + var n, caps, capscount = lpcap.getcapturesruntime(CAPTURE, o, getstreamstring, true, start, captop, captop, valuetable, ...); + var capstr = {}; + for( i = 1, capscount ) { capstr[i] = tostring(caps[i]); } + extra = extra && '(' .. extra .. ')' || ''; + io.write(('%s=%s %s%s %s %s \n')->format((' ')->rep(level), typ, valuetable[op.p[inst].aux], extra, + o && o->sub(from, to) || getstreamstring(from, to), table.concat(capstr, " "))); + level -= par; + } + + var function fail() { + // pattern failed: try to backtrack + var X; + do { // remove pending calls + --stackptr ; + if( stackptr == -1 ) { + p = FAIL; + return; + } + s = STACK[stackptr].s; + X = STACK[stackptr].X; + if( usememoization && X == CALL && STACK[stackptr].memos != VOID ) { + Memo1[STACK[stackptr].pA + STACK[stackptr].memos * maxpointer] = FAIL; + Memo2[STACK[stackptr].pA + STACK[stackptr].memos * maxpointer] = FAIL; + } + // lvar.2 rest + if( X == LRFAIL ) { + CAPTURESTACK[capturestackptr] = null; + --capturestackptr ; + CAPTURE = CAPTURESTACK[capturestackptr].capture; + maxcapture = CAPTURESTACK[capturestackptr].maxcapture; + L[STACK[stackptr].pA + s * maxpointer] = null; + } + if( trace && (X == CALL || X == LRFAIL) ) { traceleave(STACK[stackptr].p - 1); } + } while(!( X == CHOICE || X >= 0) ); + p = STACK[stackptr].p; + for( i = #valuetable, STACK[stackptr].valuetabletop + 1, -1 ) { + table.remove(valuetable); + } + // inc.2 + if( X >= 0 ) { + s = X; + --capturestackptr ; + CAPTURE = CAPTURESTACK[capturestackptr].capture; + captop = CAPTURESTACK[capturestackptr].captop; + maxcapture = CAPTURESTACK[capturestackptr].maxcapture; + var capture = L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer].capturecommit; + while( captop + capture.captop >= maxcapture ) { 
+ doublecapture(); + } + ffi.copy(CAPTURE + captop, capture.capture, capture.captop * ffi.sizeof('CAPTURE')); + captop = captop + capture.captop; + if( trace ) { tracematch('', captop - capture.captop, 1, STACK[stackptr].s + 1, s, STACK[stackptr].p - 1, L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer].level, unpack(arg, 1, argcount)); } + CAPTURESTACK[capturestackptr + 1] = null; + L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer] = null; + } else { + captop = STACK[stackptr].caplevel; + } + } + + var function doublestack() { + if( stackptr >= maxstack ) { + error(("backtrack stack overflow (current limit is %d)")->format(maxstack), 0); + } + stacklimit *= 2; + stacklimit = (stacklimit > maxstack) && maxstack || stacklimit; + var NEWSTACK = ffi.new("STACK[?]", stacklimit); + ffi.copy(NEWSTACK, STACK, ffi.sizeof('STACK') * stackptr); + STACK = NEWSTACK; + } + + + if( stream ) { + addstreamdata(o); + len = null; + o = null; + ptr = null; + } + while( true ) { + /* Only for debug + io.write(("s: |%s| stck:%d, caps:%d \n"):format(s + 1, stackptr, captop)) + if p ~= FAIL then + lpprint.printinst(op.p, p, valuetable) + lpprint.printcaplist(CAPTURE, captop, valuetable) + end + --*/ + if( p == FAIL ) { return -1; } + var code = op.p[p].code; + if( code == IEnd ) { + CAPTURE[captop].kind = Cclose; + CAPTURE[captop].s = -1; + return 0, lpcap.getcaptures(CAPTURE, o, getstreamstring, nocapturereleased && s + 1, valuetable, ...); + } else if( code == IRet ) { + if( STACK[stackptr - 1].X == CALL ) { + --stackptr ; + if( trace ) { tracematch('', STACK[stackptr].caplevel, 1, STACK[stackptr].s + 1, s, STACK[stackptr].p - 1, null, ...); } + p = STACK[stackptr].p; + if( usememoization && STACK[stackptr].memos != VOID ) { + var dif = captop - STACK[stackptr].caplevel; + var caps; + if( dif > 0 ) { + caps = ffi.new("CAPTURE[?]", dif); + ffi.copy(caps, CAPTURE + captop - dif, dif * ffi.sizeof('CAPTURE')); + } + var val = { s, dif, caps }; + Memo1[STACK[stackptr].pA + 
STACK[stackptr].memos * maxpointer] = val; + Memo2[STACK[stackptr].pA + STACK[stackptr].memos * maxpointer] = val; + } + } else { + var X = STACK[stackptr - 1].X; + // lvar.1 inc.1 + if( X == LRFAIL || s > X ) { + if( trace ) { tracematch('IB', 0, 0, STACK[stackptr - 1].s + 1, s, STACK[stackptr - 1].p - 1, L[STACK[stackptr - 1].pA + STACK[stackptr - 1].s * maxpointer].level + 1, ...); } + STACK[stackptr - 1].X = s; + p = STACK[stackptr - 1].pA; + s = STACK[stackptr - 1].s; + var lambda = L[p + s * maxpointer]; + lambda.level = lambda.level + 1; + lambda.X = STACK[stackptr - 1].X; + STACK[stackptr - 1].caplevel = captop; + STACK[stackptr - 1].valuetabletop = #valuetable; + CAPTURESTACK[capturestackptr].captop = captop; + lambda.capturecommit = CAPTURESTACK[capturestackptr]; + captop = 0; + CAPTURE = ffi.new("CAPTURE[?]", maxcapturedefault); + CAPTURESTACK[capturestackptr] = { capture = CAPTURE, captop = captop, maxcapture = maxcapturedefault }; + maxcapture = maxcapturedefault; + } else { + // inc.3 + --stackptr ; + p = STACK[stackptr].p; + s = STACK[stackptr].X; + for( i = #valuetable, STACK[stackptr].valuetabletop + 1, -1 ) { + table.remove(valuetable); + } + var lambda = L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer]; + --capturestackptr ; + CAPTURE = CAPTURESTACK[capturestackptr].capture; + captop = CAPTURESTACK[capturestackptr].captop; + maxcapture = CAPTURESTACK[capturestackptr].maxcapture; + var capture = lambda.capturecommit; + while( captop + capture.captop >= maxcapture ) { + doublecapture(); + } + ffi.copy(CAPTURE + captop, capture.capture, capture.captop * ffi.sizeof('CAPTURE')); + captop = captop + capture.captop; + if( trace ) { tracematch('', captop - capture.captop, 1, STACK[stackptr].s + 1, s, STACK[stackptr].p - 1, L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer].level, ...); } + CAPTURESTACK[capturestackptr + 1] = null; + L[STACK[stackptr].pA + STACK[stackptr].s * maxpointer] = null; + } + } + } else if( code == IBehind ) { + var n = 
op.p[p].val; + if( n > s ) { + fail(); + } else { + s -= n; + ++p ; + } + } else if( code == IJmp ) { + if( trace && op.p[p].aux != 0 ) { traceenter('TC'); } + p = p + op.p[p].offset; + } else if( code == IChoice ) { + if( stackptr == stacklimit ) { + doublestack(); + } + STACK[stackptr].X = CHOICE; + STACK[stackptr].p = p + op.p[p].offset; + STACK[stackptr].s = s; + STACK[stackptr].caplevel = captop; + STACK[stackptr].valuetabletop = #valuetable; + ++stackptr ; + ++p ; + } else if( code == ICall ) { + if( stackptr == stacklimit ) { + doublestack(); + } + var k = bit.band(op.p[p].val, 0xffff); + if( k == 0 ) { + var pA = p + op.p[p].offset; + var memo = Memo1[pA + s * maxpointer]; + if( usememoization && memo ) { + if( trace ) { traceenter('M', 1); } + if( memo == FAIL ) { + if( trace ) { traceleave(p); } + fail(); + } else { + var dif = memo[2]; + if( dif > 0 ) { + while( captop + dif >= maxcapture ) { + doublecapture(); + } + var caps = memo[3]; + ffi.copy(CAPTURE + captop, caps, dif * ffi.sizeof('CAPTURE')); + captop += dif; + } + if( trace ) { tracematch('M', captop - dif, 1, s + 1, memo[1], p, null, ...); } + s = memo[1]; + ++p ; + } + } else { + if( trace ) { traceenter('', 1); } + STACK[stackptr].X = CALL; + STACK[stackptr].s = s; + STACK[stackptr].p = p + 1; // save return address + STACK[stackptr].pA = pA; + STACK[stackptr].memos = s; + STACK[stackptr].caplevel = captop; + ++stackptr ; + p = pA; + if( usememoization && ! memo ) { + ++memoind ; + if( memoind > maxmemo ) { + memoind = 0; + Memo1 = Memo2; + Memo2 = {}; + } + } + } + } else { + var pA = p + op.p[p].offset; + var X = L[pA + s * maxpointer]; + // lvar.1 lvar.2 + if( ! 
X ) { + if( trace ) { traceenter('', 1); } + CAPTURESTACK[capturestackptr].captop = captop; + var capture = ffi.new("CAPTURE[?]", maxcapturedefault); + ++capturestackptr ; + CAPTURESTACK[capturestackptr] = { capture = capture, captop = captop, maxcapture = maxcapturedefault }; + CAPTURE = capture; + maxcapture = maxcapturedefault; + captop = 0; + L[pA + s * maxpointer] = { X = LRFAIL, k = k, cs = capturestackptr, level = 0 }; + STACK[stackptr].p = p + 1; + STACK[stackptr].pA = pA; + STACK[stackptr].s = s; + STACK[stackptr].X = LRFAIL; + ++stackptr ; + p = pA; + } else if( X.X == LRFAIL || k < X.k ) { + // lvar.3 lvar.5 + fail(); + } else { + // lvar.4 + var capture = X.capturecommit; + while( captop + capture.captop >= maxcapture ) { + doublecapture(); + } + ffi.copy(CAPTURE + captop, capture.capture, capture.captop * ffi.sizeof('CAPTURE')); + captop += capture.captop; + ++p; + s = X.X; + } + } + } else if( code == ICommit ) { + --stackptr; + p += op.p[p].offset; + } else if( code == IPartialCommit ) { + STACK[stackptr - 1].s = s; + STACK[stackptr - 1].caplevel = captop; + STACK[stackptr - 1].valuetabletop = #valuetable; + p += op.p[p].offset; + } else if( code == IBackCommit ) { + --stackptr ; + s = STACK[stackptr].s; + captop = STACK[stackptr].caplevel; + for( i = #valuetable, STACK[stackptr].valuetabletop + 1, -1 ) { + table.remove(valuetable); + } + p += op.p[p].offset; + } else if( code == IFailTwice ) { + --stackptr ; + fail(); + } else if( code == IFail ) { + fail(); + } else if( code == ICloseRunTime ) { + // invalidate memo + for( i = 0, stackptr - 1 ) { + STACK[i].memos = VOID; + } + var cs = {}; + cs.s = o; + cs.stream = getstreamstring; + cs.ocap = CAPTURE; + cs.ptop = arg; + cs.ptopcount = argcount; + var out = { outindex = 0, out = {} }; + var n = lpcap.runtimecap(cs, captop, s + 1, out, valuetable); // call function + captop -= n; + var res = resdyncaptures(out.out[1], s + 1, len && len + 1, checkstreamlen); // get result + // fail? 
+ if( res == FAIL ) { + fail(); + } else { + s = res - 1; // else update current position + n = out.outindex - 1; // number of new captures + // any new capture? + if( n > 0 ) { + ++captop ; + while( captop + n + 1 >= maxcapture ) { + doublecapture(); + } + captop = captop + n + 1; + // add new captures to 'capture' list + adddyncaptures(s + 1, CAPTURE, captop - n - 2, n, out.out, valuetable); + } + ++p ; + } + } else if( code == ICloseCapture ) { + var s1 = s + 1; + assert(captop > 0); + // if possible, turn capture into a full capture + if( CAPTURE[captop - 1].siz == 0 && + s1 - CAPTURE[captop - 1].s < 255 ) { + CAPTURE[captop - 1].siz = s1 - CAPTURE[captop - 1].s + 1; + ++p ; + } else { + CAPTURE[captop].siz = 1; + CAPTURE[captop].s = s + 1; + pushcapture(); + } + } else if( code == IOpenCapture ) { + CAPTURE[captop].siz = 0; + CAPTURE[captop].s = s + 1; + pushcapture(); + } else if( code == IFullCapture ) { + CAPTURE[captop].siz = band(rshift(op.p[p].val, 4), 0x0F) + 1; // save capture size + CAPTURE[captop].s = s + 1 - band(rshift(op.p[p].val, 4), 0x0F); + pushcapture(); + // standard mode + } else if( o ) { + if( code == IAny ) { + if( s < len ) { + ++p ; + ++s ; + } else { + fail(); + } + } else if( code == ITestAny ) { + if( s < len ) { + ++p ; + } else { + p = p + op.p[p].offset; + } + } else if( code == IChar ) { + if( s < len && ptr[s] == op.p[p].val ) { + ++p ; + ++s ; + } else { + fail(); + } + } else if( code == ITestChar ) { + if( s < len && ptr[s] == op.p[p].val ) { + ++p ; + } else { + p = p + op.p[p].offset; + } + } else if( code == ISet ) { + var c = ptr[s]; + var set = valuetable[op.p[p].val]; + if( s < len && band(set[rshift(c, 5)], lshift(1, band(c, 31))) != 0 ) { + ++p ; + ++s ; + } else { + fail(); + } + } else if( code == ITestSet ) { + var c = ptr[s]; + var set = valuetable[op.p[p].val]; + if( s < len && band(set[rshift(c, 5)], lshift(1, band(c, 31))) != 0 ) { + ++p ; + } else { + p = p + op.p[p].offset; + } + } else if( code == ISpan ) { 
// Translate the user supplied start position into a 0-based offset.
//   len - length of the subject
//   pos - 1-based position; values <= 0 are taken relative to the end of
//         the subject; defaults to 1 when omitted
// Returns the offset, cropped to the range 0 .. len.
var function initposition(len, pos) {
    var start = pos || 1;
    if( start > 0 ) {
        // counted from the beginning: convert to 0-base or crop at the end
        return start <= len && start - 1 || len;
    }
    // counted from the end: convert or crop at the beginning
    return -start <= len && len + start || 0;
}
{ + init = initposition(s->len(), init) + 1; + var fce = lp_streammatch(pat, init, valuetable, ...); + var ret, count = {}, 0; + for( j = 1, #s ) { + var pcount, pret = retcount(fce(s->sub(j, j), j == #s)); // one char + if( pret[1] == -1 ) { + return; // fail + } else if( pret[1] == 0 ) { + // parsing finished + // collect result + for( i = 2, pcount ) { + ret[count + i - 1] = pret[i]; + } + count = count + pcount - 1; + return unpack(ret, 1, count); + } + for( i = 2, pcount ) { + ret[count + i - 1] = pret[i]; + } + count = count + pcount - 1; + } + return select(2, fce(s, true)); // empty string +} + +var function lp_load(str, fcetab, usemeta) { + var index = 0; + assert(str); + var ptr = ffi.cast('const char*', str); + var patsize = ffi.cast('uint32_t*', ptr + index)[0]; + index += 4; + var len = ffi.sizeof(treepatternelement) * patsize; + + var pat; + if( usemeta ) { + pat = treepattern(patsize); + } else { + pat = ffi.gc(ffi.cast('TREEPATTERN*', ffi.C.malloc(ffi.sizeof(treepattern, patsize))), + function(ct) { + if( ct.code != null ) { + ffi.C.free(ct.code.p); + ffi.C.free(ct.code); + } + ffi.C.free(ct); + }); + ffi.fill(pat, ffi.sizeof(treepattern, patsize)); + pat.treesize = patsize; + pat.id = 0; + } + ffi.copy(pat.p, ptr + index, len); + index += len; + if( usemeta ) { + pat.code = pattern(); + } else { + pat.code = ffi.cast('PATTERN*', ffi.C.malloc(ffi.sizeof(pattern))); + assert(pat.code != null); + pat.code.allocsize = 10; + pat.code.size = 0; + pat.code.p = ffi.C.malloc(ffi.sizeof(patternelement) * pat.code.allocsize); + assert(pat.code.p != null); + ffi.fill(pat.code.p, ffi.sizeof(patternelement) * pat.code.allocsize); + } + pat.code.size = ffi.cast('uint32_t*', ptr + index)[0]; + index += 4; + len = pat.code.size * ffi.sizeof(patternelement); + var data = ffi.string(ptr + index, len); + index += len; + var count = ffi.cast('uint32_t*', ptr + index)[0]; + index += 4; + var valuetable = {}; + for( i = 1, count ) { + var tag = ffi.string(ptr + index, 
3); + index = index + 3; + //string + if( tag == 'str' ) { + len = ffi.cast('uint32_t*', ptr + index)[0]; + index += 4; + var val = ffi.string(ptr + index, len); + index += len; + valuetable[#valuetable + 1] = val; + } else if( tag == 'num' ) { + //number + len = ffi.cast('uint32_t*', ptr + index)[0]; + index += 4; + var val = ffi.string(ptr + index, len); + index += len; + valuetable[#valuetable + 1] = tonumber(val); + } else if( tag == 'cdt' ) { + //ctype + var val = settype(); + ffi.copy(val, ptr + index, ffi.sizeof(settype)); + index = index + ffi.sizeof(settype); + valuetable[#valuetable + 1] = val; + } else if( tag == 'fnc' ) { + //function + len = ffi.cast('uint32_t*', ptr + index)[0]; + index += 4; + var fname = ffi.string(ptr + index, len); + index += len; + len = ffi.cast('uint32_t*', ptr + index)[0]; + index += 4; + var val = ffi.string(ptr + index, len); + index += len; + if( fcetab && fcetab[fname] ) { + assert(type(fcetab[fname]) == 'function', ('"%s" is not function')->format(fname)); + valuetable[#valuetable + 1] = fcetab[fname]; + } else { + valuetable[#valuetable + 1] = loadstring(val); + } + } + } + pat.code.allocsize = pat.code.size; + pat.code.p = ffi.C.realloc(pat.code.p, ffi.sizeof(patternelement) * pat.code.allocsize); + assert(pat.code.p != null); + ffi.copy(pat.code.p, data, ffi.sizeof(patternelement) * pat.code.allocsize); + return pat, valuetable; +} + +var function lp_loadfile(fname, fcetab, usemeta) { + var file = assert(io.open(fname, 'rb')); + var pat, valuetable = lp_load(assert(file->read("*a")), fcetab, usemeta); + file->close(); + return pat, valuetable; +} +// ====================================================== + +return { + match = lp_match, + streammatch = lp_streammatch, + emulatestreammatch = lp_emulatestreammatch, + load = lp_load, + loadfile = lp_loadfile, + setmax = setmax, + setmaxbehind = setmaxbehind, + enablememoization = enablememoization, + enabletracing = enabletracing +}; diff --git a/src/re.ljs b/src/re.ljs 
new file mode 100644 index 0000000..873168a --- /dev/null +++ b/src/re.ljs @@ -0,0 +1,287 @@
+// $Id: re.lua,v 1.44 2013/03/26 20:11:40 roberto Exp $
+// 2014/08/15 changes rostislav
+
+// imported functions and modules
+// ('type' is cached as well: equalcap calls it after the point below where
+// global access is switched off on Lua 5.2)
+var tonumber, type, print, error = tonumber, type, print, error;
+var setmetatable = setmetatable;
+var m = require("lpeglj");
+
+// 'm' will be used to parse expressions, and 'mm' will be used to
+// create expressions; that is, 're' runs on 'm', creating patterns
+// on 'mm'
+var mm = m;
+
+// pattern's metatable
+// (for version "1.0.0.0LJ" the module table itself is used as the source
+// of __add, __mul, __pow, ...)
+var mt = getmetatable(mm.P(0));
+mt = m.version() == "1.0.0.0LJ" && m || mt;
+
+
+
+// No more global accesses after this point
+var version = _VERSION;
+if( version == "Lua 5.2" ) { _ENV = null; }
+
+
+var any = m.P(1);
+
+
+// Pre-defined names
+var Predef = { nl = m.P("\n") };
+
+
+// memoization tables, (re)created by updatelocale:
+// mem: compiled patterns, fmem: patterns for find, gmem: patterns for gsub
+var mem;
+var fmem;
+var gmem;
+
+
+// Refreshes Predef from mm.locale, adds the one-letter class aliases and
+// their complements, and replaces the three memo tables with fresh
+// weak-valued ones.
+var function updatelocale () {
+  mm.locale(Predef);
+  Predef.a = Predef.alpha;
+  Predef.c = Predef.cntrl;
+  Predef.d = Predef.digit;
+  Predef.g = Predef.graph;
+  Predef.l = Predef.lower;
+  Predef.p = Predef.punct;
+  Predef.s = Predef.space;
+  Predef.u = Predef.upper;
+  Predef.w = Predef.alnum;
+  Predef.x = Predef.xdigit;
+  Predef.A = any - Predef.a;
+  Predef.C = any - Predef.c;
+  Predef.D = any - Predef.d;
+  Predef.G = any - Predef.g;
+  Predef.L = any - Predef.l;
+  Predef.P = any - Predef.p;
+  Predef.S = any - Predef.s;
+  Predef.U = any - Predef.u;
+  Predef.W = any - Predef.w;
+  Predef.X = any - Predef.x;
+  mem = {}; // restart memoization
+  fmem = {};
+  gmem = {};
+  var mt = {__mode = "v"};
+  setmetatable(mem, mt);
+  setmetatable(fmem, mt);
+  setmetatable(gmem, mt);
+}
+
+
+updatelocale();
+
+
+
+// debugging aid: prints the current position and the subject before it
+var I = m.P(function (s,i) { print(i, s->sub(1, i-1)); return i; });
+
+
+// Returns defs[id]; raises "undefined name: <id>" when it is missing.
+var function getdef (id, defs) {
+  var c = defs && defs[id];
+  if( ! c ) { error("undefined name: " .. id); }
+  return c;
+}
+
+
+// Raises "pattern error near '...'" quoting the subject from position i
+// (at most 21 characters, followed by "..." when it was cut).
+var function patt_error (s, i) {
+  var msg = (#s < i + 20) && s->sub(i)
+                           || s->sub(i,i+20) .. "...";
+  msg = ("pattern error near '%s'")->format(msg);
+  error(msg, 2);
+}
+
+// Returns p multiplied with itself n times (square-and-multiply over n).
+var function mult (p, n) {
+  var np = mm.P(true);
+  while( n >= 1 ) {
+    if( n%2 >= 1 ) { np *= p; }
+    p *= p;
+    n /= 2;
+  }
+  return np;
+}
+
+// Match-time check used by "=name": succeeds (returning the position after
+// c) when the subject at i equals the string c. In stream mode s is a
+// function that returns the requested subject range.
+var function equalcap (s, i, c) {
+  if( type(c) != "string" ) { return null; }
+  var e = #c + i;
+  if( type(s) == 'function' ) { // stream mode
+    if( s(i, e - 1) == c ) { return e; } else { return null; }
+  } else {
+    if( s->sub(i, e - 1) == c ) { return e; } else { return null; }
+  }
+}
+
+
+// spaces and "--" comments up to the end of the line
+var S = (Predef.space + "--" * (any - Predef.nl)**0)**0;
+
+var name = m.R("AZ", "az", "__") * m.R("AZ", "az", "__", "09")**0;
+
+var arrow = S * "<-";
+
+var seq_follow = m.P("/") + ")" + "}" + ":}" + "~}" + "|}" + (name * arrow) + -1;
+
+name = m.C(name);
+
+
+// a defined name only have meaning in a given environment
+var Def = name * m.Carg(1);
+
+var num = m.C(m.R("09")**1) * S / tonumber;
+
+var String = "'" * m.C((any - "'")**0) * "'" +
+             '"' * m.C((any - '"')**0) * '"';
+
+
+// "%name": looked up first in the user definitions, then in Predef
+var defined = "%" * Def / function (c,Defs) {
+  var cat = Defs && Defs[c] || Predef[c];
+  if( ! cat ) { error ("name '" .. c .. "' undefined"); }
+  return cat;
+};
+
+var Range = m.Cs(any * (m.P("-")/"") * (any - "]")) / mm.R;
+
+var item = defined + Range + m.C(any);
+
+var Class =
+    "["
+  * (m.C(m.P("^")**-1)) // optional complement symbol
+  * m.Cf(item * (item - "]")**0, mt.__add) /
+                          function (c, p) { return c == "^" && any - p || p; }
+  * "]";
+
+// Stores exp under key k in grammar table t; raises if k is already taken.
+var function adddef (t, k, exp) {
+  if( t[k] ) {
+    error("'"..k.."' already defined as a rule");
+  } else {
+    t[k] = exp;
+  }
+  return t;
+}
+
+// Starts a grammar table: n becomes both the initial rule name ({n}) and
+// the key of the first definition r.
+var function firstdef (n, r) { return adddef({n}, n, r); }
+
+
+// Builds a reference to rule n; b is the value of the "G" group (true only
+// inside a grammar), p the optional number given after ':'.
+var function NT (n, b, p) {
+  if( ! b ) {
+    error("rule '"..n.."' used outside a grammar");
+  } else { return mm.V(n, p || 0);
+  }
+}
+
+
+// grammar of the 're' pattern syntax itself
+var exp = m.P({ "Exp",
+  Exp = S * ( m.V("Grammar")
+            + m.Cf(m.V("Seq") * ("/" * S * m.V("Seq"))**0, mt.__add) );
+  Seq = m.Cf(m.Cc(m.P("")) * m.V("Prefix")**0 , mt.__mul)
+        * (#seq_follow + patt_error);
+  Prefix = "&" * S * m.V("Prefix") / mt.__len
+         + "!" * S * m.V("Prefix") / mt.__unm
+         + m.V("Suffix");
+  Suffix = m.Cf(m.V("Primary") * S *
+          ( ( m.P("+") * m.Cc(1, mt.__pow)
+            + m.P("*") * m.Cc(0, mt.__pow)
+            + m.P("?") * m.Cc(-1, mt.__pow)
+            + "^" * ( m.Cg(num * m.Cc(mult))
+                    + m.Cg(m.C(m.S("+-") * m.R("09")**1) * m.Cc(mt.__pow))
+                    )
+            + "->" * S * ( m.Cg((String + num) * m.Cc(mt.__div))
+                         + m.P("{}") * m.Cc(null, m.Ct)
+                         + m.Cg(Def / getdef * m.Cc(mt.__div))
+                         )
+            + "=>" * S * m.Cg(Def / getdef * m.Cc(m.Cmt))
+            ) * S
+          )**0, function (a,b,f) { return f(a,b); } );
+  Primary = "(" * m.V("Exp") * ")"
+            + String / mm.P
+            + Class
+            + defined
+            + "{:" * (name * ":" + m.Cc(null)) * m.V("Exp") * ":}" /
+                     function (n, p) { return mm.Cg(p, n); }
+            + "=" * name / function (n) { return mm.Cmt(mm.Cb(n), equalcap); }
+            + m.P("{}") / mm.Cp
+            + "{~" * m.V("Exp") * "~}" / mm.Cs
+            + "{|" * m.V("Exp") * "|}" / mm.Ct
+            + "{" * m.V("Exp") * "}" / mm.C
+            + m.P(".") * m.Cc(any)
+            + (name * m.Cb("G") * (S * ":" * S * num)**-1 * -arrow + "<" * name * m.Cb("G") * (S * ":" * S * num)**-1 * ">") / NT;
+  Definition = name * arrow * m.V("Exp");
+  Grammar = m.Cg(m.Cc(true), "G") *
+            m.Cf(m.V("Definition") / firstdef * m.Cg(m.V("Definition"))**0,
+              adddef) / mm.P
+});
+
+// top-level pattern: "G" is false outside a grammar; anything left over
+// after the expression is reported through patt_error
+var pattern = S * m.Cg(m.Cc(false), "G") * exp / mm.P * (-any + patt_error);
+
+
+// Compiles the pattern description p (defs supplies the "%name" and
+// "-> name" definitions). A value that already is a pattern is returned
+// unchanged. Raises "incorrect pattern" when the description does not match.
+var function compile (p, defs) {
+  if( mm.type(p) == "pattern" ) { return p; } // already compiled
+  var cp = pattern->match(p, 1, defs);
+  if( ! cp ) { error("incorrect pattern", 3); }
+  return cp;
+}
+
+// Matches s against p starting at i (default 1); the compiled form of p is
+// cached in mem.
+var function match (s, p, i) {
+  var cp = mem[p];
+  if( ! cp ) {
+    cp = compile(p);
+    mem[p] = cp;
+  }
+  return cp->match(s, i || 1);
+}
+
+// Like match, but calls the pattern's streammatch with the start position i
+// (default 1) and returns its result.
+var function streammatch (p, i) {
+  var cp = mem[p];
+  if( ! cp ) {
+    cp = compile(p);
+    mem[p] = cp;
+  }
+  return cp->streammatch(i || 1);
+}
+
+// Only for testing purpose
+var function emulatestreammatch(s, p, i) {
+  var cp = mem[p];
+  if( ! cp ) {
+    cp = compile(p);
+    mem[p] = cp;
+  }
+  return cp->emulatestreammatch(s, i || 1);
+}
+
+// Searches s for p starting at i (default 1). Returns the start and end
+// index of the first occurrence, or a false value when there is none.
+// The search pattern is cached in fmem.
+var function find (s, p, i) {
+  var cp = fmem[p];
+  if( ! cp ) {
+    cp = compile(p) / 0;
+    cp = mm.P({ mm.Cp() * cp * mm.Cp() + 1 * mm.V(1) });
+    fmem[p] = cp;
+  }
+  var e;
+  i, e = cp->match(s, i || 1);
+  // e is the position after the occurrence, hence e - 1
+  if( i ) { return i, e - 1;
+  } else { return i;
+  }
+}
+
+// Substitutes rep for the matches of p in s. Substitution patterns are
+// cached per pattern and per replacement value in gmem.
+var function gsub (s, p, rep) {
+  var g = gmem[p] || {}; // ensure gmem[p] is not collected while here
+  gmem[p] = g;
+  var cp = g[rep];
+  if( ! cp ) {
+    cp = compile(p);
+    cp = mm.Cs((cp / rep + 1)**0);
+    g[rep] = cp;
+  }
+  return cp->match(s);
+}
+
+
+// exported names
+var re = {
+  compile = compile,
+  match = match,
+  streammatch = streammatch,
+  emulatestreammatch = emulatestreammatch,
+  find = find,
+  gsub = gsub,
+  updatelocale = updatelocale,
+};
+
+if( version == "Lua 5.1" ) { _G.re = re; }
+
+return re;
diff --git a/tests/loadtest.ljs b/tests/loadtest.ljs new file mode 100644 index 0000000..d04d7f0 --- /dev/null +++ b/tests/loadtest.ljs @@ -0,0 +1,46 @@
+package.path = "./lpeglj/?.ljs;" .. package.path;
+
+var vm = require("lpvm");
+var m = require("lpeglj");
+var re = require("re");
+
+// Asserts that x and y are equal; tables are compared key by key in both
+// directions, recursively. When p is truthy each compared pair is printed.
+var function checkeq(x, y, p) {
+  if( p ) { print(x, y); }
+  if( type(x) != "table" ) { assert(x == y);
+  } else {
+    for( k, v in pairs(x) ) { checkeq(v, y[k], p); }
+    for( k, v in pairs(y) ) { checkeq(v, x[k], p); }
+  }
+}
+
+print("Tests for LPegLJ pattern saving and loading");
+print("version " .. m.version());
+
+var c = re.compile([=[
+  s <- ({(!longstring .)+} / longstring)*
+  longstring <- '[' {:init: '='* :} '[' close
+  close <- ']' =init ']' / . close
+]=]);
+
+var teststring = 'data1[=[insidedata1]=]data2[==[====]==]data3[[]]';
+
+var patfile = 'test.pat';
+
+// the same pattern serialized to a string and to a file
+var patdata = c->dump();
+c->save(patfile);
+
+var pat = m.load(patdata);
+checkeq({ pat->match(teststring) }, { "data1", "data2", "data3" });
+
+pat = m.loadfile(patfile);
+checkeq({ pat->match(teststring) }, { "data1", "data2", "data3" });
+
+// use only vm module (lpvm + lpcap)
+var valuetable;
+pat, valuetable = vm.load(patdata);
+checkeq({ vm.match(pat, teststring, 1, valuetable) }, { "data1", "data2", "data3" });
+
+pat, valuetable = vm.loadfile(patfile);
+checkeq({ vm.match(pat, teststring, 1, valuetable) }, { "data1", "data2", "data3" });
+
+// do not leave the temporary pattern file behind
+os.remove(patfile);
+
+print('OK');
diff --git a/tests/lpeglj b/tests/lpeglj new file mode 120000 index 0000000..5cd551c --- /dev/null +++ b/tests/lpeglj @@ -0,0 +1 @@ +../src \ No newline at end of file diff --git a/tests/streamtest.ljs b/tests/streamtest.ljs new file mode 100644 index 0000000..a6c8584 --- /dev/null +++ b/tests/streamtest.ljs @@ -0,0 +1,1449 @@
+#!/usr/bin/env ljsjit
+
+package.path = "./lpeglj/?.ljs;" ..
package.path; + +// $Id: test.lua,v 1.109 2015/09/28 17:01:25 roberto Exp $ + +// require"strict" -- just to be pedantic + +var m = require("lpeglj"); + + +// for general use +var a, b, c, d, e, f, g, p, t; + + +// compatibility with Lua 5.2 +var unpack = rawget(table, "unpack") || unpack; +var loadstring = rawget(_G, "loadstring") || load; + + +var any = m.P(1); +var space = m.S(" \t\n")**0; + +var function checkeq (x, y, p) { + if( p ) { print(x,y); } + if( type(x) != "table" ) { assert(x == y); + } else { + for( k,v in pairs(x) ) { checkeq(v, y[k], p); } + for( k,v in pairs(y) ) { checkeq(v, x[k], p); } + } +} + + +var mt = getmetatable(m.P(1)); +mt = m.version() == "1.0.0.0LJ" && m || mt; + + +var allchar = {}; +for( i=0,255 ) { allchar[i + 1] = i; } +allchar = string.char(unpack(allchar)); +assert(#allchar == 256); + +var function cs2str (c) { + return m.emulatestreammatch(m.Cs((c + m.P(1)/"")**0), allchar); +} + +var function eqcharset (c1, c2) { + assert(cs2str(c1) == cs2str(c2)); +} + + +print("General tests for LPeg library"); + +assert(type(m.version()) == "string"); +print("version " .. m.version() .. 
' emulated stream mode'); +assert(m.type("alo") != "pattern"); +assert(m.type(io.input) != "pattern"); +assert(m.type(m.P("alo")) == "pattern"); + +// tests for some basic optimizations +assert(m.emulatestreammatch(m.P(false) + "a", "a") == 2); +assert(m.emulatestreammatch(m.P(true) + "a", "a") == 1); +assert(m.emulatestreammatch("a" + m.P(false), "b") == null); +assert(m.emulatestreammatch("a" + m.P(true), "b") == 1); + +assert(m.emulatestreammatch(m.P(false) * "a", "a") == null); +assert(m.emulatestreammatch(m.P(true) * "a", "a") == 2); +assert(m.emulatestreammatch("a" * m.P(false), "a") == null); +assert(m.emulatestreammatch("a" * m.P(true), "a") == 2); + +assert(m.emulatestreammatch(#m.P(false) * "a", "a") == null); +assert(m.emulatestreammatch(#m.P(true) * "a", "a") == 2); +assert(m.emulatestreammatch("a" * #m.P(false), "a") == null); +assert(m.emulatestreammatch("a" * #m.P(true), "a") == 2); + + +// tests for locale +{ + assert(m.locale(m) == m); + var xt = {}; + assert(m.locale(xt, m) == xt); + var x = m.locale(); + for( n,v in pairs(x) ) { + assert(type(n) == "string"); + eqcharset(v, m[n]); + } +} + + +assert(m.emulatestreammatch(3, "aaaa")); +assert(m.emulatestreammatch(4, "aaaa")); +assert(! m.emulatestreammatch(5, "aaaa")); +assert(m.emulatestreammatch(-3, "aa")); +assert(! m.emulatestreammatch(-3, "aaa")); +assert(! m.emulatestreammatch(-3, "aaaa")); +assert(! m.emulatestreammatch(-4, "aaaa")); +assert(m.P(-5)->emulatestreammatch("aaaa")); + +assert(m.emulatestreammatch("a", "alo") == 2); +assert(m.emulatestreammatch("al", "alo") == 3); +assert(! 
m.emulatestreammatch("alu", "alo")); +assert(m.emulatestreammatch(true, "") == 1); + +var digit = m.S("0123456789"); +var upper = m.S("ABCDEFGHIJKLMNOPQRSTUVWXYZ"); +var lower = m.S("abcdefghijklmnopqrstuvwxyz"); +var letter = m.S("") + upper + lower; +var alpha = letter + digit + m.R(); + +eqcharset(m.S(""), m.P(false)); +eqcharset(upper, m.R("AZ")); +eqcharset(lower, m.R("az")); +eqcharset(upper + lower, m.R("AZ", "az")); +eqcharset(upper + lower, m.R("AZ", "cz", "aa", "bb", "90")); +eqcharset(digit, m.S("01234567") + "8" + "9"); +eqcharset(upper, letter - lower); +eqcharset(m.S(""), m.R()); +assert(cs2str(m.S("")) == ""); + +eqcharset(m.S("\0"), "\0"); +eqcharset(m.S("\1\0\2"), m.R("\0\2")); +eqcharset(m.S("\1\0\2"), m.R("\1\2") + "\0"); +eqcharset(m.S("\1\0\2") - "\0", m.R("\1\2")); + +var word = alpha**1 * (1 - alpha)**0; + +assert((word**0 * -1)->emulatestreammatch("alo alo")); +assert(m.emulatestreammatch(word**1 * -1, "alo alo")); +assert(m.emulatestreammatch(word**2 * -1, "alo alo")); +assert(! m.emulatestreammatch(word**3 * -1, "alo alo")); + +assert(! m.emulatestreammatch(word**-1 * -1, "alo alo")); +assert(m.emulatestreammatch(word**-2 * -1, "alo alo")); +assert(m.emulatestreammatch(word**-3 * -1, "alo alo")); + +var eos = m.P(-1); + +assert(m.emulatestreammatch(digit**0 * letter * digit * eos, "1298a1")); +assert(! m.emulatestreammatch(digit**0 * letter * eos, "1257a1")); + +b = { + [1] = "(" * (((1 - m.S("()")) + #m.P("(") * m.V(1))**0) * ")" +}; + +assert(m.emulatestreammatch(b, "(al())()")); +assert(! m.emulatestreammatch(b * eos, "(al())()")); +assert(m.emulatestreammatch(b * eos, "((al())()(é))")); +assert(! m.emulatestreammatch(b, "(al()()")); + +assert(! m.emulatestreammatch(letter**1 - "for", "foreach")); +assert(m.emulatestreammatch(letter**1 - ("for" * eos), "foreach")); +assert(! 
m.emulatestreammatch(letter**1 - ("for" * eos), "for")); + +function basiclookfor (p) { + return m.P ({ + [1] = p + (1 * m.V(1)) + }); +} + +function caplookfor (p) { + return basiclookfor(p->C()); +} + +assert(m.emulatestreammatch(caplookfor(letter**1), " 4achou123...") == "achou"); +a = {m.emulatestreammatch(caplookfor(letter**1)**0, " two words, one more ")}; +checkeq(a, {"two", "words", "one", "more"}); + +assert(m.emulatestreammatch( basiclookfor((#m.P(b) * 1) * m.Cp()), " ( (a)") == 7); + +a = {m.emulatestreammatch(m.C(digit**1 * m.Cc("d")) + m.C(letter**1 * m.Cc("l")), "123")}; +checkeq(a, {"123", "d"}); + +// bug in LPeg 0.12 (null value does not create a 'ktable') +assert(m.emulatestreammatch(m.Cc(null), "") == null); + +a = {m.emulatestreammatch(m.C(digit**1 * m.Cc("d")) + m.C(letter**1 * m.Cc("l")), "abcd")}; +checkeq(a, {"abcd", "l"}); + +a = {m.emulatestreammatch(m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')}; +checkeq(a, {10,20,30,2}); +a = {m.emulatestreammatch(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')}; +checkeq(a, {1,10,20,30,2}); +a = m.emulatestreammatch(m.Ct(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa'); +checkeq(a, {1,10,20,30,2}); +a = m.emulatestreammatch(m.Ct(m.Cp() * m.Cc(7,8) * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa'); +checkeq(a, {1,7,8,10,20,30,2}); +a = {m.emulatestreammatch(m.Cc() * m.Cc() * m.Cc(1) * m.Cc(2,3,4) * m.Cc() * 'a', 'aaa')}; +checkeq(a, {1,2,3,4}); + +a = {m.emulatestreammatch(m.Cp() * letter**1 * m.Cp(), "abcd")}; +checkeq(a, {1, 5}); + + +t = {m.emulatestreammatch({[1] = m.C(m.C(1) * m.V(1) + -1)}, "abc")}; +checkeq(t, {"abc", "a", "bc", "b", "c", "c", ""}); + +// bug in 0.12 ('hascapture' did not check for captures inside a rule) +{ + var pat = m.P({ + 'S'; + S1 = m.C('abc') + 3, + S = #m.V('S1') // rule has capture, but '#' must ignore it + }); + assert(pat->emulatestreammatch('abc') == 1); +} + + +// test for small capture boundary +for( i = 250,260 ) { + assert(#m.emulatestreammatch(m.C(i), string.rep('a', i)) == i); + 
assert(#m.emulatestreammatch(m.C(m.C(i)), string.rep('a', i)) == i); +} + +// tests for any*n and any*-n +for( n = 1, 550, 13 ) { + var x_1 = string.rep('x', n - 1); + var x = x_1 .. 'a'; + assert(! m.P(n)->emulatestreammatch(x_1)); + assert(m.P(n)->emulatestreammatch(x) == n + 1); + assert(n < 4 || m.emulatestreammatch(m.P(n) + "xxx", x_1) == 4); + assert(m.C(n)->emulatestreammatch(x) == x); + assert(m.C(m.C(n))->emulatestreammatch(x) == x); + assert(m.P(-n)->emulatestreammatch(x_1) == 1); + assert(! m.P(-n)->emulatestreammatch(x)); + assert(n < 13 || m.emulatestreammatch(m.Cc(20) * ((n - 13) * m.P(10)) * 3, x) == 20); + var n3 = math.floor(n/3); + assert(m.emulatestreammatch(n3 * m.Cp() * n3 * n3, x) == n3 + 1); +} + +// true values +assert(m.P(0)->emulatestreammatch("x") == 1); +assert(m.P(0)->emulatestreammatch("") == 1); +assert(m.C(0)->emulatestreammatch("x") == ""); + +assert(m.emulatestreammatch(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxu") == 1); +assert(m.emulatestreammatch(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxuxuxuxu") == 0); +assert(m.emulatestreammatch(m.C(m.P(2)**1), "abcde") == "abcd"); +p = m.Cc(0) * 1 + m.Cc(1) * 2 + m.Cc(2) * 3 + m.Cc(3) * 4; + + +// test for alternation optimization +assert(m.emulatestreammatch(m.P("a")**1 + "ab" + m.P("x")**0, "ab") == 2); +assert(m.emulatestreammatch((m.P("a")**1 + "ab" + m.P("x")**0 * 1)**0, "ab") == 3); +assert(m.emulatestreammatch(m.P("ab") + "cd" + "" + "cy" + "ak", "98") == 1); +assert(m.emulatestreammatch(m.P("ab") + "cd" + "ax" + "cy", "ax") == 3); +assert(m.emulatestreammatch("a" * m.P("b")**0 * "c" + "cd" + "ax" + "cy", "ax") == 3); +assert(m.emulatestreammatch((m.P("ab") + "cd" + "ax" + "cy")**0, "ax") == 3); +assert(m.emulatestreammatch(m.P(1) * "x" + m.S("") * "xu" + "ay", "ay") == 3); +assert(m.emulatestreammatch(m.P("abc") + "cde" + "aka", "aka") == 4); +assert(m.emulatestreammatch(m.S("abc") * "x" + "cde" + "aka", "ax") == 3); +assert(m.emulatestreammatch(m.S("abc") * "x" + "cde" + "aka", 
"aka") == 4); +assert(m.emulatestreammatch(m.S("abc") * "x" + "cde" + "aka", "cde") == 4); +assert(m.emulatestreammatch(m.S("abc") * "x" + "ide" + m.S("ab") * "ka", "aka") == 4); +assert(m.emulatestreammatch("ab" + m.S("abc") * m.P("y")**0 * "x" + "cde" + "aka", "ax") == 3); +assert(m.emulatestreammatch("ab" + m.S("abc") * m.P("y")**0 * "x" + "cde" + "aka", "aka") == 4); +assert(m.emulatestreammatch("ab" + m.S("abc") * m.P("y")**0 * "x" + "cde" + "aka", "cde") == 4); +assert(m.emulatestreammatch("ab" + m.S("abc") * m.P("y")**0 * "x" + "ide" + m.S("ab") * "ka", "aka") == 4); +assert(m.emulatestreammatch("ab" + m.S("abc") * m.P("y")**0 * "x" + "ide" + m.S("ab") * "ka", "ax") == 3); +assert(m.emulatestreammatch(m.P(1) * "x" + "cde" + m.S("ab") * "ka", "aka") == 4); +assert(m.emulatestreammatch(m.P(1) * "x" + "cde" + m.P(1) * "ka", "aka") == 4); +assert(m.emulatestreammatch(m.P(1) * "x" + "cde" + m.P(1) * "ka", "cde") == 4); +assert(m.emulatestreammatch(m.P("eb") + "cd" + m.P("e")**0 + "x", "ee") == 3); +assert(m.emulatestreammatch(m.P("ab") + "cd" + m.P("e")**0 + "x", "abcd") == 3); +assert(m.emulatestreammatch(m.P("ab") + "cd" + m.P("e")**0 + "x", "eeex") == 4); +assert(m.emulatestreammatch(m.P("ab") + "cd" + m.P("e")**0 + "x", "cd") == 3); +assert(m.emulatestreammatch(m.P("ab") + "cd" + m.P("e")**0 + "x", "x") == 1); +assert(m.emulatestreammatch(m.P("ab") + "cd" + m.P("e")**0 + "x" + "", "zee") == 1); +assert(m.emulatestreammatch(m.P("ab") + "cd" + m.P("e")**1 + "x", "abcd") == 3); +assert(m.emulatestreammatch(m.P("ab") + "cd" + m.P("e")**1 + "x", "eeex") == 4); +assert(m.emulatestreammatch(m.P("ab") + "cd" + m.P("e")**1 + "x", "cd") == 3); +assert(m.emulatestreammatch(m.P("ab") + "cd" + m.P("e")**1 + "x", "x") == 2); +assert(m.emulatestreammatch(m.P("ab") + "cd" + m.P("e")**1 + "x" + "", "zee") == 1); +assert(! 
m.emulatestreammatch(("aa" * m.P("bc")**-1 + "aab") * "e", "aabe")); + +assert(m.emulatestreammatch("alo" * (m.P("\n") + -1), "alo") == 4); + + +// bug in 0.12 (rc1) +assert(m.emulatestreammatch((m.P("\128\187\191") + m.S("abc"))**0, "\128\187\191") == 4); + +assert(m.emulatestreammatch(m.S("\0\128\255\127")**0, string.rep("\0\128\255\127", 10)) == + 4*10 + 1); + +// optimizations with optional parts +assert(m.emulatestreammatch(("ab" * -m.P("c"))**-1, "abc") == 1); +assert(m.emulatestreammatch(("ab" * #m.P("c"))**-1, "abd") == 1); +assert(m.emulatestreammatch(("ab" * m.B("c"))**-1, "ab") == 1); +assert(m.emulatestreammatch(("ab" * m.P("cd")**0)**-1, "abcdcdc") == 7); + +assert(m.emulatestreammatch(m.P("ab")**-1 - "c", "abcd") == 3); + +p = ('Aa' * ('Bb' * ('Cc' * m.P('Dd')**0)**0)**0)**-1; +assert(p->emulatestreammatch("AaBbCcDdBbCcDdDdDdBb") == 21); + + +// bug in 0.12.2 +// p = { ('ab' ('c' 'ef'?)*)? } +p = m.C(('ab' * ('c' * m.P('ef')**-1)**0)**-1); +s = "abcefccefc"; +assert(s == p->emulatestreammatch(s)); + + +pi = "3.14159 26535 89793 23846 26433 83279 50288 41971 69399 37510"; +assert(m.emulatestreammatch(m.Cs((m.P("1") / "a" + m.P("5") / "b" + m.P("9") / "c" + 1)**0), pi) == + m.emulatestreammatch(m.Cs((m.P(1) / {["1"] = "a", ["5"] = "b", ["9"] = "c"})**0), pi)); +print("+"); + + +// tests for capture optimizations +assert(m.emulatestreammatch((m.P(3) + 4 * m.Cp()) * "a", "abca") == 5); +t = {m.emulatestreammatch(((m.P("a") + m.Cp()) * m.P("x"))**0, "axxaxx")}; +checkeq(t, {3, 6}); + + +// tests for numbered captures +p = m.C(1); +assert(m.emulatestreammatch(m.C(m.C(p * m.C(2)) * m.C(3)) / 3, "abcdefgh") == "a"); +assert(m.emulatestreammatch(m.C(m.C(p * m.C(2)) * m.C(3)) / 1, "abcdefgh") == "abcdef"); +assert(m.emulatestreammatch(m.C(m.C(p * m.C(2)) * m.C(3)) / 4, "abcdefgh") == "bc"); +assert(m.emulatestreammatch(m.C(m.C(p * m.C(2)) * m.C(3)) / 0, "abcdefgh") == 7); + +a, b, c = m.emulatestreammatch(p * (m.C(p * m.C(2)) * m.C(3) / 4) * p, "abcdefgh"); 
+assert(a == "a" && b == "efg" && c == "h"); + +// test for table captures +t = m.emulatestreammatch(m.Ct(letter**1), "alo"); +checkeq(t, {}); + +t, n = m.emulatestreammatch(m.Ct(m.C(letter)**1) * m.Cc("t"), "alo"); +assert(n == "t" && table.concat(t) == "alo"); + +t = m.emulatestreammatch(m.Ct(m.C(m.C(letter)**1)), "alo"); +assert(table.concat(t, ";") == "alo;a;l;o"); + +t = m.emulatestreammatch(m.Ct(m.C(m.C(letter)**1)), "alo"); +assert(table.concat(t, ";") == "alo;a;l;o"); + +t = m.emulatestreammatch(m.Ct(m.Ct((m.Cp() * letter * m.Cp())**1)), "alo"); +assert(table.concat(t[1], ";") == "1;2;2;3;3;4"); + +t = m.emulatestreammatch(m.Ct(m.C(m.C(1) * 1 * m.C(1))), "alo"); +checkeq(t, {"alo", "a", "o"}); + + +// tests for groups +p = m.Cg(1); // no capture +assert(p->emulatestreammatch('x') == 'x'); +p = m.Cg(m.P(true)/function () { } * 1); // no value +assert(p->emulatestreammatch('x') == 'x'); +p = m.Cg(m.Cg(m.Cg(m.C(1)))); +assert(p->emulatestreammatch('x') == 'x'); +p = m.Cg(m.Cg(m.Cg(m.C(1))**0) * m.Cg(m.Cc(1) * m.Cc(2))); +t = {p->emulatestreammatch('abc')}; +checkeq(t, {'a', 'b', 'c', 1, 2}); + +p = m.Ct(m.Cg(m.Cc(10), "hi") * m.C(1)**0 * m.Cg(m.Cc(20), "ho")); +t = p->emulatestreammatch(''); +checkeq(t, {hi = 10, ho = 20}); +t = p->emulatestreammatch('abc'); +checkeq(t, {hi = 10, ho = 20, 'a', 'b', 'c'}); + +// non-string group names +p = m.Ct(m.Cg(1, print) * m.Cg(1, 23.5) * m.Cg(1, io)); +t = p->emulatestreammatch('abcdefghij'); +assert(t[print] == 'a' && t[23.5] == 'b' && t[io] == 'c'); + + +// test for error messages +var function checkerr (msg, f, ...) { + var st, err = pcall(f, ...); + assert(! 
st && m.emulatestreammatch({ m.P(msg) + 1 * m.V(1) }, err)); +} + +checkerr("rule '1' may be left recursive", m.emulatestreammatch, { m.V(1) * 'a' }, "a"); +checkerr("rule '1' used outside a grammar", m.emulatestreammatch, m.V(1), ""); +checkerr("rule 'hiii' used outside a grammar", m.emulatestreammatch, m.V('hiii'), ""); +checkerr("rule 'hiii' undefined in given grammar", m.emulatestreammatch, { m.V('hiii') }, ""); +checkerr("undefined in given grammar", m.emulatestreammatch, { m.V({}) }, ""); + +checkerr("rule 'A' is not a pattern", m.P, { m.P(1), A = {} }); +checkerr("grammar has no initial rule", m.P, { [print] = {} }); + +// grammar with a long call chain before left recursion +p = {'a', + a = m.V('b') * m.V('c') * m.V('d') * m.V('a'), + b = m.V('c'), + c = m.V('d'), + d = m.V('e'), + e = m.V('f'), + f = m.V('g'), + g = m.P('') +}; +checkerr("rule 'a' may be left recursive", m.emulatestreammatch, p, "a"); + +// Bug in peephole optimization of LPeg 0.12 (IJmp -> ICommit) +// the next grammar has an original sequence IJmp -> ICommit -> IJmp L1 +// that is optimized to ICommit L1 + +p = m.P ({ (m.P ({m.P('abc')}) + 'ayz') * m.V('y'); y = m.P('x') }); +assert(p->emulatestreammatch('abcx') == 5 && p->emulatestreammatch('ayzx') == 5 && ! 
p->emulatestreammatch('abc')); + + +{ + // large dynamic Cc + var lim = 2**16 - 1; + var xc = 0; + var function seq (n) { + if( n == 1 ) { ++xc ; return m.Cc(xc); + } else { + var m = math.floor(n / 2); + return seq(m) * seq(n - m); + } + } + p = m.Ct(seq(lim)); + t = p->emulatestreammatch(''); + assert(t[lim] == lim); + checkerr("too many", function () { p /= print; }); + checkerr("too many", seq, lim + 1); +} +// tests for non-pattern as arguments to pattern functions + +p = { ('a' * m.V(1))**-1 } * m.P('b') * { 'a' * m.V(2); m.V(1)**-1 }; +assert(m.emulatestreammatch(p, "aaabaac") == 7); + +p = m.P('abc') * 2 * -5 * true * 'de'; // mix of numbers and strings and booleans + +assert(p->emulatestreammatch("abc01de") == 8); +assert(p->emulatestreammatch("abc01de3456") == null); + +p = 'abc' * (2 * (-5 * (true * m.P('de')))); + +assert(p->emulatestreammatch("abc01de") == 8); +assert(p->emulatestreammatch("abc01de3456") == null); + +p = { m.V(2), m.P("abc") } * + (m.P({ "xx", xx = m.P("xx") }) + { "x", x = m.P("a") * m.V("x") + "" }); +assert(p->emulatestreammatch("abcaaaxx") == 7); +assert(p->emulatestreammatch("abcxx") == 6); + + +// a large table capture +t = m.emulatestreammatch(m.Ct(m.C('a')**0), string.rep("a", 10000)); +assert(#t == 10000 && t[1] == 'a' && t[#t] == 'a'); + +print('+'); + + +// bug in 0.10 (rechecking a grammar, after tail-call optimization) +m.P({ m.P ({ (m.P(3) + "xuxu")**0 * m.V("xuxu"), xuxu = m.P(1) }) }); + +var V = m.V; + +var Space = m.S(" \n\t")**0; +var Number = m.C(m.R("09")**1) * Space; +var FactorOp = m.C(m.S("+-")) * Space; +var TermOp = m.C(m.S("*/")) * Space; +var Open = "(" * Space; +var Close = ")" * Space; + + +var function f_factor (v1, op, v2, d) { + assert(d == null); + if( op == "+" ) { return v1 + v2; + } else { return v1 - v2; + } +} + + +var function f_term (v1, op, v2, d) { + assert(d == null); + if( op == "*" ) { return v1 * v2; + } else { return v1 / v2; + } +} + +G = m.P({ "Exp", + Exp = m.Cf(V("Factor") * 
m.Cg(FactorOp * V("Factor"))**0, f_factor); + Factor = m.Cf(V("Term") * m.Cg(TermOp * V("Term"))**0, f_term); + Term = Number / tonumber + Open * V("Exp") * Close; +}); + +G = Space * G * -1; + +for( _, s in ipairs({" 3 + 5*9 / (1+1) ", "3+4/2", "3+3-3- 9*2+3*9/1- 8"}) ) { + assert(m.emulatestreammatch(G, s) == loadstring("return "..s)()); +} + + +// test for grammars (errors deep in calling non-terminals) +g = m.P({ + [1] = m.V(2) + "a", + [2] = "a" * m.V(3) * "x", + [3] = "b" * m.V(3) + "c" +}); + +assert(m.emulatestreammatch(g, "abbbcx") == 7); +assert(m.emulatestreammatch(g, "abbbbx") == 2); + + +// tests for \0 +assert(m.emulatestreammatch(m.R("\0\1")**1, "\0\1\0") == 4); +assert(m.emulatestreammatch(m.S("\0\1ab")**1, "\0\1\0a") == 5); +assert(m.emulatestreammatch(m.P(1)**3, "\0\1\0a") == 5); +assert(! m.emulatestreammatch(-4, "\0\1\0a")); +assert(m.emulatestreammatch("\0\1\0a", "\0\1\0a") == 5); +assert(m.emulatestreammatch("\0\0\0", "\0\0\0") == 4); +assert(! m.emulatestreammatch("\0\0\0", "\0\0")); + + +// tests for predicates +assert(! m.emulatestreammatch(-m.P("a") * 2, "alo")); +assert(m.emulatestreammatch(- -m.P("a") * 2, "alo") == 3); +assert(m.emulatestreammatch(#m.P("a") * 2, "alo") == 3); +assert(m.emulatestreammatch(##m.P("a") * 2, "alo") == 3); +assert(! 
m.emulatestreammatch(##m.P("c") * 2, "alo")); +assert(m.emulatestreammatch(m.Cs((##m.P("a") * 1 + m.P(1)/".")**0), "aloal") == "a..a."); +assert(m.emulatestreammatch(m.Cs((#((#m.P("a"))/"") * 1 + m.P(1)/".")**0), "aloal") == "a..a."); +assert(m.emulatestreammatch(m.Cs((- -m.P("a") * 1 + m.P(1)/".")**0), "aloal") == "a..a."); +assert(m.emulatestreammatch(m.Cs((-((-m.P("a"))/"") * 1 + m.P(1)/".")**0), "aloal") == "a..a."); + +p = -m.P('a') * m.Cc(1) + -m.P('b') * m.Cc(2) + -m.P('c') * m.Cc(3); +assert(p->emulatestreammatch('a') == 2 && p->emulatestreammatch('') == 1 && p->emulatestreammatch('b') == 1); + +p = -m.P('a') * m.Cc(10) + #m.P('a') * m.Cc(20); +assert(p->emulatestreammatch('a') == 20 && p->emulatestreammatch('') == 10 && p->emulatestreammatch('b') == 10); + + + +// look-behind predicate +assert(! m.emulatestreammatch(m.B('a'), 'a')); +assert(m.emulatestreammatch(1 * m.B('a'), 'a') == 2); +assert(! m.emulatestreammatch(m.B(1), 'a')); +assert(m.emulatestreammatch(1 * m.B(1), 'a') == 2); +assert(m.emulatestreammatch(-m.B(1), 'a') == 1); +assert(m.emulatestreammatch(m.B(250), string.rep('a', 250)) == null); +assert(m.emulatestreammatch(250 * m.B(250), string.rep('a', 250)) == 251); + +// look-behind with an open call +checkerr("pattern may not have fixed length", m.B, m.V('S1')); +checkerr("too long to look behind", m.B, 260); + +B = #letter * -m.B(letter) + -letter * m.B(letter); +x = m.Ct({ (B * m.Cp())**-1 * (1 * m.V(1) + m.P(true)) }); +checkeq(m.emulatestreammatch(x, 'ar cal c'), {1,3,4,7,9,10}); +checkeq(m.emulatestreammatch(x, ' ar cal '), {2,4,5,8}); +checkeq(m.emulatestreammatch(x, ' '), {}); +checkeq(m.emulatestreammatch(x, 'aloalo'), {1,7}); + +assert(m.emulatestreammatch(B, "a") == 1); +assert(m.emulatestreammatch(1 * B, "a") == 2); +assert(! m.B(1 - letter)->emulatestreammatch("")); +assert((-m.B(letter))->emulatestreammatch("") == 1); + +assert((4 * m.B(letter, 4))->emulatestreammatch("aaaaaaaa") == 5); +assert(! 
(4 * m.B(#letter * 5))->emulatestreammatch("aaaaaaaa")); +assert((4 * -m.B(#letter * 5))->emulatestreammatch("aaaaaaaa") == 5); + +// look-behind with grammars +assert(m.emulatestreammatch('a' * m.B({'x', x = m.P(3)}), 'aaa') == null); +assert(m.emulatestreammatch('aa' * m.B({'x', x = m.P('aaa')}), 'aaaa') == null); +assert(m.emulatestreammatch('aaa' * m.B({'x', x = m.P('aaa')}), 'aaaaa') == 4); + + + +// bug in 0.9 +assert(m.emulatestreammatch(('a' * #m.P('b')), "ab") == 2); +assert(! m.emulatestreammatch(('a' * #m.P('b')), "a")); + +assert(! m.emulatestreammatch(#m.S('567'), "")); +assert(m.emulatestreammatch(#m.S('567') * 1, "6") == 2); + + +// tests for Tail Calls + +p = m.P({ 'a' * m.V(1) + '' }); +assert(p->emulatestreammatch(string.rep('a', 1000)) == 1001); + +// create a grammar for a simple DFA for even number of 0s and 1s +// +// ->1 <---0---> 2 +// ^ ^ +// | | +// 1 1 +// | | +// V V +// 3 <---0---> 4 +// +// this grammar should keep no backtracking information + +p = m.P({ + [1] = '0' * m.V(2) + '1' * m.V(3) + -1, + [2] = '0' * m.V(1) + '1' * m.V(4), + [3] = '0' * m.V(4) + '1' * m.V(1), + [4] = '0' * m.V(3) + '1' * m.V(2), +}); + +assert(p->emulatestreammatch(string.rep("00", 10000))); +assert(p->emulatestreammatch(string.rep("01", 10000))); +assert(p->emulatestreammatch(string.rep("011", 10000))); +assert(! p->emulatestreammatch(string.rep("011", 10000) .. "1")); +assert(! p->emulatestreammatch(string.rep("011", 10001))); + + +// this grammar does need backtracking info. 
+var lim = 10000; +p = m.P({ '0' * m.V(1) + '0' }); +checkerr("stack overflow", m.emulatestreammatch, p, string.rep("0", lim)); +m.setmaxstack(2*lim); +checkerr("stack overflow", m.emulatestreammatch, p, string.rep("0", lim)); +m.setmaxstack(2*lim + 4); +assert(m.emulatestreammatch(p, string.rep("0", lim)) == lim + 1); + +// this repetition should not need stack space (only the call does) +p = m.P({ ('a' * m.V(1))**0 * 'b' + 'c' }); +m.setmaxstack(200); +assert(p->emulatestreammatch(string.rep('a', 180) .. 'c' .. string.rep('b', 180)) == 362); + +m.setmaxstack(100); // restore low limit + +// tests for optional start position +assert(m.emulatestreammatch("a", "abc", 1)); +assert(m.emulatestreammatch("b", "abc", 2)); +assert(m.emulatestreammatch("c", "abc", 3)); +assert(! m.emulatestreammatch(1, "abc", 4)); +assert(m.emulatestreammatch("a", "abc", -3)); +assert(m.emulatestreammatch("b", "abc", -2)); +assert(m.emulatestreammatch("c", "abc", -1)); +assert(m.emulatestreammatch("abc", "abc", -4)); // truncate to position 1 + +assert(m.emulatestreammatch("", "abc", 10)); // empty string is everywhere! +assert(m.emulatestreammatch("", "", 10)); +assert(! m.emulatestreammatch(1, "", 1)); +assert(! m.emulatestreammatch(1, "", -1)); +assert(! 
m.emulatestreammatch(1, "", 0)); + +print("+"); + + +// tests for argument captures +checkerr("invalid argument", m.Carg, 0); +checkerr("invalid argument", m.Carg, -1); +checkerr("invalid argument", m.Carg, 2**18); +checkerr("absent extra argument #1", m.emulatestreammatch, m.Carg(1), 'a', 1); +assert(m.emulatestreammatch(m.Carg(1), 'a', 1, print) == print); +x = {m.emulatestreammatch(m.Carg(1) * m.Carg(2), '', 1, 10, 20)}; +checkeq(x, {10, 20}); + +assert(m.emulatestreammatch(m.Cmt(m.Cg(m.Carg(3), "a") * + m.Cmt(m.Cb("a"), function (s,i,x) { + assert(s(1,-1) == "a" && i == 1); + return i, x+1; + }) * + m.Carg(2), function (s,i,a,b,c) { + assert(s(1,-1) == "a" && i == 1 && c == null); + return i, 2*a + 3*b; +}) * "a", + "a", 1, false, 100, 1000) == 2*1001 + 3*100); + + +// tests for Lua functions + +t = {}; +s = ""; +p = m.P(function (s1, i) { assert(s == s1(1,-1)); t[#t + 1] = i; return null; }) * false; +s = "hi, this is a test"; +assert(m.emulatestreammatch(((p - m.P(-1)) + 2)**0, s) == string.len(s) + 1); +assert(#t == string.len(s)/2 && t[1] == 1 && t[2] == 3); + +assert(! m.emulatestreammatch(p, s)); + +p = mt.__add(function (s, i) { return i; }, function (s, i) { return null; }); +assert(m.emulatestreammatch(p, "alo")); + +p = mt.__mul(function (s, i) { return i; }, function (s, i) { return null; }); +assert(! 
m.emulatestreammatch(p, "alo")); + + +t = {}; +p = function (s1, i) { assert(s == s1(1,-1)); t[#t + 1] = i; return i; }; +s = "hi, this is a test"; +assert(m.emulatestreammatch((m.P(1) * p)**0, s) == string.len(s) + 1); +assert(#t == string.len(s) && t[1] == 2 && t[2] == 3); + +t = {}; +p = m.P(function (s1, i) { assert(s == s1(1,-1)); t[#t + 1] = i; +return i <= s1(1,-1)->len() && i; }) * 1; +s = "hi, this is a test"; +assert(m.emulatestreammatch(p**0, s) == string.len(s) + 1); +assert(#t == string.len(s) + 1 && t[1] == 1 && t[2] == 2); + +p = function (s1, i) { return m.emulatestreammatch(m.P("a")**1, s1(1,-1), i); }; +assert(m.emulatestreammatch(p, "aaaa") == 5); +assert(m.emulatestreammatch(p, "abaa") == 2); +assert(! m.emulatestreammatch(p, "baaa")); + +checkerr("invalid position", m.emulatestreammatch, function () { return 2**20; }, s); +checkerr("invalid position", m.emulatestreammatch, function () { return 0; }, s); +checkerr("invalid position", m.emulatestreammatch, function (s, i) { return i - 1; }, s); +checkerr("invalid position", m.emulatestreammatch, + m.P(1)**0 * function (_, i) { return i - 1; }, s); +assert(m.emulatestreammatch(m.P(1)**0 * function (_, i) { return i; } * -1, s)); +checkerr("invalid position", m.emulatestreammatch, + m.P(1)**0 * function (_, i) { return i + 1; }, s); +assert(m.emulatestreammatch(m.P(function (s, i) { return s(1,-1)->len() + 1; }) * -1, s)); +checkerr("invalid position", m.emulatestreammatch, m.P(function (s, i) { return s(1,-1)->len() + 2; }) * -1, s); +assert(! 
m.emulatestreammatch(m.P(function (s, i) { return s(1,-1)->len(); }) * -1, s)); +assert(m.emulatestreammatch(m.P(1)**0 * function (_, i) { return true; }, s) == + string.len(s) + 1); +for( i = 1, string.len(s) + 1 ) { + assert(m.emulatestreammatch(function (_, _) { return i; }, s) == i); +} + +p = (m.P(function (s, i) { return i%2 == 0 && i; }) * 1 + + m.P(function (s, i) { return i%2 != 0 && i + 2 <= s(1,-1)->len() && i; }) * 3)**0 + * -1; +assert(p->emulatestreammatch(string.rep('a', 14000))); + +// tests for Function Replacements +f = function (a, ...) { if( a != "x" ) { return {a, ...}; } }; + +t = m.emulatestreammatch(m.C(1)**0/f, "abc"); +checkeq(t, {"a", "b", "c"}); + +t = m.emulatestreammatch(m.C(1)**0/f/f, "abc"); +checkeq(t, {{"a", "b", "c"}}); + +t = m.emulatestreammatch(m.P(1)**0/f/f, "abc"); // no capture +checkeq(t, {{"abc"}}); + +t = m.emulatestreammatch((m.P(1)**0/f * m.Cp())/f, "abc"); +checkeq(t, {{"abc"}, 4}); + +t = m.emulatestreammatch((m.C(1)**0/f * m.Cp())/f, "abc"); +checkeq(t, {{"a", "b", "c"}, 4}); + +t = m.emulatestreammatch((m.C(1)**0/f * m.Cp())/f, "xbc"); +checkeq(t, {4}); + +t = m.emulatestreammatch(m.C(m.C(1)**0)/f, "abc"); +checkeq(t, {"abc", "a", "b", "c"}); + +g = function (...) 
{ return 1, ...; }; +t = {m.emulatestreammatch(m.C(1)**0/g/g, "abc")}; +checkeq(t, {1, 1, "a", "b", "c"}); + +t = {m.emulatestreammatch(m.Cc(null,null,4) * m.Cc(null,3) * m.Cc(null, null) / g / g, "")}; +t1 = {1,1,null,null,4,null,3,null,null}; +for( i=1,10 ) { assert(t[i] == t1[i]); } + +// bug in 0.12.2: ktable with only null could be eliminated when joining +// with a pattern without ktable +assert((m.P("aaa") * m.Cc(null))->emulatestreammatch("aaa") == null); + +t = {m.emulatestreammatch((m.C(1) / function (x) { return x, x.."x"; })**0, "abc")}; +checkeq(t, {"a", "ax", "b", "bx", "c", "cx"}); + +t = m.emulatestreammatch(m.Ct((m.C(1) / function (x,y) { return y, x; } * m.Cc(1))**0), "abc"); +checkeq(t, {null, "a", 1, null, "b", 1, null, "c", 1}); + +// tests for Query Replacements + +assert(m.emulatestreammatch(m.C(m.C(1)**0)/{abc = 10}, "abc") == 10); +assert(m.emulatestreammatch(m.C(1)**0/{a = 10}, "abc") == 10); +assert(m.emulatestreammatch(m.S("ba")**0/{ab = 40}, "abc") == 40); +t = m.emulatestreammatch(m.Ct((m.S("ba")/{a = 40})**0), "abc"); +checkeq(t, {40}); + +assert(m.emulatestreammatch(m.Cs((m.C(1)/{a=".", d=".."})**0), "abcdde") == ".bc....e"); +assert(m.emulatestreammatch(m.Cs((m.C(1)/{f="."})**0), "abcdde") == "abcdde"); +assert(m.emulatestreammatch(m.Cs((m.C(1)/{d="."})**0), "abcdde") == "abc..e"); +assert(m.emulatestreammatch(m.Cs((m.C(1)/{e="."})**0), "abcdde") == "abcdd."); +assert(m.emulatestreammatch(m.Cs((m.C(1)/{e=".", f="+"})**0), "eefef") == "..+.+"); +assert(m.emulatestreammatch(m.Cs((m.C(1))**0), "abcdde") == "abcdde"); +assert(m.emulatestreammatch(m.Cs(m.C(m.C(1)**0)), "abcdde") == "abcdde"); +assert(m.emulatestreammatch(1 * m.Cs(m.P(1)**0), "abcdde") == "bcdde"); +assert(m.emulatestreammatch(m.Cs((m.C('0')/'x' + 1)**0), "abcdde") == "abcdde"); +assert(m.emulatestreammatch(m.Cs((m.C('0')/'x' + 1)**0), "0ab0b0") == "xabxbx"); +assert(m.emulatestreammatch(m.Cs((m.C('0')/'x' + m.P(1)/{b=3})**0), "b0a0b") == "3xax3"); 
+assert(m.emulatestreammatch(m.P(1)/'%0%0'/{aa = -3} * 'x', 'ax') == -3); +assert(m.emulatestreammatch(m.C(1)/'%0%1'/{aa = 'z'}/{z = -3} * 'x', 'ax') == -3); + +assert(m.emulatestreammatch(m.Cs(m.Cc(0) * (m.P(1)/"")), "4321") == "0"); + +assert(m.emulatestreammatch(m.Cs((m.P(1) / "%0")**0), "abcd") == "abcd"); +assert(m.emulatestreammatch(m.Cs((m.P(1) / "%0.%0")**0), "abcd") == "a.ab.bc.cd.d"); +assert(m.emulatestreammatch(m.Cs((m.P("a") / "%0.%0" + 1)**0), "abcad") == "a.abca.ad"); +assert(m.emulatestreammatch(m.C("a") / "%1%%%0", "a") == "a%a"); +assert(m.emulatestreammatch(m.Cs((m.P(1) / ".xx")**0), "abcd") == ".xx.xx.xx.xx"); +assert(m.emulatestreammatch(m.Cp() * m.P(3) * m.Cp()/"%2%1%1 - %0 ", "abcde") == + "411 - abc "); + +assert(m.emulatestreammatch(m.P(1)/"%0", "abc") == "a"); +checkerr("invalid capture index", m.emulatestreammatch, m.P(1)/"%1", "abc"); +checkerr("invalid capture index", m.emulatestreammatch, m.P(1)/"%9", "abc"); + +p = m.C(1); +p *= p; p *= p; p = p * p * m.C(1) / "%9 - %1"; +assert(p->emulatestreammatch("1234567890") == "9 - 1"); + +assert(m.emulatestreammatch(m.Cc(print), "") == print); + +// too many captures (just ignore extra ones) +p = m.C(1)**0 / "%2-%9-%0-%9"; +assert(p->emulatestreammatch("01234567890123456789") == "1-8-01234567890123456789-8"); +s = string.rep("12345678901234567890", 20); +assert(m.emulatestreammatch(m.C(1)**0 / "%9-%1-%0-%3", s) == "9-1-" .. s .. "-3"); + +// string captures with non-string subcaptures +p = m.Cc('alo') * m.C(1) / "%1 - %2 - %1"; +assert(p->emulatestreammatch('x') == 'alo - x - alo'); + +checkerr("invalid capture value (a boolean)", m.emulatestreammatch, m.Cc(true) / "%1", "a"); + +// long strings for string capture +l = 10000; +s = string.rep('a', l) .. string.rep('b', l) .. string.rep('c', l); + +p = (m.C(m.P('a')**1) * m.C(m.P('b')**1) * m.C(m.P('c')**1)) / '%3%2%1'; + +assert(p->emulatestreammatch(s) == string.rep('c', l) .. + string.rep('b', l) .. 
+ string.rep('a', l)); + +print("+"); + +// accumulator capture +function f (x) { return x + 1; } +assert(m.emulatestreammatch(m.Cf(m.Cc(0) * m.C(1)**0, f), "alo alo") == 7); + +t = {m.emulatestreammatch(m.Cf(m.Cc(1,2,3), error), "")}; +checkeq(t, {1}); +p = m.Cf(m.Ct(true) * m.Cg(m.C(m.R("az")**1) * "=" * m.C(m.R("az")**1) * ";")**0, + rawset); +t = p->emulatestreammatch("a=b;c=du;xux=yuy;"); +checkeq(t, {a="b", c="du", xux="yuy"}); + + +// errors in accumulator capture + +// no initial capture +checkerr("no initial value", m.emulatestreammatch, m.Cf(m.P(5), print), 'aaaaaa'); +// no initial capture (very long match forces fold to be a pair open-close) +checkerr("no initial value", m.emulatestreammatch, m.Cf(m.P(500), print), + string.rep('a', 600)); + +// nested capture produces no initial value +checkerr("no initial value", m.emulatestreammatch, m.Cf(m.P(1) / {}, print), "alo"); + + +// tests for loop checker + +var function isnullable (p) { + checkerr("may accept empty string", function (p) { return p**0; }, m.P(p)); +} + +isnullable(m.P("x")**-4); +assert(m.emulatestreammatch(((m.P(0) + 1) * m.S("al"))**0, "alo") == 3); +assert(m.emulatestreammatch((("x" + #m.P(1))**-4 * m.S("al"))**0, "alo") == 3); +isnullable(""); +isnullable(m.P("x")**0); +isnullable(m.P("x")**-1); +isnullable(m.P("x") + 1 + 2 + m.P("a")**-1); +isnullable(-m.P("ab")); +isnullable(- -m.P("ab")); +isnullable(# #(m.P("ab") + "xy")); +isnullable(- #m.P("ab")**0); +isnullable(# -m.P("ab")**1); +isnullable(#m.V(3)); +isnullable(m.V(3) + m.V(1) + m.P('a')**-1); +isnullable({[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(0)}); +assert(m.emulatestreammatch(m.P({[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(1)})**0, "abc") + == 3); +assert(m.emulatestreammatch(m.P("")**-3, "a") == 1); + +var function find (p, s) { + return m.emulatestreammatch(basiclookfor(p), s); +} + + +var function badgrammar (g, expected) { + var stat, msg = pcall(m.P, g); + assert(! 
stat); + if( expected ) { assert(find(expected, msg)); } +} + +badgrammar({[1] = m.V(1)}, "rule '1'"); +badgrammar({[1] = m.V(2)}, "rule '2'"); // invalid non-terminal +badgrammar({[1] = m.V("x")}, "rule 'x'"); // invalid non-terminal +badgrammar({[1] = m.V({})}, "rule '(a table)'"); // invalid non-terminal +badgrammar({[1] = #m.P("a") * m.V(1)}, "rule '1'"); // left-recursive +badgrammar({[1] = -m.P("a") * m.V(1)}, "rule '1'"); // left-recursive +badgrammar({[1] = -1 * m.V(1)}, "rule '1'"); // left-recursive +badgrammar({[1] = -1 + m.V(1)}, "rule '1'"); // left-recursive +badgrammar({[1] = 1 * m.V(2), [2] = m.V(2)}, "rule '2'"); // left-recursive +badgrammar({[1] = 1 * m.V(2)**0, [2] = m.P(0)}, "rule '1'"); // inf. loop +badgrammar({ m.V(2), m.V(3)**0, m.P("") }, "rule '2'"); // inf. loop +badgrammar({ m.V(2) * m.V(3)**0, m.V(3)**0, m.P("") }, "rule '1'"); // inf. loop +badgrammar({"x", x = #(m.V(1) * 'a') }, "rule '1'"); // inf. loop +badgrammar({ -(m.V(1) * 'a') }, "rule '1'"); // inf. 
loop +badgrammar({"x", x = m.P('a')**-1 * m.V("x")}, "rule 'x'"); // left recursive +badgrammar({"x", x = m.P('a') * m.V("y")**1, y = #m.P(1)}, "rule 'x'"); + +assert(m.emulatestreammatch({'a' * -m.V(1)}, "aaa") == 2); +assert(m.emulatestreammatch({'a' * -m.V(1)}, "aaaa") == null); + + +// good x bad grammars +m.P({ ('a' * m.V(1))**-1 }); +m.P({ -('a' * m.V(1)) }); +m.P({ ('abc' * m.V(1))**-1 }); +m.P({ -('abc' * m.V(1)) }); +badgrammar({ #m.P('abc') * m.V(1) }); +badgrammar({ -('a' + m.V(1)) }); +m.P({ #('a' * m.V(1)) }); +badgrammar({ #('a' + m.V(1)) }); +m.P({ m.B({ m.P('abc') }) * 'a' * m.V(1) }); +badgrammar({ m.B({ m.P('abc') }) * m.V(1) }); +badgrammar({ ('a' + m.P('bcd'))**-1 * m.V(1) }); + + +// simple tests for maximum sizes: +p = m.P("a"); +for( i=1,14 ) { p *= p; } + +p = {}; +for( i=1,100 ) { p[i] = m.P("a"); } +p = m.P(p); + + +// strange values for rule labels + +p = m.P({ "print", + print = m.V(print), + [print] = m.V(_G), + [_G] = m.P("a"), +}); + +assert(p->emulatestreammatch("a")); + +// initial rule +g = {}; +for( i = 1, 10 ) { g["i"..i] = "a" * m.V("i"..i+1); } +g.i11 = m.P(""); +for( i = 1, 10 ) { + g[1] = "i"..i; + var xp = m.P(g); + assert(xp->emulatestreammatch("aaaaaaaaaaa") == 11 - i + 1); +} + +print("+"); + + +// tests for back references +checkerr("back reference 'x' not found", m.emulatestreammatch, m.Cb('x'), ''); +checkerr("back reference 'b' not found", m.emulatestreammatch, m.Cg(1, 'a') * m.Cb('b'), 'a'); + +p = m.Cg(m.C(1) * m.C(1), "k") * m.Ct(m.Cb("k")); +t = p->emulatestreammatch("ab"); +checkeq(t, {"a", "b"}); + +p = m.P(true); +for( i = 1, 10 ) { p = p * m.Cg(1, i); } +for( i = 1, 10 ) { + var xp = p * m.Cb(i); + assert(xp->emulatestreammatch('abcdefghij') == string.sub('abcdefghij', i, i)); +} + + +t = {}; +function foo (p) { t[#t + 1] = p; return p .. 
"x"; } + +p = m.Cg(m.C(2) / foo, "x") * m.Cb("x") * + m.Cg(m.Cb('x') / foo, "x") * m.Cb("x") * + m.Cg(m.Cb('x') / foo, "x") * m.Cb("x") * + m.Cg(m.Cb('x') / foo, "x") * m.Cb("x"); +x = {p->emulatestreammatch('ab')}; +checkeq(x, {'abx', 'abxx', 'abxxx', 'abxxxx'}); +checkeq(t, {'ab', + 'ab', 'abx', + 'ab', 'abx', 'abxx', + 'ab', 'abx', 'abxx', 'abxxx'}); + + + +// tests for match-time captures + +p = m.P('a') * (function (s, i) { return (s(1,-1)->sub(i, i) == 'b') && i + 1; }) + + 'acd'; + +assert(p->emulatestreammatch('abc') == 3); +assert(p->emulatestreammatch('acd') == 4); + +var function id (s, i, ...) { + return true, ...; +} + +assert(m.Cmt(m.Cs((m.Cmt(m.S('abc') / { a = 'x', c = 'y' }, id) + + m.R('09')**1 / string.char + + m.P(1))**0), id)->emulatestreammatch("acb98+68c") == "xyb\98+\68y"); + +p = m.P({'S', + S = m.V('atom') * space + + m.Cmt(m.Ct("(" * space * (m.Cmt(m.V('S')**1, id) + m.P(true)) * ")" * space), id), + atom = m.Cmt(m.C(m.R("AZ", "az", "09")**1), id) +}); +x = p->emulatestreammatch("(a g () ((b) c) (d (e)))"); +checkeq(x, {'a', 'g', {}, {{'b'}, 'c'}, {'d', {'e'}}}); + +x = {(m.Cmt(1, id)**0)->emulatestreammatch(string.rep('a', 500))}; +assert(#x == 500); + +id = function (s, i, x) { + if( x == 'a' ) { return i, 1, 3, 7; + } else { return null, 2, 4, 6, 8; + } +} + +p = ((m.P(id) * 1 + m.Cmt(2, id) * 1 + m.Cmt(1, id) * 1))**0; +assert(table.concat({p->emulatestreammatch('abababab')}) == string.rep('137', 4)); + +var function ref (s, i, x) { + return m.emulatestreammatch(x, s(1,-1), i - x->len()); +} + +assert(m.Cmt(m.P(1)**0, ref)->emulatestreammatch('alo') == 4); +assert((m.P(1) * m.Cmt(m.P(1)**0, ref))->emulatestreammatch('alo') == 4); +assert(! (m.P(1) * m.Cmt(m.C(1)**0, ref))->emulatestreammatch('alo')); + +ref = function (s,i,x) { return i == tonumber(x) && i, 'xuxu'; }; + +assert(m.Cmt(1, ref)->emulatestreammatch('2')); +assert(! 
m.Cmt(1, ref)->emulatestreammatch('1')); +assert(m.Cmt(m.P(1)**0, ref)->emulatestreammatch('03')); + +function ref (s, i, a, b) { + if( a == b ) { return i, a->upper(); } +} + +p = m.Cmt(m.C(m.R("az")**1) * "-" * m.C(m.R("az")**1), ref); +p = (any - p)**0 * p * any**0 * -1; + +assert(p->emulatestreammatch('abbbc-bc ddaa') == 'BC'); + +{ // match-time captures cannot be optimized away +var touch = 0; +f = m.P(function () { ++touch ; return true; }); + +var function check(n) { n = n || 1; assert(touch == n); touch = 0; } + +assert(m.emulatestreammatch(f * false + 'b', 'a') == null); check(); +assert(m.emulatestreammatch(f * false + 'b', '') == null); check(); +assert(m.emulatestreammatch( (f * 'a')**0 * 'b', 'b') == 2); check(); +assert(m.emulatestreammatch( (f * 'a')**0 * 'b', '') == null); check(); +assert(m.emulatestreammatch( (f * 'a')**-1 * 'b', 'b') == 2); check(); +assert(m.emulatestreammatch( (f * 'a')**-1 * 'b', '') == null); check(); +assert(m.emulatestreammatch( ('b' + f * 'a')**-1 * 'b', '') == null); check(); +assert(m.emulatestreammatch( (m.P('b')**-1 * f * 'a')**-1 * 'b', '') == null); check(); +assert(m.emulatestreammatch( (-m.P(1) * m.P('b')**-1 * f * 'a')**-1 * 'b', '') == null); +check(); +assert(m.emulatestreammatch( (f * 'a' + 'b')**-1 * 'b', '') == null); check(); +assert(m.emulatestreammatch(f * 'a' + f * 'b', 'b') == 2); check(2); +assert(m.emulatestreammatch(f * 'a' + f * 'b', 'a') == 2); check(1); +assert(m.emulatestreammatch(-f * 'a' + 'b', 'b') == 2); check(1); +assert(m.emulatestreammatch(-f * 'a' + 'b', '') == null); check(1); +} + +c = '[' * m.Cg(m.P('=')**0, "init") * '[' * + { m.Cmt(']' * m.C(m.P('=')**0) * ']' * m.Cb("init"), function (_, _, s1, s2) { + return s1 == s2; }) + + 1 * m.V(1) } / 0; + +assert(c->emulatestreammatch('[==[]]====]]]]==]===[]') == 18); +assert(c->emulatestreammatch('[[]=]====]=]]]==]===[]') == 14); +assert(! 
c->emulatestreammatch('[[]=]====]=]=]==]===[]')); + + +// old bug: optimization of concat with fail removed match-time capture +p = m.Cmt(0, function (s) { p = s(1,-1); }) * m.P(false); +assert(! p->emulatestreammatch('alo')); +assert(p == 'alo'); + + +// ensure that failed match-time captures are not kept on Lua stack +{ + var xt = {__mode = "kv"}; setmetatable(xt,xt); + var xc = 0; + + var function foo (s,i) { + collectgarbage(); + assert(next(xt) == "__mode" && next(xt, "__mode") == null); + var x = {}; + xt[x] = true; + ++xc ; + return i, x; + } + + var xp = m.P({ m.Cmt(0, foo) * m.P(false) + m.P(1) * m.V(1) + m.P("") }); + xp->emulatestreammatch(string.rep('1', 10)); + assert(xc == 11); +} + +p = (m.P(function () { return true, "a"; }) * 'a' + + m.P(function (s, i) { return i, "aa", 20; }) * 'b' + + m.P(function (s,i) { if( i <= #s(1,-1) ) { return i, "aaa"; } }) * 1)**0; + +t = {p->emulatestreammatch('abacc')}; +checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}); + + +//----------------------------------------------------------------- +// Tests for 're' module +//----------------------------------------------------------------- + +var re = require ("re"); + +var match, compile = re.emulatestreammatch, re.compile; + + + +assert(match("a", ".") == 2); +assert(match("a", "''") == 1); +assert(match("", " ! . ") == 1); +assert(! match("a", " ! . ")); +assert(match("abcde", " ( . . ) * ") == 5); +assert(match("abbcde", " [a-c] +") == 5); +assert(match("0abbc1de", "'0' [a-c]+ '1'") == 7); +assert(match("0zz1dda", "'0' [^a-c]+ 'a'") == 8); +assert(match("abbc--", " [a-c] + +") == 5); +assert(match("abbc--", " [ac-] +") == 2); +assert(match("abbc--", " [-acb] + ") == 7); +assert(! match("abbcde", " [b-z] + ")); +assert(match("abb\"de", '"abb"["]"de"') == 7); +assert(match("abceeef", "'ac' ? 'ab' * 'c' { 'e' * } / 'abceeef' ") == "eee"); +assert(match("abceeef", "'ac'? 'ab'* 'c' { 'f'+ } / 'abceeef' ") == 8); +t = {match("abceefe", "( ( & 'e' {} ) ? . 
) * ")}; +checkeq(t, {4, 5, 7}); +t = {match("abceefe", "((&&'e' {})? .)*")}; +checkeq(t, {4, 5, 7}); +t = {match("abceefe", "( ( ! ! 'e' {} ) ? . ) *")}; +checkeq(t, {4, 5, 7}); +t = {match("abceefe", "(( & ! & ! 'e' {})? .)*")}; +checkeq(t, {4, 5, 7}); + +assert(match("cccx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 5); +assert(match("cdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 4); +assert(match("abcdcdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 8); + +assert(match("abc", "a <- (. a)?") == 4); +b = "balanced <- '(' ([^()] / balanced)* ')'"; +assert(match("(abc)", b)); +assert(match("(a(b)((c) (d)))", b)); +assert(! match("(a(b ((c) (d)))", b)); + +b = compile([=[ balanced <- "(" ([^()] / balanced)* ")" ]=]); +assert(b == m.P(b)); +assert(b->emulatestreammatch("((((a))(b)))")); + +g = [=[ + S <- "0" B / "1" A / "" -- balanced strings + A <- "0" S / "1" A A -- one more 0 + B <- "1" S / "0" B B -- one more 1 +]=]; +assert(match("00011011", g) == 9); + +g = [=[ + S <- ("0" B / "1" A)* + A <- "0" / "1" A A + B <- "1" / "0" B B +]=]; +assert(match("00011011", g) == 9); +assert(match("000110110", g) == 9); +assert(match("011110110", g) == 3); +assert(match("000110010", g) == 1); + +s = "aaaaaaaaaaaaaaaaaaaaaaaa"; +assert(match(s, "'a'^3") == 4); +assert(match(s, "'a'^0") == 1); +assert(match(s, "'a'^+3") == s->len() + 1); +assert(! match(s, "'a'^+30")); +assert(match(s, "'a'^-30") == s->len() + 1); +assert(match(s, "'a'^-5") == 6); +for( i = 1, s->len() ) { + assert(match(s, string.format("'a'^+%d", i)) >= i + 1); + assert(match(s, string.format("'a'^-%d", i)) <= i + 1); + assert(match(s, string.format("'a'^%d", i)) == i + 1); +} +assert(match("01234567890123456789", "[0-9]^3+") == 19); + + +assert(match("01234567890123456789", "({....}{...}) -> '%2%1'") == "4560123"); +t = match("0123456789", "{| {.}* |}"); +checkeq(t, {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}); +assert(match("012345", "{| (..) 
-> '%0%0' |}")[1] == "0101"); + +assert(match("abcdef", "( {.} {.} {.} {.} {.} ) -> 3") == "c"); +assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 3") == "d"); +assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 0") == 6); + +assert(! match("abcdef", "{:x: ({.} {.} {.}) -> 2 :} =x")); +assert(match("abcbef", "{:x: ({.} {.} {.}) -> 2 :} =x")); + +eqcharset(compile("[]]"), "]"); +eqcharset(compile("[][]"), m.S("[]")); +eqcharset(compile("[]-]"), m.S("-]")); +eqcharset(compile("[-]"), m.S("-")); +eqcharset(compile("[az-]"), m.S("a-z")); +eqcharset(compile("[-az]"), m.S("a-z")); +eqcharset(compile("[a-z]"), m.R("az")); +eqcharset(compile("[]['\"]"), m.S([=[]['"]=])); + +eqcharset(compile("[^]]"), any - "]"); +eqcharset(compile("[^][]"), any - m.S("[]")); +eqcharset(compile("[^]-]"), any - m.S("-]")); +eqcharset(compile("[^]-]"), any - m.S("-]")); +eqcharset(compile("[^-]"), any - m.S("-")); +eqcharset(compile("[^az-]"), any - m.S("a-z")); +eqcharset(compile("[^-az]"), any - m.S("a-z")); +eqcharset(compile("[^a-z]"), any - m.R("az")); +eqcharset(compile("[^]['\"]"), any - m.S([=[]['"]=])); + +// tests for comments in 're' +e = compile([=[ +A <- _B -- \t \n %nl .<> <- -> -- +_B <- 'x' --]=]); +assert(e->emulatestreammatch('xy') == 2); + +// tests for 're' with pre-definitions +defs = {digits = m.R("09"), letters = m.R("az"), _=m.P("__")}; +e = compile("%letters (%letters / %digits)*", defs); +assert(e->emulatestreammatch("x123") == 5); +e = compile("%_", defs); +assert(e->emulatestreammatch("__") == 3); + +e = compile([=[ + S <- A+ + A <- %letters+ B + B <- %digits+ +]=], defs); + +e = compile("{[0-9]+'.'?[0-9]*} -> sin", math); +assert(e->emulatestreammatch("2.34") == math.sin(2.34)); + + +function eq (_, _, a, b) { return a == b; } + +c = re.compile([=[ + longstring <- '[' {:init: '='* :} '[' close + close <- ']' =init ']' / . 
close +]=]); + +assert(c->emulatestreammatch('[==[]]===]]]]==]===[]') == 17); +assert(c->emulatestreammatch('[[]=]====]=]]]==]===[]') == 14); +assert(! c->emulatestreammatch('[[]=]====]=]=]==]===[]')); + +c = re.compile(" '[' {:init: '='* :} '[' (!(']' =init ']') .)* ']' =init ']' !. "); + +assert(c->emulatestreammatch('[==[]]===]]]]==]')); +assert(c->emulatestreammatch('[[]=]====]=][]==]===[]]')); +assert(! c->emulatestreammatch('[[]=]====]=]=]==]===[]')); + +assert(re.find("hi alalo", "{:x:..:} =x") == 4); +assert(re.find("hi alalo", "{:x:..:} =x", 4) == 4); +assert(! re.find("hi alalo", "{:x:..:} =x", 5)); +assert(re.find("hi alalo", "{'al'}", 5) == 6); +assert(re.find("hi aloalolo", "{:x:..:} =x") == 8); +assert(re.find("alo alohi x x", "{:word:%w+:}%W*(=word)!%w") == 11); + +// re.find discards any captures +a,b,c = re.find("alo", "{.}{'o'}"); +assert(a == 2 && b == 3 && c == null); + +match = function (s,p) { + var i,e = re.find(s,p); + if( i ) { return s->sub(i, e); } +} +assert(match("alo alo", '[a-z]+') == "alo"); +assert(match("alo alo", '{:x: [a-z]+ :} =x') == null); +assert(match("alo alo", "{:x: [a-z]+ :} ' ' =x") == "alo alo"); + +assert(re.gsub("alo alo", "[abc]", "x") == "xlo xlo"); +assert(re.gsub("alo alo", "%w+", ".") == ". ."); +assert(re.gsub("hi, how are you", "[aeiou]", string.upper) == + "hI, hOw ArE yOU"); + +s = 'hi [[a comment[=]=] ending here]] and [=[another]]=]]'; +c = re.compile(" '[' {:i: '='* :} '[' (!(']' =i ']') .)* ']' { =i } ']' "); +assert(re.gsub(s, c, "%2") == 'hi  and =]'); +assert(re.gsub(s, c, "%0") == s); +assert(re.gsub('[=[hi]=]', c, "%2") == '='); + +assert(re.find("", "!.") == 1); +assert(re.find("alo", "!.") == 4); + +function addtag (s, i, t, tag) { t.tag = tag; return i, t; } + +c = re.compile([=[ + doc <- block !. + block <- (start {| (block / { [^<]+ })* |} end?) 
=> addtag + start <- '<' {:tag: [a-z]+ :} '>' + end <- '</' { =tag } '>' +]=], {addtag = addtag}); + +x = c->emulatestreammatch([=[ +<x>hi<b>hello</b>but<b>totheend</x>]=]); +checkeq(x, {tag='x', 'hi', {tag = 'b', 'hello'}, 'but', + {'totheend'}}); + + +// tests for look-ahead captures +x = {re.emulatestreammatch("alo", "&(&{.}) !{'b'} {&(...)} &{..} {...} {!.}")}; +checkeq(x, {"", "alo", ""}); + +assert(re.emulatestreammatch("aloalo", + "{~ (((&'al' {.}) -> 'A%1' / (&%l {.}) -> '%1%1') / .)* ~}") + == "AallooAalloo"); + +// bug in 0.9 (and older versions), due to captures in look-aheads +x = re.compile([=[ {~ (&(. ([a-z]* -> '*')) ([a-z]+ -> '+') ' '*)* ~} ]=]); +assert(x->emulatestreammatch("alo alo") == "+ +"); + +// valid capture in look-ahead (used inside the look-ahead itself) +x = re.compile([=[ + S <- &({:two: .. :} . =two) {[a-z]+} / . S +]=]); +assert(x->emulatestreammatch("hello aloaLo aloalo xuxu") == "aloalo"); + + +p = re.compile([=[ + block <- {| {:ident:space*:} line + ((=ident !space line) / &(=ident space) block)* |} + line <- {[^%nl]*} %nl + space <- '_' -- should be ' ', but '_' is simpler for editors +]=]); + +t= p->emulatestreammatch([=[ +1 +__1.1 +__1.2 +____1.2.1 +____ +2 +__2.1 +]=]); +checkeq(t, {"1", {"1.1", "1.2", {"1.2.1", "", ident = "____"}, ident = "__"}, + "2", {"2.1", ident = "__"}, ident = ""}); + + +// nested grammars +p = re.compile([=[ + s <- a b !. + b <- ( x <- ('b' x)? ) + a <- ( x <- 'a' x? ) +]=]); + +assert(p->emulatestreammatch('aaabbb')); +assert(p->emulatestreammatch('aaa')); +assert(! p->emulatestreammatch('bbb')); +assert(! 
p->emulatestreammatch('aaabbba')); + +// testing groups +t = {re.emulatestreammatch("abc", "{:S <- {:.:} {S} / '':}")}; +checkeq(t, {"a", "bc", "b", "c", "c", ""}); + +t = re.emulatestreammatch("1234", "{| {:a:.:} {:b:.:} {:c:.{.}:} |}"); +checkeq(t, {a="1", b="2", c="4"}); +t = re.emulatestreammatch("1234", "{|{:a:.:} {:b:{.}{.}:} {:c:{.}:}|}"); +checkeq(t, {a="1", b="2", c="4"}); +t = re.emulatestreammatch("12345", "{| {:.:} {:b:{.}{.}:} {:{.}{.}:} |}"); +checkeq(t, {"1", b="2", "4", "5"}); +t = re.emulatestreammatch("12345", "{| {:.:} {:{:b:{.}{.}:}:} {:{.}{.}:} |}"); +checkeq(t, {"1", "23", "4", "5"}); +t = re.emulatestreammatch("12345", "{| {:.:} {{:b:{.}{.}:}} {:{.}{.}:} |}"); +checkeq(t, {"1", "23", "4", "5"}); + + +// testing pre-defined names +assert(os.setlocale("C") == "C"); + +function eqlpeggsub (p1, p2) { + var s1 = cs2str(re.compile(p1)); + var s2 = string.gsub(allchar, "[^" .. p2 .. "]", ""); + // if s1 ~= s2 then print(#s1,#s2) end + assert(s1 == s2); +} + + +eqlpeggsub("%w", "%w"); +eqlpeggsub("%a", "%a"); +eqlpeggsub("%l", "%l"); +eqlpeggsub("%u", "%u"); +eqlpeggsub("%p", "%p"); +eqlpeggsub("%d", "%d"); +eqlpeggsub("%x", "%x"); +eqlpeggsub("%s", "%s"); +eqlpeggsub("%c", "%c"); + +eqlpeggsub("%W", "%W"); +eqlpeggsub("%A", "%A"); +eqlpeggsub("%L", "%L"); +eqlpeggsub("%U", "%U"); +eqlpeggsub("%P", "%P"); +eqlpeggsub("%D", "%D"); +eqlpeggsub("%X", "%X"); +eqlpeggsub("%S", "%S"); +eqlpeggsub("%C", "%C"); + +eqlpeggsub("[%w]", "%w"); +eqlpeggsub("[_%w]", "_%w"); +eqlpeggsub("[^%w]", "%W"); +eqlpeggsub("[%W%S]", "%W%S"); + +re.updatelocale(); + + +// testing nested substitutions x string captures + +p = re.compile([=[ + text <- {~ item* ~} + item <- macro / [^()] / '(' item* ')' + arg <- ' '* {~ (!',' item)* ~} + args <- '(' arg (',' arg)* ')' + macro <- ('apply' args) -> '%1(%2)' + / ('add' args) -> '%1 + %2' + / ('mul' args) -> '%1 * %2' +]=]); + +assert(p->emulatestreammatch("add(mul(a,b), apply(f,x))") == "a * b + f(x)"); + +rev = re.compile([=[ R 
<- (!.) -> '' / ({.} R) -> '%2%1']=]); + +assert(rev->emulatestreammatch("0123456789") == "9876543210"); + + +// testing error messages in re + +var function errmsg (p, err) { + checkerr(err, re.compile, p); +} + +errmsg('aaaa', "rule 'aaaa'"); +errmsg('a', 'outside'); +errmsg('b <- a', 'undefined'); +errmsg("x <- 'a' x <- 'b'", 'already defined'); +errmsg("'a' -", "near '-'"); + + +print("OK"); + + diff --git a/tests/streamtest2.ljs b/tests/streamtest2.ljs new file mode 100644 index 0000000..0745fd8 --- /dev/null +++ b/tests/streamtest2.ljs @@ -0,0 +1,189 @@ +package.path = "./lpeglj/?.ljs;" .. package.path; + +var m = require("lpeglj"); +var re = require("re"); + +var function checkeq(x, y, p) { + if( p ) { print(x, y); } + if( type(x) != "table" ) { assert(x == y); + } else { + for( k, v in pairs(x) ) { checkeq(v, y[k], p); } + for( k, v in pairs(y) ) { checkeq(v, x[k], p); } + } +} + +var ret; + +print("Tests for LPegLJ stream mode"); + +assert(type(m.version()) == "string"); +print("version " .. 
m.version()); + +var pat = m.C('abcd') * m.C('x'); +var fce = pat->streammatch(); + +ret = { fce("a") }; +checkeq(ret, { 1 }); +ret = { fce("b") }; +checkeq(ret, { 1 }); +ret = { fce("c") }; +checkeq(ret, { 1 }); +ret = { fce("d") }; +checkeq(ret, { 1, "abcd" }); +ret = { fce("x") }; +checkeq(ret, { 0, 'x' }); + +pat = m.C('abcd') * m.C('x') + m.C('abcd') * m.C('y'); +fce = pat->streammatch(); +ret = { fce("abcd") }; +checkeq(ret, { 1 }); +ret = { fce("y") }; +checkeq(ret, { 0, "abcd", "y" }); + +pat = m.C('abcd') ** 0 * m.C('x'); +fce = pat->streammatch(); +for( i = 1, 1e3 ) { + ret = { fce("ab") }; + checkeq(ret, { 1 }); + ret = { fce("cd") }; + checkeq(ret, { 1, "abcd" }); +} +ret = { fce("x") }; +checkeq(ret, { 0, "x" }); + +pat = (m.C('abcd') / "out") ** 0 * m.C('x'); +fce = pat->streammatch(); +for( i = 1, 1e3 ) { + ret = { fce("ab") }; + checkeq(ret, { 1 }); + ret = { fce("cd") }; + checkeq(ret, { 1, "out" }); +} +ret = { fce("x") }; +checkeq(ret, { 0, "x" }); + +pat = (m.C('abcd') / "pattern1" + m.C('efgh') / "pattern2" + (m.P(1) - 'xyz')) ** 0 * (m.C("xyz") / "pattern3"); +fce = pat->streammatch(); + +for( i = 1, 1e3 ) { + ret = { fce("ef") }; + checkeq(ret, { 1 }); + ret = { fce("gh") }; + checkeq(ret, { 1, "pattern2" }); + ret = { fce("a") }; + checkeq(ret, { 1 }); + ret = { fce("bcd") }; + checkeq(ret, { 1, "pattern1" }); +} +ret = { fce("xyz") }; +checkeq(ret, { 0, "pattern3" }); + +pat = m.P('abcd') * -1; +fce = pat->streammatch(); +ret = { fce("abc") }; +checkeq(ret, { 1 }); +ret = { fce("d") }; +checkeq(ret, { 1 }); +ret = { fce("", true) }; +checkeq(ret, { 0, 5 }); + +var field = '"' * m.Cs(((m.P(1) - '"') + m.P('""') / '"') ** 0) * '"' + + m.C((1 - m.S(',\n"')) ** 0); + +var record = field * (',' * field) ** 0 * (m.P('\n') + -1); + +fce = record->streammatch(); +ret = { fce('ab') }; +checkeq(ret, { 1 }); +ret = { fce('c') }; +checkeq(ret, { 1 }); +ret = { fce(',"def",') }; +checkeq(ret, { 1, 'abc', 'def' }); +ret = { fce('x', true) }; 
+checkeq(ret, { 0, 'x' }); + +record = re.compile([=[ + record <- field (',' field)* (%nl / !.) + field <- escaped / nonescaped + nonescaped <- { [^,"%nl]* } + escaped <- '"' {~ ([^"] / '""' -> '"')* ~} '"' +]=]); + +fce = record->streammatch(); +ret = { fce("a") }; +checkeq(ret, { 1 }); +ret = { fce("bc,") }; +checkeq(ret, { 1, 'abc' }); +ret = { fce("def", true) }; +checkeq(ret, { 0, 'def' }); + +var c = re.compile([=[ + s <- ({(!longstring .)+} / longstring)* + longstring <- '[' {:init: '='* :} '[' close + close <- ']' =init ']' / . close +]=]); + +var teststring = 'data1[=[insidedata1]=]data2[==[====]==]data3[[]]'; + +var output = { 'data1', 'data2', 'data3' }; + +fce = c->streammatch(); + +var index = 1; + +for( i = 1, #output ) { + var status, data; + do { + status, data = fce(teststring->sub(index, index), index == #teststring); + ++index ; + } while(!( data || status != 1) ); + checkeq(output[i], data); +} + +pat = m.C('a') * m.Cg('b', 'backref1') * m.C('c') * m.Cg('d', 'backref2') * m.C('e') * m.Cg('f', 'backref3') * + m.Cb('backref1') * m.C('g') * m.Cb('backref2') * m.C('h') * m.Cb('backref3') * m.C('i'); +fce = pat->streammatch(); + +ret = { fce("a") }; +checkeq(ret, { 1, 'a' }); +ret = { fce("b") }; +checkeq(ret, { 1 }); +ret = { fce("c") }; +checkeq(ret, { 1, "c" }); +ret = { fce("d") }; +checkeq(ret, { 1, }); +ret = { fce("e") }; +checkeq(ret, { 1, "e" }); +ret = { fce("f") }; +checkeq(ret, { 1, "b" }); +ret = { fce("g") }; +checkeq(ret, { 1, "g", "d" }); +ret = { fce("h") }; +checkeq(ret, { 1, "h", "f" }); +ret = { fce("i") }; +checkeq(ret, { 0, "i" }); + +pat = m.C('a') * (m.Cg(1, 'backref') * m.C('x1') * m.Cb('backref') + m.Cg(1, 'backref') * m.C('x2') * m.Cb('backref')); +fce = pat->streammatch(); +ret = { fce("a") }; +checkeq(ret, { 1, 'a' }); +ret = { fce("x") }; +checkeq(ret, { 1 }); +ret = { fce("x") }; +checkeq(ret, { 1 }); +ret = { fce("2") }; +checkeq(ret, { 0, 'x2', 'x' }); + + +pat = m.C('a') * m.Ct(m.Cg('b', 'index')) * m.C('c'); +fce = 
pat->streammatch(); + +ret = { fce("a") }; +checkeq(ret, { 1, 'a' }); +ret = { fce("b") }; +checkeq(ret, { 1, { index = 'b' } }); +ret = { fce("c") }; +checkeq(ret, { 0, 'c' }); + +print('OK'); + diff --git a/tests/test.ljs b/tests/test.ljs new file mode 100644 index 0000000..7a298a9 --- /dev/null +++ b/tests/test.ljs @@ -0,0 +1,1445 @@ +#!/usr/bin/env lua5.1 + +// $Id: test.lua,v 1.109 2015/09/28 17:01:25 roberto Exp $ + +// require"strict" -- just to be pedantic + +package.path = "./lpeglj/?.ljs;" .. package.path; + +var m = require("lpeglj"); + + +// for general use +var a, b, c, d, e, f, g, p, t; + + +// compatibility with Lua 5.2 +var unpack = rawget(table, "unpack") || unpack; +var loadstring = rawget(_G, "loadstring") || load; + + + +var any = m.P(1); +var space = m.S(" \t\n")**0; + +var function checkeq (x, y, p) { +if( p ) { print(x,y); } + if( type(x) != "table" ) { assert(x == y); + } else { + for( k,v in pairs(x) ) { checkeq(v, y[k], p); } + for( k,v in pairs(y) ) { checkeq(v, x[k], p); } + } +} + + +var mt = getmetatable(m.P(1)); +mt = m.version() == "1.0.0.0LJ" && m || mt; + + +var allchar = {}; +for( i=0,255 ) { allchar[i + 1] = i; } +allchar = string.char(unpack(allchar)); +assert(#allchar == 256); + +var function cs2str (c) { + return m.match(m.Cs((c + m.P(1)/"")**0), allchar); +} + +var function eqcharset (c1, c2) { + assert(cs2str(c1) == cs2str(c2)); +} + + +print("General tests for LPeg library"); + +assert(type(m.version()) == "string"); +print("version " .. 
m.version()); +assert(m.type("alo") != "pattern"); +assert(m.type(io.input) != "pattern"); +assert(m.type(m.P("alo")) == "pattern"); + +// tests for some basic optimizations +assert(m.match(m.P(false) + "a", "a") == 2); +assert(m.match(m.P(true) + "a", "a") == 1); +assert(m.match("a" + m.P(false), "b") == null); +assert(m.match("a" + m.P(true), "b") == 1); + +assert(m.match(m.P(false) * "a", "a") == null); +assert(m.match(m.P(true) * "a", "a") == 2); +assert(m.match("a" * m.P(false), "a") == null); +assert(m.match("a" * m.P(true), "a") == 2); + +assert(m.match(#m.P(false) * "a", "a") == null); +assert(m.match(#m.P(true) * "a", "a") == 2); +assert(m.match("a" * #m.P(false), "a") == null); +assert(m.match("a" * #m.P(true), "a") == 2); + + +// tests for locale +{ + assert(m.locale(m) == m); + t = {}; + assert(m.locale(t, m) == t); + var x = m.locale(); + for( n,v in pairs(x) ) { + assert(type(n) == "string"); + eqcharset(v, m[n]); + } +} + + +assert(m.match(3, "aaaa")); +assert(m.match(4, "aaaa")); +assert(! m.match(5, "aaaa")); +assert(m.match(-3, "aa")); +assert(! m.match(-3, "aaa")); +assert(! m.match(-3, "aaaa")); +assert(! m.match(-4, "aaaa")); +assert(m.P(-5)->match("aaaa")); + +assert(m.match("a", "alo") == 2); +assert(m.match("al", "alo") == 3); +assert(! 
m.match("alu", "alo")); +assert(m.match(true, "") == 1); + +var digit = m.S("0123456789"); +var upper = m.S("ABCDEFGHIJKLMNOPQRSTUVWXYZ"); +var lower = m.S("abcdefghijklmnopqrstuvwxyz"); +var letter = m.S("") + upper + lower; +var alpha = letter + digit + m.R(); + +eqcharset(m.S(""), m.P(false)); +eqcharset(upper, m.R("AZ")); +eqcharset(lower, m.R("az")); +eqcharset(upper + lower, m.R("AZ", "az")); +eqcharset(upper + lower, m.R("AZ", "cz", "aa", "bb", "90")); +eqcharset(digit, m.S("01234567") + "8" + "9"); +eqcharset(upper, letter - lower); +eqcharset(m.S(""), m.R()); +assert(cs2str(m.S("")) == ""); + +eqcharset(m.S("\0"), "\0"); +eqcharset(m.S("\1\0\2"), m.R("\0\2")); +eqcharset(m.S("\1\0\2"), m.R("\1\2") + "\0"); +eqcharset(m.S("\1\0\2") - "\0", m.R("\1\2")); + +var word = alpha**1 * (1 - alpha)**0; + +assert((word**0 * -1)->match("alo alo")); +assert(m.match(word**1 * -1, "alo alo")); +assert(m.match(word**2 * -1, "alo alo")); +assert(! m.match(word**3 * -1, "alo alo")); + +assert(! m.match(word**-1 * -1, "alo alo")); +assert(m.match(word**-2 * -1, "alo alo")); +assert(m.match(word**-3 * -1, "alo alo")); + +var eos = m.P(-1); + +assert(m.match(digit**0 * letter * digit * eos, "1298a1")); +assert(! m.match(digit**0 * letter * eos, "1257a1")); + +b = { + [1] = "(" * (((1 - m.S("()")) + #m.P("(") * m.V(1))**0) * ")" +}; + +assert(m.match(b, "(al())()")); +assert(! m.match(b * eos, "(al())()")); +assert(m.match(b * eos, "((al())()(é))")); +assert(! m.match(b, "(al()()")); + +assert(! m.match(letter**1 - "for", "foreach")); +assert(m.match(letter**1 - ("for" * eos), "foreach")); +assert(! 
m.match(letter**1 - ("for" * eos), "for")); + +/* basiclookfor(p): returns a one-rule grammar that tries p at the current position and, when p fails, consumes one character and tries the rule again. */ function basiclookfor (p) { + return m.P ({ + [1] = p + (1 * m.V(1)) + }); +} + +/* caplookfor(p): basiclookfor applied to p wrapped in a capture (p->C()). */ function caplookfor (p) { + return basiclookfor(p->C()); +} + +assert(m.match(caplookfor(letter**1), " 4achou123...") == "achou"); +a = {m.match(caplookfor(letter**1)**0, " two words, one more ")}; +checkeq(a, {"two", "words", "one", "more"}); + +assert(m.match( basiclookfor((#m.P(b) * 1) * m.Cp()), " ( (a)") == 7); + +a = {m.match(m.C(digit**1 * m.Cc("d")) + m.C(letter**1 * m.Cc("l")), "123")}; +checkeq(a, {"123", "d"}); + +// bug in LPeg 0.12 (null value does not create a 'ktable') +assert(m.match(m.Cc(null), "") == null); + +a = {m.match(m.C(digit**1 * m.Cc("d")) + m.C(letter**1 * m.Cc("l")), "abcd")}; +checkeq(a, {"abcd", "l"}); + +a = {m.match(m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')}; +checkeq(a, {10,20,30,2}); +a = {m.match(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')}; +checkeq(a, {1,10,20,30,2}); +a = m.match(m.Ct(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa'); +checkeq(a, {1,10,20,30,2}); +a = m.match(m.Ct(m.Cp() * m.Cc(7,8) * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa'); +checkeq(a, {1,7,8,10,20,30,2}); +a = {m.match(m.Cc() * m.Cc() * m.Cc(1) * m.Cc(2,3,4) * m.Cc() * 'a', 'aaa')}; +checkeq(a, {1,2,3,4}); + +a = {m.match(m.Cp() * letter**1 * m.Cp(), "abcd")}; +checkeq(a, {1, 5}); + + +t = {m.match({[1] = m.C(m.C(1) * m.V(1) + -1)}, "abc")}; +checkeq(t, {"abc", "a", "bc", "b", "c", "c", ""}); +// bug in 0.12 ('hascapture' did not check for captures inside a rule) +{ + var pat = m.P({ + 'S'; + S1 = m.C('abc') + 3, + S = #m.V('S1') // rule has capture, but '#' must ignore it + }); + assert(pat->match('abc') == 1); +} + + +// test for small capture boundary +for( i = 250,260 ) { + assert(#m.match(m.C(i), string.rep('a', i)) == i); + assert(#m.match(m.C(m.C(i)), string.rep('a', i)) == i); +} + + +// tests for any*n and any*-n +for( n = 1, 550, 13 ) { + var x_1 = string.rep('x', n - 1); + var x = x_1 .. 'a'; + assert(! 
m.P(n)->match(x_1)); + assert(m.P(n)->match(x) == n + 1); + assert(n < 4 || m.match(m.P(n) + "xxx", x_1) == 4); + assert(m.C(n)->match(x) == x); + assert(m.C(m.C(n))->match(x) == x); + assert(m.P(-n)->match(x_1) == 1); + assert(! m.P(-n)->match(x)); + assert(n < 13 || m.match(m.Cc(20) * ((n - 13) * m.P(10)) * 3, x) == 20); + var n3 = math.floor(n/3); + assert(m.match(n3 * m.Cp() * n3 * n3, x) == n3 + 1); +} + +// true values +assert(m.P(0)->match("x") == 1); +assert(m.P(0)->match("") == 1); +assert(m.C(0)->match("x") == ""); + +assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxu") == 1); +assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxuxuxuxu") == 0); +assert(m.match(m.C(m.P(2)**1), "abcde") == "abcd"); +p = m.Cc(0) * 1 + m.Cc(1) * 2 + m.Cc(2) * 3 + m.Cc(3) * 4; + + +// test for alternation optimization +assert(m.match(m.P("a")**1 + "ab" + m.P("x")**0, "ab") == 2); +assert(m.match((m.P("a")**1 + "ab" + m.P("x")**0 * 1)**0, "ab") == 3); +assert(m.match(m.P("ab") + "cd" + "" + "cy" + "ak", "98") == 1); +assert(m.match(m.P("ab") + "cd" + "ax" + "cy", "ax") == 3); +assert(m.match("a" * m.P("b")**0 * "c" + "cd" + "ax" + "cy", "ax") == 3); +assert(m.match((m.P("ab") + "cd" + "ax" + "cy")**0, "ax") == 3); +assert(m.match(m.P(1) * "x" + m.S("") * "xu" + "ay", "ay") == 3); +assert(m.match(m.P("abc") + "cde" + "aka", "aka") == 4); +assert(m.match(m.S("abc") * "x" + "cde" + "aka", "ax") == 3); +assert(m.match(m.S("abc") * "x" + "cde" + "aka", "aka") == 4); +assert(m.match(m.S("abc") * "x" + "cde" + "aka", "cde") == 4); +assert(m.match(m.S("abc") * "x" + "ide" + m.S("ab") * "ka", "aka") == 4); +assert(m.match("ab" + m.S("abc") * m.P("y")**0 * "x" + "cde" + "aka", "ax") == 3); +assert(m.match("ab" + m.S("abc") * m.P("y")**0 * "x" + "cde" + "aka", "aka") == 4); +assert(m.match("ab" + m.S("abc") * m.P("y")**0 * "x" + "cde" + "aka", "cde") == 4); +assert(m.match("ab" + m.S("abc") * m.P("y")**0 * "x" + "ide" + m.S("ab") * "ka", "aka") == 4); +assert(m.match("ab" + 
m.S("abc") * m.P("y")**0 * "x" + "ide" + m.S("ab") * "ka", "ax") == 3); +assert(m.match(m.P(1) * "x" + "cde" + m.S("ab") * "ka", "aka") == 4); +assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "aka") == 4); +assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "cde") == 4); +assert(m.match(m.P("eb") + "cd" + m.P("e")**0 + "x", "ee") == 3); +assert(m.match(m.P("ab") + "cd" + m.P("e")**0 + "x", "abcd") == 3); +assert(m.match(m.P("ab") + "cd" + m.P("e")**0 + "x", "eeex") == 4); +assert(m.match(m.P("ab") + "cd" + m.P("e")**0 + "x", "cd") == 3); +assert(m.match(m.P("ab") + "cd" + m.P("e")**0 + "x", "x") == 1); +assert(m.match(m.P("ab") + "cd" + m.P("e")**0 + "x" + "", "zee") == 1); +assert(m.match(m.P("ab") + "cd" + m.P("e")**1 + "x", "abcd") == 3); +assert(m.match(m.P("ab") + "cd" + m.P("e")**1 + "x", "eeex") == 4); +assert(m.match(m.P("ab") + "cd" + m.P("e")**1 + "x", "cd") == 3); +assert(m.match(m.P("ab") + "cd" + m.P("e")**1 + "x", "x") == 2); +assert(m.match(m.P("ab") + "cd" + m.P("e")**1 + "x" + "", "zee") == 1); +assert(! m.match(("aa" * m.P("bc")**-1 + "aab") * "e", "aabe")); + +assert(m.match("alo" * (m.P("\n") + -1), "alo") == 4); + + +// bug in 0.12 (rc1) +assert(m.match((m.P("\128\187\191") + m.S("abc"))**0, "\128\187\191") == 4); + +assert(m.match(m.S("\0\128\255\127")**0, string.rep("\0\128\255\127", 10)) == + 4*10 + 1); + +// optimizations with optional parts +assert(m.match(("ab" * -m.P("c"))**-1, "abc") == 1); +assert(m.match(("ab" * #m.P("c"))**-1, "abd") == 1); +assert(m.match(("ab" * m.B("c"))**-1, "ab") == 1); +assert(m.match(("ab" * m.P("cd")**0)**-1, "abcdcdc") == 7); + +assert(m.match(m.P("ab")**-1 - "c", "abcd") == 3); + +p = ('Aa' * ('Bb' * ('Cc' * m.P('Dd')**0)**0)**0)**-1; +assert(p->match("AaBbCcDdBbCcDdDdDdBb") == 21); + +// bug in 0.12.2 +// p = { ('ab' ('c' 'ef'?)*)? 
} +p = m.C(('ab' * ('c' * m.P('ef')**-1)**0)**-1); +s = "abcefccefc"; +assert(s == p->match(s)); + + +pi = "3.14159 26535 89793 23846 26433 83279 50288 41971 69399 37510"; +assert(m.match(m.Cs((m.P("1") / "a" + m.P("5") / "b" + m.P("9") / "c" + 1)**0), pi) == + m.match(m.Cs((m.P(1) / {["1"] = "a", ["5"] = "b", ["9"] = "c"})**0), pi)); +print("+"); + + +// tests for capture optimizations +assert(m.match((m.P(3) + 4 * m.Cp()) * "a", "abca") == 5); +t = {m.match(((m.P("a") + m.Cp()) * m.P("x"))**0, "axxaxx")}; +checkeq(t, {3, 6}); + + +// tests for numbered captures +p = m.C(1); +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 3, "abcdefgh") == "a"); +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 1, "abcdefgh") == "abcdef"); +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 4, "abcdefgh") == "bc"); +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 0, "abcdefgh") == 7); + +a, b, c = m.match(p * (m.C(p * m.C(2)) * m.C(3) / 4) * p, "abcdefgh"); +assert(a == "a" && b == "efg" && c == "h"); + +// test for table captures +t = m.match(m.Ct(letter**1), "alo"); +checkeq(t, {}); + +t, n = m.match(m.Ct(m.C(letter)**1) * m.Cc("t"), "alo"); +assert(n == "t" && table.concat(t) == "alo"); + +t = m.match(m.Ct(m.C(m.C(letter)**1)), "alo"); +assert(table.concat(t, ";") == "alo;a;l;o"); + +t = m.match(m.Ct(m.C(m.C(letter)**1)), "alo"); +assert(table.concat(t, ";") == "alo;a;l;o"); + +t = m.match(m.Ct(m.Ct((m.Cp() * letter * m.Cp())**1)), "alo"); +assert(table.concat(t[1], ";") == "1;2;2;3;3;4"); + +t = m.match(m.Ct(m.C(m.C(1) * 1 * m.C(1))), "alo"); +checkeq(t, {"alo", "a", "o"}); + + +// tests for groups +p = m.Cg(1); // no capture +assert(p->match('x') == 'x'); +p = m.Cg(m.P(true)/function () { } * 1); // no value +assert(p->match('x') == 'x'); +p = m.Cg(m.Cg(m.Cg(m.C(1)))); +assert(p->match('x') == 'x'); +p = m.Cg(m.Cg(m.Cg(m.C(1))**0) * m.Cg(m.Cc(1) * m.Cc(2))); +t = {p->match('abc')}; +checkeq(t, {'a', 'b', 'c', 1, 2}); + +p = m.Ct(m.Cg(m.Cc(10), "hi") * m.C(1)**0 * m.Cg(m.Cc(20), 
"ho")); +t = p->match(''); +checkeq(t, {hi = 10, ho = 20}); +t = p->match('abc'); +checkeq(t, {hi = 10, ho = 20, 'a', 'b', 'c'}); +// non-string group names +p = m.Ct(m.Cg(1, print) * m.Cg(1, 23.5) * m.Cg(1, io)); +t = p->match('abcdefghij'); +assert(t[print] == 'a' && t[23.5] == 'b' && t[io] == 'c'); + + +// test for error messages +/* checkerr(msg, f, ...): calls f(...) under pcall and asserts both that the call failed and that msg occurs somewhere in the error value (searched with an inline look-for grammar, so msg is matched literally). */ var function checkerr (msg, f, ...) { + var st, err = pcall(f, ...); + assert(! st && m.match({ m.P(msg) + 1 * m.V(1) }, err)); +} + +checkerr("rule '1' may be left recursive", m.match, { m.V(1) * 'a' }, "a"); +checkerr("rule '1' used outside a grammar", m.match, m.V(1), ""); +checkerr("rule 'hiii' used outside a grammar", m.match, m.V('hiii'), ""); +checkerr("rule 'hiii' undefined in given grammar", m.match, { m.V('hiii') }, ""); +checkerr("undefined in given grammar", m.match, { m.V({}) }, ""); + +checkerr("rule 'A' is not a pattern", m.P, { m.P(1), A = {} }); +checkerr("grammar has no initial rule", m.P, { [print] = {} }); + +// grammar with a long call chain before left recursion +p = {'a', + a = m.V('b') * m.V('c') * m.V('d') * m.V('a'), + b = m.V('c'), + c = m.V('d'), + d = m.V('e'), + e = m.V('f'), + f = m.V('g'), + g = m.P('') +}; +checkerr("rule 'a' may be left recursive", m.match, p, "a"); + +// Bug in peephole optimization of LPeg 0.12 (IJmp -> ICommit) +// the next grammar has an original sequence IJmp -> ICommit -> IJmp L1 +// that is optimized to ICommit L1 + +p = m.P ({ (m.P ({m.P('abc')}) + 'ayz') * m.V('y'); y = m.P('x') }); +assert(p->match('abcx') == 5 && p->match('ayzx') == 5 && ! 
p->match('abc')); + + +{ + // large dynamic Cc + var lim = 2**16 - 1; + var xc = 0; + /* seq(n): sequence of n constant captures built by recursive halving; each leaf increments the shared counter xc and captures its new value. The local m in the else branch shadows the lpeg module inside that branch only. */ var function seq (n) { + if( n == 1 ) { ++xc ; return m.Cc(xc); + } else { + var m = math.floor(n / 2); + return seq(m) * seq(n - m); + } + } + p = m.Ct(seq(lim)); + t = p->match(''); + assert(t[lim] == lim); + checkerr("too many", function () { p /= print; }); + checkerr("too many", seq, lim + 1); +} +// tests for non-pattern as arguments to pattern functions + +p = { ('a' * m.V(1))**-1 } * m.P('b') * { 'a' * m.V(2); m.V(1)**-1 }; +assert(m.match(p, "aaabaac") == 7); + +p = m.P('abc') * 2 * -5 * true * 'de'; // mix of numbers and strings and booleans + +assert(p->match("abc01de") == 8); +assert(p->match("abc01de3456") == null); + +p = 'abc' * (2 * (-5 * (true * m.P('de')))); + +assert(p->match("abc01de") == 8); +assert(p->match("abc01de3456") == null); + +p = { m.V(2), m.P("abc") } * + (m.P({ "xx", xx = m.P("xx") }) + { "x", x = m.P("a") * m.V("x") + "" }); +assert(p->match("abcaaaxx") == 7); +assert(p->match("abcxx") == 6); + + +// a large table capture +t = m.match(m.Ct(m.C('a')**0), string.rep("a", 10000)); +assert(#t == 10000 && t[1] == 'a' && t[#t] == 'a'); + +print('+'); + + +// bug in 0.10 (rechecking a grammar, after tail-call optimization) +m.P({ m.P ({ (m.P(3) + "xuxu")**0 * m.V("xuxu"), xuxu = m.P(1) }) }); + +var V = m.V; + +var Space = m.S(" \n\t")**0; +var Number = m.C(m.R("09")**1) * Space; +var FactorOp = m.C(m.S("+-")) * Space; +var TermOp = m.C(m.S("*/")) * Space; +var Open = "(" * Space; +var Close = ")" * Space; + + +/* f_factor(v1, op, v2, d): fold step used by rule Exp; asserts that no fourth value was passed, then returns the sum of v1 and v2 when op is "+" and their difference for any other op. */ var function f_factor (v1, op, v2, d) { + assert(d == null); + if( op == "+" ) { return v1 + v2; + } else { return v1 - v2; + } +} + + +/* f_term(v1, op, v2, d): fold step used by rule Factor; asserts that no fourth value was passed, then returns the product of v1 and v2 when op is "*" and their quotient for any other op. */ var function f_term (v1, op, v2, d) { + assert(d == null); + if( op == "*" ) { return v1 * v2; + } else { return v1 / v2; + } +} + +G = m.P({ "Exp", + Exp = m.Cf(V("Factor") * m.Cg(FactorOp * V("Factor"))**0, f_factor); + Factor = m.Cf(V("Term") * m.Cg(TermOp * V("Term"))**0, f_term); + Term = Number / tonumber + 
Open * V("Exp") * Close; +}); + +G = Space * G * -1; + +for( _, s in ipairs({" 3 + 5*9 / (1+1) ", "3+4/2", "3+3-3- 9*2+3*9/1- 8"}) ) { + assert(m.match(G, s) == loadstring("return "..s)()); +} + + +// test for grammars (errors deep in calling non-terminals) +g = m.P({ + [1] = m.V(2) + "a", + [2] = "a" * m.V(3) * "x", + [3] = "b" * m.V(3) + "c" +}); + +assert(m.match(g, "abbbcx") == 7); +assert(m.match(g, "abbbbx") == 2); + + +// tests for \0 +assert(m.match(m.R("\0\1")**1, "\0\1\0") == 4); +assert(m.match(m.S("\0\1ab")**1, "\0\1\0a") == 5); +assert(m.match(m.P(1)**3, "\0\1\0a") == 5); +assert(! m.match(-4, "\0\1\0a")); +assert(m.match("\0\1\0a", "\0\1\0a") == 5); +assert(m.match("\0\0\0", "\0\0\0") == 4); +assert(! m.match("\0\0\0", "\0\0")); + + +// tests for predicates +assert(! m.match(-m.P("a") * 2, "alo")); +assert(m.match(- -m.P("a") * 2, "alo") == 3); +assert(m.match(#m.P("a") * 2, "alo") == 3); +assert(m.match(##m.P("a") * 2, "alo") == 3); +assert(! m.match(##m.P("c") * 2, "alo")); +assert(m.match(m.Cs((##m.P("a") * 1 + m.P(1)/".")**0), "aloal") == "a..a."); +assert(m.match(m.Cs((#((#m.P("a"))/"") * 1 + m.P(1)/".")**0), "aloal") == "a..a."); +assert(m.match(m.Cs((- -m.P("a") * 1 + m.P(1)/".")**0), "aloal") == "a..a."); +assert(m.match(m.Cs((-((-m.P("a"))/"") * 1 + m.P(1)/".")**0), "aloal") == "a..a."); + +p = -m.P('a') * m.Cc(1) + -m.P('b') * m.Cc(2) + -m.P('c') * m.Cc(3); +assert(p->match('a') == 2 && p->match('') == 1 && p->match('b') == 1); + +p = -m.P('a') * m.Cc(10) + #m.P('a') * m.Cc(20); +assert(p->match('a') == 20 && p->match('') == 10 && p->match('b') == 10); + + + +// look-behind predicate +assert(! m.match(m.B('a'), 'a')); +assert(m.match(1 * m.B('a'), 'a') == 2); +assert(! 
m.match(m.B(1), 'a')); +assert(m.match(1 * m.B(1), 'a') == 2); +assert(m.match(-m.B(1), 'a') == 1); +assert(m.match(m.B(250), string.rep('a', 250)) == null); +assert(m.match(250 * m.B(250), string.rep('a', 250)) == 251); +// look-behind with an open call +checkerr("pattern may not have fixed length", m.B, m.V('S1')); +checkerr("too long to look behind", m.B, 260); + +B = #letter * -m.B(letter) + -letter * m.B(letter); +x = m.Ct({ (B * m.Cp())**-1 * (1 * m.V(1) + m.P(true)) }); +checkeq(m.match(x, 'ar cal c'), {1,3,4,7,9,10}); +checkeq(m.match(x, ' ar cal '), {2,4,5,8}); +checkeq(m.match(x, ' '), {}); +checkeq(m.match(x, 'aloalo'), {1,7}); + +assert(m.match(B, "a") == 1); +assert(m.match(1 * B, "a") == 2); +assert(! m.B(1 - letter)->match("")); +assert((-m.B(letter))->match("") == 1); + +assert((4 * m.B(letter, 4))->match("aaaaaaaa") == 5); +assert(! (4 * m.B(#letter * 5))->match("aaaaaaaa")); +assert((4 * -m.B(#letter * 5))->match("aaaaaaaa") == 5); + +// look-behind with grammars +assert(m.match('a' * m.B({'x', x = m.P(3)}), 'aaa') == null); +assert(m.match('aa' * m.B({'x', x = m.P('aaa')}), 'aaaa') == null); +assert(m.match('aaa' * m.B({'x', x = m.P('aaa')}), 'aaaaa') == 4); + + + +// bug in 0.9 +assert(m.match(('a' * #m.P('b')), "ab") == 2); +assert(! m.match(('a' * #m.P('b')), "a")); + +assert(! 
m.match(#m.S('567'), "")); +assert(m.match(#m.S('567') * 1, "6") == 2); + + +// tests for Tail Calls + +p = m.P({ 'a' * m.V(1) + '' }); +assert(p->match(string.rep('a', 1000)) == 1001); + +// create a grammar for a simple DFA for even number of 0s and 1s +// +// ->1 <---0---> 2 +// ^ ^ +// | | +// 1 1 +// | | +// V V +// 3 <---0---> 4 +// +// this grammar should keep no backtracking information + +p = m.P({ + [1] = '0' * m.V(2) + '1' * m.V(3) + -1, + [2] = '0' * m.V(1) + '1' * m.V(4), + [3] = '0' * m.V(4) + '1' * m.V(1), + [4] = '0' * m.V(3) + '1' * m.V(2), +}); + +assert(p->match(string.rep("00", 10000))); +assert(p->match(string.rep("01", 10000))); +assert(p->match(string.rep("011", 10000))); +assert(! p->match(string.rep("011", 10000) .. "1")); +assert(! p->match(string.rep("011", 10001))); + + +// this grammar does need backtracking info. +var lim = 10000; +p = m.P({ '0' * m.V(1) + '0' }); +checkerr("stack overflow", m.match, p, string.rep("0", lim)); +m.setmaxstack(2*lim); +checkerr("stack overflow", m.match, p, string.rep("0", lim)); +m.setmaxstack(2*lim + 4); +assert(m.match(p, string.rep("0", lim)) == lim + 1); + +// this repetition should not need stack space (only the call does) +p = m.P({ ('a' * m.V(1))**0 * 'b' + 'c' }); +m.setmaxstack(200); +assert(p->match(string.rep('a', 180) .. 'c' .. string.rep('b', 180)) == 362); + +m.setmaxstack(100); // restore low limit + +// tests for optional start position +assert(m.match("a", "abc", 1)); +assert(m.match("b", "abc", 2)); +assert(m.match("c", "abc", 3)); +assert(! m.match(1, "abc", 4)); +assert(m.match("a", "abc", -3)); +assert(m.match("b", "abc", -2)); +assert(m.match("c", "abc", -1)); +assert(m.match("abc", "abc", -4)); // truncate to position 1 + +assert(m.match("", "abc", 10)); // empty string is everywhere! +assert(m.match("", "", 10)); +assert(! m.match(1, "", 1)); +assert(! m.match(1, "", -1)); +assert(! 
m.match(1, "", 0)); + +print("+"); + + +// tests for argument captures +checkerr("invalid argument", m.Carg, 0); +checkerr("invalid argument", m.Carg, -1); +checkerr("invalid argument", m.Carg, 2**18); +checkerr("absent extra argument #1", m.match, m.Carg(1), 'a', 1); +assert(m.match(m.Carg(1), 'a', 1, print) == print); +x = {m.match(m.Carg(1) * m.Carg(2), '', 1, 10, 20)}; +checkeq(x, {10, 20}); + +assert(m.match(m.Cmt(m.Cg(m.Carg(3), "a") * + m.Cmt(m.Cb("a"), function (s,i,x) { + assert(s == "a" && i == 1); + return i, x+1; + }) * + m.Carg(2), function (s,i,a,b,c) { + assert(s == "a" && i == 1 && c == null); + return i, 2*a + 3*b; + }) * "a", + "a", 1, false, 100, 1000) == 2*1001 + 3*100); + + +// tests for Lua functions + +t = {}; +s = ""; +p = m.P(function (s1, i) { assert(s == s1); t[#t + 1] = i; return null; }) * false; +s = "hi, this is a test"; +assert(m.match(((p - m.P(-1)) + 2)**0, s) == string.len(s) + 1); +assert(#t == string.len(s)/2 && t[1] == 1 && t[2] == 3); + +assert(! m.match(p, s)); + +p = mt.__add(function (s, i) { return i; }, function (s, i) { return null; }); +assert(m.match(p, "alo")); + +p = mt.__mul(function (s, i) { return i; }, function (s, i) { return null; }); +assert(! m.match(p, "alo")); + + +t = {}; +p = function (s1, i) { assert(s == s1); t[#t + 1] = i; return i; }; +s = "hi, this is a test"; +assert(m.match((m.P(1) * p)**0, s) == string.len(s) + 1); +assert(#t == string.len(s) && t[1] == 2 && t[2] == 3); + +t = {}; +p = m.P(function (s1, i) { assert(s == s1); t[#t + 1] = i; + return i <= s1->len() && i; }) * 1; +s = "hi, this is a test"; +assert(m.match(p**0, s) == string.len(s) + 1); +assert(#t == string.len(s) + 1 && t[1] == 1 && t[2] == 2); + +p = function (s1, i) { return m.match(m.P("a")**1, s1, i); }; +assert(m.match(p, "aaaa") == 5); +assert(m.match(p, "abaa") == 2); +assert(! 
m.match(p, "baaa")); + +checkerr("invalid position", m.match, function () { return 2**20; }, s); +checkerr("invalid position", m.match, function () { return 0; }, s); +checkerr("invalid position", m.match, function (s, i) { return i - 1; }, s); +checkerr("invalid position", m.match, + m.P(1)**0 * function (_, i) { return i - 1; }, s); +assert(m.match(m.P(1)**0 * function (_, i) { return i; } * -1, s)); +checkerr("invalid position", m.match, + m.P(1)**0 * function (_, i) { return i + 1; }, s); +assert(m.match(m.P(function (s, i) { return s->len() + 1; }) * -1, s)); +checkerr("invalid position", m.match, m.P(function (s, i) { return s->len() + 2; }) * -1, s); +assert(! m.match(m.P(function (s, i) { return s->len(); }) * -1, s)); +assert(m.match(m.P(1)**0 * function (_, i) { return true; }, s) == + string.len(s) + 1); +for( i = 1, string.len(s) + 1 ) { + assert(m.match(function (_, _) { return i; }, s) == i); +} + +p = (m.P(function (s, i) { return i%2 == 0 && i; }) * 1 + + m.P(function (s, i) { return i%2 != 0 && i + 2 <= s->len() && i; }) * 3)**0 + * -1; +assert(p->match(string.rep('a', 14000))); + +// tests for Function Replacements +f = function (a, ...) { if( a != "x" ) { return {a, ...}; } }; + +t = m.match(m.C(1)**0/f, "abc"); +checkeq(t, {"a", "b", "c"}); + +t = m.match(m.C(1)**0/f/f, "abc"); +checkeq(t, {{"a", "b", "c"}}); + +t = m.match(m.P(1)**0/f/f, "abc"); // no capture +checkeq(t, {{"abc"}}); + +t = m.match((m.P(1)**0/f * m.Cp())/f, "abc"); +checkeq(t, {{"abc"}, 4}); + +t = m.match((m.C(1)**0/f * m.Cp())/f, "abc"); +checkeq(t, {{"a", "b", "c"}, 4}); + +t = m.match((m.C(1)**0/f * m.Cp())/f, "xbc"); +checkeq(t, {4}); + +t = m.match(m.C(m.C(1)**0)/f, "abc"); +checkeq(t, {"abc", "a", "b", "c"}); + +g = function (...) 
{ return 1, ...; }; +t = {m.match(m.C(1)**0/g/g, "abc")}; +checkeq(t, {1, 1, "a", "b", "c"}); + +t = {m.match(m.Cc(null,null,4) * m.Cc(null,3) * m.Cc(null, null) / g / g, "")}; +t1 = {1,1,null,null,4,null,3,null,null}; +for( i=1,10 ) { assert(t[i] == t1[i]); } +// bug in 0.12.2: ktable with only null could be eliminated when joining +// with a pattern without ktable +assert((m.P("aaa") * m.Cc(null))->match("aaa") == null); + +t = {m.match((m.C(1) / function (x) { return x, x.."x"; })**0, "abc")}; +checkeq(t, {"a", "ax", "b", "bx", "c", "cx"}); + +t = m.match(m.Ct((m.C(1) / function (x,y) { return y, x; } * m.Cc(1))**0), "abc"); +checkeq(t, {null, "a", 1, null, "b", 1, null, "c", 1}); + +// tests for Query Replacements + +assert(m.match(m.C(m.C(1)**0)/{abc = 10}, "abc") == 10); +assert(m.match(m.C(1)**0/{a = 10}, "abc") == 10); +assert(m.match(m.S("ba")**0/{ab = 40}, "abc") == 40); +t = m.match(m.Ct((m.S("ba")/{a = 40})**0), "abc"); +checkeq(t, {40}); + +assert(m.match(m.Cs((m.C(1)/{a=".", d=".."})**0), "abcdde") == ".bc....e"); +assert(m.match(m.Cs((m.C(1)/{f="."})**0), "abcdde") == "abcdde"); +assert(m.match(m.Cs((m.C(1)/{d="."})**0), "abcdde") == "abc..e"); +assert(m.match(m.Cs((m.C(1)/{e="."})**0), "abcdde") == "abcdd."); +assert(m.match(m.Cs((m.C(1)/{e=".", f="+"})**0), "eefef") == "..+.+"); +assert(m.match(m.Cs((m.C(1))**0), "abcdde") == "abcdde"); +assert(m.match(m.Cs(m.C(m.C(1)**0)), "abcdde") == "abcdde"); +assert(m.match(1 * m.Cs(m.P(1)**0), "abcdde") == "bcdde"); +assert(m.match(m.Cs((m.C('0')/'x' + 1)**0), "abcdde") == "abcdde"); +assert(m.match(m.Cs((m.C('0')/'x' + 1)**0), "0ab0b0") == "xabxbx"); +assert(m.match(m.Cs((m.C('0')/'x' + m.P(1)/{b=3})**0), "b0a0b") == "3xax3"); +assert(m.match(m.P(1)/'%0%0'/{aa = -3} * 'x', 'ax') == -3); +assert(m.match(m.C(1)/'%0%1'/{aa = 'z'}/{z = -3} * 'x', 'ax') == -3); + +assert(m.match(m.Cs(m.Cc(0) * (m.P(1)/"")), "4321") == "0"); + +assert(m.match(m.Cs((m.P(1) / "%0")**0), "abcd") == "abcd"); 
+assert(m.match(m.Cs((m.P(1) / "%0.%0")**0), "abcd") == "a.ab.bc.cd.d"); +assert(m.match(m.Cs((m.P("a") / "%0.%0" + 1)**0), "abcad") == "a.abca.ad"); +assert(m.match(m.C("a") / "%1%%%0", "a") == "a%a"); +assert(m.match(m.Cs((m.P(1) / ".xx")**0), "abcd") == ".xx.xx.xx.xx"); +assert(m.match(m.Cp() * m.P(3) * m.Cp()/"%2%1%1 - %0 ", "abcde") == + "411 - abc "); + +assert(m.match(m.P(1)/"%0", "abc") == "a"); +checkerr("invalid capture index", m.match, m.P(1)/"%1", "abc"); +checkerr("invalid capture index", m.match, m.P(1)/"%9", "abc"); + +p = m.C(1); +p *= p; p *= p; p = p * p * m.C(1) / "%9 - %1"; +assert(p->match("1234567890") == "9 - 1"); + +assert(m.match(m.Cc(print), "") == print); + +// too many captures (just ignore extra ones) +p = m.C(1)**0 / "%2-%9-%0-%9"; +assert(p->match("01234567890123456789") == "1-8-01234567890123456789-8"); +s = string.rep("12345678901234567890", 20); +assert(m.match(m.C(1)**0 / "%9-%1-%0-%3", s) == "9-1-" .. s .. "-3"); + +// string captures with non-string subcaptures +p = m.Cc('alo') * m.C(1) / "%1 - %2 - %1"; +assert(p->match('x') == 'alo - x - alo'); + +checkerr("invalid capture value (a boolean)", m.match, m.Cc(true) / "%1", "a"); + +// long strings for string capture +l = 10000; +s = string.rep('a', l) .. string.rep('b', l) .. string.rep('c', l); + +p = (m.C(m.P('a')**1) * m.C(m.P('b')**1) * m.C(m.P('c')**1)) / '%3%2%1'; + +assert(p->match(s) == string.rep('c', l) .. + string.rep('b', l) .. 
+ string.rep('a', l)); + +print("+"); + +// accumulator capture +/* f(x): fold function that uses only its first argument (the accumulator) and returns it plus one, so the fold below counts the folded captures. Note this assigns the global f. */ function f (x) { return x + 1; } +assert(m.match(m.Cf(m.Cc(0) * m.C(1)**0, f), "alo alo") == 7); + +t = {m.match(m.Cf(m.Cc(1,2,3), error), "")}; +checkeq(t, {1}); +p = m.Cf(m.Ct(true) * m.Cg(m.C(m.R("az")**1) * "=" * m.C(m.R("az")**1) * ";")**0, + rawset); +t = p->match("a=b;c=du;xux=yuy;"); +checkeq(t, {a="b", c="du", xux="yuy"}); + + +// errors in accumulator capture + +// no initial capture +checkerr("no initial value", m.match, m.Cf(m.P(5), print), 'aaaaaa'); +// no initial capture (very long match forces fold to be a pair open-close) +checkerr("no initial value", m.match, m.Cf(m.P(500), print), + string.rep('a', 600)); + +// nested capture produces no initial value +checkerr("no initial value", m.match, m.Cf(m.P(1) / {}, print), "alo"); + + +// tests for loop checker + +/* isnullable(p): converts p with m.P and asserts, through checkerr, that repeating it with **0 raises an error containing "may accept empty string". */ var function isnullable (p) { + checkerr("may accept empty string", function (p) { return p**0; }, m.P(p)); +} + +isnullable(m.P("x")**-4); +assert(m.match(((m.P(0) + 1) * m.S("al"))**0, "alo") == 3); +assert(m.match((("x" + #m.P(1))**-4 * m.S("al"))**0, "alo") == 3); +isnullable(""); +isnullable(m.P("x")**0); +isnullable(m.P("x")**-1); +isnullable(m.P("x") + 1 + 2 + m.P("a")**-1); +isnullable(-m.P("ab")); +isnullable(- -m.P("ab")); +isnullable(# #(m.P("ab") + "xy")); +isnullable(- #m.P("ab")**0); +isnullable(# -m.P("ab")**1); +isnullable(#m.V(3)); +isnullable(m.V(3) + m.V(1) + m.P('a')**-1); +isnullable({[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(0)}); +assert(m.match(m.P({[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(1)})**0, "abc") + == 3); +assert(m.match(m.P("")**-3, "a") == 1); + +/* find(p, s): matches s against basiclookfor(p), i.e. looks for p anywhere in s, and returns the match result. */ var function find (p, s) { + return m.match(basiclookfor(p), s); +} + + +/* badgrammar(g, expected): asserts that m.P(g) raises an error; when expected is given, also asserts that the error message contains it (searched with find). */ var function badgrammar (g, expected) { + var stat, msg = pcall(m.P, g); + assert(! 
stat); + if( expected ) { assert(find(expected, msg)); } +} + +badgrammar({[1] = m.V(1)}, "rule '1'"); +badgrammar({[1] = m.V(2)}, "rule '2'"); // invalid non-terminal +badgrammar({[1] = m.V("x")}, "rule 'x'"); // invalid non-terminal +badgrammar({[1] = m.V({})}, "rule '(a table)'"); // invalid non-terminal +badgrammar({[1] = #m.P("a") * m.V(1)}, "rule '1'"); // left-recursive +badgrammar({[1] = -m.P("a") * m.V(1)}, "rule '1'"); // left-recursive +badgrammar({[1] = -1 * m.V(1)}, "rule '1'"); // left-recursive +badgrammar({[1] = -1 + m.V(1)}, "rule '1'"); // left-recursive +badgrammar({[1] = 1 * m.V(2), [2] = m.V(2)}, "rule '2'"); // left-recursive +badgrammar({[1] = 1 * m.V(2)**0, [2] = m.P(0)}, "rule '1'"); // inf. loop +badgrammar({ m.V(2), m.V(3)**0, m.P("") }, "rule '2'"); // inf. loop +badgrammar({ m.V(2) * m.V(3)**0, m.V(3)**0, m.P("") }, "rule '1'"); // inf. loop +badgrammar({"x", x = #(m.V(1) * 'a') }, "rule '1'"); // inf. loop +badgrammar({ -(m.V(1) * 'a') }, "rule '1'"); // inf. 
loop +badgrammar({"x", x = m.P('a')**-1 * m.V("x")}, "rule 'x'"); // left recursive +badgrammar({"x", x = m.P('a') * m.V("y")**1, y = #m.P(1)}, "rule 'x'"); + +assert(m.match({'a' * -m.V(1)}, "aaa") == 2); +assert(m.match({'a' * -m.V(1)}, "aaaa") == null); + + +// good x bad grammars +m.P({ ('a' * m.V(1))**-1 }); +m.P({ -('a' * m.V(1)) }); +m.P({ ('abc' * m.V(1))**-1 }); +m.P({ -('abc' * m.V(1)) }); +badgrammar({ #m.P('abc') * m.V(1) }); +badgrammar({ -('a' + m.V(1)) }); +m.P({ #('a' * m.V(1)) }); +badgrammar({ #('a' + m.V(1)) }); +m.P({ m.B({ m.P('abc') }) * 'a' * m.V(1) }); +badgrammar({ m.B({ m.P('abc') }) * m.V(1) }); +badgrammar({ ('a' + m.P('bcd'))**-1 * m.V(1) }); + + +// simple tests for maximum sizes: +p = m.P("a"); +for( i=1,14 ) { p *= p; } + +p = {}; +for( i=1,100 ) { p[i] = m.P("a"); } +p = m.P(p); + + +// strange values for rule labels + +p = m.P({ "print", + print = m.V(print), + [print] = m.V(_G), + [_G] = m.P("a"), + }); + +assert(p->match("a")); + +// initial rule +g = {}; +for( i = 1, 10 ) { g["i"..i] = "a" * m.V("i"..i+1); } +g.i11 = m.P(""); +for( i = 1, 10 ) { + g[1] = "i"..i; + var xp = m.P(g); + assert(xp->match("aaaaaaaaaaa") == 11 - i + 1); +} + +print("+"); + + +// tests for back references +checkerr("back reference 'x' not found", m.match, m.Cb('x'), ''); +checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a'); + +p = m.Cg(m.C(1) * m.C(1), "k") * m.Ct(m.Cb("k")); +t = p->match("ab"); +checkeq(t, {"a", "b"}); +p = m.P(true); +for( i = 1, 10 ) { p = p * m.Cg(1, i); } +for( i = 1, 10 ) { + var xp = p * m.Cb(i); + assert(xp->match('abcdefghij') == string.sub('abcdefghij', i, i)); +} + + +t = {}; +/* foo(p): appends p to the table t (a log of every call) and returns p with "x" appended. */ function foo (p) { t[#t + 1] = p; return p .. 
"x"; } + +p = m.Cg(m.C(2) / foo, "x") * m.Cb("x") * + m.Cg(m.Cb('x') / foo, "x") * m.Cb("x") * + m.Cg(m.Cb('x') / foo, "x") * m.Cb("x") * + m.Cg(m.Cb('x') / foo, "x") * m.Cb("x"); +x = {p->match('ab')}; +checkeq(x, {'abx', 'abxx', 'abxxx', 'abxxxx'}); +checkeq(t, {'ab', + 'ab', 'abx', + 'ab', 'abx', 'abxx', + 'ab', 'abx', 'abxx', 'abxxx'}); + + + +// tests for match-time captures + +p = m.P('a') * (function (s, i) { return (s->sub(i, i) == 'b') && i + 1; }) + + 'acd'; + +assert(p->match('abc') == 3); +assert(p->match('acd') == 4); + +/* id(s, i, ...): match-time function that ignores the subject and position and returns true followed by all extra arguments unchanged. */ var function id (s, i, ...) { + return true, ...; +} + +assert(m.Cmt(m.Cs((m.Cmt(m.S('abc') / { a = 'x', c = 'y' }, id) + + m.R('09')**1 / string.char + + m.P(1))**0), id)->match("acb98+68c") == "xyb\98+\68y"); + +p = m.P({'S', + S = m.V('atom') * space + + m.Cmt(m.Ct("(" * space * (m.Cmt(m.V('S')**1, id) + m.P(true)) * ")" * space), id), + atom = m.Cmt(m.C(m.R("AZ", "az", "09")**1), id) +}); +x = p->match("(a g () ((b) c) (d (e)))"); +checkeq(x, {'a', 'g', {}, {{'b'}, 'c'}, {'d', {'e'}}}); + +x = {(m.Cmt(1, id)**0)->match(string.rep('a', 500))}; +assert(#x == 500); + +/* id is reassigned: returns i, 1, 3, 7 when x is 'a', otherwise null, 2, 4, 6, 8 (a null first result, so the extra values only matter if the caller keeps them). */ id = function (s, i, x) { + if( x == 'a' ) { return i, 1, 3, 7; + } else { return null, 2, 4, 6, 8; + } +} + +p = ((m.P(id) * 1 + m.Cmt(2, id) * 1 + m.Cmt(1, id) * 1))**0; +assert(table.concat({p->match('abababab')}) == string.rep('137', 4)); + +/* ref(s, i, x): uses x as a pattern and matches it against s starting at position i minus the length of x; returns that match result. */ var function ref (s, i, x) { + return m.match(x, s, i - x->len()); +} + +assert(m.Cmt(m.P(1)**0, ref)->match('alo') == 4); +assert((m.P(1) * m.Cmt(m.P(1)**0, ref))->match('alo') == 4); +assert(! (m.P(1) * m.Cmt(m.C(1)**0, ref))->match('alo')); + +/* ref is reassigned: first result is i when i equals tonumber(x) and false otherwise; second result is always 'xuxu'. */ ref = function (s,i,x) { return i == tonumber(x) && i, 'xuxu'; }; + +assert(m.Cmt(1, ref)->match('2')); +assert(! 
m.Cmt(1, ref)->match('1')); +assert(m.Cmt(m.P(1)**0, ref)->match('03')); + +function ref (s, i, a, b) { + if( a == b ) { return i, a->upper(); } +} + +p = m.Cmt(m.C(m.R("az")**1) * "-" * m.C(m.R("az")**1), ref); +p = (any - p)**0 * p * any**0 * -1; + +assert(p->match('abbbc-bc ddaa') == 'BC'); + +{ // match-time captures cannot be optimized away + var touch = 0; + f = m.P(function () { ++touch ; return true; }); + + var function check(n) { n = n || 1; assert(touch == n); touch = 0; } + + assert(m.match(f * false + 'b', 'a') == null); check(); + assert(m.match(f * false + 'b', '') == null); check(); + assert(m.match( (f * 'a')**0 * 'b', 'b') == 2); check(); + assert(m.match( (f * 'a')**0 * 'b', '') == null); check(); + assert(m.match( (f * 'a')**-1 * 'b', 'b') == 2); check(); + assert(m.match( (f * 'a')**-1 * 'b', '') == null); check(); + assert(m.match( ('b' + f * 'a')**-1 * 'b', '') == null); check(); + assert(m.match( (m.P('b')**-1 * f * 'a')**-1 * 'b', '') == null); check(); + assert(m.match( (-m.P(1) * m.P('b')**-1 * f * 'a')**-1 * 'b', '') == null); + check(); + assert(m.match( (f * 'a' + 'b')**-1 * 'b', '') == null); check(); + assert(m.match(f * 'a' + f * 'b', 'b') == 2); check(2); + assert(m.match(f * 'a' + f * 'b', 'a') == 2); check(1); + assert(m.match(-f * 'a' + 'b', 'b') == 2); check(1); + assert(m.match(-f * 'a' + 'b', '') == null); check(1); +} + +c = '[' * m.Cg(m.P('=')**0, "init") * '[' * + { m.Cmt(']' * m.C(m.P('=')**0) * ']' * m.Cb("init"), function (_, _, s1, s2) { + return s1 == s2; }) + + 1 * m.V(1) } / 0; + +assert(c->match('[==[]]====]]]]==]===[]') == 18); +assert(c->match('[[]=]====]=]]]==]===[]') == 14); +assert(! c->match('[[]=]====]=]=]==]===[]')); + + +// old bug: optimization of concat with fail removed match-time capture +p = m.Cmt(0, function (s) { p = s; }) * m.P(false); +assert(! 
p->match('alo')); +assert(p == 'alo'); + + +// ensure that failed match-time captures are not kept on Lua stack +{ + var xt = {__mode = "kv"}; setmetatable(xt,xt); + var xc = 0; + + var function foo (s,i) { + collectgarbage(); + assert(next(xt) == "__mode" && next(xt, "__mode") == null); + var x = {}; + xt[x] = true; + ++xc; + return i, x; + } + + var xp = m.P({ m.Cmt(0, foo) * m.P(false) + m.P(1) * m.V(1) + m.P("") }); + xp->match(string.rep('1', 10)); + assert(xc == 11); +} + +p = (m.P(function () { return true, "a"; }) * 'a' + + m.P(function (s, i) { return i, "aa", 20; }) * 'b' + + m.P(function (s,i) { if( i <= #s ) { return i, "aaa"; } }) * 1)**0; + +t = {p->match('abacc')}; +checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}); + + +//----------------------------------------------------------------- +// Tests for 're' module +//----------------------------------------------------------------- + +var re = require ("re"); + +var match, compile = re.match, re.compile; + + + +assert(match("a", ".") == 2); +assert(match("a", "''") == 1); +assert(match("", " ! . ") == 1); +assert(! match("a", " ! . ")); +assert(match("abcde", " ( . . ) * ") == 5); +assert(match("abbcde", " [a-c] +") == 5); +assert(match("0abbc1de", "'0' [a-c]+ '1'") == 7); +assert(match("0zz1dda", "'0' [^a-c]+ 'a'") == 8); +assert(match("abbc--", " [a-c] + +") == 5); +assert(match("abbc--", " [ac-] +") == 2); +assert(match("abbc--", " [-acb] + ") == 7); +assert(! match("abbcde", " [b-z] + ")); +assert(match("abb\"de", '"abb"["]"de"') == 7); +assert(match("abceeef", "'ac' ? 'ab' * 'c' { 'e' * } / 'abceeef' ") == "eee"); +assert(match("abceeef", "'ac'? 'ab'* 'c' { 'f'+ } / 'abceeef' ") == 8); +t = {match("abceefe", "( ( & 'e' {} ) ? . ) * ")}; +checkeq(t, {4, 5, 7}); +t = {match("abceefe", "((&&'e' {})? .)*")}; +checkeq(t, {4, 5, 7}); +t = {match("abceefe", "( ( ! ! 'e' {} ) ? . ) *")}; +checkeq(t, {4, 5, 7}); +t = {match("abceefe", "(( & ! & ! 'e' {})? 
.)*")}; +checkeq(t, {4, 5, 7}); + +assert(match("cccx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 5); +assert(match("cdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 4); +assert(match("abcdcdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 8); + +assert(match("abc", "a <- (. a)?") == 4); +b = "balanced <- '(' ([^()] / balanced)* ')'"; +assert(match("(abc)", b)); +assert(match("(a(b)((c) (d)))", b)); +assert(! match("(a(b ((c) (d)))", b)); + +b = compile([=[ balanced <- "(" ([^()] / balanced)* ")" ]=]); +assert(b == m.P(b)); +assert(b->match("((((a))(b)))")); + +g = [=[ + S <- "0" B / "1" A / "" -- balanced strings + A <- "0" S / "1" A A -- one more 0 + B <- "1" S / "0" B B -- one more 1 +]=]; +assert(match("00011011", g) == 9); + +g = [=[ + S <- ("0" B / "1" A)* + A <- "0" / "1" A A + B <- "1" / "0" B B +]=]; +assert(match("00011011", g) == 9); +assert(match("000110110", g) == 9); +assert(match("011110110", g) == 3); +assert(match("000110010", g) == 1); + +s = "aaaaaaaaaaaaaaaaaaaaaaaa"; +assert(match(s, "'a'^3") == 4); +assert(match(s, "'a'^0") == 1); +assert(match(s, "'a'^+3") == s->len() + 1); +assert(! match(s, "'a'^+30")); +assert(match(s, "'a'^-30") == s->len() + 1); +assert(match(s, "'a'^-5") == 6); +for( i = 1, s->len() ) { + assert(match(s, string.format("'a'^+%d", i)) >= i + 1); + assert(match(s, string.format("'a'^-%d", i)) <= i + 1); + assert(match(s, string.format("'a'^%d", i)) == i + 1); +} +assert(match("01234567890123456789", "[0-9]^3+") == 19); + + +assert(match("01234567890123456789", "({....}{...}) -> '%2%1'") == "4560123"); +t = match("0123456789", "{| {.}* |}"); +checkeq(t, {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}); +assert(match("012345", "{| (..) -> '%0%0' |}")[1] == "0101"); + +assert(match("abcdef", "( {.} {.} {.} {.} {.} ) -> 3") == "c"); +assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 3") == "d"); +assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 0") == 6); + +assert(! 
match("abcdef", "{:x: ({.} {.} {.}) -> 2 :} =x")); +assert(match("abcbef", "{:x: ({.} {.} {.}) -> 2 :} =x")); + +eqcharset(compile("[]]"), "]"); +eqcharset(compile("[][]"), m.S("[]")); +eqcharset(compile("[]-]"), m.S("-]")); +eqcharset(compile("[-]"), m.S("-")); +eqcharset(compile("[az-]"), m.S("a-z")); +eqcharset(compile("[-az]"), m.S("a-z")); +eqcharset(compile("[a-z]"), m.R("az")); +eqcharset(compile("[]['\"]"), m.S([=[]['"]=])); + +eqcharset(compile("[^]]"), any - "]"); +eqcharset(compile("[^][]"), any - m.S("[]")); +eqcharset(compile("[^]-]"), any - m.S("-]")); +eqcharset(compile("[^]-]"), any - m.S("-]")); +eqcharset(compile("[^-]"), any - m.S("-")); +eqcharset(compile("[^az-]"), any - m.S("a-z")); +eqcharset(compile("[^-az]"), any - m.S("a-z")); +eqcharset(compile("[^a-z]"), any - m.R("az")); +eqcharset(compile("[^]['\"]"), any - m.S([=[]['"]=])); + +// tests for comments in 're' +e = compile([=[ +A <- _B -- \t \n %nl .<> <- -> -- +_B <- 'x' --]=]); +assert(e->match('xy') == 2); + +// tests for 're' with pre-definitions +defs = {digits = m.R("09"), letters = m.R("az"), _=m.P("__")}; +e = compile("%letters (%letters / %digits)*", defs); +assert(e->match("x123") == 5); +e = compile("%_", defs); +assert(e->match("__") == 3); + +e = compile([=[ + S <- A+ + A <- %letters+ B + B <- %digits+ +]=], defs); + +e = compile("{[0-9]+'.'?[0-9]*} -> sin", math); +assert(e->match("2.34") == math.sin(2.34)); + + +function eq (_, _, a, b) { return a == b; } + +c = re.compile([=[ + longstring <- '[' {:init: '='* :} '[' close + close <- ']' =init ']' / . close +]=]); + +assert(c->match('[==[]]===]]]]==]===[]') == 17); +assert(c->match('[[]=]====]=]]]==]===[]') == 14); +assert(! c->match('[[]=]====]=]=]==]===[]')); + +c = re.compile(" '[' {:init: '='* :} '[' (!(']' =init ']') .)* ']' =init ']' !. "); + +assert(c->match('[==[]]===]]]]==]')); +assert(c->match('[[]=]====]=][]==]===[]]')); +assert(! 
c->match('[[]=]====]=]=]==]===[]')); + +assert(re.find("hi alalo", "{:x:..:} =x") == 4); +assert(re.find("hi alalo", "{:x:..:} =x", 4) == 4); +assert(! re.find("hi alalo", "{:x:..:} =x", 5)); +assert(re.find("hi alalo", "{'al'}", 5) == 6); +assert(re.find("hi aloalolo", "{:x:..:} =x") == 8); +assert(re.find("alo alohi x x", "{:word:%w+:}%W*(=word)!%w") == 11); + +// re.find discards any captures +a,b,c = re.find("alo", "{.}{'o'}"); +assert(a == 2 && b == 3 && c == null); + +match = function(s,p) { + var i,e = re.find(s,p); + if( i ) { return s->sub(i, e); } +} +assert(match("alo alo", '[a-z]+') == "alo"); +assert(match("alo alo", '{:x: [a-z]+ :} =x') == null); +assert(match("alo alo", "{:x: [a-z]+ :} ' ' =x") == "alo alo"); + +assert(re.gsub("alo alo", "[abc]", "x") == "xlo xlo"); +assert(re.gsub("alo alo", "%w+", ".") == ". ."); +assert(re.gsub("hi, how are you", "[aeiou]", string.upper) == + "hI, hOw ArE yOU"); + +s = 'hi [[a comment[=]=] ending here]] and [=[another]]=]]'; +c = re.compile(" '[' {:i: '='* :} '[' (!(']' =i ']') .)* ']' { =i } ']' "); +assert(re.gsub(s, c, "%2") == 'hi and =]'); +assert(re.gsub(s, c, "%0") == s); +assert(re.gsub('[=[hi]=]', c, "%2") == '='); + +assert(re.find("", "!.") == 1); +assert(re.find("alo", "!.") == 4); + +function addtag (s, i, t, tag) { t.tag = tag; return i, t; } + +c = re.compile([=[ + doc <- block !. + block <- (start {| (block / { [^<]+ })* |} end?) => addtag + start <- '<' {:tag: [a-z]+ :} '>' + end <- '' +]=], {addtag = addtag}); + +x = c->match([=[ +hihellobuttotheend]=]); +checkeq(x, {tag='x', 'hi', {tag = 'b', 'hello'}, 'but', + {'totheend'}}); + + +// tests for look-ahead captures +x = {re.match("alo", "&(&{.}) !{'b'} {&(...)} &{..} {...} {!.}")}; +checkeq(x, {"", "alo", ""}); + +assert(re.match("aloalo", + "{~ (((&'al' {.}) -> 'A%1' / (&%l {.}) -> '%1%1') / .)* ~}") + == "AallooAalloo"); + +// bug in 0.9 (and older versions), due to captures in look-aheads +x = re.compile([=[ {~ (&(. 
([a-z]* -> '*')) ([a-z]+ -> '+') ' '*)* ~} ]=]); +assert(x->match("alo alo") == "+ +"); + +// valid capture in look-ahead (used inside the look-ahead itself) +x = re.compile([=[ + S <- &({:two: .. :} . =two) {[a-z]+} / . S +]=]); +assert(x->match("hello aloaLo aloalo xuxu") == "aloalo"); + + +p = re.compile([=[ + block <- {| {:ident:space*:} line + ((=ident !space line) / &(=ident space) block)* |} + line <- {[^%nl]*} %nl + space <- '_' -- should be ' ', but '_' is simpler for editors +]=]); + +t= p->match([=[ +1 +__1.1 +__1.2 +____1.2.1 +____ +2 +__2.1 +]=]); +checkeq(t, {"1", {"1.1", "1.2", {"1.2.1", "", ident = "____"}, ident = "__"}, + "2", {"2.1", ident = "__"}, ident = ""}); + + +// nested grammars +p = re.compile([=[ + s <- a b !. + b <- ( x <- ('b' x)? ) + a <- ( x <- 'a' x? ) +]=]); + +assert(p->match('aaabbb')); +assert(p->match('aaa')); +assert(! p->match('bbb')); +assert(! p->match('aaabbba')); + +// testing groups +t = {re.match("abc", "{:S <- {:.:} {S} / '':}")}; +checkeq(t, {"a", "bc", "b", "c", "c", ""}); + +t = re.match("1234", "{| {:a:.:} {:b:.:} {:c:.{.}:} |}"); +checkeq(t, {a="1", b="2", c="4"}); +t = re.match("1234", "{|{:a:.:} {:b:{.}{.}:} {:c:{.}:}|}"); +checkeq(t, {a="1", b="2", c="4"}); +t = re.match("12345", "{| {:.:} {:b:{.}{.}:} {:{.}{.}:} |}"); +checkeq(t, {"1", b="2", "4", "5"}); +t = re.match("12345", "{| {:.:} {:{:b:{.}{.}:}:} {:{.}{.}:} |}"); +checkeq(t, {"1", "23", "4", "5"}); +t = re.match("12345", "{| {:.:} {{:b:{.}{.}:}} {:{.}{.}:} |}"); +checkeq(t, {"1", "23", "4", "5"}); + + +// testing pre-defined names +assert(os.setlocale("C") == "C"); + +function eqlpeggsub (p1, p2) { + var s1 = cs2str(re.compile(p1)); + var s2 = string.gsub(allchar, "[^" .. p2 .. 
"]", ""); + // if s1 ~= s2 then print(#s1,#s2) end + assert(s1 == s2); +} + + +eqlpeggsub("%w", "%w"); +eqlpeggsub("%a", "%a"); +eqlpeggsub("%l", "%l"); +eqlpeggsub("%u", "%u"); +eqlpeggsub("%p", "%p"); +eqlpeggsub("%d", "%d"); +eqlpeggsub("%x", "%x"); +eqlpeggsub("%s", "%s"); +eqlpeggsub("%c", "%c"); + +eqlpeggsub("%W", "%W"); +eqlpeggsub("%A", "%A"); +eqlpeggsub("%L", "%L"); +eqlpeggsub("%U", "%U"); +eqlpeggsub("%P", "%P"); +eqlpeggsub("%D", "%D"); +eqlpeggsub("%X", "%X"); +eqlpeggsub("%S", "%S"); +eqlpeggsub("%C", "%C"); + +eqlpeggsub("[%w]", "%w"); +eqlpeggsub("[_%w]", "_%w"); +eqlpeggsub("[^%w]", "%W"); +eqlpeggsub("[%W%S]", "%W%S"); + +re.updatelocale(); + + +// testing nested substitutions x string captures + +p = re.compile([=[ + text <- {~ item* ~} + item <- macro / [^()] / '(' item* ')' + arg <- ' '* {~ (!',' item)* ~} + args <- '(' arg (',' arg)* ')' + macro <- ('apply' args) -> '%1(%2)' + / ('add' args) -> '%1 + %2' + / ('mul' args) -> '%1 * %2' +]=]); + +assert(p->match("add(mul(a,b), apply(f,x))") == "a * b + f(x)"); + +rev = re.compile([=[ R <- (!.) -> '' / ({.} R) -> '%2%1']=]); + +assert(rev->match("0123456789") == "9876543210"); + + +// testing error messages in re + +var function errmsg (p, err) { + checkerr(err, re.compile, p); +} + +errmsg('aaaa', "rule 'aaaa'"); +errmsg('a', 'outside'); +errmsg('b <- a', 'undefined'); +errmsg("x <- 'a' x <- 'b'", 'already defined'); +errmsg("'a' -", "near '-'"); + + +print("OK"); + + diff --git a/tests/testlr.ljs b/tests/testlr.ljs new file mode 100644 index 0000000..58921bb --- /dev/null +++ b/tests/testlr.ljs @@ -0,0 +1,208 @@ +package.path = "./lpeglj/?.ljs;" .. 
package.path; + +var lpeg = require("lpeglj"); +var re = require("re"); + +var m = lpeg; + +var function checkeq(x, y, p) { + if( p ) { print(x, y); } + if( type(x) != "table" ) { assert(x == y); + } else { + for( k, v in pairs(x) ) { checkeq(v, y[k], p); } + for( k, v in pairs(y) ) { checkeq(v, x[k], p); } + } +} + +print("Tests for LPegLJ left recursion"); + +assert(type(m.version()) == "string"); +print("version " .. m.version()); +m.enableleftrecursion(true); + +/* +direct left recursion +E ← E + n / n +--*/ + +var pat = m.P({ + "E"; + E = m.V("E") * '+' * "n" + "n", +}); + +assert(pat->match("n+n+n") == 6); + +/* +indirect left recursion +L ← P.x / x +P ← P(n) / L +--*/ + +pat = m.P({ + "L"; + L = m.V("P") * ".x" + "x", + P = m.V("P") * "(n)" + m.V("L") +}); + +assert(pat->match("x(n)(n).x(n).x") == 15); + +/* +left and right recursion with precedence rules +E ← E1 + E2 / E1 − E2 / E2 ∗ E3 / E2 ÷ E3 / E3 ∗∗ E3 / − E4 / (E1) / n +--*/ + + +pat = m.P({ + "E", + E = m.V("E", 1) * m.S('+-') * m.V("E", 2) + + m.V("E", 2) * m.S('*/') * m.V("E", 3) + + m.V("E", 3) * '**' * m.V("E", 3) + + '-' * m.V("E", 4) + + '(' * m.V("E") * ')' + + m.R('09') ** 1, +}); + +assert(pat->match("-1*(6+2/4+3-1)**2") == 18); + +/* +left and right recursion with precedence rules +E ← E1 + E2 / E1 − E2 / E2 ∗ E3 / E2 ÷ E3 / E3 ∗∗ E3 / − E4 / (E1) / n +create AST tree +--*/ + + +pat = m.P({ + "E", + E = m.Ct(m.V("E", 1) * m.C(m.S('+-')) * m.V("E", 2) + + m.V("E", 2) * m.C(m.S('*/')) * m.V("E", 3) + + m.V("E", 3) * m.C('**') * m.V("E", 3) + + m.C('-') * m.V("E", 4) + + '(' * m.V("E") * ')' + + m.C(m.R('09') ** 1)), +}); + +var ASTtree = pat->match("1+1+1"); +checkeq(ASTtree, { { { "1" }, "+", { "1" } }, "+", { "1" } }); + +ASTtree = pat->match("-1*(6+2/4+3-1)**2"); +checkeq(ASTtree, { { "-", { "1" } }, "*", { { { { { { "6" }, "+", { { "2" }, "/", { "4" } } }, "+", { "3" } }, "-", { "1" } } }, "**", { "2" } } }); + +// using re module with precedence (the same example as above) +// 
call_nonterminal : precedence_level or + +pat = [=[ + E <- (E:1 {[+-]} E:2 / + E:2 {[*/]} E:3 / + E:3 {'**'} E:3 / + {'-'} E:4 / + '(' E ')' / + {[0-9]+}) -> {} +]=]; + +ASTtree = re.match("-1*(6+2/4+3-1)**2", pat); +checkeq(ASTtree, { { "-", { "1" } }, "*", { { { { { { "6" }, "+", { { "2" }, "/", { "4" } } }, "+", { "3" } }, "-", { "1" } } }, "**", { "2" } } }); + +/* +simple evaluator +E ← E1 + E2 / E1 − E2 / E2 ∗ E3 / E2 ÷ E3 / E3 ∗∗ E3 / − E4 / (E1) / n +--*/ + +var eval = function(s, i, p1, p2, p3) { + var res; + if( p2 == '+' ) { + res = p1 + p3; + } else if( p2 == '-' ) { + res = p1 - p3; + } else if( p2 == '*' ) { + res = p1 * p3; + } else if( p2 == '/' ) { + res = p1 / p3; + } else if( p1 == '-' ) { + res = -p2; + } else if( p2 == '**' ) { + res = p1 ** p3; + } else { + res = p1; + } + return true, res; +}; + + +pat = m.P({ + "E", + E = m.Cmt(m.V("E", 1) * m.C(m.S('+-')) * m.V("E", 2) + + m.V("E", 2) * m.C(m.S('*/')) * m.V("E", 3) + + m.V("E", 3) * m.C('**') * m.V("E", 3) + + m.C('-') * m.V("E", 4) + + '(' * m.V("E") * ')' + + m.C(m.R('09') ** 1), eval), +}); + +assert(pat->match("-1*(6+2/4+3-1)**2") == -72.25); + + +pat = m.P({ + "E", + E = m.V("E", 1) * '+' * m.V("E", 2) / function(c1, c2) { return c1 + c2; } + + m.V("E", 1) * '-' * m.V("E", 2) / function(c1, c2) { return c1 - c2; } + + m.V("E", 2) * '*' * m.V("E", 3) / function(c1, c2) { return c1 * c2; } + + m.V("E", 2) * '/' * m.V("E", 3) / function(c1, c2) { return c1 / c2; } + + m.V("E", 3) * '**' * m.V("E", 3) / function(c1, c2) { return c1 ** c2; } + + '-' * m.V("E", 4) / function(c1) { return -c1; } + + '(' * m.V("E") * ')' + + m.C(m.R('09') ** 1), +}); + +assert(pat->match("-1*(6+2/4+3-1)**2") == -72.25); + +var def = { + plus = function(p1, p2) { return p1 + p2; }, + minus = function(p1, p2) { return p1 - p2; }, + mult = function(p1, p2) { return p1 * p2; }, + div = function(p1, p2) { return p1 / p2; }, + pow = function(p1, p2) { return p1 ** p2; }, + uminus = function(p1) { return -p1; }, + 
errfce = function(o, i) { + var errstr = o .. '\n' .. (' ')->rep(i) .. '^' .. '\n'; + io.write(errstr); + return false; + }, +}; + +pat = [=[ + P <- E s (!. / error) + s <- %s* + error <- '' => errfce + E <- (E:1 s'+' E:2) -> plus / + (E:1 s'-' E:2) -> minus / + (E:2 s'*' E:3) -> mult / + (E:2 s'/' E:3) -> div / + (E:3 s'**' E:3)-> pow / + (s'-' E:4) -> uminus / + s'(' E s')' / + s{[0-9]+} / + error +]=]; + +pat = re.compile(pat, def); +assert(re.match("-1 * (6 + 2 / 4 + 3 - 1)**2", pat) == -72.25); + +pat = [=[ + A <- B "a" + B <- C "b" + C <- B / A / "c" +]=]; + +pat = re.compile(pat); +assert(re.match("cbbabbba", pat) == 9); + +pat = [=[ + S <- A / B + A <- A "a" / B / "a" + B <- B "b" / A / "b" +]=]; + +pat = re.compile(pat); +assert(re.match("baabbaaa", pat) == 9); + +print("OK");