diff --git a/HISTORY b/HISTORY index 0c10edd..96acace 100644 --- a/HISTORY +++ b/HISTORY @@ -1,6 +1,10 @@ -HISTORY for LPeg 1.0 +HISTORY for LPeg 1.0.2 -* Changes from version 0.12 to 1.0 +* Changes from version 1.0.1 to 1.0.2 + --------------------------------- + + some bugs fixed + +* Changes from version 0.12 to 1.0.1 --------------------------------- + group "names" can be any Lua value + some bugs fixed diff --git a/doc.css b/doc.css new file mode 100644 index 0000000..3770e4e --- /dev/null +++ b/doc.css @@ -0,0 +1,223 @@ +body { + margin-left: 1em; + margin-right: 1em; + font-family: arial, helvetica, geneva, sans-serif; + background-color:#ffffff; margin:0px; +} + +code { + font-family: "Andale Mono", monospace; +} + +tt { + font-family: "Andale Mono", monospace; +} + +body, td, th { font-size: 11pt; } + +h1, h2, h3, h4 { margin-left: 0em; } + +textarea, pre, tt { font-size:10pt; } +body, td, th { color:#000000; } +small { font-size:0.85em; } +h1 { font-size:1.5em; } +h2 { font-size:1.25em; } +h3 { font-size:1.15em; } +h4 { font-size:1.06em; } + +a:link { font-weight:bold; color: #004080; text-decoration: none; } +a:visited { font-weight:bold; color: #006699; text-decoration: none; } +a:link:hover { text-decoration:underline; } +hr { color:#cccccc } +img { border-width: 0px; } + + +h3 { padding-top: 1em; } + +p { margin-left: 1em; } + +p.name { + font-family: "Andale Mono", monospace; + padding-top: 1em; + margin-left: 0em; +} + +blockquote { margin-left: 3em; } + +.example { + background-color: rgb(245, 245, 245); + border-top-width: 1px; + border-right-width: 1px; + border-bottom-width: 1px; + border-left-width: 1px; + border-top-style: solid; + border-right-style: solid; + border-bottom-style: solid; + border-left-style: solid; + border-top-color: silver; + border-right-color: silver; + border-bottom-color: silver; + border-left-color: silver; + padding: 1em; + margin-left: 1em; + margin-right: 1em; + font-family: "Andale Mono", monospace; + font-size: 
smaller; +} + + +hr { + margin-left: 0em; + background: #00007f; + border: 0px; + height: 1px; +} + +ul { list-style-type: disc; } + +table.index { border: 1px #00007f; } +table.index td { text-align: left; vertical-align: top; } +table.index ul { padding-top: 0em; margin-top: 0em; } + +table { + border: 1px solid black; + border-collapse: collapse; + margin-left: auto; + margin-right: auto; +} +th { + border: 1px solid black; + padding: 0.5em; +} +td { + border: 1px solid black; + padding: 0.5em; +} +div.header, div.footer { margin-left: 0em; } + +#container +{ + margin-left: 1em; + margin-right: 1em; + background-color: #f0f0f0; +} + +#product +{ + text-align: center; + border-bottom: 1px solid #cccccc; + background-color: #ffffff; +} + +#product big { + font-size: 2em; +} + +#product_logo +{ +} + +#product_name +{ +} + +#product_description +{ +} + +#main +{ + background-color: #f0f0f0; + border-left: 2px solid #cccccc; +} + +#navigation +{ + float: left; + width: 12em; + margin: 0; + vertical-align: top; + background-color: #f0f0f0; + overflow:visible; +} + +#navigation h1 { + background-color:#e7e7e7; + font-size:1.1em; + color:#000000; + text-align:left; + margin:0px; + padding:0.2em; + border-top:1px solid #dddddd; + border-bottom:1px solid #dddddd; +} + +#navigation ul +{ + font-size:1em; + list-style-type: none; + padding: 0; + margin: 1px; +} + +#navigation li +{ + text-indent: -1em; + margin: 0em 0em 0em 0.5em; + display: block; + padding: 3px 0px 0px 12px; +} + +#navigation li li a +{ + padding: 0px 3px 0px -1em; +} + +#content +{ + margin-left: 12em; + padding: 1em; + border-left: 2px solid #cccccc; + border-right: 2px solid #cccccc; + background-color: #ffffff; +} + +#about +{ + clear: both; + margin: 0; + padding: 5px; + border-top: 2px solid #cccccc; + background-color: #ffffff; +} + +@media print { + body { + font: 10pt "Times New Roman", "TimeNR", Times, serif; + } + a { font-weight:bold; color: #004080; text-decoration: underline; } + + #main { 
background-color: #ffffff; border-left: 0px; } + #container { margin-left: 2%; margin-right: 2%; background-color: #ffffff; } + + #content { margin-left: 0px; padding: 1em; border-left: 0px; border-right: 0px; background-color: #ffffff; } + + #navigation { display: none; + } + + #product_logo + { + display: none; + } + + #about img + { + display: none; + } + + .example { + font-family: "Andale Mono", monospace; + font-size: 8pt; + page-break-inside: avoid; + } +} diff --git a/lpcap.c b/lpcap.c index c9085de..1a3643d 100644 --- a/lpcap.c +++ b/lpcap.c @@ -271,15 +271,15 @@ int finddyncap (Capture *cap, Capture *last) { /* -** Calls a runtime capture. Returns number of captures removed by -** the call, including the initial Cgroup. (Captures to be added are -** on the Lua stack.) +** Calls a runtime capture. Returns number of captures "removed" by the +** call, that is, those inside the group capture. Captures to be added +** are on the Lua stack. */ int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) { int n, id; lua_State *L = cs->L; int otop = lua_gettop(L); - Capture *open = findopen(close); + Capture *open = findopen(close); /* get open group capture */ assert(captype(open) == Cgroup); id = finddyncap(open, close); /* get first dynamic capture argument */ close->kind = Cclose; /* closes the group */ @@ -299,7 +299,7 @@ int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) { } else *rem = 0; /* no dynamic captures removed */ - return close - open; /* number of captures of all kinds removed */ + return close - open - 1; /* number of captures to be removed */ } @@ -441,70 +441,88 @@ static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) { } +#if !defined(MAXRECLEVEL) +#define MAXRECLEVEL 200 +#endif + + /* ** Push all values of the current capture into the stack; returns ** number of values pushed */ static int pushcapture (CapState *cs) { lua_State *L = cs->L; + int res; luaL_checkstack(L, 4, "too many 
captures"); + if (cs->reclevel++ > MAXRECLEVEL) + return luaL_error(L, "subcapture nesting too deep"); switch (captype(cs->cap)) { case Cposition: { lua_pushinteger(L, cs->cap->s - cs->s + 1); cs->cap++; - return 1; + res = 1; + break; } case Cconst: { pushluaval(cs); cs->cap++; - return 1; + res = 1; + break; } case Carg: { int arg = (cs->cap++)->idx; if (arg + FIXEDARGS > cs->ptop) return luaL_error(L, "reference to absent extra argument #%d", arg); lua_pushvalue(L, arg + FIXEDARGS); - return 1; + res = 1; + break; } case Csimple: { int k = pushnestedvalues(cs, 1); lua_insert(L, -k); /* make whole match be first result */ - return k; + res = k; + break; } case Cruntime: { lua_pushvalue(L, (cs->cap++)->idx); /* value is in the stack */ - return 1; + res = 1; + break; } case Cstring: { luaL_Buffer b; luaL_buffinit(L, &b); stringcap(&b, cs); luaL_pushresult(&b); - return 1; + res = 1; + break; } case Csubst: { luaL_Buffer b; luaL_buffinit(L, &b); substcap(&b, cs); luaL_pushresult(&b); - return 1; + res = 1; + break; } case Cgroup: { if (cs->cap->idx == 0) /* anonymous group? 
*/ - return pushnestedvalues(cs, 0); /* add all nested values */ + res = pushnestedvalues(cs, 0); /* add all nested values */ else { /* named group: add no values */ nextcap(cs); /* skip capture */ - return 0; + res = 0; } + break; } - case Cbackref: return backrefcap(cs); - case Ctable: return tablecap(cs); - case Cfunction: return functioncap(cs); - case Cnum: return numcap(cs); - case Cquery: return querycap(cs); - case Cfold: return foldcap(cs); - default: assert(0); return 0; + case Cbackref: res = backrefcap(cs); break; + case Ctable: res = tablecap(cs); break; + case Cfunction: res = functioncap(cs); break; + case Cnum: res = numcap(cs); break; + case Cquery: res = querycap(cs); break; + case Cfold: res = foldcap(cs); break; + default: assert(0); res = 0; } + cs->reclevel--; + return res; } @@ -521,7 +539,7 @@ int getcaptures (lua_State *L, const char *s, const char *r, int ptop) { int n = 0; if (!isclosecap(capture)) { /* is there any capture? */ CapState cs; - cs.ocap = cs.cap = capture; cs.L = L; + cs.ocap = cs.cap = capture; cs.L = L; cs.reclevel = 0; cs.s = s; cs.valuecached = 0; cs.ptop = ptop; do { /* collect their values */ n += pushcapture(&cs); diff --git a/lpcap.h b/lpcap.h index 355a38b..3bd5dcd 100644 --- a/lpcap.h +++ b/lpcap.h @@ -11,8 +11,21 @@ /* kinds of captures */ typedef enum CapKind { - Cclose, Cposition, Cconst, Cbackref, Carg, Csimple, Ctable, Cfunction, - Cquery, Cstring, Cnum, Csubst, Cfold, Cruntime, Cgroup + Cclose, /* not used in trees */ + Cposition, + Cconst, /* ktable[key] is Lua constant */ + Cbackref, /* ktable[key] is "name" of group to get capture */ + Carg, /* 'key' is arg's number */ + Csimple, /* next node is pattern */ + Ctable, /* next node is pattern */ + Cfunction, /* ktable[key] is function; next node is pattern */ + Cquery, /* ktable[key] is table; next node is pattern */ + Cstring, /* ktable[key] is string; next node is pattern */ + Cnum, /* numbered capture; 'key' is number of value to return */ + Csubst, /* 
substitution capture; next node is pattern */ + Cfold, /* ktable[key] is function; next node is pattern */ + Cruntime, /* not used in trees (is uses another type for tree) */ + Cgroup /* ktable[key] is group's "name" */ } CapKind; @@ -36,6 +49,7 @@ typedef struct CapState { int ptop; /* index of last argument to 'match' */ const char *s; /* original string */ int valuecached; /* value stored in cache slot */ + int reclevel; /* recursion level */ } CapState; diff --git a/lpcode.c b/lpcode.c index b8a5161..5d89f62 100644 --- a/lpcode.c +++ b/lpcode.c @@ -125,6 +125,27 @@ int tocharset (TTree *tree, Charset *cs) { } +/* +** Visit a TCall node taking care to stop recursion. If node not yet +** visited, return 'f(sib2(tree))', otherwise return 'def' (default +** value) +*/ +static int callrecursive (TTree *tree, int f (TTree *t), int def) { + int key = tree->key; + assert(tree->tag == TCall); + assert(sib2(tree)->tag == TRule); + if (key == 0) /* node already visited? */ + return def; /* return default value */ + else { /* first visit */ + int result; + tree->key = 0; /* mark call as already visited */ + result = f(sib2(tree)); /* go to called rule */ + tree->key = key; /* restore tree */ + return result; + } +} + + /* ** Check whether a pattern tree has captures */ @@ -134,14 +155,17 @@ int hascaptures (TTree *tree) { case TCapture: case TRunTime: return 1; case TCall: - tree = sib2(tree); goto tailcall; /* return hascaptures(sib2(tree)); */ + return callrecursive(tree, hascaptures, 0); + case TRule: /* do not follow siblings */ + tree = sib1(tree); goto tailcall; case TOpenCall: assert(0); default: { switch (numsiblings[tree->tag]) { case 1: /* return hascaptures(sib1(tree)); */ tree = sib1(tree); goto tailcall; case 2: - if (hascaptures(sib1(tree))) return 1; + if (hascaptures(sib1(tree))) + return 1; /* else return hascaptures(sib2(tree)); */ tree = sib2(tree); goto tailcall; default: assert(numsiblings[tree->tag] == 0); return 0; @@ -172,7 +196,7 @@ int hascaptures 
(TTree *tree) { int checkaux (TTree *tree, int pred) { tailcall: switch (tree->tag) { - case TChar: case TSet: case TAny: + case TChar: case TSet: case TAny: case TUTFR: case TFalse: case TOpenCall: return 0; /* not nullable */ case TRep: case TTrue: @@ -196,7 +220,7 @@ int checkaux (TTree *tree, int pred) { if (checkaux(sib2(tree), pred)) return 1; /* else return checkaux(sib1(tree), pred); */ tree = sib1(tree); goto tailcall; - case TCapture: case TGrammar: case TRule: + case TCapture: case TGrammar: case TRule: case TXInfo: /* return checkaux(sib1(tree), pred); */ tree = sib1(tree); goto tailcall; case TCall: /* return checkaux(sib2(tree), pred); */ @@ -208,38 +232,43 @@ int checkaux (TTree *tree, int pred) { /* ** number of characters to match a pattern (or -1 if variable) -** ('count' avoids infinite loops for grammars) */ -int fixedlenx (TTree *tree, int count, int len) { +int fixedlen (TTree *tree) { + int len = 0; /* to accumulate in tail calls */ tailcall: switch (tree->tag) { case TChar: case TSet: case TAny: return len + 1; + case TUTFR: + return (tree->cap == sib1(tree)->cap) ? 
len + tree->cap : -1; case TFalse: case TTrue: case TNot: case TAnd: case TBehind: return len; case TRep: case TRunTime: case TOpenCall: return -1; - case TCapture: case TRule: case TGrammar: - /* return fixedlenx(sib1(tree), count); */ + case TCapture: case TRule: case TGrammar: case TXInfo: + /* return fixedlen(sib1(tree)); */ tree = sib1(tree); goto tailcall; - case TCall: - if (count++ >= MAXRULES) - return -1; /* may be a loop */ - /* else return fixedlenx(sib2(tree), count); */ - tree = sib2(tree); goto tailcall; + case TCall: { + int n1 = callrecursive(tree, fixedlen, -1); + if (n1 < 0) + return -1; + else + return len + n1; + } case TSeq: { - len = fixedlenx(sib1(tree), count, len); - if (len < 0) return -1; - /* else return fixedlenx(sib2(tree), count, len); */ - tree = sib2(tree); goto tailcall; + int n1 = fixedlen(sib1(tree)); + if (n1 < 0) + return -1; + /* else return fixedlen(sib2(tree)) + len; */ + len += n1; tree = sib2(tree); goto tailcall; } case TChoice: { - int n1, n2; - n1 = fixedlenx(sib1(tree), count, len); - if (n1 < 0) return -1; - n2 = fixedlenx(sib2(tree), count, len); - if (n1 == n2) return n1; - else return -1; + int n1 = fixedlen(sib1(tree)); + int n2 = fixedlen(sib2(tree)); + if (n1 != n2 || n1 < 0) + return -1; + else + return len + n1; } default: assert(0); return 0; }; @@ -271,6 +300,13 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { tocharset(tree, firstset); return 0; } + case TUTFR: { + int c; + loopset(i, firstset->cs[i] = 0); /* erase all chars */ + for (c = tree->key; c <= sib1(tree)->key; c++) + setchar(firstset->cs, c); + return 0; + } case TTrue: { loopset(i, firstset->cs[i] = follow->cs[i]); return 1; /* accepts the empty string */ @@ -307,7 +343,7 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { loopset(i, firstset->cs[i] |= follow->cs[i]); return 1; /* accept the empty string */ } - case TCapture: case TGrammar: case TRule: { + case TCapture: case 
TGrammar: case TRule: case TXInfo: { /* return getfirst(sib1(tree), follow, firstset); */ tree = sib1(tree); goto tailcall; } @@ -329,9 +365,8 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { if (tocharset(sib1(tree), firstset)) { cs_complement(firstset); return 1; - } - /* else go through */ - } + } /* else */ + } /* FALLTHROUGH */ case TBehind: { /* instruction gives no new information */ /* call 'getfirst' only to check for math-time captures */ int e = getfirst(sib1(tree), follow, firstset); @@ -353,9 +388,9 @@ static int headfail (TTree *tree) { case TChar: case TSet: case TAny: case TFalse: return 1; case TTrue: case TRep: case TRunTime: case TNot: - case TBehind: + case TBehind: case TUTFR: return 0; - case TCapture: case TGrammar: case TRule: case TAnd: + case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ case TCall: tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ @@ -380,7 +415,7 @@ static int headfail (TTree *tree) { static int needfollow (TTree *tree) { tailcall: switch (tree->tag) { - case TChar: case TSet: case TAny: + case TChar: case TSet: case TAny: case TUTFR: case TFalse: case TTrue: case TAnd: case TNot: case TRunTime: case TGrammar: case TCall: case TBehind: return 0; @@ -436,6 +471,7 @@ int sizei (const Instruction *i) { case ITestSet: return CHARSETINSTSIZE + 1; case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: + case IUTFR: return 2; default: return 1; } @@ -513,6 +549,16 @@ static void setoffset (CompileState *compst, int instruction, int offset) { } +static void codeutfr (CompileState *compst, TTree *tree) { + int i = addoffsetinst(compst, IUTFR, 0); + int to = sib1(tree)->u.n; + assert(sib1(tree)->tag == TXInfo); + getinstr(compst, i + 1).offset = tree->u.n; + getinstr(compst, i).i.aux = to & 0xff; + getinstr(compst, 
i).i.key = to >> 8; +} + + /* ** Add a capture instruction: ** 'op' is the capture instruction; 'cap' the capture kind; @@ -660,11 +706,11 @@ static void codebehind (CompileState *compst, TTree *tree) { /* ** Choice; optimizations: -** - when p1 is headfail or -** when first(p1) and first(p2) are disjoint, than -** a character not in first(p1) cannot go to p1, and a character -** in first(p1) cannot go to p2 (at it is not in first(p2)). -** (The optimization is not valid if p1 accepts the empty string, +** - when p1 is headfail or when first(p1) and first(p2) are disjoint, +** than a character not in first(p1) cannot go to p1 and a character +** in first(p1) cannot go to p2, either because p1 will accept +** (headfail) or because it is not in first(p2) (disjoint). +** (The second case is not valid if p1 accepts the empty string, ** as then there is no character at all...) ** - when p2 is empty and opt is true; a IPartialCommit can reuse ** the Choice already active in the stack. @@ -745,9 +791,10 @@ static void codeand (CompileState *compst, TTree *tree, int tt) { /* -** Captures: if pattern has fixed (and not too big) length, use -** a single IFullCapture instruction after the match; otherwise, -** enclose the pattern with OpenCapture - CloseCapture. +** Captures: if pattern has fixed (and not too big) length, and it +** has no nested captures, use a single IFullCapture instruction +** after the match; otherwise, enclose the pattern with OpenCapture - +** CloseCapture. 
*/ static void codecapture (CompileState *compst, TTree *tree, int tt, const Charset *fl) { @@ -907,8 +954,10 @@ static void codegrammar (CompileState *compst, TTree *grammar) { int start = gethere(compst); /* here starts the initial rule */ jumptohere(compst, firstcall); for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { + TTree *r = sib1(rule); + assert(r->tag == TXInfo); positions[rulenumber++] = gethere(compst); /* save rule position */ - codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ + codegen(compst, sib1(r), 0, NOINST, fullset); /* code rule */ addinstruction(compst, IRet, 0); } assert(rule->tag == TTrue); @@ -919,8 +968,8 @@ static void codegrammar (CompileState *compst, TTree *grammar) { static void codecall (CompileState *compst, TTree *call) { int c = addoffsetinst(compst, IOpenCall, call->lr); /* to be corrected later */ - getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ - assert(sib2(call)->tag == TRule); + assert(sib1(sib2(call))->tag == TXInfo); + getinstr(compst, c).i.key = sib1(sib2(call))->u.n; /* rule number */ } @@ -958,6 +1007,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, case TSet: codecharset(compst, treebuffer(tree), tt); break; case TTrue: break; case TFalse: addinstruction(compst, IFail, 0); break; + case TUTFR: codeutfr(compst, tree); break; case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; case TRep: coderep(compst, sib1(tree), opt, fl); break; case TBehind: codebehind(compst, tree); break; @@ -1004,7 +1054,7 @@ static void peephole (CompileState *compst) { case IRet: case IFail: case IFailTwice: case IEnd: { /* instructions with unconditional implicit jumps */ code[i] = code[ft]; /* jump becomes that instruction */ - code[i + 1].i.code = IAny; /* 'no-op' for target position */ + code[i + 1].i.code = IEmpty; /* 'no-op' for target position */ break; } case ICommit: case IPartialCommit: diff --git a/lpcode.h b/lpcode.h index 
9c5c74b..cada62c 100644 --- a/lpcode.h +++ b/lpcode.h @@ -13,7 +13,7 @@ int tocharset (TTree *tree, Charset *cs); int checkaux (TTree *tree, int pred); -int fixedlenx (TTree *tree, int count, int len); +int fixedlen (TTree *tree); int hascaptures (TTree *tree); int hasleftrecursion (TTree *tree); int lp_gc (lua_State *L); @@ -37,8 +37,6 @@ int sizei (const Instruction *i); */ #define nullable(t) checkaux(t, PEnullable) -#define fixedlen(t) fixedlenx(t, 0, 0) - #endif diff --git a/lpeg.html b/lpeg.html index 3b40cd8..5dbddfd 100644 --- a/lpeg.html +++ b/lpeg.html @@ -22,7 +22,7 @@
string (Set)lpeg.R("xy")lpeg.utfR(cp1, cp2)cp1 and
+ cp2patt^nn repetitions of pattpatt^-nre module,
which implements patterns following a regular-expression style
(e.g., [09]+).
-(This module is 260 lines of Lua code,
+(This module is 270 lines of Lua code,
and of course it uses LPeg to parse regular expressions and
translate them to regular LPeg patterns.)
@@ -164,7 +167,7 @@ lpeg.match (pattern, subject [, init])
An optional numeric argument init makes the match
start at that position in the subject string.
-As usual in Lua libraries,
+As in the Lua standard libraries,
a negative value counts from the end.
@@ -188,20 +191,23 @@ lpeg.type (value)lpeg.version ()lpeg.version-Returns a string with the running version of LPeg. +A string (not a function) with the running version of LPeg.
lpeg.setmaxstack (max)-Sets the maximum size for the backtrack stack used by LPeg to +Sets a limit for the size of the backtrack stack used by LPeg to track calls and choices. +(The default limit is 400.) Most well-written patterns need little backtrack levels and -therefore you seldom need to change this maximum; -but a few useful patterns may need more space. -Before changing this maximum you should try to rewrite your +therefore you seldom need to change this limit; +before changing it you should try to rewrite your pattern to avoid the need for extra space. +Nevertheless, a few useful patterns may overflow. +Also, with recursive grammars, +subjects with deep recursion may need larger limits.
@@ -574,8 +580,9 @@-A capture is a pattern that creates values -(the so called semantic information) when it matches. +A capture is a pattern that produces values +(the so called semantic information) +according to what it matches. LPeg offers several kinds of captures, which produces values based on matches and combine these values to produce new values. @@ -629,10 +636,7 @@
-A capture pattern produces its values every time it succeeds.
-For instance,
-a capture inside a loop produces as many values as matched by the loop.
-A capture produces a value only when it succeeds.
+A capture pattern produces its values only when it succeeds.
For instance,
the pattern lpeg.C(lpeg.P"a"^-1)
produces the empty string when there is no "a"
@@ -640,14 +644,20 @@
lpeg.C("a")^-1
does not produce any value when there is no "a"
(because the pattern "a" fails).
+A pattern inside a loop or inside a recursive structure
+produces values for each match.
Usually,
-LPeg evaluates all captures only after (and if) the entire match succeeds.
-During match time it only gathers enough information
-to produce the capture values later.
-As a particularly important consequence,
+LPeg does not specify when (and if) it evaluates its captures.
+(As an example,
+consider the pattern lpeg.P"a" / func / 0.
+Because the "division" by 0 instructs LPeg to throw away the
+results from the pattern,
+LPeg may or may not call func.)
+Therefore, captures should avoid side effects.
+Moreover,
most captures cannot affect the way a pattern matches a subject.
The only exception to this rule is the
so-called match-time capture.
@@ -682,7 +692,8 @@
lpeg.Cb (name)name.
+group capture named name
+(where name can be any Lua value).
@@ -696,6 +707,12 @@
lpeg.Cb (name)+In the same way that LPeg does not specify when it evaluates captures, +it does not specify whether it reuses +values previously produced by the group +or re-evaluates them. +
lpeg.Cc ([value, ...])@@ -762,7 +779,8 @@
lpeg.Cg (patt [, name])patt
into a single capture.
The group may be anonymous (if no name is given)
-or named with the given name.
+or named with the given name
+(which can be any non-nil Lua value).
@@ -801,7 +819,7 @@
lpeg.Cs (patt)lpeg.Ct (patt)
Creates a table capture.
-This capture creates a table and puts all values from all anonymous captures
+This capture returns a table with all values from all anonymous captures
made by patt inside this table in successive integer keys,
starting at 1.
Moreover,
@@ -867,7 +885,8 @@
lpeg.Cmt(patt, function)
Creates a match-time capture.
Unlike all other captures,
-this one is evaluated immediately when a match occurs.
+this one is evaluated immediately when a match occurs
+(even if it is part of a larger pattern that fails later).
It forces the immediate evaluation of all its nested captures
and then calls function.
LPeg -source code.
+source code. + ++Probably, the easiest way to install LPeg is with +LuaRocks. +If you have LuaRocks installed, +the following command is all you need to install LPeg: +
$ luarocks install lpeg
-Copyright © 2014 Lua.org, PUC-Rio. +Copyright © 2007-2019 Lua.org, PUC-Rio.
Permission is hereby granted, free of charge,
diff --git a/lpprint.c b/lpprint.c
index 174d168..da18d05 100644
--- a/lpprint.c
+++ b/lpprint.c
@@ -37,13 +37,13 @@ void printcharset (const byte *st) {
}
-static void printcapkind (int kind) {
+static const char *capkind (int kind) {
const char *const modes[] = {
"close", "position", "constant", "backref",
"argument", "simple", "table", "function",
"query", "string", "num", "substitution", "fold",
"runtime", "group"};
- printf("%s", modes[kind]);
+ return modes[kind];
}
@@ -56,30 +56,34 @@ void printinst (const Instruction *op, const Instruction *p) {
const char *const names[] = {
"any", "char", "set",
"testany", "testchar", "testset",
- "span", "behind",
+ "span", "utf-range", "behind",
"ret", "end",
"choice", "jmp", "call", "open_call",
"commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup",
- "fullcapture", "opencapture", "closecapture", "closeruntime"
+ "fullcapture", "opencapture", "closecapture", "closeruntime",
+ "--"
};
printf("%02ld: %s ", (long)(p - op), names[p->i.code]);
switch ((Opcode)p->i.code) {
case IChar: {
- printf("'%c'", p->i.aux);
+ printf("'%c' (%02x)", p->i.aux, p->i.aux);
break;
}
case ITestChar: {
- printf("'%c'", p->i.aux); printjmp(op, p);
+ printf("'%c' (%02x)", p->i.aux, p->i.aux); printjmp(op, p);
+ break;
+ }
+ case IUTFR: {
+ printf("%d - %d", p[1].offset, utf_to(p));
break;
}
case IFullCapture: {
- printcapkind(getkind(p));
- printf(" (size = %d) (idx = %d)", getoff(p), p->i.key);
+ printf("%s (size = %d) (idx = %d)",
+ capkind(getkind(p)), getoff(p), p->i.key);
break;
}
case IOpenCapture: {
- printcapkind(getkind(p));
- printf(" (idx = %d)", p->i.key);
+ printf("%s (idx = %d)", capkind(getkind(p)), p->i.key);
break;
}
case ISet: {
@@ -124,8 +128,8 @@ void printpatt (Instruction *p, int n) {
#if defined(LPEG_DEBUG)
static void printcap (Capture *cap) {
- printcapkind(cap->kind);
- printf(" (idx: %d - size: %d) -> %p\n", cap->idx, cap->siz, cap->s);
+ printf("%s (idx: %d - size: %d) -> %p\n",
+ capkind(cap->kind), cap->idx, cap->siz, cap->s);
}
@@ -148,11 +152,11 @@ void printcaplist (Capture *cap, Capture *limit) {
static const char *tagnames[] = {
"char", "set", "any",
- "true", "false",
+ "true", "false", "utf8.range",
"rep",
"seq", "choice",
"not", "and",
- "call", "opencall", "rule", "grammar",
+ "call", "opencall", "rule", "xinfo", "grammar",
"behind",
"capture", "run-time"
};
@@ -160,6 +164,7 @@ static const char *tagnames[] = {
void printtree (TTree *tree, int ident) {
int i;
+ int sibs = numsiblings[tree->tag];
for (i = 0; i < ident; i++) printf(" ");
printf("%s", tagnames[tree->tag]);
switch (tree->tag) {
@@ -176,24 +181,34 @@ void printtree (TTree *tree, int ident) {
printf("\n");
break;
}
+ case TUTFR: {
+ assert(sib1(tree)->tag == TXInfo);
+ printf(" %d (%02x %d) - %d (%02x %d) \n",
+ tree->u.n, tree->key, tree->cap,
+ sib1(tree)->u.n, sib1(tree)->key, sib1(tree)->cap);
+ break;
+ }
case TOpenCall: case TCall: {
- printf(" key: %d\n", tree->key);
+ assert(sib1(sib2(tree))->tag == TXInfo);
+ printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n);
break;
}
case TBehind: {
printf(" %d\n", tree->u.n);
- printtree(sib1(tree), ident + 2);
break;
}
case TCapture: {
- printf(" cap: %d key: %d n: %d\n", tree->cap, tree->key, tree->u.n);
- printtree(sib1(tree), ident + 2);
+ printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key);
break;
}
case TRule: {
- printf(" n: %d key: %d\n", tree->cap, tree->key);
- printtree(sib1(tree), ident + 2);
- break; /* do not print next rule as a sibling */
+ printf(" key: %d\n", tree->key);
+ sibs = 1; /* do not print 'sib2' (next rule) as a sibling */
+ break;
+ }
+ case TXInfo: {
+ printf(" n: %d\n", tree->u.n);
+ break;
}
case TGrammar: {
TTree *rule = sib1(tree);
@@ -203,18 +218,17 @@ void printtree (TTree *tree, int ident) {
rule = sib2(rule);
}
assert(rule->tag == TTrue); /* sentinel */
+ sibs = 0; /* siblings already handled */
break;
}
- default: {
- int sibs = numsiblings[tree->tag];
+ default:
printf("\n");
- if (sibs >= 1) {
- printtree(sib1(tree), ident + 2);
- if (sibs >= 2)
- printtree(sib2(tree), ident + 2);
- }
break;
- }
+ }
+ if (sibs >= 1) {
+ printtree(sib1(tree), ident + 2);
+ if (sibs >= 2)
+ printtree(sib2(tree), ident + 2);
}
}
diff --git a/lptree.c b/lptree.c
index 4b9cf9c..5ced65d 100644
--- a/lptree.c
+++ b/lptree.c
@@ -21,11 +21,11 @@
/* number of siblings for each tree */
const byte numsiblings[] = {
0, 0, 0, /* char, set, any */
- 0, 0, /* true, false */
+ 0, 0, 0, /* true, false, utf-range */
1, /* rep */
2, 2, /* seq, choice */
1, 1, /* not, and */
- 0, 0, 2, 1, /* call, opencall, rule, grammar */
+ 0, 0, 2, 1, 1, /* call, opencall, rule, xinfo, grammar */
1, /* behind */
1, 1 /* capture, runtime capture */
};
@@ -64,7 +64,7 @@ static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t) {
t->tag = TCall;
t->u.ps = n - (t - g); /* position relative to node */
assert(sib2(t)->tag == TRule);
- sib2(t)->key = t->key;
+ sib2(t)->key = t->key; /* fix rule's key */
}
@@ -679,6 +679,56 @@ static int lp_range (lua_State *L) {
}
+/*
+** Fills a tree node with basic information about the UTF-8 code point
+** 'cpu': its value in 'n', its length in 'cap', and its first byte in
+** 'key'
+*/
+static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) {
+ int len, fb, cp;
+ cp = (int)cpu;
+ if (cp <= 0x7f) { /* one byte? */
+ len = 1;
+ fb = cp;
+ } else if (cp <= 0x7ff) {
+ len = 2;
+ fb = 0xC0 | (cp >> 6);
+ } else if (cp <= 0xffff) {
+ len = 3;
+ fb = 0xE0 | (cp >> 12);
+ }
+ else {
+ luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point");
+ len = 4;
+ fb = 0xF0 | (cp >> 18);
+ }
+ t->u.n = cp;
+ t->cap = len;
+ t->key = fb;
+}
+
+
+static int lp_utfr (lua_State *L) {
+ lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1);
+ lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2);
+ luaL_argcheck(L, from <= to, 2, "empty range");
+ if (to <= 0x7f) { /* ascii range? */
+ TTree *tree = newcharset(L); /* code it as a regular charset */
+ unsigned int f;
+ for (f = (int)from; f <= to; f++)
+ setchar(treebuffer(tree), f);
+ }
+ else { /* multi-byte utf-8 range */
+ TTree *tree = newtree(L, 2);
+ tree->tag = TUTFR;
+ codeutftree(L, tree, from, 1);
+ sib1(tree)->tag = TXInfo;
+ codeutftree(L, sib1(tree), to, 2);
+ }
+ return 1;
+}
+
+
/*
** Look-behind predicate
*/
@@ -723,6 +773,7 @@ static int capture_aux (lua_State *L, int cap, int labelidx) {
/*
** Fill a tree with an empty capture, using an empty (TTrue) sibling.
+** (The 'key' field must be filled by the caller to finish the tree.)
*/
static TTree *auxemptycap (TTree *tree, int cap) {
tree->tag = TCapture;
@@ -733,15 +784,17 @@ static TTree *auxemptycap (TTree *tree, int cap) {
/*
-** Create a tree for an empty capture
+** Create a tree for an empty capture.
*/
-static TTree *newemptycap (lua_State *L, int cap) {
- return auxemptycap(newtree(L, 2), cap);
+static TTree *newemptycap (lua_State *L, int cap, int key) {
+ TTree *tree = auxemptycap(newtree(L, 2), cap);
+ tree->key = key;
+ return tree;
}
/*
-** Create a tree for an empty capture with an associated Lua value
+** Create a tree for an empty capture with an associated Lua value.
*/
static TTree *newemptycapkey (lua_State *L, int cap, int idx) {
TTree *tree = auxemptycap(newtree(L, 2), cap);
@@ -802,16 +855,15 @@ static int lp_simplecapture (lua_State *L) {
static int lp_poscapture (lua_State *L) {
- newemptycap(L, Cposition);
+ newemptycap(L, Cposition, 0); /* key 0: no ktable entry for this capture */
return 1;
}
static int lp_argcapture (lua_State *L) {
int n = (int)luaL_checkinteger(L, 1);
- TTree *tree = newemptycap(L, Carg);
- tree->key = n;
luaL_argcheck(L, 0 < n && n <= SHRT_MAX, 1, "invalid argument index");
+ newemptycap(L, Carg, n); /* built only after 'n' is validated; 'n' goes in 'key' */
return 1;
}
@@ -911,7 +963,7 @@ static int collectrules (lua_State *L, int arg, int *totalsize) {
int size; /* accumulator for total size */
lua_newtable(L); /* create position table */
getfirstrule(L, arg, postab);
- size = 2 + getsize(L, postab + 2); /* TGrammar + TRule + rule */
+ size = 3 + getsize(L, postab + 2); /* TGrammar + TRule + TXInfo + rule */
lua_pushnil(L); /* prepare to traverse grammar table */
while (lua_next(L, arg) != 0) {
if (lua_tonumber(L, -2) == 1 ||
@@ -925,11 +977,11 @@ static int collectrules (lua_State *L, int arg, int *totalsize) {
lua_pushvalue(L, -2); /* push key (to insert into position table) */
lua_pushinteger(L, size);
lua_settable(L, postab);
- size += 1 + getsize(L, -1); /* update size */
+ size += 2 + getsize(L, -1); /* add 'TRule + TXInfo + rule' to size */
lua_pushvalue(L, -2); /* push key (for next lua_next) */
n++;
}
- *totalsize = size + 1; /* TTrue to finish list of rules */
+ *totalsize = size + 1; /* space for 'TTrue' finishing list of rules */
return n;
}
@@ -941,12 +993,14 @@ static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) {
int ridx = frule + 2*i + 1; /* index of i-th rule */
int rulesize;
TTree *rn = gettree(L, ridx, &rulesize);
+ TTree *pr = sib1(nd); /* points to rule's prerule */
nd->tag = TRule;
- nd->key = 0;
- nd->cap = i; /* rule number */
+ nd->key = 0; /* will be fixed when rule is used */
+ pr->tag = TXInfo;
+ pr->u.n = i; /* rule number */
nd->lr = 0;
- nd->u.ps = rulesize + 1; /* point to next rule */
- memcpy(sib1(nd), rn, rulesize * sizeof(TTree)); /* copy rule */
+ nd->u.ps = rulesize + 2; /* point to next rule */
+ memcpy(sib1(pr), rn, rulesize * sizeof(TTree)); /* copy rule */
mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */
nd = sib2(nd); /* move to next rule */
}
@@ -977,7 +1031,12 @@ static int checkloops (TTree *tree) {
}
-static int verifyerror (lua_State *L, int *passed, int npassed) {
+/*
+** Give appropriate error message for 'verifyrule'. If a rule appears
+** twice in 'passed', there is a path from it back to itself without
+** advancing the subject.
+*/
+static int verifyerror (lua_State *L, unsigned short *passed, int npassed) {
int i, j;
for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */
for (j = i - 1; j >= 0; j--) {
@@ -999,14 +1058,16 @@ static int verifyerror (lua_State *L, int *passed, int npassed) {
** is only relevant if the first is nullable.
** Parameter 'nb' works as an accumulator, to allow tail calls in
** choices. ('nb' true makes function returns true.)
+** Parameter 'passed' is a list of already visited rules, 'npassed'
+** counts the elements in 'passed'.
** Assume ktable at the top of the stack.
*/
-static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
- int nb) {
+static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed,
+ int npassed, int nb) {
tailcall:
switch (tree->tag) {
case TChar: case TSet: case TAny:
- case TFalse:
+ case TFalse: case TUTFR:
return nb; /* cannot pass from here */
case TTrue:
case TBehind: /* look-behind cannot have calls */
@@ -1014,7 +1075,7 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
case TNot: case TAnd: case TRep:
/* return verifyrule(L, sib1(tree), passed, npassed, 1); */
tree = sib1(tree); nb = 1; goto tailcall;
- case TCapture: case TRunTime:
+ case TCapture: case TRunTime: case TXInfo:
/* return verifyrule(L, sib1(tree), passed, npassed, nb); */
tree = sib1(tree); goto tailcall;
case TCall:
@@ -1033,10 +1094,10 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
/* return verifyrule(L, sib2(tree), passed, npassed, nb); */
tree = sib2(tree); goto tailcall;
case TRule:
- if (npassed >= MAXRULES)
- return verifyerror(L, passed, npassed);
+ if (npassed >= MAXRULES) /* too many steps? */
+ return verifyerror(L, passed, npassed); /* error */
else {
- passed[npassed++] = tree->key;
+ passed[npassed++] = tree->key; /* add rule to path */
/* return verifyrule(L, sib1(tree), passed, npassed); */
tree = sib1(tree); goto tailcall;
}
@@ -1066,7 +1127,7 @@ static void findleftrecursivecalls (TTree *tree) {
}
static void verifygrammar (lua_State *L, TTree *grammar) {
- int passed[MAXRULES];
+ unsigned short passed[MAXRULES];
TTree *rule;
/* check left-recursive rules */
for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
@@ -1217,12 +1278,6 @@ static int lp_setmax (lua_State *L) {
}
-static int lp_version (lua_State *L) {
- lua_pushstring(L, VERSION);
- return 1;
-}
-
-
static int lp_type (lua_State *L) {
if (testpattern(L, 1))
lua_pushliteral(L, "pattern");
@@ -1291,8 +1346,9 @@ static struct luaL_Reg pattreg[] = {
{"P", lp_P},
{"S", lp_set},
{"R", lp_range},
+ {"utfR", lp_utfr},
{"locale", lp_locale},
- {"version", lp_version},
+ {"version", NULL},
{"setmaxstack", lp_setmax},
{"type", lp_type},
{NULL, NULL}
@@ -1321,6 +1377,8 @@ int luaopen_lpeg (lua_State *L) {
luaL_newlib(L, pattreg);
lua_pushvalue(L, -1);
lua_setfield(L, -3, "__index");
+ lua_pushliteral(L, "LPeg " VERSION);
+ lua_setfield(L, -2, "version");
return 1;
}
diff --git a/lptree.h b/lptree.h
index 96567ab..5907fb2 100644
--- a/lptree.h
+++ b/lptree.h
@@ -13,31 +13,40 @@
** types of trees
*/
typedef enum TTag {
- TChar = 0, TSet, TAny, /* standard PEG elements */
- TTrue, TFalse,
- TRep,
- TSeq, TChoice,
- TNot, TAnd,
- TCall,
- TOpenCall,
- TRule, /* sib1 is rule's pattern, sib2 is 'next' rule */
- TGrammar, /* sib1 is initial (and first) rule */
- TBehind, /* match behind */
- TCapture, /* regular capture */
- TRunTime /* run-time capture */
+ TChar = 0, /* 'n' = char */
+ TSet, /* the set is stored in next CHARSETSIZE bytes */
+ TAny, /* NOTE(review): presumably matches any single character -- confirm */
+ TTrue, /* empty pattern (e.g., body of an empty capture; end of rule list) */
+ TFalse, /* NOTE(review): presumably a pattern that always fails -- confirm */
+ TUTFR, /* range of UTF-8 codepoints; 'n' has initial codepoint;
+ 'cap' has length; 'key' has first byte;
+ extra info is similar for end codepoint */
+ TRep, /* 'sib1'* */
+ TSeq, /* 'sib1' 'sib2' */
+ TChoice, /* 'sib1' / 'sib2' */
+ TNot, /* !'sib1' */
+ TAnd, /* &'sib1' */
+ TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */
+ TOpenCall, /* ktable[key] is rule's key */
+ TRule, /* ktable[key] is rule's key (but key == 0 for unused rules);
+ 'sib1' is rule's pattern pre-rule; 'sib2' is next rule;
+ extra info 'n' is rule's sequential number */
+ TXInfo, /* extra info */
+ TGrammar, /* 'sib1' is initial (and first) rule */
+ TBehind, /* 'sib1' is pattern, 'n' is how much to go back */
+ TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind');
+ ktable[key] is Lua value associated with capture;
+ 'sib1' is capture body */
+ TRunTime /* run-time capture: 'key' is Lua function;
+ 'sib1' is capture body */
} TTag;
-/* number of siblings for each tree */
-extern const byte numsiblings[];
-
/*
** Tree trees
-** The first sibling of a tree (if there is one) is immediately after
-** the tree. A reference to a second sibling (ps) is its position
-** relative to the position of the tree itself. A key in ktable
-** uses the (unique) address of the original tree that created that
-** entry. NULL means no data.
+** The first child of a tree (if there is one) is immediately after
+** the tree. A reference to a second child (ps) is its position
+** relative to the position of the tree itself.
*/
typedef struct TTree {
byte tag;
@@ -45,7 +54,7 @@ typedef struct TTree {
byte lr;
unsigned short key; /* key in ktable for Lua data (0 if no key) */
union {
- int ps; /* occasional second sibling */
+ int ps; /* occasional second child */
int n; /* occasional counter */
} u;
} TTree;
@@ -62,10 +71,10 @@ typedef struct Pattern {
} Pattern;
-/* number of siblings for each tree */
+/* number of children for each tree */
extern const byte numsiblings[];
-/* access to siblings */
+/* access to children */
#define sib1(t) ((t) + 1)
#define sib2(t) ((t) + (t)->u.ps)
diff --git a/lptypes.h b/lptypes.h
index 50578dc..b0969f7 100644
--- a/lptypes.h
+++ b/lptypes.h
@@ -1,7 +1,7 @@
/*
** $Id: lptypes.h,v 1.14 2015/09/28 17:17:41 roberto Exp $
** LPeg - PEG pattern matching for Lua
-** Copyright 2007-2015, Lua.org & PUC-Rio (see 'lpeg.html' for license)
+** Copyright 2007-2019, Lua.org & PUC-Rio (see 'lpeg.html' for license)
** written by Roberto Ierusalimschy
*/
@@ -9,17 +9,13 @@
#define lptypes_h
-#if !defined(LPEG_DEBUG)
-#define NDEBUG
-#endif
-
#include The
equivalent to re Modulep / defs[name]
p => namelpeg.Cmt(p, defs[name])p ~> namelpeg.Cf(p, defs[name])& p! pp1 p2tag field telling what non terminal
that table represents.
We can add such a tag using
-named group captures:
+named group captures:
x = re.compile[[ @@ -406,7 +408,7 @@Patterns
p = [=[ pattern <- exp !. -exp <- S (alternative / grammar) +exp <- S (grammar / alternative) alternative <- seq ('/' S seq)* seq <- prefix* @@ -421,6 +423,7 @@Patterns
/ '=' name / '{}' / '{~' exp '~}' + / '{|' exp '|}' / '{' exp '}' / '.' / name S !arrow @@ -434,7 +437,7 @@Patterns
range <- . '-' [^]] S <- (%s / '--' [^%nl]*)* -- spaces and comments -name <- [A-Za-z][A-Za-z0-9_]* +name <- [A-Za-z_][A-Za-z0-9_]* arrow <- '<-' num <- [0-9]+ string <- '"' [^"]* '"' / "'" [^']* "'" @@ -450,7 +453,7 @@Patterns
License
-Copyright © 2008-2010 Lua.org, PUC-Rio. +Copyright © 2008-2015 Lua.org, PUC-Rio.
Permission is hereby granted, free of charge, diff --git a/re.lua b/re.lua index 1d8e159..77a4af8 100644 --- a/re.lua +++ b/re.lua @@ -71,13 +71,6 @@ updatelocale() local I = m.P(function (s,i) print(i, s:sub(1, i-1)); return i end) -local function getdef (id, defs) - local c = defs and defs[id] - if not c then error("undefined name: " .. id) end - return c -end - - local function patt_error (s, i) local msg = (#s < i + 20) and s:sub(i) or s:sub(i,i+20) .. "..." @@ -116,6 +109,20 @@ name = m.C(name) -- a defined name only have meaning in a given environment local Def = name * m.Carg(1) + +local function getdef (id, defs) + local c = defs and defs[id] + if not c then error("undefined name: " .. id) end + return c +end + +-- match a name and return a group of its corresponding definition +-- and 'f' (to be folded in 'Suffix') +local function defwithfunc (f) + return m.Cg(Def / getdef * m.Cc(f)) +end + + local num = m.C(m.R"09"^1) * S / tonumber local String = "'" * m.C((any - "'")^0) * "'" + @@ -130,7 +137,7 @@ end local Range = m.Cs(any * (m.P"-"/"") * (any - "]")) / mm.R -local item = defined + Range + m.C(any) +local item = (defined + Range + m.C(any)) / m.P local Class = "[" @@ -176,9 +183,10 @@ local exp = m.P{ "Exp", ) + "->" * S * ( m.Cg((String + num) * m.Cc(mt.__div)) + m.P"{}" * m.Cc(nil, m.Ct) - + m.Cg(Def / getdef * m.Cc(mt.__div)) + + defwithfunc(mt.__div) ) - + "=>" * S * m.Cg(Def / getdef * m.Cc(m.Cmt)) + + "=>" * S * defwithfunc(m.Cmt) + + "~>" * S * defwithfunc(m.Cf) ) * S )^0, function (a,b,f) return f(a,b) end ); Primary = "(" * m.V"Exp" * ")" diff --git a/test.lua b/test.lua index 7b685bb..1770294 100644 --- a/test.lua +++ b/test.lua @@ -4,8 +4,12 @@ -- require"strict" -- just to be pedantic +print(package.path, package.cpath) +package.path = './?.lua;' .. package.path +package.cpath = './?.so;' .. 
package.cpath local m = require"lpeg" +print(m.version) -- for general use local a, b, c, d, e, f, g, p, t @@ -48,8 +52,8 @@ end print"General tests for LPeg library" -assert(type(m.version()) == "string") -print("version " .. m.version()) +assert(type(m.version) == "string") +print(m.version) assert(m.type("alo") ~= "pattern") assert(m.type(io.input) ~= "pattern") assert(m.type(m.P"alo") == "pattern") @@ -202,6 +206,14 @@ do end +-- bug: loop in 'hascaptures' +do + local p = m.C(-m.P{m.P'x' * m.V(1) + m.P'y'}) + assert(p:match("xxx") == "") +end + + + -- test for small capture boundary for i = 250,260 do assert(#m.match(m.C(i), string.rep('a', i)) == i) @@ -398,7 +410,7 @@ assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc') do - -- large dynamic Cc + print "testing large dynamic Cc" local lim = 2^16 - 1 local c = 0 local function seq (n) @@ -416,6 +428,16 @@ do end +do + -- nesting of captures too deep + local p = m.C(1) + for i = 1, 300 do + p = m.Ct(p) + end + checkerr("too deep", p.match, p, "x") +end + + -- tests for non-pattern as arguments to pattern functions p = { ('a' * m.V(1))^-1 } * m.P'b' * { 'a' * m.V(2); m.V(1)^-1 } @@ -517,6 +539,27 @@ assert(m.match(m.Cs((#((#m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") assert(m.match(m.Cs((- -m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") assert(m.match(m.Cs((-((-m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") + +-- fixed length +do + -- 'and' predicate using fixed length + local p = m.C(#("a" * (m.P("bd") + "cd")) * 2) + assert(p:match("acd") == "ac") + + p = #m.P{ "a" * m.V(2), m.P"b" } * 2 + assert(p:match("abc") == 3) + + p = #(m.P"abc" * m.B"c") + assert(p:match("abc") == 1 and not p:match("ab")) + + p = m.P{ "a" * m.V(2), m.P"b"^1 } + checkerr("pattern may not have fixed length", m.B, p) + + p = "abc" * (m.P"b"^1 + m.P"a"^0) + checkerr("pattern may not have fixed length", m.B, p) +end + + p = -m.P'a' * m.Cc(1) + -m.P'b' * m.Cc(2) + -m.P'c' * m.Cc(3) 
assert(p:match('a') == 2 and p:match('') == 1 and p:match('b') == 1) @@ -817,7 +860,7 @@ s = string.rep('a', l) .. string.rep('b', l) .. string.rep('c', l) p = (m.C(m.P'a'^1) * m.C(m.P'b'^1) * m.C(m.P'c'^1)) / '%3%2%1' assert(p:match(s) == string.rep('c', l) .. - string.rep('b', l) .. + string.rep('b', l) .. string.rep('a', l)) print"+" @@ -946,10 +989,10 @@ for i = 1, 10 do assert(p:match("aaaaaaaaaaa") == 11 - i + 1) end -print"+" --- tests for back references +print "testing back references" + checkerr("back reference 'x' not found", m.match, m.Cb('x'), '') checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a') @@ -993,6 +1036,17 @@ local function id (s, i, ...) return true, ... end +do -- run-time capture in an end predicate (should discard its value) + local x = 0 + function foo (s, i) + x = x + 1 + return true, x + end + + local p = #(m.Cmt("", foo) * "xx") * m.Cmt("", foo) + assert(p:match("xx") == 2) +end + assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) + m.R'09'^1 / string.char + m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y") @@ -1011,8 +1065,8 @@ assert(#x == 500) local function id(s, i, x) if x == 'a' then return i, 1, 3, 7 else return nil, 2, 4, 6, 8 - end -end + end +end p = ((m.P(id) * 1 + m.Cmt(2, id) * 1 + m.Cmt(1, id) * 1))^0 assert(table.concat{p:match('abababab')} == string.rep('137', 4)) @@ -1098,6 +1152,32 @@ do assert(c == 11) end + +-- Return a match-time capture that returns 'n' captures +local function manyCmt (n) + return m.Cmt("a", function () + local a = {}; for i = 1, n do a[i] = n - i end + return true, unpack(a) + end) +end + +-- bug in 1.0: failed match-time that used previous match-time results +do + local x + local function aux (...) 
x = #{...}; return false end + local res = {m.match(m.Cmt(manyCmt(20), aux) + manyCmt(10), "a")} + assert(#res == 10 and res[1] == 9 and res[10] == 0) +end + + +-- bug in 1.0: problems with math-times returning too many captures +do + local lim = 2^11 - 10 + local res = {m.match(manyCmt(lim), "a")} + assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) + checkerr("too many", m.match, manyCmt(2^15), "a") +end + p = (m.P(function () return true, "a" end) * 'a' + m.P(function (s, i) return i, "aa", 20 end) * 'b' + m.P(function (s,i) if i <= #s then return i, "aaa" end end) * 1)^0 @@ -1106,9 +1186,85 @@ t = {p:match('abacc')} checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}) +do print"testing large grammars" + local lim = 1000 -- number of rules + local t = {} + + for i = 3, lim do + t[i] = m.V(i - 1) -- each rule calls previous one + end + t[1] = m.V(lim) -- start on last rule + t[2] = m.C("alo") -- final rule + + local P = m.P(t) -- build grammar + assert(P:match("alo") == "alo") + + t[#t + 1] = m.P("x") -- one more rule... + checkerr("too many rules", m.P, t) +end + + +print "testing UTF-8 ranges" + +do -- a few typical UTF-8 ranges + local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0" + + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0" + + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0" + + m.utfR(0, 0x7f)^1 / "ascii: %0" + + m.utfR(0, 0x10ffff) / "other: %0" + + p = m.Ct(p^0) * -m.P(1) + + local cyr = "ждюя" + local emot = "\240\159\152\128\240\159\153\128" -- 😀🙀 + local cjk = "专举乸" + local ascii = "alo" + local last = "\244\143\191\191" -- U+10FFFF + + local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last + t = (p:match(s)) + + assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and + t[3] == "emot: " .. emot and t[4] == "other: —" and + t[5] == "cjk: " .. cjk and t[6] == "other: —" and + t[7] == "ascii: " .. ascii and t[8] == "other: " .. 
last and + t[9] == nil) +end + + +do -- valid and invalid code points + local p = m.utfR(0, 0x10ffff)^0 + assert(p:match("汉字\128") == #"汉字" + 1) + assert(p:match("\244\159\191") == 1) + assert(p:match("\244\159\191\191") == 1) + assert(p:match("\255") == 1) + + -- basic errors + checkerr("empty range", m.utfR, 1, 0) + checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1) +end + + +do -- back references (fixed width) + -- match a byte after a CJK point + local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1) + p = m.P{ p + m.P(1) * m.V(1) } -- search for 'p' + assert(p:match("ab д 专X x") == "X") + + -- match a byte after a hebrew point + local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1) + p = m.P(#"ש") * p + assert(p:match("שX") == "X") + + checkerr("fixed length", m.B, m.utfR(0, 0x10ffff)) +end + + + ------------------------------------------------------------------- -- Tests for 're' module ------------------------------------------------------------------- +print"testing 're' module" local re = require "re" @@ -1131,6 +1287,9 @@ assert(not match("abbcde", " [b-z] + ")) assert(match("abb\"de", '"abb"["]"de"') == 7) assert(match("abceeef", "'ac' ? 'ab' * 'c' { 'e' * } / 'abceeef' ") == "eee") assert(match("abceeef", "'ac'? 'ab'* 'c' { 'f'+ } / 'abceeef' ") == 8) + +assert(re.match("aaand", "[a]^2") == 3) + local t = {match("abceefe", "( ( & 'e' {} ) ? . ) * ")} checkeq(t, {4, 5, 7}) local t = {match("abceefe", "((&&'e' {})? 
.)*")} @@ -1305,6 +1464,13 @@ checkeq(x, {tag='x', 'hi', {tag = 'b', 'hello'}, 'but', {'totheend'}}) +-- test for folding captures +c = re.compile([[ + S <- (number (%s+ number)*) ~> add + number <- %d+ -> tonumber +]], {tonumber = tonumber, add = function (a,b) return a + b end}) +assert(c:match("3 401 50") == 3 + 401 + 50) + -- tests for look-ahead captures x = {re.match("alo", "&(&{.}) !{'b'} {&(...)} &{..} {...} {!.}")} checkeq(x, {"", "alo", ""}) diff --git a/testlr.lua b/testlr.lua index 7911bac..f18c6ce 100644 --- a/testlr.lua +++ b/testlr.lua @@ -1,6 +1,12 @@ +print(package.path, package.path) +package.path = './?.lua;' .. package.path +package.cpath = './?.so;' .. package.cpath + local lpeg = require"lpeg" local re = require"re" +print(lpeg.version) + local m = lpeg @@ -15,8 +21,8 @@ end print"Tests for LPeg left recursion" -assert(type(m.version()) == "string") -print("version " .. m.version()) +assert(type(m.version) == "string") +print("version " .. m.version) --[[