From 7c59efd85a3cb9401c6a5760cf7b2a607645b7a3 Mon Sep 17 00:00:00 2001 From: mingodad Date: Mon, 12 Dec 2022 12:36:48 +0100 Subject: [PATCH] Update to lpeg-1.0.2 --- HISTORY | 8 +- doc.css | 223 +++++++++++++++++++++++++++++++++++++++++++++++++++++ lpcap.c | 62 +++++++++------ lpcap.h | 18 ++++- lpcode.c | 134 ++++++++++++++++++++++---------- lpcode.h | 4 +- lpeg.html | 76 ++++++++++++------ lpprint.c | 72 ++++++++++------- lptree.c | 124 +++++++++++++++++++++-------- lptree.h | 55 +++++++------ lptypes.h | 12 ++- lpvm.c | 165 +++++++++++++++++++++++++++------------ lpvm.h | 10 ++- makefile | 4 +- re.html | 11 ++- re.lua | 28 ++++--- test.lua | 182 +++++++++++++++++++++++++++++++++++++++++-- testlr.lua | 10 ++- 18 files changed, 932 insertions(+), 266 deletions(-) create mode 100644 doc.css diff --git a/HISTORY b/HISTORY index 0c10edd..96acace 100644 --- a/HISTORY +++ b/HISTORY @@ -1,6 +1,10 @@ -HISTORY for LPeg 1.0 +HISTORY for LPeg 1.0.2 -* Changes from version 0.12 to 1.0 +* Changes from version 1.0.1 to 1.0.2 + --------------------------------- + + some bugs fixed + +* Changes from version 0.12 to 1.0.1 --------------------------------- + group "names" can be any Lua value + some bugs fixed diff --git a/doc.css b/doc.css new file mode 100644 index 0000000..3770e4e --- /dev/null +++ b/doc.css @@ -0,0 +1,223 @@ +body { + margin-left: 1em; + margin-right: 1em; + font-family: arial, helvetica, geneva, sans-serif; + background-color:#ffffff; margin:0px; +} + +code { + font-family: "Andale Mono", monospace; +} + +tt { + font-family: "Andale Mono", monospace; +} + +body, td, th { font-size: 11pt; } + +h1, h2, h3, h4 { margin-left: 0em; } + +textarea, pre, tt { font-size:10pt; } +body, td, th { color:#000000; } +small { font-size:0.85em; } +h1 { font-size:1.5em; } +h2 { font-size:1.25em; } +h3 { font-size:1.15em; } +h4 { font-size:1.06em; } + +a:link { font-weight:bold; color: #004080; text-decoration: none; } +a:visited { font-weight:bold; color: #006699; 
text-decoration: none; } +a:link:hover { text-decoration:underline; } +hr { color:#cccccc } +img { border-width: 0px; } + + +h3 { padding-top: 1em; } + +p { margin-left: 1em; } + +p.name { + font-family: "Andale Mono", monospace; + padding-top: 1em; + margin-left: 0em; +} + +blockquote { margin-left: 3em; } + +.example { + background-color: rgb(245, 245, 245); + border-top-width: 1px; + border-right-width: 1px; + border-bottom-width: 1px; + border-left-width: 1px; + border-top-style: solid; + border-right-style: solid; + border-bottom-style: solid; + border-left-style: solid; + border-top-color: silver; + border-right-color: silver; + border-bottom-color: silver; + border-left-color: silver; + padding: 1em; + margin-left: 1em; + margin-right: 1em; + font-family: "Andale Mono", monospace; + font-size: smaller; +} + + +hr { + margin-left: 0em; + background: #00007f; + border: 0px; + height: 1px; +} + +ul { list-style-type: disc; } + +table.index { border: 1px #00007f; } +table.index td { text-align: left; vertical-align: top; } +table.index ul { padding-top: 0em; margin-top: 0em; } + +table { + border: 1px solid black; + border-collapse: collapse; + margin-left: auto; + margin-right: auto; +} +th { + border: 1px solid black; + padding: 0.5em; +} +td { + border: 1px solid black; + padding: 0.5em; +} +div.header, div.footer { margin-left: 0em; } + +#container +{ + margin-left: 1em; + margin-right: 1em; + background-color: #f0f0f0; +} + +#product +{ + text-align: center; + border-bottom: 1px solid #cccccc; + background-color: #ffffff; +} + +#product big { + font-size: 2em; +} + +#product_logo +{ +} + +#product_name +{ +} + +#product_description +{ +} + +#main +{ + background-color: #f0f0f0; + border-left: 2px solid #cccccc; +} + +#navigation +{ + float: left; + width: 12em; + margin: 0; + vertical-align: top; + background-color: #f0f0f0; + overflow:visible; +} + +#navigation h1 { + background-color:#e7e7e7; + font-size:1.1em; + color:#000000; + text-align:left; + 
margin:0px; + padding:0.2em; + border-top:1px solid #dddddd; + border-bottom:1px solid #dddddd; +} + +#navigation ul +{ + font-size:1em; + list-style-type: none; + padding: 0; + margin: 1px; +} + +#navigation li +{ + text-indent: -1em; + margin: 0em 0em 0em 0.5em; + display: block; + padding: 3px 0px 0px 12px; +} + +#navigation li li a +{ + padding: 0px 3px 0px -1em; +} + +#content +{ + margin-left: 12em; + padding: 1em; + border-left: 2px solid #cccccc; + border-right: 2px solid #cccccc; + background-color: #ffffff; +} + +#about +{ + clear: both; + margin: 0; + padding: 5px; + border-top: 2px solid #cccccc; + background-color: #ffffff; +} + +@media print { + body { + font: 10pt "Times New Roman", "TimeNR", Times, serif; + } + a { font-weight:bold; color: #004080; text-decoration: underline; } + + #main { background-color: #ffffff; border-left: 0px; } + #container { margin-left: 2%; margin-right: 2%; background-color: #ffffff; } + + #content { margin-left: 0px; padding: 1em; border-left: 0px; border-right: 0px; background-color: #ffffff; } + + #navigation { display: none; + } + + #product_logo + { + display: none; + } + + #about img + { + display: none; + } + + .example { + font-family: "Andale Mono", monospace; + font-size: 8pt; + page-break-inside: avoid; + } +} diff --git a/lpcap.c b/lpcap.c index c9085de..1a3643d 100644 --- a/lpcap.c +++ b/lpcap.c @@ -271,15 +271,15 @@ int finddyncap (Capture *cap, Capture *last) { /* -** Calls a runtime capture. Returns number of captures removed by -** the call, including the initial Cgroup. (Captures to be added are -** on the Lua stack.) +** Calls a runtime capture. Returns number of captures "removed" by the +** call, that is, those inside the group capture. Captures to be added +** are on the Lua stack. 
*/ int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) { int n, id; lua_State *L = cs->L; int otop = lua_gettop(L); - Capture *open = findopen(close); + Capture *open = findopen(close); /* get open group capture */ assert(captype(open) == Cgroup); id = finddyncap(open, close); /* get first dynamic capture argument */ close->kind = Cclose; /* closes the group */ @@ -299,7 +299,7 @@ int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) { } else *rem = 0; /* no dynamic captures removed */ - return close - open; /* number of captures of all kinds removed */ + return close - open - 1; /* number of captures to be removed */ } @@ -441,70 +441,88 @@ static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) { } +#if !defined(MAXRECLEVEL) +#define MAXRECLEVEL 200 +#endif + + /* ** Push all values of the current capture into the stack; returns ** number of values pushed */ static int pushcapture (CapState *cs) { lua_State *L = cs->L; + int res; luaL_checkstack(L, 4, "too many captures"); + if (cs->reclevel++ > MAXRECLEVEL) + return luaL_error(L, "subcapture nesting too deep"); switch (captype(cs->cap)) { case Cposition: { lua_pushinteger(L, cs->cap->s - cs->s + 1); cs->cap++; - return 1; + res = 1; + break; } case Cconst: { pushluaval(cs); cs->cap++; - return 1; + res = 1; + break; } case Carg: { int arg = (cs->cap++)->idx; if (arg + FIXEDARGS > cs->ptop) return luaL_error(L, "reference to absent extra argument #%d", arg); lua_pushvalue(L, arg + FIXEDARGS); - return 1; + res = 1; + break; } case Csimple: { int k = pushnestedvalues(cs, 1); lua_insert(L, -k); /* make whole match be first result */ - return k; + res = k; + break; } case Cruntime: { lua_pushvalue(L, (cs->cap++)->idx); /* value is in the stack */ - return 1; + res = 1; + break; } case Cstring: { luaL_Buffer b; luaL_buffinit(L, &b); stringcap(&b, cs); luaL_pushresult(&b); - return 1; + res = 1; + break; } case Csubst: { luaL_Buffer b; luaL_buffinit(L, &b); 
substcap(&b, cs); luaL_pushresult(&b); - return 1; + res = 1; + break; } case Cgroup: { if (cs->cap->idx == 0) /* anonymous group? */ - return pushnestedvalues(cs, 0); /* add all nested values */ + res = pushnestedvalues(cs, 0); /* add all nested values */ else { /* named group: add no values */ nextcap(cs); /* skip capture */ - return 0; + res = 0; } + break; } - case Cbackref: return backrefcap(cs); - case Ctable: return tablecap(cs); - case Cfunction: return functioncap(cs); - case Cnum: return numcap(cs); - case Cquery: return querycap(cs); - case Cfold: return foldcap(cs); - default: assert(0); return 0; + case Cbackref: res = backrefcap(cs); break; + case Ctable: res = tablecap(cs); break; + case Cfunction: res = functioncap(cs); break; + case Cnum: res = numcap(cs); break; + case Cquery: res = querycap(cs); break; + case Cfold: res = foldcap(cs); break; + default: assert(0); res = 0; } + cs->reclevel--; + return res; } @@ -521,7 +539,7 @@ int getcaptures (lua_State *L, const char *s, const char *r, int ptop) { int n = 0; if (!isclosecap(capture)) { /* is there any capture? 
*/ CapState cs; - cs.ocap = cs.cap = capture; cs.L = L; + cs.ocap = cs.cap = capture; cs.L = L; cs.reclevel = 0; cs.s = s; cs.valuecached = 0; cs.ptop = ptop; do { /* collect their values */ n += pushcapture(&cs); diff --git a/lpcap.h b/lpcap.h index 355a38b..3bd5dcd 100644 --- a/lpcap.h +++ b/lpcap.h @@ -11,8 +11,21 @@ /* kinds of captures */ typedef enum CapKind { - Cclose, Cposition, Cconst, Cbackref, Carg, Csimple, Ctable, Cfunction, - Cquery, Cstring, Cnum, Csubst, Cfold, Cruntime, Cgroup + Cclose, /* not used in trees */ + Cposition, + Cconst, /* ktable[key] is Lua constant */ + Cbackref, /* ktable[key] is "name" of group to get capture */ + Carg, /* 'key' is arg's number */ + Csimple, /* next node is pattern */ + Ctable, /* next node is pattern */ + Cfunction, /* ktable[key] is function; next node is pattern */ + Cquery, /* ktable[key] is table; next node is pattern */ + Cstring, /* ktable[key] is string; next node is pattern */ + Cnum, /* numbered capture; 'key' is number of value to return */ + Csubst, /* substitution capture; next node is pattern */ + Cfold, /* ktable[key] is function; next node is pattern */ + Cruntime, /* not used in trees (is uses another type for tree) */ + Cgroup /* ktable[key] is group's "name" */ } CapKind; @@ -36,6 +49,7 @@ typedef struct CapState { int ptop; /* index of last argument to 'match' */ const char *s; /* original string */ int valuecached; /* value stored in cache slot */ + int reclevel; /* recursion level */ } CapState; diff --git a/lpcode.c b/lpcode.c index b8a5161..5d89f62 100644 --- a/lpcode.c +++ b/lpcode.c @@ -125,6 +125,27 @@ int tocharset (TTree *tree, Charset *cs) { } +/* +** Visit a TCall node taking care to stop recursion. 
If node not yet +** visited, return 'f(sib2(tree))', otherwise return 'def' (default +** value) +*/ +static int callrecursive (TTree *tree, int f (TTree *t), int def) { + int key = tree->key; + assert(tree->tag == TCall); + assert(sib2(tree)->tag == TRule); + if (key == 0) /* node already visited? */ + return def; /* return default value */ + else { /* first visit */ + int result; + tree->key = 0; /* mark call as already visited */ + result = f(sib2(tree)); /* go to called rule */ + tree->key = key; /* restore tree */ + return result; + } +} + + /* ** Check whether a pattern tree has captures */ @@ -134,14 +155,17 @@ int hascaptures (TTree *tree) { case TCapture: case TRunTime: return 1; case TCall: - tree = sib2(tree); goto tailcall; /* return hascaptures(sib2(tree)); */ + return callrecursive(tree, hascaptures, 0); + case TRule: /* do not follow siblings */ + tree = sib1(tree); goto tailcall; case TOpenCall: assert(0); default: { switch (numsiblings[tree->tag]) { case 1: /* return hascaptures(sib1(tree)); */ tree = sib1(tree); goto tailcall; case 2: - if (hascaptures(sib1(tree))) return 1; + if (hascaptures(sib1(tree))) + return 1; /* else return hascaptures(sib2(tree)); */ tree = sib2(tree); goto tailcall; default: assert(numsiblings[tree->tag] == 0); return 0; @@ -172,7 +196,7 @@ int hascaptures (TTree *tree) { int checkaux (TTree *tree, int pred) { tailcall: switch (tree->tag) { - case TChar: case TSet: case TAny: + case TChar: case TSet: case TAny: case TUTFR: case TFalse: case TOpenCall: return 0; /* not nullable */ case TRep: case TTrue: @@ -196,7 +220,7 @@ int checkaux (TTree *tree, int pred) { if (checkaux(sib2(tree), pred)) return 1; /* else return checkaux(sib1(tree), pred); */ tree = sib1(tree); goto tailcall; - case TCapture: case TGrammar: case TRule: + case TCapture: case TGrammar: case TRule: case TXInfo: /* return checkaux(sib1(tree), pred); */ tree = sib1(tree); goto tailcall; case TCall: /* return checkaux(sib2(tree), pred); */ @@ -208,38 
+232,43 @@ int checkaux (TTree *tree, int pred) { /* ** number of characters to match a pattern (or -1 if variable) -** ('count' avoids infinite loops for grammars) */ -int fixedlenx (TTree *tree, int count, int len) { +int fixedlen (TTree *tree) { + int len = 0; /* to accumulate in tail calls */ tailcall: switch (tree->tag) { case TChar: case TSet: case TAny: return len + 1; + case TUTFR: + return (tree->cap == sib1(tree)->cap) ? len + tree->cap : -1; case TFalse: case TTrue: case TNot: case TAnd: case TBehind: return len; case TRep: case TRunTime: case TOpenCall: return -1; - case TCapture: case TRule: case TGrammar: - /* return fixedlenx(sib1(tree), count); */ + case TCapture: case TRule: case TGrammar: case TXInfo: + /* return fixedlen(sib1(tree)); */ tree = sib1(tree); goto tailcall; - case TCall: - if (count++ >= MAXRULES) - return -1; /* may be a loop */ - /* else return fixedlenx(sib2(tree), count); */ - tree = sib2(tree); goto tailcall; + case TCall: { + int n1 = callrecursive(tree, fixedlen, -1); + if (n1 < 0) + return -1; + else + return len + n1; + } case TSeq: { - len = fixedlenx(sib1(tree), count, len); - if (len < 0) return -1; - /* else return fixedlenx(sib2(tree), count, len); */ - tree = sib2(tree); goto tailcall; + int n1 = fixedlen(sib1(tree)); + if (n1 < 0) + return -1; + /* else return fixedlen(sib2(tree)) + len; */ + len += n1; tree = sib2(tree); goto tailcall; } case TChoice: { - int n1, n2; - n1 = fixedlenx(sib1(tree), count, len); - if (n1 < 0) return -1; - n2 = fixedlenx(sib2(tree), count, len); - if (n1 == n2) return n1; - else return -1; + int n1 = fixedlen(sib1(tree)); + int n2 = fixedlen(sib2(tree)); + if (n1 != n2 || n1 < 0) + return -1; + else + return len + n1; } default: assert(0); return 0; }; @@ -271,6 +300,13 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { tocharset(tree, firstset); return 0; } + case TUTFR: { + int c; + loopset(i, firstset->cs[i] = 0); /* erase all chars */ + for (c = 
tree->key; c <= sib1(tree)->key; c++) + setchar(firstset->cs, c); + return 0; + } case TTrue: { loopset(i, firstset->cs[i] = follow->cs[i]); return 1; /* accepts the empty string */ @@ -307,7 +343,7 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { loopset(i, firstset->cs[i] |= follow->cs[i]); return 1; /* accept the empty string */ } - case TCapture: case TGrammar: case TRule: { + case TCapture: case TGrammar: case TRule: case TXInfo: { /* return getfirst(sib1(tree), follow, firstset); */ tree = sib1(tree); goto tailcall; } @@ -329,9 +365,8 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { if (tocharset(sib1(tree), firstset)) { cs_complement(firstset); return 1; - } - /* else go through */ - } + } /* else */ + } /* FALLTHROUGH */ case TBehind: { /* instruction gives no new information */ /* call 'getfirst' only to check for math-time captures */ int e = getfirst(sib1(tree), follow, firstset); @@ -353,9 +388,9 @@ static int headfail (TTree *tree) { case TChar: case TSet: case TAny: case TFalse: return 1; case TTrue: case TRep: case TRunTime: case TNot: - case TBehind: + case TBehind: case TUTFR: return 0; - case TCapture: case TGrammar: case TRule: case TAnd: + case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ case TCall: tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ @@ -380,7 +415,7 @@ static int headfail (TTree *tree) { static int needfollow (TTree *tree) { tailcall: switch (tree->tag) { - case TChar: case TSet: case TAny: + case TChar: case TSet: case TAny: case TUTFR: case TFalse: case TTrue: case TAnd: case TNot: case TRunTime: case TGrammar: case TCall: case TBehind: return 0; @@ -436,6 +471,7 @@ int sizei (const Instruction *i) { case ITestSet: return CHARSETINSTSIZE + 1; case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: case IOpenCall: case ICommit: case IPartialCommit: 
case IBackCommit: + case IUTFR: return 2; default: return 1; } @@ -513,6 +549,16 @@ static void setoffset (CompileState *compst, int instruction, int offset) { } +static void codeutfr (CompileState *compst, TTree *tree) { + int i = addoffsetinst(compst, IUTFR, 0); + int to = sib1(tree)->u.n; + assert(sib1(tree)->tag == TXInfo); + getinstr(compst, i + 1).offset = tree->u.n; + getinstr(compst, i).i.aux = to & 0xff; + getinstr(compst, i).i.key = to >> 8; +} + + /* ** Add a capture instruction: ** 'op' is the capture instruction; 'cap' the capture kind; @@ -660,11 +706,11 @@ static void codebehind (CompileState *compst, TTree *tree) { /* ** Choice; optimizations: -** - when p1 is headfail or -** when first(p1) and first(p2) are disjoint, than -** a character not in first(p1) cannot go to p1, and a character -** in first(p1) cannot go to p2 (at it is not in first(p2)). -** (The optimization is not valid if p1 accepts the empty string, +** - when p1 is headfail or when first(p1) and first(p2) are disjoint, +** than a character not in first(p1) cannot go to p1 and a character +** in first(p1) cannot go to p2, either because p1 will accept +** (headfail) or because it is not in first(p2) (disjoint). +** (The second case is not valid if p1 accepts the empty string, ** as then there is no character at all...) ** - when p2 is empty and opt is true; a IPartialCommit can reuse ** the Choice already active in the stack. @@ -745,9 +791,10 @@ static void codeand (CompileState *compst, TTree *tree, int tt) { /* -** Captures: if pattern has fixed (and not too big) length, use -** a single IFullCapture instruction after the match; otherwise, -** enclose the pattern with OpenCapture - CloseCapture. +** Captures: if pattern has fixed (and not too big) length, and it +** has no nested captures, use a single IFullCapture instruction +** after the match; otherwise, enclose the pattern with OpenCapture - +** CloseCapture. 
*/ static void codecapture (CompileState *compst, TTree *tree, int tt, const Charset *fl) { @@ -907,8 +954,10 @@ static void codegrammar (CompileState *compst, TTree *grammar) { int start = gethere(compst); /* here starts the initial rule */ jumptohere(compst, firstcall); for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { + TTree *r = sib1(rule); + assert(r->tag == TXInfo); positions[rulenumber++] = gethere(compst); /* save rule position */ - codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ + codegen(compst, sib1(r), 0, NOINST, fullset); /* code rule */ addinstruction(compst, IRet, 0); } assert(rule->tag == TTrue); @@ -919,8 +968,8 @@ static void codegrammar (CompileState *compst, TTree *grammar) { static void codecall (CompileState *compst, TTree *call) { int c = addoffsetinst(compst, IOpenCall, call->lr); /* to be corrected later */ - getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ - assert(sib2(call)->tag == TRule); + assert(sib1(sib2(call))->tag == TXInfo); + getinstr(compst, c).i.key = sib1(sib2(call))->u.n; /* rule number */ } @@ -958,6 +1007,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, case TSet: codecharset(compst, treebuffer(tree), tt); break; case TTrue: break; case TFalse: addinstruction(compst, IFail, 0); break; + case TUTFR: codeutfr(compst, tree); break; case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; case TRep: coderep(compst, sib1(tree), opt, fl); break; case TBehind: codebehind(compst, tree); break; @@ -1004,7 +1054,7 @@ static void peephole (CompileState *compst) { case IRet: case IFail: case IFailTwice: case IEnd: { /* instructions with unconditional implicit jumps */ code[i] = code[ft]; /* jump becomes that instruction */ - code[i + 1].i.code = IAny; /* 'no-op' for target position */ + code[i + 1].i.code = IEmpty; /* 'no-op' for target position */ break; } case ICommit: case IPartialCommit: diff --git a/lpcode.h b/lpcode.h index 
9c5c74b..cada62c 100644 --- a/lpcode.h +++ b/lpcode.h @@ -13,7 +13,7 @@ int tocharset (TTree *tree, Charset *cs); int checkaux (TTree *tree, int pred); -int fixedlenx (TTree *tree, int count, int len); +int fixedlen (TTree *tree); int hascaptures (TTree *tree); int hasleftrecursion (TTree *tree); int lp_gc (lua_State *L); @@ -37,8 +37,6 @@ int sizei (const Instruction *i); */ #define nullable(t) checkaux(t, PEnullable) -#define fixedlen(t) fixedlenx(t, 0, 0) - #endif diff --git a/lpeg.html b/lpeg.html index 3b40cd8..5dbddfd 100644 --- a/lpeg.html +++ b/lpeg.html @@ -22,7 +22,7 @@
LPeg
- Parsing Expression Grammars For Lua, version 0.12 + Parsing Expression Grammars For Lua, version 1.1
@@ -107,6 +107,9 @@

Introduction

Matches any character in string (Set) lpeg.R("xy") Matches any character between x and y (Range) +lpeg.utfR(cp1, cp2) + Matches a UTF-8 code point between cp1 and + cp2 patt^n Matches at least n repetitions of patt patt^-n @@ -142,7 +145,7 @@

Introduction

LPeg also offers the re module, which implements patterns following a regular-expression style (e.g., [09]+). -(This module is 260 lines of Lua code, +(This module is 270 lines of Lua code, and of course it uses LPeg to parse regular expressions and translate them to regular LPeg patterns.)

@@ -164,7 +167,7 @@

lpeg.match (pattern, subject [, init]) An optional numeric argument init makes the match start at that position in the subject string. -As usual in Lua libraries, +As in the Lua standard libraries, a negative value counts from the end.

@@ -188,20 +191,23 @@

lpeg.type (value)

Otherwise returns nil.

-

lpeg.version ()

+

lpeg.version

-Returns a string with the running version of LPeg. +A string (not a function) with the running version of LPeg.

lpeg.setmaxstack (max)

-Sets the maximum size for the backtrack stack used by LPeg to +Sets a limit for the size of the backtrack stack used by LPeg to track calls and choices. +(The default limit is 400.) Most well-written patterns need little backtrack levels and -therefore you seldom need to change this maximum; -but a few useful patterns may need more space. -Before changing this maximum you should try to rewrite your +therefore you seldom need to change this limit; +before changing it you should try to rewrite your pattern to avoid the need for extra space. +Nevertheless, a few useful patterns may overflow. +Also, with recursive grammars, +subjects with deep recursion may need larger limits.

@@ -574,8 +580,9 @@

Grammars

Captures

-A capture is a pattern that creates values -(the so called semantic information) when it matches. +A capture is a pattern that produces values +(the so-called semantic information) +according to what it matches. LPeg offers several kinds of captures, which produces values based on matches and combine these values to produce new values. @@ -629,10 +636,7 @@

Captures

-A capture pattern produces its values every time it succeeds. -For instance, -a capture inside a loop produces as many values as matched by the loop. -A capture produces a value only when it succeeds. +A capture pattern produces its values only when it succeeds. For instance, the pattern lpeg.C(lpeg.P"a"^-1) produces the empty string when there is no "a" @@ -640,14 +644,20 @@

Captures

while the pattern lpeg.C("a")^-1 does not produce any value when there is no "a" (because the pattern "a" fails). +A pattern inside a loop or inside a recursive structure +produces values for each match.

Usually, -LPeg evaluates all captures only after (and if) the entire match succeeds. -During match time it only gathers enough information -to produce the capture values later. -As a particularly important consequence, +LPeg does not specify when (and if) it evaluates its captures. +(As an example, +consider the pattern lpeg.P"a" / func / 0. +Because the "division" by 0 instructs LPeg to throw away the +results from the pattern, +LPeg may or may not call func.) +Therefore, captures should avoid side effects. +Moreover, most captures cannot affect the way a pattern matches a subject. The only exception to this rule is the so-called match-time capture. @@ -682,7 +692,8 @@

lpeg.Cb (name)

Creates a back capture. This pattern matches the empty string and produces the values produced by the most recent -group capture named name. +group capture named name +(where name can be any Lua value).

@@ -696,6 +707,12 @@

lpeg.Cb (name)

another complete capture.

+

+In the same way that LPeg does not specify when it evaluates captures, +it does not specify whether it reuses +values previously produced by the group +or re-evaluates them. +

lpeg.Cc ([value, ...])

@@ -762,7 +779,8 @@

lpeg.Cg (patt [, name])

It groups all values returned by patt into a single capture. The group may be anonymous (if no name is given) -or named with the given name. +or named with the given name +(which can be any non-nil Lua value).

@@ -801,7 +819,7 @@

lpeg.Cs (patt)

lpeg.Ct (patt)

Creates a table capture. -This capture creates a table and puts all values from all anonymous captures +This capture returns a table with all values from all anonymous captures made by patt inside this table in successive integer keys, starting at 1. Moreover, @@ -867,7 +885,8 @@

lpeg.Cmt(patt, function)

Creates a match-time capture. Unlike all other captures, -this one is evaluated immediately when a match occurs. +this one is evaluated immediately when a match occurs +(even if it is part of a larger pattern that fails later). It forces the immediate evaluation of all its nested captures and then calls function.

@@ -1375,13 +1394,20 @@

Arithmetic expressions

Download

LPeg -source code.

+source code.

+ +

+Probably, the easiest way to install LPeg is with +LuaRocks. +If you have LuaRocks installed, +the following command is all you need to install LPeg: +

$ luarocks install lpeg

License

-Copyright © 2014 Lua.org, PUC-Rio. +Copyright © 2007-2019 Lua.org, PUC-Rio.

Permission is hereby granted, free of charge, diff --git a/lpprint.c b/lpprint.c index 174d168..da18d05 100644 --- a/lpprint.c +++ b/lpprint.c @@ -37,13 +37,13 @@ void printcharset (const byte *st) { } -static void printcapkind (int kind) { +static const char *capkind (int kind) { const char *const modes[] = { "close", "position", "constant", "backref", "argument", "simple", "table", "function", "query", "string", "num", "substitution", "fold", "runtime", "group"}; - printf("%s", modes[kind]); + return modes[kind]; } @@ -56,30 +56,34 @@ void printinst (const Instruction *op, const Instruction *p) { const char *const names[] = { "any", "char", "set", "testany", "testchar", "testset", - "span", "behind", + "span", "utf-range", "behind", "ret", "end", "choice", "jmp", "call", "open_call", "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", - "fullcapture", "opencapture", "closecapture", "closeruntime" + "fullcapture", "opencapture", "closecapture", "closeruntime", + "--" }; printf("%02ld: %s ", (long)(p - op), names[p->i.code]); switch ((Opcode)p->i.code) { case IChar: { - printf("'%c'", p->i.aux); + printf("'%c' (%02x)", p->i.aux, p->i.aux); break; } case ITestChar: { - printf("'%c'", p->i.aux); printjmp(op, p); + printf("'%c' (%02x)", p->i.aux, p->i.aux); printjmp(op, p); + break; + } + case IUTFR: { + printf("%d - %d", p[1].offset, utf_to(p)); break; } case IFullCapture: { - printcapkind(getkind(p)); - printf(" (size = %d) (idx = %d)", getoff(p), p->i.key); + printf("%s (size = %d) (idx = %d)", + capkind(getkind(p)), getoff(p), p->i.key); break; } case IOpenCapture: { - printcapkind(getkind(p)); - printf(" (idx = %d)", p->i.key); + printf("%s (idx = %d)", capkind(getkind(p)), p->i.key); break; } case ISet: { @@ -124,8 +128,8 @@ void printpatt (Instruction *p, int n) { #if defined(LPEG_DEBUG) static void printcap (Capture *cap) { - printcapkind(cap->kind); - printf(" (idx: %d - size: %d) -> %p\n", cap->idx, cap->siz, cap->s); + printf("%s (idx: 
%d - size: %d) -> %p\n", + capkind(cap->kind), cap->idx, cap->siz, cap->s); } @@ -148,11 +152,11 @@ void printcaplist (Capture *cap, Capture *limit) { static const char *tagnames[] = { "char", "set", "any", - "true", "false", + "true", "false", "utf8.range", "rep", "seq", "choice", "not", "and", - "call", "opencall", "rule", "grammar", + "call", "opencall", "rule", "xinfo", "grammar", "behind", "capture", "run-time" }; @@ -160,6 +164,7 @@ static const char *tagnames[] = { void printtree (TTree *tree, int ident) { int i; + int sibs = numsiblings[tree->tag]; for (i = 0; i < ident; i++) printf(" "); printf("%s", tagnames[tree->tag]); switch (tree->tag) { @@ -176,24 +181,34 @@ void printtree (TTree *tree, int ident) { printf("\n"); break; } + case TUTFR: { + assert(sib1(tree)->tag == TXInfo); + printf(" %d (%02x %d) - %d (%02x %d) \n", + tree->u.n, tree->key, tree->cap, + sib1(tree)->u.n, sib1(tree)->key, sib1(tree)->cap); + break; + } case TOpenCall: case TCall: { - printf(" key: %d\n", tree->key); + assert(sib1(sib2(tree))->tag == TXInfo); + printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n); break; } case TBehind: { printf(" %d\n", tree->u.n); - printtree(sib1(tree), ident + 2); break; } case TCapture: { - printf(" cap: %d key: %d n: %d\n", tree->cap, tree->key, tree->u.n); - printtree(sib1(tree), ident + 2); + printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key); break; } case TRule: { - printf(" n: %d key: %d\n", tree->cap, tree->key); - printtree(sib1(tree), ident + 2); - break; /* do not print next rule as a sibling */ + printf(" key: %d\n", tree->key); + sibs = 1; /* do not print 'sib2' (next rule) as a sibling */ + break; + } + case TXInfo: { + printf(" n: %d\n", tree->u.n); + break; } case TGrammar: { TTree *rule = sib1(tree); @@ -203,18 +218,17 @@ void printtree (TTree *tree, int ident) { rule = sib2(rule); } assert(rule->tag == TTrue); /* sentinel */ + sibs = 0; /* siblings already handled */ break; } - default: { - int sibs = 
numsiblings[tree->tag]; + default: printf("\n"); - if (sibs >= 1) { - printtree(sib1(tree), ident + 2); - if (sibs >= 2) - printtree(sib2(tree), ident + 2); - } break; - } + } + if (sibs >= 1) { + printtree(sib1(tree), ident + 2); + if (sibs >= 2) + printtree(sib2(tree), ident + 2); } } diff --git a/lptree.c b/lptree.c index 4b9cf9c..5ced65d 100644 --- a/lptree.c +++ b/lptree.c @@ -21,11 +21,11 @@ /* number of siblings for each tree */ const byte numsiblings[] = { 0, 0, 0, /* char, set, any */ - 0, 0, /* true, false */ + 0, 0, 0, /* true, false, utf-range */ 1, /* rep */ 2, 2, /* seq, choice */ 1, 1, /* not, and */ - 0, 0, 2, 1, /* call, opencall, rule, grammar */ + 0, 0, 2, 1, 1, /* call, opencall, rule, prerule, grammar */ 1, /* behind */ 1, 1 /* capture, runtime capture */ }; @@ -64,7 +64,7 @@ static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t) { t->tag = TCall; t->u.ps = n - (t - g); /* position relative to node */ assert(sib2(t)->tag == TRule); - sib2(t)->key = t->key; + sib2(t)->key = t->key; /* fix rule's key */ } @@ -679,6 +679,56 @@ static int lp_range (lua_State *L) { } +/* +** Fills a tree node with basic information about the UTF-8 code point +** 'cpu': its value in 'n', its length in 'cap', and its first byte in +** 'key' +*/ +static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) { + int len, fb, cp; + cp = (int)cpu; + if (cp <= 0x7f) { /* one byte? */ + len = 1; + fb = cp; + } else if (cp <= 0x7ff) { + len = 2; + fb = 0xC0 | (cp >> 6); + } else if (cp <= 0xffff) { + len = 3; + fb = 0xE0 | (cp >> 12); + } + else { + luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point"); + len = 4; + fb = 0xF0 | (cp >> 18); + } + t->u.n = cp; + t->cap = len; + t->key = fb; +} + + +static int lp_utfr (lua_State *L) { + lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1); + lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); + luaL_argcheck(L, from <= to, 2, "empty range"); + if (to <= 0x7f) { /* ascii range? 
*/ + TTree *tree = newcharset(L); /* code it as a regular charset */ + unsigned int f; + for (f = (int)from; f <= to; f++) + setchar(treebuffer(tree), f); + } + else { /* multi-byte utf-8 range */ + TTree *tree = newtree(L, 2); + tree->tag = TUTFR; + codeutftree(L, tree, from, 1); + sib1(tree)->tag = TXInfo; + codeutftree(L, sib1(tree), to, 2); + } + return 1; +} + + /* ** Look-behind predicate */ @@ -723,6 +773,7 @@ static int capture_aux (lua_State *L, int cap, int labelidx) { /* ** Fill a tree with an empty capture, using an empty (TTrue) sibling. +** (The 'key' field must be filled by the caller to finish the tree.) */ static TTree *auxemptycap (TTree *tree, int cap) { tree->tag = TCapture; @@ -733,15 +784,17 @@ static TTree *auxemptycap (TTree *tree, int cap) { /* -** Create a tree for an empty capture +** Create a tree for an empty capture. */ -static TTree *newemptycap (lua_State *L, int cap) { - return auxemptycap(newtree(L, 2), cap); +static TTree *newemptycap (lua_State *L, int cap, int key) { + TTree *tree = auxemptycap(newtree(L, 2), cap); + tree->key = key; + return tree; } /* -** Create a tree for an empty capture with an associated Lua value +** Create a tree for an empty capture with an associated Lua value. 
*/ static TTree *newemptycapkey (lua_State *L, int cap, int idx) { TTree *tree = auxemptycap(newtree(L, 2), cap); @@ -802,16 +855,15 @@ static int lp_simplecapture (lua_State *L) { static int lp_poscapture (lua_State *L) { - newemptycap(L, Cposition); + newemptycap(L, Cposition, 0); return 1; } static int lp_argcapture (lua_State *L) { int n = (int)luaL_checkinteger(L, 1); - TTree *tree = newemptycap(L, Carg); - tree->key = n; luaL_argcheck(L, 0 < n && n <= SHRT_MAX, 1, "invalid argument index"); + newemptycap(L, Carg, n); return 1; } @@ -911,7 +963,7 @@ static int collectrules (lua_State *L, int arg, int *totalsize) { int size; /* accumulator for total size */ lua_newtable(L); /* create position table */ getfirstrule(L, arg, postab); - size = 2 + getsize(L, postab + 2); /* TGrammar + TRule + rule */ + size = 3 + getsize(L, postab + 2); /* TGrammar + TRule + TXInfo + rule */ lua_pushnil(L); /* prepare to traverse grammar table */ while (lua_next(L, arg) != 0) { if (lua_tonumber(L, -2) == 1 || @@ -925,11 +977,11 @@ static int collectrules (lua_State *L, int arg, int *totalsize) { lua_pushvalue(L, -2); /* push key (to insert into position table) */ lua_pushinteger(L, size); lua_settable(L, postab); - size += 1 + getsize(L, -1); /* update size */ + size += 2 + getsize(L, -1); /* add 'TRule + TXInfo + rule' to size */ lua_pushvalue(L, -2); /* push key (for next lua_next) */ n++; } - *totalsize = size + 1; /* TTrue to finish list of rules */ + *totalsize = size + 1; /* space for 'TTrue' finishing list of rules */ return n; } @@ -941,12 +993,14 @@ static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) { int ridx = frule + 2*i + 1; /* index of i-th rule */ int rulesize; TTree *rn = gettree(L, ridx, &rulesize); + TTree *pr = sib1(nd); /* points to rule's prerule */ nd->tag = TRule; - nd->key = 0; - nd->cap = i; /* rule number */ + nd->key = 0; /* will be fixed when rule is used */ + pr->tag = TXInfo; + pr->u.n = i; /* rule number */ nd->lr = 0; - 
nd->u.ps = rulesize + 1; /* point to next rule */ - memcpy(sib1(nd), rn, rulesize * sizeof(TTree)); /* copy rule */ + nd->u.ps = rulesize + 2; /* point to next rule */ + memcpy(sib1(pr), rn, rulesize * sizeof(TTree)); /* copy rule */ mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */ nd = sib2(nd); /* move to next rule */ } @@ -977,7 +1031,12 @@ static int checkloops (TTree *tree) { } -static int verifyerror (lua_State *L, int *passed, int npassed) { +/* +** Give appropriate error message for 'verifyrule'. If a rule appears +** twice in 'passed', there is path from it back to itself without +** advancing the subject. +*/ +static int verifyerror (lua_State *L, unsigned short *passed, int npassed) { int i, j; for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */ for (j = i - 1; j >= 0; j--) { @@ -999,14 +1058,16 @@ static int verifyerror (lua_State *L, int *passed, int npassed) { ** is only relevant if the first is nullable. ** Parameter 'nb' works as an accumulator, to allow tail calls in ** choices. ('nb' true makes function returns true.) +** Parameter 'passed' is a list of already visited rules, 'npassed' +** counts the elements in 'passed'. ** Assume ktable at the top of the stack. 
*/ -static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, - int nb) { +static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed, + int npassed, int nb) { tailcall: switch (tree->tag) { case TChar: case TSet: case TAny: - case TFalse: + case TFalse: case TUTFR: return nb; /* cannot pass from here */ case TTrue: case TBehind: /* look-behind cannot have calls */ @@ -1014,7 +1075,7 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, case TNot: case TAnd: case TRep: /* return verifyrule(L, sib1(tree), passed, npassed, 1); */ tree = sib1(tree); nb = 1; goto tailcall; - case TCapture: case TRunTime: + case TCapture: case TRunTime: case TXInfo: /* return verifyrule(L, sib1(tree), passed, npassed, nb); */ tree = sib1(tree); goto tailcall; case TCall: @@ -1033,10 +1094,10 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, /* return verifyrule(L, sib2(tree), passed, npassed, nb); */ tree = sib2(tree); goto tailcall; case TRule: - if (npassed >= MAXRULES) - return verifyerror(L, passed, npassed); + if (npassed >= MAXRULES) /* too many steps? 
*/ + return verifyerror(L, passed, npassed); /* error */ else { - passed[npassed++] = tree->key; + passed[npassed++] = tree->key; /* add rule to path */ /* return verifyrule(L, sib1(tree), passed, npassed); */ tree = sib1(tree); goto tailcall; } @@ -1066,7 +1127,7 @@ static void findleftrecursivecalls (TTree *tree) { } static void verifygrammar (lua_State *L, TTree *grammar) { - int passed[MAXRULES]; + unsigned short passed[MAXRULES]; TTree *rule; /* check left-recursive rules */ for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { @@ -1217,12 +1278,6 @@ static int lp_setmax (lua_State *L) { } -static int lp_version (lua_State *L) { - lua_pushstring(L, VERSION); - return 1; -} - - static int lp_type (lua_State *L) { if (testpattern(L, 1)) lua_pushliteral(L, "pattern"); @@ -1291,8 +1346,9 @@ static struct luaL_Reg pattreg[] = { {"P", lp_P}, {"S", lp_set}, {"R", lp_range}, + {"utfR", lp_utfr}, {"locale", lp_locale}, - {"version", lp_version}, + {"version", NULL}, {"setmaxstack", lp_setmax}, {"type", lp_type}, {NULL, NULL} @@ -1321,6 +1377,8 @@ int luaopen_lpeg (lua_State *L) { luaL_newlib(L, pattreg); lua_pushvalue(L, -1); lua_setfield(L, -3, "__index"); + lua_pushliteral(L, "LPeg " VERSION); + lua_setfield(L, -2, "version"); return 1; } diff --git a/lptree.h b/lptree.h index 96567ab..5907fb2 100644 --- a/lptree.h +++ b/lptree.h @@ -13,31 +13,40 @@ ** types of trees */ typedef enum TTag { - TChar = 0, TSet, TAny, /* standard PEG elements */ - TTrue, TFalse, - TRep, - TSeq, TChoice, - TNot, TAnd, - TCall, - TOpenCall, - TRule, /* sib1 is rule's pattern, sib2 is 'next' rule */ - TGrammar, /* sib1 is initial (and first) rule */ - TBehind, /* match behind */ - TCapture, /* regular capture */ - TRunTime /* run-time capture */ + TChar = 0, /* 'n' = char */ + TSet, /* the set is stored in next CHARSETSIZE bytes */ + TAny, + TTrue, + TFalse, + TUTFR, /* range of UTF-8 codepoints; 'n' has initial codepoint; + 'cap' has length; 'key' has first byte; + extra info 
is similar for end codepoint */ + TRep, /* 'sib1'* */ + TSeq, /* 'sib1' 'sib2' */ + TChoice, /* 'sib1' / 'sib2' */ + TNot, /* !'sib1' */ + TAnd, /* &'sib1' */ + TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ + TOpenCall, /* ktable[key] is rule's key */ + TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); + 'sib1' is rule's pattern pre-rule; 'sib2' is next rule; + extra info 'n' is rule's sequential number */ + TXInfo, /* extra info */ + TGrammar, /* 'sib1' is initial (and first) rule */ + TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ + TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); + ktable[key] is Lua value associated with capture; + 'sib1' is capture body */ + TRunTime /* run-time capture: 'key' is Lua function; + 'sib1' is capture body */ } TTag; -/* number of siblings for each tree */ -extern const byte numsiblings[]; - /* ** Tree trees -** The first sibling of a tree (if there is one) is immediately after -** the tree. A reference to a second sibling (ps) is its position -** relative to the position of the tree itself. A key in ktable -** uses the (unique) address of the original tree that created that -** entry. NULL means no data. +** The first child of a tree (if there is one) is immediately after +** the tree. A reference to a second child (ps) is its position +** relative to the position of the tree itself. 
*/ typedef struct TTree { byte tag; @@ -45,7 +54,7 @@ typedef struct TTree { byte lr; unsigned short key; /* key in ktable for Lua data (0 if no key) */ union { - int ps; /* occasional second sibling */ + int ps; /* occasional second child */ int n; /* occasional counter */ } u; } TTree; @@ -62,10 +71,10 @@ typedef struct Pattern { } Pattern; -/* number of siblings for each tree */ +/* number of children for each tree */ extern const byte numsiblings[]; -/* access to siblings */ +/* access to children */ #define sib1(t) ((t) + 1) #define sib2(t) ((t) + (t)->u.ps) diff --git a/lptypes.h b/lptypes.h index 50578dc..b0969f7 100644 --- a/lptypes.h +++ b/lptypes.h @@ -1,7 +1,7 @@ /* ** $Id: lptypes.h,v 1.14 2015/09/28 17:17:41 roberto Exp $ ** LPeg - PEG pattern matching for Lua -** Copyright 2007-2015, Lua.org & PUC-Rio (see 'lpeg.html' for license) +** Copyright 2007-2019, Lua.org & PUC-Rio (see 'lpeg.html' for license) ** written by Roberto Ierusalimschy */ @@ -9,17 +9,13 @@ #define lptypes_h -#if !defined(LPEG_DEBUG) -#define NDEBUG -#endif - #include #include #include "lua.h" -#define VERSION "1.0.0" +#define VERSION "1.1.0 LR" #define PATTERN_T "lpeg-pattern" @@ -41,6 +37,8 @@ #define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) #define luaL_newlib(L,f) luaL_register(L,"lpeg",f) +typedef size_t lua_Unsigned; + #endif @@ -55,7 +53,7 @@ #endif -/* maximum number of rules in a grammar */ +/* maximum number of rules in a grammar (limited by 'unsigned short') */ #if !defined(MAXRULES) #define MAXRULES 1000 #endif diff --git a/lpvm.c b/lpvm.c index 4b32a4b..79153a4 100644 --- a/lpvm.c +++ b/lpvm.c @@ -28,6 +28,35 @@ static const Instruction giveup = {{IGiveup, 0, 0}}; +/* +** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. 
+*/ +static const char *utf8_decode (const char *o, int *val) { + static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu}; + const unsigned char *s = (const unsigned char *)o; + unsigned int c = s[0]; /* first byte */ + unsigned int res = 0; /* final result */ + if (c < 0x80) /* ascii? */ + res = c; + else { + int count = 0; /* to count number of continuation bytes */ + while (c & 0x40) { /* still have continuation bytes? */ + int cc = s[++count]; /* read next byte */ + if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ + return NULL; /* invalid byte sequence */ + res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ + c <<= 1; /* to test next bit */ + } + res |= (c & 0x7F) << (count * 5); /* add first byte */ + if (count > 3 || res > 0x10FFFFu || res <= limits[count]) + return NULL; /* invalid byte sequence */ + s += count; /* skip continuation bytes read */ + } + *val = res; + return (const char *)s + 1; /* +1 to include first byte */ +} + + /* ** {====================================================== ** Virtual Machine @@ -48,18 +77,38 @@ typedef struct Stack { /* -** Double the size of the array of captures +** Ensures the size of array 'capture' (with size '*capsize' and +** 'captop' elements being used) is enough to accomodate 'n' extra +** elements plus one. (Because several opcodes add stuff to the capture +** array, it is simpler to ensure the array always has at least one free +** slot upfront and check its size later.) 
*/ -static Capture *doublecap (lua_State *L, Capture *cap, int captop, int ptop, int capstackptr) { - Capture *newc; - if (captop >= INT_MAX/((int)sizeof(Capture) * 2)) - luaL_error(L, "too many captures"); - newc = (Capture *)lua_newuserdata(L, captop * 2 * sizeof(Capture)); - memcpy(newc, cap, captop * sizeof(Capture)); - lua_replace(L, caplistidx(ptop)); - lua_pushvalue(L, caplistidx(ptop)); // update capture base in Capture Stack - lua_rawseti(L, caplistsidx(ptop), capstackptr); - return newc; + +/* new size in number of elements cannot overflow integers, and new + size in bytes cannot overflow size_t. */ +#define MAXNEWSIZE \ + (((size_t)INT_MAX) <= (~(size_t)0 / sizeof(Capture)) ? \ + ((size_t)INT_MAX) : (~(size_t)0 / sizeof(Capture))) + +static Capture *growcap (lua_State *L, Capture *capture, int *capsize, + int captop, int n, int ptop) { + if (*capsize - captop > n) + return capture; /* no need to grow array */ + else { /* must grow */ + Capture *newc; + unsigned int newsize = captop + n + 1; /* minimum size needed */ + if (newsize < MAXNEWSIZE / 2) + newsize *= 2; /* twice that size, if not too big */ + else if (newsize < (MAXNEWSIZE / 9) * 8) + newsize += newsize / 8; /* else, try 9/8 that size */ + else + luaL_error(L, "too many captures"); + newc = (Capture *)lua_newuserdata(L, newsize * sizeof(Capture)); + memcpy(newc, capture, captop * sizeof(Capture)); + *capsize = newsize; + lua_replace(L, caplistidx(ptop)); + return newc; + } } /* @@ -136,24 +185,24 @@ static int resdyncaptures (lua_State *L, int fr, int curr, int limit) { /* -** Add capture values returned by a dynamic capture to the capture list -** 'base', nested inside a group capture. 'fd' indexes the first capture -** value, 'n' is the number of values (at least 1). +** Add capture values returned by a dynamic capture to the list +** 'capture', nested inside a group. 'fd' indexes the first capture +** value, 'n' is the number of values (at least 1). 
The open group +** capture is already in 'capture', before the place for the new entries. */ -static void adddyncaptures (const char *s, Capture *base, int n, int fd) { +static void adddyncaptures (const char *s, Capture *capture, int n, int fd) { int i; - /* Cgroup capture is already there */ - assert(base[0].kind == Cgroup && base[0].siz == 0); - base[0].idx = 0; /* make it an anonymous group */ - for (i = 1; i <= n; i++) { /* add runtime captures */ - base[i].kind = Cruntime; - base[i].siz = 1; /* mark it as closed */ - base[i].idx = fd + i - 1; /* stack index of capture value */ - base[i].s = s; + assert(capture[-1].kind == Cgroup && capture[-1].siz == 0); + capture[-1].idx = 0; /* make group capture an anonymous group */ + for (i = 0; i < n; i++) { /* add runtime captures */ + capture[i].kind = Cruntime; + capture[i].siz = 1; /* mark it as closed */ + capture[i].idx = fd + i; /* stack index of capture value */ + capture[i].s = s; } - base[i].kind = Cclose; /* close group */ - base[i].siz = 1; - base[i].s = s; + capture[n].kind = Cclose; /* close group */ + capture[n].siz = 1; + capture[n].s = s; } @@ -228,7 +277,7 @@ static int removecapturesfromstack (lua_State *L, int capstacktop, int ptop) { /* ** */ -static void putcapturestolambda (lua_State *L, int ndyncap, int captop, int capstacktop, int ptop) { +static void putcapturestolambda (lua_State *L, int ndyncap, int captop, int ptop) { int i; lua_pushvalue(L, caplistidx(ptop)); lua_setfield(L,-2,"commitcap"); @@ -248,7 +297,7 @@ static void putcapturestolambda (lua_State *L, int ndyncap, int captop, int caps /* ** */ -static Capture * addcapturesfromlambda (lua_State *L, int lambdaindex, Capture * capture, int *ndyncap, int *captop, int *capsize, int capstacktop, int ptop) { +static Capture * addcapturesfromlambda (lua_State *L, int lambdaindex, Capture * capture, int *ndyncap, int *captop, int *capsize, int ptop) { int i, commitdyncapcount, commitcaptop; Capture * commitcapture; lua_pushinteger(L, 
lambdaindex); @@ -272,8 +321,7 @@ static Capture * addcapturesfromlambda (lua_State *L, int lambdaindex, Capture * *ndyncap += commitdyncapcount; if (commitcaptop > 0) { if (*captop + commitcaptop >= *capsize) { - capture = doublecap(L, capture, *captop + commitcaptop, ptop, capstacktop); - *capsize = 2 * (*captop + commitcaptop); + capture = growcap(L, capture, capsize, *captop + commitcaptop, 0, ptop); } memcpy(capture + *captop, commitcapture, commitcaptop * sizeof(Capture)); *captop += commitcaptop; @@ -323,10 +371,11 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, capstack->capsize = capsize; for (;;) { #if defined(DEBUG) + printf("-------------------------------------\n"); + printcaplist(capture, capture + captop); printf("s: |%s| stck:%d, dyncaps:%d, caps:%d ", - s, stack - getstackbase(L, ptop), ndyncap, captop); + s, (int)(stack - getstackbase(L, ptop)), ndyncap, captop); printinst(op, p); - printcaplist(capture, capture + captop); #endif assert(dyncaplistidx(ptop) + ndyncap == lua_gettop(L) && ndyncap <= captop); switch ((Opcode)p->i.code) { @@ -357,7 +406,7 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, lua_gettable(L, lambdaidx(ptop)); lua_pushinteger(L,(stack - 1)->X == (char*)LRFAIL ? 
LRFAIL : (stack - 1)->X - o); lua_setfield(L,-2,"X"); - putcapturestolambda (L, ndyncap, captop, capstacktop, ptop); + putcapturestolambda (L, ndyncap, captop, ptop); lua_pop(L,1); if (ndyncap > 0) lua_pop(L, ndyncap); @@ -382,7 +431,7 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, capture = getcapturesfromstack (L, ndyncap, newdyncap, capstacktop, ptop); ndyncap = newdyncap; lambdaindex = (stack->pA - op) * maxpointer + (stack->s - o); - capture = addcapturesfromlambda (L, lambdaindex, capture, &ndyncap, &captop, &capsize, capstacktop, ptop); + capture = addcapturesfromlambda (L, lambdaindex, capture, &ndyncap, &captop, &capsize, ptop); clearlambdaitem (L, lambdaindex, ptop); } } @@ -393,6 +442,17 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, else goto fail; continue; } + case IUTFR: { + int codepoint; + if (s >= e) + goto fail; + s = utf8_decode (s, &codepoint); + if (s && p[1].offset <= codepoint && codepoint <= utf_to(p)) + p += 2; + else + goto fail; + continue; + } case ITestAny: { if (s < e) p += 2; else p += getoffset(p); @@ -507,7 +567,7 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, goto fail; else // rule lvar.4 { - capture = addcapturesfromlambda (L, lambdaindex, capture, &ndyncap, &captop, &capsize, capstacktop, ptop); + capture = addcapturesfromlambda (L, lambdaindex, capture, &ndyncap, &captop, &capsize, ptop); p += 2; s = o + X_X; } @@ -538,7 +598,7 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, case IFailTwice: assert(stack > getstackbase(L, ptop)); stack--; - /* go through */ + /* FALLTHROUGH */ case IFail: fail: { /* pattern failed: try to backtrack */ const char* X; @@ -562,6 +622,9 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, if (ndyncap > 0) /* is there matchtime captures? 
*/ ndyncap -= removedyncap(L, capture, stack->caplevel, captop); p = stack->p; +#if defined(DEBUG) + printf("**FAIL**\n"); +#endif if (X) // rule inc.2 { int lambdaindex; @@ -574,7 +637,7 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, capture = getcapturesfromstack (L, ndyncap, newdyncap, capstacktop, ptop); ndyncap = newdyncap; lambdaindex = (stack->pA - op) * maxpointer + (stack->s - o); - capture = addcapturesfromlambda (L, lambdaindex, capture, &ndyncap, &captop, &capsize, capstacktop, ptop); + capture = addcapturesfromlambda (L, lambdaindex, capture, &ndyncap, &captop, &capsize, ptop); clearlambdaitem (L, lambdaindex, ptop); } else @@ -585,23 +648,27 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, CapState cs; int rem, res, n; int fr = lua_gettop(L) + 1; /* stack index of first result */ - cs.s = o; cs.L = L; cs.ocap = capture; cs.ptop = ptop; + cs.reclevel = 0; cs.L = L; + cs.s = o; cs.ocap = capture; cs.ptop = ptop; n = runtimecap(&cs, capture + captop, s, &rem); /* call function */ captop -= n; /* remove nested captures */ + ndyncap -= rem; /* update number of dynamic captures */ fr -= rem; /* 'rem' items were popped from Lua stack */ res = resdyncaptures(L, fr, s - o, e - o); /* get result */ if (res == -1) /* fail? */ goto fail; s = o + res; /* else update current position */ n = lua_gettop(L) - fr + 1; /* number of new captures */ - ndyncap += n - rem; /* update number of dynamic captures */ - if (n > 0) { /* any new capture? */ - if ((captop += n + 2) >= capsize) { - capture = doublecap(L, capture, captop, ptop, capstacktop); - capsize = 2 * captop; - } - /* add new captures to 'capture' list */ - adddyncaptures(s, capture + captop - n - 2, n, fr); + ndyncap += n; /* update number of dynamic captures */ + if (n == 0) /* no new captures? 
*/ + captop--; /* remove open group */ + else { /* new captures; keep original open group */ + if (fr + n >= SHRT_MAX) + luaL_error(L, "too many results in match-time capture"); + /* add new captures + close group to 'capture' list */ + capture = growcap(L, capture, &capsize, captop, n + 1, ptop); + adddyncaptures(s, capture + captop, n, fr); + captop += n + 1; /* new captures + close group */ } p++; continue; @@ -633,10 +700,8 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, pushcapture: { capture[captop].idx = p->i.key; capture[captop].kind = getkind(p); - if (++captop >= capsize) { - capture = doublecap(L, capture, captop, ptop, capstacktop); - capsize = 2 * captop; - } + captop++; + capture = growcap(L, capture, &capsize, captop, 0, ptop); p++; continue; } diff --git a/lpvm.h b/lpvm.h index 757b9e1..3fd5749 100644 --- a/lpvm.h +++ b/lpvm.h @@ -17,6 +17,7 @@ typedef enum Opcode { ITestChar, /* if char != aux, jump to 'offset' */ ITestSet, /* if char not in buff, jump to 'offset' */ ISpan, /* read a span of chars in buff */ + IUTFR, /* if codepoint not in range [offset, utf_to], fail */ IBehind, /* walk back 'aux' characters (fail if not possible) */ IRet, /* return from a rule */ IEnd, /* end of pattern */ @@ -26,14 +27,15 @@ typedef enum Opcode { IOpenCall, /* call rule number 'key' (must be closed to a ICall) */ ICommit, /* pop choice and jump to 'offset' */ IPartialCommit, /* update top choice to current position and jump */ - IBackCommit, /* "fails" but jump to its own 'offset' */ + IBackCommit, /* backtrack like "fail" but jump to its own 'offset' */ IFailTwice, /* pop one choice and then fail */ IFail, /* go back to saved state on choice and jump to saved offset */ IGiveup, /* internal use */ IFullCapture, /* complete capture of last 'off' chars */ IOpenCapture, /* start a capture */ ICloseCapture, - ICloseRunTime + ICloseRunTime, + IEmpty /* to fill empty slots left by optimizations */ } Opcode; @@ -49,6 +51,10 @@ typedef 
union Instruction { } Instruction; +/* extract 24-bit value from an instruction */ +#define utf_to(inst) (((inst)->i.key << 8) | (inst)->i.aux) + + void printpatt (Instruction *p, int n); const char *match (lua_State *L, const char *o, const char *s, const char *e, Instruction *op, Capture *capture, int ptop); diff --git a/makefile b/makefile index 7a8463e..73d2f1a 100644 --- a/makefile +++ b/makefile @@ -29,11 +29,11 @@ FILES = lpvm.o lpcap.o lptree.o lpcode.o lpprint.o # For Linux linux: - make lpeg.so "DLLFLAGS = -shared -fPIC" + $(MAKE) lpeg.so "DLLFLAGS = -shared -fPIC" # For Mac OS macosx: - make lpeg.so "DLLFLAGS = -bundle -undefined dynamic_lookup" + $(MAKE) lpeg.so "DLLFLAGS = -bundle -undefined dynamic_lookup" lpeg.so: $(FILES) env $(CC) $(DLLFLAGS) $(FILES) -o lpeg.so diff --git a/re.html b/re.html index 4717ec2..b11477c 100644 --- a/re.html +++ b/re.html @@ -93,6 +93,8 @@

The re Module

equivalent to p / defs[name] p => name match-time capture equivalent to lpeg.Cmt(p, defs[name]) +p ~> name fold capture +equivalent to lpeg.Cf(p, defs[name]) & p and predicate ! p not predicate p1 p2 concatenation @@ -296,7 +298,7 @@

Abstract Syntax Trees

a tag field telling what non terminal that table represents. We can add such a tag using -named group captures: +named group captures:

 x = re.compile[[
@@ -406,7 +408,7 @@ 

Patterns

p = [=[ pattern <- exp !. -exp <- S (alternative / grammar) +exp <- S (grammar / alternative) alternative <- seq ('/' S seq)* seq <- prefix* @@ -421,6 +423,7 @@

Patterns

/ '=' name / '{}' / '{~' exp '~}' + / '{|' exp '|}' / '{' exp '}' / '.' / name S !arrow @@ -434,7 +437,7 @@

Patterns

range <- . '-' [^]] S <- (%s / '--' [^%nl]*)* -- spaces and comments -name <- [A-Za-z][A-Za-z0-9_]* +name <- [A-Za-z_][A-Za-z0-9_]* arrow <- '<-' num <- [0-9]+ string <- '"' [^"]* '"' / "'" [^']* "'" @@ -450,7 +453,7 @@

Patterns

License

-Copyright © 2008-2010 Lua.org, PUC-Rio. +Copyright © 2008-2015 Lua.org, PUC-Rio.

Permission is hereby granted, free of charge, diff --git a/re.lua b/re.lua index 1d8e159..77a4af8 100644 --- a/re.lua +++ b/re.lua @@ -71,13 +71,6 @@ updatelocale() local I = m.P(function (s,i) print(i, s:sub(1, i-1)); return i end) -local function getdef (id, defs) - local c = defs and defs[id] - if not c then error("undefined name: " .. id) end - return c -end - - local function patt_error (s, i) local msg = (#s < i + 20) and s:sub(i) or s:sub(i,i+20) .. "..." @@ -116,6 +109,20 @@ name = m.C(name) -- a defined name only have meaning in a given environment local Def = name * m.Carg(1) + +local function getdef (id, defs) + local c = defs and defs[id] + if not c then error("undefined name: " .. id) end + return c +end + +-- match a name and return a group of its corresponding definition +-- and 'f' (to be folded in 'Suffix') +local function defwithfunc (f) + return m.Cg(Def / getdef * m.Cc(f)) +end + + local num = m.C(m.R"09"^1) * S / tonumber local String = "'" * m.C((any - "'")^0) * "'" + @@ -130,7 +137,7 @@ end local Range = m.Cs(any * (m.P"-"/"") * (any - "]")) / mm.R -local item = defined + Range + m.C(any) +local item = (defined + Range + m.C(any)) / m.P local Class = "[" @@ -176,9 +183,10 @@ local exp = m.P{ "Exp", ) + "->" * S * ( m.Cg((String + num) * m.Cc(mt.__div)) + m.P"{}" * m.Cc(nil, m.Ct) - + m.Cg(Def / getdef * m.Cc(mt.__div)) + + defwithfunc(mt.__div) ) - + "=>" * S * m.Cg(Def / getdef * m.Cc(m.Cmt)) + + "=>" * S * defwithfunc(m.Cmt) + + "~>" * S * defwithfunc(m.Cf) ) * S )^0, function (a,b,f) return f(a,b) end ); Primary = "(" * m.V"Exp" * ")" diff --git a/test.lua b/test.lua index 7b685bb..1770294 100644 --- a/test.lua +++ b/test.lua @@ -4,8 +4,12 @@ -- require"strict" -- just to be pedantic +print(package.path, package.cpath) +package.path = './?.lua;' .. package.path +package.cpath = './?.so;' .. 
package.cpath local m = require"lpeg" +print(m.version) -- for general use local a, b, c, d, e, f, g, p, t @@ -48,8 +52,8 @@ end print"General tests for LPeg library" -assert(type(m.version()) == "string") -print("version " .. m.version()) +assert(type(m.version) == "string") +print(m.version) assert(m.type("alo") ~= "pattern") assert(m.type(io.input) ~= "pattern") assert(m.type(m.P"alo") == "pattern") @@ -202,6 +206,14 @@ do end +-- bug: loop in 'hascaptures' +do + local p = m.C(-m.P{m.P'x' * m.V(1) + m.P'y'}) + assert(p:match("xxx") == "") +end + + + -- test for small capture boundary for i = 250,260 do assert(#m.match(m.C(i), string.rep('a', i)) == i) @@ -398,7 +410,7 @@ assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc') do - -- large dynamic Cc + print "testing large dynamic Cc" local lim = 2^16 - 1 local c = 0 local function seq (n) @@ -416,6 +428,16 @@ do end +do + -- nesting of captures too deep + local p = m.C(1) + for i = 1, 300 do + p = m.Ct(p) + end + checkerr("too deep", p.match, p, "x") +end + + -- tests for non-pattern as arguments to pattern functions p = { ('a' * m.V(1))^-1 } * m.P'b' * { 'a' * m.V(2); m.V(1)^-1 } @@ -517,6 +539,27 @@ assert(m.match(m.Cs((#((#m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") assert(m.match(m.Cs((- -m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") assert(m.match(m.Cs((-((-m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") + +-- fixed length +do + -- 'and' predicate using fixed length + local p = m.C(#("a" * (m.P("bd") + "cd")) * 2) + assert(p:match("acd") == "ac") + + p = #m.P{ "a" * m.V(2), m.P"b" } * 2 + assert(p:match("abc") == 3) + + p = #(m.P"abc" * m.B"c") + assert(p:match("abc") == 1 and not p:match("ab")) + + p = m.P{ "a" * m.V(2), m.P"b"^1 } + checkerr("pattern may not have fixed length", m.B, p) + + p = "abc" * (m.P"b"^1 + m.P"a"^0) + checkerr("pattern may not have fixed length", m.B, p) +end + + p = -m.P'a' * m.Cc(1) + -m.P'b' * m.Cc(2) + -m.P'c' * m.Cc(3) 
assert(p:match('a') == 2 and p:match('') == 1 and p:match('b') == 1) @@ -817,7 +860,7 @@ s = string.rep('a', l) .. string.rep('b', l) .. string.rep('c', l) p = (m.C(m.P'a'^1) * m.C(m.P'b'^1) * m.C(m.P'c'^1)) / '%3%2%1' assert(p:match(s) == string.rep('c', l) .. - string.rep('b', l) .. + string.rep('b', l) .. string.rep('a', l)) print"+" @@ -946,10 +989,10 @@ for i = 1, 10 do assert(p:match("aaaaaaaaaaa") == 11 - i + 1) end -print"+" --- tests for back references +print "testing back references" + checkerr("back reference 'x' not found", m.match, m.Cb('x'), '') checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a') @@ -993,6 +1036,17 @@ local function id (s, i, ...) return true, ... end +do -- run-time capture in an end predicate (should discard its value) + local x = 0 + function foo (s, i) + x = x + 1 + return true, x + end + + local p = #(m.Cmt("", foo) * "xx") * m.Cmt("", foo) + assert(p:match("xx") == 2) +end + assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) + m.R'09'^1 / string.char + m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y") @@ -1011,8 +1065,8 @@ assert(#x == 500) local function id(s, i, x) if x == 'a' then return i, 1, 3, 7 else return nil, 2, 4, 6, 8 - end -end + end +end p = ((m.P(id) * 1 + m.Cmt(2, id) * 1 + m.Cmt(1, id) * 1))^0 assert(table.concat{p:match('abababab')} == string.rep('137', 4)) @@ -1098,6 +1152,32 @@ do assert(c == 11) end + +-- Return a match-time capture that returns 'n' captures +local function manyCmt (n) + return m.Cmt("a", function () + local a = {}; for i = 1, n do a[i] = n - i end + return true, unpack(a) + end) +end + +-- bug in 1.0: failed match-time that used previous match-time results +do + local x + local function aux (...) 
x = #{...}; return false end + local res = {m.match(m.Cmt(manyCmt(20), aux) + manyCmt(10), "a")} + assert(#res == 10 and res[1] == 9 and res[10] == 0) +end + + +-- bug in 1.0: problems with math-times returning too many captures +do + local lim = 2^11 - 10 + local res = {m.match(manyCmt(lim), "a")} + assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) + checkerr("too many", m.match, manyCmt(2^15), "a") +end + p = (m.P(function () return true, "a" end) * 'a' + m.P(function (s, i) return i, "aa", 20 end) * 'b' + m.P(function (s,i) if i <= #s then return i, "aaa" end end) * 1)^0 @@ -1106,9 +1186,85 @@ t = {p:match('abacc')} checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}) +do print"testing large grammars" + local lim = 1000 -- number of rules + local t = {} + + for i = 3, lim do + t[i] = m.V(i - 1) -- each rule calls previous one + end + t[1] = m.V(lim) -- start on last rule + t[2] = m.C("alo") -- final rule + + local P = m.P(t) -- build grammar + assert(P:match("alo") == "alo") + + t[#t + 1] = m.P("x") -- one more rule... + checkerr("too many rules", m.P, t) +end + + +print "testing UTF-8 ranges" + +do -- a few typical UTF-8 ranges + local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0" + + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0" + + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0" + + m.utfR(0, 0x7f)^1 / "ascii: %0" + + m.utfR(0, 0x10ffff) / "other: %0" + + p = m.Ct(p^0) * -m.P(1) + + local cyr = "ждюя" + local emot = "\240\159\152\128\240\159\153\128" -- 😀🙀 + local cjk = "专举乸" + local ascii = "alo" + local last = "\244\143\191\191" -- U+10FFFF + + local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last + t = (p:match(s)) + + assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and + t[3] == "emot: " .. emot and t[4] == "other: —" and + t[5] == "cjk: " .. cjk and t[6] == "other: —" and + t[7] == "ascii: " .. ascii and t[8] == "other: " .. 
last and + t[9] == nil) +end + + +do -- valid and invalid code points + local p = m.utfR(0, 0x10ffff)^0 + assert(p:match("汉字\128") == #"汉字" + 1) + assert(p:match("\244\159\191") == 1) + assert(p:match("\244\159\191\191") == 1) + assert(p:match("\255") == 1) + + -- basic errors + checkerr("empty range", m.utfR, 1, 0) + checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1) +end + + +do -- look-behind (fixed width) + -- match a byte after a CJK point + local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1) + p = m.P{ p + m.P(1) * m.V(1) } -- search for 'p' + assert(p:match("ab д 专X x") == "X") + + -- match a byte after a Hebrew point + local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1) + p = m.P(#"ש") * p + assert(p:match("שX") == "X") + + checkerr("fixed length", m.B, m.utfR(0, 0x10ffff)) +end + + + ------------------------------------------------------------------- -- Tests for 're' module ------------------------------------------------------------------- +print"testing 're' module" local re = require "re" @@ -1131,6 +1287,9 @@ assert(not match("abbcde", " [b-z] + ")) assert(match("abb\"de", '"abb"["]"de"') == 7) assert(match("abceeef", "'ac' ? 'ab' * 'c' { 'e' * } / 'abceeef' ") == "eee") assert(match("abceeef", "'ac'? 'ab'* 'c' { 'f'+ } / 'abceeef' ") == 8) + +assert(re.match("aaand", "[a]^2") == 3) + local t = {match("abceefe", "( ( & 'e' {} ) ? . ) * ")} checkeq(t, {4, 5, 7}) local t = {match("abceefe", "((&&'e' {})? 
.)*")} @@ -1305,6 +1464,13 @@ checkeq(x, {tag='x', 'hi', {tag = 'b', 'hello'}, 'but', {'totheend'}}) +-- test for folding captures +c = re.compile([[ + S <- (number (%s+ number)*) ~> add + number <- %d+ -> tonumber +]], {tonumber = tonumber, add = function (a,b) return a + b end}) +assert(c:match("3 401 50") == 3 + 401 + 50) + -- tests for look-ahead captures x = {re.match("alo", "&(&{.}) !{'b'} {&(...)} &{..} {...} {!.}")} checkeq(x, {"", "alo", ""}) diff --git a/testlr.lua b/testlr.lua index 7911bac..f18c6ce 100644 --- a/testlr.lua +++ b/testlr.lua @@ -1,6 +1,12 @@ +print(package.path, package.path) +package.path = './?.lua;' .. package.path +package.cpath = './?.so;' .. package.cpath + local lpeg = require"lpeg" local re = require"re" +print(lpeg.version) + local m = lpeg @@ -15,8 +21,8 @@ end print"Tests for LPeg left recursion" -assert(type(m.version()) == "string") -print("version " .. m.version()) +assert(type(m.version) == "string") +print("version " .. m.version) --[[