From 7c59efd85a3cb9401c6a5760cf7b2a607645b7a3 Mon Sep 17 00:00:00 2001
From: mingodad
-Returns a string with the running version of LPeg.
+A string (not a function) with the running version of LPeg.
-Sets the maximum size for the backtrack stack used by LPeg to
+Sets a limit for the size of the backtrack stack used by LPeg to
track calls and choices.
+(The default limit is 400.)
Most well-written patterns need little backtrack levels and
-therefore you seldom need to change this maximum;
-but a few useful patterns may need more space.
-Before changing this maximum you should try to rewrite your
+therefore you seldom need to change this limit;
+before changing it you should try to rewrite your
pattern to avoid the need for extra space.
+Nevertheless, a few useful patterns may overflow.
+Also, with recursive grammars,
+subjects with deep recursion may also need larger limits.
-A capture is a pattern that creates values
-(the so called semantic information) when it matches.
+A capture is a pattern that produces values
+(the so-called semantic information)
+according to what it matches.
LPeg offers several kinds of captures,
which produces values based on matches and combine these values to
produce new values.
@@ -629,10 +636,7 @@
-A capture pattern produces its values every time it succeeds.
-For instance,
-a capture inside a loop produces as many values as matched by the loop.
-A capture produces a value only when it succeeds.
+A capture pattern produces its values only when it succeeds.
For instance,
the pattern
Usually,
-LPeg evaluates all captures only after (and if) the entire match succeeds.
-During match time it only gathers enough information
-to produce the capture values later.
-As a particularly important consequence,
+LPeg does not specify when (and if) it evaluates its captures.
+(As an example,
+consider the pattern
@@ -696,6 +707,12 @@
+In the same way that LPeg does not specify when it evaluates captures,
+it does not specify whether it reuses
+values previously produced by the group
+or re-evaluates them.
+
@@ -762,7 +779,8 @@
@@ -801,7 +819,7 @@
Creates a table capture.
-This capture creates a table and puts all values from all anonymous captures
+This capture returns a table with all values from all anonymous captures
made by
Creates a match-time capture.
Unlike all other captures,
-this one is evaluated immediately when a match occurs.
+this one is evaluated immediately when a match occurs
+(even if it is part of a larger pattern that fails later).
It forces the immediate evaluation of all its nested captures
and then calls LPeg
-source code.
+Probably, the easiest way to install LPeg is with
+LuaRocks.
+If you have LuaRocks installed,
+the following command is all you need to install LPeg:
+
-Copyright © 2014 Lua.org, PUC-Rio.
+Copyright © 2007-2019 Lua.org, PUC-Rio.
Permission is hereby granted, free of charge,
diff --git a/lpprint.c b/lpprint.c
index 174d168..da18d05 100644
--- a/lpprint.c
+++ b/lpprint.c
@@ -37,13 +37,13 @@ void printcharset (const byte *st) {
}
-static void printcapkind (int kind) {
+static const char *capkind (int kind) {
const char *const modes[] = {
"close", "position", "constant", "backref",
"argument", "simple", "table", "function",
"query", "string", "num", "substitution", "fold",
"runtime", "group"};
- printf("%s", modes[kind]);
+ return modes[kind];
}
@@ -56,30 +56,34 @@ void printinst (const Instruction *op, const Instruction *p) {
const char *const names[] = {
"any", "char", "set",
"testany", "testchar", "testset",
- "span", "behind",
+ "span", "utf-range", "behind",
"ret", "end",
"choice", "jmp", "call", "open_call",
"commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup",
- "fullcapture", "opencapture", "closecapture", "closeruntime"
+ "fullcapture", "opencapture", "closecapture", "closeruntime",
+ "--"
};
printf("%02ld: %s ", (long)(p - op), names[p->i.code]);
switch ((Opcode)p->i.code) {
case IChar: {
- printf("'%c'", p->i.aux);
+ printf("'%c' (%02x)", p->i.aux, p->i.aux);
break;
}
case ITestChar: {
- printf("'%c'", p->i.aux); printjmp(op, p);
+ printf("'%c' (%02x)", p->i.aux, p->i.aux); printjmp(op, p);
+ break;
+ }
+ case IUTFR: {
+ printf("%d - %d", p[1].offset, utf_to(p));
break;
}
case IFullCapture: {
- printcapkind(getkind(p));
- printf(" (size = %d) (idx = %d)", getoff(p), p->i.key);
+ printf("%s (size = %d) (idx = %d)",
+ capkind(getkind(p)), getoff(p), p->i.key);
break;
}
case IOpenCapture: {
- printcapkind(getkind(p));
- printf(" (idx = %d)", p->i.key);
+ printf("%s (idx = %d)", capkind(getkind(p)), p->i.key);
break;
}
case ISet: {
@@ -124,8 +128,8 @@ void printpatt (Instruction *p, int n) {
#if defined(LPEG_DEBUG)
static void printcap (Capture *cap) {
- printcapkind(cap->kind);
- printf(" (idx: %d - size: %d) -> %p\n", cap->idx, cap->siz, cap->s);
+ printf("%s (idx: %d - size: %d) -> %p\n",
+ capkind(cap->kind), cap->idx, cap->siz, cap->s);
}
@@ -148,11 +152,11 @@ void printcaplist (Capture *cap, Capture *limit) {
static const char *tagnames[] = {
"char", "set", "any",
- "true", "false",
+ "true", "false", "utf8.range",
"rep",
"seq", "choice",
"not", "and",
- "call", "opencall", "rule", "grammar",
+ "call", "opencall", "rule", "xinfo", "grammar",
"behind",
"capture", "run-time"
};
@@ -160,6 +164,7 @@ static const char *tagnames[] = {
void printtree (TTree *tree, int ident) {
int i;
+ int sibs = numsiblings[tree->tag];
for (i = 0; i < ident; i++) printf(" ");
printf("%s", tagnames[tree->tag]);
switch (tree->tag) {
@@ -176,24 +181,34 @@ void printtree (TTree *tree, int ident) {
printf("\n");
break;
}
+ case TUTFR: {
+ assert(sib1(tree)->tag == TXInfo);
+ printf(" %d (%02x %d) - %d (%02x %d) \n",
+ tree->u.n, tree->key, tree->cap,
+ sib1(tree)->u.n, sib1(tree)->key, sib1(tree)->cap);
+ break;
+ }
case TOpenCall: case TCall: {
- printf(" key: %d\n", tree->key);
+ assert(sib1(sib2(tree))->tag == TXInfo);
+ printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n);
break;
}
case TBehind: {
printf(" %d\n", tree->u.n);
- printtree(sib1(tree), ident + 2);
break;
}
case TCapture: {
- printf(" cap: %d key: %d n: %d\n", tree->cap, tree->key, tree->u.n);
- printtree(sib1(tree), ident + 2);
+ printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key);
break;
}
case TRule: {
- printf(" n: %d key: %d\n", tree->cap, tree->key);
- printtree(sib1(tree), ident + 2);
- break; /* do not print next rule as a sibling */
+ printf(" key: %d\n", tree->key);
+ sibs = 1; /* do not print 'sib2' (next rule) as a sibling */
+ break;
+ }
+ case TXInfo: {
+ printf(" n: %d\n", tree->u.n);
+ break;
}
case TGrammar: {
TTree *rule = sib1(tree);
@@ -203,18 +218,17 @@ void printtree (TTree *tree, int ident) {
rule = sib2(rule);
}
assert(rule->tag == TTrue); /* sentinel */
+ sibs = 0; /* siblings already handled */
break;
}
- default: {
- int sibs = numsiblings[tree->tag];
+ default:
printf("\n");
- if (sibs >= 1) {
- printtree(sib1(tree), ident + 2);
- if (sibs >= 2)
- printtree(sib2(tree), ident + 2);
- }
break;
- }
+ }
+ if (sibs >= 1) {
+ printtree(sib1(tree), ident + 2);
+ if (sibs >= 2)
+ printtree(sib2(tree), ident + 2);
}
}
diff --git a/lptree.c b/lptree.c
index 4b9cf9c..5ced65d 100644
--- a/lptree.c
+++ b/lptree.c
@@ -21,11 +21,11 @@
/* number of siblings for each tree */
const byte numsiblings[] = {
0, 0, 0, /* char, set, any */
- 0, 0, /* true, false */
+ 0, 0, 0, /* true, false, utf-range */
1, /* rep */
2, 2, /* seq, choice */
1, 1, /* not, and */
- 0, 0, 2, 1, /* call, opencall, rule, grammar */
+ 0, 0, 2, 1, 1, /* call, opencall, rule, xinfo, grammar */
1, /* behind */
1, 1 /* capture, runtime capture */
};
@@ -64,7 +64,7 @@ static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t) {
t->tag = TCall;
t->u.ps = n - (t - g); /* position relative to node */
assert(sib2(t)->tag == TRule);
- sib2(t)->key = t->key;
+ sib2(t)->key = t->key; /* fix rule's key */
}
@@ -679,6 +679,56 @@ static int lp_range (lua_State *L) {
}
+/*
+** Fills node 't' with data about the UTF-8 code point 'cpu': its value
+** in 'n', its encoded length (1-4) in 'cap', and its first byte in 'key'.
+** Errors on argument 'arg' if 'cpu' > 0x10ffff (checked before the cast).
+*/
+static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) {
+  int len, fb, cp;
+  luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point");
+  cp = (int)cpu;  /* value already validated: at most 0x10ffff here */
+  if (cp <= 0x7f) {  /* one byte? */
+    len = 1;
+    fb = cp;
+  } else if (cp <= 0x7ff) {
+    len = 2;
+    fb = 0xC0 | (cp >> 6);
+  } else if (cp <= 0xffff) {
+    len = 3;
+    fb = 0xE0 | (cp >> 12);
+  }
+  else {  /* four bytes (up to 0x10ffff) */
+    len = 4;
+    fb = 0xF0 | (cp >> 18);
+  }
+  t->u.n = cp;
+  t->cap = len;
+  t->key = fb;
+}
+
+
+static int lp_utfr (lua_State *L) { /* lpeg.utfR(cp1, cp2): UTF-8 range */
+ lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1);
+ lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2);
+ luaL_argcheck(L, from <= to, 2, "empty range"); /* unsigned compare */
+ if (to <= 0x7f) { /* ascii range? */
+ TTree *tree = newcharset(L); /* code it as a regular charset */
+ unsigned int f;
+ for (f = (int)from; f <= to; f++)
+ setchar(treebuffer(tree), f);
+ }
+ else { /* multi-byte utf-8 range */
+ TTree *tree = newtree(L, 2);
+ tree->tag = TUTFR;
+ codeutftree(L, tree, from, 1); /* lower bound: in the TUTFR node */
+ sib1(tree)->tag = TXInfo;
+ codeutftree(L, sib1(tree), to, 2); /* upper bound: in the TXInfo node */
+ }
+ return 1; /* one result; presumably the new pattern -- TODO confirm */
+}
+
+
/*
** Look-behind predicate
*/
@@ -723,6 +773,7 @@ static int capture_aux (lua_State *L, int cap, int labelidx) {
/*
** Fill a tree with an empty capture, using an empty (TTrue) sibling.
+** (The 'key' field must be filled by the caller to finish the tree.)
*/
static TTree *auxemptycap (TTree *tree, int cap) {
tree->tag = TCapture;
@@ -733,15 +784,17 @@ static TTree *auxemptycap (TTree *tree, int cap) {
/*
-** Create a tree for an empty capture
+** Create a tree for an empty capture.
*/
-static TTree *newemptycap (lua_State *L, int cap) {
- return auxemptycap(newtree(L, 2), cap);
+static TTree *newemptycap (lua_State *L, int cap, int key) {
+ TTree *tree = auxemptycap(newtree(L, 2), cap);
+ tree->key = key;
+ return tree;
}
/*
-** Create a tree for an empty capture with an associated Lua value
+** Create a tree for an empty capture with an associated Lua value.
*/
static TTree *newemptycapkey (lua_State *L, int cap, int idx) {
TTree *tree = auxemptycap(newtree(L, 2), cap);
@@ -802,16 +855,15 @@ static int lp_simplecapture (lua_State *L) {
static int lp_poscapture (lua_State *L) {
- newemptycap(L, Cposition);
+ newemptycap(L, Cposition, 0);
return 1;
}
static int lp_argcapture (lua_State *L) {
int n = (int)luaL_checkinteger(L, 1);
- TTree *tree = newemptycap(L, Carg);
- tree->key = n;
luaL_argcheck(L, 0 < n && n <= SHRT_MAX, 1, "invalid argument index");
+ newemptycap(L, Carg, n);
return 1;
}
@@ -911,7 +963,7 @@ static int collectrules (lua_State *L, int arg, int *totalsize) {
int size; /* accumulator for total size */
lua_newtable(L); /* create position table */
getfirstrule(L, arg, postab);
- size = 2 + getsize(L, postab + 2); /* TGrammar + TRule + rule */
+ size = 3 + getsize(L, postab + 2); /* TGrammar + TRule + TXInfo + rule */
lua_pushnil(L); /* prepare to traverse grammar table */
while (lua_next(L, arg) != 0) {
if (lua_tonumber(L, -2) == 1 ||
@@ -925,11 +977,11 @@ static int collectrules (lua_State *L, int arg, int *totalsize) {
lua_pushvalue(L, -2); /* push key (to insert into position table) */
lua_pushinteger(L, size);
lua_settable(L, postab);
- size += 1 + getsize(L, -1); /* update size */
+ size += 2 + getsize(L, -1); /* add 'TRule + TXInfo + rule' to size */
lua_pushvalue(L, -2); /* push key (for next lua_next) */
n++;
}
- *totalsize = size + 1; /* TTrue to finish list of rules */
+ *totalsize = size + 1; /* space for 'TTrue' finishing list of rules */
return n;
}
@@ -941,12 +993,14 @@ static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) {
int ridx = frule + 2*i + 1; /* index of i-th rule */
int rulesize;
TTree *rn = gettree(L, ridx, &rulesize);
+ TTree *pr = sib1(nd); /* points to rule's prerule */
nd->tag = TRule;
- nd->key = 0;
- nd->cap = i; /* rule number */
+ nd->key = 0; /* will be fixed when rule is used */
+ pr->tag = TXInfo;
+ pr->u.n = i; /* rule number */
nd->lr = 0;
- nd->u.ps = rulesize + 1; /* point to next rule */
- memcpy(sib1(nd), rn, rulesize * sizeof(TTree)); /* copy rule */
+ nd->u.ps = rulesize + 2; /* point to next rule */
+ memcpy(sib1(pr), rn, rulesize * sizeof(TTree)); /* copy rule */
mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */
nd = sib2(nd); /* move to next rule */
}
@@ -977,7 +1031,12 @@ static int checkloops (TTree *tree) {
}
-static int verifyerror (lua_State *L, int *passed, int npassed) {
+/*
+** Give appropriate error message for 'verifyrule'. If a rule appears
+** twice in 'passed', there is a path from it back to itself without
+** advancing the subject.
+*/
+static int verifyerror (lua_State *L, unsigned short *passed, int npassed) {
int i, j;
for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */
for (j = i - 1; j >= 0; j--) {
@@ -999,14 +1058,16 @@ static int verifyerror (lua_State *L, int *passed, int npassed) {
** is only relevant if the first is nullable.
** Parameter 'nb' works as an accumulator, to allow tail calls in
** choices. ('nb' true makes function returns true.)
+** Parameter 'passed' is a list of already visited rules, 'npassed'
+** counts the elements in 'passed'.
** Assume ktable at the top of the stack.
*/
-static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
- int nb) {
+static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed,
+ int npassed, int nb) {
tailcall:
switch (tree->tag) {
case TChar: case TSet: case TAny:
- case TFalse:
+ case TFalse: case TUTFR:
return nb; /* cannot pass from here */
case TTrue:
case TBehind: /* look-behind cannot have calls */
@@ -1014,7 +1075,7 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
case TNot: case TAnd: case TRep:
/* return verifyrule(L, sib1(tree), passed, npassed, 1); */
tree = sib1(tree); nb = 1; goto tailcall;
- case TCapture: case TRunTime:
+ case TCapture: case TRunTime: case TXInfo:
/* return verifyrule(L, sib1(tree), passed, npassed, nb); */
tree = sib1(tree); goto tailcall;
case TCall:
@@ -1033,10 +1094,10 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
/* return verifyrule(L, sib2(tree), passed, npassed, nb); */
tree = sib2(tree); goto tailcall;
case TRule:
- if (npassed >= MAXRULES)
- return verifyerror(L, passed, npassed);
+ if (npassed >= MAXRULES) /* too many steps? */
+ return verifyerror(L, passed, npassed); /* error */
else {
- passed[npassed++] = tree->key;
+ passed[npassed++] = tree->key; /* add rule to path */
/* return verifyrule(L, sib1(tree), passed, npassed); */
tree = sib1(tree); goto tailcall;
}
@@ -1066,7 +1127,7 @@ static void findleftrecursivecalls (TTree *tree) {
}
static void verifygrammar (lua_State *L, TTree *grammar) {
- int passed[MAXRULES];
+ unsigned short passed[MAXRULES];
TTree *rule;
/* check left-recursive rules */
for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
@@ -1217,12 +1278,6 @@ static int lp_setmax (lua_State *L) {
}
-static int lp_version (lua_State *L) {
- lua_pushstring(L, VERSION);
- return 1;
-}
-
-
static int lp_type (lua_State *L) {
if (testpattern(L, 1))
lua_pushliteral(L, "pattern");
@@ -1291,8 +1346,9 @@ static struct luaL_Reg pattreg[] = {
{"P", lp_P},
{"S", lp_set},
{"R", lp_range},
+ {"utfR", lp_utfr},
{"locale", lp_locale},
- {"version", lp_version},
+ {"version", NULL},
{"setmaxstack", lp_setmax},
{"type", lp_type},
{NULL, NULL}
@@ -1321,6 +1377,8 @@ int luaopen_lpeg (lua_State *L) {
luaL_newlib(L, pattreg);
lua_pushvalue(L, -1);
lua_setfield(L, -3, "__index");
+ lua_pushliteral(L, "LPeg " VERSION);
+ lua_setfield(L, -2, "version");
return 1;
}
diff --git a/lptree.h b/lptree.h
index 96567ab..5907fb2 100644
--- a/lptree.h
+++ b/lptree.h
@@ -13,31 +13,40 @@
** types of trees
*/
typedef enum TTag {
- TChar = 0, TSet, TAny, /* standard PEG elements */
- TTrue, TFalse,
- TRep,
- TSeq, TChoice,
- TNot, TAnd,
- TCall,
- TOpenCall,
- TRule, /* sib1 is rule's pattern, sib2 is 'next' rule */
- TGrammar, /* sib1 is initial (and first) rule */
- TBehind, /* match behind */
- TCapture, /* regular capture */
- TRunTime /* run-time capture */
+ TChar = 0, /* 'n' = char */
+ TSet, /* the set is stored in next CHARSETSIZE bytes */
+ TAny,
+ TTrue,
+ TFalse,
+ TUTFR, /* range of UTF-8 codepoints; 'n' has initial codepoint;
+ 'cap' has length; 'key' has first byte;
+ extra info is similar for end codepoint */
+ TRep, /* 'sib1'* */
+ TSeq, /* 'sib1' 'sib2' */
+ TChoice, /* 'sib1' / 'sib2' */
+ TNot, /* !'sib1' */
+ TAnd, /* &'sib1' */
+ TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */
+ TOpenCall, /* ktable[key] is rule's key */
+ TRule, /* ktable[key] is rule's key (but key == 0 for unused rules);
+ 'sib1' is the pre-rule (TXInfo) before rule's pattern; 'sib2' is
+ next rule; extra info 'n' is rule's sequential number */
+ TXInfo, /* extra info */
+ TGrammar, /* 'sib1' is initial (and first) rule */
+ TBehind, /* 'sib1' is pattern, 'n' is how much to go back */
+ TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind');
+ ktable[key] is Lua value associated with capture;
+ 'sib1' is capture body */
+ TRunTime /* run-time capture: 'key' is Lua function;
+ 'sib1' is capture body */
} TTag;
-/* number of siblings for each tree */
-extern const byte numsiblings[];
-
/*
** Tree trees
-** The first sibling of a tree (if there is one) is immediately after
-** the tree. A reference to a second sibling (ps) is its position
-** relative to the position of the tree itself. A key in ktable
-** uses the (unique) address of the original tree that created that
-** entry. NULL means no data.
+** The first child of a tree (if there is one) is immediately after
+** the tree. A reference to a second child (ps) is its position
+** relative to the position of the tree itself.
*/
typedef struct TTree {
byte tag;
@@ -45,7 +54,7 @@ typedef struct TTree {
byte lr;
unsigned short key; /* key in ktable for Lua data (0 if no key) */
union {
- int ps; /* occasional second sibling */
+ int ps; /* occasional second child */
int n; /* occasional counter */
} u;
} TTree;
@@ -62,10 +71,10 @@ typedef struct Pattern {
} Pattern;
-/* number of siblings for each tree */
+/* number of children for each tree */
extern const byte numsiblings[];
-/* access to siblings */
+/* access to children */
#define sib1(t) ((t) + 1)
#define sib2(t) ((t) + (t)->u.ps)
diff --git a/lptypes.h b/lptypes.h
index 50578dc..b0969f7 100644
--- a/lptypes.h
+++ b/lptypes.h
@@ -1,7 +1,7 @@
/*
** $Id: lptypes.h,v 1.14 2015/09/28 17:17:41 roberto Exp $
** LPeg - PEG pattern matching for Lua
-** Copyright 2007-2015, Lua.org & PUC-Rio (see 'lpeg.html' for license)
+** Copyright 2007-2019, Lua.org & PUC-Rio (see 'lpeg.html' for license)
** written by Roberto Ierusalimschy
*/
@@ -9,17 +9,13 @@
#define lptypes_h
-#if !defined(LPEG_DEBUG)
-#define NDEBUG
-#endif
-
#include Introduction
Matches any character in
string (Set)
+lpeg.R("xy")Matches any character between x and y (Range)
+ lpeg.utfR(cp1, cp2)Matches a UTF-8 code point between cp1 and
+ cp2patt^nMatches at least n repetitions of patt
@@ -142,7 +145,7 @@ patt^-nIntroduction
LPeg also offers the re module,
which implements patterns following a regular-expression style
(e.g., [09]+).
-(This module is 260 lines of Lua code,
+(This module is 270 lines of Lua code,
and of course it uses LPeg to parse regular expressions and
translate them to regular LPeg patterns.)
@@ -164,7 +167,7 @@ lpeg.match (pattern, subject [, init])
An optional numeric argument init makes the match
start at that position in the subject string.
-As usual in Lua libraries,
+As in the Lua standard libraries,
a negative value counts from the end.
@@ -188,20 +191,23 @@
Otherwise returns nil.
-lpeg.type (value)
+lpeg.version ()lpeg.versionlpeg.setmaxstack (max)Grammars
Captures
Captures
lpeg.C(lpeg.P"a"^-1)
produces the empty string when there is no "a"
@@ -640,14 +644,20 @@ Captures
while the pattern lpeg.C("a")^-1
does not produce any value when there is no "a"
(because the pattern "a" fails).
+A pattern inside a loop or inside a recursive structure
+produces values for each match.
lpeg.P"a" / func / 0.
+Because the "division" by 0 instructs LPeg to throw away the
+results from the pattern,
+LPeg may or may not call func.)
+Therefore, captures should avoid side effects.
+Moreover,
most captures cannot affect the way a pattern matches a subject.
The only exception to this rule is the
so-called match-time capture.
@@ -682,7 +692,8 @@
Creates a back capture.
This pattern matches the empty string and
produces the values produced by the most recent
-group capture named lpeg.Cb (name)name.
+group capture named name
+(where name can be any Lua value).
another complete capture.
+lpeg.Cb (name)lpeg.Cc ([value, ...])
It groups all values returned by lpeg.Cg (patt [, name])patt
into a single capture.
The group may be anonymous (if no name is given)
-or named with the given name.
+or named with the given name
+(which can be any non-nil Lua value).
lpeg.Cs (patt)lpeg.Ct (patt)patt inside this table in successive integer keys,
starting at 1.
Moreover,
@@ -867,7 +885,8 @@ lpeg.Cmt(patt, function)function.
Arithmetic expressions
Download
$ luarocks install lpeg
License
The
equivalent to re Modulep / defs[name]
+p => namematch-time capture
equivalent to lpeg.Cmt(p, defs[name])p ~> namefold capture
+equivalent to lpeg.Cf(p, defs[name])& pand predicate ! pnot predicate
@@ -296,7 +298,7 @@ p1 p2concatenation Abstract Syntax Trees
a tag field telling what non terminal
that table represents.
We can add such a tag using
-named group captures:
+named group captures:
x = re.compile[[ @@ -406,7 +408,7 @@Patterns
p = [=[ pattern <- exp !. -exp <- S (alternative / grammar) +exp <- S (grammar / alternative) alternative <- seq ('/' S seq)* seq <- prefix* @@ -421,6 +423,7 @@Patterns
/ '=' name / '{}' / '{~' exp '~}' + / '{|' exp '|}' / '{' exp '}' / '.' / name S !arrow @@ -434,7 +437,7 @@Patterns
range <- . '-' [^]] S <- (%s / '--' [^%nl]*)* -- spaces and comments -name <- [A-Za-z][A-Za-z0-9_]* +name <- [A-Za-z_][A-Za-z0-9_]* arrow <- '<-' num <- [0-9]+ string <- '"' [^"]* '"' / "'" [^']* "'" @@ -450,7 +453,7 @@Patterns
License
-Copyright © 2008-2010 Lua.org, PUC-Rio. +Copyright © 2008-2015 Lua.org, PUC-Rio.
Permission is hereby granted, free of charge, diff --git a/re.lua b/re.lua index 1d8e159..77a4af8 100644 --- a/re.lua +++ b/re.lua @@ -71,13 +71,6 @@ updatelocale() local I = m.P(function (s,i) print(i, s:sub(1, i-1)); return i end) -local function getdef (id, defs) - local c = defs and defs[id] - if not c then error("undefined name: " .. id) end - return c -end - - local function patt_error (s, i) local msg = (#s < i + 20) and s:sub(i) or s:sub(i,i+20) .. "..." @@ -116,6 +109,20 @@ name = m.C(name) -- a defined name only have meaning in a given environment local Def = name * m.Carg(1) + +local function getdef (id, defs) + local c = defs and defs[id] + if not c then error("undefined name: " .. id) end + return c +end + +-- match a name and return a group of its corresponding definition +-- and 'f' (to be folded in 'Suffix') +local function defwithfunc (f) + return m.Cg(Def / getdef * m.Cc(f)) +end + + local num = m.C(m.R"09"^1) * S / tonumber local String = "'" * m.C((any - "'")^0) * "'" + @@ -130,7 +137,7 @@ end local Range = m.Cs(any * (m.P"-"/"") * (any - "]")) / mm.R -local item = defined + Range + m.C(any) +local item = (defined + Range + m.C(any)) / m.P local Class = "[" @@ -176,9 +183,10 @@ local exp = m.P{ "Exp", ) + "->" * S * ( m.Cg((String + num) * m.Cc(mt.__div)) + m.P"{}" * m.Cc(nil, m.Ct) - + m.Cg(Def / getdef * m.Cc(mt.__div)) + + defwithfunc(mt.__div) ) - + "=>" * S * m.Cg(Def / getdef * m.Cc(m.Cmt)) + + "=>" * S * defwithfunc(m.Cmt) + + "~>" * S * defwithfunc(m.Cf) ) * S )^0, function (a,b,f) return f(a,b) end ); Primary = "(" * m.V"Exp" * ")" diff --git a/test.lua b/test.lua index 7b685bb..1770294 100644 --- a/test.lua +++ b/test.lua @@ -4,8 +4,12 @@ -- require"strict" -- just to be pedantic +print(package.path, package.cpath) +package.path = './?.lua;' .. package.path +package.cpath = './?.so;' .. 
package.cpath local m = require"lpeg" +print(m.version) -- for general use local a, b, c, d, e, f, g, p, t @@ -48,8 +52,8 @@ end print"General tests for LPeg library" -assert(type(m.version()) == "string") -print("version " .. m.version()) +assert(type(m.version) == "string") +print(m.version) assert(m.type("alo") ~= "pattern") assert(m.type(io.input) ~= "pattern") assert(m.type(m.P"alo") == "pattern") @@ -202,6 +206,14 @@ do end +-- bug: loop in 'hascaptures' +do + local p = m.C(-m.P{m.P'x' * m.V(1) + m.P'y'}) + assert(p:match("xxx") == "") +end + + + -- test for small capture boundary for i = 250,260 do assert(#m.match(m.C(i), string.rep('a', i)) == i) @@ -398,7 +410,7 @@ assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc') do - -- large dynamic Cc + print "testing large dynamic Cc" local lim = 2^16 - 1 local c = 0 local function seq (n) @@ -416,6 +428,16 @@ do end +do + -- nesting of captures too deep + local p = m.C(1) + for i = 1, 300 do + p = m.Ct(p) + end + checkerr("too deep", p.match, p, "x") +end + + -- tests for non-pattern as arguments to pattern functions p = { ('a' * m.V(1))^-1 } * m.P'b' * { 'a' * m.V(2); m.V(1)^-1 } @@ -517,6 +539,27 @@ assert(m.match(m.Cs((#((#m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") assert(m.match(m.Cs((- -m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") assert(m.match(m.Cs((-((-m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") + +-- fixed length +do + -- 'and' predicate using fixed length + local p = m.C(#("a" * (m.P("bd") + "cd")) * 2) + assert(p:match("acd") == "ac") + + p = #m.P{ "a" * m.V(2), m.P"b" } * 2 + assert(p:match("abc") == 3) + + p = #(m.P"abc" * m.B"c") + assert(p:match("abc") == 1 and not p:match("ab")) + + p = m.P{ "a" * m.V(2), m.P"b"^1 } + checkerr("pattern may not have fixed length", m.B, p) + + p = "abc" * (m.P"b"^1 + m.P"a"^0) + checkerr("pattern may not have fixed length", m.B, p) +end + + p = -m.P'a' * m.Cc(1) + -m.P'b' * m.Cc(2) + -m.P'c' * m.Cc(3) 
assert(p:match('a') == 2 and p:match('') == 1 and p:match('b') == 1) @@ -817,7 +860,7 @@ s = string.rep('a', l) .. string.rep('b', l) .. string.rep('c', l) p = (m.C(m.P'a'^1) * m.C(m.P'b'^1) * m.C(m.P'c'^1)) / '%3%2%1' assert(p:match(s) == string.rep('c', l) .. - string.rep('b', l) .. + string.rep('b', l) .. string.rep('a', l)) print"+" @@ -946,10 +989,10 @@ for i = 1, 10 do assert(p:match("aaaaaaaaaaa") == 11 - i + 1) end -print"+" --- tests for back references +print "testing back references" + checkerr("back reference 'x' not found", m.match, m.Cb('x'), '') checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a') @@ -993,6 +1036,17 @@ local function id (s, i, ...) return true, ... end +do -- run-time capture in an end predicate (should discard its value) + local x = 0 + function foo (s, i) + x = x + 1 + return true, x + end + + local p = #(m.Cmt("", foo) * "xx") * m.Cmt("", foo) + assert(p:match("xx") == 2) +end + assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) + m.R'09'^1 / string.char + m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y") @@ -1011,8 +1065,8 @@ assert(#x == 500) local function id(s, i, x) if x == 'a' then return i, 1, 3, 7 else return nil, 2, 4, 6, 8 - end -end + end +end p = ((m.P(id) * 1 + m.Cmt(2, id) * 1 + m.Cmt(1, id) * 1))^0 assert(table.concat{p:match('abababab')} == string.rep('137', 4)) @@ -1098,6 +1152,32 @@ do assert(c == 11) end + +-- Return a match-time capture that returns 'n' captures +local function manyCmt (n) + return m.Cmt("a", function () + local a = {}; for i = 1, n do a[i] = n - i end + return true, unpack(a) + end) +end + +-- bug in 1.0: failed match-time that used previous match-time results +do + local x + local function aux (...) 
x = #{...}; return false end + local res = {m.match(m.Cmt(manyCmt(20), aux) + manyCmt(10), "a")} + assert(#res == 10 and res[1] == 9 and res[10] == 0) +end + + +-- bug in 1.0: problems with math-times returning too many captures +do + local lim = 2^11 - 10 + local res = {m.match(manyCmt(lim), "a")} + assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) + checkerr("too many", m.match, manyCmt(2^15), "a") +end + p = (m.P(function () return true, "a" end) * 'a' + m.P(function (s, i) return i, "aa", 20 end) * 'b' + m.P(function (s,i) if i <= #s then return i, "aaa" end end) * 1)^0 @@ -1106,9 +1186,85 @@ t = {p:match('abacc')} checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}) +do print"testing large grammars" + local lim = 1000 -- number of rules + local t = {} + + for i = 3, lim do + t[i] = m.V(i - 1) -- each rule calls previous one + end + t[1] = m.V(lim) -- start on last rule + t[2] = m.C("alo") -- final rule + + local P = m.P(t) -- build grammar + assert(P:match("alo") == "alo") + + t[#t + 1] = m.P("x") -- one more rule... + checkerr("too many rules", m.P, t) +end + + +print "testing UTF-8 ranges" + +do -- a few typical UTF-8 ranges + local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0" + + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0" + + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0" + + m.utfR(0, 0x7f)^1 / "ascii: %0" + + m.utfR(0, 0x10ffff) / "other: %0" + + p = m.Ct(p^0) * -m.P(1) + + local cyr = "ждюя" + local emot = "\240\159\152\128\240\159\153\128" -- 😀🙀 + local cjk = "专举乸" + local ascii = "alo" + local last = "\244\143\191\191" -- U+10FFFF + + local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last + t = (p:match(s)) + + assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and + t[3] == "emot: " .. emot and t[4] == "other: —" and + t[5] == "cjk: " .. cjk and t[6] == "other: —" and + t[7] == "ascii: " .. ascii and t[8] == "other: " .. 
last and + t[9] == nil) +end + + +do -- valid and invalid code points + local p = m.utfR(0, 0x10ffff)^0 + assert(p:match("汉字\128") == #"汉字" + 1) + assert(p:match("\244\159\191") == 1) + assert(p:match("\244\159\191\191") == 1) + assert(p:match("\255") == 1) + + -- basic errors + checkerr("empty range", m.utfR, 1, 0) + checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1) +end + + +do -- back references (fixed width) + -- match a byte after a CJK point + local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1) + p = m.P{ p + m.P(1) * m.V(1) } -- search for 'p' + assert(p:match("ab д 专X x") == "X") + + -- match a byte after a hebrew point + local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1) + p = m.P(#"ש") * p + assert(p:match("שX") == "X") + + checkerr("fixed length", m.B, m.utfR(0, 0x10ffff)) +end + + + ------------------------------------------------------------------- -- Tests for 're' module ------------------------------------------------------------------- +print"testing 're' module" local re = require "re" @@ -1131,6 +1287,9 @@ assert(not match("abbcde", " [b-z] + ")) assert(match("abb\"de", '"abb"["]"de"') == 7) assert(match("abceeef", "'ac' ? 'ab' * 'c' { 'e' * } / 'abceeef' ") == "eee") assert(match("abceeef", "'ac'? 'ab'* 'c' { 'f'+ } / 'abceeef' ") == 8) + +assert(re.match("aaand", "[a]^2") == 3) + local t = {match("abceefe", "( ( & 'e' {} ) ? . ) * ")} checkeq(t, {4, 5, 7}) local t = {match("abceefe", "((&&'e' {})? 
.)*")} @@ -1305,6 +1464,13 @@ checkeq(x, {tag='x', 'hi', {tag = 'b', 'hello'}, 'but', {'totheend'}}) +-- test for folding captures +c = re.compile([[ + S <- (number (%s+ number)*) ~> add + number <- %d+ -> tonumber +]], {tonumber = tonumber, add = function (a,b) return a + b end}) +assert(c:match("3 401 50") == 3 + 401 + 50) + -- tests for look-ahead captures x = {re.match("alo", "&(&{.}) !{'b'} {&(...)} &{..} {...} {!.}")} checkeq(x, {"", "alo", ""}) diff --git a/testlr.lua b/testlr.lua index 7911bac..f18c6ce 100644 --- a/testlr.lua +++ b/testlr.lua @@ -1,6 +1,12 @@ +print(package.path, package.cpath) +package.path = './?.lua;' .. package.path +package.cpath = './?.so;' .. package.cpath + local lpeg = require"lpeg" local re = require"re" +print(lpeg.version) + local m = lpeg @@ -15,8 +21,8 @@ end print"Tests for LPeg left recursion" -assert(type(m.version()) == "string") -print("version " .. m.version()) +assert(type(m.version) == "string") +print("version " .. m.version) --[[