Skip to content
This repository was archived by the owner on Jan 9, 2023. It is now read-only.

Commit a4d3708

Browse files
authored
Merge pull request #62 from chrisburr/fix-59
Improve performance of get_matching_variables
2 parents 7eff0fb + fec37dd commit a4d3708

File tree

2 files changed

+28
-9
lines changed

2 files changed

+28
-9
lines changed

root_pandas/readwrite.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from numpy.lib.recfunctions import append_fields
88
from pandas import DataFrame, RangeIndex
99
from root_numpy import root2array, list_trees
10-
from fnmatch import fnmatch
10+
import fnmatch
1111
from root_numpy import list_branches
1212
from root_numpy.extern.six import string_types
1313
import itertools
@@ -59,17 +59,24 @@ def get_nonscalar_columns(array):
5959

6060

6161
def get_matching_variables(branches, patterns, fail=True):
62-
selected = []
63-
64-
for p in patterns:
62+
# Convert branches to a set to make x "in branches" O(1) on average
63+
branches = set(branches)
64+
patterns = set(patterns)
65+
# Find any trivial matches
66+
selected = list(branches.intersection(patterns))
67+
# Any matches that weren't trivial need to be looped over...
68+
for pattern in patterns.difference(selected):
6569
found = False
66-
for b in branches:
67-
if fnmatch(b, p):
70+
# Avoid using fnmatch on the pattern if possible
71+
if re.findall(r'(\*)|(\?)|(\[.*\])|(\[\!.*\])', pattern):
72+
for match in fnmatch.filter(branches, pattern):
6873
found = True
69-
if fnmatch(b, p) and b not in selected:
70-
selected.append(b)
74+
if match not in selected:
75+
selected.append(match)
76+
elif pattern in branches:
77+
raise NotImplementedError('I think this is impossible?')
7178
if not found and fail:
72-
raise ValueError("Pattern '{}' didn't match any branch".format(p))
79+
raise ValueError("Pattern '{}' didn't match any branch".format(pattern))
7380
return selected
7481

7582

tests/test.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,18 @@ def test_nonscalar_columns():
230230
os.remove(path)
231231

232232

233+
def test_get_matching_variables_performance():
234+
"""Performance regression test for #59"""
235+
import random
236+
import string
237+
import root_pandas.readwrite
238+
for n in [10, 100, 1000, 10000]:
239+
branches = [' '.join(random.sample(string.ascii_letters*100, k=100)) for i in range(n)]
240+
patterns = [' '.join(random.sample(string.ascii_letters*100, k=100)) for i in range(n)]
241+
root_pandas.readwrite.get_matching_variables(branches, patterns, fail=False)
242+
root_pandas.readwrite.get_matching_variables(branches, branches, fail=False)
243+
244+
233245
def test_noexpand_prefix():
234246
xs = np.array([1, 2, 3])
235247
df = pd.DataFrame({'x': xs})

0 commit comments

Comments
 (0)