|
1 | | - |
2 | | -""" |
3 | | -A module that extends pandas to support the ROOT data format. |
4 | | -""" |
5 | | - |
6 | | -import numpy as np |
7 | | -from numpy.lib.recfunctions import append_fields |
8 | | -from pandas import DataFrame |
9 | | -from root_numpy import root2array, list_trees |
10 | | -from fnmatch import fnmatch |
11 | | -from root_numpy import list_branches |
12 | | -from root_numpy.extern.six import string_types |
13 | | -import itertools |
14 | | -from math import ceil |
15 | | -import re |
16 | | -import ROOT |
17 | | - |
18 | | -from .utils import stretch |
19 | | - |
20 | | - |
21 | | -__all__ = ['read_root'] |
22 | | - |
23 | | - |
def expand_braces(orig):
    """
    Expand shell-style brace alternatives in a pattern.

    Every ``{a,b,...}`` group that contains a comma is replaced by each of
    its alternatives in turn, e.g. ``'A{B,C}*'`` becomes ``['AB*', 'AC*']``.
    Brace groups without a comma are kept literally.

    Parameters
    ----------
    orig: str
        The pattern to expand.

    Returns
    -------
    list of str
        The distinct expanded patterns. The order is deterministic so that
        callers which derive a column order from it get reproducible results.
    """
    # Greedy '.*' makes the group capture the LAST brace pair whose closing
    # brace is not escaped with a backslash.
    p = re.compile(r'.*(\{.+?[^\\]\})')

    m = p.search(orig)
    if m is None:
        # Nothing left to expand: undo the escaping applied to comma-less groups
        return [orig.replace('\\}', '}')]

    sub = m.group(1)
    # Use the match position rather than searching for the text again, which
    # could pick an earlier, identical-looking occurrence.
    head = orig[:m.start(1)]
    tail = orig[m.end(1):]

    if ',' in sub:
        candidates = [head + pat + tail for pat in sub.strip('{}').split(',')]
    else:
        # Escape the closing brace so this group is not matched again
        candidates = [head + sub.replace('}', '\\}') + tail]

    # De-duplicate while preserving the order in which patterns were produced
    res = []
    seen = set()
    for candidate in candidates:
        for expanded in expand_braces(candidate):
            if expanded not in seen:
                seen.add(expanded)
                res.append(expanded)
    return res
47 | | - |
48 | | - |
def get_matching_variables(branches, patterns, fail=True):
    """
    Select the branches that match any of the given shell patterns.

    Parameters
    ----------
    branches: sequence of str
        The available branch names.
    patterns: sequence of str
        fnmatch-style patterns, processed in order.
    fail: bool
        If True, raise a ValueError as soon as a pattern matches no branch.

    Returns
    -------
    list of str
        Matching branch names without duplicates, ordered by pattern first
        and by position in `branches` second.
    """
    chosen = []
    for pattern in patterns:
        hits = [name for name in branches if fnmatch(name, pattern)]
        if fail and not hits:
            raise ValueError("Pattern '{}' didn't match any branch".format(pattern))
        for name in hits:
            if name not in chosen:
                chosen.append(name)
    return chosen
62 | | - |
63 | | - |
def read_root(path, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
    """
    Read a ROOT file into a pandas DataFrame.
    Further *args and **kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become the DataFrame's index.

    Parameters
    ----------
    path: string
        The path to the root file.
    key: string
        The key of the tree to load. May be omitted if the file contains exactly one tree.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored (overriding the columns argument).
    chunksize: int
        If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows.
    where: str
        Only rows that match the expression will be read.
    flatten: bool
        If set to True, will use root_numpy.stretch to flatten arrays in the root file into individual entries.
        All arrays specified in the columns must have the same length for this to work.
        Be careful if you combine this with chunksize, as chunksize will refer to the number of unflattened entries,
        so you will be iterating over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in the __array_index column.

    Returns
    -------
    DataFrame created from matching data in the specified TTree,
    or an iterator over such DataFrames if `chunksize` is given

    Raises
    ------
    ValueError
        If `key` is omitted and the file does not contain exactly one tree,
        if a pattern in `columns` matches no branch, or if `ignore` matches
        an __index__* branch.

    Notes
    -----

    >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """
    if not key:
        trees = list_trees(path)
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(path))
        else:
            raise ValueError('More than one tree found in {}'.format(path))

    branches = list_branches(path, key)

    if not columns:
        # Copy so that filtering below never touches the list we were given
        all_vars = list(branches)
    else:
        if isinstance(columns, string_types):
            columns = [columns]
        else:
            # Copy (and accept tuples) so the caller's sequence is not modified
            columns = list(columns)
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        # A real list is needed here: on Python 3 filter() returns an iterator
        # that is always truthy and cannot be indexed.
        index_branches = [b for b in branches if b.startswith('__index__')]
        if index_branches:
            columns.append(index_branches[0])
        columns = list(itertools.chain.from_iterable(expand_braces(c) for c in columns))
        all_vars = get_matching_variables(branches, columns)

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        # Expand braces in the patterns BEFORE matching, as fnmatch does not
        # understand {} itself
        ignore = list(itertools.chain.from_iterable(expand_braces(p) for p in ignore))
        ignored = get_matching_variables(branches, ignore, fail=False)
        if any(b.startswith('__index__') for b in ignored):
            raise ValueError('__index__* branch is being ignored!')
        # Filter instead of list.remove so that ignoring a branch which was
        # not selected by `columns` is not an error
        all_vars = [var for var in all_vars if var not in ignored]

    def do_flatten(arr):
        # Flatten array-valued entries and record each element's former position
        arr_, idx = stretch(arr, return_indices=True)
        arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
        return arr

    if chunksize:
        f = ROOT.TFile.Open(path)
        try:
            n_entries = f.Get(key).GetEntries()
        finally:
            # Release the file handle even if the tree lookup fails
            f.Close()

        def genchunks():
            # The last chunk may be shorter, hence the ceil
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(path, key, all_vars, start=chunk * chunksize, stop=(chunk+1) * chunksize, selection=where, *args, **kwargs)
                if flatten:
                    arr = do_flatten(arr)
                yield convert_to_dataframe(arr)

        return genchunks()

    arr = root2array(path, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr)
    return convert_to_dataframe(arr)
159 | | - |
160 | | - |
def convert_to_dataframe(array):
    """
    Build a DataFrame from a structured array, restoring a stored index.

    A field named ``__index__<name>`` is used as the DataFrame index and the
    index is named ``<name>`` (or left unnamed if ``<name>`` is empty).

    Parameters
    ----------
    array: structured numpy array
        The records to convert.

    Returns
    -------
    DataFrame

    Raises
    ------
    ValueError
        If more than one ``__index__*`` field is present.
    """
    prefix = '__index__'
    index_fields = [field for field in array.dtype.names if field.startswith(prefix)]

    if len(index_fields) > 1:
        raise ValueError("More than one index found in file")
    if not index_fields:
        return DataFrame.from_records(array)

    # The index is stored under the __index__* branch, where * is its name
    field = index_fields[0]
    frame = DataFrame.from_records(array, index=field)
    # An empty suffix means the index has no name
    frame.index.name = field[len(prefix):] or None
    return frame
177 | | - |
178 | | - |
def to_root(df, path, key='default', mode='w', *args, **kwargs):
    """
    Write DataFrame to a ROOT file.

    Parameters
    ----------
    path: string
        File path of the ROOT file. It is opened with root_numpy mode
        'recreate' for mode 'w' and 'update' for mode 'a'.
    key: string
        Name of tree that the DataFrame will be saved as
    mode: string, {'w', 'a'}
        Mode that the file should be opened in (default: 'w')

    Raises
    ------
    ValueError
        If `mode` is neither 'a' nor 'w'.

    Notes
    -----

    Further *args and **kwargs are passed to root_numpy's array2root.

    >>> df = DataFrame({'x': [1,2,3], 'y': [4,5,6]})
    >>> df.to_root('test.root')

    The DataFrame index will be saved as a branch called '__index__*',
    where * is the name of the index in the original DataFrame
    """

    if mode == 'a':
        mode = 'update'
    elif mode == 'w':
        mode = 'recreate'
    else:
        raise ValueError('Unknown mode: {}. Must be "a" or "w".'.format(mode))

    from root_numpy import array2root
    # We don't want to modify the user's DataFrame here, so we make a shallow copy
    df_ = df.copy(deep=False)
    name = df_.index.name
    if name is None:
        # Handle the case where the index has no name
        name = ''
    try:
        index_branch = '__index__' + name
    except TypeError:
        # Index names need not be strings (e.g. an integer column label)
        index_branch = '__index__' + str(name)
    df_[index_branch] = df_.index
    arr = df_.to_records(index=False)
    array2root(arr, path, key, mode=mode, *args, **kwargs)
221 | | - |
222 | | - |
# Attach to_root to pandas' DataFrame so it can be called as df.to_root(...)
setattr(DataFrame, 'to_root', to_root)
| 1 | +from .readwrite import read_root |
0 commit comments