Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions PyRDF/CallableGenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def get_action_nodes(self, node_py=None):
# current PyRDF node as the head node
node_py = self.head_node
else:
if node_py.operation.is_action():
if node_py.operation.is_action() or node_py.operation.is_info():
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What would this imply when we will add support for instant actions? Something like this would maybe start to be to long

if node_py.operation.is_action() or node_py.operation.is_info() or node_py.operation.is_instant_action():

Instead could it be better to type the following?

if not node_py.operation.is_transformation():

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also this function is called get_action_nodes so we need to remember to change the name afterwards

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer the explicit expression, for now we do not require three conditions and we could even include instant_actions in info or maybe a more suitable name if we need to support it.

# Collect all action nodes in order to return them
return_nodes.append(node_py)

Expand Down Expand Up @@ -96,7 +96,7 @@ def mapper(node_cpp, node_py=None):
# recursive call
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now this part becomes a little fuzzy. While before we were actually returning PyROOT class instances in the form of lazy transformations and booked actions, now we can also return fundamental types like strings and vectors.

parent_node = pyroot_node

if node_py.operation.is_action():
if node_py.operation.is_action() or node_py.operation.is_info():
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above, maybe use if not self.node_py.operation.is_transformation():

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same answer as above, indeed this could be refactored and use one single function to return nodes and values.

# Collect all action nodes in order to return them
return_vals.append(pyroot_node)

Expand Down
34 changes: 31 additions & 3 deletions PyRDF/Node.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def __init__(self, get_head, operation, *args):
self.children = []
self._new_op_name = ""
self.value = None
self.ResultPtr = None
self.pyroot_node = None
self.has_user_references = True

Expand Down Expand Up @@ -101,7 +102,29 @@ def __setstate__(self, state):
else:
self.operation = None

def is_prunable(self):
def is_prunable_action(self):
"""
Checks if an action node can be pruned. Action nodes whose value was
already computed can be pruned.

Returns:
bool: True if the node is an action and its value was already
computed, False otherwise.
"""
return self.operation and self.operation.is_action() and self.value

def is_prunable_info(self):
"""
Checks if an info node can be pruned. Info nodes whose ResultPtr was
already assigned can be pruned.

Returns:
bool: True if the node is an action and its value was already
computed, False otherwise.
"""
return self.operation and self.operation.is_info() and self.ResultPtr

def is_prunable_node(self):
"""
Checks whether the current node can be pruned from the computational
graph.
Expand All @@ -113,7 +136,7 @@ def is_prunable(self):
if not self.children:
# Every pruning condition is written on a separate line
if not self.has_user_references or \
(self.operation and self.operation.is_action() and self.value):
self.is_prunable_action() or self.is_prunable_info():

# ***** Condition 1 *****
# If the node is wrapped by a proxy which is not directly
Expand All @@ -123,6 +146,11 @@ def is_prunable(self):
# If the current node's value was already
# computed, it should get pruned only if it's
# an Action node.

# ***** Condition 3 *****
# If the current node's value was already
# computed, it should get pruned only if it's
# an Info operation.
return True
return False

Expand All @@ -144,4 +172,4 @@ def graph_prune(self):
children.append(n)

self.children = children
return self.is_prunable()
return self.is_prunable_node()
18 changes: 16 additions & 2 deletions PyRDF/Operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class Operation(object):
print(PyRDF.current_backend.supported_operations)
"""

Types = Enum("Types", "ACTION TRANSFORMATION INSTANT_ACTION")
Types = Enum("Types", "ACTION TRANSFORMATION INSTANT_ACTION INFO")

def __init__(self, name, *args, **kwargs):
"""
Expand Down Expand Up @@ -80,7 +80,11 @@ def _classify_operation(self, name):
'Take': ops.ACTION,
'Graph': ops.ACTION,
'Snapshot': ops.INSTANT_ACTION,
'Foreach': ops.INSTANT_ACTION
'Foreach': ops.INSTANT_ACTION,
'GetColumnNames': ops.INFO,
'GetDefinedColumnNames': ops.INFO,
'GetColumnType': ops.INFO,
'GetFilterNames': ops.INFO,
}

op_type = operations_dict.get(name)
Expand All @@ -107,3 +111,13 @@ def is_transformation(self):
False otherwise.
"""
return self.op_type == Operation.Types.TRANSFORMATION

def is_info(self):
"""
Checks if the current operation is an info operation.

Returns:
bool: True if the current operation is a transformation,
False otherwise.
"""
return self.op_type == Operation.Types.INFO
14 changes: 11 additions & 3 deletions PyRDF/Proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def GetValue(self):
from PyRDF import current_backend
if not self.proxied_node.value: # If event-loop not triggered
generator = CallableGenerator(self.proxied_node.get_head())
current_backend.execute(generator)
current_backend.execute(generator, trigger_loop=True)

return self.proxied_node.value

Expand Down Expand Up @@ -123,10 +123,10 @@ def _create_new_op(self, *args, **kwargs):
Handles an operation call to the current node and returns the new node
built using the operation call.
"""
from PyRDF import current_backend
# Create a new `Operation` object for the
# incoming operation call
op = Operation(self.proxied_node._new_op_name, *args, **kwargs)

# Create a new `Node` object to house the operation
newNode = Node(operation=op, get_head=self.proxied_node.get_head)

Expand All @@ -136,5 +136,13 @@ def _create_new_op(self, *args, **kwargs):
# Return the appropriate proxy object for the node
if op.is_action():
return ActionProxy(newNode)
else:
elif op.is_transformation():
return TransformationProxy(newNode)
else:
try:
generator = CallableGenerator(self.proxied_node.get_head())
current_backend.execute(generator, trigger_loop=False)
except TypeError as e:
self.proxied_node.children.remove(newNode)
raise e
return newNode.ResultPtr
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case newNode.ResultPtr will not hold a RResultPtr but one of the types returned by the "other operations" of ROOT RDataFrame, such as strings or vectors. Only Display makes an exception, but it is not included in this PR.

Furthermore, while I get that creating a new node also for info operations can guarantee the execution of the same event loop only once via the generator.execute function, I don't think that these kind of operations are suited to be "nodes" in the graph, they are just querying the dataframe for metadata

6 changes: 5 additions & 1 deletion PyRDF/backend/Backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ class Backend(ABC):
'Foreach',
'Reduce',
'Aggregate',
'GetColumnNames',
'GetDefinedColumnNames',
'GetColumnType',
'GetFilterNames',
'Graph'
]

Expand Down Expand Up @@ -93,7 +97,7 @@ def check_supported(self, operation_name):
)

@abstractmethod
def execute(self, generator):
def execute(self, generator, trigger_loop=False):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this, now that I see it I wish it were added before. One counterargument to it though: in this way we are creating pyroot nodes even when no computation is involved, or not ?

I am thinking a situation in which the user first wants to find out the name of a column

df = PyRDF.RDataFrame("some_tree","some_file.root")

colnames = df.GetColumnNames() # this will create pyroot nodes and return a list of strings

and then executes some operations

filter1 = df.Filter("valid c++ filter")
histo1 = filter1.Histo1D("col1")

histo1.Draw() # this triggers the event loop, recreating the same pyroot objects

A way to solve this would be to implement some checks in the mapper function of the CallableGenerator, e.g.

if not node_py.pyroot_node:
    RDFOperation...
    [...]

"""
Subclasses must define how to run the RDataFrame graph on a given
environment.
Expand Down
4 changes: 2 additions & 2 deletions PyRDF/backend/Dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def _get_friend_info(self, tree):

return FriendInfo(friend_names, friend_file_names)

def execute(self, generator):
def execute(self, generator, trigger_loop=True):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

distributed info operations won't work now, will they? Similar to how Count doesn't work in spark, it returns a fundamental type so we can't merge it. We should add all info operations to the operations not supported by Dist

"""
Executes the current RDataFrame graph
in the given distributed environment.
Expand Down Expand Up @@ -526,7 +526,7 @@ def reducer(values_list1, values_list2):
warnings.warn(msg, UserWarning, stacklevel=2)
PyRDF.use("local")
from .. import current_backend
return current_backend.execute(generator)
return current_backend.execute(generator, trigger_loop=True)

# Values produced after Map-Reduce
values = self.ProcessAndMerge(mapper, reducer)
Expand Down
15 changes: 7 additions & 8 deletions PyRDF/backend/Local.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(self, config={}):
if op not in operations_not_supported]
self.pyroot_rdf = None

def execute(self, generator):
def execute(self, generator, trigger_loop=False):
"""
Executes locally the current RDataFrame graph.

Expand All @@ -52,22 +52,21 @@ def execute(self, generator):

# if the RDataFrame has not been created yet or if a new one
# is created by the user in the same session
if (not self.pyroot_rdf) or \
(self.pyroot_rdf is not generator.head_node):
if not self.pyroot_rdf or self.pyroot_rdf is not generator.head_node:
self.pyroot_rdf = ROOT.ROOT.RDataFrame(*generator.head_node.args)

values = mapper(self.pyroot_rdf) # Execute the mapper function

# Get the action nodes in the same order as values
nodes = generator.get_action_nodes()

values[0].GetValue() # Trigger event-loop

for i in range(len(values)):
for node, value in zip(nodes, values):
# Set the obtained values and
# 'RResultPtr's of action nodes
nodes[i].value = values[i].GetValue()
if trigger_loop and hasattr(value, 'GetValue'):
# Info actions do not have GetValue
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't call them "info actions". Info operations do not trigger the event loop and they have a different scope than the action operations

node.value = value.GetValue()
# We store the 'RResultPtr's because,
# those should be in scope while doing
# a 'GetValue' call on them
nodes[i].ResultPtr = values[i]
node.ResultPtr = value
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not all of these outputs are RResultPtrs. Of all the "other operations" of the ROOT RDataFrame class, only Display returns an RResultPtr.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Furthermore, for those nodes that actually return RResultPtrs, we are storing both the pointer and the pointed result. Why do we need that?

69 changes: 69 additions & 0 deletions tests/integration/local/test_info_operations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import unittest
import PyRDF
import ROOT


class InfoOperationsLocalTest(unittest.TestCase):
"""
Check that Info operations return the expected result rather than a proxy.
"""

def test_GetColumnNames(self):
"""
GetColumnNames returns ROOT string vector without running the event
loop.
"""
rdf = PyRDF.RDataFrame(1)
d = rdf.Define('a', 'rdfentry_').Define('b', 'a*a')

column_names = d.GetColumnNames()
expected_columns = ROOT.std.vector('string')()
expected_columns.push_back("a")
expected_columns.push_back("b")

for column, expected in zip(column_names, expected_columns):
self.assertEqual(column, expected)

def test_GetColumnType(self):
"""
GetColumnType returns the type of a given column as a string.
"""
rdf = PyRDF.RDataFrame(1)
d = rdf.Define('a', 'rdfentry_').Define('b', 'a*a')

a_typename = d.GetColumnType('a')
b_typename = d.GetColumnType('b')
expected_type = 'ULong64_t'

self.assertEqual(a_typename, expected_type)
self.assertEqual(b_typename, expected_type)

def test_GetDefinedColumnNames(self):
"""
GetDefinedColumnNames returns the names of the defined columns.
"""
rdf = PyRDF.RDataFrame(1)
d = rdf.Define('a', 'rdfentry_').Define('b', 'a*a')

column_names = d.GetColumnNames()
expected_columns = ROOT.std.vector('string')()
expected_columns.push_back("a")
expected_columns.push_back("b")

for column, expected in zip(column_names, expected_columns):
self.assertEqual(column, expected)

def test_GetFilterNames(self):
"""
GetFilterNames returns the names of the filters created.
"""
rdf = PyRDF.RDataFrame(1)
filter_name = 'custom_filter'
d = rdf.Filter('rdfentry_ > 1', filter_name)

filters = d.GetFilterNames()
expected_filters = ROOT.std.vector('string')()
expected_filters.push_back(filter_name)

for f, expected in zip(filters, expected_filters):
self.assertEqual(f, expected)
5 changes: 5 additions & 0 deletions tests/unit/test_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ def test_transformation(self):
op = Operation("Define", "c1")
self.assertEqual(op.op_type, Operation.Types.TRANSFORMATION)

def test_info_action(self):
"""Info nodes are classified accurately."""
op = Operation("GetColumnNames")
self.assertEqual(op.op_type, Operation.Types.INFO)

def test_none(self):
"""Incorrect operations raise an Exception."""
with self.assertRaises(Exception):
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/test_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def test_node_attr_transformation(self):
"children",
"_new_op_name",
"value",
"ResultPtr",
"pyroot_node",
"has_user_references"
]
Expand Down Expand Up @@ -145,7 +146,7 @@ class TestBackend(Backend):
Test backend to verify the working of 'GetValue' instance method
in Proxy.
"""
def execute(self, generator):
def execute(self, generator, trigger_loop):
"""
Test implementation of the execute method
for 'TestBackend'. This records the head
Expand Down